예제 #1
0
    def load(self):
        """Load SNP dosages from the compressed file at ``self.path``.

        Each row of the file is handed to a collector callback that filters
        and converts it into a ``DataSetSNP``. When ``self.weight_db_logic``
        is set, rows whose rsid is not in its ``genes_for_an_rsid`` are
        skipped. Rows whose reference or effect allele is not in
        ``Utilities.VALID_ALLELES`` are skipped as well.

        Returns:
            A ``(snps, snps_by_rsid)`` tuple: the list of accepted SNPs in
            file order, and a dict mapping rsid to SNP. For a duplicated rsid
            every occurrence stays in the list, while the dict keeps only the
            last one seen.
        """
        class PrediXcanCollector(object):
            """Row callback that accumulates accepted SNPs."""

            def __init__(self, snps=None, snps_by_rsid=None, weight_db_logic=None):
                # None sentinels instead of mutable defaults, so containers
                # are never shared between collector instances.
                self.snps = [] if snps is None else snps
                self.snps_by_rsid = {} if snps_by_rsid is None else snps_by_rsid
                self.weight_db_logic = weight_db_logic

            def __call__(self, i, components):
                """Process row number ``i`` given as a list of column strings."""
                rsid = components[PDTF.RSID]
                if self.weight_db_logic and rsid not in self.weight_db_logic.genes_for_an_rsid:
                    # Custom numeric level, below logging.DEBUG (10).
                    logging.log(5, "rsid %s not in weight db, skip it", rsid)
                    return

                position = components[PDTF.POSITION]

                ref_allele = components[PDTF.ALLELE_0]
                if ref_allele not in Utilities.VALID_ALLELES:
                    logging.log(9, "wrong ref allele, rsid %s is not an SNP", rsid)
                    return
                eff_allele = components[PDTF.ALLELE_1]
                if eff_allele not in Utilities.VALID_ALLELES:
                    logging.log(9, "wrong eff allele, rsid %s is not an SNP", rsid)
                    return

                # Dosages may be imputed, hence float. Build a real list:
                # on Python 3 ``map`` is a one-shot iterator that would be
                # exhausted after the first pass over the SNP's data.
                dosages = [float(value) for value in components[PDTF.FIRST_DATA_COLUMN:]]
                # TODO: should we flip based on weight_db at this point?

                snp = DataSetSNP.DataSetSNP(name=rsid, index=i, data=dosages, position=int(position), ref_allele=ref_allele, eff_allele=eff_allele)
                if snp.name in self.snps_by_rsid:
                    # Duplicates are reported but still recorded below.
                    old = self.snps_by_rsid[snp.name]
                    logging.info("Duplicated rsid: (%s,%s) %s", old.name, old.position, " ".join(components))
                self.snps.append(snp)
                self.snps_by_rsid[snp.name] = snp

        loader = Utilities.CSVFileIterator(self.path, compressed=True)
        collector = PrediXcanCollector(weight_db_logic=self.weight_db_logic)
        loader.iterate(collector)
        return collector.snps, collector.snps_by_rsid
예제 #2
0
    def testCSVFileIterator(self):
        """Exercise ``Utilities.CSVFileIterator`` on plain and gzipped inputs.

        For each file the test checks that:
        - a header that does not match makes ``iterate`` raise
          ``Exceptions.MalformedInputFile``;
        - with no header argument every row, including the first, reaches
          the callback;
        - with a matching header the first row is not delivered and the
          remaining rows are numbered from 0.
        The plain sample file is additionally read with an empty-string
        header and with ``ignore_until_header=True``.
        """
        class RowRecorder:
            """Callback that remembers every (index, row) pair it is given."""

            def __init__(self):
                self.lines = []

            def __call__(self, i, row):
                self.lines.append((i, row))

        def rows_seen(iterator):
            # Run the iterator against a fresh recorder and return its log.
            recorder = RowRecorder()
            iterator.iterate(recorder)
            return recorder.lines

        sample_path = "tests/_td/dosage_set_1/set.sample"
        legend_path = "tests/_td/dosage_set_1/set_chr1.legend.gz"

        sample_header = ["ID", "POP", "GROUP", "SEX"]
        sample_rows = [
            ["ID1", "K", "HERO", "male"],
            ["ID2", "K", "HERO", "female"],
            ["DI5", "K", "HERO", "male"],
            ["ID3", "K", "HERO", "female"],
            ["B1", "L", "T", "female"],
        ]

        zeros = ["0"] * 6
        legend_header = [
            "id", "position", "a0", "a1", "TYPE",
            "AFR", "AMR", "EAS", "EUR", "SAS", "ALL",
        ]
        legend_rows = [
            ["1:10177:A:AC", "10177", "A", "AC", "Biallelic_INDEL",
             "0.490922844175492", "0.360230547550432", "0.336309523809524",
             "0.405566600397614", "0.494887525562372", "0.425319488817891"],
            ["rs1:1:A:T", "10505", "A", "T", "Biallelic_SNP"] + zeros,
            ["1:12:C:G", "10506", "C", "G", "Biallelic_SNP"] + zeros,
            ["rs2:2:G:A", "10511", "G", "A", "Biallelic_SNP"] + zeros,
            ["rs3:3:C:T", "10511", "G", "A", "Biallelic_SNP"] + zeros,
            ["rs4:4:C:T", "10511", "G", "A", "Biallelic_SNP"] + zeros,
        ]

        # --- plain-text sample file ---

        # Header that does not match the file.
        mismatched = Utilities.CSVFileIterator(sample_path, header="a")
        with self.assertRaises(Exceptions.MalformedInputFile):
            rows_seen(mismatched)

        # No header argument: the first row is delivered like any other.
        everything = Utilities.CSVFileIterator(sample_path)
        self.assertEqual(rows_seen(everything),
                         list(enumerate([sample_header] + sample_rows)))

        # Empty-string header passed positionally.
        blank_header = Utilities.CSVFileIterator(sample_path, "")
        self.assertEqual(rows_seen(blank_header),
                         list(enumerate(sample_rows)))

        # Only rows after the "DI5 K" row are delivered.
        skipping = Utilities.CSVFileIterator(sample_path,
                                             "DI5 K",
                                             ignore_until_header=True)
        self.assertEqual(rows_seen(skipping),
                         list(enumerate(sample_rows[3:])))

        # Header that matches the first row.
        matched = Utilities.CSVFileIterator(sample_path,
                                            header="ID POP GROUP SEX")
        self.assertEqual(rows_seen(matched), list(enumerate(sample_rows)))

        # --- gzip-compressed legend file ---

        mismatched_gz = Utilities.CSVFileIterator(legend_path,
                                                  header="a",
                                                  compressed=True)
        with self.assertRaises(Exceptions.MalformedInputFile):
            rows_seen(mismatched_gz)

        everything_gz = Utilities.CSVFileIterator(legend_path, compressed=True)
        self.assertEqual(rows_seen(everything_gz),
                         list(enumerate([legend_header] + legend_rows)))

        matched_gz = Utilities.CSVFileIterator(
            legend_path,
            header="id position a0 a1 TYPE AFR AMR EAS EUR SAS ALL",
            compressed=True)
        self.assertEqual(rows_seen(matched_gz), list(enumerate(legend_rows)))