def load(self):
    """Load dosage data from ``self.path`` into SNP collections.

    Iterates the (gzip-compressed) dosage file row by row, keeping only rows
    whose rsid appears in the weight db (when one is configured) and whose
    ref/eff alleles are in ``Utilities.VALID_ALLELES``.

    Returns:
        tuple: ``(snps, snps_by_rsid)`` — a list of ``DataSetSNP`` in file
        order (duplicates included) and a dict mapping rsid -> ``DataSetSNP``
        (last occurrence wins on duplicated rsids).
    """
    class PrediXcanCollector(object):
        """Row callback for CSVFileIterator: accumulates SNP entries."""
        def __init__(self, snps=None, snps_by_rsid=None, weight_db_logic=None):
            # None-defaults instead of mutable defaults ([]/{}) so separate
            # collectors never share the same accumulator objects.
            self.snps = snps if snps is not None else []
            self.snps_by_rsid = snps_by_rsid if snps_by_rsid is not None else {}
            self.weight_db_logic = weight_db_logic

        def __call__(self, i, components):
            rsid = components[PDTF.RSID]
            # Skip rows not covered by the weight db (when one is configured).
            if self.weight_db_logic and rsid not in self.weight_db_logic.genes_for_an_rsid:
                logging.log(5, "rsid %s not in weight db, skip it", rsid)
                return

            position = components[PDTF.POSITION]

            ref_allele = components[PDTF.ALLELE_0]
            if ref_allele not in Utilities.VALID_ALLELES:
                logging.log(9, "wrong ref allele, rsid %s is not an SNP", rsid)
                return

            eff_allele = components[PDTF.ALLELE_1]
            if eff_allele not in Utilities.VALID_ALLELES:
                logging.log(9, "wrong eff allele, rsid %s is not an SNP", rsid)
                return

            # Materialize the dosages: map() is a one-shot iterator on
            # Python 3, so a list is needed for repeatable access to the data.
            dosages = [float(x) for x in components[PDTF.FIRST_DATA_COLUMN:]]
            # dosages may be imputed
            # Should we flip based on weight_db at this point?
            snp = DataSetSNP.DataSetSNP(name=rsid, index=i, data=dosages,
                                        position=int(position),
                                        ref_allele=ref_allele,
                                        eff_allele=eff_allele)
            if snp.name in self.snps_by_rsid:
                # Duplicate rsids are logged but still appended; the by-rsid
                # map keeps only the last occurrence.
                old = self.snps_by_rsid[snp.name]
                logging.info("Duplicated rsid: (%s,%s) %s", old.name, old.position, " ".join(components))
            self.snps.append(snp)
            self.snps_by_rsid[snp.name] = snp

    loader = Utilities.CSVFileIterator(self.path, compressed=True)
    collector = PrediXcanCollector(weight_db_logic=self.weight_db_logic)
    loader.iterate(collector)
    return collector.snps, collector.snps_by_rsid
def testCSVFileIterator(self):
    """Exercise Utilities.CSVFileIterator header handling on plain and gzipped files."""

    class RecordingCallback():
        """Collects every (index, row) pair the iterator hands to it."""
        def __init__(self):
            self.lines = []

        def __call__(self, i, row):
            self.lines.append((i, row))

    SAMPLE = "tests/_td/dosage_set_1/set.sample"
    LEGEND = "tests/_td/dosage_set_1/set_chr1.legend.gz"

    def run(iterator):
        # Drive the iterator with a fresh callback and return what it saw.
        cb = RecordingCallback()
        iterator.iterate(cb)
        return cb.lines

    # A header that does not match the file's first line must raise.
    bad = RecordingCallback()
    with self.assertRaises(Exceptions.MalformedInputFile):
        Utilities.CSVFileIterator(SAMPLE, header="a").iterate(bad)

    sample_rows = [
        ["ID", "POP", "GROUP", "SEX"],
        ["ID1", "K", "HERO", "male"],
        ["ID2", "K", "HERO", "female"],
        ["DI5", "K", "HERO", "male"],
        ["ID3", "K", "HERO", "female"],
        ["B1", "L", "T", "female"],
    ]

    # No header argument: every line is yielded, including the first.
    self.assertEqual(run(Utilities.CSVFileIterator(SAMPLE)),
                     list(enumerate(sample_rows)))

    # Empty header string: the first line is consumed as the header.
    self.assertEqual(run(Utilities.CSVFileIterator(SAMPLE, "")),
                     list(enumerate(sample_rows[1:])))

    # ignore_until_header skips everything up to and including the header line.
    self.assertEqual(
        run(Utilities.CSVFileIterator(SAMPLE, "DI5 K", ignore_until_header=True)),
        list(enumerate(sample_rows[4:])))

    # Exact header match: the header row is consumed, data rows follow.
    self.assertEqual(
        run(Utilities.CSVFileIterator(SAMPLE, header="ID POP GROUP SEX")),
        list(enumerate(sample_rows[1:])))

    # Same checks against a gzip-compressed legend file.
    bad = RecordingCallback()
    with self.assertRaises(Exceptions.MalformedInputFile):
        Utilities.CSVFileIterator(LEGEND, header="a", compressed=True).iterate(bad)

    legend_rows = [
        ["id", "position", "a0", "a1", "TYPE",
         "AFR", "AMR", "EAS", "EUR", "SAS", "ALL"],
        ["1:10177:A:AC", "10177", "A", "AC", "Biallelic_INDEL",
         "0.490922844175492", "0.360230547550432", "0.336309523809524",
         "0.405566600397614", "0.494887525562372", "0.425319488817891"],
        ["rs1:1:A:T", "10505", "A", "T", "Biallelic_SNP",
         "0", "0", "0", "0", "0", "0"],
        ["1:12:C:G", "10506", "C", "G", "Biallelic_SNP",
         "0", "0", "0", "0", "0", "0"],
        ["rs2:2:G:A", "10511", "G", "A", "Biallelic_SNP",
         "0", "0", "0", "0", "0", "0"],
        ["rs3:3:C:T", "10511", "G", "A", "Biallelic_SNP",
         "0", "0", "0", "0", "0", "0"],
        ["rs4:4:C:T", "10511", "G", "A", "Biallelic_SNP",
         "0", "0", "0", "0", "0", "0"],
    ]

    self.assertEqual(run(Utilities.CSVFileIterator(LEGEND, compressed=True)),
                     list(enumerate(legend_rows)))

    self.assertEqual(
        run(Utilities.CSVFileIterator(
            LEGEND,
            header="id position a0 a1 TYPE AFR AMR EAS EUR SAS ALL",
            compressed=True)),
        list(enumerate(legend_rows[1:])))