def create_file(self):
    '''
    Loops through the snp files and creates a table where each row is a
    person and each column is a person. The cells contain the number of
    differences found between them, as returned by self.count_diffs for
    the two people's sorted snp/allele lists. A person compared with
    itself is recorded as 0 without calling count_diffs.

    Reads every entry that os.listdir returns for self.inputDirectory and
    writes the table as CSV to self.filename: a header row made of
    FIELD_PERSONID followed by each person id, then one row per source
    file. Progress is printed as files are processed.

    Future work - change the number so that it is a measure of
    similarity instead of a measure of difference.
    '''
    srcFileNames = os.listdir(self.inputDirectory)

    # Resolve every path and person id exactly once. The header and the
    # rows must be keyed by the same ids, so both are derived from the
    # same directory-qualified path.
    srcPaths = [os.path.join(self.inputDirectory, name)
                for name in srcFileNames]
    personIds = [vcffile.VcfFile(path).get_person_id() for path in srcPaths]
    headerFields = [FIELD_PERSONID] + personIds

    countOfSrcFiles = 0
    with open(self.filename, 'w') as destFile:
        writer = csv.DictWriter(destFile,
                                fieldnames=headerFields,
                                lineterminator='\n')
        writer.writeheader()

        for srcFileName, srcPath, personId in zip(srcFileNames, srcPaths,
                                                  personIds):
            print(srcFileName)
            srcFile = vcffile.VcfFile(srcPath)
            rowOut = {FIELD_PERSONID: personId}
            snpsAndAlleles = sorted(srcFile.get_all_snps_and_alleles())

            for compareFileName, comparePath, comparePerson in zip(
                    srcFileNames, srcPaths, personIds):
                if personId == comparePerson:
                    # Same person: no need to open the file again.
                    countDiffs = 0
                else:
                    print(' ' + compareFileName)
                    compareFile = vcffile.VcfFile(comparePath)
                    compareSnpsAndAlleles = sorted(
                        compareFile.get_all_snps_and_alleles())
                    countDiffs = self.count_diffs(snpsAndAlleles,
                                                  compareSnpsAndAlleles)
                rowOut[comparePerson] = countDiffs

            writer.writerow(rowOut)
            countOfSrcFiles += 1

    print("Wrote " + str(countOfSrcFiles) + " to " + self.filename)
def write_one_person_to_file(self, srcFileName, writer):
    '''
    Gets the alleles for the risk snps from srcFileName and passes one
    row to writer.writerow for every allele that is not '0'.

    self.riskSnps is loaded via read_from_file() first when it reports a
    length of 0. Every row written also increments self._recordCount,
    and a per-file summary line is printed at the end.
    '''
    if self.riskSnps.len() == 0:
        self.riskSnps.read_from_file()

    vcf = vcffile.VcfFile(srcFileName)
    personId = vcf.get_person_id()
    alleles = vcf.get_these_risksnps(self.riskSnps)

    # A single dict is reused for every row; only the per-snp fields
    # change between writes.
    row = {
        FIELD_INDEX: 0,
        FIELD_PERSONID: personId,
        FIELD_SNPID: 0,
        FIELD_ALLELE: 0,
        FIELD_ODDSRATIO: 0,
    }

    written = 0
    for position, allele in enumerate(alleles):
        if allele == '0':
            continue
        row[FIELD_INDEX] = position
        row[FIELD_SNPID] = self.riskSnps.snps[position]
        row[FIELD_ALLELE] = allele
        row[FIELD_ODDSRATIO] = self.riskSnps.oddsratio[position]
        writer.writerow(row)
        self._recordCount += 1
        written += 1

    print(srcFileName + ' ' + str(written) + ' risk snps')
def test_get_an_allele_number(self):
    '''get_an_allele_number should convert from a character allele to a number'''
    vcf = vcffile.VcfFile()
    # Each case: (first argument, second argument, expected allele number)
    cases = [
        ('A', 'G', '1'),
        ('G', 'G', '4'),
    ]
    for first, second, expected in cases:
        self.assertEqual(expected, vcf.get_an_allele_number(first, second))
def test_get_an_allele(self):
    '''VcfFile.get_an_allele should return 'A' for the sample file's first snp line'''
    sampleExists = os.path.exists(SAMPLEFILENAME)
    self.assertTrue(sampleExists)

    vcf = vcffile.VcfFile(SAMPLEFILENAME)
    allele = vcf.get_an_allele(vcf.get_first_snp_line())
    self.assertEqual('A', allele)
def test_get_first_snp_line(self):
    '''The snp id on the sample file's first snp line should be rs12028261'''
    sampleExists = os.path.exists(SAMPLEFILENAME)
    self.assertTrue(sampleExists)

    vcf = vcffile.VcfFile(SAMPLEFILENAME)
    snpId = vcf.get_a_snp_id(vcf.get_first_snp_line())
    self.assertEqual('rs12028261', snpId)
def test_vcffile_get_these_risksnps(self):
    '''
    VcfFile.get_these_risksnps should return one allele per requested
    snp; the first two for the sample file are expected to be '4'.
    '''
    snpIds = [
        'rs102275', 'rs3764147', 'rs7927997', 'rs415890',
        'rs4077515', 'rs3810936', 'rs2476601', 'rs3792109',
    ]
    snpAlleles = ['C', 'G', 'T', 'C', 'T', 'C', 'G', 'A']

    riskSnps = risksnps.RiskSnps()
    riskSnps.set_snps(snpIds)
    riskSnps.set_alleles(snpAlleles)

    found = vcffile.VcfFile(SAMPLEFILENAME).get_these_risksnps(riskSnps)

    self.assertEqual(riskSnps.len(), len(found))
    for position in (0, 1):
        self.assertEqual('4', found[position])
def get_one_person_from_file(self, srcFileName):
    '''
    Builds one row for the table: one person's risk alleles.

    Prints srcFileName, then returns a dict holding the person id under
    FIELD_PERSONID plus one entry per allele returned by
    get_these_risksnps, keyed by the snp at the same position in
    self.riskSnps.snps.
    '''
    print(srcFileName)
    vcf = vcffile.VcfFile(srcFileName)
    row = {FIELD_PERSONID: vcf.get_person_id()}
    for position, allele in enumerate(vcf.get_these_risksnps(self.riskSnps)):
        row[self.riskSnps.snps[position]] = allele
    return row
def test_get_these_risksnps(self):
    '''
    VcfFile.get_these_risksnps should return a list of allele numbers,
    one per risk snp.

    Note that they will usually be 4s because 4 represents the risk
    allele and in this dataset, if a person has an allele that is
    different from the reference genome, and it is for one of the risk
    snps, it is usually, but not always the risk allele.
    '''
    snpIds = [
        'rs102275', 'rs3764147', 'rs7927997', 'rs415890',
        'rs4077515', 'rs3810936', 'rs2476601', 'rs3792109',
    ]
    snpAlleles = ['C', 'G', 'T', 'C', 'T', 'C', 'G', 'A']

    riskSnps = risksnps.RiskSnps()
    riskSnps.set_snps(snpIds)
    riskSnps.set_alleles(snpAlleles)

    vcf = vcffile.VcfFile(SAMPLEFILENAME)
    numbers = vcf.get_these_risksnps(riskSnps)

    self.assertEqual(riskSnps.len(), len(numbers))
    for position in (0, 1):
        self.assertEqual('4', numbers[position])
def write_one_person_to_file(self, srcFileName, writer):
    '''
    Gets the snps and alleles from srcFileName and passes one row to
    writer.writerow for every entry whose allele is not '0'.

    Each entry is read by position: item 0 is written as the snp id and
    item 1 as the allele. A summary line with the number of rows written
    is printed at the end.
    '''
    vcf = vcffile.VcfFile(srcFileName)
    personId = vcf.get_person_id()
    entries = vcf.get_all_snps_and_alleles()

    # A single dict is reused for every row; only the snp and allele
    # fields change between writes.
    row = {FIELD_PERSONID: personId, FIELD_SNPID: 0, FIELD_ALLELE: 0}

    written = 0
    for entry in entries:
        if entry[1] == '0':
            continue
        row[FIELD_SNPID] = entry[0]
        row[FIELD_ALLELE] = entry[1]
        writer.writerow(row)
        written += 1

    print(srcFileName + ' wrote ' + str(written) + ' records to ' +
          self.filename)
def test_person_id(self):
    '''VcfFile.get_person_id should return A0024 for the sample file'''
    sampleExists = os.path.exists(SAMPLEFILENAME)
    self.assertTrue(sampleExists)

    vcf = vcffile.VcfFile(SAMPLEFILENAME)
    self.assertEqual("A0024", vcf.get_person_id())