def loadDataFromVCF():
    '''
    reads genotypes from the VCF named by Sample.args.dataFN and stores them
    on the sample objects in Sample.sampleList via sample.addGeno.

    the header is handed to Sample.setSampleListFromVCFheader before any
    data line is read. only lines whose position (second column) is in
    Sample.tree.snpPosSet are used. for each sample, the text before the
    first ':' in its genotype field is taken as the genotype, and values
    equal to Sample.config.missingGenotype are skipped.

    when Sample.config.runFromVCF is set, genotype '0' is replaced by the
    line's fourth column (ref) and '1' by its fifth column (alt); any other
    value is stored unchanged.

    returns nothing. the input file is closed even if reading fails.
    '''

    vcfFN = Sample.args.dataFN
    vcfFile, vcfReader = utils.getCSVreader(vcfFN, delimiter='\t')

    # try/finally so that a malformed line (or any other error raised
    # while reading) does not leave the file handle open
    try:
        Sample.setSampleListFromVCFheader(vcfReader)
        Sample.errAndLog('%sReading genotype data...\n %s\n\n' %
                         (utils.DASHES, vcfFN))

        for lineList in vcfReader:
            position = int(lineList[1])
            if position not in Sample.tree.snpPosSet:
                continue

            genoList = lineList[Sample.config.vcfStartCol:]
            for sample in Sample.sampleList:
                # keep only the first ':'-delimited subfield of the entry
                genotype = genoList[sample.sampleIndex].split(':')[0]
                if genotype == Sample.config.missingGenotype:
                    continue

                if Sample.config.runFromVCF:    # as opposed to .vcf4
                    # translate allele codes to the alleles on this line
                    ref, alt = lineList[3:5]
                    if genotype == '0':
                        genotype = ref
                    elif genotype == '1':
                        genotype = alt

                sample.addGeno(position, genotype)
    finally:
        vcfFile.close()
def processSampleMajorTxtandCallHaplogroups():
    '''
    reads in sample major data, calling haplogroup for each line.
    each resulting sample object is appended to Sample.sampleList;
    nothing is returned.

    if Sample.args.singleSampleID is set, rows with any other ID are skipped.
    genotypes equal to Sample.config.missingGenotype are not stored.
    the input file is closed even if reading fails.

    NOTE(review): the earlier docstring said genotype data are purged once
    the haplogroup has been called; that would have to happen inside
    sample.callHaplogroup -- confirm there.

    assumed format:
        row 1    = physical coordinates
        column 1 = sample ID
    '''

    genoFN = Sample.args.dataFN
    genoFile, genoReader = utils.getCSVreader(genoFN, delimiter='\t')

    # try/finally so that an error in any row does not leave the file open
    try:
        Sample.errAndLog('%sReading genotype data:\n %s\n\n' %
                         (utils.DASHES, genoFN))

        # determine relevant physical coordinates and corresponding columns.
        # the next() builtin is used rather than the .next() method, which
        # iterators only provide under Python 2
        headerList = next(genoReader)
        allPositionsList = [int(position) for position in headerList[1:]]
        columnPositionTupleList = [
            (column, position)
            for column, position in enumerate(allPositionsList)
            if position in Sample.tree.snpPosSet
        ]

        # read genotypes, call haplogroups
        for genoList in genoReader:
            # split off the ID so that column indices line up with
            # allPositionsList, which also excludes the first column
            ID, genoList = genoList[0], genoList[1:]
            if Sample.args.singleSampleID and ID != Sample.args.singleSampleID:
                continue

            sample = Sample(ID)
            for column, position in columnPositionTupleList:
                genotype = genoList[column]
                if genotype != Sample.config.missingGenotype:
                    sample.addGeno(position, genotype)

            sample.callHaplogroup()
            Sample.sampleList.append(sample)
    finally:
        genoFile.close()