Пример #1
0
    def loadDataFromVCF():
        '''
        reads genotypes from the VCF file named by Sample.args.dataFN and
        records them on the sample objects in Sample.sampleList.

        the sample list itself is set up by Sample.setSampleListFromVCFheader,
        which consumes the header portion of the reader (presumably leaving
        the reader positioned at the first data row -- confirm in that method).

        for each data row:
            column 2 (index 1) = physical position; rows whose position is
                                 not in Sample.tree.snpPosSet are skipped
            columns 4-5        = ref and alt alleles (used only when
                                 Sample.config.runFromVCF is set)
            genotype columns   = start at Sample.config.vcfStartCol; only the
                                 text before the first ":" is used

        missing genotypes (Sample.config.missingGenotype) are not recorded.
        returns nothing. the input file is closed even if parsing fails.
        '''

        vcfFN = Sample.args.dataFN
        vcfFile, vcfReader = utils.getCSVreader(vcfFN, delimiter='\t')

        # try/finally: a malformed header or data row raises mid-read, and the
        # file handle must still be released in that case
        try:
            Sample.setSampleListFromVCFheader(vcfReader)

            Sample.errAndLog('%sReading genotype data...\n    %s\n\n' % \
                             (utils.DASHES, vcfFN))

            # loop-invariant lookups, hoisted out of the row/sample loops
            snpPosSet = Sample.tree.snpPosSet
            vcfStartCol = Sample.config.vcfStartCol
            missingGenotype = Sample.config.missingGenotype
            runFromVCF = Sample.config.runFromVCF

            for lineList in vcfReader:
                position = int(lineList[1])
                if position not in snpPosSet:
                    continue

                genoList = lineList[vcfStartCol:]
                for sample in Sample.sampleList:
                    genotype = genoList[sample.sampleIndex].split(':')[0]

                    if genotype == missingGenotype:
                        continue

                    if runFromVCF:      # as opposed to .vcf4
                        # translate allele indexes to bases; any other
                        # genotype string is passed through unchanged
                        ref, alt = lineList[3:5]
                        if genotype == '0':
                            genotype = ref
                        elif genotype == '1':
                            genotype = alt

                    sample.addGeno(position, genotype)
        finally:
            vcfFile.close()
Пример #2
0
    def processSampleMajorTxtandCallHaplogroups():
        '''
        reads in sample major data, calling haplogroup for each line.
        each processed sample object is appended to Sample.sampleList;
        nothing is returned.

        NOTE(review): an earlier version of this docstring said genotype data
        is purged from the stored samples. no purge happens in this method;
        presumably sample.callHaplogroup() does it -- confirm.

        assumed format:
            row 1    = physical coordinates
            column 1 = sample ID

        only columns whose coordinate is in Sample.tree.snpPosSet are read.
        if Sample.args.singleSampleID is set, all other sample IDs are skipped.
        genotypes equal to Sample.config.missingGenotype are not recorded.

        the input file is closed even if parsing fails. an input with no
        header row makes next() raise (StopIteration for a standard iterator).
        '''

        genoFN = Sample.args.dataFN
        genoFile, genoReader = utils.getCSVreader(genoFN, delimiter='\t')

        # try/finally: a malformed coordinate or short row raises mid-read,
        # and the file handle must still be released in that case
        try:
            Sample.errAndLog('%sReading genotype data:\n    %s\n\n' % \
                             (utils.DASHES, genoFN))

            # determine relevant physical coordinates and corresponding columns.
            # built-in next() works on both python 2.6+ and python 3,
            # unlike the python 2-only reader.next() method.
            headerList = next(genoReader)
            allPositionsList = [int(position) for position in headerList[1:]]
            snpPosSet = Sample.tree.snpPosSet
            columnPositionTupleList = [
                (column, position)
                for column, position in enumerate(allPositionsList)
                if position in snpPosSet
            ]

            # loop-invariant lookups
            singleSampleID = Sample.args.singleSampleID
            missingGenotype = Sample.config.missingGenotype

            # read genotypes, call haplogroups
            for rowList in genoReader:
                # column indexes above are relative to the row minus its ID
                ID, genoList = rowList[0], rowList[1:]
                if singleSampleID and ID != singleSampleID:
                    continue

                sample = Sample(ID)
                for column, position in columnPositionTupleList:
                    genotype = genoList[column]
                    if genotype != missingGenotype:
                        sample.addGeno(position, genotype)

                sample.callHaplogroup()
                Sample.sampleList.append(sample)
        finally:
            genoFile.close()