def constructColName2IndexFromHeader(self):
    """
    2012.8.23
    Read the next row through self.next(), keep it as self.header and build
    self.col_name2index from it with utils.getColName2IndexFromHeader().

    Returns the freshly built column-name-to-index map.
    """
    header_row = self.next()
    self.header = header_row
    name2index = utils.getColName2IndexFromHeader(header_row)
    self.col_name2index = name2index
    return name2index
def run(self):
    """
    Sum read/base counts from the input files and pass them on to the db update methods.

    Every file in self.inputFnameLs is a delimited table (delimiter guessed by
    figureOutDelimiter()) whose header has the columns isq_id, isqf_id,
    read_count and base_count. Each data row is
        - added to the running read_count/base_count totals of its isq_id;
        - if its isqf_id is neither empty nor '0', passed to
          self.updateIndividualSequenceFileReadBaseCount().
    Once all files are read, the per-isq_id totals are passed to
    self.updateIndividualSequenceReadBaseCount() together with self.genomeSize.

    The two summary messages are written to stderr and, if self.logFilename is
    set, to that file as well. The session is flushed and committed only when
    self.commit is set.
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    session = self.db_vervet.session
    session.begin()

    isq_id2data = {}
    no_of_total_lines = 0
    no_of_isqf_lines = 0
    no_of_isqf_in_db = 0
    for inputFname in self.inputFnameLs:
        # "with" closes the input file even if a row fails to parse
        with open(inputFname) as inf:
            reader = csv.reader(inf, delimiter=figureOutDelimiter(inputFname))
            header = next(reader)
            colName2Index = utils.getColName2IndexFromHeader(header, skipEmptyColumn=True)
            isq_id_index = colName2Index.get('isq_id')
            isqf_id_index = colName2Index.get('isqf_id')
            read_count_index = colName2Index.get("read_count")
            base_count_index = colName2Index.get("base_count")
            for row in reader:
                isq_id = int(row[isq_id_index])
                isqf_id = row[isqf_id_index]
                read_count = int(row[read_count_index])
                base_count = int(row[base_count_index])
                if isq_id not in isq_id2data:
                    isq_id2data[isq_id] = PassingData(read_count=0, base_count=0)
                isq_id2data[isq_id].read_count += read_count
                isq_id2data[isq_id].base_count += base_count
                # an empty or '0' isqf_id means the row is not tied to one sequence file
                if isqf_id and isqf_id != '0':
                    isqf_id = int(isqf_id)
                    no_of_isqf_lines += 1
                    no_of_isqf_in_db += self.updateIndividualSequenceFileReadBaseCount(self.db_vervet, isqf_id=isqf_id, \
                        read_count=read_count, base_count=base_count)
                no_of_total_lines += 1

    logMsg1 = "%s isqf out of %s were put into db. %s lines in total.\n"%(no_of_isqf_in_db, no_of_isqf_lines, no_of_total_lines)
    sys.stderr.write(logMsg1)

    counter = 0
    real_counter = 0
    for isq_id, data in isq_id2data.items():
        real_counter += self.updateIndividualSequenceReadBaseCount(self.db_vervet, isq_id=isq_id, \
            read_count=data.read_count, base_count=data.base_count, genomeSize=self.genomeSize)
        counter += 1
    logMsg2 = "%s isq out of %s were put into db.\n"%(real_counter, counter)
    sys.stderr.write(logMsg2)

    if self.logFilename:
        # close (and thereby flush) the log file deterministically
        with open(self.logFilename, 'w') as logF:
            logF.write(logMsg1)
            logF.write(logMsg2)

    if self.commit:
        self.db_vervet.session.flush()
        self.db_vervet.session.commit()
def run(self):
    """
    2012.4.3
    Copy per-alignment mapping statistics from the input files onto IndividualAlignment entries.

    each input has this as its header:
        ['alignmentID', 'total_no_of_reads', 'perc_reads_mapped', 'perc_duplicates', 'perc_paired', 'perc_properly_paired',
        'perc_both_mates_mapped', 'perc_singletons', 'perc_mapped_to_diff_chrs']
    The column 'perc_mapq5_mapped_to_diff_chrs' is not part of that header; it is
    stored when present and left untouched when absent.

    Every data row is looked up through VervetDB.IndividualAlignment.get(alignmentID),
    updated and added to the session. The number of rows is written to stderr and,
    if self.logFilename is set, to that file. The session is flushed and committed
    only when self.commit is set.
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    session = self.db_vervet.session
    session.begin()

    # each of these columns is stored as a float on the attribute of the same name
    percColumnNameLs = ['perc_reads_mapped', 'perc_duplicates', 'perc_paired', 'perc_properly_paired', \
        'perc_both_mates_mapped', 'perc_singletons', 'perc_mapped_to_diff_chrs']
    optionalPercColumnName = 'perc_mapq5_mapped_to_diff_chrs'

    no_of_total_lines = 0
    for inputFname in self.inputFnameLs:
        # "with" closes the input file even if a row fails to parse
        with open(inputFname) as inf:
            reader = csv.reader(inf, delimiter=figureOutDelimiter(inputFname))
            header = next(reader)
            colName2Index = utils.getColName2IndexFromHeader(header, skipEmptyColumn=True)
            alignment_id_index = colName2Index.get('alignmentID')
            total_no_of_reads_index = colName2Index.get('total_no_of_reads')
            percColumnName2Index = [(columnName, colName2Index.get(columnName)) for columnName in percColumnNameLs]
            optionalIndex = colName2Index.get(optionalPercColumnName)
            if optionalIndex is not None:
                # indexing a row with None would raise TypeError, so only read the column if the file has it
                percColumnName2Index.append((optionalPercColumnName, optionalIndex))

            for row in reader:
                alignmentID = int(row[alignment_id_index])
                alignment = VervetDB.IndividualAlignment.get(alignmentID)
                for columnName, columnIndex in percColumnName2Index:
                    setattr(alignment, columnName, float(row[columnIndex]))
                # go through float() first because the count may be written like "1.2e+07" or "123.0"
                alignment.total_no_of_reads = int(float(row[total_no_of_reads_index]))
                session.add(alignment)
                no_of_total_lines += 1

    logMsg = "%s alignments in total.\n"%(no_of_total_lines)
    sys.stderr.write(logMsg)

    if self.logFilename:
        with open(self.logFilename, 'w') as logF:
            logF.write(logMsg)

    if self.commit:
        self.db_vervet.session.flush()
        self.db_vervet.session.commit()
def _parseHeader(self):
    """
    2013.07.17 bugfix, do not reset self.sample_id_ls in the beginning
    2012.3.28 add all header content into self.metaInfoLs except the last header line,
        which goes into self.sampleIDHeader
    2011-11-2 this function is run inside __init__()

    Reads self.inf line by line until the "#CHROM" line (or the first line that
    does not start with '#') and fills in:
        self.metaInfoLs, self.sampleIDHeader, self.header, self.headerWithoutHash,
        self.col_name2index, self.col_index_individual_name_ls,
        self.sample_id2index, self.sample_id_ls
    """
    # anything before the "#CHROM" line; each entry is the raw line, including '\n'
    self.metaInfoLs = []
    # column headers of the "#CHROM" line, from self.sampleStartingColumn on
    self.sampleIDHeader = []
    # ref is at column 0. "ref" must not be equal to any read_group.
    self.sample_id2index['ref'] = 0
    self.sample_id_ls.append('ref')

    for line in self.inf:
        if line[:6] != '#CHROM':
            if line[0] == '#':
                self.metaInfoLs.append(line)
                continue
            # not a header line: leave everything for parseFile or parseIter
            break

        # "#CHROM" is the last header line
        columns = line.strip().split('\t')
        self.sampleIDHeader = columns[self.sampleStartingColumn:]
        self.header = list(columns)
        self.headerWithoutHash = list(columns)
        self.headerWithoutHash[0] = 'CHROM'    # discard the #
        self.col_name2index = getColName2IndexFromHeader(self.headerWithoutHash, skipEmptyColumn=True)
        self.col_index_individual_name_ls = self._getIndividual2ColIndex(self.headerWithoutHash, self.col_name2index)
        for _, individual_name in self.col_index_individual_name_ls:
            sample_id = individual_name.strip()
            if sample_id in self.sample_id2index:
                continue
            self.sample_id2index[sample_id] = len(self.sample_id2index)
            self.sample_id_ls.append(sample_id)
        break
def processHeader(self, reader=None, extendHeader=None, chrLengthHeader = 'chrLength'):
    """
    2012.8.7 modularize so that AddHetFractionToVCFtoolsHWE could inherit

    Read the header row from reader, record it on self and return the output header.

    Arguments:
        reader: object whose next() returns the header row as a list
        extendHeader: extra column names appended to the header; None (the default)
            or an empty list appends nothing
        chrLengthHeader: name used in the "<column>_div_by_<chrLengthHeader>" columns

    Sets self.originalHeader, self.col_name2index and self.originalHeaderLength.
    If self.divideByLength is set, one "<column>_div_by_<chrLengthHeader>" column is
    appended for each original column from self.divideStartingColumn on.

    Returns the header list (original columns + extendHeader + div-by columns).
    """
    header = reader.next()
    # NOTE: self.originalHeader and the returned header are the same list object,
    # so the columns appended below show up in self.originalHeader as well;
    # self.originalHeaderLength remembers how many columns were there originally.
    self.originalHeader = header
    self.col_name2index = utils.getColName2IndexFromHeader(self.originalHeader, skipEmptyColumn=True)
    self.originalHeaderLength = len(header)
    if extendHeader:
        # guard against the default None, which list.extend() rejects with a TypeError
        header.extend(extendHeader)
    if self.divideByLength:
        for i in range(self.divideStartingColumn, self.originalHeaderLength):
            header.append("%s_div_by_%s"%(header[i], chrLengthHeader))
    return header
def getBamBaseFname2MonkeyID_WUSTLDNAData(self, inputFname, ):
    """
    2011-8-3
    Map the base name of every bam file listed in a WUSTL sample sheet to its monkey ID.

    from WUSTL the input looks like this (tab-delimited):

        #  FlowCell  Lane  Index Sequence  Library  Common Name  Bam Path  MD5
        1  64J6AAAXX  1  VCAC-2007002-1-lib1  African Green Monkey  /gscmnt/sata755/production/csf_111215677/gerald_64J6AAAXX_1.bam  /gscmnt/sata755/production/csf_111215677/gerald_64J6AAAXX_1.bam.md5
        2  64J6AAAXX  2  VCAC-2007006-1-lib1  African Green Monkey  /gscmnt/sata751/production/csf_111215675/gerald_64J6AAAXX_2.bam  /gscmnt/sata751/production/csf_111215675/gerald_64J6AAAXX_2.bam.md5

    The library column is looked up as "Library" or "library" and the bam path
    column as "Bam Path", "BAM Path", "BAM" or "bam pathway" (first one present wins).
    Rows whose library name does not match the monkey ID pattern are reported on
    stderr and skipped.

    Returns a dictionary: base name of the bam file -> monkey ID.
    """
    sys.stderr.write("Getting bamBaseFname2MonkeyID dictionary ...")
    bamBaseFname2MonkeyID = {}
    #monkeyIDPattern = re.compile(r'\w+-(\w+)-\d+-\w+')    # i.e. VCAC-2007002-1-lib1
    # 2012.5.29 i.e. VCAC-VGA00006-AGM0075-lib1, VCAC-VZC1014-AGM0055-lib1, VCAC-1996031-VRV0265-lib2a,
    # VCAC-VKD7-361-VKD7-361-lib1 (VKD7 is to be taken)
    monkeyIDPattern = re.compile(r'\w+-(\w+)-\w+-\w+')

    # "with" closes the input file even if a row is malformed
    with open(inputFname) as inf:
        reader = csv.reader(inf, delimiter='\t')
        header = next(reader)
        col_name2index = getColName2IndexFromHeader(header, skipEmptyColumn=True)

        # the sheets changed their column names over time; take the first variant present
        monkeyIDIndex = None
        for columnName in ("Library", "library"):    # "library" added 2012.6.7
            monkeyIDIndex = col_name2index.get(columnName)
            if monkeyIDIndex is not None:
                break
        bamFnameIndex = None
        for columnName in ("Bam Path", "BAM Path", "BAM", "bam pathway"):    # variants added 2012.2.9 and 2012.6.7
            bamFnameIndex = col_name2index.get(columnName)
            if bamFnameIndex is not None:
                break

        for row in reader:
            monkeyID = row[monkeyIDIndex]
            pa_search = monkeyIDPattern.search(monkeyID)
            if not pa_search:
                sys.stderr.write("Warning: could not parse monkey ID from %s. Ignore.\n"%(monkeyID))
                continue
            monkeyID = pa_search.group(1)
            bamBaseFname = os.path.split(row[bamFnameIndex])[1]
            bamBaseFname2MonkeyID[bamBaseFname] = monkeyID

    sys.stderr.write("%s entries.\n"%(len(bamBaseFname2MonkeyID)))
    return bamBaseFname2MonkeyID
def smartReadHeader(self, headerPattern=None, commentPattern=None):
    """
    2013.08.30
    Read rows until the first one whose first cell does not match the comment
    pattern; the skipped comment rows are appended to self.comment_row_list.
    If that first non-comment row matches the header pattern, it becomes
    self.header and self.col_name2index is built from it; otherwise
    self.col_name2index is set to None.

    Note:
        If an input file does not have a header, this function over-reads by one line
        (stored in self._row), so that last self._row has to be processed before
        further reading.

    Arguments:
        headerPattern: compiled regex; defaults to self.headerPattern
        commentPattern: compiled regex; defaults to self.commentPattern

    Returns self.col_name2index (None if no header row was found).
    """
    header_regex = self.headerPattern if headerPattern is None else headerPattern
    comment_regex = self.commentPattern if commentPattern is None else commentPattern

    # pass over all comment rows
    while True:
        current_row = self.next()
        if not comment_regex.search(current_row[0]):
            break
        self.comment_row_list.append(current_row)

    if header_regex.search(current_row[0]):
        self.header = current_row
        name2index = utils.getColName2IndexFromHeader(current_row)
    else:
        name2index = None
    self.col_name2index = name2index
    return name2index
def discoverFromVCF(cls, inputFname, outputFname, refFastaFname=None, VCFOutputType=2, \
                    minMinorAlleleCoverage=1/4., maxMinorAlleleCoverage=3/4.,\
                    maxNoOfReads=2., minNoOfReads=1/4., \
                    maxNoOfReadsForGenotypingError=1, maxMajorAlleleCoverage=7/8., maxNoOfReadsForAllSamples=1000,\
                    nt_set = set(['a','c','g','t','A','C','G','T']), isqID2coverage=None, defaultCoverage=10, \
                    outputDelimiter='\t',\
                    report=0, site_type=1):
    """
    Call genotypes from a VCF file by per-sample coverage and pass the call matrix to cls.outputCallMatrix().

    2011-9-2 add argument isqID2coverage, defaultCoverage
    2011-8-26 add argument site_type
        function is also more robust against missing fields etc.
    2011-7-20 copied from discoverHetsFromVCF() of vervet.src.misc
    2011-3-24 add maxMinorAlleleCoverage
        Even a heterozygote's MAC is within [minMinorAlleleCoverage, maxMinorAlleleCoverage],
        it could still be a homozygous SNP.
    2011-3-4 VCF output by GATK has a different format
        argument VCFOutputType
            1: output by samtools's vcfutils.pl (outdated: such rows are only counted, no calls are made)
            2: output by GATK
    2011-1-6 inputFname is VCF output by "vcfutils.pl varFilter" of samtools

    Arguments:
        inputFname: tab-delimited VCF file
        outputFname, refFastaFname, outputDelimiter: handed over to cls.outputCallMatrix()
        minMinorAlleleCoverage, maxMinorAlleleCoverage, maxMajorAlleleCoverage:
            multiplied by the sample's coverage to bound min(AD) and max(AD) of a
            heterozygous call; a heterozygous call outside the bounds is recorded as 'NA'
        maxNoOfReads, minNoOfReads: multiplied by the sample's coverage to bound DP;
            a sample whose DP is outside the bounds is skipped for that locus
        maxNoOfReadsForGenotypingError, maxNoOfReadsForAllSamples, nt_set: not used here
        isqID2coverage: optional dictionary, isqID -> coverage. The isqID is the
            integer in the 2nd '_'-separated field of the sample (read group) name.
        defaultCoverage: coverage of samples not found in isqID2coverage
        report: if true, progress is written to stderr every 2000 rows
        site_type: a locus is kept if its number of distinct non-NA genotypes
            is bigger than site_type-1
    """
    import csv
    import traceback
    from pymodule.utils import getColName2IndexFromHeader
    sys.stderr.write("Looking for heterozygous SNPs in %s (%s<=MAC<=%s).\n"%(os.path.basename(inputFname), \
        minMinorAlleleCoverage, maxMinorAlleleCoverage))

    read_group2col_index = {'ref':0}    #ref is at column 0. "ref" must not be equal to any read_group.
    read_group2coverage = {}    #2011-9-2
    locus_id2row_index = {}
    data_matrix = []
    refNameSet = set()

    individual_name2col_index = None
    col_name2index = None
    counter = 0
    real_counter = 0
    # "with" closes the VCF file before the matrix is output, and also if parsing fails
    with open(inputFname) as inf:
        reader = csv.reader(inf, delimiter='\t')
        for row in reader:
            if row[0] == '#CHROM':
                row[0] = 'CHROM'    #discard the #
                header = row
                col_name2index = getColName2IndexFromHeader(header, skipEmptyColumn=True)
                individual_name2col_index = cls.getIndividual2ColIndex(header, col_name2index)
                continue
            elif row[0][0] == '#':    #2011-3-4 meta-information line
                continue

            chromosome = row[0]
            refNameSet.add(chromosome)
            pos = row[1]
            current_locus = '%s_%s'%(chromosome, pos)
            refBase = row[col_name2index['REF']]
            altBase = row[col_name2index['ALT']]
            if VCFOutputType == 2:    #2011-3-4 GATK
                format_column_ls = row[col_name2index['FORMAT']].split(':')
                format_column_name2index = getColName2IndexFromHeader(format_column_ls)
                # the FORMAT column is the same for all samples of a row, so look these up once per row
                genotype_call_index = format_column_name2index.get('GT')
                depth_index = format_column_name2index.get("DP")
                allele_depth_index = format_column_name2index.get('AD')
                no_of_format_fields = len(format_column_name2index)

                data_row = ['NA']*(len(individual_name2col_index)+1)    # extra 1 for the ref
                allele2count = {}
                for read_group, individual_col_index in individual_name2col_index.items():
                    if read_group not in read_group2col_index:
                        read_group2col_index[read_group] = len(read_group2col_index)
                        #2011-9-2 figure out the coverage of a sample the first time it is seen
                        coverage = defaultCoverage
                        if isqID2coverage:
                            try:
                                isqID = int(read_group.split('_')[1])
                                coverage = isqID2coverage.get(isqID, defaultCoverage)
                            except Exception:
                                # best effort: report and fall back to the default coverage.
                                # (Exception rather than a bare except, so that Ctrl-C/sys.exit still get through)
                                sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
                                traceback.print_exc()
                                sys.stderr.write("Coverage for %s not available. use default=%s.\n"%(read_group, defaultCoverage))
                                coverage = defaultCoverage
                        read_group2coverage[read_group] = coverage

                    coverage = read_group2coverage[read_group]
                    genotype_data_ls = row[individual_col_index].split(':')
                    if len(genotype_data_ls) < no_of_format_fields:
                        # incomplete genotype field
                        continue
                    if depth_index is None or genotype_call_index is None:
                        continue
                    genotype_call = genotype_data_ls[genotype_call_index]
                    depth = int(genotype_data_ls[depth_index])
                    if depth > maxNoOfReads*coverage or depth < minNoOfReads*coverage:
                        #2011-3-29 skip. coverage too high or too low
                        continue

                    allele = 'NA'
                    if genotype_call == '0/1' or genotype_call == '1/0':
                        #heterozygous, the latter notation is never used though.
                        alleleDepthLs = [int(alleleDepth) for alleleDepth in genotype_data_ls[allele_depth_index].split(',')]
                        minorAlleleCoverage = min(alleleDepthLs)
                        majorAlleleCoverage = max(alleleDepthLs)
                        if minorAlleleCoverage <= maxMinorAlleleCoverage*coverage and \
                                minorAlleleCoverage >= minMinorAlleleCoverage*coverage and \
                                majorAlleleCoverage <= maxMajorAlleleCoverage*coverage:
                            allele = '%s%s'%(refBase, altBase)
                    elif genotype_call == './.':    #missing
                        continue
                    elif genotype_call == '1/1':
                        allele = '%s%s'%(altBase, altBase)
                    elif genotype_call == '0/0':
                        allele = '%s%s'%(refBase, refBase)

                    data_row[read_group2col_index.get(read_group)] = allele
                    if allele != 'NA':
                        allele2count[allele] = allele2count.get(allele, 0) + 1

                if len(allele2count) > site_type-1:
                    #whether polymorphic across samples or all sites in vcf
                    real_counter += 1
                    locus_id2row_index[current_locus] = len(locus_id2row_index)
                    data_matrix.append(data_row)

            counter += 1
            if counter%2000 == 0 and report:
                sys.stderr.write("%s\t%s\t%s"%("\x08"*80, counter, real_counter))

    cls.outputCallMatrix(data_matrix, refFastaFname, outputFname=outputFname, refNameSet=refNameSet, \
        read_group2col_index=read_group2col_index, \
        locus_id2row_index=locus_id2row_index, outputDelimiter=outputDelimiter)

    sys.stderr.write("%s\t%s\t%s.\n"%("\x08"*80, counter, real_counter))
def run(self):
    """
    Copy per-alignment depth statistics from the input files onto IndividualAlignment entries.

    2012.5.7 new input looks like this (tab-delimited):
        alignmentID  total_base_count  sampled_base_count  meanDepth  medianDepth  modeDepth
        100  1005506  301614  70.0441756682  9.0  8.0
        27   1005506  301614  70.0441756682  9.0  8.0
    2012.4.3 each input looks like this:
        sample_id  total  mean  granular_third_quartile  granular_median  granular_first_quartile  %_bases_above_15
        553_2_VRC_ref_GA_vs_524  2434923137  8.25  11  9  6  4.4
        Total  2434923137  8.25  N/A  N/A  N/A
        554_3_Barbados_GA_vs_524  2136011136  7.23  11  8  6  3.5
        Total  2136011136  7.23  N/A  N/A  N/A
        ...

    The columns read are alignmentID, total_base_count, meanDepth, medianDepth and
    modeDepth. Rows whose alignmentID cell is 'Total' are ignored; otherwise the
    alignment id is the integer before the first '_' of that cell.
    The number of rows stored is written to stderr and, if self.logFilename is set,
    to that file. The session is flushed and committed only when self.commit is set.
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    session = self.db_vervet.session
    session.begin()

    no_of_total_lines = 0
    for inputFname in self.inputFnameLs:
        # "with" closes the input file even if a row fails to parse
        with open(inputFname) as inf:
            reader = csv.reader(inf, delimiter=figureOutDelimiter(inputFname))
            header = next(reader)
            col_name2index = utils.getColName2IndexFromHeader(header, skipEmptyColumn=True)
            sample_id_index = col_name2index.get("alignmentID")
            total_base_count_index = col_name2index.get('total_base_count')
            mean_depth_index = col_name2index.get("meanDepth")
            median_depth_index = col_name2index.get("medianDepth")
            mode_depth_index = col_name2index.get("modeDepth")
            for row in reader:
                sample_id = row[sample_id_index]
                if sample_id == 'Total':    #ignore rows with this as sample id
                    continue
                alignment_id = int(sample_id.split("_")[0])
                total_base_count = int(row[total_base_count_index])
                mean_depth = float(row[mean_depth_index])
                median_depth = float(row[median_depth_index])
                mode_depth = float(row[mode_depth_index])

                individual_alignment = VervetDB.IndividualAlignment.get(alignment_id)
                #2012.9.17 no longer trustworthy because CalculateMedianMeanOfInputColumn skips data.
                individual_alignment.pass_qc_read_base_count = total_base_count
                individual_alignment.mean_depth = mean_depth
                individual_alignment.median_depth = median_depth
                individual_alignment.mode_depth = mode_depth
                session.add(individual_alignment)
                no_of_total_lines += 1

    logMsg = "%s alignments in total.\n"%(no_of_total_lines)
    sys.stderr.write(logMsg)

    if self.logFilename:
        # close (and thereby flush) the log file deterministically
        with open(self.logFilename, 'w') as logF:
            logF.write(logMsg)

    if self.commit:
        self.db_vervet.session.flush()
        self.db_vervet.session.commit()
def parseOneVCFRow(row, col_name2index, col_index_individual_name_ls, sample_id2index, minDepth=1,\
        dataEntryType=1):
    """
    Parse one VCF data line (already split into columns) into a PassingData record.

    2014.01.08 fix a bug that skips calls and shortens data_row.
    2012.9.6 turn pos into integer
    2012.5.10 complete representation of one locus
    2012.1.17 common snippet split out of VCFFile & VCFRecord

    Arguments:
        row: a list of input columns from one VCF file line
        col_name2index: column name -> column index; 'REF', 'ALT' and 'FORMAT' are looked up in it
        col_index_individual_name_ls: list of (column index, individual name) pairs, one per sample
        sample_id2index: individual name -> index in data_row.
            Names not yet in it are added, i.e. this dictionary is modified in place.
        minDepth: if >0, a call whose DP is below minDepth is set to 'NA'
        dataEntryType
            1: each cell is base call
            2: each cell is a dictionary {'GT': base-call, 'DP': depth}

    Returns a PassingData object holding the locus fields (chr/chromosome, pos/position,
    locus_id, quality, filter, info, format, vcf_locus_id), the parsed INFO tags
    (info_tag2value), the alleles (refBase, altBase, alleleLs, alleleNumber2Base),
    the FORMAT layout (format_column_ls, format_column_name2index), the genotype
    calls (data_row; index 0 is the reference) and their counts (genotypeCall2Count).

    NOTE(review): the indentation of this function was reconstructed. The two spots
    where the nesting could not be told for sure are marked with NOTE(review) below
    -- confirm against version control.
    """
    chromosome = row[0]
    pos = int(row[1])    #2012.9.6 turn pos into integer
    vcf_locus_id=row[2]
    quality = row[5]
    filter=row[6]
    info = row[7]
    format = row[8]
    # split the INFO column into tag=value pairs
    info_ls = info.split(';')
    info_tag2value = {}
    for info_entry in info_ls:
        try:
            tag, value = info_entry.split('=')
        except:
            # entries that do not split into exactly two parts by '=' are skipped
            #sys.stderr.write("Error in splitting %s by =.\n"%info)    ###Error in splitting DS by =.
            continue
        info_tag2value[tag] = value

    locus_id = (chromosome, pos)
    refBase = row[col_name2index['REF']]
    altBase = row[col_name2index['ALT']]

    altBaseLs = altBase.split(',')    #altBase could be just "C" or "C,G" (multi-nucleotide)
    alleleLs = [refBase] + altBaseLs
    # allele number as it appears in the GT field ('0', '1', ...) -> base; '.' is missing
    alleleNumber2Base = {'.':'NA'}
    for i in xrange(len(alleleLs)):
        alleleNumber2Base[repr(i)] = alleleLs[i]

    format_column = row[col_name2index['FORMAT']]
    format_column_ls = format_column.split(':')
    format_column_name2index = getColName2IndexFromHeader(format_column_ls)

    if dataEntryType==1:
        data_row = ['NA']*(len(col_index_individual_name_ls)+1)    # extra 1 for the ref
        data_row[0] = refBase
    else:
        data_row = [None]*(len(col_index_individual_name_ls)+1)    # extra 1 for the ref
        data_row[0] = {'GT':refBase, 'DP':-1}
    genotypeCall2Count = {}
    for individual_col_index, individual_name in col_index_individual_name_ls:
        individual_name = individual_name
        if individual_name not in sample_id2index:
            sample_id2index[individual_name] = len(sample_id2index)

        #coverage = read_group2coverage[individual_name]
        genotype_data = row[individual_col_index]
        genotype_data_ls = genotype_data.split(':')
        genotype_call_index = format_column_name2index.get('GT')
        # genotype_quality_index is only referred to by commented-out code
        genotype_quality_index = format_column_name2index.get('GQ')
        if genotype_quality_index is None:
            genotype_quality_index = format_column_name2index.get('DP')
        depth_index = format_column_name2index.get("DP")
        #GL_index = format_column_name2index.get('GL')
        genotypeCallInBase = 'NA'
        if genotype_call_index is not None and len(genotype_data_ls)>0:
            # or (genotype_call_index is not None and len(genotype_data_ls)<=genotype_call_index):
            #<len(format_column_name2index):
            #this genotype call is probably empty "./." due to no reads
            #genotype_quality = genotype_data_ls[genotype_quality_index]
            if genotype_call_index is not None and len(genotype_data_ls)>genotype_call_index:
                genotype_call = genotype_data_ls[genotype_call_index]
            else:
                genotype_call = './.'    #missing
            callData = {}
            if genotype_call!='./.' and genotype_call!='.' and genotype_call!='.|.':    #missing data
                # diploidGenotypePattern is defined outside this function; its groups 1 and 2
                # are used as keys of alleleNumber2Base
                patternSearchResult = diploidGenotypePattern.search(genotype_call)
                if patternSearchResult:
                    allele1 = alleleNumber2Base[patternSearchResult.group(1)]
                    allele2 = alleleNumber2Base[patternSearchResult.group(2)]
                    if allele1!='N' and allele2!='N':
                        genotypeCallInBase = '%s%s'%(allele1, allele2)
                # NOTE(review): assumed to be nested inside the "not missing" branch above,
                # i.e. no 'DP' is recorded for missing calls -- confirm.
                if depth_index is not None:
                    if len(genotype_data_ls)>depth_index:
                        depth = genotype_data_ls[depth_index]
                    else:
                        depth = '.'    #missing DP
                    if depth=='.':    #this means depth=0
                        depth = 0
                    else:
                        depth = int(depth)
                    if minDepth>0 and depth<minDepth:    #no read. samtools would still assign ref/ref to this individual
                        genotypeCallInBase = 'NA'    #set it to missing
                    #if depth>maxNoOfReads*coverage or depth<minNoOfReads*coverage:    #2011-3-29 skip. coverage too high or too low
                    #    continue
                    callData['DP'] = depth
            # the string below is old code kept as a no-op string statement
            """ if genotype_call=='0/1' or genotype_call =='1/0': #heterozygous, the latter notation is never used though. 
allele = '%s%s'%(refBase, altBase) GL_list = genotype_data_ls[GL_index] GL_list = GL_list.split(',') GL_list = map(float, GL_list) GL = GL_list[1] sndHighestGL = max([GL_list[0], GL_list[2]]) deltaGL = GL-sndHighestGL AD = genotype_data_ls[format_column_name2index.get('AD')] AD = map(int, AD.split(',')) minorAlleleCoverage = min(AD) majorAlleleCoverage = max(AD) if minorAlleleCoverage<=minorAlleleDepthUpperBoundCoeff*coverage and \ minorAlleleCoverage>=minorAlleleDepthLowerBoundCoeff*coverage and \ majorAlleleCoverage<=majorAlleleDepthUpperBoundCoeff*coverage: DP4_ratio = float(AD[0])/AD[1] allele = '%s%s'%(refBase, altBase) elif genotype_call=='./.' or genotype_call=='.|.': #missing allele = 'NA' elif genotype_call =='1/1' or genotype_call =='1|1': allele = '%s%s'%(altBase, altBase) elif genotype_call =='0/0' or genotype_call=='0|0': allele = '%s%s'%(refBase, refBase) """
            # NOTE(review): the statements below are assumed to be inside the
            # "if genotype_call_index is not None ..." block, where callData is defined -- confirm.
            col_index = sample_id2index.get(individual_name)
            if dataEntryType==1:
                data_row[col_index] = genotypeCallInBase
            else:
                callData['GT'] = genotypeCallInBase
                data_row[col_index] = callData
            # count how often each non-missing genotype occurs at this locus
            if genotypeCallInBase!='NA':
                if genotypeCallInBase not in genotypeCall2Count:
                    genotypeCall2Count[genotypeCallInBase] = 0
                genotypeCall2Count[genotypeCallInBase] += 1
    return PassingData(chr=chromosome, chromosome=chromosome, pos=pos, position=pos, locus_id=locus_id, quality=quality, \
        info_tag2value=info_tag2value, \
        refBase=refBase, altBase=altBase, \
        alleleLs=alleleLs, alleleNumber2Base=alleleNumber2Base, genotypeCall2Count=genotypeCall2Count, data_row=data_row,\
        info=info, format=format, filter=filter, vcf_locus_id=vcf_locus_id, \
        format_column_name2index=format_column_name2index, format_column_ls=format_column_ls)