def addLocusFromVCF2DB(self, db_vervet, inputFname=None, ref_ind_seq_id=None, locus_type_id=None, minDepth=0):
    """
    Submit every locus found in a VCF file to the database.

    Each record yielded by VCFFile.parseIter() is turned into one
    db_vervet.getLocus() call. The reference and alternative alleles are
    first resolved through self.getSequenceDBEntry(). A running count is
    shown on stderr every 500 loci, overwriting the previous count with
    backspace characters.

    Arguments:
        db_vervet: database object; its getLocus() is called once per record.
        inputFname: path of the input VCF file.
        ref_ind_seq_id: forwarded unchanged to db_vervet.getLocus().
        locus_type_id: forwarded unchanged to db_vervet.getLocus().
        minDepth: forwarded to the VCFFile constructor.

    2012-5.2
        given a VCF file, find all the loci and submit them into db
    """
    sys.stderr.write("Adding loci from %s into db ... "%(inputFname))
    vcfFile = VCFFile(inputFname=inputFname, minDepth=minDepth)

    counter = 0
    previous_reported_counter = ''
    for vcfRecord in vcfFile.parseIter():
        chromosome = vcfRecord.chr
        pos = int(vcfRecord.pos)
        # data_row[0] is the call of the reference column; the first character of its GT is the reference base
        refBase = vcfRecord.data_row[0].get("GT")[0]
        refBaseDBEntry = self.getSequenceDBEntry(db_vervet, sequence=refBase, comment=None)
        altBaseDBEntry = self.getSequenceDBEntry(db_vervet, sequence=vcfRecord.altBase, comment=None)
        # the returned entry is not needed here, the call is made for its effect on the db
        db_vervet.getLocus(chr=chromosome, start=pos, stop=pos, ref_seq=refBaseDBEntry, alt_seq=altBaseDBEntry,
            ref_ind_seq_id=ref_ind_seq_id,
            locus_type_id=locus_type_id)
        counter += 1
        if counter%500==0:
            sys.stderr.write("%s%s"%('\x08'*len(previous_reported_counter), counter))
            previous_reported_counter = repr(counter)
    # erase the last intermediate count with backspaces before printing the final one
    sys.stderr.write("%s%s"%('\x08'*len(previous_reported_counter), counter))
    sys.stderr.write(" Done.\n")
def run(self):
    """
    Convert the SNP matrix in self.inputFname into a VCF file at self.outputFname.

    Monomorphic columns are removed first, and columns below self.min_MAF
    as well when that threshold is set and positive. For every remaining
    SNP column, alleles are numbered in order of first appearance among the
    samples: allele 0 is written as REF and allele 1 (or allele 0 again when
    there is no second allele) as ALT. Genotypes that are 'NA' or not two
    characters long are written as "./.".
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    snpData = SNPData(input_fname=self.inputFname, turn_into_array=1, ignore_2nd_column=1)
    snpData = SNPData.removeMonomorphicCols(snpData, NA_set=set())
    if self.min_MAF and self.min_MAF > 0:
        snpData = SNPData.removeColsByMAF(snpData, min_MAF=self.min_MAF, NA_set=set())

    self.writer = VCFFile(outputFname=self.outputFname, openMode='w')
    self.writer.makeupHeaderFromSampleIDList(sampleIDList=snpData.row_id_ls)
    self.writer.writeMetaAndHeader()

    noOfRecords = 0
    for columnIndex, snp_id in enumerate(snpData.col_id_ls):
        # snp_id looks like chromosome_position[_...]
        chromosome, start = snp_id.split('_')[:2]
        diNucleotideLs = utils.dict_map(number2di_nt, snpData.data_matrix[:, columnIndex])

        nucleotide2Number = {}
        number2Nucleotide = {}
        vcfCallLs = []
        for diNucleotide in diNucleotideLs:
            if diNucleotide != 'NA' and len(diNucleotide) == 2:
                for nucleotide in diNucleotide:
                    if nucleotide in nucleotide2Number:
                        continue
                    # alleles get 0, 1, 2 ... in order of first appearance
                    nextNumber = len(nucleotide2Number)
                    nucleotide2Number[nucleotide] = nextNumber
                    number2Nucleotide[nextNumber] = nucleotide
                vcfCall = "%s/%s" % (nucleotide2Number[diNucleotide[0]],
                    nucleotide2Number[diNucleotide[1]])
            else:
                vcfCall = "./."
            vcfCallLs.append(vcfCall)

        refAllele = number2Nucleotide[0]
        # without a 2nd allele, the alt column repeats the ref allele
        altAllele = number2Nucleotide.get(1, refAllele)
        fixedColumnLs = [chromosome, start, ".", refAllele, altAllele, 999, 'PASS', "DP=100", "GT"]
        self.writer.writerow(fixedColumnLs + vcfCallLs)
        noOfRecords += 1
    sys.stderr.write(" %s records.\n" % (noOfRecords))
    self.writer.close()
def countHomoHetCallsForEachSampleFromVCF(self, inputFname, outputFname, chromosome=None, chrLength=None, minDepth=1):
    """
    Count homozygous-reference, homozygous-alternative and heterozygous calls per sample in a VCF file.

    One tab-delimited row per sample (sorted by sample ID) is written to
    outputFname with these columns: #sampleID, chromosome, length, NoOfTotal,
    NoOfHomoRef, NoOfHomoAlt, FractionOfHomoAlt, NoOfHet, FractionOfHet.
    Both fractions are -1 for a sample without any call.

    Arguments:
        inputFname: path of the input VCF file.
        outputFname: path of the tab-delimited output file.
        chromosome: copied verbatim into the "chromosome" output column.
        chrLength: copied verbatim into the "length" output column.
        minDepth: forwarded to the VCFFile constructor.

    2011-11-2
        given a VCF file, count the number of homo-ref, homo-alt, het calls
    """
    sys.stderr.write("Count the number of homozygous-ref/alt & het from %s .\n"%(inputFname))
    vcfFile = VCFFile(inputFname=inputFname, minDepth=minDepth)

    # key is sampleID, value is a list of 3 numbers: NoOfHomoRef, NoOfHomoAlt, NoOfHet
    sampleID2data = {}
    for vcfRecord in vcfFile.parseIter():
        # data_row[0] is the reference column; the first character of its GT is the reference base
        refBase = vcfRecord.data_row[0].get("GT")[0]
        for sample_id, sample_index in vcfFile.sample_id2index.items():
            if sample_id=='ref':    #ignore the reference
                continue
            if sample_id not in sampleID2data:
                sampleID2data[sample_id] = [0, 0, 0]
            if not vcfRecord.data_row[sample_index]:    #None for this sample
                continue
            callForThisSample = vcfRecord.data_row[sample_index].get('GT')
            if not callForThisSample or callForThisSample=='NA':
                continue
            firstAllele = callForThisSample[0]
            secondAllele = callForThisSample[1]
            if firstAllele==refBase and secondAllele==refBase:    #homozygous reference allele
                sampleID2data[sample_id][0] += 1
            elif firstAllele==secondAllele and firstAllele!=refBase:    #homozygous alternative allele
                sampleID2data[sample_id][1] += 1
            elif firstAllele!=secondAllele:    #heterozygous
                sampleID2data[sample_id][2] += 1

    import csv
    # the with-block guarantees the output is flushed and closed, even upon error
    with open(outputFname, 'w') as outputFile:
        writer = csv.writer(outputFile, delimiter='\t')
        writer.writerow(['#sampleID', 'chromosome', 'length', "NoOfTotal", 'NoOfHomoRef', 'NoOfHomoAlt',
            "FractionOfHomoAlt", 'NoOfHet', "FractionOfHet"])
        for sampleID in sorted(sampleID2data):
            count_data = sampleID2data.get(sampleID)
            noOfHomoRef, noOfHomoAlt, noOfHet = count_data[:3]
            no_of_calls = float(sum(count_data))
            if no_of_calls>0:
                fractionOfHomoAlt = noOfHomoAlt/no_of_calls
                fractionOfHet = noOfHet/no_of_calls
            else:
                # -1 marks "no call at all for this sample"
                fractionOfHomoAlt = -1
                fractionOfHet = -1
            writer.writerow([sampleID, chromosome, chrLength, int(no_of_calls), noOfHomoRef, noOfHomoAlt,
                fractionOfHomoAlt, noOfHet, fractionOfHet])
    sys.stderr.write("Done.\n")
def run(self):
    """
    Write the genotype calls of one stored genotype (VCF) file as a tab-delimited matrix.

    The genotype file is looked up in self.db_vervet by self.genotypeMethodID,
    self.chromosome and self.format, and its path is resolved against
    self.data_dir (falling back to self.db_vervet.data_dir). The output,
    self.outputFname, has one row per locus: chromosome, position, the GT of
    the reference column, then one GT per sample ('NA' when the call is
    missing). Sample columns are labelled "<sampleID> <scientific name>", the
    name coming from self.db_taxonomy.

    Exits with code 2 if the database has no matching genotype file. If the
    file is not on disk, a warning is written to stderr and nothing is output.

    2012.7.13
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    session = self.db_vervet.session
    session.begin()

    if not self.data_dir:
        self.data_dir = self.db_vervet.data_dir
    data_dir = self.data_dir

    genotypeFile = self.db_vervet.getGenotypeFile(genotype_method_id=self.genotypeMethodID,
        chromosome=self.chromosome, format=self.format)
    if not genotypeFile:
        sys.stderr.write("Error: genotype_method_id %s, chromosome %s does not exist.\n"%(self.genotypeMethodID, self.chromosome))
        sys.exit(2)

    filename = os.path.join(data_dir, genotypeFile.path)
    if not os.path.isfile(filename):
        sys.stderr.write("Warning: genotype file %s does not exist, nothing outputted.\n"%(filename))
        return

    from pymodule import VCFFile
    vcfFile = VCFFile(inputFname=filename, minDepth=0)
    sampleIDList = vcfFile.getSampleIDList()

    header = ['Chromosome', 'position', 'ref']
    for sampleID in sampleIDList:
        individualAlignment = self.db_vervet.parseAlignmentReadGroup(sampleID).individualAlignment
        #2012.8.29 get scientific name from the taxonomy db
        scientificName = self.db_taxonomy.returnScientificNameGivenTaxID(
            individualAlignment.individual_sequence.individual.tax_id)
        header.append('%s %s'%(sampleID, scientificName))

    counter = 0
    # the with-block guarantees the output is flushed and closed, even upon error
    with open(self.outputFname, 'w') as outputFile:
        writer = csv.writer(outputFile, delimiter='\t')
        writer.writerow(header)
        for vcfRecord in vcfFile:
            # vcfRecord.data_row is a list of dictionaries {'GT': base-call, 'DP': depth}, or None if missing.
            # Index 0 is one extra entry for the reference individual (from the ref column of VCF),
            # so sample i sits at index i+1.
            refCall = vcfRecord.data_row[0]
            data_row = [vcfRecord.chr, vcfRecord.pos, refCall['GT']]
            for sampleIndex in range(1, len(sampleIDList)+1):
                vcfCall = vcfRecord.data_row[sampleIndex]
                if vcfCall:
                    data_row.append(vcfCall['GT'])
                else:
                    data_row.append('NA')
            writer.writerow(data_row)
            counter += 1
    sys.stderr.write("%s loci outputted.\n"%(counter))
def outputPedigreeForPlink(self, DG=None, db_vervet=None, inputFname=None, outputFname=None,
        treatEveryOneIndependent=None, sampleIDFormat=1,
        addUngenotypedDuoParents=False):
    """
    Output the pedigree of the samples in a VCF file in plink format.

    http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml
    Either space or tab could be the delimiter; space is used here. Each row
    is: family ID (always 1), individual ID, father ID, mother ID, sex code, 1.
    Rows follow the sample order of the VCF file. A parent ID of 0 means
    "no parent".

    Arguments:
        DG: pedigree graph (presumably a networkx DiGraph, confirm with caller).
            The predecessors of a node are its parents.
        db_vervet: database object used to resolve sample IDs and individuals.
        inputFname: VCF file whose samples constrain the output.
        outputFname: path of the output file.
        treatEveryOneIndependent: if true, both parents of every individual
            are set to 0, everyone becomes a founder.
        sampleIDFormat: forwarded to self.getProperSampleIDForPlinkOutput().
            1: individual.ucla_id
            2: input sampleID argument (alignment.read_group)
        addUngenotypedDuoParents: for mendel error detection. If an ungenotyped
            parent in a duo (the other is genotyped) is not present in the
            genotype file (PED/TPED/BED), plink won't look for its mendel
            inconsistency. If true, those ungenotyped parents are appended as
            extra founder rows.

    Raises:
        ValueError: if two samples in the VCF file map to the same alignment
            or to the same individual.

    Exits with code 3 if an individual has more than two parents in DG.

    2013.07.18 added argument addUngenotypedDuoParents
    2013.06.24 added argument sampleIDFormat
    2013.1.2 copied from run()
    """
    sys.stderr.write("Outputting pedigree constrained by %s to %s, treatEveryOneIndependent=%s, sampleIDFormat=%s, addUngenotypedDuoParents=%s ... "%\
        (inputFname, outputFname, treatEveryOneIndependent, sampleIDFormat, addUngenotypedDuoParents))
    vcfFile = VCFFile(inputFname=inputFname)

    alignmentLs = []
    alignmentID2sampleData = {}
    individual_id2alignment = {}
    for sampleID in vcfFile.getSampleIDList():
        alignment = db_vervet.parseAlignmentReadGroup(sampleID).individualAlignment
        alignmentLs.append(alignment)
        if alignment.id in alignmentID2sampleData:
            errorMessage = "Error: alignment %s (%s) for sample %s already in alignmentID2sampleData, with sampleID=%s.\n"%\
                (alignment.id, alignment.read_group, sampleID,
                alignmentID2sampleData.get(alignment.id).sampleID)
            sys.stderr.write(errorMessage)
            raise ValueError(errorMessage)
        alignmentID2sampleData[alignment.id] = PassingData(sampleID=sampleID, alignment=alignment)

        individual_id = alignment.individual_sequence.individual_id
        if individual_id in individual_id2alignment:
            # report the earlier alignment of the same individual, not the one just registered above
            previousAlignment = individual_id2alignment[individual_id]
            errorMessage = "Error: individual %s of alignment %s (%s), sample %s, already in individual_id2alignment, with alignment %s (%s).\n"%\
                (individual_id, alignment.id, alignment.read_group, sampleID,
                previousAlignment.id, previousAlignment.read_group)
            sys.stderr.write(errorMessage)
            raise ValueError(errorMessage)
        individual_id2alignment[individual_id] = alignment

    def getSampleID(individual, **keywords):
        # shortcut: every sample-ID lookup below shares these three arguments
        return self.getProperSampleIDForPlinkOutput(individual=individual,
            alignmentID2sampleData=alignmentID2sampleData,
            individual_id2alignment=individual_id2alignment,
            sampleIDFormat=sampleIDFormat, **keywords)

    individual_id2individual = {}   # cache passed to self.getIndividual()
    ungenotypedNodeID2Data = {}
    counter = 0
    family_id = 1   #all in one family
    currentNoOfFakes = 0
    noOfUngenotypedParentsOutputted = 0
    # the with-block guarantees the output is flushed and closed, even upon error or sys.exit()
    with open(outputFname, 'w') as outputFile:
        writer = csv.writer(outputFile, delimiter=' ')
        for alignment in alignmentLs:
            nodeID = alignment.individual_sequence.individual_id
            individual = self.getIndividual(db_vervet=db_vervet, individual_id=nodeID,
                individual_id2individual=individual_id2individual)
            # default: founder, or individual absent from the pedigree graph
            father_id = 0
            mother_id = 0
            if nodeID in DG:
                # list() so that indexing works whether predecessors() returns a list or an iterator
                parents = list(DG.predecessors(nodeID))
                if len(parents)==2:
                    parent1 = self.getIndividual(db_vervet=db_vervet, individual_id=parents[0],
                        individual_id2individual=individual_id2individual)
                    parent2 = self.getIndividual(db_vervet=db_vervet, individual_id=parents[1],
                        individual_id2individual=individual_id2individual)
                    parent1Sex = parent1.codeSexInNumber()
                    parent2Sex = parent2.codeSexInNumber()
                    #2013.07.18 one and only one genotyped, then add the ungenotyped one as an ungenotyped duo parent
                    if parents[0] not in individual_id2alignment and parents[1] in individual_id2alignment:
                        if parents[0] not in ungenotypedNodeID2Data:
                            ungenotypedNodeID2Data[parents[0]] = PassingData(individualDBEntry=parent1, sex=parent1Sex)
                    elif parents[0] in individual_id2alignment and parents[1] not in individual_id2alignment:
                        if parents[1] not in ungenotypedNodeID2Data:
                            ungenotypedNodeID2Data[parents[1]] = PassingData(individualDBEntry=parent2, sex=parent2Sex)
                    if parent1Sex==2:
                        #swap the father and mother row
                        parent1, parent2 = parent2, parent1
                    father_id = getSampleID(parent1)
                    mother_id = getSampleID(parent2)
                elif len(parents)==1:
                    parent1 = self.getIndividual(db_vervet=db_vervet, individual_id=parents[0],
                        individual_id2individual=individual_id2individual)
                    parent1Sex = parent1.codeSexInNumber()
                    if parent1Sex==2:
                        # the known parent is the mother, the missing one is the father
                        parent2Sex = 1
                        mother_id = getSampleID(parent1)
                    else:
                        parent2Sex = 2
                        father_id = getSampleID(parent1)
                    #2013.07.18 parent1 (parents[0]) has to be in individual_id2alignment (genotyped) in order
                    # for the other to qualify as an ungenotyped parent in a duo
                    if parents[0] in individual_id2alignment:
                        fakeParentData = self.generateFakeIndividualID(pedigreeGraph=DG, currentNoOfFakes=currentNoOfFakes)
                        currentNoOfFakes = fakeParentData.currentNoOfFakes
                        fakeParent2ID = fakeParentData.individualID
                        if fakeParent2ID not in individual_id2alignment:
                            if fakeParent2ID not in ungenotypedNodeID2Data:
                                ungenotypedNodeID2Data[fakeParent2ID] = PassingData(individualDBEntry=None, sex=parent2Sex)
                elif len(parents)>2:
                    sys.stderr.write("Error: number of parents (%s) for %s is %s.\n"%(repr(parents), nodeID, len(parents)))
                    sys.exit(3)
            if treatEveryOneIndependent:
                #force the parents to be 0, everyone becomes founders
                father_id = 0
                mother_id = 0
            individual_id = getSampleID(individual)
            writer.writerow([family_id, individual_id, father_id, mother_id,
                individual.codeSexInNumber(), 1])
            counter += 1

        if addUngenotypedDuoParents:
            for ungenotypedNodeID, pdata in ungenotypedNodeID2Data.items():
                individual_id = getSampleID(pdata.individualDBEntry, defaultSampleID=ungenotypedNodeID)
                writer.writerow([family_id, individual_id, 0, 0, pdata.sex, 1])
                noOfUngenotypedParentsOutputted += 1
    sys.stderr.write("%s individuals and %s ungenotyped duo-parents outputted, number of fake parents %s, addUngenotypedDuoParents=%s.\n"%\
        (counter, noOfUngenotypedParentsOutputted, currentNoOfFakes, addUngenotypedDuoParents))
def splitNamVCFIntoMultipleSingleChrVCF(self, inputFname, outputDir, minDepth=1, includeIndels=False, maxContigNumber=1000):
    """
    Split one multi-contig VCF file into one VCF file per contig, <outputDir>/<contigID>.vcf.

    Arguments:
        inputFname: path of the input VCF file.
        outputDir: folder that receives the per-contig VCF files.
        minDepth: forwarded to the VCFFile constructor; in addition, any call
            that is missing or whose DP is below it is written as "./.".
        includeIndels: if false, records whose ref or alt allele is not
            exactly one base long are skipped.
        maxContigNumber: if true-ish, contigs whose number (parsed by
            self.contig_number_pattern) exceeds it are skipped.

    2012.5.10
        Two things in Nam's VCF file are to be modified.
            1. extract VRC UCLAID from its sample ID
            2. replace vervet1_scaffolds_Contig137 with simply "Contig137"
    """
    sys.stderr.write("Splitting %s into single-chromosome VCF files in %s ...\n"%(inputFname, outputDir))
    from pymodule.VCFFile import VCFFile

    vcfFile = VCFFile(inputFname=inputFname, minDepth=minDepth)
    #replace Variant/PooledTissues/2002053/genome.algn.split.part17/5tissues.pooled.rmdup.bam with just monkey ID
    newSampleIDHeader = []
    for sampleID in vcfFile.sampleIDHeader:
        search_result = self.UCLAID_Pattern.search(sampleID)
        newSampleIDHeader.append(search_result.group('UCLAID'))
    #new header for every output contig
    newHeader = vcfFile.header[:vcfFile.sampleStartingColumn] + newSampleIDHeader

    chr2outVCFFile = {}
    counter = 0
    real_counter = 0
    try:
        for vcfRecord in vcfFile.parseIter():
            counter += 1
            if not includeIndels and (len(vcfRecord.refBase)!=1 or len(vcfRecord.altBase)!=1):
                #it's an indel if refBase or altBase is not just one base
                continue
            contigID = self.contig_id_pattern.search(vcfRecord.chr).group('contigID')
            if maxContigNumber:
                contigNumber = int(self.contig_number_pattern.search(contigID).group('contigNumber'))
                if contigNumber>maxContigNumber:
                    continue
            real_counter += 1
            vcfRecord.chr = contigID
            if contigID not in chr2outVCFFile:
                # first record of this contig: open its output file and write the header
                outputFname = os.path.join(outputDir, '%s.vcf'%(contigID))
                outVCFFile = VCFFile(outputFname=outputFname)
                outVCFFile.metaInfoLs = vcfFile.metaInfoLs
                outVCFFile.header = newHeader
                outVCFFile.writeMetaAndHeader()
                chr2outVCFFile[contigID] = outVCFFile
            outVCFFile = chr2outVCFFile.get(contigID)
            # set genotype whose depth is below minDepth to ./. (=missing)
            for i in range(1, len(vcfRecord.data_row)):    #[0] is the ref base
                callData = vcfRecord.data_row[i]
                if callData is None or callData.get('DP',0)<minDepth:
                    sampleColumnIndex = i+vcfFile.sampleStartingColumn-1
                    vcfRecord.row[sampleColumnIndex] = './.'
            outVCFFile.writeVCFRecord(vcfRecord)
    finally:
        # close the input and all output files, even if a record failed half-way
        vcfFile.close()
        for outVCFFile in chr2outVCFFile.values():
            outVCFFile.close()
    sys.stderr.write("%s (out of %s) loci from %s chromosomes.\n"%(real_counter, counter, len(chr2outVCFFile)))
def convertAlignmentReadGroup2UCLAIDInVCF(self, inputFname, outputFname, minDepth=1, includeIndels=False,
        maxContigNumber=None):
    """
    Copy a VCF file while replacing each sample ID (an alignment read group) with the individual's code.

    The new sample IDs are the individual_code values returned by
    VervetDB.VervetDB.parseAlignmentReadGroupWithoutDB().

    Arguments:
        inputFname: path of the input VCF file.
        outputFname: path of the output VCF file.
        minDepth: forwarded to the VCFFile constructor; in addition, any call
            that is missing or whose DP is below it is written as "./.".
        includeIndels: if false, records whose ref or alt allele is not
            exactly one base long are skipped.
        maxContigNumber: if true-ish, records on contigs whose number (parsed
            by self.contig_number_pattern) exceeds it are skipped.

    2012.5.10
    """
    sys.stderr.write("Converting alignment read groups in %s to UCLA IDs, output to %s ...\n"%(inputFname, outputFname))
    vcfFile = VCFFile(inputFname=inputFname, minDepth=minDepth)
    #replace Variant/PooledTissues/2002053/genome.algn.split.part17/5tissues.pooled.rmdup.bam with just monkey ID
    newSampleIDHeader = []
    for sampleID in vcfFile.sampleIDHeader:
        readGroupData = VervetDB.VervetDB.parseAlignmentReadGroupWithoutDB(sampleID)
        newSampleIDHeader.append(readGroupData.individual_code)
    newHeader = vcfFile.header[:vcfFile.sampleStartingColumn] + newSampleIDHeader

    counter = 0
    real_counter = 0
    outVCFFile = VCFFile(outputFname=outputFname)
    try:
        outVCFFile.metaInfoLs = vcfFile.metaInfoLs
        outVCFFile.header = newHeader
        outVCFFile.writeMetaAndHeader()
        for vcfRecord in vcfFile.parseIter():
            counter += 1
            if not includeIndels and (len(vcfRecord.refBase)!=1 or len(vcfRecord.altBase)!=1):
                #it's an indel if refBase or altBase is not just one base
                continue
            if maxContigNumber:
                contigNumber = int(self.contig_number_pattern.search(vcfRecord.chr).group('contigNumber'))
                if contigNumber>maxContigNumber:
                    continue
            real_counter += 1
            # set genotype whose depth is below minDepth to ./. (=missing)
            for i in range(1, len(vcfRecord.data_row)):    #[0] is the ref base
                callData = vcfRecord.data_row[i]
                if callData is None or callData.get('DP',0)<minDepth:
                    sampleColumnIndex = i+vcfFile.sampleStartingColumn-1
                    vcfRecord.row[sampleColumnIndex] = './.'
            outVCFFile.writeVCFRecord(vcfRecord)
    finally:
        # close both files, even if a record failed half-way
        vcfFile.close()
        outVCFFile.close()
    sys.stderr.write("%s (out of %s) loci.\n"%(real_counter, counter))
def extractSamples(self, db_vervet=None, inputFname=None, outputFname=None,
        tax_id_set=None, site_id_set=None, country_id_set=None,
        min_coverage=None, max_coverage=None, outputFormat=1, is_contaminated=None,
        **keywords):
    """
    Keep only the samples of a VCF file whose alignment passes db_vervet.filterAlignments().

    outputFormat selects what is written to outputFname:
        1: a VCF file restricted to the selected sample columns.
        2: a text file, header line "sampleID", then one selected sample ID per line.
        3: same as 2 but without the header line.
    Any other value writes nothing. Exits with code 3 as soon as one sample
    has no individualAlignment in the database.

    2013.07.03 added argument is_contaminated (whether to fetch contaminated samples or not)
    2013.04.30 added argument min_coverage, max_coverage
    2012.10.10 added argument outputFormat.
    2012.10.5
    """
    noOfSites = getattr(site_id_set, '__len__', returnZeroFunc)()
    noOfCountries = getattr(country_id_set, '__len__', returnZeroFunc)()
    noOfTaxonomies = getattr(tax_id_set, '__len__', returnZeroFunc)()
    sys.stderr.write("Extracting samples from %s, %s sites & %s countries & %s taxonomies, min_coverage=%s, max_coverage=%s, outputFormat=%s, is_contaminated=%s ...\n"%\
        (inputFname, noOfSites, noOfCountries, noOfTaxonomies, min_coverage, max_coverage,
        outputFormat, is_contaminated))

    vcfFile = VCFFile(inputFname=inputFname)
    firstSampleColumn = vcfFile.sampleStartingColumn
    originalHeader = vcfFile.header
    originalWidth = len(originalHeader)
    #anything before the samples stays the same
    selectedHeader = originalHeader[:firstSampleColumn]

    selectedCount = 0
    #column index of each selected sample => its ID
    col_index2sampleID = {}
    for col_index, individual_name in vcfFile.get_col_index_individual_name_ls():
        individualAlignment = db_vervet.parseAlignmentReadGroup(individual_name).individualAlignment
        if individualAlignment is None:
            sys.stderr.write("Warning: no individualAlignment for sample %s.\n"%(individual_name))
            sys.exit(3)
        passedAlignmentLs = db_vervet.filterAlignments(alignmentLs=[individualAlignment],
            min_coverage=min_coverage, max_coverage=max_coverage,
            individual_site_id=None, sequence_filtered=None,
            individual_site_id_set=site_id_set, mask_genotype_method_id=None,
            parent_individual_alignment_id=None, country_id_set=country_id_set,
            tax_id_set=tax_id_set, excludeContaminant=False,
            is_contaminated=is_contaminated, excludeTissueIDSet=None,
            local_realigned=None, reduce_reads=None, report=False)
        if not passedAlignmentLs:
            #empty, this sample did not pass the filter
            continue
        selectedHeader.append(individual_name)
        selectedCount += 1
        col_index2sampleID[col_index] = individual_name

    snpCount = 0
    if outputFormat==1:
        outVCFFile = VCFFile(outputFname=outputFname)
        outVCFFile.metaInfoLs = vcfFile.metaInfoLs
        outVCFFile.header = selectedHeader
        outVCFFile.writeMetaAndHeader()
        #selected sample columns, in their original left-to-right order
        selectedColumnLs = [columnIndex for columnIndex in range(firstSampleColumn, originalWidth)
            if columnIndex in col_index2sampleID]
        for vcfRecord in vcfFile:
            outputRow = vcfRecord.row[:vcfFile.sampleStartingColumn]
            outputRow.extend([vcfRecord.row[columnIndex] for columnIndex in selectedColumnLs])
            outVCFFile.writer.writerow(outputRow)
            snpCount += 1
        outVCFFile.close()
    elif outputFormat in [2,3]:
        with open(outputFname, 'w') as outf:
            if outputFormat==2:
                outf.write("sampleID\n")
            for sampleID in col_index2sampleID.values():
                outf.write("%s\n"%(sampleID))
    vcfFile.close()
    sys.stderr.write("%s samples X %s SNPs.\n"%(selectedCount, snpCount))
def replicateVCFGenotypeColumns(self, inputFname, outputFname=None, replicateIndividualTag=None, sampleID2FamilyCount=None,
        minDepth=0):
    """
    Copy a VCF file while duplicating the genotype column of samples that occur in several families.

    Every original sample column is renamed <sampleID><replicateIndividualTag>1.
    A sample whose count in sampleID2FamilyCount is k gets k-1 extra columns,
    appended after all original ones and named with replicate numbers 2..k;
    each extra column repeats the genotype of the sample's first column.
    Genotype fields that are exactly "./." are expanded to one missing value
    per FORMAT field ("./." for GT, ".,.,." for PL, "." otherwise).

    2013.09.15 expand the NA genotype for TrioCaller
    2012.10.5 remove argument sampleStartingColumn
    2012.5.10 VCFFile has been changed considerably and can act as a writer now.
    2012.3.29
    """
    sys.stderr.write("Replicating some genotype columns in %s ...\n"%(inputFname))
    vcfFile = VCFFile(inputFname=inputFname, minDepth=minDepth)
    outVCFFile = VCFFile(outputFname=outputFname)
    outVCFFile.metaInfoLs = vcfFile.metaInfoLs

    #rename the sample columns of the header line, first copy of each sample
    originalHeader = vcfFile.header
    originalWidth = len(originalHeader)
    firstSampleColumn = vcfFile.sampleStartingColumn
    expandedHeader = originalHeader[:firstSampleColumn]    #anything before the samples stays the same
    sampleID2ColumnIndexLs = {}
    sampleCount = 0
    for columnIndex in range(firstSampleColumn, originalWidth):
        sample_id = originalHeader[columnIndex].strip()
        expandedHeader.append('%s%s%s'%(sample_id, replicateIndividualTag, 1))
        sampleID2ColumnIndexLs[sample_id] = [columnIndex]
        sampleCount += 1

    #append one extra column header for each additional occurrence of a sample
    appendedColumn2sampleID = {}
    for sample_id, familyCount in sampleID2FamilyCount.items():
        columnIndexLs = sampleID2ColumnIndexLs.get(sample_id)
        for _ in range(1, familyCount):
            if columnIndexLs is None:
                #sample is not in this VCF file
                break
            sampleCount += 1
            appendedColumn = len(expandedHeader)
            appendedColumn2sampleID[appendedColumn] = sample_id
            columnIndexLs.append(appendedColumn)
            expandedHeader.append("%s%s%s"%(sample_id, replicateIndividualTag, len(columnIndexLs)))

    outVCFFile.header = expandedHeader
    outVCFFile.writeMetaAndHeader()
    expandedWidth = len(expandedHeader)

    #for each appended column, the original column its genotype is copied from
    sourceColumnLs = []
    for appendedColumn in range(originalWidth, expandedWidth):
        sample_id = appendedColumn2sampleID.get(appendedColumn)
        sourceColumnLs.append(sampleID2ColumnIndexLs.get(sample_id)[0])

    missingValueByField = {'GT': './.', 'PL': '.,.,.'}    #PL is for TrioCaller
    snpCount = 0
    for vcfRecord in vcfFile.parseIter():
        data_row = vcfRecord.row
        #2013.09.13 replace all "./." with full NA formatting, i.e. "./.:.:.:.", depending on the fields of the "format" column
        fullMissingCall = None
        for columnIndex in range(vcfRecord.sampleStartingColumn, len(data_row)):
            if data_row[columnIndex] != './.':
                continue
            if fullMissingCall is None:
                #built at most once per record
                fullMissingCall = ':'.join([missingValueByField.get(format_field, '.')
                    for format_field in vcfRecord.format_column_ls])
            data_row[columnIndex] = fullMissingCall
        #add more genotype copies for those extra columns
        data_row.extend([data_row[sourceColumn] for sourceColumn in sourceColumnLs])
        outVCFFile.writer.writerow(data_row)
        snpCount += 1
    outVCFFile.close()
    vcfFile.close()
    sys.stderr.write("%s samples X %s SNPs.\n"%(sampleCount, snpCount))
def run(self):
    """
    Build the genotype-calling workflow.

    self.run_type selects what is added to the workflow returned by self.setup_run():
        1: genotype-call jobs, via self.addGenotypeCallJobs().
        2 or 3: TrioCaller jobs on the VCF files registered from self.inputDir,
            via self.addTrioCallerJobsONVCFFiles(). The alignments come from the
            sample IDs of the first registered VCF file.
    Any other run_type adds no jobs. self.end_run() is called last in all cases.

    2011-7-11
    """
    #2013.06.21 set this before setup_run(); when off, chr2IntervalDataLs is not constructed
    self.needSplitChrIntervalData = (self.run_type == 1)

    pdata = self.setup_run()
    workflow = pdata.workflow
    db_vervet = self.db

    if self.run_type in [2, 3]:
        inputData = self.registerAllInputFiles(
            workflow,
            self.inputDir,
            input_site_handler=self.input_site_handler,
            checkEmptyVCFByReading=self.checkEmptyVCFByReading,
            pegasusFolderName=self.pegasusFolderName,
            maxContigID=self.maxContigID,
            minContigID=self.minContigID,
            db_vervet=db_vervet,
            needToKnowNoOfLoci=abs(1-self.notToKnowNoOfLoci),
            minNoOfLociInVCF=self.minNoOfLociInVCF,    #ignore files with too few loci
            )
        firstVCFFile = VCFFile(inputFname=inputData.jobDataLs[0].vcfFile.abspath)
        alignmentLs = db_vervet.getAlignmentsFromVCFSampleIDList(firstVCFFile.getSampleIDList())
        del firstVCFFile

    cumulativeMedianDepth = db_vervet.getCumulativeAlignmentMedianDepth(
        alignmentLs=pdata.alignmentLs,
        defaultSampleAlignmentDepth=self.defaultSampleAlignmentDepth)
    registerReferenceData = pdata.registerReferenceData

    if self.run_type==1:
        self.addGenotypeCallJobs(
            workflow=workflow,
            alignmentDataLs=pdata.alignmentDataLs,
            chr2IntervalDataLs=self.chr2IntervalDataLs,
            registerReferenceData=registerReferenceData,
            site_handler=self.site_handler,
            input_site_handler=self.input_site_handler,
            needFastaIndexJob=self.needFastaIndexJob,
            needFastaDictJob=self.needFastaDictJob,
            intervalSize=self.intervalSize,
            intervalOverlapSize=self.intervalOverlapSize,
            site_type=self.site_type,
            data_dir=self.data_dir,
            outputDirPrefix="",
            genotypeCallerType=self.genotypeCallerType,
            cumulativeMedianDepth=cumulativeMedianDepth,
            transferOutput=True,
            )
    elif self.run_type in [2, 3]:
        self.addTrioCallerJobsONVCFFiles(
            workflow=workflow,
            alignmentLs=alignmentLs,
            inputData=inputData,
            samtools=workflow.samtools,
            genotyperJava=workflow.genotyperJava,
            SelectVariantsJava=workflow.SelectVariantsJava,
            GenomeAnalysisTKJar=workflow.GenomeAnalysisTKJar,
            addOrReplaceReadGroupsJava=workflow.addOrReplaceReadGroupsJava,
            AddOrReplaceReadGroupsJar=workflow.AddOrReplaceReadGroupsJar,
            CreateSequenceDictionaryJava=workflow.CreateSequenceDictionaryJava,
            CreateSequenceDictionaryJar=workflow.CreateSequenceDictionaryJar,
            MergeSamFilesJar=workflow.MergeSamFilesJar,
            BuildBamIndexFilesJava=workflow.BuildBamIndexFilesJava,
            BuildBamIndexJar=workflow.BuildBamIndexJar,
            mv=workflow.mv,
            CallVariantBySamtools=workflow.CallVariantBySamtools,
            trioCallerPath=self.trioCallerPath,
            trioCallerWrapper=workflow.trioCallerWrapper,
            replicateIndividualTag=self.replicateIndividualTag,
            treatEveryOneIndependent=self.treatEveryOneIndependent,
            bgzip_tabix=workflow.bgzip_tabix,
            vcf_convert=workflow.vcf_convert,
            vcf_isec=workflow.vcf_isec,
            vcf_concat=workflow.vcf_concat,
            concatGATK=workflow.concatGATK,
            concatSamtools=workflow.concatSamtools,
            ligateVcf=self.ligateVcf,
            ligateVcfExecutableFile=self.ligateVcfExecutableFile,
            registerReferenceData=registerReferenceData,
            namespace=workflow.namespace,
            version=workflow.version,
            site_handler=self.site_handler,
            input_site_handler=self.input_site_handler,
            needFastaIndexJob=self.needFastaIndexJob,
            needFastaDictJob=self.needFastaDictJob,
            outputDirPrefix="",
            intervalSize=self.intervalSize,
            intervalOverlapSize=self.intervalOverlapSize,
            site_type=self.site_type,
            data_dir=self.data_dir,
            onlyKeepBiAllelicSNP=self.onlyKeepBiAllelicSNP,
            maxSNPMissingRate=self.maxSNPMissingRate,
            alnStatForFilterF=None,
            cumulativeMedianDepth=cumulativeMedianDepth,
            run_type=self.run_type,
            transferOutput=True,
            )
    self.end_run()
def run(self):
    """
    Register one VCF file in the database and copy it into the db-affiliated storage.

    Steps, for a non-empty VCF file:
        1. get the genotype method (self.genotypeMethodShortName) for the alignments found in the VCF file
            and check that the alignment list matches the method's db entry.
        2. count the loci; compute the md5sum of the file if it has any locus.
        3. get the genotype file db entry; stop with exit code 0 if its path is already in the storage.
        4. copy the file (and its .tbi tabix index, if present) into self.data_dir
            and record the file size in the db entry.
    Afterwards the log message is output and the session is committed if
    self.commit is set. Otherwise it is rolled back and
    self.cleanUpAndExitOnFailure(exitCode=0) is called.

    2012.7.13
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    session = self.db_vervet.session
    session.begin()

    if not self.data_dir:
        self.data_dir = self.db_vervet.data_dir

    realPath = os.path.realpath(self.inputFname)
    logMessage = "file %s.\n"%(self.inputFname)
    isNonEmptyVCF = NextGenSeq.isFileNameVCF(realPath, includeIndelVCF=True) and \
        not NextGenSeq.isVCFFileEmpty(realPath, checkContent=self.checkEmptyVCFByReading)
    if isNonEmptyVCF:
        vcfFile = VCFFile(inputFname=self.inputFname)

        individualAlignmentLs = self.getAlignmentLsFromVCF(db_vervet=self.db_vervet, vcfFile=vcfFile)
        no_of_individuals = len(individualAlignmentLs)
        genotypeMethod = self.db_vervet.getGenotypeMethod(short_name=self.genotypeMethodShortName,
            individualAlignmentLs=individualAlignmentLs,
            no_of_individuals=no_of_individuals, no_of_loci=None,
            data_dir=self.data_dir)
        self.checkIfAlignmentListMatchMethodDBEntry(individualAlignmentLs, genotypeMethod, session)

        lociData = self.getNoOfLociFromVCFFile(vcfFile)
        chromosome2noOfLoci = lociData.chromosome2noOfLoci
        no_of_loci = lociData.no_of_loci
        if no_of_loci>0:
            try:
                md5sum = utils.get_md5sum(realPath)
            except:
                sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
                import traceback
                traceback.print_exc()
                self.cleanUpAndExitOnFailure(exitCode=4)
        else:
            #file with zero loci could have identical md5sum
            md5sum = None
        # NOTE: a check for another db entry with an identical md5sum used to sit here; it is disabled.

        no_of_chromosomes = len(chromosome2noOfLoci)
        #2012.8.30 use the 1st chromosome only when it is the sole one
        chromosome = list(chromosome2noOfLoci.keys())[0] if no_of_chromosomes == 1 else None

        genotypeFile = self.db_vervet.getGenotypeFile(genotype_method=genotypeMethod,
            chromosome=chromosome, format=self.format, path=None, file_size=None, md5sum=md5sum,
            original_path=realPath, no_of_individuals=no_of_individuals, no_of_loci=no_of_loci,
            data_dir=self.data_dir, no_of_chromosomes=no_of_chromosomes)
        if genotypeFile.id and genotypeFile.path:
            isPathInDB = self.db_vervet.isPathInDBAffiliatedStorage(relativePath=genotypeFile.path,
                data_dir=self.data_dir)
            if isPathInDB==-1:
                sys.stderr.write("Error while updating genotypeFile.path with the new path, %s.\n"%(genotypeFile.path))
                self.cleanUpAndExitOnFailure(exitCode=isPathInDB)
            elif isPathInDB==1:
                #successful exit, entry already in db
                sys.stderr.write("Warning: file %s is already in db.\n"%\
                    (genotypeFile.path))
                session.rollback()
                self.cleanUpAndExitOnFailure(exitCode=0)
            #any other value: not in db affiliated storage, keep going.

        #move the file and update the db_entry's path as well
        inputDir, inputFileBasename = os.path.split(self.inputFname)[0], os.path.basename(self.inputFname)
        relativePath = genotypeFile.constructRelativePath(sourceFilename=inputFileBasename)
        exitCode = self.db_vervet.moveFileIntoDBAffiliatedStorage(db_entry=genotypeFile,
            filename=inputFileBasename,
            inputDir=inputDir, dstFilename=os.path.join(self.data_dir, relativePath),
            relativeOutputDir=None, shellCommand='cp -rL',
            srcFilenameLs=self.srcFilenameLs, dstFilenameLs=self.dstFilenameLs,
            constructRelativePathFunction=genotypeFile.constructRelativePath)
        if exitCode!=0:
            sys.stderr.write("Error: moveFileIntoDBAffiliatedStorage() exits with %s code.\n"%(exitCode))
            session.rollback()
            self.cleanUpAndExitOnFailure(exitCode=exitCode)

        #copy the tbi (tabix) index file if it exists
        tbiSourcePath = '%s.tbi'%(realPath)
        if os.path.isfile(tbiSourcePath):
            tbiTargetPath = os.path.join(self.data_dir, '%s.tbi'%(genotypeFile.path))
            utils.copyFile(srcFilename=tbiSourcePath, dstFilename=tbiTargetPath)
            logMessage += "tbi file %s has been copied to %s.\n"%(tbiSourcePath, tbiTargetPath)

        #2012.7.17 record the size of db_entry.path (folder or file); the md5sum is already calculated above
        self.db_vervet.updateDBEntryPathFileSize(db_entry=genotypeFile, data_dir=self.data_dir)

        vcfFile.close()
        logMessage += "%s individuals, %s loci, md5sum=%s.\n"%(no_of_individuals, no_of_loci, md5sum)
    else:
        logMessage += " is empty (no loci) or not VCF file.\n"
    self.outputLogMessage(logMessage)

    if not self.commit:
        session.rollback()
        #delete all target files but exit gracefully (exit 0)
        self.cleanUpAndExitOnFailure(exitCode=0)
    else:
        try:
            session.flush()
            session.commit()
        except:
            sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
            import traceback
            traceback.print_exc()
            self.cleanUpAndExitOnFailure(exitCode=3)