def reduceEachInput(self, chromosome=None, passingData=None,
        mapEachIntervalDataLs=None, transferOutput=True, **keywords):
    """
    2013.07.10
        #. concatenate all the sub-Inputs into one
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    returnData.mapEachIntervalDataLs = mapEachIntervalDataLs
    #intervalJobLs = [pdata for pdata in mapEachIntervalDataLs]
    """
    realInputVolume = passingData.jobData.file.noOfIndividuals * \
        passingData.jobData.file.noOfLoci
    baseInputVolume = 200*20000
    walltime = self.scaleJobWalltimeOrMemoryBasedOnInput(
        realInputVolume=realInputVolume,
        baseInputVolume=baseInputVolume, baseJobPropertyValue=60,
        minJobPropertyValue=60, maxJobPropertyValue=500).value
    job_max_memory = self.scaleJobWalltimeOrMemoryBasedOnInput(
        realInputVolume=realInputVolume,
        baseInputVolume=baseInputVolume, baseJobPropertyValue=5000,
        minJobPropertyValue=5000, maxJobPropertyValue=10000).value
    """
    return returnData
def getFamilyStructure(self):
    """
    2013.07.19
    """
    sys.stderr.write("Finding unique pairs (singletons or groups) of parents ...\n")
    noOfParents2FamilyData = {}
    for nodeID in self.pedigreeGraph:
        #wrap in list() so this also works when predecessors() returns an iterator (networkx>=2)
        parents = list(self.pedigreeGraph.predecessors(nodeID))
        noOfParents = len(parents)
        if noOfParents not in noOfParents2FamilyData:
            noOfParents2FamilyData[noOfParents] = PassingData(
                parentTupleSet=set(), parentIDSet=set(), childIDSet=set(),
                individualIDSet=set())
        parents.sort()
        noOfParents2FamilyData[noOfParents].parentTupleSet.add(tuple(parents))
        for parentID in parents:
            noOfParents2FamilyData[noOfParents].parentIDSet.add(parentID)
            noOfParents2FamilyData[noOfParents].individualIDSet.add(parentID)
        noOfParents2FamilyData[noOfParents].childIDSet.add(nodeID)
        noOfParents2FamilyData[noOfParents].individualIDSet.add(nodeID)

    #number of distinct two-parent (nuclear) families
    if 2 in noOfParents2FamilyData:
        noOfNuclearFamilies = len(noOfParents2FamilyData[2].parentTupleSet)
    else:
        noOfNuclearFamilies = 0
    self._reportFamilyStructure(noOfParents2FamilyData)
    return PassingData(noOfParents2FamilyData=noOfParents2FamilyData)
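# Hypothetical usage sketch, assuming self.pedigreeGraph is a networkx.DiGraph
# with one parent -> child edge per parent (that structure is an assumption,
# not shown in this snippet):
#   import networkx as nx
#   self.pedigreeGraph = nx.DiGraph()
#   self.pedigreeGraph.add_edge('father1', 'kid1')
#   self.pedigreeGraph.add_edge('mother1', 'kid1')
#   familyStructureData = self.getFamilyStructure()
#   # kid1 has two parents, so:
#   # familyStructureData.noOfParents2FamilyData[2].parentTupleSet == {('father1', 'mother1')}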
def run(self):
    """
    2011-7-11
    """
    self.setup_run()

    inputData = PassingData(jobDataLs=[])
    inputFile = self.registerOneInputFile(self.inputFname,
        folderName=self.pegasusFolderName)
    inputData.jobDataLs.append(PassingData(output=inputFile, jobLs=[]))
    noOfTotalSequences = self.getNoOfSequencesFromFasta(
        inputFastaFname=self.inputFname)

    registerReferenceData = self.registerBlastNucleotideDatabaseFile(
        ntDatabaseFname=self.databaseFname,
        input_site_handler=self.input_site_handler)
    ntDatabaseFileList = registerReferenceData.refFastaFList
    ntDatabaseFile = ntDatabaseFileList[0]

    if len(ntDatabaseFileList) < 4:
        #some nt-database index file is missing
        sys.stderr.write("Adding blast-db-making job...")
        makeBlastDBJob = self.addMakeBlastDBJob(executable=self.formatdb,
            inputFile=ntDatabaseFile, transferOutput=True)
        #add the index files to the ntDatabaseFileList
        ntDatabaseFileList = [ntDatabaseFile] + makeBlastDBJob.outputList
        sys.stderr.write(".\n")
    else:
        makeBlastDBJob = None

    self.addJobs(inputData=inputData, outputDirPrefix=self.pegasusFolderName,
        ntDatabaseFileList=ntDatabaseFileList,
        noOfTotalSequences=noOfTotalSequences,
        transferOutput=True, makeBlastDBJob=makeBlastDBJob)

    self.end_run()
def addJobs(self, inputURL=None, relativePathList=[], outputDir="", username=None,
        password=None, transferOutput=True):
    """
    2012.6.27
    """
    sys.stderr.write("Adding wget jobs for %s inputs ... " % (len(relativePathList)))
    no_of_jobs = 0

    topOutputDir = outputDir
    topOutputDirJob = self.addMkDirJob(outputDir=topOutputDir)
    no_of_jobs += 1

    returnData = PassingData()
    returnData.jobDataLs = []

    for relativePath in relativePathList:
        #2013.06.26 replace every "/" in relativePath in case it's a folder
        relativePathNoFolder = relativePath.replace('/', '_')
        logFile = File('%s.log' % (relativePathNoFolder))
        wgetJob = self.addWgetJob(executable=self.wget, url=inputURL,
            relativePath=relativePath,
            username=username, password=password,
            targetFolder=outputDir, logFile=logFile,
            cut_dir_number=self.cut_dir_number,
            parentJobLs=[topOutputDirJob], extraDependentInputLs=[],
            transferOutput=transferOutput,
            extraArguments=None, job_max_memory=50)
        #include the tfam (outputList[1]) into the fileLs
        returnData.jobDataLs.append(PassingData(jobLs=[wgetJob], file=wgetJob.output,
            fileLs=wgetJob.outputLs))
        no_of_jobs += 1
    sys.stderr.write("%s jobs.\n" % (no_of_jobs))
    return returnData
def linkMapToReduce(self, mapEachIntervalData=None, preReduceReturnData=None,
        passingData=None, transferOutput=True, **keywords):
    """
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    return returnData
def reduceEachChromosome(self, chromosome=None, passingData=None,
        mapEachInputDataLs=None, chromosome2mapEachIntervalDataLs=None,
        reduceEachInputDataLs=None,
        transferOutput=True,
        **keywords):
    """
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    returnData.mapEachInputDataLs = mapEachInputDataLs
    returnData.reduceEachInputDataLs = reduceEachInputDataLs
    #reduce matrix by chosen column and average p-value
    outputFile = File(os.path.join(self.reduceEachChromosomeDirJob.output,
        'chr_%s_LocusLiftOverProbability.tsv.gz' % (chromosome)))
    reduceChromosomeJob = self.addStatMergeJob(
        statMergeProgram=self.mergeSameHeaderTablesIntoOne,
        outputF=outputFile,
        parentJobLs=[self.reduceEachChromosomeDirJob], extraOutputLs=None,
        extraDependentInputLs=None, transferOutput=False)
        #extraArgumentList=['--keyColumnLs 0-6 --valueColumnLs 7'],
    mapEachIntervalDataLs = chromosome2mapEachIntervalDataLs.get(chromosome)
    for mapEachIntervalData in mapEachIntervalDataLs:
        for jobData in mapEachIntervalData.jobDataLs:
            self.addInputToMergeJob(reduceChromosomeJob, parentJobLs=[jobData.job])

    #add the chromosome-level reduction job to the final stat-merge job
    self.addInputToMergeJob(self.reduceJob, parentJobLs=[reduceChromosomeJob])
    return returnData
def mapEachAlignment(self, alignmentData=None, passingData=None,
        transferOutput=True, **keywords):
    """
    2012.9.22
        similar to reduceBeforeEachAlignmentData() but for mapping programs
        that run on one alignment each.

        passingData.alignmentJobAndOutputLs = []
        passingData.bamFnamePrefix = bamFnamePrefix
        passingData.individual_alignment = alignment
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []

    topOutputDirJob = passingData.topOutputDirJob
    refFastaF = passingData.refFastaFList[0]

    alignment = alignmentData.alignment
    parentJobLs = alignmentData.jobLs
    bamF = alignmentData.bamF
    baiF = alignmentData.baiF
    bamFnamePrefix = alignment.getReadGroup()

    return returnData
def preReduce(self, passingData=None, transferOutput=True, **keywords):
    """
    set up additional mkdir folder jobs, before mapEachAlignment,
    mapEachChromosome, mapReduceOneAlignment
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    return returnData
def mapEachChromosome(self, alignmentData=None, chromosome=None,
        VCFJobData=None, passingData=None, reduceBeforeEachAlignmentData=None,
        transferOutput=True, **keywords):
    """
    2012.9.17
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    return returnData
def map(self, alignmentData=None, intervalData=None,
        VCFJobData=None, passingData=None, mapEachChromosomeData=None,
        transferOutput=True, **keywords):
    """
    2012.9.17
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    return returnData
def reduceAfterEachChromosome(self, chromosome=None, passingData=None,
        transferOutput=True, mapEachIntervalDataLs=None, **keywords):
    """
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    returnData.mapEachIntervalDataLs = mapEachIntervalDataLs
    return returnData
def reduce(self, passingData=None, reduceAfterEachAlignmentDataLs=None,
        transferOutput=True, **keywords):
    """
    2012.9.17
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    returnData.reduceAfterEachAlignmentDataLs = reduceAfterEachAlignmentDataLs
    return returnData
def reduceBeforeEachAlignment(self, passingData=None, transferOutput=True, **keywords):
    """
    2012.9
        set up some reduce jobs before the loop over all intervals of one
        alignment begins. These reduce jobs will collect output from each
        map() job; the links are established in linkMapToReduce().
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    return returnData
def reduceAfterEachAlignment(self, passingData=None, mapEachChromosomeDataLs=None,
        reduceAfterEachChromosomeDataLs=None,
        transferOutput=True, **keywords):
    """
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    returnData.mapEachChromosomeDataLs = mapEachChromosomeDataLs
    returnData.reduceAfterEachChromosomeDataLs = reduceAfterEachChromosomeDataLs
    return returnData
def mapEachInterval(self, inputJobData=None, selectIntervalJobData=None,
        chromosome=None, intervalData=None,
        mapEachChromosomeData=None,
        passingData=None, transferOutput=False, **keywords):
    """
    #. extract flanking sequences from the input Input
        (ref sequence file => contig ref sequence)
    #. blast them
    #. run FindSNPPositionOnNewRefFromFlankingBlastOutput.py
        #. where hit length matches query length, and no. of mismatches <=2
            => good => infer new coordinates
    #. output a mapping file between old SNP and new SNP coordinates.
        #. reduce this thing by combining everything
    #. make a new Input file based on the input split Input file
        (replace contig ID and position with the new ones; remove or replace
        the header part regarding chromosomes)
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []

    passingData.intervalFileBasenamePrefix
    passingData.splitInputFile
    """
    ## 2013.06.19 structures available from passingData, specific to the interval
    passingData.splitInputFile = splitInputFile
    passingData.unitNumber = unitNumber
    passingData.intervalFileBasenamePrefix = '%s_%s_splitInput_u%s'%(
        chromosome, commonPrefix, unitNumber)
    passingData.noOfIndividuals = jobData.file.noOfIndividuals
    passingData.span = self.intervalSize + self.intervalOverlapSize*2
    """
    #add one computing job
    outputFile = File(os.path.join(self.mapDirJob.output,
        "%s.%s.probability.tsv.gz" % (passingData.fileBasenamePrefix,
            intervalData.interval)))
    locusIntervalDeltaOutputFile = File(os.path.join(self.mapDirJob.output,
        "%s.%s.locusIntervalDelta.tsv.gz" % (passingData.fileBasenamePrefix,
            intervalData.interval)))
    job = self.addAbstractMatrixFileWalkerJob(
        executable=self.ComputeLiftOverLocusProbability,
        inputFile=selectIntervalJobData.file, outputFile=outputFile,
        whichColumn=None, whichColumnHeader=None,
        logY=None, valueForNonPositiveYValue=-1,
        minNoOfTotal=1, samplingRate=1,
        inputFileFormat=None, outputFileFormat=None,
        extraArgumentList=["--locusIntervalDeltaOutputFname", locusIntervalDeltaOutputFile,
            "--startPosition %s" % (intervalData.start),
            "--stopPosition %s" % (intervalData.stop)],
        parentJobLs=[selectIntervalJobData.job],
        extraOutputLs=[locusIntervalDeltaOutputFile],
        transferOutput=transferOutput, job_max_memory=2000, sshDBTunnel=False)
    #For each interval, probabilities are not calculated for loci in the
    # extra segment (from overlapStart to start).
    returnData.jobDataLs.append(self.constructJobDataFromJob(job))
    return returnData
def mapEachAlignment(self, passingData=None, transferOutput=True, **keywords):
    """
    2012.9.22
        similar to reduceBeforeEachAlignmentData() but for mapping programs
        that run on one alignment each.
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    return returnData
def mapEachChromosome(self, alignmentData=None, chromosome=None,
        VCFJobData=None, passingData=None, reduceBeforeEachAlignmentData=None,
        transferOutput=True, **keywords):
    """
    2012.9.17
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []

    topOutputDirJob = passingData.topOutputDirJob

    alignment = alignmentData.alignment
    parentJobLs = alignmentData.jobLs
    bamF = alignmentData.bamF
    baiF = alignmentData.baiF
    bamFnamePrefix = passingData.bamFnamePrefix

    """
    #2012.9.21 perhaps a downsampling job
    outputFname = os.path.join(topOutputDirJob.output,
        '%s_%s.bam' % (bamFnamePrefix, overlapFileBasenameSignature))
    outputFile = File(outputFname)
    selectAlignmentJob, bamIndexJob1 = self.addSelectAlignmentJob(
        executable=self.samtools, inputFile=bamF,
        outputFile=outputFile, region=overlapInterval,
        parentJobLs=[topOutputDirJob] + parentJobLs,
        extraDependentInputLs=[baiF], transferOutput=False,
        extraArguments=None, job_max_memory=2000, needBAMIndexJob=True)
    """

    """
    #2012.9.21 count covariates job is moved to map()
    recalFile = File(os.path.join(topOutputDirJob.output,
        '%s_%s.recal_data.csv' % (bamFnamePrefix, chromosome)))
    countCovariatesJob = self.addGATKBaseRecalibratorJob(
        GenomeAnalysisTKJar=self.GenomeAnalysisTK2Jar, inputFile=bamF,
        VCFFile=VCFFile, interval=chromosome, outputFile=recalFile,
        refFastaFList=passingData.refFastaFList,
        parentJobLs=[topOutputDirJob] + parentJobLs,
        extraDependentInputLs=[baiF, VCFFile.tbi_F],
        transferOutput=False,
        extraArguments=None, job_max_memory=4000)
    self.no_of_jobs += 1
    returnData.countCovariatesJob = countCovariatesJob
    returnData.jobDataLs.append(PassingData(jobLs=[countCovariatesJob],
        file=countCovariatesJob.recalFile,
        fileLs=[countCovariatesJob.recalFile]))
    """

    return returnData
def parseQueryLocusID(self, locus_id=None):
    """
    2012.10.8
        locus_id is in the format of
            '%s_%s_%s_positionInFlank%s'%(chromosome, start, stop, flankingLength+1),
        output of ExtractFlankingSequenceForVCFLoci.py
    """
    search_result = ExtractFlankingSequenceForVCFLoci.sequenceTitlePattern.search(
        locus_id)
    chromosome = None
    start = None
    stop = None
    refBase = None
    altBase = None
    positionInFlank = None
    if search_result:
        chromosome = search_result.group(1)
        start = int(search_result.group(2))
        stop = int(search_result.group(3))
        refBase = search_result.group(4)
        altBase = search_result.group(5)
        positionInFlank = int(search_result.group(6))
    return PassingData(chromosome=chromosome, start=start, stop=stop,
        refBase=refBase, altBase=altBase, positionInFlank=positionInFlank)
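# Hypothetical usage sketch; the locus ID below is made up, and the exact field
# order is dictated by ExtractFlankingSequenceForVCFLoci.sequenceTitlePattern,
# which is not shown here:
#   locusData = self.parseQueryLocusID('Contig791_1086_1086_C_A_positionInFlank25')
#   if locusData.chromosome is not None:
#       print(locusData.chromosome, locusData.start, locusData.stop,
#           locusData.refBase, locusData.altBase, locusData.positionInFlank)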
def returnLocusLowMapQualityIndicator(self, alignedReadLs=None, minMapQGoodRead=2,
        minFractionOfGoodRead=0.9):
    """
    2013.12.04
    """
    totalNoOfReads = 0
    noOfGoodReads = 0.0
    medianMapQ = -10
    mapQList = []
    for alignedRead in alignedReadLs:
        totalNoOfReads += 1
        mapQList.append(alignedRead.mapq)
        if alignedRead.mapq >= minMapQGoodRead:
            noOfGoodReads += 1
        else:
            pass
    if totalNoOfReads > 0:
        fractionOfGoodRead = noOfGoodReads / (totalNoOfReads)
        medianMapQ = numpy.median(mapQList)
    else:
        fractionOfGoodRead = -1
        medianMapQ = -10
    if fractionOfGoodRead >= minFractionOfGoodRead:
        locusLowMapQIndicator = 0
    else:
        locusLowMapQIndicator = 2
    return PassingData(locusLowMapQIndicator=locusLowMapQIndicator,
        totalNoOfReads=totalNoOfReads,
        noOfGoodReads=noOfGoodReads, fractionOfGoodRead=fractionOfGoodRead,
        medianMapQ=medianMapQ)
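# Worked example (made-up read counts): with the defaults minMapQGoodRead=2 and
# minFractionOfGoodRead=0.9, a locus with 9 of 10 reads at mapq>=2 gives
# fractionOfGoodRead=0.9 and locusLowMapQIndicator=0 (acceptable), while 8 of 10
# gives 0.8 and locusLowMapQIndicator=2 (low mapping quality).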
def parseFastaDescriptionForGenBank(self, descriptionLine=None, FigureOutTaxID_ins=None):
    """
    possible header lines:

    >gi|51511461|ref|NC_000001.8|NC_000001 Homo sapiens chromosome 1, complete sequence
    >gi|186497660|ref|NC_003070.6| Arabidopsis thaliana chromosome 1, complete sequence
    >gi|26556996|ref|NC_001284.2| Arabidopsis thaliana mitochondrion, complete genome
    >gi|115442598|ref|NC_008394.1| Oryza sativa (japonica cultivar-group) genomic DNA, chromosome 1
    """
    #discard '>' and '\n'
    header = descriptionLine[1:-1]
    header = header.split('|')
    _tax_id = FigureOutTaxID_ins.returnTaxIDGivenSentence(header[4])
    if self.p_chromosome.search(header[4]) is not None:
        chromosome = self.p_chromosome.search(header[4]).groups()[0]
    elif header[4].find('mitochondrion') != -1:
        chromosome = 'mitochondrion'
    elif header[4].find('chloroplast') != -1:
        chromosome = 'chloroplast'
    else:
        #something else, take the whole part before ','
        chromosome = header[4].split(',')[0]
    gi = int(header[1])
    acc_ver = header[3]
    comment = header[4]
    return PassingData(tax_id=_tax_id, gi=gi, comment=comment, acc_ver=acc_ver,
        chromosome=chromosome)
def parseFastaDescriptionForGenebank_hs37d5(self, descriptionLine=None,
        FigureOutTaxID_ins=None):
    """
    >1 dna:chromosome chromosome:GRCh37:1:1:249250621:1
    >Y dna:chromosome chromosome:GRCh37:Y:2649521:59034049:1
    >MT gi|251831106|ref|NC_012920.1| Homo sapiens mitochondrion, complete genome
    >GL000207.1 dna:supercontig supercontig::GL000207.1:1:4262:1
    >GL000226.1 dna:supercontig supercontig::GL000226.1:1:15008:1
    >NC_007605
    >hs37d5
    """
    #discard '>' and '\n'
    header = descriptionLine[1:-1]
    headerList = header.split()
    chromosome = headerList[0]
    comment = ' '.join(headerList[1:])
    gi = None
    acc_ver = None
    accitem = re.compile(r'supercontig')
    if accitem.search(header) is not None:
        acc_ver = headerList[0]
    else:
        commentSplit = comment.split("|")
        if len(commentSplit) > 4:
            #deal with MT
            gi = int(commentSplit[1])
            acc_ver = commentSplit[3]
            comment = commentSplit[4]
    return PassingData(tax_id=None, gi=gi, comment=comment, acc_ver=acc_ver,
        chromosome=chromosome)
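# Worked example, traced from the MT header in the docstring above:
#   '>MT gi|251831106|ref|NC_012920.1| Homo sapiens mitochondrion, complete genome'
# yields headerList[0]='MT' (chromosome); the joined comment splits on '|' into
# more than 4 pieces, so gi=251831106, acc_ver='NC_012920.1', and the returned
# comment becomes ' Homo sapiens mitochondrion, complete genome'.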
def parseFastaDescriptionForWUSTLVervetScaffolds(self, descriptionLine=None,
        FigureOutTaxID_ins=None):
    """
    2011-7-6
    possible header lines:

    >Contig0 12652774 13406928
    """
    #discard '>' and '\n'
    header = descriptionLine[1:-1]
    header = header.split()
    chromosome = header[0]	#contig name is taken as chromosome
    """
    p_chromosome = re.compile(r'Contig(\d+)')
    if p_chromosome.search(header[0]) is not None:
        chromosome = p_chromosome.search(header[0]).groups()[0]
    else:
        chromosome = None
    """
    gi = None
    acc_ver = None
    comment = None
    return PassingData(tax_id=None, gi=gi, comment=comment, acc_ver=acc_ver,
        chromosome=chromosome)
def parseFastaDescriptionForFullVervetBACs(self, descriptionLine=None,
        FigureOutTaxID_ins=None):
    """
    2011-7-6
    possible header lines:

    >gi|285026568|gb|AC239257.2| Chlorocebus aethiops chromosome UNK clone CH252-270J24, WORKING DRAFT SEQUENCE, 2 unordered pieces
    >gi|281332227|gb|AC238852.3| Chlorocebus aethiops BAC clone CH252-133A18 from chromosome 3, complete sequence
    >gi|285002488|gb|AC239185.3| Chlorocebus aethiops BAC clone CH252-404N12 from chromosome unknown, complete sequence
    """
    #discard '>' and '\n'
    header = descriptionLine[1:-1]
    header = header.split('|')
    _tax_id = None

    # 1st type of clone description
    p_chromosome = re.compile(r'UNK clone ([^,]+),')
    # 2nd type of clone description
    p2_chromosome = re.compile(r'clone ([^,]+),')

    if p_chromosome.search(header[4]) is not None:
        chromosome = p_chromosome.search(header[4]).groups()[0]
    else:
        if p2_chromosome.search(header[4]) is not None:
            chromosome = p2_chromosome.search(header[4]).groups()[0]
        else:
            chromosome = None
    gi = int(header[1])
    acc_ver = header[3]
    comment = header[4]
    return PassingData(tax_id=_tax_id, gi=gi, comment=comment, acc_ver=acc_ver,
        chromosome=chromosome)
def estimateMeanStdFromData(dataVector=None, excludeTopFraction=0.2):
    """
    2012.10.14 adapted from
        vervet/src/pedigree/DetectWrongLabelByCompKinshipVsIBD.DetectWrongLabelByCompKinshipVsIBD.estimateAbsDeltaMeanStd()
    2012.8.22
    """
    sys.stderr.write("Estimating mean&std using the middle %.1f%% of data (n=%s) ..." %
        ((1 - excludeTopFraction) * 100, len(dataVector)))
    noOfRows = len(dataVector)
    import numpy
    # 2012.8.22 draw some histogram to check what data looks like
    # if len(dataVector)>10:
    #     outputFname = '%s_kinship_ibd_hist.png'%(self.outputFnamePrefix)
    #     yh_matplotlib.drawHist(dataVector, title='', \
    #         xlabel_1D="kinship-ibd", xticks=None, \
    #         outputFname=outputFname, min_no_of_data_points=10, \
    #         needLog=True, \
    #         dpi=200, min_no_of_bins=25)

    #dataVector = map(abs, dataVector)	#2012.8.23 no abs
    dataVector.sort()
    #use max() so the bottom tail is actually trimmed (min(0, ...) always kept it)
    startIndex = max(0, int(len(dataVector) * (excludeTopFraction / 2)) - 1)
    stopIndex = int(len(dataVector) * (1 - excludeTopFraction / 2))
    dataVector = dataVector[startIndex:stopIndex]

    data_mean = numpy.mean(dataVector)
    data_std = numpy.std(dataVector)
    sys.stderr.write(" mean=%.3f, std=%.3f.\n" % (data_mean, data_std))
    return PassingData(mean=data_mean, std=data_std)
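# Minimal usage sketch (made-up numbers): with excludeTopFraction=0.2, roughly
# 10% is trimmed from each tail before the mean/std are taken, so the outlier
# 50.0 below does not inflate the estimates.
#   stats = estimateMeanStdFromData(
#       dataVector=[0.1, 0.2, 0.2, 0.3, 0.3, 0.4, 0.4, 0.5, 0.6, 50.0],
#       excludeTopFraction=0.2)
#   print(stats.mean, stats.std)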
def readThroughAndProvideSummary(self):
    """
    2013.08.30 called by vervet/src/db/import/AddAlignmentDepthIntervalFile2DB.py
    """
    col_name2index = self.smartReadHeader()
    if col_name2index is None:
        #no header; the first row already read is data, not column names
        pdata = self.parseRow(self._row)
        self._postProcessParsedRowDataForSummary(pdata)

    for row in self:
        pdata = self.parseRow(row)
        self._postProcessParsedRowDataForSummary(pdata)

    self.min_interval_length = numpy.min(self.interval_length_ls)
    self.max_interval_length = numpy.max(self.interval_length_ls)
    self.median_interval_length = numpy.median(self.interval_length_ls)

    self.mean_interval_value = numpy.mean(self.interval_value_ls)
    self.median_interval_value = numpy.median(self.interval_value_ls)

    return PassingData(
        no_of_intervals=self.no_of_intervals,
        chromosome_size=self.chromosome_size,
        mean_interval_value=self.mean_interval_value,
        median_interval_value=self.median_interval_value,
        min_interval_value=self.min_interval_value,
        max_interval_value=self.max_interval_value,
        min_interval_length=self.min_interval_length,
        max_interval_length=self.max_interval_length,
        median_interval_length=self.median_interval_length)
def openWriteBeagleFiles(self, pedigreeFamilyData=None, outputFnamePrefix=None):
    """
    2013.05.02
        The non-likelihood (unphased, trios, pairs) Beagle format:
            I id sample1 sample1 sample2 sample2
            A diabetes 1 1 2 2
            M rs12082861 C C C C
            M rs4912233 T C C C
            M rs12732823 G A A A
            M rs17451521 C C C C
            M rs12033358 C T T T

        The likelihood version is:
            marker alleleA alleleB 1000_709_1996093_GA_vs_524 1000_709_1996093_GA_vs_524 1000_709_1996093_GA_vs_524 1001_710_1995025_GA_vs_524 1001_710_1995025_GA_vs_524 1001_710_1995025_GA_vs_524 1002_711_2001039_GA_vs_524
            Contig791:1086 C A 0.9693 0.0307 0.0000 0.6660 0.3338 0.0003 0.0000
            Contig791:1649 G C 0.9406 0.0594 0.0000 0.9693 0.0307 0.0000 0.0000
            Contig791:4084 A C 0.9980 0.0020 0.0000 0.9844 0.0156 0.0000 0.0000

        The markers file has this format (markerID, position, alleleA, alleleB):
            Contig791:1086 1086 C A
    """
    sys.stderr.write("Opening beagle files (outputFnamePrefix=%s) to write ..." %
        (outputFnamePrefix))
    familySize2BeagleFileHandler = {}
    familySize2SampleIDList = pedigreeFamilyData.familySize2SampleIDList
    counter = 0
    for familySize, sampleIDList in familySize2SampleIDList.items():
        if familySize not in familySize2BeagleFileHandler:
            tmpOutputFnamePrefix = '%s_familySize%s' % (outputFnamePrefix, familySize)
            writer = MatrixFile(path='%s.bgl' % (tmpOutputFnamePrefix),
                mode='w', delimiter=' ')
            familySize2BeagleFileHandler[familySize] = writer
            if familySize == 1:
                headerRow = ['marker', 'alleleA', 'alleleB']
            else:
                headerRow = ['I', 'id']
            for sampleID in sampleIDList:
                if familySize == 1:
                    #likelihood format has the sample name replicated three times, rather than 2 times
                    headerRow.extend([sampleID] * 3)
                else:
                    headerRow.extend([sampleID] * 2)
            writer.writeHeader(headerRow)
            counter += 1
    markersFile = MatrixFile(path='%s.markers' % (outputFnamePrefix), mode='w',
        delimiter=' ')
    counter += 1
    sys.stderr.write("%s files outputted.\n" % (counter))
    return PassingData(
        familySize2BeagleFileHandler=familySize2BeagleFileHandler,
        markersFile=markersFile)
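# Hypothetical follow-up sketch: once the handlers exist, each locus would get
# one row in the markers file and one genotype/likelihood row per family-size
# Beagle file. MatrixFile.writerow() is assumed here (only writeHeader() appears
# above), so treat this as an illustration, not the class's confirmed API.
#   beagleFileData = self.openWriteBeagleFiles(
#       pedigreeFamilyData=pedigreeFamilyData, outputFnamePrefix='/tmp/beagleInput')
#   beagleFileData.markersFile.writerow(['Contig791:1086', '1086', 'C', 'A'])
#   for familySize, writer in beagleFileData.familySize2BeagleFileHandler.items():
#       writer.writerow(...)	#one data row per marker, matching the header layout
#   beagleFileData.markersFile.close()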
def parseInputFile(self, inputFname=None, **keywords):
    """
    2013.08.23
        if a program is adding a file to db-affiliated storage,
        this is used for parsing.
    """
    return PassingData()
def run(self):
    """
    11-13-05
        --db_connect()
        --parse_entrezgene_xml_file()
            --is_gi_valid_in_annot_assembly_table()
            --find_info_dict()
            --return_location_list()
            --submit_to_entrezgene_mapping_table()
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    sys.stderr.write("\tTotally, %d files to be processed.\n" % len(self.inputfiles))
    db = GenomeDatabase(drivername=self.drivername, username=self.db_user,
        password=self.db_passwd, hostname=self.hostname, database=self.dbname,
        schema=self.schema)
    db.setup(create_tables=False)	#2010-6-22
    session = db.session
    param_obj = PassingData(session=db.session, no_of_genes_already_in_db=0,
        no_of_entrezgene_mappings_already_in_db=0,
        no_of_total=0, no_of_into_db=0, report=self.report,
        no_of_commentaries_already_in_db=0,
        no_of_gene_segments_already_in_db=0, no_of_gene2go_already_in_db=0)
    for f in self.inputfiles:
        sys.stderr.write("%d/%d:\t%s\n" % (self.inputfiles.index(f) + 1,
            len(self.inputfiles), f))
        self.parse_xml_file(session, f, tax_id=self.tax_id, param_obj=param_obj)
    session.flush()
    if self.commit:
        session.commit()
    else:
        session.rollback()
def avgKey2DataLs(self, key2dataLs, no_of_key_columns=1, header=[]):
    """
    1. take mean/median/stdev of every cell in dataLs,
    2. modify newHeader to reflect that
    """
    print(f"Averaging key2dataLs ({len(key2dataLs)} entries) ...", flush=True)
    newKey2DataLs = {}
    newHeader = []
    keyColHeader = header[:no_of_key_columns]
    valueColHeader = header[no_of_key_columns:]
    newValueColHeader = []
    no_of_value_columns = len(valueColHeader)
    for i in range(no_of_value_columns):
        valueColName = valueColHeader[i]
        newValueColHeader += [
            'mean_%s' % (valueColName),
            'median_%s' % (valueColName),
            'stdev_%s' % (valueColName)
        ]

    for key, dataLs in key2dataLs.items():
        if key not in newKey2DataLs:
            newKey2DataLs[key] = []
        no_of_value_columns = len(dataLs)
        for i in range(no_of_value_columns):
            meanValue = numpy.mean(dataLs[i])
            medianValue = numpy.median(dataLs[i])
            stdev = numpy.std(dataLs[i])
            newKey2DataLs[key] += [meanValue, medianValue, stdev]
    print("Done.", flush=True)
    return PassingData(key2dataLs=newKey2DataLs,
        header=keyColHeader + newValueColHeader)
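# Minimal usage sketch (made-up numbers): each value cell holds a list of
# observations; the result carries mean/median/stdev columns per value column.
#   key2dataLs = {('Contig1',): [[10.0, 20.0, 30.0], [0.1, 0.3]]}
#   result = self.avgKey2DataLs(key2dataLs, no_of_key_columns=1,
#       header=['chromosome', 'depth', 'heterozygosity'])
#   # result.header == ['chromosome', 'mean_depth', 'median_depth', 'stdev_depth',
#   #     'mean_heterozygosity', 'median_heterozygosity', 'stdev_heterozygosity']
#   # result.key2dataLs[('Contig1',)] == [20.0, 20.0, 8.16..., 0.2, 0.2, 0.1]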
def reduce(self, passingData=None, reduceEachChromosomeDataLs=None,
        transferOutput=True, **keywords):
    """
    #. merge all output of input jobs (passingData.mapEachIntervalDataLsLs) into one big one
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    reduceOutputDirJob = passingData.reduceOutputDirJob

    realInputVolume = passingData.jobData.file.noOfIndividuals * \
        passingData.jobData.file.noOfLoci
    baseInputVolume = 200 * 20000
    walltime = self.scaleJobWalltimeOrMemoryBasedOnInput(
        realInputVolume=realInputVolume, baseInputVolume=baseInputVolume,
        baseJobPropertyValue=60, minJobPropertyValue=60,
        maxJobPropertyValue=500).value
    job_max_memory = self.scaleJobWalltimeOrMemoryBasedOnInput(
        realInputVolume=realInputVolume, baseInputVolume=baseInputVolume,
        baseJobPropertyValue=5000, minJobPropertyValue=5000,
        maxJobPropertyValue=10000).value

    outputFile = File(os.path.join(reduceOutputDirJob.output,
        'sameSiteConcordance.tsv'))
    reduceJob = self.addStatMergeJob(
        statMergeProgram=self.mergeSameHeaderTablesIntoOne,
        outputF=outputFile,
        parentJobLs=[reduceOutputDirJob],
        transferOutput=transferOutput)
    returnData.jobDataLs.append(PassingData(jobLs=[reduceJob], file=reduceJob.output,
        fileLs=[reduceJob.output]))

    for mapEachIntervalDataLs in passingData.mapEachIntervalDataLsLs:
        for mapEachIntervalData in mapEachIntervalDataLs:
            self.addInputToMergeJob(reduceJob,
                parentJobLs=[mapEachIntervalData.mapJob])

    return returnData