def getFamilyStructure(self):
    """
    2013.07.19
    """
    sys.stderr.write("Finding unique pairs (singletons or groups) of parents ...\n")
    noOfParents2FamilyData = {}
    for nodeID in self.pedigreeGraph:
        # list() keeps this working whether predecessors() returns a list or an iterator
        parents = list(self.pedigreeGraph.predecessors(nodeID))
        noOfParents = len(parents)
        if noOfParents not in noOfParents2FamilyData:
            noOfParents2FamilyData[noOfParents] = PassingData(
                parentTupleSet=set(), parentIDSet=set(), childIDSet=set(),
                individualIDSet=set())
        parents.sort()
        noOfParents2FamilyData[noOfParents].parentTupleSet.add(tuple(parents))
        for parentID in parents:
            noOfParents2FamilyData[noOfParents].parentIDSet.add(parentID)
            noOfParents2FamilyData[noOfParents].individualIDSet.add(parentID)
        noOfParents2FamilyData[noOfParents].childIDSet.add(nodeID)
        noOfParents2FamilyData[noOfParents].individualIDSet.add(nodeID)
    # number of distinct two-parent (nuclear) families, 0 if there are none
    if 2 in noOfParents2FamilyData:
        noOfNuclearFamilies = len(noOfParents2FamilyData[2].parentTupleSet)
    else:
        noOfNuclearFamilies = 0
    self._reportFamilyStructure(noOfParents2FamilyData)
    return PassingData(noOfParents2FamilyData=noOfParents2FamilyData)
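# Hedged illustration of the grouping done by getFamilyStructure() above, using a
# made-up pedigree: father F and mother M with child C, plus founders having no
# parents. Not part of the class; it only shows the shape of the returned data.
#
# result = self.getFamilyStructure().noOfParents2FamilyData
# result[2].parentTupleSet   -> {('F', 'M')}        # one nuclear family
# result[2].childIDSet       -> {'C'}
# result[2].individualIDSet  -> {'F', 'M', 'C'}
# result[0].parentTupleSet   -> {()}                # founders have zero parents
# result[0].childIDSet      would contain F, M, and any other founder node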
def run(self):
    """
    2011-7-11
    """
    self.setup_run()

    inputData = PassingData(jobDataLs=[])
    inputFile = self.registerOneInputFile(self.inputFname,
        folderName=self.pegasusFolderName)
    inputData.jobDataLs.append(PassingData(output=inputFile, jobLs=[]))
    noOfTotalSequences = self.getNoOfSequencesFromFasta(inputFastaFname=self.inputFname)

    registerReferenceData = self.registerBlastNucleotideDatabaseFile(
        ntDatabaseFname=self.databaseFname,
        input_site_handler=self.input_site_handler)
    ntDatabaseFileList = registerReferenceData.refFastaFList
    ntDatabaseFile = ntDatabaseFileList[0]

    if len(ntDatabaseFileList) < 4:
        # some nt-database index file is missing
        sys.stderr.write("Adding blast-db-making job ...")
        makeBlastDBJob = self.addMakeBlastDBJob(executable=self.formatdb,
            inputFile=ntDatabaseFile, transferOutput=True)
        # add the index files to the ntDatabaseFileList
        ntDatabaseFileList = [ntDatabaseFile] + makeBlastDBJob.outputList
        sys.stderr.write(".\n")
    else:
        makeBlastDBJob = None

    self.addJobs(inputData=inputData, outputDirPrefix=self.pegasusFolderName,
        ntDatabaseFileList=ntDatabaseFileList,
        noOfTotalSequences=noOfTotalSequences,
        transferOutput=True, makeBlastDBJob=makeBlastDBJob)
    self.end_run()
def addJobs(self, inputURL=None, relativePathList=None, outputDir="", username=None,
        password=None, transferOutput=True):
    """
    2012.6.27
    """
    if relativePathList is None:
        relativePathList = []
    sys.stderr.write("Adding wget jobs for %s inputs ... " % (len(relativePathList)))
    no_of_jobs = 0

    topOutputDir = outputDir
    topOutputDirJob = self.addMkDirJob(outputDir=topOutputDir)
    no_of_jobs += 1

    returnData = PassingData()
    returnData.jobDataLs = []
    for relativePath in relativePathList:
        # 2013.06.26 replace all "/" in relativePath in case it's a folder
        relativePathNoFolder = relativePath.replace('/', '_')
        logFile = File('%s.log' % (relativePathNoFolder))
        wgetJob = self.addWgetJob(executable=self.wget, url=inputURL,
            relativePath=relativePath,
            username=username, password=password,
            targetFolder=outputDir, logFile=logFile,
            cut_dir_number=self.cut_dir_number,
            parentJobLs=[topOutputDirJob], extraDependentInputLs=[],
            transferOutput=transferOutput,
            extraArguments=None, job_max_memory=50)
        returnData.jobDataLs.append(PassingData(jobLs=[wgetJob], file=wgetJob.output,
            fileLs=wgetJob.outputLs))
        no_of_jobs += 1
    sys.stderr.write("%s jobs.\n" % (no_of_jobs))
    return returnData
def returnLocusLowMapQualityIndicator(self, alignedReadLs=None, minMapQGoodRead=2,
        minFractionOfGoodRead=0.9):
    """
    2013.12.04
    """
    totalNoOfReads = 0
    noOfGoodReads = 0.0
    medianMapQ = -10
    mapQList = []
    for alignedRead in alignedReadLs:
        totalNoOfReads += 1
        mapQList.append(alignedRead.mapq)
        if alignedRead.mapq >= minMapQGoodRead:
            noOfGoodReads += 1
    if totalNoOfReads > 0:
        fractionOfGoodRead = noOfGoodReads / totalNoOfReads
        medianMapQ = numpy.median(mapQList)
    else:
        fractionOfGoodRead = -1
        medianMapQ = -10
    if fractionOfGoodRead >= minFractionOfGoodRead:
        locusLowMapQIndicator = 0
    else:
        locusLowMapQIndicator = 2
    return PassingData(locusLowMapQIndicator=locusLowMapQIndicator,
        totalNoOfReads=totalNoOfReads,
        noOfGoodReads=noOfGoodReads, fractionOfGoodRead=fractionOfGoodRead,
        medianMapQ=medianMapQ)
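# Hedged usage sketch for returnLocusLowMapQualityIndicator() above. Any object with
# a .mapq attribute works (e.g. a pysam aligned read); the numbers here are made up.
#
# class _FakeRead(object):
#     def __init__(self, mapq):
#         self.mapq = mapq
#
# # 4 of 5 reads have mapq >= 2, so fractionOfGoodRead = 0.8 < 0.9 and the
# # locus is flagged as low mapping quality (locusLowMapQIndicator == 2).
# reads = [_FakeRead(mapq) for mapq in (0, 10, 20, 30, 40)]
# stat = self.returnLocusLowMapQualityIndicator(alignedReadLs=reads,
#     minMapQGoodRead=2, minFractionOfGoodRead=0.9)
# print(stat.fractionOfGoodRead, stat.medianMapQ, stat.locusLowMapQIndicator)
# # -> 0.8 20.0 2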
def avgKey2DataLs(self, key2dataLs, no_of_key_columns=1, header=None):
    """
    1. take mean/median/stdev of every cell in dataLs,
    2. build a new header to reflect that
    """
    if header is None:
        header = []
    print(f"Averaging key2dataLs ({len(key2dataLs)} entries) ...", flush=True)
    newKey2DataLs = {}
    keyColHeader = header[:no_of_key_columns]
    valueColHeader = header[no_of_key_columns:]
    newValueColHeader = []
    no_of_value_columns = len(valueColHeader)
    for i in range(no_of_value_columns):
        valueColName = valueColHeader[i]
        newValueColHeader += ['mean_%s' % (valueColName),
            'median_%s' % (valueColName),
            'stdev_%s' % (valueColName)]

    for key, dataLs in key2dataLs.items():
        if key not in newKey2DataLs:
            newKey2DataLs[key] = []
        no_of_value_columns = len(dataLs)
        for i in range(no_of_value_columns):
            meanValue = numpy.mean(dataLs[i])
            medianValue = numpy.median(dataLs[i])
            stdev = numpy.std(dataLs[i])
            newKey2DataLs[key] += [meanValue, medianValue, stdev]
    print("Done.", flush=True)
    return PassingData(key2dataLs=newKey2DataLs,
        header=keyColHeader + newValueColHeader)
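# Hedged illustration of avgKey2DataLs() above, with made-up numbers. Each value
# cell in key2dataLs is a list of observations; the method collapses every cell into
# (mean, median, stdev) and expands the header accordingly.
#
# key2dataLs = {('chr1',): [[1.0, 2.0, 3.0], [10.0, 10.0]]}
# header = ['chromosome', 'depth', 'mapq']
# result = self.avgKey2DataLs(key2dataLs, no_of_key_columns=1, header=header)
# result.header     -> ['chromosome', 'mean_depth', 'median_depth', 'stdev_depth',
#                       'mean_mapq', 'median_mapq', 'stdev_mapq']
# result.key2dataLs -> {('chr1',): [2.0, 2.0, 0.8165..., 10.0, 10.0, 0.0]}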
def run(self):
    """
    11-13-05
        --db_connect()
        --parse_entrezgene_xml_file()
            --is_gi_valid_in_annot_assembly_table()
            --find_info_dict()
            --return_location_list()
            --submit_to_entrezgene_mapping_table()
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    sys.stderr.write("\tIn total, %d files to be processed.\n" % len(self.inputfiles))
    db = GenomeDatabase(drivername=self.drivername, username=self.db_user,
        password=self.db_passwd, hostname=self.hostname, database=self.dbname,
        schema=self.schema)
    db.setup(create_tables=False)  #2010-6-22
    session = db.session
    param_obj = PassingData(session=db.session, no_of_genes_already_in_db=0,
        no_of_entrezgene_mappings_already_in_db=0,
        no_of_total=0, no_of_into_db=0, report=self.report,
        no_of_commentaries_already_in_db=0,
        no_of_gene_segments_already_in_db=0, no_of_gene2go_already_in_db=0)
    for f in self.inputfiles:
        sys.stderr.write("%d/%d:\t%s\n" % (
            self.inputfiles.index(f) + 1, len(self.inputfiles), f))
        self.parse_xml_file(session, f, tax_id=self.tax_id, param_obj=param_obj)
    session.flush()
    if self.commit:
        session.commit()
    else:
        session.rollback()
def linkMapToReduce(self, mapEachIntervalData=None, preReduceReturnData=None,
        passingData=None, transferOutput=True, **keywords):
    """
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    return returnData
def parseInputFile(self, inputFname=None, **keywords):
    """
    2013.08.23
        if a program is adding a file to db-affiliated storage, this is used for parsing.
    """
    return PassingData()
def reduceEachInput(self, chromosome=None, passingData=None, mapEachIntervalDataLs=None,
        transferOutput=True, **keywords):
    """
    2013.07.10
        #. concatenate all the sub-Inputs into one
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    returnData.mapEachIntervalDataLs = mapEachIntervalDataLs

    #intervalJobLs = [pdata for pdata in mapEachIntervalDataLs]
    """
    realInputVolume = passingData.jobData.file.noOfIndividuals * \
        passingData.jobData.file.noOfLoci
    baseInputVolume = 200*20000
    walltime = self.scaleJobWalltimeOrMemoryBasedOnInput(
        realInputVolume=realInputVolume,
        baseInputVolume=baseInputVolume, baseJobPropertyValue=60,
        minJobPropertyValue=60, maxJobPropertyValue=500).value
    job_max_memory = self.scaleJobWalltimeOrMemoryBasedOnInput(
        realInputVolume=realInputVolume,
        baseInputVolume=baseInputVolume, baseJobPropertyValue=5000,
        minJobPropertyValue=5000, maxJobPropertyValue=10000).value
    """
    return returnData
def readThroughAndProvideSummary(self):
    """
    2013.08.30
        called by vervet/src/db/import/AddAlignmentDepthIntervalFile2DB.py
    """
    col_name2index = self.smartReadHeader()
    if col_name2index is None:
        # no header was detected; the row already consumed is treated as data
        pdata = self.parseRow(self._row)
        self._postProcessParsedRowDataForSummary(pdata)
    for row in self:
        pdata = self.parseRow(row)
        self._postProcessParsedRowDataForSummary(pdata)

    self.min_interval_length = numpy.min(self.interval_length_ls)
    self.max_interval_length = numpy.max(self.interval_length_ls)
    self.median_interval_length = numpy.median(self.interval_length_ls)

    self.mean_interval_value = numpy.mean(self.interval_value_ls)
    self.median_interval_value = numpy.median(self.interval_value_ls)
    return PassingData(
        no_of_intervals=self.no_of_intervals,
        chromosome_size=self.chromosome_size,
        mean_interval_value=self.mean_interval_value,
        median_interval_value=self.median_interval_value,
        min_interval_value=self.min_interval_value,
        max_interval_value=self.max_interval_value,
        min_interval_length=self.min_interval_length,
        max_interval_length=self.max_interval_length,
        median_interval_length=self.median_interval_length)
def mapEachAlignment(self, alignmentData=None, passingData=None, transferOutput=True,
        **keywords):
    """
    2012.9.22
        similar to reduceBeforeEachAlignmentData() but for mapping programs that run
        on one alignment each.

        passingData.alignmentJobAndOutputLs = []
        passingData.bamFnamePrefix = bamFnamePrefix
        passingData.individual_alignment = alignment
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []

    topOutputDirJob = passingData.topOutputDirJob
    refFastaF = passingData.refFastaFList[0]

    alignment = alignmentData.alignment
    parentJobLs = alignmentData.jobLs
    bamF = alignmentData.bamF
    baiF = alignmentData.baiF
    bamFnamePrefix = alignment.getReadGroup()
    return returnData
def estimateMeanStdFromData(dataVector=None, excludeTopFraction=0.2):
    """
    2012.10.14 adapted from vervet/src/pedigree/DetectWrongLabelByCompKinshipVsIBD.
        DetectWrongLabelByCompKinshipVsIBD.estimateAbsDeltaMeanStd()
    2012.8.22
    """
    sys.stderr.write("Estimating mean&std using the middle %.1f%% of data (n=%s) ..." % (
        (1 - excludeTopFraction) * 100, len(dataVector)))
    noOfRows = len(dataVector)
    import numpy
    # 2012.8.22 draw some histogram to check what data looks like
    # if len(dataVector)>10:
    #     outputFname = '%s_kinship_ibd_hist.png'%(self.outputFnamePrefix)
    #     yh_matplotlib.drawHist(dataVector, title='', \
    #         xlabel_1D="kinship-ibd", xticks=None, \
    #         outputFname=outputFname, min_no_of_data_points=10, \
    #         needLog=True, \
    #         dpi=200, min_no_of_bins=25)

    #dataVector = map(abs, dataVector)  #2012.8.23 no abs
    dataVector.sort()
    # keep the middle (1 - excludeTopFraction) of the sorted data;
    # clamp the start index at 0 so the slice never goes negative
    startIndex = max(0, int(len(dataVector) * (excludeTopFraction / 2)) - 1)
    stopIndex = int(len(dataVector) * (1 - excludeTopFraction / 2))
    dataVector = dataVector[startIndex:stopIndex]

    data_mean = numpy.mean(dataVector)
    data_std = numpy.std(dataVector)
    sys.stderr.write(" mean=%.3f, std=%.3f.\n" % (data_mean, data_std))
    return PassingData(mean=data_mean, std=data_std)
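# Hedged usage sketch for estimateMeanStdFromData() above; the numbers are made up.
# With excludeTopFraction=0.2 roughly the lowest and highest 10% of the sorted values
# are trimmed before mean/std are taken, which makes the estimate robust to outliers.
#
# import numpy
# data = list(numpy.random.normal(loc=0.0, scale=1.0, size=1000))
# data[0] = 500.0    # one gross outlier that the trimming should discard
# stat = estimateMeanStdFromData(dataVector=data, excludeTopFraction=0.2)
# print(stat.mean, stat.std)   # close to 0, and a std somewhat below 1 (trimmed)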
def parseFastaDescriptionForFullVervetBACs(self, descriptionLine=None,
        FigureOutTaxID_ins=None):
    """
    2011-7-6
        possible header lines:

        >gi|285026568|gb|AC239257.2| Chlorocebus aethiops chromosome UNK clone CH252-270J24, WORKING DRAFT SEQUENCE, 2 unordered pieces
        >gi|281332227|gb|AC238852.3| Chlorocebus aethiops BAC clone CH252-133A18 from chromosome 3, complete sequence
        >gi|285002488|gb|AC239185.3| Chlorocebus aethiops BAC clone CH252-404N12 from chromosome unknown, complete sequence
    """
    header = descriptionLine[1:-1]  #discard '>' and '\n'
    header = header.split('|')
    _tax_id = None

    # 1st type of clone description
    p_chromosome = re.compile(r'UNK clone ([^,]+),')
    # 2nd type of clone description
    p2_chromosome = re.compile(r'clone ([^,]+),')

    if p_chromosome.search(header[4]) is not None:
        chromosome = p_chromosome.search(header[4]).groups()[0]
    elif p2_chromosome.search(header[4]) is not None:
        chromosome = p2_chromosome.search(header[4]).groups()[0]
    else:
        chromosome = None
    gi = int(header[1])
    acc_ver = header[3]
    comment = header[4]
    return PassingData(tax_id=_tax_id, gi=gi, comment=comment, acc_ver=acc_ver,
        chromosome=chromosome)
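# Hedged illustration of the two clone-name patterns above, run against the comment
# field (header[4]) of the docstring's example headers. For reference only.
#
# import re
# comment = ('Chlorocebus aethiops chromosome UNK clone CH252-270J24, '
#            'WORKING DRAFT SEQUENCE, 2 unordered pieces')
# re.search(r'UNK clone ([^,]+),', comment).groups()[0]
# # -> 'CH252-270J24'
#
# comment = 'Chlorocebus aethiops BAC clone CH252-133A18 from chromosome 3, complete sequence'
# re.search(r'clone ([^,]+),', comment).groups()[0]
# # -> 'CH252-133A18 from chromosome 3'   (everything between "clone " and the first comma)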
def parseFastaDescriptionForWUSTLVervetScaffolds(self, descriptionLine=None,
        FigureOutTaxID_ins=None):
    """
    2011-7-6
        possible header lines:

        >Contig0  12652774  13406928
    """
    header = descriptionLine[1:-1]  #discard '>' and '\n'
    header = header.split()
    chromosome = header[0]  #contig name is taken as chromosome
    """
    p_chromosome = re.compile(r'Contig(\d+)')
    if p_chromosome.search(header[0]) is not None:
        chromosome = p_chromosome.search(header[0]).groups()[0]
    else:
        chromosome = None
    """
    gi = None
    acc_ver = None
    comment = None
    return PassingData(tax_id=None, gi=gi, comment=comment, acc_ver=acc_ver,
        chromosome=chromosome)
def parseFastaDescriptionForGenebank_hs37d5(self, descriptionLine=None,
        FigureOutTaxID_ins=None):
    """
    possible header lines:

    >1 dna:chromosome chromosome:GRCh37:1:1:249250621:1
    >Y dna:chromosome chromosome:GRCh37:Y:2649521:59034049:1
    >MT gi|251831106|ref|NC_012920.1| Homo sapiens mitochondrion, complete genome
    >GL000207.1 dna:supercontig supercontig::GL000207.1:1:4262:1
    >GL000226.1 dna:supercontig supercontig::GL000226.1:1:15008:1
    >NC_007605
    >hs37d5
    """
    header = descriptionLine[1:-1]  #discard '>' and '\n'
    headerList = header.split()
    chromosome = headerList[0]
    comment = ' '.join(headerList[1:])
    gi = None
    acc_ver = None

    accitem = re.compile(r'supercontig')
    if accitem.search(header) is not None:
        acc_ver = headerList[0]
    else:
        commentSplit = comment.split("|")
        if len(commentSplit) > 4:
            # deal with the MT header, whose comment embeds gi|...|ref|accession|description
            gi = int(commentSplit[1])
            acc_ver = commentSplit[3]
            comment = commentSplit[4]
    return PassingData(tax_id=None, gi=gi, comment=comment, acc_ver=acc_ver,
        chromosome=chromosome)
def parseFastaDescriptionForGenBank(self, descriptionLine=None, FigureOutTaxID_ins=None):
    """
    possible header lines:

    >gi|51511461|ref|NC_000001.8|NC_000001 Homo sapiens chromosome 1, complete sequence
    >gi|186497660|ref|NC_003070.6| Arabidopsis thaliana chromosome 1, complete sequence
    >gi|26556996|ref|NC_001284.2| Arabidopsis thaliana mitochondrion, complete genome
    >gi|115442598|ref|NC_008394.1| Oryza sativa (japonica cultivar-group) genomic DNA, chromosome 1
    """
    header = descriptionLine[1:-1]  #discard '>' and '\n'
    header = header.split('|')
    _tax_id = FigureOutTaxID_ins.returnTaxIDGivenSentence(header[4])

    if self.p_chromosome.search(header[4]) is not None:
        chromosome = self.p_chromosome.search(header[4]).groups()[0]
    elif header[4].find('mitochondrion') != -1:
        chromosome = 'mitochondrion'
    elif header[4].find('chloroplast') != -1:
        chromosome = 'chloroplast'
    else:
        # something else; take everything before the first ','
        chromosome = header[4].split(',')[0]

    gi = int(header[1])
    acc_ver = header[3]
    comment = header[4]
    return PassingData(tax_id=_tax_id, gi=gi, comment=comment, acc_ver=acc_ver,
        chromosome=chromosome)
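# Hedged illustration of how parseFastaDescriptionForGenBank() slices a header,
# using the first example line from the docstring. Not part of the parser.
#
# line = '>gi|51511461|ref|NC_000001.8|NC_000001 Homo sapiens chromosome 1, complete sequence\n'
# fields = line[1:-1].split('|')
# fields[1]  -> '51511461'       (gi, cast to int)
# fields[3]  -> 'NC_000001.8'    (acc_ver)
# fields[4]  -> 'NC_000001 Homo sapiens chromosome 1, complete sequence'
#               (comment; also what the chromosome pattern and the taxonomy lookup run on)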
def reduceEachChromosome(self, chromosome=None, passingData=None, mapEachInputDataLs=None,
        chromosome2mapEachIntervalDataLs=None,
        reduceEachInputDataLs=None,
        transferOutput=True,
        **keywords):
    """
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    returnData.mapEachInputDataLs = mapEachInputDataLs
    returnData.reduceEachInputDataLs = reduceEachInputDataLs

    #reduce matrix by chosen column and average p-value
    outputFile = File(os.path.join(self.reduceEachChromosomeDirJob.output,
        'chr_%s_LocusLiftOverProbability.tsv.gz' % (chromosome)))
    reduceChromosomeJob = self.addStatMergeJob(
        statMergeProgram=self.mergeSameHeaderTablesIntoOne,
        outputF=outputFile,
        parentJobLs=[self.reduceEachChromosomeDirJob], extraOutputLs=None,
        extraDependentInputLs=None, transferOutput=False)
        #extraArgumentList=['--keyColumnLs 0-6 --valueColumnLs 7'],

    mapEachIntervalDataLs = chromosome2mapEachIntervalDataLs.get(chromosome)
    for mapEachIntervalData in mapEachIntervalDataLs:
        for jobData in mapEachIntervalData.jobDataLs:
            self.addInputToMergeJob(reduceChromosomeJob, parentJobLs=[jobData.job])

    #add the reduction job to the final stat merge job
    self.addInputToMergeJob(self.reduceJob, parentJobLs=[reduceChromosomeJob])
    return returnData
def openWriteBeagleFiles(self, pedigreeFamilyData=None, outputFnamePrefix=None):
    """
    2013.05.02
        The non-likelihood (unphased, trios, pairs) Beagle format:

            I id sample1 sample1 sample2 sample2
            A diabetes 1 1 2 2
            M rs12082861 C C C C
            M rs4912233 T C C C
            M rs12732823 G A A A
            M rs17451521 C C C C
            M rs12033358 C T T T

        The likelihood version is:

            marker alleleA alleleB 1000_709_1996093_GA_vs_524 1000_709_1996093_GA_vs_524 1000_709_1996093_GA_vs_524 1001_710_1995025_GA_vs_524 1001_710_1995025_GA_vs_524 1001_710_1995025_GA_vs_524 1002_711_2001039_GA_vs_524
            Contig791:1086 C A 0.9693 0.0307 0.0000 0.6660 0.3338 0.0003 0.0000
            Contig791:1649 G C 0.9406 0.0594 0.0000 0.9693 0.0307 0.0000 0.0000
            Contig791:4084 A C 0.9980 0.0020 0.0000 0.9844 0.0156 0.0000 0.0000

        The markers file has this format (markerID, position, alleleA, alleleB):

            Contig791:1086 1086 C A
    """
    sys.stderr.write("Opening beagle files (outputFnamePrefix=%s) to write ..." % (
        outputFnamePrefix))
    familySize2BeagleFileHandler = {}
    familySize2SampleIDList = pedigreeFamilyData.familySize2SampleIDList
    counter = 0
    for familySize, sampleIDList in familySize2SampleIDList.items():
        if familySize not in familySize2BeagleFileHandler:
            tmpOutputFnamePrefix = '%s_familySize%s' % (outputFnamePrefix, familySize)
            writer = MatrixFile(path='%s.bgl' % (tmpOutputFnamePrefix), mode='w',
                delimiter=' ')
            familySize2BeagleFileHandler[familySize] = writer
            if familySize == 1:
                headerRow = ['marker', 'alleleA', 'alleleB']
            else:
                headerRow = ['I', 'id']
            for sampleID in sampleIDList:
                if familySize == 1:
                    #likelihood format has each sample name replicated three times, rather than twice
                    headerRow.extend([sampleID] * 3)
                else:
                    headerRow.extend([sampleID] * 2)
            writer.writeHeader(headerRow)
            counter += 1
    markersFile = MatrixFile(path='%s.markers' % (outputFnamePrefix), mode='w',
        delimiter=' ')
    counter += 1
    sys.stderr.write("%s files outputted.\n" % (counter))
    return PassingData(
        familySize2BeagleFileHandler=familySize2BeagleFileHandler,
        markersFile=markersFile)
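# Hedged illustration of the Beagle header construction above, using made-up sample
# IDs. In the likelihood format (familySize == 1) each sample column is replicated
# three times (one column per genotype likelihood); otherwise each sample is
# replicated twice (one column per allele).
#
# sampleIDList = ['sample1', 'sample2']    # hypothetical sample IDs
# likelihoodHeader = ['marker', 'alleleA', 'alleleB']
# for sampleID in sampleIDList:
#     likelihoodHeader.extend([sampleID] * 3)
# # -> ['marker', 'alleleA', 'alleleB', 'sample1', 'sample1', 'sample1',
# #     'sample2', 'sample2', 'sample2']
#
# unphasedHeader = ['I', 'id']
# for sampleID in sampleIDList:
#     unphasedHeader.extend([sampleID] * 2)
# # -> ['I', 'id', 'sample1', 'sample1', 'sample2', 'sample2']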
def parseQueryLocusID(self, locus_id=None):
    """
    2012.10.8
        locus_id is in the format of
            '%s_%s_%s_positionInFlank%s'%(chromosome, start, stop, flankingLength+1)
        output of ExtractFlankingSequenceForVCFLoci.py
    """
    search_result = ExtractFlankingSequenceForVCFLoci.sequenceTitlePattern.search(
        locus_id)
    chromosome = None
    start = None
    stop = None
    refBase = None
    altBase = None
    positionInFlank = None
    if search_result:
        chromosome = search_result.group(1)
        start = int(search_result.group(2))
        stop = int(search_result.group(3))
        refBase = search_result.group(4)
        altBase = search_result.group(5)
        positionInFlank = int(search_result.group(6))
    return PassingData(chromosome=chromosome, start=start, stop=stop, refBase=refBase,
        altBase=altBase, positionInFlank=positionInFlank)
def reduce(self, passingData=None, reduceEachChromosomeDataLs=None, transferOutput=True,
        **keywords):
    """
    #. merge all output of input jobs (passingData.mapEachIntervalDataLsLs) into one big one
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    reduceOutputDirJob = passingData.reduceOutputDirJob

    realInputVolume = passingData.jobData.file.noOfIndividuals * \
        passingData.jobData.file.noOfLoci
    baseInputVolume = 200 * 20000
    walltime = self.scaleJobWalltimeOrMemoryBasedOnInput(
        realInputVolume=realInputVolume,
        baseInputVolume=baseInputVolume,
        baseJobPropertyValue=60,
        minJobPropertyValue=60, maxJobPropertyValue=500).value
    job_max_memory = self.scaleJobWalltimeOrMemoryBasedOnInput(
        realInputVolume=realInputVolume,
        baseInputVolume=baseInputVolume,
        baseJobPropertyValue=5000,
        minJobPropertyValue=5000, maxJobPropertyValue=10000).value

    outputFile = File(os.path.join(reduceOutputDirJob.output, 'sameSiteConcordance.tsv'))
    reduceJob = self.addStatMergeJob(
        statMergeProgram=self.mergeSameHeaderTablesIntoOne,
        outputF=outputFile,
        parentJobLs=[reduceOutputDirJob],
        transferOutput=transferOutput)
    returnData.jobDataLs.append(PassingData(jobLs=[reduceJob], file=reduceJob.output,
        fileLs=[reduceJob.output]))

    for mapEachIntervalDataLs in passingData.mapEachIntervalDataLsLs:
        for mapEachIntervalData in mapEachIntervalDataLs:
            self.addInputToMergeJob(reduceJob,
                parentJobLs=[mapEachIntervalData.mapJob])
    return returnData
def run(self):
    """
    in case the whole figure is chopped into blocks, swap col_block_index and
    row_block_index to make row first, column 2nd
    """
    from palos.polymorphism.SNP import read_data
    from palos.utils import figureOutDelimiter, PassingData
    delimiter = figureOutDelimiter(self.input_fname)
    print(delimiter)
    header, row_label_ls1, row_label_ls2, data_matrix = read_data(
        self.input_fname, matrix_data_type=float, delimiter='\t')

    import numpy
    data_matrix = numpy.array(data_matrix)
    min_value = numpy.min(data_matrix)
    if self.min_value_non_negative and min_value < 0:
        min_value = 0
    max_value = numpy.max(data_matrix)
    font = get_font(self.font_path, font_size=self.font_size)
    Value2Color.special_value2color[-2] = self.super_value_color
    value2color_func = lambda x: Value2Color.value2HSLcolor(x, min_value, max_value)
    im_legend = drawContinousLegend(min_value, max_value, self.no_of_ticks,
        value2color_func, font)

    fig_fname_prefix = os.path.splitext(self.fig_fname)[0]
    if self.split_legend_and_matrix:
        im_legend.save('%s_legend.png' % fig_fname_prefix)

    no_of_rows, no_of_cols = data_matrix.shape
    passParam = PassingData(
        value2color_func=value2color_func,
        im_legend=im_legend,
        font=font,
        split_legend_and_matrix=self.split_legend_and_matrix,
        no_grid=self.no_grid)
    if no_of_cols <= self.blockColUnit:
        self._drawMatrix(data_matrix, row_label_ls1, header[2:], self.fig_fname,
            passParam)
    else:
        # split into blocks (integer division so the block counts stay ints)
        no_of_col_blocks = no_of_cols // self.blockColUnit + 1
        no_of_row_blocks = no_of_rows // self.blockRowUnit + 1
        for i in range(no_of_col_blocks):
            col_start_index = i * self.blockColUnit
            col_end_index = (i + 1) * self.blockColUnit
            if col_start_index < no_of_cols:
                for j in range(no_of_row_blocks):
                    row_start_index = j * self.blockRowUnit
                    row_end_index = (j + 1) * self.blockRowUnit
                    if row_start_index < no_of_rows:
                        #row first, column 2nd
                        fig_fname = '%s_%s_%s.png' % (fig_fname_prefix, j, i)
                        self._drawMatrix(
                            data_matrix[row_start_index:row_end_index,
                                col_start_index:col_end_index],
                            row_label_ls1[row_start_index:row_end_index],
                            header[2 + col_start_index:2 + col_end_index],
                            fig_fname, passParam)
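# Hedged worked example of the block arithmetic above, with made-up sizes.
# A 250-column matrix with blockColUnit=100 gives no_of_col_blocks = 250 // 100 + 1 = 3,
# i.e. column slices [0:100], [100:200], [200:300] (numpy clips the last slice to 250).
# An exact multiple, e.g. 200 columns, also gives 3, but the empty third block is
# skipped by the "col_start_index < no_of_cols" guard. Rows are handled the same way.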
def preReduce(self, passingData=None, transferOutput=True, **keywords):
    """
    setup additional mkdir folder jobs, before mapEachAlignment, mapEachChromosome,
    mapReduceOneAlignment
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    return returnData
def initiatePassingData(self):
    """
    this function gets called at the beginning of each fileWalker() (for each inputFname)
    """
    pdata = PassingData(x_ls=[], y_ls=[], invariantPData=self.invariantPData)
    #2012.8.16 pass to global data
    self.invariantPData.y_ls = pdata.y_ls
    self.invariantPData.x_ls = pdata.x_ls
    return pdata
def reduce(self, passingData=None, reduceAfterEachAlignmentDataLs=None,
        transferOutput=True, **keywords):
    """
    2012.9.17
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    returnData.reduceAfterEachAlignmentDataLs = reduceAfterEachAlignmentDataLs
    return returnData
def reduceAfterEachChromosome(self, chromosome=None, passingData=None,
        transferOutput=True, mapEachIntervalDataLs=None, **keywords):
    """
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    returnData.mapEachIntervalDataLs = mapEachIntervalDataLs
    return returnData
def parseRow(self, row):
    """
    """
    start, stop, length, depth = row[:4]
    start = int(start)
    stop = int(stop)
    length = int(length)
    depth = float(depth)
    return PassingData(start=start, stop=stop, length=length, depth=depth)
def initiatePassingData(self):
    """
    this function gets called at the beginning of each fileWalker() (for each inputFname).
    """
    pdata = PassingData(x_ls=[], y_ls=[], z_ls=[], invariantPData=self.invariantPData)
    return pdata
def map(self, alignmentData=None, intervalData=None,
        VCFJobData=None, passingData=None, mapEachChromosomeData=None,
        transferOutput=True, **keywords):
    """
    2012.9.17
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    return returnData
def mapEachChromosome(self, alignmentData=None, chromosome=None,
        VCFJobData=None, passingData=None, reduceBeforeEachAlignmentData=None,
        transferOutput=True, **keywords):
    """
    2012.9.17
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    return returnData
def reduceBeforeEachAlignment(self, passingData=None, transferOutput=True, **keywords):
    """
    2012.9
        setup some reduce jobs before the loop over all intervals of one alignment begins.
        these reduce jobs will collect stuff from each map() job.
        the link will be established in linkMapToReduce().
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    return returnData