Example #1
 def getFamilyStructure(self):
     """
     2013.07.19
     """
     sys.stderr.write("Finding unique pairs (singletons or groups) of parents ...\n ")
     noOfParents2FamilyData = {}
     for nodeID in self.pedigreeGraph:
         parents = list(self.pedigreeGraph.predecessors(nodeID))	#list() in case predecessors() returns an iterator (newer networkx)
         noOfParents = len(parents)
         if noOfParents not in noOfParents2FamilyData:
             noOfParents2FamilyData[noOfParents] = PassingData(
                 parentTupleSet=set(), parentIDSet=set(), childIDSet=set(),\
                 individualIDSet=set())
         parents.sort()
         noOfParents2FamilyData[noOfParents].parentTupleSet.add(tuple(parents))
         for parentID in parents:
             noOfParents2FamilyData[noOfParents].parentIDSet.add(parentID)
             noOfParents2FamilyData[noOfParents].individualIDSet.add(parentID)
         noOfParents2FamilyData[noOfParents].childIDSet.add(nodeID)
         noOfParents2FamilyData[noOfParents].individualIDSet.add(nodeID)
     
     #the two-parent entry, if present, describes the nuclear families
     familyDataForTwoParents = noOfParents2FamilyData.get(2)
     noOfNuclearFamilies = len(familyDataForTwoParents.parentTupleSet) if familyDataForTwoParents else 0
     
     self._reportFamilyStructure(noOfParents2FamilyData)
     return PassingData(noOfParents2FamilyData=noOfParents2FamilyData)
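
Every example in this collection returns a PassingData instance, a plain keyword-to-attribute container from the palos package (Example #21 imports it with "from palos.utils import PassingData"). A minimal sketch of how such a return value is built and read back, using only the behavior visible in these examples:

from palos.utils import PassingData

#build the same kind of container getFamilyStructure() stores per parent count
familyData = PassingData(parentTupleSet=set(), parentIDSet=set(),
    childIDSet=set(), individualIDSet=set())
familyData.parentTupleSet.add(('parentA', 'parentB'))
familyData.childIDSet.add('child1')

#constructor keywords become attributes that are read back directly
print(len(familyData.parentTupleSet), len(familyData.childIDSet))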
Example #2
    def run(self):
        """
        2011-7-11
        """
        self.setup_run()
        
        inputData = PassingData(jobDataLs = [])
        inputFile = self.registerOneInputFile(self.inputFname, folderName=self.pegasusFolderName)
        inputData.jobDataLs.append(PassingData(output=inputFile, jobLs=[]))
        noOfTotalSequences= self.getNoOfSequencesFromFasta(inputFastaFname=self.inputFname)
        
        registerReferenceData = self.registerBlastNucleotideDatabaseFile(
            ntDatabaseFname=self.databaseFname, \
            input_site_handler=self.input_site_handler)
        ntDatabaseFileList = registerReferenceData.refFastaFList
        ntDatabaseFile = ntDatabaseFileList[0]

        if len(ntDatabaseFileList)<4:	#some nt-database index file is missing
            sys.stderr.write("Adding blast-db-making job...")
            makeBlastDBJob = self.addMakeBlastDBJob(executable=self.formatdb,\
                inputFile=ntDatabaseFile, transferOutput=True)
            #add the index files to the ntDatabaseFileList
            ntDatabaseFileList = [ntDatabaseFile] + makeBlastDBJob.outputList
            sys.stderr.write(".\n")
        else:
            makeBlastDBJob = None
        
        self.addJobs(inputData=inputData, outputDirPrefix=self.pegasusFolderName,
            ntDatabaseFileList=ntDatabaseFileList, \
            noOfTotalSequences=noOfTotalSequences, \
            transferOutput=True, makeBlastDBJob=makeBlastDBJob)
        
        self.end_run()
Example #3
    def addJobs(self, inputURL=None, relativePathList =[], outputDir="", username=None, password=None, \
            transferOutput=True):
        """
        2012.6.27
        """
        sys.stderr.write("Adding wget jobs for %s input ... " %
                         (len(relativePathList)))
        no_of_jobs = 0

        topOutputDir = outputDir
        topOutputDirJob = self.addMkDirJob(outputDir=topOutputDir)
        no_of_jobs += 1
        returnData = PassingData()
        returnData.jobDataLs = []

        for relativePath in relativePathList:
            #2013.06.26 remove all "/" from  relativePath in case it's a folder
            relativePathNoFolder = relativePath.replace('/', '_')
            logFile = File('%s.log' % (relativePathNoFolder))
            wgetJob = self.addWgetJob(executable=self.wget, url=inputURL,
                relativePath=relativePath, \
                username=username, password=password,\
                targetFolder=outputDir, logFile=logFile,
                cut_dir_number=self.cut_dir_number,
                parentJobLs=[topOutputDirJob], extraDependentInputLs=[], \
                transferOutput=transferOutput, \
                extraArguments=None, job_max_memory=50)
            #keep every output file of the wget job in fileLs
            returnData.jobDataLs.append(PassingData(jobLs=[wgetJob], file=wgetJob.output, \
                fileLs=wgetJob.outputLs))
            no_of_jobs += 1
        sys.stderr.write("%s jobs.\n" % (no_of_jobs))

        return returnData
Example #4
	def returnLocusLowMapQualityIndicator(self, alignedReadLs=None, minMapQGoodRead=2, minFractionOfGoodRead=0.9):
		"""
		2013.12.04
		"""
		totalNoOfReads = 0
		noOfGoodReads = 0.0
		medianMapQ=-10
		mapQList=[]
		for alignedRead in alignedReadLs:
			totalNoOfReads +=1
			mapQList.append(alignedRead.mapq)
			if alignedRead.mapq>=minMapQGoodRead:
				noOfGoodReads += 1
			else:
				pass
		if totalNoOfReads>0:
			fractionOfGoodRead = noOfGoodReads/(totalNoOfReads)
			medianMapQ = numpy.median(mapQList)
		else:
			fractionOfGoodRead = -1
			medianMapQ = -10
			
		if fractionOfGoodRead>=minFractionOfGoodRead:
			locusLowMapQIndicator = 0
		else:
			locusLowMapQIndicator = 2
		return PassingData(locusLowMapQIndicator=locusLowMapQIndicator, totalNoOfReads=totalNoOfReads, \
						noOfGoodReads=noOfGoodReads, fractionOfGoodRead=fractionOfGoodRead,\
						medianMapQ=medianMapQ)
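
A small sketch of the quantities Example #4 computes, using mock reads; the method only reads the mapq attribute of each aligned read, so a SimpleNamespace stands in for a pysam record here:

import numpy
from types import SimpleNamespace

#mock aligned reads; only the .mapq attribute is consulted above
alignedReadLs = [SimpleNamespace(mapq=q) for q in (0, 5, 30, 40, 50)]

mapQList = [alignedRead.mapq for alignedRead in alignedReadLs]
fractionOfGoodRead = sum(q >= 2 for q in mapQList) / float(len(mapQList))
print(fractionOfGoodRead, numpy.median(mapQList))
#0.8 is below the 0.9 default threshold, so the method above would
#return locusLowMapQIndicator=2 (a low-mapping-quality locus)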
Example #5
    def avgKey2DataLs(self, key2dataLs, no_of_key_columns=1, header=[]):
        """
        1. take mean/median/stdev of every cell in dataLs,
        2. modify newHeader to reflect that
        """
        print(f"Averaging key2dataLs ({len(key2dataLs)} entries ) ...",
              flush=True)
        newKey2DataLs = {}
        newHeader = []
        keyColHeader = header[:no_of_key_columns]
        valueColHeader = header[no_of_key_columns:]
        newValueColHeader = []
        no_of_value_columns = len(valueColHeader)
        for i in range(no_of_value_columns):
            valueColName = valueColHeader[i]
            newValueColHeader += [
                'mean_%s' % (valueColName),
                'median_%s' % (valueColName),
                'stdev_%s' % (valueColName)
            ]

        for key, dataLs in key2dataLs.items():
            if key not in newKey2DataLs:
                newKey2DataLs[key] = []
            no_of_value_columns = len(dataLs)
            for i in range(no_of_value_columns):
                meanValue = numpy.mean(dataLs[i])
                medianValue = numpy.median(dataLs[i])
                stdev = numpy.std(dataLs[i])
                newKey2DataLs[key] += [meanValue, medianValue, stdev]
        print(f"Done.", flush=True)
        return PassingData(key2dataLs=newKey2DataLs,
                           header=keyColHeader + newValueColHeader)
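
A hedged sketch of the data shapes Example #5 expects: key2dataLs maps a key to one list of values per value column, and the header lists the key columns first. The exact key type is an assumption here; only the column layout is taken from the code above.

import numpy

#one key ('chr1') with two value columns ('depth', 'quality'), three rows each
key2dataLs = {'chr1': [[10.0, 12.0, 14.0], [30.0, 35.0, 40.0]]}
header = ['chromosome', 'depth', 'quality']

#what avgKey2DataLs() appends for the first value column of that key
depthLs = key2dataLs['chr1'][0]
print(numpy.mean(depthLs), numpy.median(depthLs), numpy.std(depthLs))
#and the returned header becomes:
#['chromosome', 'mean_depth', 'median_depth', 'stdev_depth',
# 'mean_quality', 'median_quality', 'stdev_quality']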
Example #6
	def run(self):
		"""
		11-13-05 
			--db_connect()
			--parse_entrezgene_xml_file()
				--is_gi_valid_in_annot_assembly_table()
				--find_info_dict()
					--return_location_list()
				--submit_to_entrezgene_mapping_table()
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		
		sys.stderr.write("\tTotally, %d files to be processed.\n"%len(self.inputfiles))
		db = GenomeDatabase(drivername=self.drivername, username=self.db_user,
						password=self.db_passwd, hostname=self.hostname, database=self.dbname, schema=self.schema)
		db.setup(create_tables=False)	#2010-6-22
		session = db.session
		param_obj = PassingData(session=db.session, no_of_genes_already_in_db=0, no_of_entrezgene_mappings_already_in_db=0,\
					no_of_total=0, no_of_into_db=0, report=self.report, no_of_commentaries_already_in_db=0,\
					no_of_gene_segments_already_in_db=0, no_of_gene2go_already_in_db=0)
		for fileIndex, f in enumerate(self.inputfiles):
			sys.stderr.write("%d/%d:\t%s\n"%(fileIndex+1, len(self.inputfiles), f))
			self.parse_xml_file(session, f, tax_id=self.tax_id, param_obj=param_obj)
		
		session.flush()
		if self.commit:
			session.commit()
		else:
			session.rollback()
Example #7
 def linkMapToReduce(self, mapEachIntervalData=None,
     preReduceReturnData=None, passingData=None, transferOutput=True, **keywords):
     """
     """
     returnData = PassingData(no_of_jobs = 0)
     returnData.jobDataLs = []
     return returnData
Example #8
 def parseInputFile(self, inputFname=None, **keywords):
     """
     2013.08.23
         if a program is adding a file to db-affiliated storage,
          this is used for parsing.
     """
     return PassingData()
Example #9
    def reduceEachInput(self,
                        chromosome=None,
                        passingData=None,
                        mapEachIntervalDataLs=None,
                        transferOutput=True,
                        **keywords):
        """
        2013.07.10
            #. concatenate all the sub-Inputs into one
        """
        returnData = PassingData(no_of_jobs=0)
        returnData.jobDataLs = []
        returnData.mapEachIntervalDataLs = mapEachIntervalDataLs

        #intervalJobLs = [pdata for pdata in mapEachIntervalDataLs]
        """
        realInputVolume = passingData.jobData.file.noOfIndividuals * \
            passingData.jobData.file.noOfLoci
        baseInputVolume = 200*20000
        walltime = self.scaleJobWalltimeOrMemoryBasedOnInput(
            realInputVolume=realInputVolume, \
            baseInputVolume=baseInputVolume, baseJobPropertyValue=60,
            minJobPropertyValue=60, maxJobPropertyValue=500).value
        job_max_memory = self.scaleJobWalltimeOrMemoryBasedOnInput(
            realInputVolume=realInputVolume, \
            baseInputVolume=baseInputVolume, baseJobPropertyValue=5000,
            minJobPropertyValue=5000, maxJobPropertyValue=10000).value
        """
        return returnData
Example #10
 def readThroughAndProvideSummary(self):
     """
     2013.08.30
         called by vervet/src/db/import/AddAlignmentDepthIntervalFile2DB.py
     """
     col_name2index= self.smartReadHeader()
     if col_name2index is None:
         pdata = self.parseRow(self._row)
         self._postProcessParsedRowDataForSummary(pdata)
     
     for row in self:
         pdata = self.parseRow(row)
         self._postProcessParsedRowDataForSummary(pdata)
     
     
     self.min_interval_length = numpy.min(self.interval_length_ls)
     self.max_interval_length = numpy.max(self.interval_length_ls)
     self.median_interval_length = numpy.median(self.interval_length_ls)
     
     self.mean_interval_value=numpy.mean(self.interval_value_ls)
     self.median_interval_value=numpy.median(self.interval_value_ls)
     return PassingData(
         no_of_intervals=self.no_of_intervals,
         chromosome_size=self.chromosome_size,
         mean_interval_value=self.mean_interval_value,
         median_interval_value=self.median_interval_value,
         min_interval_value=self.min_interval_value,
         max_interval_value=self.max_interval_value,
         
         min_interval_length=self.min_interval_length,
         max_interval_length=self.max_interval_length,
         median_interval_length=self.median_interval_length)
Example #11
    def mapEachAlignment(self, alignmentData=None,  passingData=None,
        transferOutput=True, **keywords):
        """
        2012.9.22
            similar to reduceBeforeEachAlignmentData() but
             for mapping programs that run on one alignment each.

            passingData.alignmentJobAndOutputLs = []
            passingData.bamFnamePrefix = bamFnamePrefix
            passingData.individual_alignment = alignment
        """
        returnData = PassingData(no_of_jobs = 0)
        returnData.jobDataLs = []

        topOutputDirJob = passingData.topOutputDirJob
        refFastaF = passingData.refFastaFList[0]

        alignment = alignmentData.alignment
        parentJobLs = alignmentData.jobLs
        bamF = alignmentData.bamF
        baiF = alignmentData.baiF

        bamFnamePrefix = alignment.getReadGroup()

        return returnData
Example #12
def estimateMeanStdFromData(dataVector=None, excludeTopFraction=0.2):
    """
    2012.10.14
        adapted from vervet/src/pedigree/DetectWrongLabelByCompKinshipVsIBD.DetectWrongLabelByCompKinshipVsIBD.estimateAbsDeltaMeanStd()
    2012.8.22
    """
    sys.stderr.write("Estimating mean&std using the middle %.1f%% of data (n=%s) ..."%\
        ((1-excludeTopFraction)*100, len(dataVector)))
    noOfRows = len(dataVector)
    import numpy
    # 2012.8.22 draw some histogram to check what data looks like
    #		if len(dataVector)>10:
    #			outputFname = '%s_kinship_ibd_hist.png'%(self.outputFnamePrefix)
    #			yh_matplotlib.drawHist(dataVector, title='', \
    #							xlabel_1D="kinship-ibd", xticks=None, \
    #							outputFname=outputFname, min_no_of_data_points=10, \
    #							needLog=True, \
    #							dpi=200, min_no_of_bins=25)
    #dataVector = map(abs, dataVector)	#2012.8.23 no abs
    dataVector.sort()
    startIndex = max(0, int(len(dataVector) * (excludeTopFraction / 2)) - 1)	#max(), not min(): keep the slice start non-negative
    stopIndex = int(len(dataVector) * (1 - excludeTopFraction / 2))
    dataVector = dataVector[startIndex:stopIndex]

    data_mean = numpy.mean(dataVector)
    data_std = numpy.std(dataVector)

    sys.stderr.write(" mean=%.3f, std=%.3f.\n" % (data_mean, data_std))
    return PassingData(mean=data_mean, std=data_std)
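
A short usage sketch for estimateMeanStdFromData(); the numbers are made up, and the function is assumed to have sys and PassingData available at module level as in the other examples.

#18 well-behaved values plus one outlier at each end
dataVector = [1.0 + 0.01 * i for i in range(18)] + [25.0, -25.0]
result = estimateMeanStdFromData(dataVector=dataVector, excludeTopFraction=0.2)
#roughly the top and bottom 10% of the sorted vector are trimmed first,
#so the two outliers do not dominate the estimate
print(result.mean, result.std)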
Example #13
    def parseFastaDescriptionForFullVervetBACs(self, descriptionLine=None,
        FigureOutTaxID_ins=None):
        """
        2011-7-6
            
        possible header lines:
            
>gi|285026568|gb|AC239257.2| Chlorocebus aethiops chromosome UNK clone CH252-270J24, WORKING DRAFT SEQUENCE, 2 unordered pieces
>gi|281332227|gb|AC238852.3| Chlorocebus aethiops BAC clone CH252-133A18 from chromosome 3, complete sequence
>gi|285002488|gb|AC239185.3| Chlorocebus aethiops BAC clone CH252-404N12 from chromosome unknown, complete sequence


        """
        header = descriptionLine[1:-1]	#discard '>' and '\n'
        header = header.split('|')
        _tax_id = None
        # 1st type of clone description
        p_chromosome = re.compile(r'UNK clone ([^,]+),')
        # 2nd type of clone description
        p2_chromosome = re.compile(r'clone ([^,]+),')
        
        if p_chromosome.search(header[4]) is not None:
            chromosome = p_chromosome.search(header[4]).groups()[0]
        elif p2_chromosome.search(header[4]) is not None:
            chromosome = p2_chromosome.search(header[4]).groups()[0]
        else:
            chromosome = None
        gi = int(header[1])
        acc_ver = header[3]
        comment = header[4]
        return PassingData(tax_id=_tax_id, gi=gi, comment=comment, 
            acc_ver=acc_ver, chromosome=chromosome)
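
A standalone restatement of the header parsing in Example #13, applied to the second sample header from the docstring; the two regexes are copied verbatim from the method above.

import re

p_chromosome = re.compile(r'UNK clone ([^,]+),')	#1st type of clone description
p2_chromosome = re.compile(r'clone ([^,]+),')	#2nd type of clone description

line = ">gi|281332227|gb|AC238852.3| Chlorocebus aethiops BAC clone CH252-133A18 from chromosome 3, complete sequence\n"
header = line[1:-1].split('|')	#discard '>' and '\n', then split on '|'

match = p_chromosome.search(header[4]) or p2_chromosome.search(header[4])
chromosome = match.groups()[0] if match else None
print(int(header[1]), header[3], chromosome)
#281332227 AC238852.3 CH252-133A18 from chromosome 3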
Example #14
    def parseFastaDescriptionForWUSTLVervetScaffolds(self, descriptionLine=None,
        FigureOutTaxID_ins=None):
        """
        2011-7-6
            
        """
        """
        possible header lines:
        >Contig0  12652774 13406928

        """
        header = descriptionLine[1:-1]	#discard '>' and '\n'
        header = header.split()
        chromosome = header[0]	#contig name is taken as chromosome
        """
        p_chromosome = re.compile(r'Contig(\d+)')
        if p_chromosome.search(header[0]) is not None:
            chromosome = p_chromosome.search(header[0]).groups()[0]
        else:
            chromosome = None
        """
        gi = None
        acc_ver = None
        comment = None
        return PassingData(tax_id=None, gi=gi, comment=comment, acc_ver=acc_ver,
            chromosome=chromosome)
Example #15
 def parseFastaDescriptionForGenebank_hs37d5(self, 
     descriptionLine=None, FigureOutTaxID_ins=None):
     """
     >1 dna:chromosome chromosome:GRCh37:1:1:249250621:1
     >Y dna:chromosome chromosome:GRCh37:Y:2649521:59034049:1
     >MT gi|251831106|ref|NC_012920.1| Homo sapiens mitochondrion, complete genome
     >GL000207.1 dna:supercontig supercontig::GL000207.1:1:4262:1
     >GL000226.1 dna:supercontig supercontig::GL000226.1:1:15008:1
     >NC_007605
     >hs37d5
     """
     header = descriptionLine[1:-1]
     headerList = header.split()        
     chromosome = headerList[0]
     comment = ' '.join(headerList[1:])
     gi = None
     acc_ver = None
     accitem = re.compile(r'supercontig')
     if accitem.search(header) is not None:
         acc_ver = headerList[0]
     else:	  
         commentSplit = comment.split("|")
         if(len(commentSplit) > 4):
             #deal with MT
             gi = int(commentSplit[1])
             acc_ver = commentSplit[3]
             comment = commentSplit[4]
     return PassingData(tax_id=None, gi=gi, comment=comment, acc_ver=acc_ver,
         chromosome=chromosome)
Example #16
 def parseFastaDescriptionForGenBank(self, descriptionLine=None,
     FigureOutTaxID_ins=None):
     """
     possible header lines:
 
 >gi|51511461|ref|NC_000001.8|NC_000001 Homo sapiens chromosome 1, complete sequence
 >gi|186497660|ref|NC_003070.6| Arabidopsis thaliana chromosome 1, complete sequence
 >gi|26556996|ref|NC_001284.2| Arabidopsis thaliana mitochondrion, complete genome
 >gi|115442598|ref|NC_008394.1| Oryza sativa (japonica cultivar-group) genomic DNA, chromosome 1
     """
     #discard '>' and '\n'
     header = descriptionLine[1:-1]
     header = header.split('|')
     _tax_id = FigureOutTaxID_ins.returnTaxIDGivenSentence(header[4])
     
     if self.p_chromosome.search(header[4]) is not None:
         chromosome = self.p_chromosome.search(header[4]).groups()[0]
     elif header[4].find('mitochondrion')!=-1:
         chromosome = 'mitochondrion'
     elif header[4].find('chloroplast')!=-1:
         chromosome = 'chloroplast'
     else:	#something else, take the whole before ','
         chromosome = header[4].split(',')[0]
     gi = int(header[1])
     acc_ver = header[3]
     comment = header[4]
     return PassingData(tax_id=_tax_id, gi=gi, comment=comment, 
         acc_ver=acc_ver, chromosome=chromosome)
Example #17
 def reduceEachChromosome(self, chromosome=None, passingData=None,
     mapEachInputDataLs=None, 
     chromosome2mapEachIntervalDataLs=None,\
     reduceEachInputDataLs=None,\
     transferOutput=True, \
     **keywords):
     """
     """
     returnData = PassingData(no_of_jobs = 0)
     returnData.jobDataLs = []
     returnData.mapEachInputDataLs = mapEachInputDataLs
     returnData.reduceEachInputDataLs = reduceEachInputDataLs
     #reduce matrix by chosen column and average p-value
     
     outputFile = File(os.path.join(self.reduceEachChromosomeDirJob.output,
         'chr_%s_LocusLiftOverProbability.tsv.gz'%(chromosome)))
     reduceChromosomeJob = self.addStatMergeJob(
         statMergeProgram=self.mergeSameHeaderTablesIntoOne, \
         outputF=outputFile, \
         parentJobLs=[self.reduceEachChromosomeDirJob],extraOutputLs=None, \
         extraDependentInputLs=None, transferOutput=False)
         #extraArgumentList=['--keyColumnLs 0-6 --valueColumnLs 7'],\
     mapEachIntervalDataLs = chromosome2mapEachIntervalDataLs.get(chromosome)
     for mapEachIntervalData in mapEachIntervalDataLs:
         for jobData in mapEachIntervalData.jobDataLs:
             self.addInputToMergeJob(reduceChromosomeJob, parentJobLs=[jobData.job])
         
     #add the reduction job to final stat merge job
     self.addInputToMergeJob(self.reduceJob, parentJobLs=[reduceChromosomeJob])
     
     return returnData
Example #18
    def openWriteBeagleFiles(self,
                             pedigreeFamilyData=None,
                             outputFnamePrefix=None):
        """
        2013.05.02
            
        The non-likelihood (unphased, trios, pairs) Beagle format:
            I id sample1 sample1 sample2 sample2
            A diabetes 1 1 2 2
            M rs12082861 C C C C
            M rs4912233 T C C C
            M rs12732823 G A A A
            M rs17451521 C C C C
            M rs12033358 C T T T
        
        The likelihood version is
            marker alleleA alleleB 1000_709_1996093_GA_vs_524 1000_709_1996093_GA_vs_524 1000_709_1996093_GA_vs_524 1001_710_1995025_GA_vs_524 1001_710_1995025_GA_vs_524 1001_710_1995025_GA_vs_524 1002_711_2001039_GA_vs_524
            Contig791:1086 C A 0.9693 0.0307 0.0000 0.6660 0.3338 0.0003 0.0000
            Contig791:1649 G C 0.9406 0.0594 0.0000 0.9693 0.0307 0.0000 0.0000
            Contig791:4084 A C 0.9980 0.0020 0.0000 0.9844 0.0156 0.0000 0.0000
        
        The markers file has this format (markerID, position, alleleA, alleleB)
            Contig791:1086 1086 C A
        """
        sys.stderr.write(
            "Opening beagle files (outputFnamePrefix =%s) to write ..." %
            (outputFnamePrefix))
        familySize2BeagleFileHandler = {}
        familySize2SampleIDList = pedigreeFamilyData.familySize2SampleIDList
        counter = 0
        for familySize, sampleIDList in familySize2SampleIDList.items():
            if familySize not in familySize2BeagleFileHandler:
                tmpOutputFnamePrefix = '%s_familySize%s' % (outputFnamePrefix,
                                                            familySize)
                writer = MatrixFile(path='%s.bgl' % (tmpOutputFnamePrefix),
                                    mode='w',
                                    delimiter=' ')
                familySize2BeagleFileHandler[familySize] = writer
                if familySize == 1:
                    headerRow = ['marker', 'alleleA', 'alleleB']
                else:
                    headerRow = ['I', 'id']
                for sampleID in sampleIDList:
                    if familySize == 1:
                        #likelihood format has sample name replicated three times, rather than 2 times
                        headerRow.extend([sampleID] * 3)
                    else:
                        headerRow.extend([sampleID] * 2)
                writer.writeHeader(headerRow)
                counter += 1
        markersFile = MatrixFile(path='%s.markers' % (outputFnamePrefix),
                                 mode='w',
                                 delimiter=' ')

        counter += 1
        sys.stderr.write("%s files outputted.\n" % (counter))

        return PassingData(
            familySize2BeagleFileHandler=familySize2BeagleFileHandler,
            markersFile=markersFile)
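
A minimal sketch of the two row layouts described in the docstring of Example #18, formatted as plain strings; the sample IDs and likelihood triplets are taken from the docstring, and the string formatting here is only an illustration, not the MatrixFile API.

#one marker, two samples, three genotype likelihoods per sample
markerID, alleleA, alleleB = 'Contig791:1086', 'C', 'A'
sample2likelihoods = {
    '1000_709_1996093_GA_vs_524': (0.9693, 0.0307, 0.0000),
    '1001_710_1995025_GA_vs_524': (0.6660, 0.3338, 0.0003),
}

#likelihood-format .bgl row: marker, alleleA, alleleB, then three numbers per sample
bglRow = [markerID, alleleA, alleleB]
for likelihoods in sample2likelihoods.values():
    bglRow.extend('%.4f' % value for value in likelihoods)
print(' '.join(bglRow))

#.markers row: markerID, position, alleleA, alleleB
print(' '.join([markerID, markerID.split(':')[1], alleleA, alleleB]))
#Contig791:1086 1086 C A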
Example #19
    def parseQueryLocusID(self, locus_id=None):
        """
        2012.10.8
            locus_id is in the format of '%s_%s_%s_positionInFlank%s'%(chromosome, start, stop, flankingLength+1)
            output of ExtractFlankingSequenceForVCFLoci.py
        """
        search_result = ExtractFlankingSequenceForVCFLoci.sequenceTitlePattern.search(
            locus_id)
        chromosome = None
        start = None
        stop = None
        refBase = None
        altBase = None
        positionInFlank = None
        if search_result:
            chromosome = search_result.group(1)
            start = int(search_result.group(2))
            stop = int(search_result.group(3))
            refBase = search_result.group(4)
            altBase = search_result.group(5)
            positionInFlank = int(search_result.group(6))

        return PassingData(chromosome=chromosome,
                           start=start,
                           stop=stop,
                           refBase=refBase,
                           altBase=altBase,
                           positionInFlank=positionInFlank)
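
The actual sequenceTitlePattern lives in ExtractFlankingSequenceForVCFLoci and is not shown in these examples; it evidently captures refBase and altBase as well. A hypothetical pattern covering only the four-field format quoted in the docstring would look roughly like this:

import re

#hypothetical stand-in for sequenceTitlePattern; matches '%s_%s_%s_positionInFlank%s'
locusIDPattern = re.compile(r'([^_]+)_(\d+)_(\d+)_positionInFlank(\d+)')

search_result = locusIDPattern.search('Contig791_1086_1086_positionInFlank25')
if search_result:
    chromosome = search_result.group(1)
    start, stop = int(search_result.group(2)), int(search_result.group(3))
    positionInFlank = int(search_result.group(4))
    print(chromosome, start, stop, positionInFlank)
    #Contig791 1086 1086 25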
Example #20
    def reduce(self,
               passingData=None,
               reduceEachChromosomeDataLs=None,
               transferOutput=True,
               **keywords):
        """
        #. merge all output of input jobs (passingData.mapEachIntervalDataLsLs) into one big one
        
        """
        returnData = PassingData(no_of_jobs=0)
        returnData.jobDataLs = []
        reduceOutputDirJob = passingData.reduceOutputDirJob

        realInputVolume = passingData.jobData.file.noOfIndividuals * passingData.jobData.file.noOfLoci
        baseInputVolume = 200 * 20000
        walltime = self.scaleJobWalltimeOrMemoryBasedOnInput(
            realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume,
            baseJobPropertyValue=60,
            minJobPropertyValue=60,
            maxJobPropertyValue=500).value
        job_max_memory = self.scaleJobWalltimeOrMemoryBasedOnInput(
            realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume,
            baseJobPropertyValue=5000,
            minJobPropertyValue=5000,
            maxJobPropertyValue=10000).value

        outputFile = File(
            os.path.join(reduceOutputDirJob.output, 'sameSiteConcordance.tsv'))
        reduceJob = self.addStatMergeJob(
            statMergeProgram=self.mergeSameHeaderTablesIntoOne,
            outputF=outputFile,
            parentJobLs=[reduceOutputDirJob],
            transferOutput=transferOutput,
        )
        returnData.jobDataLs.append(
            PassingData(jobLs=[reduceJob],
                        file=reduceJob.output,
                        fileLs=[reduceJob.output]))

        for mapEachIntervalDataLs in passingData.mapEachIntervalDataLsLs:
            for mapEachIntervalData in mapEachIntervalDataLs:
                self.addInputToMergeJob(reduceJob, \
                        parentJobLs=[mapEachIntervalData.mapJob])

        return returnData
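
scaleJobWalltimeOrMemoryBasedOnInput() itself is not shown in these examples. Judging only from how Example #20 calls it, it appears to scale a base walltime or memory figure by the ratio of real to base input volume and clamp the result; a rough standalone sketch of that assumed behavior, under a different name:

def scalePropertyByInputVolume(realInputVolume, baseInputVolume,
        baseJobPropertyValue, minJobPropertyValue, maxJobPropertyValue):
    #assumed behavior: scale linearly with input volume, then clamp to [min, max]
    scaled = baseJobPropertyValue * float(realInputVolume) / baseInputVolume
    return int(min(maxJobPropertyValue, max(minJobPropertyValue, scaled)))

#400 individuals x 50000 loci against the 200 x 20000 baseline used above
print(scalePropertyByInputVolume(400 * 50000, 200 * 20000, 60, 60, 500))	#walltime: 300
print(scalePropertyByInputVolume(400 * 50000, 200 * 20000, 5000, 5000, 10000))	#memory: 10000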
Example #21
    def run(self):
        """
        in case chop the whole figure into blocks, swap col_block_index and
            row_block_index to make row first, column 2nd
        """
        from palos.polymorphism.SNP import read_data
        from palos.utils import figureOutDelimiter, PassingData
        delimiter = figureOutDelimiter(self.input_fname)
        print(delimiter)
        header, row_label_ls1, row_label_ls2, data_matrix = read_data(
            self.input_fname, matrix_data_type=float, delimiter='\t')
        import numpy
        data_matrix = numpy.array(data_matrix)
        min_value = numpy.min(data_matrix)
        if self.min_value_non_negative and min_value < 0:
            min_value = 0
        max_value = numpy.max(data_matrix)
        font = get_font(self.font_path, font_size=self.font_size)
        Value2Color.special_value2color[-2] = self.super_value_color
        value2color_func = lambda x: Value2Color.value2HSLcolor(
            x, min_value, max_value)
        im_legend = drawContinousLegend(min_value, max_value, self.no_of_ticks,
                                        value2color_func, font)

        fig_fname_prefix = os.path.splitext(self.fig_fname)[0]
        if self.split_legend_and_matrix:
            im_legend.save('%s_legend.png' % fig_fname_prefix)

        no_of_rows, no_of_cols = data_matrix.shape
        passParam = PassingData(
            value2color_func=value2color_func,
            im_legend=im_legend,
            font=font,
            split_legend_and_matrix=self.split_legend_and_matrix,
            no_grid=self.no_grid)

        if no_of_cols <= self.blockColUnit:
            self._drawMatrix(data_matrix, row_label_ls1, header[2:],
                             self.fig_fname, passParam)
        else:  #split into blocks
            no_of_col_blocks = no_of_cols // self.blockColUnit + 1
            no_of_row_blocks = no_of_rows // self.blockRowUnit + 1
            for i in range(no_of_col_blocks):
                col_start_index = i * self.blockColUnit
                col_end_index = (i + 1) * self.blockColUnit
                if col_start_index < no_of_cols:
                    for j in range(no_of_row_blocks):
                        row_start_index = j * self.blockRowUnit
                        row_end_index = (j + 1) * self.blockRowUnit
                        if row_start_index < no_of_rows:
                            fig_fname = '%s_%s_%s.png' % (fig_fname_prefix, j,
                                                          i)
                            #row first, column 2nd
                            self._drawMatrix(
                                data_matrix[row_start_index:row_end_index,
                                            col_start_index:col_end_index],
                                row_label_ls1[row_start_index:row_end_index],
                                header[2 + col_start_index:2 + col_end_index],
                                fig_fname, passParam)
Example #22
 def preReduce(self, passingData=None, transferOutput=True, **keywords):
     """
     setup additional mkdir folder jobs, before mapEachAlignment,
         mapEachChromosome, mapReduceOneAlignment
     """
     returnData = PassingData(no_of_jobs = 0)
     returnData.jobDataLs = []
     return returnData
Example #23
 def initiatePassingData(self, ):
     """
     this function gets called at the beginning of each fileWalker() (for each inputFname)
     """
     pdata = PassingData(x_ls = [], y_ls = [], invariantPData=self.invariantPData)
     #2012.8.16 pass to global data
     self.invariantPData.y_ls = pdata.y_ls
     self.invariantPData.x_ls = pdata.x_ls
     return pdata
Example #24
 def reduce(self, passingData=None, reduceAfterEachAlignmentDataLs=None,
         transferOutput=True, **keywords):
     """
     2012.9.17
     """
     returnData = PassingData(no_of_jobs = 0)
     returnData.jobDataLs = []
     returnData.reduceAfterEachAlignmentDataLs = reduceAfterEachAlignmentDataLs
     return returnData
Example #25
 def reduceAfterEachChromosome(self, chromosome=None, passingData=None,
     transferOutput=True,
     mapEachIntervalDataLs=None, **keywords):
     """
     """
     returnData = PassingData(no_of_jobs = 0)
     returnData.jobDataLs = []
     returnData.mapEachIntervalDataLs = mapEachIntervalDataLs
     return returnData
Example #26
 def parseRow(self, row):
     """
     """
     start, stop, length, depth = row[:4]
     start = int(start)
     stop = int(stop)
     length = int(length)
     depth = float(depth)
     return PassingData(start=start, stop=stop, length=length, depth=depth)
Example #27
 def initiatePassingData(self, ):
     """
     this function gets called at the beginning of each fileWalker() (for each inputFname).
     """
     pdata = PassingData(x_ls=[],
                         y_ls=[],
                         z_ls=[],
                         invariantPData=self.invariantPData)
     return pdata
Example #28
 def map(self, alignmentData=None, intervalData=None,\
     VCFJobData=None, passingData=None,
     mapEachChromosomeData=None, transferOutput=True, **keywords):
     """
     2012.9.17
     """
     returnData = PassingData(no_of_jobs = 0)
     returnData.jobDataLs = []
     return returnData
Example #29
 def mapEachChromosome(self, alignmentData=None, chromosome=None,\
     VCFJobData=None, passingData=None,
     reduceBeforeEachAlignmentData=None, transferOutput=True, **keywords):
     """
     2012.9.17
     """
     returnData = PassingData(no_of_jobs = 0)
     returnData.jobDataLs = []
     return returnData
Example #30
 def reduceBeforeEachAlignment(self, passingData=None,
     transferOutput=True, **keywords):
     """
     2012.9 set up some reduce jobs before the loop over all intervals of one alignment begins.
         these reduce jobs will collect stuff from each map() job.
         the link will be established in linkMapToReduce().
     """
     returnData = PassingData(no_of_jobs = 0)
     returnData.jobDataLs = []
     return returnData