Example #1
def makeFoldTable(annotFile,analysisName,testName,controlName,testMMR,controlMMR,testIdxFile,controlIdxFile,outputFolder,epsilon = 1):

    '''
    makes the fold table and writes to disk
    fold table is ranked by fold change
    first column is guideID, second column is gene name, third is fold change
    '''

    guideDict,geneDict = makeAnnotDict(annotFile)

    testIdx = utils.parseTable(testIdxFile,'\t')
    controlIdx = utils.parseTable(controlIdxFile,'\t')

    #for each guide, divide the count by the MMR, add epsilon (default 1), then take the log2 ratio

    outTable = [['GUIDE_ID','GENE','LOG2_RATIO',testName,controlName]]
    for i in range(len(testIdx)):

        guideID = testIdx[i][0]
        gene = guideDict[guideID]
        
        testCount = float(testIdx[i][2])/testMMR + epsilon
        controlCount = float(controlIdx[i][2])/controlMMR + epsilon

        log2Ratio = numpy.log2(testCount/controlCount)

        newLine = [guideID,gene,log2Ratio,round(testCount,4),round(controlCount,4)]

        outTable.append(newLine)

    outputFile = '%s%s_log2Ratio.txt' % (outputFolder,analysisName)
    utils.unParseTable(outTable,outputFile,'\t')
    return outputFile
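
Nearly every example on this page round-trips data through utils.parseTable and utils.unParseTable. Their implementation is not shown here; the following is a minimal sketch inferred from usage (read a delimited text file into a list of row lists, write one back, with an empty delimiter meaning one value per line), not the actual pipeline code:

def parseTable(tablePath, delimiter):
    '''assumed sketch: read a delimited text file into a list of rows,
    each row a list of strings'''
    with open(tablePath, 'r') as infile:
        return [line.rstrip('\n').split(delimiter) for line in infile]

def unParseTable(table, outputPath, delimiter):
    '''assumed sketch: write rows back to disk; an empty delimiter
    writes one item per line (as the expressed-gene examples below use)'''
    with open(outputPath, 'w') as outfile:
        for row in table:
            if isinstance(row, list):
                outfile.write(delimiter.join(str(x) for x in row) + '\n')
            else:
                outfile.write(str(row) + '\n')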
Example #2
def mapBams(bamFileList,splitGFFPath,analysisName,mappedFolder):

    print("MAPPING TO THE FOLLOWING BAMS:")

    for bamFile in bamFileList:
        print(bamFile)
        bamFileName = bamFile.split('/')[-1]

        # MAPPING TO THE STITCHED GFF
        mappedOut1Folder = '%s%s_%s_MAPPED' % (mappedFolder, analysisName, bamFileName)
        mappedOut1File = '%s%s_%s_MAPPED/matrix.txt' % (mappedFolder, analysisName, bamFileName)
        if utils.checkOutput(mappedOut1File, 0.2, 0.2):
            print("FOUND %s MAPPING DATA FOR BAM: %s" % (splitGFFPath, mappedOut1File))
        else:
            cmd1 = bamliquidator_path + " --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % (splitGFFPath, mappedOut1Folder, bamFile)
            print(cmd1)

            os.system(cmd1)
            if utils.checkOutput(mappedOut1File,0.2,5):
                print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (splitGFFPath, bamFileName))
            else:
                print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (splitGFFPath, bamFileName))
                sys.exit()

    print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS')
    
    #now we make a signal table
    #set up the table using the first bam
    if len(bamFileList) > 1:

        #set up the first pass at the table
        signalTable = [['REGION_ID','locusLine'] + [name.split('/')[-1] for name in bamFileList]]
        bamFileName = bamFileList[0].split('/')[-1]
        mappedTable = utils.parseTable( '%s%s_%s_MAPPED/matrix.txt' % (mappedFolder, analysisName, bamFileName),'\t')
        for i in range(1,len(mappedTable)):
            signalTable.append(mappedTable[i])

        for bamFile in bamFileList[1:]:
            bamFileName = bamFile.split('/')[-1]
            
            mappedTable = utils.parseTable( '%s%s_%s_MAPPED/matrix.txt' % (mappedFolder, analysisName, bamFileName),'\t')
            
            for i in range(1,len(mappedTable)):
                mapSignal = mappedTable[i][2]
                signalTable[i].append(mapSignal)
    else:
        bamFileName = bamFileList[0].split('/')[-1]
        signalTable = utils.parseTable( '%s%s_%s_MAPPED/matrix.txt' % (mappedFolder, analysisName, bamFileName),'\t')

    return(signalTable)
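
utils.checkOutput is used above both as a quick existence test and as a poll that waits for a long-running job to finish. A plausible sketch, assuming the two numeric arguments are a polling interval and a timeout in minutes (inferred from the call sites, not confirmed by this page):

import os
import time

def checkOutput(fileName, waitInterval, timeOut):
    '''assumed sketch: return True once fileName exists and is non-empty,
    checking every waitInterval minutes for at most timeOut minutes'''
    deadline = time.time() + timeOut * 60
    while True:
        if os.path.exists(fileName) and os.path.getsize(fileName) > 0:
            return True
        if time.time() >= deadline:
            return False
        time.sleep(waitInterval * 60)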
Example #3
def addLengths(gene_table_path, peak_table_path):
    '''
    add tss and distal lengths to a gene table
    using the peak table
    '''

    output_path = string.replace(gene_table_path, 'GENE_TABLE',
                                 'GENE_TABLE_LENGTH')

    print(output_path)

    tss_dict = defaultdict(int)
    distal_dict = defaultdict(int)

    peak_table = utils.parseTable(peak_table_path, '\t')
    for line in peak_table[1:]:
        #get the genes
        gene_list = []
        if len(line) == 15:
            gene_list += line[-1].split(',')
            gene_list += line[-2].split(',')
        elif len(line) == 14:
            gene_list += line[-1].split(',')
        else:
            continue

        gene_list = utils.uniquify(
            [gene for gene in gene_list if len(gene) > 0])

        for gene in gene_list:
            if int(line[5]) == 1:
                tss_dict[gene] += int(line[4])
            else:
                distal_dict[gene] += int(line[4])

    #now fill out the gene table
    gene_table = utils.parseTable(gene_table_path, '\t')

    output_table = [gene_table[0] + ['TSS_LENGTH', 'DISTAL_LENGTH']]

    for line in gene_table[1:]:
        gene = line[0]
        new_line = line + [tss_dict[gene], distal_dict[gene]]
        output_table.append(new_line)

    utils.unParseTable(output_table, output_path, '\t')

    return output_path
Example #4
def callMergeSupers(dataFile,superFile1,superFile2,name1,name2,mergeName,genome,parentFolder):

    '''
    this is the main run function for the script
    all of the work should occur here, but no functions should be defined here
    '''
    mergedGFFFile = '%s%s_%s_MERGED_REGIONS_-0_+0.gff' % (parentFolder,string.upper(genome),mergeName)    

    #check to make sure this hasn't been done yet
    roseOutput = "%s%s_ROSE/%s_%s_MERGED_REGIONS_-0_+0_SuperEnhancers_ENHANCER_TO_GENE.txt" % (parentFolder,name1,string.upper(genome),mergeName)

    try:
        foo = utils.parseTable(roseOutput,'\t')
        print "ROSE OUTPUT ALREADY FOUND HERE %s" % (roseOutput)
        return roseOutput
    except IOError:
        
        print "MERGING ENHANCER REGIONS FROM %s and %s" % (superFile1,superFile2)
        mergedGFF = mergeCollections(superFile1,superFile2,name1,name2,mergedGFFFile)

        #call rose on the merged regions


        roseBashFile = callRoseMerged(dataFile,mergedGFF,name1,name2,parentFolder)
        print('i can has rose bash file %s' % (roseBashFile))

        #run the bash command
        os.system('bash %s' % (roseBashFile))

        #check for and return output
        if utils.checkOutput(roseOutput,1,30):
            return roseOutput
        else:
            print "ERROR: ROSE CALL ON MERGED REGIONS FAILED"
            sys.exit()
Example #5
def makeSEDict(enhancerFile, name, superOnly=True):
    '''
    makes an attribute dict for enhancers keyed by uniqueID
    '''

    seDict = {}
    enhancerTable = utils.parseTable(enhancerFile, '\t')

    superLoci = []
    for line in enhancerTable:
        if line[0][0] == '#':
            continue
        if line[0][0] == 'R':
            header = line
            supColumn = header.index('isSuper')
            continue
        if superOnly:
            if int(line[supColumn]) == 1:

                rank = int(line[-2])
                enhancerID = name + '_' + line[0]
                seDict[enhancerID] = {'rank': rank}

        else:

            signal = float(line[6]) - float(line[7])
            rank = int(line[-2])
            enhancerID = name + '_' + line[0]
            seDict[enhancerID] = {'rank': rank}

    return seDict
Example #6
def makeSEDict(enhancerFile, name, superOnly=True):

    """
    makes an attribute dict for enhancers keyed by uniqueID
    """

    seDict = {}
    enhancerTable = utils.parseTable(enhancerFile, "\t")

    superLoci = []
    for line in enhancerTable:
        if line[0][0] == "#":
            continue
        if line[0][0] == "R":
            header = line
            supColumn = header.index("isSuper")
            continue
        if superOnly:
            if int(line[supColumn]) == 1:

                signal = float(line[6]) - float(line[7])
                rank = int(line[-2])
                enhancerID = name + "_" + line[0]
                seDict[enhancerID] = {"rank": rank, "signal": signal}

        else:

            signal = float(line[6]) - float(line[7])
            rank = int(line[-2])
            enhancerID = name + "_" + line[0]
            seDict[enhancerID] = {"rank": rank, "signal": signal}

    return seDict
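
A hypothetical usage sketch for makeSEDict, assuming the ROSE-style enhancer table layout implied by the indexing above: comment lines start with '#', the header row starts with 'REGION_ID' and contains an 'isSuper' column, columns 6 and 7 hold rankby and control signal, and the second-to-last column holds the enhancer rank. The file name and all values below are invented:

rows = [['#a comment line, skipped by the parser'],
        ['REGION_ID', 'CHROM', 'START', 'STOP', 'NUM_LOCI',
         'CONSTITUENT_SIZE', 'SIGNAL', 'CONTROL_SIGNAL', 'enhancerRank', 'isSuper'],
        ['1_peak_1', 'chr1', '1000', '6000', '3', '4000', '120.5', '10.5', '1', '1'],
        ['1_peak_2', 'chr1', '9000', '9500', '1', '500', '15.0', '5.0', '2', '0']]

path = 'example_enhancers.txt'
with open(path, 'w') as f:
    for row in rows:
        f.write('\t'.join(row) + '\n')

print(makeSEDict(path, 'TEST', superOnly=True))
# -> {'TEST_1_peak_1': {'rank': 1, 'signal': 110.0}}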
Example #7
def makeEnhancerSignalTable(mergedRegionMap,medianDict,analysisName,genome,outputFolder):

    '''
    makes a table where each row is an enhancer and each column is the log2 
    background corrected signal vs. median
    '''

    #load in the region map
    regionMap = utils.parseTable(mergedRegionMap,'\t')
    namesList = medianDict.keys()
    signalTable = [['REGION_ID','CHROM','START','STOP','NUM_LOCI','CONSTITUENT_SIZE'] + namesList]
    for line in regionMap[1:]:

        newLine = line[0:6]
        for i in range(len(namesList)):
            enhancerIndex = (i*2) + 6
            controlIndex = (i*2) + 7
            enhancerSignal = float(line[enhancerIndex]) - float(line[controlIndex])
            if enhancerSignal < 0:
                enhancerSignal = 0
            enhancerSignal = enhancerSignal/medianDict[namesList[i]]
            newLine.append(enhancerSignal)

        signalTable.append(newLine)

    outputFile = "%s%s_%s_signalTable.txt" % (outputFolder,genome,analysisName)
    print "WRITING MEDIAN NORMALIZED SIGNAL TABLE TO %s" % (outputFile)
    utils.unParseTable(signalTable,outputFile,'\t')
    return outputFile
Example #8
def collapseRegionMap(regionMapFile, name='', controlBams=False):
    '''
    takes a regionMap file and collapses signal into a single column
    also fixes any stupid start/stop sorting issues
    needs to take into account whether or not controls were used
    '''

    regionMap = utils.parseTable(regionMapFile, '\t')

    for n, line in enumerate(regionMap):

        if n == 0:
            #new header
            if len(name) == 0:
                name = 'MERGED_SIGNAL'
            regionMap[n] = line[0:6] + [name]

        else:
            newLine = list(line[0:6])
            if controlBams:
                signalLine = [float(x) for x in line[6:]]
                rankbyIndexes = range(0, len(signalLine) / 2, 1)
                controlIndexes = range(len(signalLine) / 2, len(signalLine), 1)
                metaVector = []
                for i, j in zip(rankbyIndexes, controlIndexes):
                    #min signal is 0
                    metaVector.append(max(0, signalLine[i] - signalLine[j]))
                metaSignal = numpy.mean(metaVector)
            else:
                metaSignal = numpy.mean([float(x) for x in line[6:]])
            regionMap[n] = newLine + [metaSignal]

    outputFile = string.replace(regionMapFile, 'REGION', 'META')
    utils.unParseTable(regionMap, outputFile, '\t')
    return (outputFile)
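
The pairing logic above assumes the signal columns are ordered with all rankby bams first and all control bams second, matched positionally. A tiny worked illustration with invented numbers:

#hypothetical signal columns: [rankby_1, rankby_2, control_1, control_2]
signalLine = [10.0, 4.0, 3.0, 6.0]
half = len(signalLine) // 2

#subtract each control from its paired rankby signal, flooring at zero
metaVector = [max(0, signalLine[i] - signalLine[i + half]) for i in range(half)]
print(metaVector)                     # [7.0, 0] : 10.0 - 3.0, then max(0, 4.0 - 6.0)
print(sum(metaVector) / float(half))  # 3.5, the value written to the META column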
Example #9
def makeBedCollection(bedFileList):
    '''
    takes in a list of bedFiles and makes a single huge collection
    each locus has as its ID the name of the bed file
    '''

    bedLoci = []
    print("MAKING BED COLLECTION FOR:")
    for bedFile in bedFileList:

        bedName = bedFile.split('/')[-1].split('.')[0]
        print(bedName)
        bed = utils.parseTable(bedFile, '\t')
        for line in bed:
            if len(line) >= 3:
                #check that line[0] looks like a chromosome name
                if line[0][0:3] == 'chr':
                    try:
                        coords = [int(line[1]), int(line[2])]
                        bedLocus = utils.Locus(line[0], min(coords),
                                               max(coords), '.', bedName)
                        bedLoci.append(bedLocus)

                    except ValueError:
                        pass

        print("IDENTIFIED %s BED REGIONS" % (len(bedLoci)))

    return utils.LocusCollection(bedLoci, 50)
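
A hypothetical usage sketch: the collection built above can be queried with a utils.Locus to see which bed files overlap a region (the paths and coordinates are invented):

bedCollection = makeBedCollection(['/data/beds/h3k27ac_peaks.bed',
                                   '/data/beds/atac_peaks.bed'])

#each locus in the collection carries its source bed file's name as its ID
query = utils.Locus('chr1', 1000000, 1005000, '.', 'query_region')
overlaps = bedCollection.getOverlap(query, 'both')
print(utils.uniquify([locus.ID() for locus in overlaps]))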
Example #10
def makeSEDict(enhancerFile,name,superOnly = True):

    '''
    makes an attribute dict for enhancers keyed by uniqueID
    '''

    seDict = {}
    enhancerTable = utils.parseTable(enhancerFile,'\t')

    superLoci = []
    for line in enhancerTable:
        if line[0][0] == '#':
            continue
        if line[0][0] == 'R':
            header = line
            supColumn = header.index('isSuper')
            continue
        if superOnly:
            if int(line[supColumn]) == 1:
                
                signal = float(line[6]) - float(line[7])
                rank = int(line[-2])
                enhancerID = name+'_'+line[0]
                seDict[enhancerID] = {'rank':rank,'signal':signal}

        else:

            signal = float(line[6]) - float(line[7])
            rank = int(line[-2])
            enhancerID = name+'_'+line[0]
            seDict[enhancerID] = {'rank':rank,'signal':signal}

    return seDict
Example #11
def makeBedCollection(bedFileList):
    '''
    takes in a list of bedFiles and makes a single huge collection
    each locus has as its ID the name of the bed file
    '''

    bedLoci = []
    print("MAKING BED COLLECTION FOR:")
    for bedFile in bedFileList:

        bedName = bedFile.split('/')[-1].split('.')[0]
        print(bedName)
        bed = utils.parseTable(bedFile, '\t')
        for line in bed:
            if len(line) >= 3:
                #check that line[0] looks like a chromosome name
                if line[0][0:3] == 'chr':
                    try:
                        coords = [int(line[1]),int(line[2])]
                        bedLocus = utils.Locus(line[0], min(coords), max(coords), '.', bedName)
                        bedLoci.append(bedLocus)

                    except ValueError:
                        pass

        print("IDENTIFIED %s BED REGIONS" % (len(bedLoci)))

    return utils.LocusCollection(bedLoci, 50)
Example #12
def loadGenome(genome_build,config_file = ''):

    '''
    loads annotation for a genome into a genome object
    '''

    #this nested dictionary has all of the useful information and likely will have to be
    #edited so it can be configured any time
    genome_build = string.upper(genome_build)

        
    genomeDict = {
        'HG19':{'annot_file':'%sannotation/hg19_refseq.ucsc' % (pipeline_dir),
                'genome_directory':'/storage/cylin/grail/genomes/Homo_sapiens/UCSC/hg19/Sequence/Chromosomes/',
                'tf_file':'%s/annotation/TFlist_NMid_hg19.txt' % (whereAmI),     
                'mask_file':'/storage/cylin/grail/genomes/Homo_sapiens/UCSC/hg19/Annotation/Masks/hg19_encode_blacklist.bed',
                'motif_convert':'%s/annotation/MotifDictionary.txt' % (whereAmI),
                'motif_database':'%s/annotation/VertebratePWMs.txt' % (whereAmI),
                },
        'RN6':{'annot_file':'%sannotation/rn6_refseq.ucsc' % (pipeline_dir),
                'genome_directory':'/storage/cylin/grail/genomes/Rattus_norvegicus/UCSC/rn6/Sequence/Chromosomes/',
                'tf_file':'%s/annotation/TFlist_NMid_rn6.txt' % (whereAmI),      
                'motif_convert':'%s/annotation/MotifDictionary.txt' % (whereAmI),
                'motif_database':'%s/annotation/VertebratePWMs.txt' % (whereAmI),
                },
        'MM10':{'annot_file':'%sannotation/mm10_refseq.ucsc' % (pipeline_dir),
                'genome_directory':'/storage/cylin/grail/genomes/Mus_musculus/UCSC/mm10/Sequence/Chromosomes/',
                'tf_file':'%s/annotation/TFlist_NMid_mm10.txt' % (whereAmI),
                'motif_convert':'%s/annotation/MotifDictionary.txt' % (whereAmI),
                'motif_database':'%s/annotation/VertebratePWMs.txt' % (whereAmI),
                }

        }

    #allow an optional config file to overwrite default paths
    if len(config_file) >0:
        config_table = utils.parseTable(config_file,'\t')
        for line in config_table[1:]:
            (build,field,feature_path) = line[0].split(':')
            genomeDict[string.upper(build)][string.lower(field)] = feature_path
    
    if genome_build not in genomeDict:
        print('ERROR: UNSUPPORTED GENOME BUILD %s. EXITING NOW' % (genome_build))
        sys.exit()
    else:
        print('USING BUILD %s WITH FOLLOWING FIELDS:' % (genome_build))
        print(genomeDict[genome_build])

    #now attempt to load the genome
    genome = Genome(genome_build,genomeDict[genome_build]['genome_directory'],genomeDict[genome_build]['annot_file'])

    #adding additional optional features
    genome.addFeature('tf_file',genomeDict[genome_build]['tf_file'])
    if genome_build == 'HG19':
        genome.addFeature('mask',genomeDict[genome_build]['mask_file'])
    genome.addFeature('motif_convert',genomeDict[genome_build]['motif_convert'])
    genome.addFeature('motif_database',genomeDict[genome_build]['motif_database'])

    return genome
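
The optional config file above packs build:field:path triples into the first column of each row after a header. A hypothetical override table, shown already parsed, illustrating how loadGenome splits it:

#hypothetical parsed config table; the header row is skipped by loadGenome
config_table = [
    ['build:field:path'],
    ['HG19:annot_file:/custom/annotation/hg19_refseq.ucsc'],
    ['HG19:mask_file:/custom/masks/hg19_blacklist.bed'],
]
for line in config_table[1:]:
    (build, field, feature_path) = line[0].split(':')
    print('%s: %s -> %s' % (build, field, feature_path))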
Example #13
def createExpressionDict(annotationFile, projectFolder, projectName, refseqToNameDict, expCutoff,expressionFile=''):
    '''
    input: an activity table with refseq in first column and expression or promoter
    acetylation in second column
    output: a dictionary keyed by refseq that points to activity
    '''

    print 'CREATING EXPRESSION DICTIONARY'

    if not expressionFile:
        expressionFilename = projectFolder + 'bamliquidator/matrix.txt'
    else:
        expressionFilename = expressionFile
        
    expressionTable = utils.parseTable(expressionFilename, '\t')

    expressionDictNM = {}
    expressionDictGene = {}

    for line in expressionTable[1:]:
        trid = line[0]
        geneName = refseqToNameDict[trid]
        try:
            exp = float(line[2])
        except IndexError:
            exp = float(line[1])

        # Save the expression value of each NMid in a dict, keep higher value if multiple
        if trid in expressionDictNM and exp > expressionDictNM[trid]:
            expressionDictNM[trid] = exp
        elif trid not in expressionDictNM:
            expressionDictNM[trid] = exp

        # Save the value of the expression if it's the highest for that gene
        if geneName in expressionDictGene and exp > expressionDictGene[geneName]:
            expressionDictGene[geneName] = exp
        elif geneName not in expressionDictGene:
            expressionDictGene[geneName] = exp

    cutoff = numpy.percentile(expressionDictGene.values(), expCutoff)
    print 'Expression cutoff: ' + str(cutoff)

    expressedGenes = []
    expressedNM = []

    for nmid in expressionDictNM:
        if float(expressionDictNM[nmid]) > cutoff:
            expressedGenes.append(refseqToNameDict[nmid])
            expressedNM.append(nmid)

    expressedGenes = utils.uniquify(expressedGenes)
    Genefilename = projectFolder + projectName + '_EXPRESSED_GENES.txt'
    utils.unParseTable(expressedGenes, Genefilename, '')

    expressedNM = utils.uniquify(expressedNM)
    NMfilename = projectFolder + projectName + '_EXPRESSED_NM.txt'
    utils.unParseTable(expressedNM, NMfilename, '')
      
    return expressedNM, expressionDictNM
Example #14
def scoreValley(locus, bamFile, projectName, projectFolder):
    '''
    calculate valley scores for a locus
    based on this reference:
    http://bioinformatics.oxfordjournals.org/content/26/17/2071.full
    '''

    nbins = locus.len() / 10

    #call bamliquidator on the region and store in a temp file
    os.system('bamliquidator ' + bamFile + ' ' + locus.chr() + ' ' +
              str(locus.start()) + ' ' + str(locus.end()) + ' . ' +
              str(nbins) + ' 0 > ' + projectFolder + 'tempBamliquidator_' +
              projectName + '.txt')

    x = utils.parseTable(
        projectFolder + 'tempBamliquidator_' + projectName + '.txt', '\t')
    density = [int(y[0]) for y in x]
    smoothDensity = gaussianSmooth(density, 5)

    scoreArray = []
    regionMax = max(smoothDensity)

    #Now take the smoothed reads and calculate a valley score

    for i in range(len(smoothDensity)):
        score = 0
        try:
            #clamp slice bounds at 0 so negative indices don't wrap to the right end
            leftmax = max(smoothDensity[max(0, i - 25):max(0, i - 10)])
        except ValueError:
            leftmax = 'edge'
        try:
            rightmax = max(smoothDensity[i + 10:i + 25])
        except ValueError:
            rightmax = 'edge'

        if rightmax == 'edge' and leftmax == 'edge':
            shoulderHeightMin = 0
            shoulderHeightMax = 0
        elif leftmax == 'edge':
            shoulderHeightMin = rightmax
            shoulderHeightMax = rightmax
        elif rightmax == 'edge':
            shoulderHeightMin = leftmax
            shoulderHeightMax = leftmax
        else:
            shoulderHeightMin = min(leftmax, rightmax)
            shoulderHeightMax = max(leftmax, rightmax)

        ratio = (shoulderHeightMax - float(smoothDensity[i])) / regionMax
        if ratio > 0.3:
            score = 1
        else:
            score = 0

        scoreArray.append(score)

    return scoreArray
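
gaussianSmooth is not defined on this page. A plausible sketch, assuming a Gaussian-weighted moving average that preserves vector length (which scoreValley relies on when it indexes smoothDensity bin by bin):

import numpy

def gaussianSmooth(readList, degree=5):
    '''assumed sketch: Gaussian-weighted moving average over a window of
    2*degree - 1 bins, returning a vector of the same length'''
    offsets = numpy.arange(-(degree - 1), degree)  # e.g. -4..4 for degree 5
    weights = numpy.exp(-(offsets ** 2) / (2.0 * (degree / 2.0) ** 2))
    weights /= weights.sum()
    density = numpy.array(readList, dtype=float)
    return list(numpy.convolve(density, weights, mode='same'))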
Example #15
def buildGraph(projectFolder, projectName, motifConvertFile, refseqToNameDict,
               canidateGenes):
    '''
    import the FIMO output once it's finished
    build the networkX directed graph
    '''

    motifDatabase = utils.parseTable(motifConvertFile, '\t')
    motifDatabaseDict = {}
    motifNames = [line[1] for line in motifDatabase]

    # The reverse of the other dict, from motif name to gene name
    for line in motifDatabase:
        motifDatabaseDict[line[0]] = line[1]

    fimoFile = projectFolder + 'FIMO/fimo.txt'
    fimoTable = utils.parseTable(fimoFile, '\t')

    graph = nx.DiGraph(name=projectName)
    graph.add_nodes_from(canidateGenes)

    motifDict = defaultdict(list)
    for line in fimoTable[1:]:

        source = motifDatabaseDict[line[0]]  #motifId
        # region = line[1].split('|')
        region = line[2].split('|')
        target = refseqToNameDict[
            region[0]]  #gene name corresponding to the NMid
        graph.add_edge(source, target)
        # motifDict[source].append((region[1], int(region[2]) + int(line[2]), int(region[2]) + int(line[3])))
        motifDict[source].append((region[1], int(region[2]) + int(line[3]),
                                  int(region[2]) + int(line[4])))

    utils.formatFolder(projectFolder + 'motifBED/', True)
    for gene in motifDict.keys():
        if motifDict[gene]:
            bed = []
            for loc in motifDict[gene]:
                bed.append([loc[0], loc[1], loc[2]])

            filename = projectFolder + 'motifBED/' + gene + '_' + projectName + '_motifs.bed'
            utils.unParseTable(bed, filename, '\t')

    return graph
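
A hypothetical follow-up once the graph is built: rank TFs by out-degree, i.e. how many candidate genes each one connects to. The inputs below are invented, and dict() around out_degree() keeps the sketch compatible across networkx versions:

refseqToNameDict = {'NM_002467': 'MYC'}   # invented minimal mapping
canidateGenes = ['MYC', 'MAX', 'MXD1']    # invented candidate TF list

graph = buildGraph('/projects/crc/', 'MYC_TEST',
                   '/annotation/MotifDictionary.txt',
                   refseqToNameDict, canidateGenes)

degreeDict = dict(graph.out_degree())
for tf in sorted(degreeDict, key=degreeDict.get, reverse=True)[:10]:
    print('%s targets %s candidate genes' % (tf, degreeDict[tf]))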
Example #16
def makeFoldTable(annotFile,
                  analysisName,
                  testName,
                  controlName,
                  testMMR,
                  controlMMR,
                  testIdxFile,
                  controlIdxFile,
                  outputFolder,
                  epsilon=1):
    '''
    makes the fold table and writes to disk
    fold table is ranked by fold change
    first column is guideID, second column is gene name, third is fold change
    '''

    guideDict, geneDict = makeAnnotDict(annotFile)

    testIdx = utils.parseTable(testIdxFile, '\t')
    controlIdx = utils.parseTable(controlIdxFile, '\t')

    #for each guide, divide the count by the MMR, add epsilon (default 1), then take the log2 ratio

    outTable = [['GUIDE_ID', 'GENE', 'LOG2_RATIO', testName, controlName]]
    for i in range(len(testIdx)):

        guideID = testIdx[i][0]
        gene = guideDict[guideID]

        testCount = float(testIdx[i][2]) / testMMR + epsilon
        controlCount = float(controlIdx[i][2]) / controlMMR + epsilon

        log2Ratio = numpy.log2(testCount / controlCount)

        newLine = [
            guideID, gene, log2Ratio,
            round(testCount, 4),
            round(controlCount, 4)
        ]

        outTable.append(newLine)

    outputFile = '%s%s_log2Ratio.txt' % (outputFolder, analysisName)
    utils.unParseTable(outTable, outputFile, '\t')
    return outputFile
Example #17
def makeEnhancerSignalTable(nameDict, mergedRegionMap, medianDict,
                            analysisName, genome, outputFolder):
    '''
    makes a table where each row is an enhancer and each column is the log2 
    background corrected signal vs. median
    '''

    #load in the region map
    regionMap = utils.parseTable(mergedRegionMap, '\t')
    namesList = nameDict.keys()
    namesList.sort()
    signalTable = [[
        'REGION_ID', 'CHROM', 'START', 'STOP', 'NUM_LOCI', 'CONSTITUENT_SIZE'
    ] + namesList]

    print("len of %s for namesList" % (len(namesList)))
    print(namesList)
    for line in regionMap[1:]:

        newLine = line[0:6]

        #a little tricky here to add datasets sequentially
        i = 6  #start w/ the first column w/ data
        for name in namesList:

            if nameDict[name]['background'] == True:
                enhancerIndex = int(i)
                i += 1
                controlIndex = int(i)
                i += 1
                try:
                    enhancerSignal = float(line[enhancerIndex]) - float(
                        line[controlIndex])
                except IndexError:
                    print line
                    print len(line)
                    print enhancerIndex
                    print controlIndex
                    sys.exit()

            else:
                enhancerIndex = int(i)
                i += 1
                enhancerSignal = float(line[enhancerIndex])

            if enhancerSignal < 0:
                enhancerSignal = 0
            enhancerSignal = enhancerSignal / medianDict[name]
            newLine.append(enhancerSignal)

        signalTable.append(newLine)

    outputFile = "%s%s_%s_signalTable.txt" % (outputFolder, genome,
                                              analysisName)
    print "WRITING MEDIAN NORMALIZED SIGNAL TABLE TO %s" % (outputFile)
    utils.unParseTable(signalTable, outputFile, '\t')
    return outputFile
Example #18
def makeEnhancerSignalTable(nameDict,mergedRegionMap,medianDict,analysisName,genome,outputFolder):

    '''
    makes a table where each row is an enhancer and each column is the log2 
    background corrected signal vs. median
    '''

    #load in the region map
    regionMap = utils.parseTable(mergedRegionMap,'\t')
    namesList = nameDict.keys()
    namesList.sort()
    signalTable = [['REGION_ID','CHROM','START','STOP','NUM_LOCI','CONSTITUENT_SIZE'] + namesList]

    print("len of %s for namesList" % (len(namesList)))
    print(namesList)
    for line in regionMap[1:]:

        newLine = line[0:6]
        
        
        #a little tricky here to add datasets sequentially
        i = 6 #start w/ the first column w/ data
        for name in namesList:
            
            if nameDict[name]['background'] == True:
                enhancerIndex = int(i)
                i +=1
                controlIndex = int(i)
                i +=1
                try:
                    enhancerSignal = float(line[enhancerIndex]) - float(line[controlIndex])
                except IndexError:
                    print line
                    print len(line)
                    print enhancerIndex
                    print controlIndex
                    sys.exit()
                
            else:
                enhancerIndex = int(i)
                i+=1
                enhancerSignal = float(line[enhancerIndex])

            if enhancerSignal < 0:
                enhancerSignal = 0
            enhancerSignal = enhancerSignal/medianDict[name]
            newLine.append(enhancerSignal)
                
            


        signalTable.append(newLine)

    outputFile = "%s%s_%s_signalTable.txt" % (outputFolder,genome,analysisName)
    print "WRITING MEDIAN NORMALIZED SIGNAL TABLE TO %s" % (outputFile)
    utils.unParseTable(signalTable,outputFile,'\t')
    return outputFile
Example #19
def summarizeVenn(mapped_path, group_list=['CG', 'THMYCN'], output=''):
    '''
    summarizes binary occupancy across group to make a venn diagram
    '''

    group_table = [['GFF_LINE', 'ID'] + group_list]

    mapped_table = utils.parseTable(mapped_path, '\t')

    group_cols = []
    for group in group_list:
        group_names = [
            name for name in mapped_table[0] if name.count(group) > 0
        ]
        group_cols.append(
            [mapped_table[0].index(name) for name in group_names])

    print(group_cols)
    for line in mapped_table[1:]:
        binary_vector = []  #a 1/0 vector to hold mapping by group
        for i in range(len(group_list)):
            cols = group_cols[i]
            signal = max([int(line[x]) for x in cols])
            binary_vector.append(signal)

        new_line = line[0:2] + binary_vector
        group_table.append(new_line)

    print(group_table[0:5])

    #now add up the stats
    #this part assumes only 2 groups for now otherwise gets combinatorially challenging
    #permute all possible binary combinations given the vector length
    binary_combinations = [[0], [1]]
    for i in range(len(group_list) - 1):
        new_combinations = []
        for x in binary_combinations:
            print(x)
            x1 = list(x) + [1]
            x0 = list(x) + [0]
            new_combinations.append(x1)
            new_combinations.append(x0)

        binary_combinations = list(new_combinations)

    print(binary_combinations)
    count_table = [group_list + ['count']]
    for combo in binary_combinations:
        count = len([line for line in group_table[1:] if line[2:] == combo])

        count_table.append(combo + [count])
    print(count_table)
    if len(output) > 0:
        utils.unParseTable(count_table, output, '\t')
    else:
        return count_table
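
The nested loop above enumerates every 0/1 vector of length len(group_list). For reference, itertools.product yields the same set of combinations more directly (the ordering differs, which does not affect the counts):

import itertools

group_list = ['CG', 'THMYCN']  # the default groups from the signature above
binary_combinations = [list(combo)
                       for combo in itertools.product([0, 1], repeat=len(group_list))]
print(binary_combinations)  # [[0, 0], [0, 1], [1, 0], [1, 1]]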
Example #20
def assignEnhancerRank(enhancerToGeneFile,
                       enhancerFile1,
                       enhancerFile2,
                       name1,
                       name2,
                       rankOutput=''):
    '''
    for all genes in the enhancerToGene Table, assigns the highest overlapping ranked enhancer in the other tables
    '''

    enhancerToGene = utils.parseTable(enhancerToGeneFile, '\t')

    enhancerCollection1 = makeSECollection(enhancerFile1, name1, False)
    enhancerCollection2 = makeSECollection(enhancerFile2, name2, False)

    enhancerDict1 = makeSEDict(enhancerFile1, name1, False)
    enhancerDict2 = makeSEDict(enhancerFile2, name2, False)

    #we're going to update the enhancerToGeneTable

    enhancerToGene[0] += ['%s_rank' % name1, '%s_rank' % name2]

    for i in range(1, len(enhancerToGene)):

        line = enhancerToGene[i]

        locusLine = utils.Locus(line[1], line[2], line[3], '.', line[0])

        #if the enhancer doesn't exist, its ranking is dead last on the enhancer list

        enhancer1Overlap = enhancerCollection1.getOverlap(locusLine, 'both')
        if len(enhancer1Overlap) == 0:
            enhancer1Rank = len(enhancerCollection1)
        else:

            rankList1 = [
                enhancerDict1[x.ID()]['rank'] for x in enhancer1Overlap
            ]
            enhancer1Rank = min(rankList1)

        enhancer2Overlap = enhancerCollection2.getOverlap(locusLine, 'both')
        if len(enhancer2Overlap) == 0:
            enhancer2Rank = len(enhancerCollection2)
        else:

            rankList2 = [
                enhancerDict2[x.ID()]['rank'] for x in enhancer2Overlap
            ]
            enhancer2Rank = min(rankList2)
        enhancerToGene[i] += [enhancer1Rank, enhancer2Rank]

    if len(rankOutput) == 0:
        return enhancerToGene
    else:
        utils.unParseTable(enhancerToGene, rankOutput, '\t')
Example #21
def findMotifs(canidateGenes, projectFolder, projectName, motifConvertFile,
               motifDatabaseFile):
    '''
    takes the refseq to subpeak seq dict
    returns the networkx object with all connections
    '''

    # Create a dictionary to call motif names keyed on gene names

    motifDatabase = utils.parseTable(motifConvertFile, '\t')
    motifDatabaseDict = {}
    motifNames = [line[1] for line in motifDatabase]
    for line in motifDatabase:
        motifDatabaseDict[line[1]] = []
    for line in motifDatabase:
        motifDatabaseDict[line[1]].append(line[0])
    print 'GENERATING TF NETWORK'

    # select the TF candidates that have motifs
    canidateMotifs = []
    for gene in canidateGenes:
        if gene in motifNames:
            canidateMotifs.append(gene)

    print 'Number of annotated candidate TFs that have motifs: ' + str(
        len(canidateMotifs))
    canidateMotifs = sorted(canidateMotifs)

    #canidateMotifs = ['NANOG', 'POU5F1', 'SOX2']

    bgCmd = 'fasta-get-markov -m 1 < ' + projectFolder + projectName + '_SUBPEAKS.fa > ' + projectFolder + projectName + '_bg.meme'
    subprocess.call(bgCmd, shell=True)

    utils.formatFolder(projectFolder + 'FIMO/', True)

    fimoCmd = 'fimo'
    for TF in canidateMotifs:
        print TF
        for x in motifDatabaseDict[TF]:
            fimoCmd += ' --motif ' + "'%s'" % (str(x))

    #fimoCmd += ' --thresh 1e-5'
    fimoCmd += ' -verbosity 1'  # thanks for that ;)!
    fimoCmd += ' -text'
    fimoCmd += ' -oc ' + projectFolder + 'FIMO'
    fimoCmd += ' --bgfile ' + projectFolder + projectName + '_bg.meme'
    fimoCmd += ' ' + motifDatabaseFile + ' '
    fimoCmd += projectFolder + projectName + '_SUBPEAKS.fa'
    fimoCmd += ' > ' + projectFolder + 'FIMO/fimo.txt'  ##
    print fimoCmd

    fimoOutput = subprocess.call(
        fimoCmd, shell=True)  #will wait that fimo is done to go on

    return fimoCmd
Example #22
def filterPeaks(tabixFolder,mycTablePath,outputPath,repeatList = []):

    '''                                                                                             
    auto filters the 3 repeat classes LINE, LTR, Simple_repeat                                      
    outputs a bed in the format of                                                                  
    [PEAK_ID,CHROM, START,STOP,LENGTH, LINE, LTR, Simple_repeat]                                    
    '''

    if len(repeatList) == 0:
        repeatList = ['LINE','LTR','Simple_repeat']

    repeatTable = [['PEAK_ID','CHROM','START','STOP','LENGTH'] + repeatList]

    mycTable = utils.parseTable(mycTablePath,'\t')
    ticker =0
    for line in mycTable[1:]:
        if line[0][0] =='P':
            continue

        if ticker % 100 == 0:
            print ticker
        ticker +=1
        peak_ID = line[0]
        chrom = line[1]
        start = int(line[2])
        stop = int(line[3])
        length = line[4]
        locusString = '%s:%s-%s' % (chrom,start,stop)

        repeatFractions = []
        for repeatClass in repeatList:
            tabixGFF = '%shg19_%s_category_sorted.gff.gz' % (tabixFolder,repeatClass)

            tabixCmd = 'tabix %s %s' % (tabixGFF,locusString)

            tabix = subprocess.Popen(tabixCmd,stdin = subprocess.PIPE,stderr = subprocess.PIPE,stdout = subprocess.PIPE,shell = True)

            tabixLines = tabix.stdout.readlines()

            tabixLines = [x.rstrip().split('\t') for x in tabixLines] #i think you get back essentially gff lines                                                                                      

            overlapFraction = 0.0
            for line in tabixLines:
                lineStart = int(line[3])
                lineStop = int(line[4])
                lineStart = max(start,lineStart)
                lineStop = min(stop,lineStop)
                overlapLength = lineStop - lineStart
                overlapFraction += float(overlapLength)/float(length)
            repeatFractions.append(round(overlapFraction,4))

        newLine = [peak_ID,chrom,start,stop,length] + repeatFractions
        repeatTable.append(newLine)

    utils.unParseTable(repeatTable,outputPath,'\t')
Example #23
def generateSubpeakFASTA(TFtoEnhancerDict, subpeaks, genomeDirectory,
                         projectName, projectFolder, constExtension):
    '''
    from a BED file of constituents
    generate a FASTA for the constituents contained within the candidate supers
    '''

    subpeakDict = {}
    subpeakBED = [['track name=' + projectName + ' color=204,0,204']]
    subpeakTable = utils.parseTable(subpeaks, '\t')

    subpeakLoci = [
        utils.Locus(l[0], int(l[1]), int(l[2]), '.') for l in subpeakTable
    ]
    subpeakCollection = utils.LocusCollection(subpeakLoci, 50)

    for gene in TFtoEnhancerDict.keys():
        subpeakDict[gene] = []
        for region in TFtoEnhancerDict[gene]:
            overlaps = subpeakCollection.getOverlap(region)
            extendedOverlaps = [
                utils.makeSearchLocus(x, constExtension, constExtension)
                for x in overlaps
            ]

            overlapCollectionTemp = utils.LocusCollection(extendedOverlaps, 50)
            overlapCollection = overlapCollectionTemp.stitchCollection()
            for overlap in overlapCollection.getLoci():
                subpeakBED.append(
                    [overlap.chr(),
                     overlap.start(),
                     overlap.end()])
                subpeakDict[gene].append(overlap)

    bedfilename = projectFolder + projectName + '_subpeaks.bed'
    utils.unParseTable(subpeakBED, bedfilename, '\t')

    fasta = []

    for gene in subpeakDict:
        for subpeak in subpeakDict[gene]:

            fastaTitle = gene + '|' + subpeak.chr() + '|' + str(
                subpeak.start()) + '|' + str(subpeak.end())
            fastaLine = utils.fetchSeq(genomeDirectory, subpeak.chr(),
                                       int(subpeak.start() + 1),
                                       int(subpeak.end() + 1))

            fasta.append('>' + fastaTitle)
            fasta.append(upper(fastaLine))

    outname = projectFolder + projectName + '_SUBPEAKS.fa'

    utils.unParseTable(fasta, outname, '')
Example #24
def findMotifs(subpeakFasta, bg_path, candidate_tf_list, projectFolder,
               analysis_name, motifConvertFile, motifDatabaseFile):
    '''
    takes the refseq to subpeak seq dict
    returns the networkx object with all connections
    '''
    fimoFolder = utils.formatFolder(projectFolder + 'FIMO/', True)
    subpeak_name = subpeakFasta.split('/')[-1].split('.')[0]
    output = '%s%s_fimo.txt' % (fimoFolder, subpeak_name)
    # Create a dictionary to call motif names keyed on gene names
    motifDatabase = utils.parseTable(motifConvertFile, '\t')
    motifDatabaseDict = {}  #create a dict keyed by TF with multiple motifs

    for line in motifDatabase:
        motifDatabaseDict[line[1]] = []
    for line in motifDatabase:
        motifDatabaseDict[line[1]].append(line[0])

    candidate_tf_list.sort()

    print(candidate_tf_list)

    #now make a list of all motifs
    motif_list = []
    for tf in candidate_tf_list:
        motif_list += motifDatabaseDict[tf]

    motif_list = utils.uniquify(motif_list)

    fimo_bash_path = '%s%s_fimo.sh' % (fimoFolder, analysis_name)
    fimo_bash = open(fimo_bash_path, 'w')
    fimo_bash.write('#!/usr/bin/bash\n\n')

    fimoCmd = 'fimo'
    for motif in motif_list:
        fimoCmd += ' --motif ' + "'%s'" % (str(motif))

    #fimoCmd += ' --thresh 1e-5' #if you want to increase stringency
    fimoCmd += ' -verbosity 1'  # thanks for that ;)!
    fimoCmd += ' -text'
    fimoCmd += ' -oc ' + projectFolder + 'FIMO'
    fimoCmd += ' --bgfile %s' % (bg_path)
    fimoCmd += ' ' + motifDatabaseFile + ' '
    fimoCmd += subpeakFasta
    fimoCmd += ' > ' + output
    print fimoCmd
    fimo_bash.write(fimoCmd)
    fimo_bash.close()

    fimoOutput = subprocess.call(
        fimoCmd, shell=True)  #will wait that fimo is done to go on

    return output
Example #25
def getMedianSignalEnhancer(enhancerFile, name, dataFile):
    '''
    returns the median enhancer signal of a file
    '''
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    enhancerTable = utils.parseTable(enhancerFile, '\t')

    enhancerVector = [float(line[6]) for line in enhancerTable[6:]]

    median = numpy.median(enhancerVector)

    return median
Example #26
def findMotifs(subpeakFasta,bg_path,candidate_tf_list, projectFolder, analysis_name, motifConvertFile, motifDatabaseFile):
    '''
    takes the refseq to subpeak seq dict
    returns the networkx object with all connections
    '''
    fimoFolder = utils.formatFolder(projectFolder + 'FIMO/', True)
    subpeak_name = subpeakFasta.split('/')[-1].split('.')[0]
    output = '%s%s_fimo.txt'  % (fimoFolder,subpeak_name)
    # Create a dictionary to call motif names keyed on gene names
    motifDatabase = utils.parseTable(motifConvertFile, '\t')
    motifDatabaseDict = {} #create a dict keyed by TF with multiple motifs

    for line in motifDatabase:
        motifDatabaseDict[line[1]] = []
    for line in motifDatabase:
        motifDatabaseDict[line[1]].append(line[0])

    candidate_tf_list.sort()
    
    print(candidate_tf_list)

    #now make a list of all motifs
    motif_list = []
    for tf in candidate_tf_list:
        motif_list += motifDatabaseDict[tf]

    motif_list = utils.uniquify(motif_list)

    fimo_bash_path = '%s%s_fimo.sh' % (fimoFolder,analysis_name)
    fimo_bash = open(fimo_bash_path,'w')
    fimo_bash.write('#!/usr/bin/bash\n\n')

    fimoCmd = 'fimo'
    for motif in motif_list:
        fimoCmd += ' --motif ' + "'%s'" % (str(motif))


    #fimoCmd += ' --thresh 1e-5' #if you want to increase stringency
    fimoCmd += ' -verbosity 1'  # thanks for that ;)!
    fimoCmd += ' -text'
    fimoCmd += ' -oc ' + projectFolder + 'FIMO'
    fimoCmd += ' --bgfile %s' % (bg_path)
    fimoCmd += ' ' + motifDatabaseFile + ' '
    fimoCmd += subpeakFasta
    fimoCmd += ' > '+ output
    print fimoCmd
    fimo_bash.write(fimoCmd)
    fimo_bash.close()

    fimoOutput = subprocess.call(fimoCmd, shell=True)  #will wait that fimo is done to go on

    return output
Example #27
def averagingMappedSignal(mapped_list, output_path, setName):
    '''
    averages signal across a set of mapped gffs and writes the new output
    '''

    #create a list containing all of the tables
    table_list = [
        utils.parseTable(mapped_list[i], '\t') for i in range(len(mapped_list))
    ]

    #first set up the output header
    output_header = ['GENE_ID', 'locusLine']
    nCols = len(table_list[0][0]) - 2
    for n in range(nCols):
        output_header.append('bin_%s_%s' % (n + 1, setName))

    output_table = [output_header]
    #now iterate through each row to set up the gene ID and locus line
    for i in range(1, len(table_list[0])):

        line = table_list[0][i]
        if len(line) > 2:
            output_table.append(line[0:2])

    #now run through the whole matrix in i,j notation and put average signal into the final matrix

    #iterate through rows
    row_ticker = 1
    for i in range(1, len(table_list[0])):
        line = table_list[0][i]
        if len(line) == 2:
            continue
        signal_vector = []
        #iterate through columns
        for j in range(2, len(table_list[0][0])):
            try:
                signal_vector = [float(table[i][j]) for table in table_list]
            except IndexError:
                print(i, j)
                print(table_list[0][i])
                print(table_list[1][i])

            signal = max(round(numpy.average(signal_vector), 4), 0)

            output_table[row_ticker].append(signal)
        row_ticker += 1

    print(len(table_list[0]))
    print(len(output_table))
    utils.unParseTable(output_table, output_path, '\t')
    return output_path
Example #28
def make_probe_to_gene_dict(annotFile, array_1_path, array_2_path):
    '''
    keyed by probe ID w/ gene as value
    '''
    #see if it already exists
    pickle_path = '%soberthuer_outcome/probe_dict.pkl' % (projectFolder)
    if utils.checkOutput(pickle_path, 0, 0):
        print('loading previously made probe dict at %s' % (pickle_path))
        probe_gene_dict = pickle.load(open(pickle_path, "rb"))
        return probe_gene_dict

    #we want to intersect refseq common names w/ the array
    startDict = utils.makeStartDict(annotFile)

    ref_name_list = utils.uniquify(
        [startDict[refID]['name'] for refID in startDict.keys()])
    probe_gene_dict = {}

    array_1 = utils.parseTable(array_1_path, '\t')
    array_2 = utils.parseTable(array_2_path, '\t')
    ticker = 0
    for line in array_1 + array_2:
        if len(line) < 5:
            continue
        ticker += 1
        probe_id = line[4]
        name = line[-1]
        # print(probe_id)
        # print(name)
        # if ticker== 10:
        #     sys.exit()
        # print(line)

        if ref_name_list.count(name) > 0:
            probe_gene_dict[probe_id] = name

    pickle.dump(probe_gene_dict, open(pickle_path, 'wb'))
    return probe_gene_dict
Example #29
def makeSignalDict(mappedGFFFile, controlMappedGFFFile=''):
    '''
    makes a signal dict
    '''
    print('\t called makeSignalDict on %s (ctrl: %s)' % (mappedGFFFile, controlMappedGFFFile))
    signalDict = defaultdict(float)

    mappedGFF = utils.parseTable(mappedGFFFile, '\t')
    if len(controlMappedGFFFile) > 0:
        controlGFF = utils.parseTable(controlMappedGFFFile, '\t')

        for i in range(1, len(mappedGFF)):

            signal = float(mappedGFF[i][2]) - float(controlGFF[i][2])
            if signal < 0:
                signal = 0.0
            signalDict[mappedGFF[i][0]] = signal
    else:
        for i in range(1, len(mappedGFF)):
            signal = float(mappedGFF[i][2])
            signalDict[mappedGFF[i][0]] = signal

    return signalDict
Example #30
def getSignalVector(regionFile, name, dataFile):
    '''
    returns the median enhancer signal of a file
    '''
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    regionTable = utils.parseTable(regionFile, '\t')

    bamPath = dataDict[name]['bam']
    bamName = bamPath.split('/')[-1]

    colID = regionTable[0].index(bamName)
    signalVector = [float(line[colID]) for line in regionTable[1:]]

    return signalVector
Example #31
def getMedianSignalEnhancer(enhancerFile,name,dataFile):

    '''
    returns the median enhancer signal of a file
    '''
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    enhancerTable = utils.parseTable(enhancerFile,'\t')

    enhancerVector = [float(line[6]) for line in enhancerTable[6:]]
            

    median= numpy.median(enhancerVector)

    return median
Example #32
def assignEnhancerRank(enhancerToGeneFile,enhancerFile1,enhancerFile2,name1,name2,rankOutput=''):

    '''
    for all genes in the enhancerToGene Table, assigns the highest overlapping ranked enhancer in the other tables
    '''

    enhancerToGene = utils.parseTable(enhancerToGeneFile,'\t')

    enhancerCollection1 = makeSECollection(enhancerFile1,name1,False)
    enhancerCollection2 = makeSECollection(enhancerFile2,name2,False)

    enhancerDict1 = makeSEDict(enhancerFile1,name1,False)
    enhancerDict2 = makeSEDict(enhancerFile2,name2,False)

    
    #we're going to update the enhancerToGeneTable

    enhancerToGene[0] += ['%s_rank' % name1,'%s_rank' % name2]
    
    for i in range(1,len(enhancerToGene)):

        line = enhancerToGene[i]
        
        locusLine = utils.Locus(line[1],line[2],line[3],'.',line[0])
        
        #if the enhancer doesn't exist, its ranking is dead last on the enhancer list

        enhancer1Overlap = enhancerCollection1.getOverlap(locusLine,'both')
        if len(enhancer1Overlap) == 0:
            enhancer1Rank = len(enhancerCollection1)
        else:
            
            rankList1 = [enhancerDict1[x.ID()]['rank'] for x in enhancer1Overlap]
            enhancer1Rank = min(rankList1)


        enhancer2Overlap = enhancerCollection2.getOverlap(locusLine,'both')
        if len(enhancer2Overlap) == 0:
            enhancer2Rank = len(enhancerCollection2)
        else:
            
            rankList2 = [enhancerDict2[x.ID()]['rank'] for x in enhancer2Overlap]
            enhancer2Rank = min(rankList2)
        enhancerToGene[i]+=[enhancer1Rank,enhancer2Rank]


    if len(rankOutput) == 0:
        return enhancerToGene
    else:
        utils.unParseTable(enhancerToGene,rankOutput,'\t')
Example #33
def makePeakGFFs(peak_path_list):

    '''
    makes a stitched gff for all MYC bound TSS and Distal regions across all datasets
    '''

    #setting the output
    tss_gff_path = '%sHG19_MYC_TSS_REGIONS_-0_+0.gff' % (gffFolder)
    distal_gff_path = '%sHG19_MYC_DISTAL_REGIONS_-0_+0.gff' % (gffFolder)

    #check to see if already done
    if utils.checkOutput(tss_gff_path,0.1,0.1) and utils.checkOutput(distal_gff_path,0.1,0.1):
        print('OUTPUT FOUND AT %s and %s' % (tss_gff_path,distal_gff_path))
        return tss_gff_path,distal_gff_path

    #empty loci lists to hold everything
    tss_loci = []
    distal_loci = []

    
    for peak_path in peak_path_list:
        print('processing %s' % (peak_path))

        peak_table=  utils.parseTable(peak_path,'\t')

        for line in peak_table[1:]:
            peak_locus = utils.Locus(line[1],line[2],line[3],'.')
            if int(line[5]) == 0:
                distal_loci.append(peak_locus)
            else:
                tss_loci.append(peak_locus)

    #now combine the loci
    print('stitching loci')
    distal_collection = utils.LocusCollection(distal_loci,50)
    tss_collection = utils.LocusCollection(tss_loci,50)

    stitched_distal_collection = distal_collection.stitchCollection()
    stitched_tss_collection = tss_collection.stitchCollection()

    #now make the gffs
    distal_gff = utils.locusCollectionToGFF(stitched_distal_collection)
    tss_gff = utils.locusCollectionToGFF(stitched_tss_collection)

    #now write to disk
    utils.unParseTable(distal_gff,distal_gff_path,'\t')
    utils.unParseTable(tss_gff,tss_gff_path,'\t')
    
    return tss_gff_path,distal_gff_path
Example #34
def callMergeSupers(dataFile, superFile1, superFile2, name1, name2, mergeName,
                    genome, parentFolder):
    '''
    this is the main run function for the script
    all of the work should occur here, but no functions should be defined here
    '''
    mergedGFFFile = '%s%s_%s_MERGED_REGIONS_-0_+0.gff' % (
        parentFolder, string.upper(genome), mergeName)

    #check to make sure this hasn't been done yet
    roseOutput = "%s%s_ROSE/%s_%s_MERGED_REGIONS_-0_+0_SuperEnhancers_ENHANCER_TO_GENE.txt" % (
        parentFolder, name1, string.upper(genome), mergeName)

    try:
        foo = utils.parseTable(roseOutput, '\t')
        print "ROSE OUTPUT ALREADY FOUND HERE %s" % (roseOutput)
        return roseOutput
    except IOError:

        print "MERGING ENHANCER REGIONS FROM %s and %s" % (superFile1,
                                                           superFile2)
        mergedGFF = mergeCollections(superFile1, superFile2, name1, name2,
                                     mergedGFFFile)

        #call rose on the merged regions

        roseBashFile = callRoseMerged(dataFile, mergedGFF, name1, name2,
                                      parentFolder)
        print('i can has rose bash file %s' % (roseBashFile))

        #run the bash command
        os.system('bash %s' % (roseBashFile))

        #check for and return output
        if utils.checkOutput(roseOutput, 1, 10):
            return roseOutput
        else:
            #try finding it w/ a different name
            #this will bug out if nothing is there
            roseFolder = "%s%s_ROSE/" % (parentFolder, name1)
            roseFileList = [x for x in os.listdir(roseFolder)
                            if x[0] != '.']  #no hidden files
            if len(roseFileList) == 0:
                print "No files found in %s" % (roseFolder)
                sys.exit()

            enhancerToGeneFile = getFile(
                '_SuperEnhancers_ENHANCER_TO_GENE.txt', roseFileList,
                roseFolder)
Example #35
def getSignalVector(regionFile,name,dataFile):

    '''
    returns the median enhancer signal of a file
    '''
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    regionTable = utils.parseTable(regionFile,'\t')

    bamPath = dataDict[name]['bam']
    bamName = bamPath.split('/')[-1]

    colID = regionTable[0].index(bamName)
    signalVector = [float(line[colID]) for line in regionTable[1:]]
            
    return signalVector
Example #36
def makeAnnotDict(annotFile):
    '''
    makes a dictionary keyed by guideID
    '''

    guideDict = defaultdict(str)
    geneDict = defaultdict(list)

    geckoAnnot = utils.parseTable(annotFile, '\t')

    for line in geckoAnnot[1:]:
        guideDict[line[1]] = line[0]
        geneDict[line[0]].append(line[1])

    return guideDict, geneDict
Example #37
def makeAnnotDict(annotFile):

    '''
    makes a dictionary keyed by guideID
    '''

    guideDict = defaultdict(str)
    geneDict = defaultdict(list)

    geckoAnnot = utils.parseTable(annotFile,'\t')
    
    for line in geckoAnnot[1:]:
        guideDict[line[1]] = line[0]
        geneDict[line[0]].append(line[1])

    return guideDict,geneDict
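
A hypothetical usage sketch for makeAnnotDict, assuming a GeCKO annotation table whose first column is the gene symbol and whose second column is the guide ID, as the indexing above implies (the path and keys are invented):

guideDict, geneDict = makeAnnotDict('/annotation/gecko_v2_annot.txt')

print(guideDict['s_10007'])  # the gene symbol targeted by guide s_10007
print(geneDict['MYC'])       # every guide ID annotated to MYC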
Example #38
def writeSplitBeds(bed, analysisName, outputFolder, window=50, centered=False):

    dmrList = [x for x in utils.parseTable(bed, '\t')]
    #    print len(refGenes)

    dmrBed = []
    endsBed = []
    startsBed = []
    centeredBed = []

    if centered == False:

        for line in dmrList:
            dmrID = line[3]

            dmrCoords = [line[0], int(line[1]), int(line[2]), dmrID]

            dmrBed.append(dmrCoords)
            startExtend = [line[0], int(line[1]) - window, int(line[1]), dmrID]
            endExtend = [line[0], int(line[2]), int(line[2]) + window, dmrID]
            endsBed.append(endExtend)
            startsBed.append(startExtend)

        print len(dmrBed)
        utils.unParseTable(dmrBed,
                           outputFolder + analysisName + '_BODY_-0_+0.bed',
                           '\t')
        print len(startsBed)
        utils.unParseTable(
            startsBed, outputFolder + analysisName + '_UPSTREAM_-' +
            str(window) + '_+' + str(window) + '.bed', '\t')
        print len(endsBed)
        utils.unParseTable(
            endsBed, outputFolder + analysisName + '_DOWNSTREAM_-' +
            str(window) + '_+' + str(window) + '.bed', '\t')

    elif centered == True:
        for line in dmrList:
            dmrID = line[3]

            center = (int(line[1]) + int(line[2])) / 2
            centeredBed.append(
                [line[0], center - window, center + window, dmrID])

        utils.unParseTable(
            centeredBed, outputFolder + analysisName + '_CENTERED_-' +
            str(window) + '_+' + str(window) + '.bed', '\t')
Example #39
def make_mycn_regions(conserved_rank_path):

    '''
    takes conserved NB MYCN regions 
    then creates a bed and gff of regions
    '''

    conserved_rank_table = utils.parseTable(conserved_rank_path,'\t')
    mycn_gff = []
    mycn_flank_gff = []
    mycn_bed = []
    mycn_flank_bed = []

    for line in conserved_rank_table[1:]:
        locus_line = utils.Locus(line[1],line[2],line[3],'.')
        
        if int(line[3]) < int(line[2]):
            print('uh oh')
            print(line)
        gff_line = [line[1],line[0],'',line[2],line[3],'','.','',line[0]]
        bed_line = [line[1],line[2],line[3],line[0]]
        mycn_gff.append(gff_line)
        mycn_bed.append(bed_line)

        gff_flank_line = [line[1],line[0],'',int(line[2])-500,int(line[3])+500,'','.','',line[0]]
        bed_flank_line = [line[1],int(line[2])-500,int(line[3])+500,line[0]]
        mycn_flank_gff.append(gff_flank_line)
        mycn_flank_bed.append(bed_flank_line)
        
    mycn_gff_path = '%sHG19_NB_MYCN_CONSERVED_-0_+0.gff' % (gffFolder)
    mycn_flank_gff_path = '%sHG19_NB_MYCN_CONSERVED_-500_+500.gff' % (gffFolder)

    mycn_bed_path = '%sHG19_NB_MYCN_CONSERVED_-0_+0.bed' % (bedFolder)
    mycn_flank_bed_path = '%sHG19_NB_MYCN_CONSERVED_-500_+500.bed' % (bedFolder)

    #writing to disk
    utils.unParseTable(mycn_gff,mycn_gff_path,'\t')
    utils.unParseTable(mycn_flank_gff,mycn_flank_gff_path,'\t')

    utils.unParseTable(mycn_bed,mycn_bed_path,'\t')
    utils.unParseTable(mycn_flank_bed,mycn_flank_bed_path,'\t')

    print(mycn_gff_path)
    print(mycn_flank_gff_path)
    print(mycn_bed_path)
    print(mycn_flank_bed_path)
    return mycn_gff_path,mycn_flank_gff_path
Example #40
def callMergeSupers(dataFile, superFile1, superFile2, name1, name2, mergeName, genome, parentFolder):

    """
    this is the main run function for the script
    all of the work should occur here, but no functions should be defined here
    """
    mergedGFFFile = "%s%s_%s_MERGED_REGIONS_-0_+0.gff" % (parentFolder, string.upper(genome), mergeName)

    # check to make sure this hasn't been done yet
    roseOutput = "%s%s_ROSE/%s_%s_MERGED_REGIONS_-0_+0_SuperEnhancers_ENHANCER_TO_GENE.txt" % (
        parentFolder,
        name1,
        string.upper(genome),
        mergeName,
    )

    try:
        foo = utils.parseTable(roseOutput, "\t")
        print "ROSE OUTPUT ALREADY FOUND HERE %s" % (roseOutput)
        return roseOutput
    except IOError:

        print "MERGING ENHANCER REGIONS FROM %s and %s" % (superFile1, superFile2)
        mergedGFF = mergeCollections(superFile1, superFile2, name1, name2, mergedGFFFile)

        # call ROSE on the merged regions

        roseBashFile = callRoseMerged(dataFile, mergedGFF, name1, name2, parentFolder)
        print ("i can has rose bash file %s" % (roseBashFile))

        # run the bash command
        os.system("bash %s" % (roseBashFile))

        # check for and return output
        if utils.checkOutput(roseOutput, 1, 10):
            return roseOutput
        else:
            # try finding it w/ a different name
            # this will bug out if nothing is there
            roseFolder = "%s%s_ROSE/" % (parentFolder, name1)
            roseFileList = [x for x in os.listdir(roseFolder) if x[0] != "."]  # no hidden files
            if len(roseFileList) == 0:
                print "No files found in %s" % (roseFolder)
                sys.exit()

            enhancerToGeneFile = getFile("_SuperEnhancers_ENHANCER_TO_GENE.txt", roseFileList, roseFolder)
            # assumption: the located enhancer-to-gene file is the intended output, so return its path
            return "%s%s" % (roseFolder, enhancerToGeneFile)
Example #42
0
def getMedianSignal(enhancerFile, name, dataFile):

    """
    returns the median enhancer signal of a file
    """
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    enhancerTable = utils.parseTable(enhancerFile, "\t")

    backgroundName = dataDict[name]["background"]
    if backgroundName in dataDict:
        enhancerVector = [float(line[6]) - float(line[7]) for line in enhancerTable[6:]]
    else:
        enhancerVector = [float(line[6]) for line in enhancerTable[6:]]

    median = numpy.median(enhancerVector)

    return median
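A usage sketch (paths and dataset name are hypothetical); the [6:] slice assumes the standard six-line header of a ROSE AllEnhancers table:

medianSignal = getMedianSignal(
    '/projects/rose/MYSAMPLE_AllEnhancers.table.txt',  # hypothetical ROSE output
    'MYSAMPLE',                                        # dataset name in the data table
    '/projects/tables/data_table.txt')                 # hypothetical pipeline data table
print(medianSignal)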
Example #43
0
def generateSubpeakFASTA(TFandSuperDict, subpeaks, genomeDirectory, projectName, projectFolder, motifExtension):
    '''
    takes as input a BED file of constituents
    outputs a FASTA file of merged, extended super-enhancer constituents with formatted names
    '''

    print('MAKE FASTA')

    subpeakDict = {}
    subpeakBED = [['track name=' + projectName + ' color=204,0,204']]
    subpeakTable = utils.parseTable(subpeaks, '\t')

    subpeakLoci = [utils.Locus(l[0], int(l[1]), int(l[2]), '.') for l in subpeakTable]
    subpeakCollection = utils.LocusCollection(subpeakLoci, 50)

    for gene in TFandSuperDict.keys():
        subpeakDict[gene] = []
        for region in TFandSuperDict[gene]:
            overlaps = subpeakCollection.getOverlap(region)
            extendedOverlaps = [utils.makeSearchLocus(x, motifExtension, motifExtension) for x in overlaps]

            overlapCollectionTemp = utils.LocusCollection(extendedOverlaps, 50)
            overlapCollection = overlapCollectionTemp.stitchCollection()
            for overlap in overlapCollection.getLoci():
                subpeakBED.append([overlap.chr(), overlap.start(), overlap.end()])
                subpeakDict[gene].append(overlap)

    bedfilename = projectFolder + projectName + '_subpeaks.bed'
    utils.unParseTable(subpeakBED, bedfilename, '\t')

    fasta = []

    for gene in subpeakDict:
        for subpeak in subpeakDict[gene]:

            fastaTitle = gene + '|'  + subpeak.chr() + '|' + str(subpeak.start()) + '|' + str(subpeak.end())
            fastaLine = utils.fetchSeq(genomeDirectory, subpeak.chr(), int(subpeak.start()+1), int(subpeak.end()+1))

            fasta.append('>' + fastaTitle)
            fasta.append(upper(fastaLine))

    # Output the fasta file of extended SE constituents
    outname = projectFolder + projectName + '_SUBPEAKS.fa'

    utils.unParseTable(fasta, outname, '')
Example #44
0
def makeBedCollection(bedFileList):
    '''
    takes in a list of bedFiles and makes a single huge collection
    each locus has as its ID the name of the bed file
    '''

    bedLoci = []
    print("MAKING BED COLLECTION FOR:")
    for bedFile in bedFileList:

        bedName = bedFile.split('/')[-1].split('.')[0]
        print(bedName)
        bed = utils.parseTable(bedFile, '\t')
        for line in bed:
            bedLocus = utils.Locus(line[0], line[1], line[2], '.', bedName)
            bedLoci.append(bedLocus)

    return utils.LocusCollection(bedLoci, 50)
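A short usage sketch (paths are hypothetical); because every locus carries the name of its source BED file as its ID, an overlap query reports which files hit a region:

bedCollection = makeBedCollection(['/data/beds/tf1_peaks.bed', '/data/beds/tf2_peaks.bed'])

query = utils.Locus('chr8', 128745000, 128755000, '.')
hits = bedCollection.getOverlap(query, 'both')
print(utils.uniquify([locus.ID() for locus in hits]))  # source BED names overlapping the query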
Example #45
0
def getMedianSignal(enhancerFile, name, dataFile):
    '''
    returns the median enhancer signal of a file
    '''
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    enhancerTable = utils.parseTable(enhancerFile, '\t')

    backgroundName = dataDict[name]['background']
    if backgroundName in dataDict:
        enhancerVector = [
            float(line[6]) - float(line[7]) for line in enhancerTable[6:]
        ]
    else:
        enhancerVector = [float(line[6]) for line in enhancerTable[6:]]

    median = numpy.median(enhancerVector)

    return median
Example #46
0
def getExpanded(locusTable, expansion, status, output):

    loci = utils.parseTable(locusTable, '\t')
    expandedList = []
    for line in loci:
        wtLocus = line[0:4]
        mutLocus = line[4:8]
        if status == 'WT':
            newLine = expansionStat(wtLocus, mutLocus, expansion=expansion)
            if len(newLine) > 0:
                expandedList.append(newLine)
        elif status == 'MUT':
            newLine = expansionStat(mutLocus, wtLocus, expansion=expansion)
            if len(newLine) > 0:
                expandedList.append(newLine)

    print('%s expanded loci in %s' % (len(expandedList), status))
    utils.unParseTable(expandedList, output, '\t')
Example #47
0
def findMotifs(candidateGenes, projectFolder, projectName, motifConvertFile, motifDatabaseFile):
    '''Run the motif search on the extended SE constituents with FIMO
    '''

    print('MOTIF SEARCH')

    # Create a dictionary of motifs keyed on each TF
    motifDatabase = utils.parseTable(motifConvertFile, '\t')
    motifDatabaseDict = {}
    motifNames = [line[1] for line in motifDatabase]
    for line in motifDatabase:
        motifDatabaseDict[line[1]] = []
    for line in motifDatabase:
        motifDatabaseDict[line[1]].append(line[0])

    candidateMotifs = []
    for gene in candidateGenes:
        if gene in motifNames:
            candidateMotifs.append(gene)

    print('Number of annotated candidate TFs that have motifs: ' + str(len(candidateMotifs)))
    candidateMotifs = sorted(candidateMotifs)

    # Create a background sequence file to use with FIMO
    bgCmd = 'fasta-get-markov -m 1 < ' + projectFolder + projectName + '_SUBPEAKS.fa > ' + projectFolder + projectName + '_bg.meme'
    call(bgCmd, shell=True)

    # Run the motif search with FIMO
    fimoCmd = 'fimo'
    for motif in candidateMotifs:
        for x in motifDatabaseDict[motif]:
            fimoCmd += ' --motif ' + "'%s'" % (str(x))
    fimoCmd += ' -verbosity 1'
    fimoCmd += ' -text'
    fimoCmd += ' -oc ' + projectFolder
    fimoCmd += ' --bgfile ' + projectFolder + projectName + '_bg.meme'
    fimoCmd += ' ' + motifDatabaseFile + ' '
    fimoCmd += projectFolder + projectName + '_SUBPEAKS.fa'
    fimoCmd += ' > '+ projectFolder + 'fimo.txt'
    print(fimoCmd)

    fimoOutput = call(fimoCmd, shell=True)

    return fimoCmd
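A usage sketch (all paths are hypothetical); the function assumes fasta-get-markov and fimo from the MEME suite are on the PATH, and that projectFolder already contains projectName_SUBPEAKS.fa:

candidateGenes = ['MYCN', 'GATA3', 'PHOX2B']        # hypothetical candidate TFs
fimoCmd = findMotifs(candidateGenes,
                     '/projects/crc/',                   # projectFolder, must end with '/'
                     'NB_CRC',                           # projectName
                     '/annotation/MotifDictionary.txt',  # hypothetical TF-to-motif table
                     '/annotation/VertebratePWMs.meme')  # hypothetical MEME-format motif database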
Example #48
0
def filterGFF(gffFile,chromList):

    '''
    takes in a gff and filters out all lines that don't belong to a chrom in the chromList
    '''
    gff = utils.parseTable(gffFile,'\t')
    filteredGFF = []
    excludeList=[]
    for line in gff:
        if line[0] in chromList:
            filteredGFF.append(line)
        else:
            excludeList.append(line[0])

    excludeList = utils.uniquify(excludeList)
    if len(excludeList) > 0:
        print("EXCLUDED GFF REGIONS FROM THE FALLING CHROMS: %s" % (','.join(excludeList)))

    return filteredGFF
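A short usage sketch (paths are hypothetical); restricting a GFF to the chromosomes actually present in the BAMs prevents downstream mapping errors:

chromList = ['chr1', 'chr2', 'chr3']  # e.g. from getBamChromList(bamFileList)
filteredGFF = filterGFF('/data/gff/HG19_regions.gff', chromList)
utils.unParseTable(filteredGFF, '/data/gff/HG19_regions_filtered.gff', '\t')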
Example #49
0
def makeSECollection(enhancerFile,name,superOnly = True):
    '''
    returns a locus collection from a super table
    if superOnly is True, only super-enhancer rows are kept
    '''
    enhancerTable = utils.parseTable(enhancerFile,'\t')
    enhancerLoci = []


    for line in enhancerTable:
        if line[0][0] == '#' or line[0][0] == 'R':
            continue
        else:

            if superOnly and int(line[-1]) == 0:
                break
            enhancerLoci.append(utils.Locus(line[1],line[2],line[3],'.',name+'_'+line[0]))

    return utils.LocusCollection(enhancerLoci,50)
Example #50
0
def makeMedianDict(nameDict):

    '''
    for each dataset returns the median background subtracted enhancer signal
    '''

    medianDict = {}

    for name in nameDict:

        #open up the allenhancerTable
        enhancerTable = utils.parseTable(nameDict[name]['enhancerFile'],'\t')

        #assume header ends after line 5
        enhancerVector = [float(line[6]) - float(line[7]) for line in enhancerTable[6:]]

        medianDict[name] = numpy.median(enhancerVector)

    return medianDict
Example #51
0
def makeSECollection(enhancerFile, name, top=0):
    """
    returns a locus collection from a super table
    top gives the number of rows
    """
    enhancerTable = utils.parseTable(enhancerFile, "\t")
    superLoci = []

    ticker = 0
    for line in enhancerTable:
        if line[0][0] == "#" or line[0][0] == "R":
            continue
        else:
            ticker += 1

            superLoci.append(utils.Locus(line[1], line[2], line[3], ".", name + "_" + line[0]))

            if ticker == top:
                break
    return utils.LocusCollection(superLoci, 50)
Example #52
0
def makeSECollection(enhancerFile,name,top=0):
    '''
    returns a locus collection from a super table
    top gives the number of rows
    '''
    enhancerTable = utils.parseTable(enhancerFile,'\t')
    superLoci = []

    ticker = 0
    for line in enhancerTable:
        if line[0][0] == '#' or line[0][0] == 'R':
            continue
        else:
            ticker+=1

            superLoci.append(utils.Locus(line[1],line[2],line[3],'.',name+'_'+line[0]))

            if ticker == top:
                break
    return utils.LocusCollection(superLoci,50)
Example #53
0
def generateSubpeakFASTA(gene_to_enhancer_dict, subpeaks, genome, projectName, projectFolder, constExtension):
    '''
    from a BED file of constituents
    generate a FASTA for the constituents contained within the candidate supers
    '''
    genomeDirectory = genome.directory()
    subpeakDict = {}
    subpeakBED = [['track name=' + projectName + ' color=204,0,204']]
    subpeakTable = utils.parseTable(subpeaks, '\t')

    subpeakLoci = [utils.Locus(l[0], int(l[1]), int(l[2]), '.') for l in subpeakTable]
    subpeakCollection = utils.LocusCollection(subpeakLoci, 50)


    for gene in gene_to_enhancer_dict.keys():
        subpeakDict[gene] = []
        for region in gene_to_enhancer_dict[gene]:
            overlaps = subpeakCollection.getOverlap(region)
            extendedOverlaps = [utils.makeSearchLocus(x, constExtension, constExtension) for x in overlaps]

            overlapCollectionTemp = utils.LocusCollection(extendedOverlaps, 50)
            overlapCollection = overlapCollectionTemp.stitchCollection()
            for overlap in overlapCollection.getLoci():
                subpeakBED.append([overlap.chr(), overlap.start(), overlap.end()])
                subpeakDict[gene].append(overlap)


    fasta = []

    for gene in subpeakDict:
        for subpeak in subpeakDict[gene]:

            fastaTitle = gene + '|'  + subpeak.chr() + '|' + str(subpeak.start()) + '|' + str(subpeak.end())
            fastaLine = utils.fetchSeq(genomeDirectory, subpeak.chr(), int(subpeak.start()+1), 
                                       int(subpeak.end()+1))

            fasta.append('>' + fastaTitle)
            fasta.append(string.upper(fastaLine))


    return subpeakBED,fasta
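A usage sketch (names are hypothetical); the returned BED and FASTA are typically written to disk afterwards:

subpeakBED, fasta = generateSubpeakFASTA(
    gene_to_enhancer_dict,              # {gene: [utils.Locus, ...]} from upstream mapping
    '/data/beds/atac_subpeaks.bed',     # hypothetical constituent BED
    genome,                             # genome object exposing .directory()
    'NB_CRC', '/projects/crc/', 100)    # project name, project folder, extension in bp

utils.unParseTable(subpeakBED, '/projects/crc/NB_CRC_subpeaks.bed', '\t')
utils.unParseTable(fasta, '/projects/crc/NB_CRC_SUBPEAKS.fa', '')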
Example #54
0
def makeRigerTable(foldTableFile,output=''):

    '''
    converts a log2 fold-change table into the input format expected by RIGER
    '''

    #need a table of this format
    rigerTable = [['Construct','GeneSymbol','NormalizedScore','Construct Rank','HairpinWeight']]
    #set weight to 1 for now

    foldTable = utils.parseTable(foldTableFile,'\t')

    constructOrder = utils.order([float(line[2]) for line in foldTable[1:]],decreasing=True)

    #make geneCountDict
    print("making gene count dictionary")
    geneCountDict= defaultdict(int)
    for line in foldTable[1:]:
        geneCountDict[line[1]] +=1

    print("iterating through constructs")
    constructRank = 1
    for i in constructOrder:
        rowIndex = i+1 # accounts for the header
        geneName = foldTable[rowIndex][1]
        if geneCountDict[geneName] == 1:
            print("Gene %s only has one guide RNA. Excluding from FRIGER analysis" % (geneName))
            continue

        newLine = foldTable[rowIndex][0:3] + [constructRank,1]
        rigerTable.append(newLine)
        constructRank += 1

    if len(output) == 0:
        output = string.replace(foldTableFile,'_log2Ratio.txt','_friger.txt')
    
    utils.unParseTable(rigerTable,output,'\t')

    return output
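A usage sketch (the path is hypothetical); by default the output name is derived from the input by swapping the _log2Ratio.txt suffix for _friger.txt:

foldTableFile = '/projects/screen/NB_SCREEN_log2Ratio.txt'  # hypothetical makeFoldTable output
rigerFile = makeRigerTable(foldTableFile)                   # writes NB_SCREEN_friger.txt
print(rigerFile)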
Example #55
0
def collapseRegionMap(regionMapFile,name='',controlBams=False):

    '''
    takes a regionMap file and collapses signal into a single column
    also fixes any start/stop sorting issues
    needs to take into account whether or not controls were used
    '''

    regionMap = utils.parseTable(regionMapFile,'\t')

    for n,line in enumerate(regionMap):
        
        if n ==0:
            #new header
            if len(name) == 0:
                name = 'MERGED_SIGNAL'
            regionMap[n] = line[0:6] +[name]

        else:
            newLine = list(line[0:6])
            if controlBams:
                signalLine = [float(x) for x in line[6:]]
                rankbyIndexes = range(0,len(signalLine)/2,1)
                controlIndexes = range(len(signalLine)/2,len(signalLine),1)
                metaVector = []
                for i,j in zip(rankbyIndexes,controlIndexes):
                    #min signal is 0
                    metaVector.append(max(0,signalLine[i] - signalLine[j]))
                metaSignal = numpy.mean(metaVector)
            else:
                metaSignal = numpy.mean([float(x) for x in line[6:]])
            regionMap[n] = newLine + [metaSignal]

    outputFile = string.replace(regionMapFile,'REGION','META')
    utils.unParseTable(regionMap,outputFile,'\t')
    return(outputFile)
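A minimal sketch of the collapse logic on one hypothetical signal row, assuming the rankby columns come first and their controls follow in the same order:

signalLine = [4.0, 2.5, 1.0, 0.5]   # two rankby signals followed by their two controls
half = len(signalLine) / 2          # Python 2 integer division, as in the function
metaVector = [max(0, signalLine[i] - signalLine[i + half]) for i in range(half)]
metaSignal = sum(metaVector) / float(len(metaVector))  # 2.5, equivalent to numpy.mean(metaVector)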
Example #56
0
import utils
from sys import argv


filename = argv[1]
outname = filename[:-3] + 'sorted.bed'


bedfile = utils.parseTable(filename, '\t')
out = []
for line in bedfile:

    coords = [int(line[1]), int(line[2])]
    start = min(coords)
    end = max(coords)

    newline = [line[0], start, end] + line[3:]
    out.append(newline)

utils.unParseTable(out, outname, '\t')
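This standalone script normalizes BED coordinates so that start <= end, writing e.g. regions.sorted.bed next to regions.bed. A hypothetical invocation (the script filename is assumed):

python sortBed.py regions.bed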
Example #57
0
def mapCollection(stitchedCollection, referenceCollection, bamFileList, mappedFolder, output, refName):
    '''
    makes a table of factor density in a stitched locus and ranks table by number of loci stitched together
    '''

    print('FORMATTING TABLE')
    loci = stitchedCollection.getLoci()

    locusTable = [['REGION_ID', 'CHROM', 'START', 'STOP', 'NUM_LOCI', 'CONSTITUENT_SIZE']]

    lociLenList = []

    # strip out any that are in chrY
    for locus in list(loci):
        if locus.chr() == 'chrY':
            loci.remove(locus)

    for locus in loci:
        # numLociList.append(int(stitchLocus.ID().split('_')[1]))
        lociLenList.append(locus.len())
        # numOrder = order(numLociList,decreasing=True)
    lenOrder = utils.order(lociLenList, decreasing=True)
    ticker = 0
    for i in lenOrder:
        ticker += 1
        if ticker % 1000 == 0:
            print(ticker)
        locus = loci[i]

        # First get the size of the enriched regions within the stitched locus
        refEnrichSize = 0
        refOverlappingLoci = referenceCollection.getOverlap(locus, 'both')
        for refLocus in refOverlappingLoci:
            refEnrichSize += refLocus.len()

        try:
            stitchCount = int(locus.ID().split('_')[0])
        except ValueError:
            stitchCount = 1
        coords = [int(x) for x in locus.coords()]

        locusTable.append([locus.ID(), locus.chr(), min(coords), max(coords), stitchCount, refEnrichSize])

    print('GETTING MAPPED DATA')
    print("USING A BAMFILE LIST:")
    print(bamFileList)
    for bamFile in bamFileList:

        bamFileName = bamFile.split('/')[-1]

        print('GETTING MAPPING DATA FOR  %s' % bamFile)
        # assumes standard convention for naming enriched region gffs

        # opening up the mapped GFF
        print('OPENING %s%s_%s_MAPPED/matrix.txt' % (mappedFolder, refName, bamFileName))

        mappedGFF = utils.parseTable('%s%s_%s_MAPPED/matrix.txt' % (mappedFolder, refName, bamFileName), '\t')

        signalDict = defaultdict(float)
        print('MAKING SIGNAL DICT FOR %s' % (bamFile))
        mappedLoci = []
        for line in mappedGFF[1:]:

            chrom = line[1].split('(')[0]
            start = int(line[1].split(':')[-1].split('-')[0])
            end = int(line[1].split(':')[-1].split('-')[1])
            mappedLoci.append(utils.Locus(chrom, start, end, '.', line[0]))
            try:
                signalDict[line[0]] = float(line[2]) * (abs(end - start))
            except ValueError:
                print('WARNING NO SIGNAL FOR LINE:')
                print(line)
                continue

        mappedCollection = utils.LocusCollection(mappedLoci, 500)
        locusTable[0].append(bamFileName)

        for i in range(1, len(locusTable)):
            signal = 0.0
            line = locusTable[i]
            lineLocus = utils.Locus(line[1], line[2], line[3], '.')
            overlappingRegions = mappedCollection.getOverlap(lineLocus, sense='both')
            for region in overlappingRegions:
                signal += signalDict[region.ID()]
            locusTable[i].append(signal)

    utils.unParseTable(locusTable, output, '\t')
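A small sketch of the matrix.txt row format the parsing above assumes (a GFF-locus column like 'chr1(.):1000-2000'), illustrated on a hypothetical line:

line = ['region_1', 'chr1(.):1000-2000', '0.75']   # hypothetical bamliquidator matrix row
chrom = line[1].split('(')[0]                      # 'chr1'
start = int(line[1].split(':')[-1].split('-')[0])  # 1000
end = int(line[1].split(':')[-1].split('-')[1])    # 2000
signal = float(line[2]) * abs(end - start)         # density * length = total signal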
Example #58
0
def finishRankOutput(dataFile,rankOutput,genome,mergeFolder,mergeName,name1,name2,cutOff=1.5,window = 100000,superOnly=True,plotBam=True):

    '''
    cleans up the rank output table
    makes a gff of all of the gained/lost supers beyond
    a certain cutoff w/ a window
    makes a list of gained genes and lost genes
    makes a bed of gained, lost, and conserved regions
    '''
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    #making sure window and cutoff are int/float
    cutOff = float(cutOff)
    window = int(window)
    genome = string.upper(genome)

    #make the output folder
    outputFolder =pipeline_dfci.formatFolder(mergeFolder+'output/',True)
    
    #bring in the old rank table
    rankEnhancerTable = utils.parseTable(rankOutput,'\t')
    
    #make a new formatted table
    header = rankEnhancerTable[0]
    header[-4] = 'DELTA RANK'
    header[-3] = 'IS_SUPER'
    formattedRankTable =[header]

    #the gffs
    gainedGFF = []
    lostGFF = []

    gainedWindowGFF = []
    lostWindowGFF = []

    if superOnly:
        enhancerType = 'SUPERS'
    else:
        enhancerType = 'ENHANCERS'

    #the beds
    if superOnly:
        gainedTrackHeader = 'track name="%s %s only SEs" description="%s super enhancers that are found only in %s vs %s" itemRGB=On color=255,0,0' % (genome,name2,genome,name2,name1)
        gainedBed = [[gainedTrackHeader]]
        conservedTrackHeader = 'track name="%s %s and %s SEs" description="%s super enhancers that are found in both %s vs %s" itemRGB=On color=0,0,0' % (genome,name1,name2,genome,name1,name2)
        conservedBed = [[conservedTrackHeader]]

        lostTrackHeader = 'track name="%s %s only SEs" description="%s super enhancers that are found only in %s vs %s" itemRGB=On color=0,255,0' % (genome,name1,genome,name1,name2)
        lostBed = [[lostTrackHeader]]
    else:
        gainedTrackHeader = 'track name="%s %s only enhancers" description="%s enhancers that are found only in %s vs %s" itemRGB=On color=255,0,0' % (genome,name2,genome,name2,name1)
        gainedBed = [[gainedTrackHeader]]
        conservedTrackHeader = 'track name="%s %s and %s enhancers" description="%s enhancers that are found in both %s vs %s" itemRGB=On color=0,0,0' % (genome,name1,name2,genome,name1,name2)
        conservedBed = [[conservedTrackHeader]]

        lostTrackHeader = 'track name="%s %s only enhancers" description="%s enhancers that are found only in %s vs %s" itemRGB=On color=0,255,0' % (genome,name1,genome,name1,name2)
        lostBed = [[lostTrackHeader]]



    #the genes
    geneTable =[['GENE','ENHANCER_ID','ENHANCER_CHROM','ENHANCER_START','ENHANCER_STOP',header[6],header[7],header[8],'STATUS']]

    for line in rankEnhancerTable[1:]:
        #fixing the enhancer ID
        line[0] = line[0].replace('_lociStitched','')
        formattedRankTable.append(line)

        #getting the genes
        geneList = []
        geneList += line[9].split(',')
        geneList += line[10].split(',')
        geneList += line[11].split(',')
        geneList = [x for x in geneList if len(x) >0]
        geneList = utils.uniquify(geneList)
        geneString = string.join(geneList,',')

        bedLine = [line[1],line[2],line[3],line[0],line[-4]]
        
        #for gained
        if float(line[6]) > cutOff:
            gffLine = [line[1],line[0],'',line[2],line[3],'','.','',geneString]
            gffWindowLine = [line[1],line[0],'',int(line[2])-window,int(line[3])+window,'','.','',geneString]
            gainedGFF.append(gffLine)
            gainedWindowGFF.append(gffWindowLine)
            geneStatus = name2
            gainedBed.append(bedLine)
        #for lost
        elif float(line[6]) < (-1 * cutOff):
            gffLine = [line[1],line[0],'',line[2],line[3],'','.','',geneString]
            gffWindowLine = [line[1],line[0],'',int(line[2])-window,int(line[3])+window,'','.','',geneString]
            lostGFF.append(gffLine)
            lostWindowGFF.append(gffWindowLine)
            geneStatus = name1
            lostBed.append(bedLine)
        #for conserved
        else:
            geneStatus = 'CONSERVED'
            conservedBed.append(bedLine)

        #now fill in the gene Table
        for gene in geneList:
            geneTableLine = [gene,line[0],line[1],line[2],line[3],line[6],line[7],line[8],geneStatus]
            geneTable.append(geneTableLine)

    #concat the bed
    fullBed = gainedBed + conservedBed + lostBed
            
    #start writing the output
    #there's the two gffs, the bed,the formatted table, the gene table
    
    
    #formatted table
    formattedFilename = "%s%s_%s_MERGED_%s_RANK_TABLE.txt" % (outputFolder,genome,mergeName,enhancerType)
    utils.unParseTable(formattedRankTable,formattedFilename,'\t')

    #gffs
    gffFolder = pipeline_dfci.formatFolder(outputFolder+'gff/',True)
    gffFilename_gained = "%s%s_%s_%s_ONLY_%s_-0_+0.gff" % (gffFolder,genome,mergeName,string.upper(name2),enhancerType)
    gffFilenameWindow_gained = "%s%s_%s_%s_ONLY_%s_-%sKB_+%sKB.gff" % (gffFolder,genome,mergeName,string.upper(name2),enhancerType,window/1000,window/1000)

    gffFilename_lost = "%s%s_%s_%s_ONLY_%s_-0_+0.gff" % (gffFolder,genome,mergeName,string.upper(name1),enhancerType)
    gffFilenameWindow_lost = "%s%s_%s_%s_ONLY_%s_-%sKB_+%sKB.gff" % (gffFolder,genome,mergeName,string.upper(name1),enhancerType,window/1000,window/1000)

    utils.unParseTable(gainedGFF,gffFilename_gained,'\t')
    utils.unParseTable(gainedWindowGFF,gffFilenameWindow_gained,'\t')
            
    utils.unParseTable(lostGFF,gffFilename_lost,'\t')
    utils.unParseTable(lostWindowGFF,gffFilenameWindow_lost,'\t')
    
    #bed
    bedFilename = "%s%s_%s_MERGED_%s.bed" % (outputFolder,genome,mergeName,enhancerType)
    utils.unParseTable(fullBed,bedFilename,'\t')

    #geneTable
    geneFilename = "%s%s_%s_MERGED_%s_GENE_TABLE.txt" % (outputFolder,genome,mergeName,enhancerType)
    utils.unParseTable(geneTable,geneFilename,'\t')

    #finally, move all of the plots to the output folder
    cmd = "cp %s%s_ROSE/*.pdf %s%s_%s_MERGED_%s_DELTA.pdf" % (mergeFolder,name1,outputFolder,genome,mergeName,enhancerType)
    os.system(cmd)

    cmd = "cp %s%s_ROSE/*RANK_PLOT.png %s%s_%s_MERGED_%s_RANK_PLOT.png" % (mergeFolder,name1,outputFolder,genome,mergeName,enhancerType)
    os.system(cmd)

    #now execute the bamPlot_turbo.py commands
    if plotBam:
        bam1 = dataDict[name1]['bam']
        bam2 = dataDict[name2]['bam']
        bamString = "%s,%s" % (bam1,bam2)
        nameString = "%s,%s" % (name1,name2)
        colorString = "0,0,0:100,100,100"

        #change dir
        os.chdir(pipelineDir)
    
        if len(gainedGFF) > 0:
            #gained command
            plotTitle = "%s_ONLY_SE" % (name2)
            cmd = 'python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE' % (genome,bamString,gffFilename_gained,outputFolder,nameString,colorString,plotTitle)
            os.system(cmd)

            #gained window command
            plotTitle = "%s_ONLY_SE_%sKB_WINDOW" % (name2,window/1000)
            cmd = 'python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE' % (genome,bamString,gffFilenameWindow_gained,outputFolder,nameString,colorString,plotTitle)
            os.system(cmd)

        if len(lostGFF) > 0:
            #lost command
            plotTitle = "%s_ONLY_SE" % (name1)
            cmd = 'python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE' % (genome,bamString,gffFilename_lost,outputFolder,nameString,colorString,plotTitle)
            os.system(cmd)

            #lost command
            plotTitle = "%s_ONLY_SE_%sKB_WINDOW" % (name1,window/1000)
            cmd = 'python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE' % (genome,bamString,gffFilenameWindow_lost,outputFolder,nameString,colorString,plotTitle)
            os.system(cmd)


    return
Example #59
0
def main():
    '''
    main run call
    '''
    debug = False

    from optparse import OptionParser
    usage = "usage: %prog [options] -g [GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]"
    parser = OptionParser(usage=usage)
    # required flags
    parser.add_option("-i", "--i", dest="input", nargs=1, default=None,
                      help="Enter a comma separated list of .gff or .bed file of binding sites used to make enhancers")
    parser.add_option("-r", "--rankby", dest="rankby", nargs=1, default=None,
                      help="Enter a comma separated list of bams to rank by")
    parser.add_option("-o", "--out", dest="out", nargs=1, default=None,
                      help="Enter an output folder")
    parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None,
                      help="Enter the genome build (MM9,MM8,HG18,HG19)")

    # optional flags
    parser.add_option("-n", "--name", dest="name", nargs=1, default=None,
                      help="Provide a name for the analysis otherwise ROSE will guess")
    parser.add_option("-c", "--control", dest="control", nargs=1, default=None,
                      help="Enter a comma separated list of control bams. Can either provide a single control bam for all rankby bams, or provide a control bam for each individual bam")
    parser.add_option("-s", "--stitch", dest="stitch", nargs=1, default='',
                      help="Enter a max linking distance for stitching. Default will determine optimal stitching parameter")
    parser.add_option("-t", "--tss", dest="tss", nargs=1, default=0,
                      help="Enter a distance from TSS to exclude. 0 = no TSS exclusion")

    parser.add_option("--mask", dest="mask", nargs=1, default=None,
                      help="Mask a set of regions from analysis.  Provide a .bed or .gff of masking regions")

    # RETRIEVING FLAGS
    (options, args) = parser.parse_args()

    if not options.input or not options.rankby or not options.out or not options.genome:
        print('ERROR: MISSING REQUIRED FLAGS (-g, -i, -r, -o)')
        parser.print_help()
        exit()

    # making the out folder if it doesn't exist
    outFolder = utils.formatFolder(options.out, True)

    # figuring out folder schema
    gffFolder = utils.formatFolder(outFolder + 'gff/', True)
    mappedFolder = utils.formatFolder(outFolder + 'mappedGFF/', True)

    # GETTING INPUT FILE(s)

    inputList = [inputFile for inputFile in  options.input.split(',') if len(inputFile) > 1]

    #converting all input files into GFFs and moving into the GFF folder
    inputGFFList = []
    for inputFile in inputList:
        if inputFile.split('.')[-1] == 'bed':
            # CONVERTING A BED TO GFF
            inputGFFName = inputFile.split('/')[-1][0:-4] #strips the last 4 characters i.e. '.bed'
            inputGFFFile = '%s%s.gff' % (gffFolder, inputGFFName)
            utils.bedToGFF(inputFile, inputGFFFile)
        elif inputFile.split('.')[-1] == 'gff':
            # COPY THE INPUT GFF TO THE GFF FOLDER

            os.system('cp %s %s' % (inputFile, gffFolder))
            inputGFFFile = '%s%s' % (gffFolder,inputFile.split('/')[-1])

        else:
            print('WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT')
            # COPY THE INPUT GFF TO THE GFF FOLDER
            os.system('cp %s %s' % (inputFile, gffFolder))
            inputGFFFile = '%s%s' % (gffFolder,inputFile.split('/')[-1])
        inputGFFList.append(inputGFFFile)
                                    

    # GETTING THE LIST OF BAMFILES TO PROCESS
    #either same number of bams for rankby and control 
    #or only 1 control #or none!
    #bamlist should be all rankby bams followed by control bams

    
    bamFileList = []
    if options.control:
        controlBamList = [bam for bam in options.control.split(',') if len(bam) >0]
        rankbyBamList = [bam for bam in options.rankby.split(',') if len(bam) >0]

        if len(controlBamList) == len(rankbyBamList):
            #case where an equal number of backgrounds are given
            bamFileList = rankbyBamList + controlBamList
        elif len(controlBamList) == 1:
            #case where a universal background is applied
            bamFileList = rankbyBamList + controlBamList*len(rankbyBamList)
        else:
            print('ERROR: EITHER PROVIDE A SINGLE CONTROL BAM FOR ALL SAMPLES, OR ONE CONTROL BAM FOR EACH SAMPLE')
            sys.exit()
    else:
        bamFileList = [bam for bam in options.rankby.split(',') if len(bam) > 0]




    # Stitch parameter
    if options.stitch == '':
        stitchWindow = ''
    else:
        stitchWindow = int(options.stitch)

    # tss options
    tssWindow = int(options.tss)
    if tssWindow != 0:
        removeTSS = True
    else:
        removeTSS = False


    # GETTING THE GENOME
    genome = string.upper(options.genome)
    print('USING %s AS THE GENOME' % (genome))

    # GETTING THE CORRECT ANNOT FILE

    genomeDict = {
        'HG18': '%s/annotation/hg18_refseq.ucsc' % (pipeline_dir),
        'MM9': '%s/annotation/mm9_refseq.ucsc' % (pipeline_dir),
        'HG19': '%s/annotation/hg19_refseq.ucsc' % (pipeline_dir),
        'MM8': '%s/annotation/mm8_refseq.ucsc' % (pipeline_dir),
        'MM10': '%s/annotation/mm10_refseq.ucsc' % (pipeline_dir),
        'RN4': '%s/annotation/rn4_refseq.ucsc' % (pipeline_dir),
    }

    try:
        annotFile = genomeDict[genome.upper()]
    except KeyError:
        print('ERROR: UNSUPPORTED GENOME TYPE %s' % (genome))
        sys.exit()


    #FINDING THE ANALYSIS NAME
    if options.name:
        inputName = options.name
    else:
        inputName = inputGFFList[0].split('/')[-1].split('.')[0]
    print('USING %s AS THE ANALYSIS NAME' % (inputName))


    print('FORMATTING INPUT REGIONS')
    # MAKING THE RAW INPUT FILE FROM THE INPUT GFFs
    #use a simpler unique region naming system 
    if len(inputGFFList) == 1:
        inputGFF = utils.parseTable(inputGFFList[0],'\t')
    else:
        inputLoci = []
        for gffFile in inputGFFList:
            print('\tprocessing %s' % (gffFile))
            gff = utils.parseTable(gffFile,'\t')
            gffCollection = utils.gffToLocusCollection(gff,50)
            inputLoci += gffCollection.getLoci()


        inputCollection = utils.LocusCollection(inputLoci,50)
        inputCollection = inputCollection.stitchCollection() # stitches to produce unique regions

        inputGFF = utils.locusCollectionToGFF(inputCollection)

    formattedGFF = []
    #now number things appropriately
    for i,line in enumerate(inputGFF):
        
        #use the coordinates to make a new id inputname_chr_sense_start_stop
        chrom = line[0]
        coords = [int(line[3]) ,int(line[4])]
        sense = line[6]

        lineID = '%s_%s' % (inputName,str(i+1)) #1 indexing
        
        newLine = [chrom,lineID,lineID,min(coords),max(coords),'',sense,'',lineID]
        formattedGFF.append(newLine)
        
    #name of the master input gff file
    masterGFFFile = '%s%s_%s_ALL_-0_+0.gff' % (gffFolder,string.upper(genome),inputName)
    utils.unParseTable(formattedGFF,masterGFFFile,'\t')

    print('USING %s AS THE INPUT GFF' % (masterGFFFile))


    # MAKING THE START DICT
    print('MAKING START DICT')
    startDict = utils.makeStartDict(annotFile)

    #GET CHROMS FOUND IN THE BAMS
    print('GETTING CHROMS IN BAMFILES')
    bamChromList = getBamChromList(bamFileList)
    print("USING THE FOLLOWING CHROMS")
    print(bamChromList)

    #LOADING IN THE GFF AND FILTERING BY CHROM
    print('LOADING AND FILTERING THE GFF')
    inputGFF = filterGFF(masterGFFFile,bamChromList)
    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print('LOADING IN GFF REGIONS')
    referenceCollection = utils.gffToLocusCollection(inputGFF)

    print('CHECKING REFERENCE COLLECTION:')
    checkRefCollection(referenceCollection)
        

    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    if options.mask:
        maskFile = options.mask
        # if it's a bed file
        if maskFile.split('.')[-1].upper() == 'BED':
            maskGFF = utils.bedToGFF(maskFile)
        elif maskFile.split('.')[-1].upper() == 'GFF':
            maskGFF = utils.parseTable(maskFile, '\t')
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            sys.exit()
        maskCollection = utils.gffToLocusCollection(maskGFF)

        # now mask the reference loci
        referenceLoci = referenceCollection.getLoci()
        filteredLoci = [locus for locus in referenceLoci if len(maskCollection.getOverlap(locus, 'both')) == 0]
        print("FILTERED OUT %s LOCI THAT WERE MASKED IN %s" % (len(referenceLoci) - len(filteredLoci), maskFile))
        referenceCollection = utils.LocusCollection(filteredLoci, 50)

    # NOW STITCH REGIONS
    print('STITCHING REGIONS TOGETHER')
    stitchedCollection, debugOutput, stitchWindow = regionStitching(referenceCollection, inputName, outFolder, stitchWindow, tssWindow, annotFile, removeTSS)

    # NOW MAKE A STITCHED COLLECTION GFF
    print('MAKING GFF FROM STITCHED COLLECTION')
    stitchedGFF = utils.locusCollectionToGFF(stitchedCollection)

    print(stitchWindow)
    print(type(stitchWindow))
    if not removeTSS:
        stitchedGFFFile = '%s%s_%sKB_STITCHED.gff' % (gffFolder, inputName, str(stitchWindow / 1000))
        stitchedGFFName = '%s_%sKB_STITCHED' % (inputName, str(stitchWindow / 1000))
        debugOutFile = '%s%s_%sKB_STITCHED.debug' % (gffFolder, inputName, str(stitchWindow / 1000))
    else:
        stitchedGFFFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.gff' % (gffFolder, inputName, str(stitchWindow / 1000))
        stitchedGFFName = '%s_%sKB_STITCHED_TSS_DISTAL' % (inputName, str(stitchWindow / 1000))
        debugOutFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.debug' % (gffFolder, inputName, str(stitchWindow / 1000))

    # WRITING DEBUG OUTPUT TO DISK

    if debug:
        print('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile))
        utils.unParseTable(debugOutput, debugOutFile, '\t')

    # WRITE THE GFF TO DISK
    print('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile))
    utils.unParseTable(stitchedGFF, stitchedGFFFile, '\t')

    # SETTING UP THE OVERALL OUTPUT FILE
    outputFile1 = outFolder + stitchedGFFName + '_ENHANCER_REGION_MAP.txt'
    print('OUTPUT WILL BE WRITTEN TO  %s' % (outputFile1))



    # MAPPING TO THE NON STITCHED (ORIGINAL GFF)
    # MAPPING TO THE STITCHED GFF

    # Try to use the bamliquidator_path script on the cluster; otherwise fail over to a local copy (in PATH), otherwise fail.



    bamFileListUnique = utils.uniquify(bamFileList)
    #prevent redundant mapping
    print("MAPPING TO THE FOLLOWING BAMS:")
    print(bamFileListUnique)
    for bamFile in bamFileListUnique:

        bamFileName = bamFile.split('/')[-1]

        # MAPPING TO THE STITCHED GFF
        mappedOut1Folder = '%s%s_%s_MAPPED' % (mappedFolder, stitchedGFFName, bamFileName)
        mappedOut1File = '%s%s_%s_MAPPED/matrix.txt' % (mappedFolder, stitchedGFFName, bamFileName)
        if utils.checkOutput(mappedOut1File, 0.2, 0.2):
            print("FOUND %s MAPPING DATA FOR BAM: %s" % (stitchedGFFFile, mappedOut1File))
        else:
            cmd1 = bamliquidator_path + " --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % (stitchedGFFFile, mappedOut1Folder, bamFile)
            print(cmd1)

            os.system(cmd1)
            if utils.checkOutput(mappedOut1File,0.2,5):
                print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (stitchedGFFFile, bamFileName))
            else:
                print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (stitchedGFFFile, bamFileName))
                sys.exit()

    print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS')
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    mapCollection(stitchedCollection, referenceCollection, bamFileList, mappedFolder, outputFile1, refName=stitchedGFFName)


    print('FINDING AVERAGE SIGNAL AMONGST BAMS')
    metaOutputFile = collapseRegionMap(outputFile1,inputName + '_MERGED_SIGNAL',controlBams=options.control)

    #now try the merging

    print('CALLING AND PLOTTING SUPER-ENHANCERS')



    rankbyName = inputName + '_MERGED_SIGNAL'
    controlName = 'NONE'
    cmd = 'Rscript %sROSE2_callSuper.R %s %s %s %s' % (pipeline_dir,outFolder, metaOutputFile, inputName, controlName)
    print(cmd)

    os.system(cmd)
    

    # calling the gene mapper
    print('CALLING GENE MAPPING')

    superTableFile = "%s_SuperEnhancers.table.txt" % (inputName)

    #for now don't use ranking bam to call top genes
    cmd = "python %sROSE2_geneMapper.py -g %s -i %s%s -f" % (pipeline_dir,genome, outFolder, superTableFile)
    print(cmd)
    os.system(cmd)


    stretchTableFile = "%s_StretchEnhancers.table.txt" % (inputName)
 
    cmd = "python %sROSE2_geneMapper.py -g %s -i %s%s -f" % (pipeline_dir,genome, outFolder, stretchTableFile)
    print(cmd)
    os.system(cmd)


    superStretchTableFile = "%s_SuperStretchEnhancers.table.txt" % (inputName)

    cmd = "python %sROSE2_geneMapper.py -g %s -i %s%s -f" % (pipeline_dir,genome, outFolder, superStretchTableFile)
    os.system(cmd)
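A hypothetical command line for this entry point (the script filename ROSE2_META.py is an assumption inferred from the ROSE2 helper scripts it shells out to):

python ROSE2_META.py -g HG19 \
    -i /data/gff/HG19_sample_peaks.bed \
    -r /data/bams/sample1.bam,/data/bams/sample2.bam \
    -c /data/bams/control.bam \
    -o /projects/rose_meta/ -n NB_META -t 2500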
Example #60
0
def finishRankOutput(dataFile, rankOutput, genome, mergeFolder, mergeName, name1, name2, cutOff=1.5, window=100000):

    """
    cleans up the rank output table
    makes a gff of all of the gained/lost supers beyond
    a certain cutoff w/ a window
    makes a list of gained genes and lost genes
    makes a bed of gained, lost, and conserved regions
    """
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    # making sure window and cutoff are int/float
    cutOff = float(cutOff)
    window = int(window)
    genome = string.upper(genome)

    # make the output folder
    outputFolder = pipeline_dfci.formatFolder(mergeFolder + "output/", True)

    # bring in the old rank table
    rankEnhancerTable = utils.parseTable(rankOutput, "\t")

    # make a new formatted table
    header = rankEnhancerTable[0]
    header[-4] = "DELTA RANK"
    header[-3] = "IS_SUPER"
    formattedRankTable = [header]

    # the gffs
    gainedGFF = []
    lostGFF = []

    gainedWindowGFF = []
    lostWindowGFF = []

    # the beds
    gainedTrackHeader = (
        'track name="%s %s only SEs" description="%s super enhancers that are found only in %s vs %s" itemRGB=On color=255,0,0'
        % (genome, name2, genome, name2, name1)
    )
    gainedBed = [[gainedTrackHeader]]
    conservedTrackHeader = (
        'track name="%s %s and %s SEs" description="%s super enhancers that are found in both %s vs %s" itemRGB=On color=0,0,0'
        % (genome, name1, name2, genome, name1, name2)
    )
    conservedBed = [[conservedTrackHeader]]

    lostTrackHeader = (
        'track name="%s %s only SEs" description="%s super enhancers that are found only in %s vs %s" itemRGB=On color=0,255,0'
        % (genome, name1, genome, name1, name2)
    )
    lostBed = [[lostTrackHeader]]

    # the genes
    geneTable = [
        [
            "GENE",
            "ENHANCER_ID",
            "ENHANCER_CHROM",
            "ENHANCER_START",
            "ENHANCER_STOP",
            header[6],
            header[7],
            header[8],
            "STATUS",
        ]
    ]

    for line in rankEnhancerTable[1:]:
        # fixing the enhancer ID
        line[0] = line[0].replace("_lociStitched", "")
        formattedRankTable.append(line)

        # getting the genes
        geneList = []
        geneList += line[9].split(",")
        geneList += line[10].split(",")
        geneList += line[11].split(",")
        geneList = [x for x in geneList if len(x) > 0]
        geneList = utils.uniquify(geneList)
        geneString = string.join(geneList, ",")

        bedLine = [line[1], line[2], line[3], line[0], line[-4]]

        # for gained
        if float(line[6]) > cutOff:
            gffLine = [line[1], line[0], "", line[2], line[3], "", ".", "", geneString]
            gffWindowLine = [
                line[1],
                line[0],
                "",
                int(line[2]) - window,
                int(line[3]) + window,
                "",
                ".",
                "",
                geneString,
            ]
            gainedGFF.append(gffLine)
            gainedWindowGFF.append(gffWindowLine)
            geneStatus = name2
            gainedBed.append(bedLine)
        # for lost
        elif float(line[6]) < (-1 * cutOff):
            gffLine = [line[1], line[0], "", line[2], line[3], "", ".", "", geneString]
            gffWindowLine = [
                line[1],
                line[0],
                "",
                int(line[2]) - window,
                int(line[3]) + window,
                "",
                ".",
                "",
                geneString,
            ]
            lostGFF.append(gffLine)
            lostWindowGFF.append(gffWindowLine)
            geneStatus = name1
            lostBed.append(bedLine)
        # for conserved
        else:
            geneStatus = "CONSERVED"
            conservedBed.append(bedLine)

        # now fill in the gene Table
        for gene in geneList:
            geneTableLine = [gene, line[0], line[1], line[2], line[3], line[6], line[7], line[8], geneStatus]
            geneTable.append(geneTableLine)

    # concat the bed
    fullBed = gainedBed + conservedBed + lostBed

    # start writing the output
    # there's the two gffs, the bed,the formatted table, the gene table

    # formatted table
    formattedFilename = "%s%s_%s_MERGED_SUPERS_RANK_TABLE.txt" % (outputFolder, genome, mergeName)
    utils.unParseTable(formattedRankTable, formattedFilename, "\t")

    # gffs
    gffFolder = pipeline_dfci.formatFolder(outputFolder + "gff/", True)
    gffFilename_gained = "%s%s_%s_%s_ONLY_SUPERS_-0_+0.gff" % (gffFolder, genome, mergeName, string.upper(name2))
    gffFilenameWindow_gained = "%s%s_%s_%s_ONLY_SUPERS_-%sKB_+%sKB.gff" % (
        gffFolder,
        genome,
        mergeName,
        string.upper(name2),
        window / 1000,
        window / 1000,
    )

    gffFilename_lost = "%s%s_%s_%s_ONLY_SUPERS_-0_+0.gff" % (gffFolder, genome, mergeName, string.upper(name1))
    gffFilenameWindow_lost = "%s%s_%s_%s_ONLY_SUPERS_-%sKB_+%sKB.gff" % (
        gffFolder,
        genome,
        mergeName,
        string.upper(name1),
        window / 1000,
        window / 1000,
    )

    utils.unParseTable(gainedGFF, gffFilename_gained, "\t")
    utils.unParseTable(gainedWindowGFF, gffFilenameWindow_gained, "\t")

    utils.unParseTable(lostGFF, gffFilename_lost, "\t")
    utils.unParseTable(lostWindowGFF, gffFilenameWindow_lost, "\t")

    # bed
    bedFilename = "%s%s_%s_MERGED_SUPERS.bed" % (outputFolder, genome, mergeName)
    utils.unParseTable(fullBed, bedFilename, "\t")

    # geneTable
    geneFilename = "%s%s_%s_MERGED_SUPERS_GENE_TABLE.txt" % (outputFolder, genome, mergeName)
    utils.unParseTable(geneTable, geneFilename, "\t")

    # finally, move all of the plots to the output folder
    cmd = "cp %s%s_ROSE/*.pdf %s%s_%s_MERGED_SUPERS_DELTA.pdf" % (mergeFolder, name1, outputFolder, genome, mergeName)
    os.system(cmd)

    cmd = "cp %s%s_ROSE/*RANK_PLOT.png %s%s_%s_MERGED_SUPERS_RANK_PLOT.png" % (
        mergeFolder,
        name1,
        outputFolder,
        genome,
        mergeName,
    )
    os.system(cmd)

    # now execute the bamPlot_turbo.py commands
    bam1 = dataDict[name1]["bam"]
    bam2 = dataDict[name2]["bam"]
    bamString = "%s,%s" % (bam1, bam2)
    nameString = "%s,%s" % (name1, name2)
    colorString = "0,0,0:100,100,100"

    # change dir
    os.chdir("/ark/home/cl512/pipeline/")

    if len(gainedGFF) > 0:
        # gained command
        plotTitle = "%s_ONLY_SE" % (name2)
        cmd = "python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE" % (
            genome,
            bamString,
            gffFilename_gained,
            outputFolder,
            nameString,
            colorString,
            plotTitle,
        )
        os.system(cmd)

        # gained window command
        plotTitle = "%s_ONLY_SE_%sKB_WINDOW" % (name2, window / 1000)
        cmd = "python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE" % (
            genome,
            bamString,
            gffFilenameWindow_gained,
            outputFolder,
            nameString,
            colorString,
            plotTitle,
        )
        os.system(cmd)

    if len(lostGFF) > 0:
        # lost command
        plotTitle = "%s_ONLY_SE" % (name1)
        cmd = "python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE" % (
            genome,
            bamString,
            gffFilename_lost,
            outputFolder,
            nameString,
            colorString,
            plotTitle,
        )
        os.system(cmd)

        # lost command
        plotTitle = "%s_ONLY_SE_%sKB_WINDOW" % (name1, window / 1000)
        cmd = "python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE" % (
            genome,
            bamString,
            gffFilenameWindow_lost,
            outputFolder,
            nameString,
            colorString,
            plotTitle,
        )
        os.system(cmd)

    return