Example #1
def createExpressionDict(annotationFile, projectFolder, projectName, refseqToNameDict, expCutoff,expressionFile=''):
    '''
    input: an activity table with refseq in first column and expression or promoter
    acetylation in second column
    output: a dictionary keyed by refseq that points to activity
    '''

    print('CREATING EXPRESSION DICTIONARY')

    if not expressionFile:
        expressionFilename = projectFolder + 'bamliquidator/matrix.txt'
    else:
        expressionFilename = expressionFile
        
    expressionTable = utils.parseTable(expressionFilename, '\t')

    expressionDictNM = {}
    expressionDictGene = {}

    for line in expressionTable[1:]:
        trid = line[0]
        geneName = refseqToNameDict[trid]
        try:
            exp = float(line[2])
        except IndexError:
            exp = float(line[1])

        # Save the expression value of each NMid in a dict, keep higher value if multiple
        if trid in expressionDictNM and exp > expressionDictNM[trid]:
            expressionDictNM[trid] = exp
        elif trid not in expressionDictNM:
            expressionDictNM[trid] = exp

        # Save the value of the expression if it's the highest for that gene
        if geneName in expressionDictGene and exp > expressionDictGene[geneName]:
            expressionDictGene[geneName] = exp
        elif geneName not in expressionDictGene:
            expressionDictGene[geneName] = exp

    cutoff = numpy.percentile(expressionDictGene.values(), expCutoff)
    print('Expression cutoff: ' + str(cutoff))

    expressedGenes = []
    expressedNM = []

    for nmid in expressionDictNM:
        if float(expressionDictNM[nmid]) > cutoff:
            expressedGenes.append(refseqToNameDict[nmid])
            expressedNM.append(nmid)

    expressedGenes = utils.uniquify(expressedGenes)
    Genefilename = projectFolder + projectName + '_EXPRESSED_GENES.txt'
    utils.unParseTable(expressedGenes, Genefilename, '')

    expressedNM = utils.uniquify(expressedNM)
    NMfilename = projectFolder + projectName + '_EXPRESSED_NM.txt'
    utils.unParseTable(expressedNM, NMfilename, '')
      
    return expressedNM, expressionDictNM
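Note: every example on this page relies on a uniquify helper (usually utils.uniquify). The helper itself is never shown, but the way it is used throughout (deduplicating lists before sorting or writing output) matches the classic order-preserving recipe. A minimal sketch, assuming the helper simply drops repeated items while keeping first-seen order:

def uniquify(seq):
    '''Return a list with duplicates removed, keeping first-seen order.

    Minimal stand-in for the utils.uniquify helper these examples call;
    the real pipeline version may differ in details.
    '''
    seen = set()
    result = []
    for item in seq:
        if item not in seen:
            seen.add(item)
            result.append(item)
    return result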
Example #2
def getBamChromList(bamFileList):
    '''
    gets the consensus list of chromosomes mapped by the bams
    '''

    #start w/ the first bam
    cmd = '%s idxstats %s' % (samtoolsPath, bamFileList[0])
    idxStats = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
    idxStats = idxStats.communicate()
    finalChromList = [
        line.split('\t')[0] for line in idxStats[0].split('\n')[0:-2]
    ]

    #now go through each additional bam
    for bamFile in bamFileList[1:]:
        cmd = '%s idxstats %s' % (samtoolsPath, bamFile)
        idxStats = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
        idxStats = idxStats.communicate()
        chromList = [
            line.split('\t')[0] for line in idxStats[0].split('\n')[0:-2]
        ]
        finalChromList = [
            chrom for chrom in finalChromList if chromList.count(chrom) != 0
        ]

    return utils.uniquify(finalChromList)
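The consensus above is built by repeated list intersection, and chromList.count(chrom) makes each pass O(n*m). The same consensus can be computed with set intersection; a sketch, assuming downstream code only needs membership (order is restored by sorting) and reusing the module-level samtoolsPath from above:

import subprocess

def getBamChromListSets(bamFileList):
    # same `samtools idxstats` parsing as above, but intersecting sets
    consensus = None  # assumes bamFileList is non-empty
    for bamFile in bamFileList:
        cmd = '%s idxstats %s' % (samtoolsPath, bamFile)
        out = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True).communicate()[0]
        chroms = set(line.split('\t')[0] for line in out.split('\n')[0:-2])
        consensus = chroms if consensus is None else consensus & chroms
    return sorted(consensus)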
Example #3
def __init__(self,
             poll_id: str,
             poll_title: str,
             options: List[Any],
             is_immortal=False,
             updated_since_start=True):
    if options is None:
        options = []
    self.poll_id = poll_id
    self.creation_time = time.time()
    self.last_update = time.time()
    self.poll_title = poll_title
    self.options = uniquify(options)
    self.reaction_to_option = {
        reaction_emojies[k]: options[k]
        for k in range(len(options))
    }
    self.option_to_reaction = {
        options[k]: reaction_emojies[k]
        for k in range(len(options))
    }
    self.participants = dict()
    self.option_to_participants = {key: [] for key in options}
    self.sent_message = None
    self.received_message = None
    self.is_immortal = is_immortal
    self.is_enabled = True
    self.updated_since_start = updated_since_start
Example #4
def mapGFFLineToBed(gffLine, outFolder, nBins, bedCollection, header=''):
    '''
    for every line produces a file with all of the rectangles to draw
    '''

    if len(header) == 0:
        gffString = '%s_%s_%s_%s' % (gffLine[0], gffLine[6], gffLine[3], gffLine[4])
    else:
        gffString = header
    diagramTable = [[0, 0, 0, 0]]
    nameTable = [['', 0, 0]]
    gffLocus = utils.Locus(gffLine[0], int(gffLine[3]), int(gffLine[4]), gffLine[6], gffLine[1])

    scaleFactor = float(nBins) / gffLocus.len()
    # plotting buffer for diagrams
    # plotBuffer = int(gffLocus.len() / float(nBins) * 20) # UNUSED (?)

    overlapLoci = bedCollection.getOverlap(gffLocus, sense='both')
    print("IDENTIFIED %s OVERLAPPING BED LOCI FOR REGION %s" %
          (len(overlapLoci), gffLine))

    # since beds come from multiple sources, we want to figure out how to offset them
    offsetDict = {}  # this will store each ID name
    bedNamesList = utils.uniquify([locus.ID() for locus in overlapLoci])
    bedNamesList.sort()
    for i in range(len(bedNamesList)):
        offsetDict[bedNamesList[i]] = 2 * i  # offsets different categories of bed regions

    if gffLine[6] == '-':
        refPoint = int(gffLine[4])
    else:
        refPoint = int(gffLine[3])

    # fill out the name table
    for name in bedNamesList:
        offset = offsetDict[name]
        nameTable.append([name, 0, 0.0 - offset])

    for bedLocus in overlapLoci:

        offset = offsetDict[bedLocus.ID()]

        [start, stop] = [abs(x - refPoint) * scaleFactor for x in bedLocus.coords()]

        diagramTable.append([start, -0.5 - offset, stop, 0.5 - offset])

    utils.unParseTable(diagramTable, outFolder + gffString + '_bedDiagramTemp.txt', '\t')
    utils.unParseTable(nameTable, outFolder + gffString + '_bedNameTemp.txt', '\t')
Example #5
def splitRegions(inputGFF, tssCollection):
    '''
    splits input regions against the TSS collection: if even a single
    coordinate is shared with a +/-1kb TSS locus, the region gets split
    '''
    splitGFF = []
    for line in inputGFF:

        chrom = line[0]
        regionID = line[1]
        lineLocus = utils.Locus(line[0],line[3],line[4],'.')

        overlappingLoci = tssCollection.getOverlap(lineLocus)
        if len(overlappingLoci) > 0: #case where a tss Overlap
            #identify the parts of the line locus that are contained
            localTSSCollection = utils.LocusCollection(overlappingLoci,50)
            overlappingCoords = lineLocus.coords()
            for tssLocus in overlappingLoci:
                overlappingCoords += tssLocus.coords()

            overlappingCoords = utils.uniquify(overlappingCoords)
            overlappingCoords.sort()

            #you need to hack and slash add 1 to the last coordinate of the overlappingCoords
            overlappingCoords[-1] +=1

            i = 0
            regionTicker = 1
            while i < (len(overlappingCoords)-1):
                start = int(overlappingCoords[i])
                stop = int(overlappingCoords[(i+1)])-1
                if (stop - start) < 50: #this eliminates really tiny regions
                    i+=1
                    continue
                splitLocus = utils.Locus(chrom,start+1,stop,'.')

                if lineLocus.overlaps(splitLocus): #has to be a mycn site
                    newID = '%s_%s' % (regionID,regionTicker)
                    tssStatus = 0
                    if localTSSCollection.getOverlap(splitLocus):
                        tssStatus = 1
                    splitGFFLine = [chrom,newID,newID,start,stop,'','.',tssStatus,newID]

                    splitGFF.append(splitGFFLine)
                    regionTicker+=1
                i+=1
        else:
            line[7] = 0
            splitGFF.append(line)

    return splitGFF
Example #6
def get_all_os_versions():
    """
    Collapse the above list of OS versions for usage/display by the CLI/webapp.
    """
    results = ['']
    for x in VALID_OS_VERSIONS.keys():
        for y in VALID_OS_VERSIONS[x]:
            results.append(y)
    results = utils.uniquify(results)
    results.sort()
    return results
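The nested loop above just flattens the dict's values before deduplicating and sorting; since the sort discards input order anyway, the collapse can also be written with itertools.chain and a set. A hypothetical variant, assuming VALID_OS_VERSIONS maps OS names to iterables of version strings as above:

import itertools

def get_all_os_versions_flat():
    # flatten, deduplicate, and sort in one pass
    results = itertools.chain([''], *VALID_OS_VERSIONS.values())
    return sorted(set(results))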
Example #7
def findMotifs(subpeakFasta, bg_path, candidate_tf_list, projectFolder,
               analysis_name, motifConvertFile, motifDatabaseFile):
    '''
    runs FIMO over the motifs of the candidate TFs against the subpeak fasta
    returns the path to the FIMO output table
    '''
    fimoFolder = utils.formatFolder(projectFolder + 'FIMO/', True)
    subpeak_name = subpeakFasta.split('/')[-1].split('.')[0]
    output = '%s%s_fimo.txt' % (fimoFolder, subpeak_name)
    # Create a dictionary to call motif names keyed on gene names
    motifDatabase = utils.parseTable(motifConvertFile, '\t')
    motifDatabaseDict = {}  #create a dict keyed by TF with multiple motifs

    for line in motifDatabase:
        motifDatabaseDict[line[1]] = []
    for line in motifDatabase:
        motifDatabaseDict[line[1]].append(line[0])

    candidate_tf_list.sort()

    print(candidate_tf_list)

    #now make a list of all motifs
    motif_list = []
    for tf in candidate_tf_list:
        motif_list += motifDatabaseDict[tf]

    motif_list = utils.uniquify(motif_list)

    fimo_bash_path = '%s%s_fimo.sh' % (fimoFolder, analysis_name)
    fimo_bash = open(fimo_bash_path, 'w')
    fimo_bash.write('#!/usr/bin/bash\n\n')

    fimoCmd = 'fimo'
    for motif in motif_list:
        fimoCmd += ' --motif ' + "'%s'" % (str(motif))

    #fimoCmd += ' --thresh 1e-5' #if you want to increase stringency
    fimoCmd += ' -verbosity 1'  # thanks for that ;)!
    fimoCmd += ' -text'
    fimoCmd += ' -oc ' + projectFolder + 'FIMO'
    fimoCmd += ' --bgfile %s' % (bg_path)
    fimoCmd += ' ' + motifDatabaseFile + ' '
    fimoCmd += subpeakFasta
    fimoCmd += ' > ' + output
    print(fimoCmd)
    fimo_bash.write(fimoCmd)
    fimo_bash.close()

    fimoOutput = subprocess.call(fimoCmd, shell=True)  # blocks until fimo finishes

    return output
Example #8
def checkRefCollection(referenceCollection):
    '''
    makes sure the names of all loci in the reference collection are unique
    '''

    namesList = [locus.ID() for locus in referenceCollection.getLoci()]

    if len(namesList) != len(utils.uniquify(namesList)):
        print("ERROR: REGIONS HAVE NON-UNIQUE IDENTIFIERS")
        sys.exit()
    else:
        print("REFERENCE COLLECTION PASSES QC")
        return
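The len(namesList) != len(uniquify(namesList)) comparison detects duplicates but cannot say which IDs collide. A hedged variant (not part of the original pipeline) that also names the offenders via collections.Counter:

import sys
from collections import Counter

def checkRefCollectionVerbose(referenceCollection):
    '''like checkRefCollection, but reports the duplicated locus IDs'''
    namesList = [locus.ID() for locus in referenceCollection.getLoci()]
    duplicates = [name for name, n in Counter(namesList).items() if n > 1]
    if duplicates:
        print("ERROR: REGIONS HAVE NON-UNIQUE IDENTIFIERS: %s" % ','.join(duplicates))
        sys.exit()
    print("REFERENCE COLLECTION PASSES QC")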
Example #9
def addLengths(gene_table_path, peak_table_path):
    '''
    add tss and distal lengths to a gene table
    using the peak table
    '''

    output_path = string.replace(gene_table_path, 'GENE_TABLE',
                                 'GENE_TABLE_LENGTH')

    print(output_path)

    tss_dict = defaultdict(int)
    distal_dict = defaultdict(int)

    peak_table = utils.parseTable(peak_table_path, '\t')
    for line in peak_table[1:]:
        #get the genes
        gene_list = []
        if len(line) == 15:
            gene_list += line[-1].split(',')
            gene_list += line[-2].split(',')
        elif len(line) == 14:
            gene_list += line[-1].split(',')
        else:
            continue

        gene_list = utils.uniquify(
            [gene for gene in gene_list if len(gene) > 0])

        for gene in gene_list:
            if int(line[5]) == 1:
                tss_dict[gene] += int(line[4])
            else:
                distal_dict[gene] += int(line[4])

    #now fill out the gene table
    gene_table = utils.parseTable(gene_table_path, '\t')

    output_table = [gene_table[0] + ['TSS_LENGTH', 'DISTAL_LENGTH']]

    for line in gene_table[1:]:
        gene = line[0]
        new_line = line + [tss_dict[gene], distal_dict[gene]]
        output_table.append(new_line)

    utils.unParseTable(output_table, output_path, '\t')

    return output_path
Example #10
def filterGFF(gffFile, chromList):
    '''
    takes in a gff and filters out all lines that don't belong to a chrom in the chromList
    '''
    gff = utils.parseTable(gffFile, '\t')
    filteredGFF = []
    excludeList = []
    for line in gff:
        if chromList.count(line[0]) == 1:
            filteredGFF.append(line)
        else:
            excludeList.append(line[0])

    excludeList = utils.uniquify(excludeList)
    if len(excludeList) > 0:
        print("EXCLUDED GFF REGIONS FROM THE FALLING CHROMS: %s" %
              (','.join(excludeList)))

    return filteredGFF
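chromList.count(line[0]) == 1 rescans the chromosome list for every GFF line; that is fine for a short list, but the same filter reads more directly with a set. A sketch under the same inputs and helpers:

def filterGFFSet(gffFile, chromList):
    '''keep only GFF lines whose chrom is in chromList (set-membership variant)'''
    chromSet = set(chromList)
    gff = utils.parseTable(gffFile, '\t')
    filteredGFF = [line for line in gff if line[0] in chromSet]
    excluded = utils.uniquify([line[0] for line in gff if line[0] not in chromSet])
    if excluded:
        print("EXCLUDED GFF REGIONS FROM THE FOLLOWING CHROMS: %s" % ','.join(excluded))
    return filteredGFF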
Example #11
def make_probe_to_gene_dict(annotFile, array_1_path, array_2_path):
    '''
    keyed by probe ID w/ gene as value
    '''
    #see if it already exists
    pickle_path = '%soberthuer_outcome/probe_dict.pkl' % (projectFolder)
    if utils.checkOutput(pickle_path, 0, 0):
        print('loading previously made probe dict at %s' % (pickle_path))
        probe_gene_dict = pickle.load(open(pickle_path, "rb"))
        return probe_gene_dict

    #we want to intersect refseq common names w/ the array
    startDict = utils.makeStartDict(annotFile)

    ref_name_list = utils.uniquify(
        [startDict[refID]['name'] for refID in startDict.keys()])
    probe_gene_dict = {}

    array_1 = utils.parseTable(array_1_path, '\t')
    array_2 = utils.parseTable(array_2_path, '\t')
    ticker = 0
    for line in array_1 + array_2:
        if len(line) < 5:
            continue
        ticker += 1
        probe_id = line[4]
        name = line[-1]
        # print(probe_id)
        # print(name)
        # if ticker== 10:
        #     sys.exit()
        # print(line)

        if ref_name_list.count(name) > 0:
            probe_gene_dict[probe_id] = name

    pickle.dump(probe_gene_dict, open(pickle_path, 'wb'))
    return probe_gene_dict
Example #12
def mapEnhancerToGene(annotFile, enhancerFile, transcribedFile='', uniqueGenes=True, searchWindow=50000, noFormatTable=False):
    '''
    maps genes to enhancers. If uniqueGenes, reduces to gene name only;
    otherwise reports a row for each refseq ID.
    '''
    startDict = utils.makeStartDict(annotFile)
    enhancerTable = utils.parseTable(enhancerFile,'\t')

    #internal parameter for debugging
    byRefseq = False


    if len(transcribedFile) > 0:
        transcribedTable = utils.parseTable(transcribedFile,'\t')
        transcribedGenes = [line[1] for line in transcribedTable]
    else:
        transcribedGenes = startDict.keys()

    print('MAKING TRANSCRIPT COLLECTION')
    transcribedCollection = utils.makeTranscriptCollection(annotFile,0,0,500,transcribedGenes)


    print('MAKING TSS COLLECTION')
    tssLoci = []
    for geneID in transcribedGenes:
        tssLoci.append(utils.makeTSSLocus(geneID,startDict,0,0))


    #this turns the tssLoci list into a LocusCollection
    #50 is the internal parameter for LocusCollection and doesn't really matter
    tssCollection = utils.LocusCollection(tssLoci,50)

    

    geneDict = {'overlapping':defaultdict(list),'proximal':defaultdict(list)}

    #dictionaries to hold ranks and superstatus of gene nearby enhancers
    rankDict = defaultdict(list)
    superDict= defaultdict(list)

    #list of all genes that appear in this analysis
    overallGeneList = []

    if noFormatTable:
        #set up the output table, first by enhancer
        enhancerToGeneTable = [enhancerTable[0]+['OVERLAP_GENES','PROXIMAL_GENES','CLOSEST_GENE']]
    else:
        enhancerToGeneTable = [enhancerTable[0][0:9]+['OVERLAP_GENES','PROXIMAL_GENES','CLOSEST_GENE'] + enhancerTable[5][-2:]]

    #next make the gene to enhancer table
    geneToEnhancerTable = [['GENE_NAME','REFSEQ_ID','PROXIMAL_ENHANCERS','ENHANCER_RANKS','IS_SUPER']]


    for line in enhancerTable:
        if line[0][0] =='#' or line[0][0] == 'R':
            continue

        enhancerString = '%s:%s-%s' % (line[1],line[2],line[3])
        
        enhancerLocus = utils.Locus(line[1],line[2],line[3],'.',line[0])

        #overlapping genes are transcribed genes whose transcript is directly in the stitchedLocus         
        overlappingLoci = transcribedCollection.getOverlap(enhancerLocus,'both')           
        overlappingGenes =[]
        for overlapLocus in overlappingLoci:                
            overlappingGenes.append(overlapLocus.ID())

        #proximalGenes are transcribed genes where the tss is within 50kb of the boundary of the stitched loci
        proximalLoci = tssCollection.getOverlap(utils.makeSearchLocus(enhancerLocus,searchWindow,searchWindow),'both')           
        proximalGenes =[]
        for proxLocus in proximalLoci:
            proximalGenes.append(proxLocus.ID())


        distalLoci = tssCollection.getOverlap(utils.makeSearchLocus(enhancerLocus,1000000,1000000),'both')           
        distalGenes =[]
        for proxLocus in distalLoci:
            distalGenes.append(proxLocus.ID())

            
            
        overlappingGenes = utils.uniquify(overlappingGenes)
        proximalGenes = utils.uniquify(proximalGenes)
        distalGenes = utils.uniquify(distalGenes)
        allEnhancerGenes = overlappingGenes + proximalGenes + distalGenes
        #these checks make sure each gene list is unique.
        #technically it is possible for a gene to be overlapping, but not proximal since the
        #gene could be longer than the 50kb window, but we'll let that slide here
        for refID in overlappingGenes:
            if proximalGenes.count(refID) == 1:
                proximalGenes.remove(refID)

        for refID in proximalGenes:
            if distalGenes.count(refID) == 1:
                distalGenes.remove(refID)


        #Now find the closest gene
        if len(allEnhancerGenes) == 0:
            closestGene = ''
        else:
            #get enhancerCenter
            enhancerCenter = (int(line[2]) + int(line[3]))/2

            #get absolute distance to enhancer center
            distList = [abs(enhancerCenter - startDict[geneID]['start'][0]) for geneID in allEnhancerGenes]
            #get the ID and convert to name
            closestGene = startDict[allEnhancerGenes[distList.index(min(distList))]]['name']

        #NOW WRITE THE ROW FOR THE ENHANCER TABLE
        if noFormatTable:

            newEnhancerLine = list(line)
            newEnhancerLine.append(join(utils.uniquify([startDict[x]['name'] for x in overlappingGenes]),','))
            newEnhancerLine.append(join(utils.uniquify([startDict[x]['name'] for x in proximalGenes]),','))
            newEnhancerLine.append(closestGene)

        else:
            newEnhancerLine = line[0:9]
            newEnhancerLine.append(join(utils.uniquify([startDict[x]['name'] for x in overlappingGenes]),','))
            newEnhancerLine.append(join(utils.uniquify([startDict[x]['name'] for x in proximalGenes]),','))
            newEnhancerLine.append(closestGene)
            newEnhancerLine += line[-2:]

        enhancerToGeneTable.append(newEnhancerLine)
        #Now grab all overlapping and proximal genes for the gene ordered table

        overallGeneList +=overlappingGenes
        for refID in overlappingGenes:
            geneDict['overlapping'][refID].append(enhancerString)
            rankDict[refID].append(int(line[-2]))
            superDict[refID].append(int(line[-1]))
            
        overallGeneList+=proximalGenes
        for refID in proximalGenes:
            geneDict['proximal'][refID].append(enhancerString)
            rankDict[refID].append(int(line[-2]))
            superDict[refID].append(int(line[-1]))



    #End loop through
    
    #Make table by gene
    overallGeneList = utils.uniquify(overallGeneList)  

    #use enhancer rank to order
    rankOrder = utils.order([min(rankDict[x]) for x in overallGeneList])
        
    usedNames = []
    for i in rankOrder:
        refID = overallGeneList[i]
        geneName = startDict[refID]['name']
        if usedNames.count(geneName) > 0 and uniqueGenes:
            continue
        else:
            usedNames.append(geneName)
        
        proxEnhancers = geneDict['overlapping'][refID]+geneDict['proximal'][refID]
        
        superStatus = max(superDict[refID])
        enhancerRanks = join([str(x) for x in rankDict[refID]],',')
    
        newLine = [geneName,refID,join(proxEnhancers,','),enhancerRanks,superStatus]
        geneToEnhancerTable.append(newLine)

    #resort enhancerToGeneTable
    if noFormatTable:
        return enhancerToGeneTable,geneToEnhancerTable
    else:
        enhancerOrder = utils.order([int(line[-2]) for line in enhancerToGeneTable[1:]])
        sortedTable = [enhancerToGeneTable[0]]
        for i in enhancerOrder:
            sortedTable.append(enhancerToGeneTable[(i+1)])

        return sortedTable,geneToEnhancerTable
Example #13
def _matrix_entry_maps(self):
    """Set of all mappings used in matrix arguments."""
    return list(uniquify(m for arg in self.args if arg._is_mat for m in arg.map))
Example #14
def _unique_matrix(self):
    return list(uniquify(a.data for a in self._matrix_args))
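These two PyOP2-style properties pass generator expressions to uniquify and wrap the result in list(...), which suggests that the helper there accepts arbitrary iterables and yields lazily. A sketch of such a generator form, assuming the same order-preserving semantics as the list version:

def uniquify(iterable):
    '''Lazily yield each item the first time it appears (order-preserving).'''
    seen = set()
    for item in iterable:
        if item not in seen:
            seen.add(item)
            yield item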
Example #15
def main():
    '''
    main run call
    '''
    debug = False

    from optparse import OptionParser
    usage = "usage: %prog [options] -g [GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]"
    parser = OptionParser(usage=usage)
    # required flags
    parser.add_option("-i", "--i", dest="input", nargs=1, default=None,
                      help="Enter a comma separated list of .gff or .bed file of binding sites used to make enhancers")
    parser.add_option("-r", "--rankby", dest="rankby", nargs=1, default=None,
                      help="Enter a comma separated list of bams to rank by")
    parser.add_option("-o", "--out", dest="out", nargs=1, default=None,
                      help="Enter an output folder")
    parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None,
                      help="Enter the genome build (MM9,MM8,HG18,HG19)")

    # optional flags
    parser.add_option("-n", "--name", dest="name", nargs=1, default=None,
                      help="Provide a name for the analysis otherwise ROSE will guess")
    parser.add_option("-c", "--control", dest="control", nargs=1, default=None,
                      help="Enter a comma separated list of control bams. Can either provide a single control bam for all rankby bams, or provide a control bam for each individual bam")
    parser.add_option("-s", "--stitch", dest="stitch", nargs=1, default='',
                      help="Enter a max linking distance for stitching. Default will determine optimal stitching parameter")
    parser.add_option("-t", "--tss", dest="tss", nargs=1, default=0,
                      help="Enter a distance from TSS to exclude. 0 = no TSS exclusion")

    parser.add_option("--mask", dest="mask", nargs=1, default=None,
                      help="Mask a set of regions from analysis.  Provide a .bed or .gff of masking regions")

    # RETRIEVING FLAGS
    (options, args) = parser.parse_args()

    if not options.input or not options.rankby or not options.out or not options.genome:
        parser.print_help()
        exit()

    # making the out folder if it doesn't exist
    outFolder = utils.formatFolder(options.out, True)

    # figuring out folder schema
    gffFolder = utils.formatFolder(outFolder + 'gff/', True)
    mappedFolder = utils.formatFolder(outFolder + 'mappedGFF/', True)

    # GETTING INPUT FILE(s)

    inputList = [inputFile for inputFile in options.input.split(',') if len(inputFile) > 1]

    #converting all input files into GFFs and moving into the GFF folder
    inputGFFList = []
    for inputFile in inputList:
        if inputFile.split('.')[-1] == 'bed':
            # CONVERTING A BED TO GFF
            inputGFFName = inputFile.split('/')[-1][0:-4] #strips the last 4 characters i.e. '.bed'
            inputGFFFile = '%s%s.gff' % (gffFolder, inputGFFName)
            utils.bedToGFF(inputFile, inputGFFFile)
        elif inputFile.split('.')[-1] == 'gff':
            # COPY THE INPUT GFF TO THE GFF FOLDER

            os.system('cp %s %s' % (inputFile, gffFolder))
            inputGFFFile = '%s%s' % (gffFolder,inputFile.split('/')[-1])

        else:
            print('WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT')
            # COPY THE INPUT GFF TO THE GFF FOLDER
            os.system('cp %s %s' % (inputFile, gffFolder))
            inputGFFFile = '%s%s' % (gffFolder,inputFile.split('/')[-1])
        inputGFFList.append(inputGFFFile)

    # GETTING THE LIST OF BAMFILES TO PROCESS
    #either same number of bams for rankby and control 
    #or only 1 control #or none!
    #bamlist should be all rankby bams followed by control bams

    
    bamFileList = []
    if options.control:
        controlBamList = [bam for bam in options.control.split(',') if len(bam) >0]
        rankbyBamList = [bam for bam in options.rankby.split(',') if len(bam) >0]

        if len(controlBamList) == len(rankbyBamList):
            #case where an equal number of backgrounds are given
            bamFileList = rankbyBamList + controlBamList
        elif len(controlBamList) == 1:
            #case where a universal background is applied
            bamFileList = rankbyBamList + controlBamList*len(rankbyBamList)
        else:
            print('ERROR: EITHER PROVIDE A SINGLE CONTROL BAM FOR ALL SAMPLES, OR ONE CONTROL BAM FOR EACH SAMPLE')
            sys.exit()
    else:
        bamFileList = [bam for bam in options.rankby.split(',') if len(bam) > 0]




    # Stitch parameter
    if options.stitch == '':
        stitchWindow = ''
    else:
        stitchWindow = int(options.stitch)

    # tss options
    tssWindow = int(options.tss)
    if tssWindow != 0:
        removeTSS = True
    else:
        removeTSS = False


    # GETTING THE GENOME
    genome = string.upper(options.genome)
    print('USING %s AS THE GENOME' % (genome))

    # GETTING THE CORRECT ANNOT FILE

    genomeDict = {
        'HG18': '%s/annotation/hg18_refseq.ucsc' % (pipeline_dir),
        'MM9': '%s/annotation/mm9_refseq.ucsc' % (pipeline_dir),
        'HG19': '%s/annotation/hg19_refseq.ucsc' % (pipeline_dir),
        'MM8': '%s/annotation/mm8_refseq.ucsc' % (pipeline_dir),
        'MM10': '%s/annotation/mm10_refseq.ucsc' % (pipeline_dir),
        'RN4': '%s/annotation/rn4_refseq.ucsc' % (pipeline_dir),
    }

    try:
        annotFile = genomeDict[genome.upper()]
    except KeyError:
        print('ERROR: UNSUPPORTED GENOME TYPE %s' % (genome))
        sys.exit()


    #FINDING THE ANALYSIS NAME
    if options.name:
        inputName = options.name
    else:
        inputName = inputGFFList[0].split('/')[-1].split('.')[0]
    print('USING %s AS THE ANALYSIS NAME' % (inputName))


    print('FORMATTING INPUT REGIONS')
    # MAKING THE RAW INPUT FILE FROM THE INPUT GFFs
    #use a simpler unique region naming system 
    if len(inputGFFList) == 1:
        inputGFF = utils.parseTable(inputGFFList[0],'\t')
    else:
        inputLoci = []
        for gffFile in inputGFFList:
            print('\tprocessing %s' % (gffFile))
            gff = utils.parseTable(gffFile,'\t')
            gffCollection = utils.gffToLocusCollection(gff,50)
            inputLoci += gffCollection.getLoci()


        inputCollection = utils.LocusCollection(inputLoci,50)
        inputCollection = inputCollection.stitchCollection() # stitches to produce unique regions

        inputGFF = utils.locusCollectionToGFF(inputCollection)

    formattedGFF = []
    #now number things appropriately
    for i,line in enumerate(inputGFF):
        
        #use the coordinates to make a new id inputname_chr_sense_start_stop
        chrom = line[0]
        coords = [int(line[3]) ,int(line[4])]
        sense = line[6]

        lineID = '%s_%s' % (inputName,str(i+1)) #1 indexing
        
        newLine = [chrom,lineID,lineID,min(coords),max(coords),'',sense,'',lineID]
        formattedGFF.append(newLine)
        
    #name of the master input gff file
    masterGFFFile = '%s%s_%s_ALL_-0_+0.gff' % (gffFolder,string.upper(genome),inputName)
    utils.unParseTable(formattedGFF,masterGFFFile,'\t')

    print('USING %s AS THE INPUT GFF' % (masterGFFFile))


    # MAKING THE START DICT
    print('MAKING START DICT')
    startDict = utils.makeStartDict(annotFile)

    #GET CHROMS FOUND IN THE BAMS
    print('GETTING CHROMS IN BAMFILES')
    bamChromList = getBamChromList(bamFileList)
    print("USING THE FOLLOWING CHROMS")
    print(bamChromList)

    #LOADING IN THE GFF AND FILTERING BY CHROM
    print('LOADING AND FILTERING THE GFF')
    inputGFF = filterGFF(masterGFFFile,bamChromList)
    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print('LOADING IN GFF REGIONS')
    referenceCollection = utils.gffToLocusCollection(inputGFF)

    print('CHECKING REFERENCE COLLECTION:')
    checkRefCollection(referenceCollection)
        

    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    if options.mask:
        maskFile = options.mask
        # if it's a bed file
        if maskFile.split('.')[-1].upper() == 'BED':
            maskGFF = utils.bedToGFF(maskFile)
        elif maskFile.split('.')[-1].upper() == 'GFF':
            maskGFF = utils.parseTable(maskFile, '\t')
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            sys.exit()
        maskCollection = utils.gffToLocusCollection(maskGFF)

        # now mask the reference loci
        referenceLoci = referenceCollection.getLoci()
        filteredLoci = [locus for locus in referenceLoci if len(maskCollection.getOverlap(locus, 'both')) == 0]
        print("FILTERED OUT %s LOCI THAT WERE MASKED IN %s" % (len(referenceLoci) - len(filteredLoci), maskFile))
        referenceCollection = utils.LocusCollection(filteredLoci, 50)

    # NOW STITCH REGIONS
    print('STITCHING REGIONS TOGETHER')
    stitchedCollection, debugOutput, stitchWindow = regionStitching(referenceCollection, inputName, outFolder, stitchWindow, tssWindow, annotFile, removeTSS)

    # NOW MAKE A STITCHED COLLECTION GFF
    print('MAKING GFF FROM STITCHED COLLECTION')
    stitchedGFF = utils.locusCollectionToGFF(stitchedCollection)

    if not removeTSS:
        stitchedGFFFile = '%s%s_%sKB_STITCHED.gff' % (gffFolder, inputName, str(stitchWindow / 1000))
        stitchedGFFName = '%s_%sKB_STITCHED' % (inputName, str(stitchWindow / 1000))
        debugOutFile = '%s%s_%sKB_STITCHED.debug' % (gffFolder, inputName, str(stitchWindow / 1000))
    else:
        stitchedGFFFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.gff' % (gffFolder, inputName, str(stitchWindow / 1000))
        stitchedGFFName = '%s_%sKB_STITCHED_TSS_DISTAL' % (inputName, str(stitchWindow / 1000))
        debugOutFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.debug' % (gffFolder, inputName, str(stitchWindow / 1000))

    # WRITING DEBUG OUTPUT TO DISK

    if debug:
        print('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile))
        utils.unParseTable(debugOutput, debugOutFile, '\t')

    # WRITE THE GFF TO DISK
    print('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile))
    utils.unParseTable(stitchedGFF, stitchedGFFFile, '\t')

    # SETTING UP THE OVERALL OUTPUT FILE
    outputFile1 = outFolder + stitchedGFFName + '_ENHANCER_REGION_MAP.txt'
    print('OUTPUT WILL BE WRITTEN TO %s' % (outputFile1))



    # MAPPING TO THE NON-STITCHED (ORIGINAL) GFF AND TO THE STITCHED GFF

    # Try to use the bamliquidator_path.py script on the cluster; otherwise fail over to a local copy (in path), otherwise fail.

    bamFileListUnique = utils.uniquify(list(bamFileList))
    #prevent redundant mapping
    print("MAPPING TO THE FOLLOWING BAMS:")
    print(bamFileListUnique)
    for bamFile in bamFileListUnique:

        bamFileName = bamFile.split('/')[-1]

        # MAPPING TO THE STITCHED GFF
        mappedOut1Folder = '%s%s_%s_MAPPED' % (mappedFolder, stitchedGFFName, bamFileName)
        mappedOut1File = '%s%s_%s_MAPPED/matrix.txt' % (mappedFolder, stitchedGFFName, bamFileName)
        if utils.checkOutput(mappedOut1File, 0.2, 0.2):
            print("FOUND %s MAPPING DATA FOR BAM: %s" % (stitchedGFFFile, mappedOut1File))
        else:
            cmd1 = bamliquidator_path + " --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % (stitchedGFFFile, mappedOut1Folder, bamFile)
            print(cmd1)

            os.system(cmd1)
            if utils.checkOutput(mappedOut1File,0.2,5):
                print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (stitchedGFFFile, bamFileName))
            else:
                print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (stitchedGFFFile, bamFileName))
                sys.exit()

    print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS')
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    mapCollection(stitchedCollection, referenceCollection, bamFileList, mappedFolder, outputFile1, refName=stitchedGFFName)


    print('FINDING AVERAGE SIGNAL AMONGST BAMS')
    metaOutputFile = collapseRegionMap(outputFile1,inputName + '_MERGED_SIGNAL',controlBams=options.control)

    #now try the merging

    print('CALLING AND PLOTTING SUPER-ENHANCERS')



    rankbyName = inputName + '_MERGED_SIGNAL'
    controlName = 'NONE'
    cmd = 'Rscript %sROSE2_callSuper.R %s %s %s %s' % (pipeline_dir,outFolder, metaOutputFile, inputName, controlName)
    print(cmd)

    os.system(cmd)
    

    # calling the gene mapper
    print('CALLING GENE MAPPING')

    superTableFile = "%s_SuperEnhancers.table.txt" % (inputName)

    #for now don't use ranking bam to call top genes
    cmd = "python %sROSE2_geneMapper.py -g %s -i %s%s -f" % (pipeline_dir,genome, outFolder, superTableFile)
    print(cmd)
    os.system(cmd)


    stretchTableFile = "%s_StretchEnhancers.table.txt" % (inputName)
 
    cmd = "python %sROSE2_geneMapper.py -g %s -i %s%s -f" % (pipeline_dir,genome, outFolder, stretchTableFile)
    print(cmd)
    os.system(cmd)


    superStretchTableFile = "%s_SuperStretchEnhancers.table.txt" % (inputName)

    cmd = "python %sROSE2_geneMapper.py -g %s -i %s%s -f" % (pipeline_dir,genome, outFolder, superStretchTableFile)
    os.system(cmd)
Example #16
def model_prediction():

    model_path = 'networks/resnet-18_trained.t7'
    test_dir = 'Result/'
    cuda = True
    log_file = 'evalss_data'
    eval_rot = True
    eval_no_crop = True
    n_tries = 10
    seed = 42
    output_dir = 'Result'
    running = True

    print("loading model", model_path)

    torch.cuda.manual_seed_all(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)

    log_file = utils.uniquify(os.path.join(output_dir, log_file), sep='-')

    eval_data = test_dir
    net = utils.load_model(model_path, cuda)

    image_array = cv2.imread(os.path.join(test_dir, 'data', 'image.png'))
    mean_of_image = np.mean(image_array, axis=(0, 1))/1000
    std_of_image = np.std(image_array, axis=(0, 1))/100

    ###  Load the data into PyTorch using dataloader
    dataloader = utils.get_dataloader(test_dir,
            [0.6000, 0.3946, 0.6041],
            [0.2124, 0.2335, 0.2360],
            eval_no_crop, eval_rot, batch_size=1)

    #dataloader = utils.get_dataloader(test_dir, mean_of_image, std_of_image, eval_no_crop, eval_rot, batch_size=1)
    print(type(dataloader))

    '''
    # A function to get probabililties using only one iteration
    net = net.eval()
    for img, label in dataloader:
        print(type(img))
        print(type(label))

        img = img.cuda()
        print('IMAGE TYPE:',img)
        pred = net(img).data.cpu().numpy()
    print('checking', pred)
    probs = nn.functional.softmax(pred)
    print('PROBABILITIES:', probs)
    '''

    if not running:
        net.eval()
        utils.set_strategy(net, 'sample')
        have_do = utils.set_do_to_train(net)

        res = utils.predict_proba(dataloader, net, n_classes=5, return_logits=True, ensembles=n_tries, cuda=cuda)
        print('Result', res)
        '''
        eval_data['test'] = {
            'ensemble/proba': res[0],
            'ensemble/logits': res[2],
            'eval/labels': res[1],
            'ensemble/filenames': res[3]
        }    '''
    else:
        net.eval()
        utils.set_strategy(net, 'running')
        have_do = utils.set_do_to_train(net)

        res = utils.predict_proba(dataloader, net, n_classes=5, return_logits=True, ensembles=n_tries if have_do else 3)
        print('type(eval_data):', type(eval_data))
        '''
        eval_data['test'].update({
            'eval/proba': res[0],
            'eval/logits': res[2],
            'eval/labels': res[1],
            'ensemble/filenames': res[3]
        }) '''

    # Get the mean of predictions for n_tries iterations for each class
    prob_means_en = np.mean(res[0], axis=0)

    output_file_name = 'res_norotate'
    torch.save(res, output_dir + '/' + output_file_name)
    #print(res[2].shape)
    print("Created output file '%s'" % output_file_name)

    torch.cuda.empty_cache()
    return prob_means_en
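Note that utils.uniquify here is a different helper from the list-deduplication one in the earlier examples: it takes a file path plus a sep keyword and evidently returns a collision-free path for the log file. A hedged sketch of such a helper (the name uniquify_path and the exact numbering behavior are assumptions, not taken from the source):

import os

def uniquify_path(path, sep='-'):
    '''Return path if unused, else append sep plus a counter before the extension.

    Hypothetical stand-in for the utils.uniquify(path, sep=...) call above.
    '''
    root, ext = os.path.splitext(path)
    candidate, n = path, 1
    while os.path.exists(candidate):
        candidate = '%s%s%d%s' % (root, sep, n, ext)
        n += 1
    return candidate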
Example #17
def finishRankOutput(dataFile,
                     rankOutput,
                     genome,
                     mergeFolder,
                     mergeName,
                     name1,
                     name2,
                     cutOff=1.5,
                     window=100000,
                     superOnly=True,
                     plotBam=True):
    '''
    cleans up the rank output table
    makes a gff of all of the gained/lost supers beyond
    a certain cutoff w/ a window
    makes a list of gained genes and lost genes
    makes a bed of gained loss
    '''
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    #making sure window and cutoff are int/float
    cutOff = float(cutOff)
    window = int(window)
    genome = string.upper(genome)

    #make the output folder
    outputFolder = pipeline_dfci.formatFolder(mergeFolder + 'output/', True)

    #bring in the old rank table
    rankEnhancerTable = utils.parseTable(rankOutput, '\t')

    #make a new formatted table
    header = rankEnhancerTable[0]
    header[-4] = 'DELTA RANK'
    header[-3] = 'IS_SUPER'
    formattedRankTable = [header]

    #the gffs
    gainedGFF = []
    lostGFF = []

    gainedWindowGFF = []
    lostWindowGFF = []

    if superOnly:
        enhancerType = 'SUPERS'
    else:
        enhancerType = 'ENHANCERS'

    #the beds
    if superOnly:
        gainedTrackHeader = 'track name="%s %s only SEs" description="%s super enhancers that are found only in %s vs %s" itemRGB=On color=255,0,0' % (
            genome, name2, genome, name2, name1)
        gainedBed = [[gainedTrackHeader]]
        conservedTrackHeader = 'track name="%s %s and %s SEs" description="%s super enhancers that are found in both %s vs %s" itemRGB=On color=0,0,0' % (
            genome, name1, name2, genome, name1, name2)
        conservedBed = [[conservedTrackHeader]]

        lostTrackHeader = 'track name="%s %s only SEs" description="%s super enhancers that are found only in %s vs %s" itemRGB=On color=0,255,0' % (
            genome, name1, genome, name1, name2)
        lostBed = [[lostTrackHeader]]
    else:
        gainedTrackHeader = 'track name="%s %s only enhancers" description="%s enhancers that are found only in %s vs %s" itemRGB=On color=255,0,0' % (
            genome, name2, genome, name2, name1)
        gainedBed = [[gainedTrackHeader]]
        conservedTrackHeader = 'track name="%s %s and %s enhancers" description="%s enhancers that are found in both %s vs %s" itemRGB=On color=0,0,0' % (
            genome, name1, name2, genome, name1, name2)
        conservedBed = [[conservedTrackHeader]]

        lostTrackHeader = 'track name="%s %s only enhancers" description="%s enhancers that are found only in %s vs %s" itemRGB=On color=0,255,0' % (
            genome, name1, genome, name1, name2)
        lostBed = [[lostTrackHeader]]

    #the genes
    geneTable = [[
        'GENE', 'ENHANCER_ID', 'ENHANCER_CHROM', 'ENHANCER_START',
        'ENHANCER_STOP', header[6], header[7], header[8], 'STATUS'
    ]]

    for line in rankEnhancerTable[1:]:
        #fixing the enhancer ID
        line[0] = line[0].replace('_lociStitched', '')
        formattedRankTable.append(line)

        #getting the genes
        geneList = []
        geneList += line[9].split(',')
        geneList += line[10].split(',')
        geneList += line[11].split(',')
        geneList = [x for x in geneList if len(x) > 0]
        geneList = utils.uniquify(geneList)
        geneString = string.join(geneList, ',')

        bedLine = [line[1], line[2], line[3], line[0], line[-4]]

        #for gained
        if float(line[6]) > cutOff:
            gffLine = [
                line[1], line[0], '', line[2], line[3], '', '.', '', geneString
            ]
            gffWindowLine = [
                line[1], line[0], '',
                int(line[2]) - window,
                int(line[3]) + window, '', '.', '', geneString
            ]
            gainedGFF.append(gffLine)
            gainedWindowGFF.append(gffWindowLine)
            geneStatus = name2
            gainedBed.append(bedLine)
        #for lost
        elif float(line[6]) < (-1 * cutOff):
            gffLine = [
                line[1], line[0], '', line[2], line[3], '', '.', '', geneString
            ]
            gffWindowLine = [
                line[1], line[0], '',
                int(line[2]) - window,
                int(line[3]) + window, '', '.', '', geneString
            ]
            lostGFF.append(gffLine)
            lostWindowGFF.append(gffWindowLine)
            geneStatus = name1
            lostBed.append(bedLine)
        #for conserved
        else:
            geneStatus = 'CONSERVED'
            conservedBed.append(bedLine)

        #now fill in the gene Table
        for gene in geneList:
            geneTableLine = [
                gene, line[0], line[1], line[2], line[3], line[6], line[7],
                line[8], geneStatus
            ]
            geneTable.append(geneTableLine)

    #concat the bed
    fullBed = gainedBed + conservedBed + lostBed

    #start writing the output
    #there's the two gffs, the bed,the formatted table, the gene table

    #formatted table
    formattedFilename = "%s%s_%s_MERGED_%s_RANK_TABLE.txt" % (
        outputFolder, genome, mergeName, enhancerType)
    utils.unParseTable(formattedRankTable, formattedFilename, '\t')

    #gffs
    gffFolder = pipeline_dfci.formatFolder(outputFolder + 'gff/', True)
    gffFilename_gained = "%s%s_%s_%s_ONLY_%s_-0_+0.gff" % (
        gffFolder, genome, mergeName, string.upper(name2), enhancerType)
    gffFilenameWindow_gained = "%s%s_%s_%s_ONLY_%s_-%sKB_+%sKB.gff" % (
        gffFolder, genome, mergeName, string.upper(name2), enhancerType,
        window / 1000, window / 1000)

    gffFilename_lost = "%s%s_%s_%s_ONLY_%s_-0_+0.gff" % (
        gffFolder, genome, mergeName, string.upper(name1), enhancerType)
    gffFilenameWindow_lost = "%s%s_%s_%s_ONLY_%s_-%sKB_+%sKB.gff" % (
        gffFolder, genome, mergeName, string.upper(name1), enhancerType,
        window / 1000, window / 1000)

    utils.unParseTable(gainedGFF, gffFilename_gained, '\t')
    utils.unParseTable(gainedWindowGFF, gffFilenameWindow_gained, '\t')

    utils.unParseTable(lostGFF, gffFilename_lost, '\t')
    utils.unParseTable(lostWindowGFF, gffFilenameWindow_lost, '\t')

    #bed
    bedFilename = "%s%s_%s_MERGED_%s.bed" % (outputFolder, genome, mergeName,
                                             enhancerType)
    utils.unParseTable(fullBed, bedFilename, '\t')

    #geneTable
    geneFilename = "%s%s_%s_MERGED_%s_GENE_TABLE.txt" % (
        outputFolder, genome, mergeName, enhancerType)
    utils.unParseTable(geneTable, geneFilename, '\t')

    #finally, move all of the plots to the output folder
    cmd = "cp %s%s_ROSE/*.pdf %s%s_%s_MERGED_%s_DELTA.pdf" % (
        mergeFolder, name1, outputFolder, genome, mergeName, enhancerType)
    os.system(cmd)

    cmd = "cp %s%s_ROSE/*RANK_PLOT.png %s%s_%s_MERGED_%s_RANK_PLOT.png" % (
        mergeFolder, name1, outputFolder, genome, mergeName, enhancerType)
    os.system(cmd)

    #now execute the bamPlot_turbo.py commands
    if plotBam:
        bam1 = dataDict[name1]['bam']
        bam2 = dataDict[name2]['bam']
        bamString = "%s,%s" % (bam1, bam2)
        nameString = "%s,%s" % (name1, name2)
        colorString = "0,0,0:100,100,100"

        #change dir
        os.chdir(pipelineDir)

        if len(gainedGFF) > 0:
            #gained command
            plotTitle = "%s_ONLY_SE" % (name2)
            cmd = 'python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE' % (
                genome, bamString, gffFilename_gained, outputFolder,
                nameString, colorString, plotTitle)
            os.system(cmd)

            #gained window command
            plotTitle = "%s_ONLY_SE_%sKB_WINDOW" % (name2, window / 1000)
            cmd = 'python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE' % (
                genome, bamString, gffFilenameWindow_gained, outputFolder,
                nameString, colorString, plotTitle)
            os.system(cmd)

        if len(lostGFF) > 0:
            #lost command
            plotTitle = "%s_ONLY_SE" % (name1)
            cmd = 'python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE' % (
                genome, bamString, gffFilename_lost, outputFolder, nameString,
                colorString, plotTitle)
            os.system(cmd)

            #lost command
            plotTitle = "%s_ONLY_SE_%sKB_WINDOW" % (name1, window / 1000)
            cmd = 'python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE' % (
                genome, bamString, gffFilenameWindow_lost, outputFolder,
                nameString, colorString, plotTitle)
            os.system(cmd)

    return
Example #18
def createExpressionDict(annotationFile, projectFolder, projectName, refseqToNameDict, expressionTable):
    '''
    takes as input an activity table with a refseq NMID in the first column and expression or promoter
    acetylation level in the second column
    outputs a dictionary keyed by refseq ID containing the activity
    '''

    print('CREATING EXPRESSION DICTIONARY')

    annotTable = utils.parseTable(annotationFile, '\t')
    for line in annotTable:
        gid = line[1]
        genename = upper(line[12])
        refseqToNameDict[gid] = genename

    expressionFilename = projectFolder + 'matrix.gff'
    expressionTable = utils.parseTable(expressionFilename, '\t')

    expressionDictNM = {}
    expressionDictGene = {}

    for line in expressionTable[1:]:
        trid = line[0]
        geneName = refseqToNameDict[trid]
        if len(expressionTable[1]) == 3: #when expressionTable is an output from bamToGFF.py
            exp = float(line[2])
        else: #when expressionTable is passed as an option (2 columns)
            exp = float(line[1])

        # Store the expression value for each NMid in a dict, keep higher value if multiple identical NMIDs
        if trid in expressionDictNM and exp > expressionDictNM[trid]:
            expressionDictNM[trid] = exp
        elif trid not in expressionDictNM:
            expressionDictNM[trid] = exp

        # Store the highest value of transcript expression for each gene
        if geneName in expressionDictGene and exp > expressionDictGene[geneName]:
            expressionDictGene[geneName] = exp
        elif geneName not in expressionDictGene:
            expressionDictGene[geneName] = exp

    # Calculate the cutoff H3K27ac signal value so that the top 2/3 of genes are considered expressed,
    # or use the expression percentile passed in as an option
    cutoff = numpy.percentile(expressionDictGene.values(), 33)
    print('Expression cutoff: ' + str(cutoff))

    # Select all NMids that are above the computed cutoff
    expressedGenes = []
    expressedNM = []
    for trid in expressionDictNM:
        if float(expressionDictNM[trid]) >= cutoff:
            expressedGenes.append(refseqToNameDict[trid])
            expressedNM.append(trid)
    expressedGenes = utils.uniquify(expressedGenes)

    # Output the list of transcripts considered expressed
    NMfilename = projectFolder + projectName + '_EXPRESSED_TRANSCRIPTS.txt'

    # Output the list of genes considered expressed
    Genefilename = projectFolder + projectName + '_EXPRESSED_GENES.txt'

    utils.unParseTable(expressedNM, NMfilename, '')
    utils.unParseTable(expressedGenes, Genefilename, '')

    return expressedNM
Example #19
def mapBamToGFF(bamFile, gff, sense='.', extension=200, rpm=False, clusterGram=None, matrix=None):
    '''maps reads from a bam to a gff'''

    #creating a new gff to output
    newGFF = []
    #reading in the bam
    bam = utils.Bam(bamFile)

    #getting RPM normalization
    if rpm:
        MMR = round(float(bam.getTotalReads('mapped'))/1000000,4)
    else:
        MMR = 1

    print('using a MMR value of %s' % (MMR))

    #creating a sense translation table to flip strand
    senseTrans = string.maketrans('-+.','+-+')
    
    #reading in the gff
    if type(gff) == str:
        gff = utils.parseTable(gff,'\t')

    #setting up a clustergram table
    if clusterGram:
        binSize = int(clusterGram)
        binSizeList = []
        #now go through each line of the gff and make sure they're all the same length
        for i in range(0,len(gff),1):
            line = gff[i]
            gffLocus = utils.Locus(line[0],int(line[3]),int(line[4]),line[6],line[1])
            binSizeList.append(gffLocus.len()/binSize)
        binSizeList = utils.uniquify(binSizeList)
        if len(binSizeList) > 1: 
            print('WARNING: lines in gff are of different length. Output clustergram will have variable row length')
        newGFF.append(['GENE_ID','locusLine'] + [str(x*binSize)+'_'+bamFile.split('/')[-1] for x in range(1,max(binSizeList)+1,1)])        
        
    #setting up a maxtrix table
    if matrix:
        newGFF.append(['GENE_ID','locusLine'] + ['bin_'+str(n)+'_'+bamFile.split('/')[-1] for n in range(1,int(matrix)+1,1)])
        nBin = int(matrix)

    # Try to use the bamliquidatior script on cluster, otherwise, failover to local (in path), otherwise fail.
    bamliquidatorString = '/opt/bamliquidator/pipeline/bamliquidator_internal/bamliquidator'
    if not os.path.isfile(bamliquidatorString):
        bamliquidatorString = './bamliquidator'
        if not os.path.isfile(bamliquidatorString):
            raise ValueError('bamliquidator not found in path')

    #getting and processing reads for gff lines
    ticker = 0
    print('Number lines processed')
    for line in gff:
        line = line[0:9]
        if ticker%100 == 0:
            print(ticker)
        ticker+=1
        gffLocus = utils.Locus(line[0],int(line[3]),int(line[4]),line[6],line[1])
        
        #get the nBin and binSize
        if clusterGram:
            nBin =gffLocus.len()/int(clusterGram)
            binSize = int(clusterGram)
        if matrix:
            nBin = int(matrix)
            binSize = gffLocus.len()/nBin
            #some regions will be too short to get info on
            if binSize == 0:
                clusterLine = [gffLocus.ID(),gffLocus.__str__()] + ['NA']*nBin
                newGFF.append(clusterLine)
                continue


        #flippy flip if sense is negative
        if sense == '-':
            bamSense = string.translate(gffLocus.sense(),senseTrans)
        elif sense == '+':
            bamSense = gffLocus.sense()
        else:
            bamSense = '.'
        #using the bamLiquidator to get the readstring            
        #print('using nBin of %s' % nBin)
        bamCommand = "%s %s %s %s %s %s %s %s" % (bamliquidatorString,bamFile,line[0],gffLocus.start(),gffLocus.end(),bamSense,nBin,extension)
        #print(bamCommand)
        getReads = subprocess.Popen(bamCommand,stdin = subprocess.PIPE,stderr = subprocess.PIPE,stdout = subprocess.PIPE,shell = True)
        readString, stderr = getReads.communicate()
        if stderr:
            print("STDERR out: %s" % (stderr))
        denList = readString.split('\n')[:-1]
        #print("denlist is: %s" % denList)
        #flip the denList if the actual gff region is -
        if gffLocus.sense() == '-':
            denList = denList[::-1]

        #converting from units of total bp of read sequence per bin to rpm/bp
        denList = [round(float(x)/binSize/MMR,4) for x in denList]

        clusterLine = [gffLocus.ID(),gffLocus.__str__()] + denList
        newGFF.append(clusterLine)

    return newGFF
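
A hedged usage sketch for mapBamToGFF (paths are hypothetical; matrix mode requests a fixed number of bins per region, here 200, with reads-per-million normalization):

mapped = mapBamToGFF('./bams/h3k27ac.bam',    # hypothetical sorted, indexed bam
                     './gff/regions.gff',     # hypothetical region gff
                     sense='.', extension=200, rpm=True, matrix=200)
utils.unParseTable(mapped, './mappedGFF/regions_h3k27ac.gff', '\t')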
Example #25
0
def main():
    '''
    main run call
    '''
    debug = False

    from optparse import OptionParser
    usage = "usage: %prog [options] -g [GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]"
    parser = OptionParser(usage=usage)
    # required flags
    parser.add_option("-i", "--i", dest="input", nargs=1, default=None,
                      help="Enter a .gff or .bed file of binding sites used to make enhancers")
    parser.add_option("-r", "--rankby", dest="rankby", nargs=1, default=None,
                      help="bamfile to rank enhancer by")
    parser.add_option("-o", "--out", dest="out", nargs=1, default=None,
                      help="Enter an output folder")
    parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None,
                      help="Enter the genome build (MM9,MM8,HG18,HG19)")

    # optional flags
    parser.add_option("-b", "--bams", dest="bams", nargs=1, default=None,
                      help="Enter a comma separated list of additional bam files to map to")
    parser.add_option("-c", "--control", dest="control", nargs=1, default=None,
                      help="bamfile to rank enhancer by")
    parser.add_option("-s", "--stitch", dest="stitch", nargs=1, default='',
                      help="Enter a max linking distance for stitching. Default will determine optimal stitching parameter")
    parser.add_option("-t", "--tss", dest="tss", nargs=1, default=0,
                      help="Enter a distance from TSS to exclude. 0 = no TSS exclusion")

    parser.add_option("--mask", dest="mask", nargs=1, default=None,
                      help="Mask a set of regions from analysis.  Provide a .bed or .gff of masking regions")

    # RETRIEVING FLAGS
    (options, args) = parser.parse_args()

    if not options.input or not options.rankby or not options.out or not options.genome:
        print('ERROR: MISSING REQUIRED FLAGS (-i, -r, -o, -g)')
        parser.print_help()
        exit()

    # making the out folder if it doesn't exist
    outFolder = utils.formatFolder(options.out, True)

    # figuring out folder schema
    gffFolder = utils.formatFolder(outFolder + 'gff/', True)
    mappedFolder = utils.formatFolder(outFolder + 'mappedGFF/', True)

    # GETTING INPUT FILE
    if options.input.split('.')[-1] == 'bed':
        # CONVERTING A BED TO GFF
        inputGFFName = options.input.split('/')[-1][0:-4]
        inputGFFFile = '%s%s.gff' % (gffFolder, inputGFFName)
        utils.bedToGFF(options.input, inputGFFFile)
    elif options.input.split('.')[-1] == 'gff':
        # COPY THE INPUT GFF TO THE GFF FOLDER
        inputGFFFile = options.input
        os.system('cp %s %s' % (inputGFFFile, gffFolder))

    else:
        print('WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT')
        # COPY THE INPUT GFF TO THE GFF FOLDER
        inputGFFFile = options.input
        os.system('cp %s %s' % (inputGFFFile, gffFolder))

    # GETTING THE LIST OF BAMFILES TO PROCESS
    if options.control:
        bamFileList = [options.rankby, options.control]

    else:
        bamFileList = [options.rankby]

    if options.bams:
        bamFileList += options.bams.split(',')
        bamFileList = utils.uniquify(bamFileList)
    # optional args

    # Stitch parameter
    if options.stitch == '':
        stitchWindow = ''
    else:
        stitchWindow = int(options.stitch)

    # tss options
    tssWindow = int(options.tss)
    if tssWindow != 0:
        removeTSS = True
    else:
        removeTSS = False

    # GETTING THE BOUND REGION FILE USED TO DEFINE ENHANCERS
    print('USING %s AS THE INPUT GFF' % (inputGFFFile))
    inputName = inputGFFFile.split('/')[-1].split('.')[0]

    # GETTING THE GENOME
    genome = options.genome
    print('USING %s AS THE GENOME' % genome)

    # GETTING THE CORRECT ANNOT FILE
    cwd = os.getcwd()
    genomeDict = {
        'HG18': '%s/annotation/hg18_refseq.ucsc' % (cwd),
        'MM9': '%s/annotation/mm9_refseq.ucsc' % (cwd),
        'HG19': '%s/annotation/hg19_refseq.ucsc' % (cwd),
        'MM8': '%s/annotation/mm8_refseq.ucsc' % (cwd),
        'MM10': '%s/annotation/mm10_refseq.ucsc' % (cwd),
    }

    annotFile = genomeDict[genome.upper()]

    # MAKING THE START DICT
    print('MAKING START DICT')
    startDict = utils.makeStartDict(annotFile)

    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print('LOADING IN GFF REGIONS')
    referenceCollection = utils.gffToLocusCollection(inputGFFFile)

    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    if options.mask:
        maskFile = options.mask
        # if it's a bed file
        if maskFile.split('.')[-1].upper() == 'BED':
            maskGFF = utils.bedToGFF(maskFile)
        elif maskFile.split('.')[-1].upper() == 'GFF':
            maskGFF = utils.parseTable(maskFile, '\t')
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            sys.exit()
        maskCollection = utils.gffToLocusCollection(maskGFF)

        # now mask the reference loci
        referenceLoci = referenceCollection.getLoci()
        filteredLoci = [locus for locus in referenceLoci if len(maskCollection.getOverlap(locus, 'both')) == 0]
        print("FILTERED OUT %s LOCI THAT WERE MASKED IN %s" % (len(referenceLoci) - len(filteredLoci), maskFile))
        referenceCollection = utils.LocusCollection(filteredLoci, 50)

    # NOW STITCH REGIONS
    print('STITCHING REGIONS TOGETHER')
    stitchedCollection, debugOutput, stitchWindow = regionStitching(inputGFFFile, inputName, outFolder, stitchWindow, tssWindow, annotFile, removeTSS)

    # NOW MAKE A STITCHED COLLECTION GFF
    print('MAKING GFF FROM STITCHED COLLECTION')
    stitchedGFF = utils.locusCollectionToGFF(stitchedCollection)
    # making sure start/stop ordering are correct
    for i in range(len(stitchedGFF)):

        line = stitchedGFF[i]
        start = int(line[3])
        stop = int(line[4])
        if start > stop:
            line[3] = stop
            line[4] = start

    print(stitchWindow)
    print(type(stitchWindow))
    if not removeTSS:
        stitchedGFFFile = '%s%s_%sKB_STITCHED.gff' % (gffFolder, inputName, str(stitchWindow / 1000))
        stitchedGFFName = '%s_%sKB_STITCHED' % (inputName, str(stitchWindow / 1000))
        debugOutFile = '%s%s_%sKB_STITCHED.debug' % (gffFolder, inputName, str(stitchWindow / 1000))
    else:
        stitchedGFFFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.gff' % (gffFolder, inputName, str(stitchWindow / 1000))
        stitchedGFFName = '%s_%sKB_STITCHED_TSS_DISTAL' % (inputName, str(stitchWindow / 1000))
        debugOutFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.debug' % (gffFolder, inputName, str(stitchWindow / 1000))

    # WRITING DEBUG OUTPUT TO DISK

    if debug:
        print('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile))
        utils.unParseTable(debugOutput, debugOutFile, '\t')

    # WRITE THE GFF TO DISK
    print('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile))
    utils.unParseTable(stitchedGFF, stitchedGFFFile, '\t')

    # SETTING UP THE OVERALL OUTPUT FILE
    outputFile1 = outFolder + stitchedGFFName + '_ENHANCER_REGION_MAP.txt'
    print('OUTPUT WILL BE WRITTEN TO  %s' % (outputFile1))

    # MAPPING TO THE NON STITCHED (ORIGINAL GFF)
    # MAPPING TO THE STITCHED GFF

    # Try to use the bamliquidator_batch.py script on the cluster, otherwise failover to local (in path), otherwise fail.
    bamliquidator_path = '/ark/home/jdm/pipeline/bamliquidator_batch.py'
    if not os.path.isfile(bamliquidator_path):
        bamliquidator_path = 'bamliquidator_batch.py'
        if not os.path.isfile(bamliquidator_path):
            raise ValueError('bamliquidator_batch.py not found in path')

    for bamFile in bamFileList:

        bamFileName = bamFile.split('/')[-1]

        # MAPPING TO THE STITCHED GFF
        mappedOut1Folder = '%s%s_%s_MAPPED' % (mappedFolder, stitchedGFFName, bamFileName)
        mappedOut1File = '%s%s_%s_MAPPED/matrix.gff' % (mappedFolder, stitchedGFFName, bamFileName)
        if utils.checkOutput(mappedOut1File, 0.2, 0.2):
            print("FOUND %s MAPPING DATA FOR BAM: %s" % (stitchedGFFFile, mappedOut1File))
        else:
            cmd1 = "python " + bamliquidator_path + " --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % (stitchedGFFFile, mappedOut1Folder, bamFile)
            print(cmd1)

            output1 = subprocess.Popen(cmd1, stdout=subprocess.PIPE, shell=True)
            output1 = output1.communicate()
            if len(output1[0]) > 0:  # test if mapping worked correctly
                print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (stitchedGFFFile, bamFileName))
            else:
                print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (stitchedGFFFile, bamFileName))
                sys.exit()

        # MAPPING TO THE ORIGINAL GFF
        mappedOut2Folder = '%s%s_%s_MAPPED' % (mappedFolder, inputName, bamFileName)
        mappedOut2File = '%s%s_%s_MAPPED/matrix.gff' % (mappedFolder, inputName, bamFileName)
        if utils.checkOutput(mappedOut2File, 0.2, 0.2):
            print("FOUND %s MAPPING DATA FOR BAM: %s" % (stitchedGFFFile, mappedOut2File))
        else:
            cmd2 = "python " + bamliquidator_path + " --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % (inputGFFFile, mappedOut2Folder, bamFile)
            print(cmd2)

            output2 = subprocess.Popen(cmd2, stdout=subprocess.PIPE, shell=True)
            output2 = output2.communicate()
            if len(output2[0]) > 0:  # test if mapping worked correctly
                print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (inputGFFFile, bamFileName))
            else:
                print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (inputGFFFile, bamFileName))
                sys.exit()

    print('BAM MAPPING COMPLETED; NOW MAPPING DATA TO REGIONS')
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    mapCollection(stitchedCollection, referenceCollection, bamFileList, mappedFolder, outputFile1, refName=stitchedGFFName)

    print('CALLING AND PLOTTING SUPER-ENHANCERS')

    if options.control:

        rankbyName = options.rankby.split('/')[-1]
        controlName = options.control.split('/')[-1]
        cmd = 'R --no-save %s %s %s %s < ROSE2_callSuper.R' % (outFolder, outputFile1, inputName, controlName)

    else:
        rankbyName = options.rankby.split('/')[-1]
        controlName = 'NONE'
        cmd = 'R --no-save %s %s %s %s < ROSE2_callSuper.R' % (outFolder, outputFile1, inputName, controlName)
    print(cmd)

    os.system(cmd)

    # calling the gene mapper
    time.sleep(20)
    superTableFile = "%s_SuperEnhancers.table.txt" % (inputName)
    if options.control:
        cmd = "python ROSE2_geneMapper.py -g %s -r %s -c %s -i %s%s" % (genome, options.rankby, options.control, outFolder, superTableFile)
    else:
        cmd = "python ROSE2_geneMapper.py -g %s -r %s -i %s%s" % (genome, options.rankby, outFolder, superTableFile)
    os.system(cmd)


    stretchTableFile = "%s_StretchEnhancers.table.txt" % (inputName)
    if options.control:
        cmd = "python ROSE2_geneMapper.py -g %s -r %s -c %s -i %s%s" % (genome, options.rankby, options.control, outFolder, stretchTableFile)
    else:
        cmd = "python ROSE2_geneMapper.py -g %s -r %s -i %s%s" % (genome, options.rankby, outFolder, stretchTableFile)
    os.system(cmd)


    superStretchTableFile = "%s_SuperStretchEnhancers.table.txt" % (inputName)
    if options.control:
        cmd = "python ROSE2_geneMapper.py -g %s -r %s -c %s -i %s%s" % (genome, options.rankby, options.control, outFolder, superStretchTableFile)
    else:
        cmd = "python ROSE2_geneMapper.py -g %s -r %s -i %s%s" % (genome, options.rankby, outFolder, superStretchTableFile)
    os.system(cmd)
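
For context, a hypothetical invocation of the pipeline above (the script name and all paths are assumptions; the flags come from the parser at the top of main):

python ROSE2_main.py -g HG19 -i ./peaks/sample_peaks.bed -r ./bams/h3k27ac.bam -o ./rose_out/ -c ./bams/whole_cell_extract.bam -s 12500 -t 2500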
Example #26
0
def _matrix_entry_maps(self):
    """Set of all mappings used in matrix arguments."""
    return list(uniquify(m for arg in self.args if arg._is_mat for m in arg.map))
Example #27
0
def _unique_matrix(self):
    return list(uniquify(a.data for a in self._matrix_args))
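
Both methods above assume uniquify preserves first-seen order while dropping duplicates; a minimal sketch of such a helper (an assumption about its implementation, valid for hashable items) could be:

def uniquify(seq):
    """Return the items of seq in first-seen order with duplicates dropped."""
    seen = set()
    out = []
    for item in seq:
        if item not in seen:
            seen.add(item)
            out.append(item)
    return out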
Example #28
0
def make_mycn_stats_table(nb_all_chip_dataFile,outFile):

    '''
    making a table of conserved mycn peaks w/ some additional stats
    mycn and h3k27ac signal is avg. background normalized across 4 samples
    active tss defined as the union of all H3K27ac occupied promoters in NB
    active enhancers defined as the union of all H3K27ac sites outside of promoters
    '''
    dataDict = pipeline_dfci.loadDataTable(nb_all_chip_dataFile)

    print('SETTING UP OUTPUT TABLE')
    outTable = [['PEAK_ID','CHROM','START','STOP','LENGTH','ACTIVE_TSS_OVERLAP','ENHANCER_OVERLAP','CPG_ISLAND_OVERLAP','CPG_ISLAND_FRACTION','GC_FREQ','MYCN_RANK','AVG_MYCN_SIGNAL','AVG_H3K27AC_SIGNAL','CANON_EBOX_COUNT','NONCANON_EBOX_COUNT','TOTAL_EBOX_COUNT','CANON_EXP','NON_CANON_EXP','GABPA_COUNT','GABPA_EXP','GATA_COUNT','GATA_EXP']]

    dinuc = nmers(2,['A','T','G','C'])

    #input files
    mycnSignalFile = '%sHG19_NB_MYCN_CONSERVED_-0_+0_NB_ALL_SIGNAL.txt' % (signalFolder)
    h3k27acSignalFile = '%sHG19_NB_MYCN_CONSERVED_-500_+500_NB_ALL_SIGNAL.txt' % (signalFolder)
    mycnRankFile = '%smeta_rose/NB_MYCN/NB_MYCN_0KB_STITCHED_ENHANCER_REGION_RANK_CONSERVED.txt' % (projectFolder)
    activeGeneFile = '%sHG19_NB_H3K27AC_ACTIVE_UNION.txt' % (geneListFolder)
    #note, this is the ucsc hg19 cpg islands extended file
    #to download and format run ./beds/download_cpg.sh
    cpgFile = '%sbeds/hg19_cpg_islands.bed' % (projectFolder)
    enhancerFile = '%smeta_rose/NB_H3K27AC/NB_H3K27AC_AllEnhancers.table.txt' % (projectFolder)

    print('LOADING MYCN BINDING DATA')
    mycnSignalTable = utils.parseTable(mycnSignalFile,'\t')

    #making a signal dictionary for MYCN binding
    names_list = ['BE2C_MYCN','KELLY_MYCN','NGP_MYCN','SHEP21_0HR_MYCN_NOSPIKE']
    background_list = [dataDict[name]['background'] for name in names_list]
    header = mycnSignalTable[0]
    chip_columns = [header.index(name) for name in names_list]
    background_columns = [header.index(background_name) for background_name in background_list]
    
    mycn_sig_dict = {}
    #this only works if the first column are unique identifiers
    if len(mycnSignalTable) != len(utils.uniquify([line[0] for line in mycnSignalTable])):
        print('Error: Column 1 of %s must contain unique identifiers.' % (mycnSignalFile))
        sys.exit()
    for line in mycnSignalTable[1:]:
        line_sig = []
        for i in range(len(names_list)):
            line_sig.append(float(line[chip_columns[i]]) - float(line[background_columns[i]]))
        mycn_sig_dict[line[0]] = numpy.mean(line_sig)


    
    print('LOADING MYCN RANK DATA')
    mycnRankTable = utils.parseTable(mycnRankFile,'\t')

    print('LOADING H3K27AC BINDING DATA')
    h3k27acSignalTable = utils.parseTable(h3k27acSignalFile,'\t')
    #making a signal dictionary for background subtracted H3K27ac binding
    names_list = ['BE2C_H3K27AC','KELLY_H3K27AC','NGP_H3K27AC','SHEP21_0HR_H3K27AC_NOSPIKE']
    background_list = [dataDict[name]['background'] for name in names_list]
    header = h3k27acSignalTable[0]
    chip_columns = [header.index(name) for name in names_list]
    background_columns = [header.index(background_name) for background_name in background_list]
    
    h3k27ac_sig_dict = {}
    #this only works if the first column are unique identifiers
    if len(h3k27acSignalTable) != len(utils.uniquify([line[0] for line in h3k27acSignalTable])):
        print('Error: Column 1 of %s must contain unique identifiers.' % (h3k27acSignalFile))
        sys.exit()
    for line in h3k27acSignalTable[1:]:
        line_sig = []
        for i in range(len(names_list)):
            line_sig.append(float(line[chip_columns[i]]) - float(line[background_columns[i]]))
        h3k27ac_sig_dict[line[0]] = numpy.mean(line_sig)



    #making the cpg collection
    print('LOADING CPG ISLANDS')
    cpgBed = utils.parseTable(cpgFile,'\t')
    cpgLoci = []
    for line in cpgBed:
        cpgLoci.append(utils.Locus(line[0],line[1],line[2],'.',line[-1]))
    cpgCollection = utils.LocusCollection(cpgLoci,50)
        
    #next make the tss collection of active promoters
    print('LOADING ACTIVE PROMOTERS')
    startDict = utils.makeStartDict(annotFile)
    activeTable = utils.parseTable(activeGeneFile,'\t')
    tss_1kb_loci = []
    for line in activeTable:
        tss_1kb_loci.append(utils.makeTSSLocus(line[1],startDict,1000,1000))
    tss_1kb_collection = utils.LocusCollection(tss_1kb_loci,50)


    #enhancer file
    print("LOADING ACTIVE ENHANCERS")
    enhancerTable = utils.parseTable(enhancerFile,'\t')
    print('STARTING WITH THE FOLLOWING NUMBER OF ENHANCERS IN NB')
    print(len(enhancerTable) - 6)
    enhancerLoci = []
    for line in enhancerTable:
        if line[0][0] != '#' and line[0][0] != 'R':
            try:
                lineLocus = utils.Locus(line[1],int(line[2]),int(line[3]),'.',line[0])
                enhancerLoci.append(lineLocus)
            except IndexError:
                print(line)
                sys.exit()
    enhancerCollection = utils.LocusCollection(enhancerLoci,50)

    print('CLASSIFYING MYCN PEAKS')
    ticker = 0
    for i in range(1,len(mycnSignalTable)):
        if ticker%100 == 0:
            print(ticker)
        ticker +=1

        line = mycnSignalTable[i]        

        mycn_signal = round(mycn_sig_dict[line[0]],4)
        h3k27ac_signal = round(h3k27ac_sig_dict[line[0]],4)
        
        peakID = line[0]
        locusString = line[1]
        chrom = locusString.split('(')[0]
        [start,stop] = [int(x) for x in line[1].split(':')[-1].split('-')]
        lineLocus = utils.Locus(chrom,start,stop,'.',peakID)
        
        tssOverlap = 0
        if tss_1kb_collection.getOverlap(lineLocus,'both'):
            tssOverlap = 1

        enhancerOverlap = 0
        if enhancerCollection.getOverlap(lineLocus,'both') and tssOverlap == 0:
            enhancerOverlap = 1

        cpgIslandOverlap = 0
        if cpgCollection.getOverlap(lineLocus,'both'):
            cpgIslandOverlap = 1

        #now do fractional cpgOverlap
        overlappingCpGLoci = cpgCollection.getOverlap(lineLocus,'both')
        overlappingBases = 0
        for locus in overlappingCpGLoci:
            cpgStart = max(locus.start(),lineLocus.start())
            cpgEnd = min(locus.end(),lineLocus.end())
            overlappingBases += (cpgEnd-cpgStart)
        overlapFraction = round(float(overlappingBases)/lineLocus.len(),2)
        
        #now get the seq
        lineSeq = string.upper(utils.fetchSeq(genomeDirectory,chrom,start,stop,True))
        gcFreq = round(float(lineSeq.count('GC') + lineSeq.count('CG'))/len(lineSeq),2)
            
        dinuc_dict = {}
        for nmer in dinuc:
            dinuc_dict[nmer] = float(lineSeq.count(nmer))/len(lineSeq)

        
        mycnRankLine = mycnRankTable[i]
        mycnRank = numpy.mean([float(x) for x in mycnRankLine[6:]])

        canonMatchList = re.findall('CACGTG',lineSeq)
        canon_count = len(canonMatchList)

        eboxMatchList = re.findall('CA..TG',lineSeq)
        ebox_count = len(eboxMatchList)

        non_canon_count = ebox_count-canon_count

        #get the expected values
        canon_exp = dinuc_dict['CA']*dinuc_dict['CG']*dinuc_dict['TG']*(len(lineSeq) - 5)
        canon_exp = round(canon_exp,2)
        notCG = 1- dinuc_dict['CG']
        non_exp = dinuc_dict['CA']*notCG*dinuc_dict['TG']*(len(lineSeq) - 5)
        non_exp = round(non_exp,2)



        #for gata and GABPA
        gabpaMatchList = re.findall('CGGAAG',lineSeq) + re.findall('CTTCCG',lineSeq)
        gabpa_count = len(gabpaMatchList)

        gabpa_exp_f = dinuc_dict['CG'] * dinuc_dict['GA'] * dinuc_dict['AG']*(len(lineSeq) - 5)
        gabpa_exp_r = dinuc_dict['CT'] * dinuc_dict['TC'] * dinuc_dict['CG']*(len(lineSeq) - 5)
        
        gabpa_exp = round(gabpa_exp_f,2) + round(gabpa_exp_r,2)

        gataMatchList = re.findall('GATAA',lineSeq) + re.findall('TTATC',lineSeq)
        gata_count = len(gataMatchList)

        an_freq = 1 - dinuc_dict['AA'] - dinuc_dict['AT'] - dinuc_dict['AG'] -dinuc_dict['AC']
        cn_freq = 1 - dinuc_dict['CA'] - dinuc_dict['CT'] - dinuc_dict['CG'] -dinuc_dict['CC']
        gata_exp_f = dinuc_dict['GA'] * dinuc_dict['TA'] * an_freq*(len(lineSeq) - 5)
        gata_exp_r = dinuc_dict['TT'] * dinuc_dict['AT'] * cn_freq*(len(lineSeq) - 5)
        gata_exp = round(gata_exp_f,2) + round(gata_exp_r,2)

        
        

        newLine = [peakID,chrom,start,stop,lineLocus.len(),tssOverlap,enhancerOverlap,cpgIslandOverlap,overlapFraction,gcFreq,mycnRank,mycn_signal,h3k27ac_signal,canon_count,non_canon_count,ebox_count,canon_exp,non_exp,gabpa_count,gabpa_exp,gata_count,gata_exp]
        outTable.append(newLine)

    utils.unParseTable(outTable,outFile,'\t')
    
    return outFile
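
The expected-count arithmetic above approximates the probability of a 6-mer as the product of its three non-overlapping dinucleotide frequencies, times the number of possible start positions (len - 5). A standalone sketch of that estimate (hypothetical helper; uses the same zeroth-order independence assumption as the code above):

def expected_hexamer_count(seq, hexamer):
    '''Estimate expected occurrences of a 6-mer from dinucleotide frequencies.'''
    n = len(seq)
    pair_freq = lambda d: float(seq.count(d)) / n
    prob = 1.0
    for i in (0, 2, 4):  # the three non-overlapping dinucleotides of the 6-mer
        prob *= pair_freq(hexamer[i:i + 2])
    return prob * (n - 5)

# e.g. expected_hexamer_count(lineSeq, 'CACGTG') mirrors the canon_exp computation above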
Example #29
0
def make_exp_table(probe_gene_dict, exp_1_path, exp_2_path, ob_s1_path,
                   exp_table_path, patient_table_path):
    '''
    making gene expression table
    first use an intermediary dictionary
    exp_dict[gene][patient] = list <- in case multiple exp values present at probe or patient level
    '''

    #check if this has already been done
    if utils.checkOutput(exp_table_path, 0, 0) and utils.checkOutput(
            patient_table_path, 0, 0):
        print(
            'loading premade expression table at %s and formatted patient table at %s'
            % (exp_table_path, patient_table_path))
        return exp_table_path, patient_table_path

    #first get a master list of patients
    print('formatting patient data')
    table_s1 = utils.parseTable(ob_s1_path, '\t')
    patient_list = utils.uniquify([line[0] for line in table_s1[1:]])
    #make a dict w/ relevant patient info
    patient_dict = {}
    for line in table_s1[1:]:
        patient_line = [line[3]] + line[8:12]
        if line[12] == 'Amp':
            patient_line.append(1)
        else:
            patient_line.append(0)
        patient_dict[line[0]] = patient_line

    gene_list = utils.uniquify(
        [probe_gene_dict[probe_id] for probe_id in probe_gene_dict.keys()])
    gene_list.sort()
    exp_dict = {}
    for gene in gene_list:
        exp_dict[gene] = {}
        for patient in patient_list:
            exp_dict[gene][patient] = []

    #next load up both expression tables
    exp_1 = utils.parseTable(exp_1_path, '\t')
    exp_2 = utils.parseTable(exp_2_path, '\t')

    exp_1_header = exp_1[0]
    exp_1_cols = [
        i for i in range(1, len(exp_1_header))
        if patient_list.count(exp_1_header[i].split('_')[1]) > 0
    ]
    exp_1_patients = [
        exp_1_header[i].split('_')[1] for i in range(1, len(exp_1_header))
        if patient_list.count(exp_1_header[i].split('_')[1]) > 0
    ]

    exp_2_header = exp_2[0]
    exp_2_cols = [
        i for i in range(1, len(exp_2_header))
        if patient_list.count(exp_2_header[i].split('_')[1]) > 0
    ]
    exp_2_patients = [
        exp_2_header[i].split('_')[1] for i in range(1, len(exp_2_header))
        if patient_list.count(exp_2_header[i].split('_')[1]) > 0
    ]

    print('loading expression dataset %s' % exp_1_path)
    for line in exp_1:
        if probe_gene_dict.has_key(line[0]):
            for i in range(len(exp_1_cols)):
                patient_name = exp_1_patients[i]
                col = exp_1_cols[i]
                expression = float(line[col])
                gene_name = probe_gene_dict[line[0]]
                exp_dict[gene_name][patient_name].append(expression)

    print('loading expression dataset %s' % exp_2_path)
    for line in exp_2:
        if probe_gene_dict.has_key(line[0]):
            for i in range(len(exp_2_cols)):
                patient_name = exp_2_patients[i]
                col = exp_2_cols[i]
                expression = float(line[col])
                gene_name = probe_gene_dict[line[0]]
                exp_dict[gene_name][patient_name].append(expression)

    print('making gene expression table')
    exp_table = [['GENE_NAME'] + exp_1_patients + exp_2_patients]
    for gene in gene_list:
        if min([
                len(exp_dict[gene][patient])
                for patient in exp_1_patients + exp_2_patients
        ]) == 0:
            print(gene)
            continue
        exp_line = [gene] + [
            numpy.mean(exp_dict[gene][patient])
            for patient in exp_1_patients + exp_2_patients
        ]
        exp_table.append(exp_line)

    patient_table = [[
        'PATIENT', 'STAGE', 'EFS_D', 'EFS_STATUS', 'OS_D', 'OS_STATUS',
        'MYCN_STATUS'
    ]]
    for patient in exp_1_patients + exp_2_patients:
        patient_line = [patient] + patient_dict[patient]
        patient_table.append(patient_line)

    utils.unParseTable(exp_table, exp_table_path, '\t')
    utils.unParseTable(patient_table, patient_table_path, '\t')

    return exp_table_path, patient_table_path
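
A hedged usage sketch for make_exp_table (the probe mapping and all paths are hypothetical):

probe_gene_dict = {'200737_at': 'PGK1'}    # hypothetical probe -> gene mapping
exp_table_path, patient_table_path = make_exp_table(
    probe_gene_dict,
    './expression/dataset_1.txt',          # hypothetical probe-level expression tables
    './expression/dataset_2.txt',
    './tables/supplemental_s1.txt',        # hypothetical patient metadata table
    './output/nb_exp_table.txt',
    './output/nb_patient_table.txt')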
Example #30
0
def geneToEnhancerDict(genome, enhancer_file, activity_path):
    '''
    Map each enhancer to its assigned target genes (the CLOSEST/PROXIMAL/OVERLAP
    columns of a standard ROSE output) and back. Returns region- and gene-level
    tables, the list of candidate TFs, and a dictionary keyed by gene name that
    points to a list of enhancer loci.
    '''
    print('Identifying enhancers and target genes from %s' % (enhancer_file))
    #should this do gene assignment????
    #for now assume gene assignment has been done
    #can later toggle to do gene assignment

    #first load the TF lists

    tf_table = utils.parseTable(genome.returnFeature('tf_file'), '\t')

    motif_table = utils.parseTable(genome.returnFeature('motif_convert'), '\t')
    #this gives all tfs that have a motif
    motif_tfs = utils.uniquify([line[1] for line in motif_table])

    #intersect w/ the activity table
    if len(activity_path) > 0:
        activity_table = utils.parseTable(activity_path, '\t')
        #figure out the right column for actual gene names (basically not NM or NR and not a numeral)
        for i in range(len(activity_table[0])):
            # try:
            #     foo = int(activity_table[0][i])
            # except ValueError: # case where it is not an integer
            if activity_table[0][i][0:2] != 'NM' and activity_table[0][i][
                    0:2] != 'NR':  #assumes refseq
                gene_col = i
                break
        print('using column %s of %s gene activity table for common names' %
              (gene_col + 1, activity_path))

        active_gene_list = [
            string.upper(line[gene_col]) for line in activity_table
        ]

        tf_list_refseq = [
            line[0] for line in tf_table if active_gene_list.count(line[1]) > 0
            and motif_tfs.count(line[1]) > 0
        ]
        tf_list_name = utils.uniquify([
            line[1] for line in tf_table if active_gene_list.count(line[1]) > 0
            and motif_tfs.count(line[1]) > 0
        ])
    else:
        tf_list_refseq = [
            line[0] for line in tf_table if motif_tfs.count(line[1]) > 0
        ]
        tf_list_name = [
            line[1] for line in tf_table if motif_tfs.count(line[1]) > 0
        ]

    print('Identified %s TFs from %s that have motifs' %
          (len(tf_list_name), genome.returnFeature('tf_file')))

    #keyed by gene with loci objects in the list
    gene_to_enhancer_dict = defaultdict(list)
    enhancer_to_gene_dict = defaultdict(list)

    #assuming id,chrom,start,stop w/ gene names in the last 3 columns per standard ROSE output
    enhancer_table = utils.parseTable(enhancer_file, '\t')
    print('Analyzing %s cis-regulatory regions' % (len(enhancer_table)))

    #now let's make the enhancer table by region and then by gene
    enhancerTable = [['ENHANCER_ID', 'CHROM', 'START', 'STOP', 'GENE_LIST']]
    enhancerTFTable = [['ENHANCER_ID', 'CHROM', 'START', 'STOP', 'GENE_LIST']]
    geneTable = [['GENE', 'TF', 'CHROM', 'START', 'STOP', 'ENHANCER_ID']]
    geneTFTable = [['GENE', 'CHROM', 'START', 'STOP', 'ENHANCER_ID']]
    geneSummaryTable = [['GENE', 'TF', 'ENHANCER_LIST']]

    #will need to track which ones are TFs
    candidate_tf_list = []
    #find the columns for gene assignment
    header = enhancer_table[0]
    header_length = len(enhancer_table[0])
    closest_index = header.index('CLOSEST_GENE')
    proximal_index = header.index('PROXIMAL_GENES')
    overlap_index = header.index('OVERLAP_GENES')
    for line in enhancer_table[1:]:
        if len(line) != header_length:  #don't bother trying to figure out lines w/o target genes
            continue
        enhancer_locus = utils.Locus(line[1], line[2], line[3], '.', line[0])
        closest_gene_list = line[closest_index].split(',')
        proximal_gene_list = line[proximal_index].split(',')
        overlap_gene_list = line[overlap_index].split(',')
        all_gene_list = closest_gene_list + proximal_gene_list + overlap_gene_list
        all_gene_list = [string.upper(gene) for gene in all_gene_list]

        #print(all_gene_list)

        #print(activity_path)
        #print(active_gene_list)
        #gets a unique list of all tfs

        if len(activity_path) > 0:
            all_gene_list = utils.uniquify([
                gene for gene in all_gene_list
                if active_gene_list.count(gene) > 0
            ])
        else:
            all_gene_list = utils.uniquify(all_gene_list)
        candidate_gene_list = utils.uniquify(
            [gene for gene in all_gene_list if tf_list_name.count(gene) > 0])
        if len(all_gene_list) > 0:
            for gene in all_gene_list:

                gene_to_enhancer_dict[gene].append(enhancer_locus)
                enhancer_to_gene_dict[enhancer_locus].append(gene)
            newLine = line[0:4] + [','.join(all_gene_list)]
        else:
            newLine = line[0:4] + ['']
        enhancerTable.append(newLine)

        if len(candidate_gene_list) > 0:
            tfLine = line[0:4] + [','.join(candidate_gene_list)]
            enhancerTFTable.append(tfLine)

    #now iterate through each gene and list the enhancers
    gene_list = gene_to_enhancer_dict.keys()
    print(gene_list)
    gene_list.sort()
    for gene in gene_list:
        if tf_list_name.count(gene) > 0:
            tf_status = 1
            candidate_tf_list.append(gene)
        else:
            tf_status = 0
        enhancer_loci = gene_to_enhancer_dict[gene]
        enhancerString = ','.join(
            [enhancer.ID() for enhancer in enhancer_loci])
        geneSummaryTable.append([gene, tf_status, enhancerString])
        for enhancer in enhancer_loci:
            newLine = [
                gene, tf_status,
                enhancer.chr(),
                enhancer.start(),
                enhancer.end(),
                enhancer.ID()
            ]
            geneTable.append(newLine)
            if tf_status == 1:
                newLine = [
                    gene,
                    enhancer.chr(),
                    enhancer.start(),
                    enhancer.end(),
                    enhancer.ID()
                ]
                geneTFTable.append(newLine)

    return geneTable, geneTFTable, enhancerTable, enhancerTFTable, geneSummaryTable, candidate_tf_list, gene_to_enhancer_dict
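
A hedged usage sketch for the mapper above (the genome object and paths are hypothetical; pass an empty activity_path to skip activity filtering):

(geneTable, geneTFTable, enhancerTable, enhancerTFTable,
 geneSummaryTable, candidate_tf_list, gene_to_enhancer_dict) = geneToEnhancerDict(
    genome,                                    # object exposing returnFeature(), as used above
    './rose/SAMPLE_AllEnhancers.table.txt',    # hypothetical ROSE enhancer table
    './tables/activity_table.txt')             # hypothetical activity table, or ''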
Example #31
0
def main():

    from optparse import OptionParser

    usage = "usage: %prog [options] -e [ENHANCER_FILE] -b [BAM_FILE] -g [GENOME] -o [OUTPUTFOLDER] -n [NAME] -s [SUBPEAKS] -x [EXP_CUTOFF] -l [EXTENSION_LENGTH]"
    parser = OptionParser(usage = usage)

    # Required flags                                                                                                                                        
    parser.add_option("-e","--enhancer_file", dest="enhancers",nargs = 1, default=None,
                      help = "Provide a ROSE generated enhancer table (_AllEnhancers.table.txt)")
    parser.add_option("-b","--bam_file",dest="bam",nargs =1, default = None,
                      help = "Provide a sorted indexed bam file for H3K27ac sequencing reads")
    parser.add_option("-g","--genome",dest="genome",nargs =1, default = None,
                      help = "Provide the build of the genome to be used for the analysis. Currently supports HG19, HG18 and MM9")
    parser.add_option("-f","--fasta",dest="fasta",nargs =1, default = None,
                      help = "Enter location of the fasta files for the genome version used")
    parser.add_option("-s","--subpeaks", dest="subpeaks",nargs=1,default=None,
                      help = "Enter a bedfile of peaks output from MACS used to identify SE constituents")
    parser.add_option("-x","--exp_Cutoff", dest="expCutoff",nargs=1,default=33,
                      help = "Enter the percentage of transcripts that are not considered expressed, default=33")
    parser.add_option("-l","--extension_length", dest="extension",nargs = 1, default = 500,
                      help = "Enter the length (in bp) to extend constituents for motif search, default=500")
    parser.add_option("-n","--name",dest="name",nargs =1, default = None,
                      help = "Enter the sample name")
    parser.add_option("-o","--output",dest="output",nargs =1, default = None,
                      help = "Enter directory to be used for storing output")

    # Options                                                                                                               
    parser.add_option("-a","--activity", dest="activity",nargs = 1, default=None,
                      help = "Enter a two column table with refseq in the first column and the associated activity (expression or promoter acetylation level) in the second column")
    parser.add_option("-E","--enhancer_number", dest="Enumber",nargs = 1, default='supers',
                      help = "Enter the number of top ranked enhancers to include in the anlaysis, default = supers")

    (options,args) = parser.parse_args()

    print(options)


    if options.enhancers and options.bam and options.genome and options.fasta and options.subpeaks and options.expCutoff and options.extension and options.name and options.output:

        # Set parameters

        genomeDirectory = options.fasta

        genome = options.genome
        genome = upper(genome)

        if genome == 'HG19':
            annotationFile = './annotation/hg19_refseq.ucsc'
            TFfile = './TFlist_NMid_hg.txt'

        if genome == 'HG18':
            annotationFile = './annotation/hg18_refseq.ucsc'
            TFfile = './TFlist_NMid_hg.txt'

        if genome == 'MM9':
            annotationFile = './annotation/mm9_refseq.ucsc'
            TFfile = './TFlist_NMid_ms.txt'

        motifConvertFile = './MotifDictionary.txt'
        motifDatabaseFile = './VertebratePWMs.txt'

        TFtable = utils.parseTable(TFfile, '\t')
        TFlist = [line[0] for line in TFtable]
        TFlistGene = [line[1] for line in TFtable]

        superFile = options.enhancers
        superTable = utils.parseTable(superFile, '\t')

        bamFile = options.bam
        bam = utils.Bam(bamFile)

        subpeaks = options.subpeaks

        expCutoff = int(options.expCutoff)

        motifExtension = int(options.extension)

        projectName = options.name

        projectFolder = options.output

        refseqToNameDict = {}
        expressionFile = options.activity
        if expressionFile:
            expressionTable = utils.parseTable(expressionFile, '\t')
        else:
            calculatePromoterActivity(annotationFile, bamFile, projectName, projectFolder, refseqToNameDict)
            expressionFilename = projectFolder + 'matrix.gff'
            expressionTable = utils.parseTable(expressionFilename, '\t')
        if options.Enumber != 'supers':
            enhancerNumber = options.Enumber
        else:
            enhancerNumber = 'supers'

        # Run the program

        superLoci = createSuperLoci(superTable)

        expressedNM = createExpressionDict(annotationFile, projectFolder, projectName, refseqToNameDict, expressionTable)

        TFandSuperDict = findCanidateTFs(annotationFile, superLoci, expressedNM, TFlist, refseqToNameDict, projectFolder, projectName)

        formatOutput(TFandSuperDict, refseqToNameDict, projectName, projectFolder)

        candidateGenes = [upper(refseqToNameDict[x]) for x in TFandSuperDict.keys()]

        candidateGenes = utils.uniquify(candidateGenes)

        generateSubpeakFASTA(TFandSuperDict, subpeaks, genomeDirectory, projectName, projectFolder, motifExtension)

        findMotifs(candidateGenes, projectFolder, projectName, motifConvertFile, motifDatabaseFile)

        graph = buildNetwork(projectFolder, projectName, candidateGenes, refseqToNameDict, motifConvertFile)

        formatNetworkOutput(graph, projectFolder, projectName, candidateGenes)

    # Return help

    else:
        parser.print_help()
        sys.exit()
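
For context, a hypothetical invocation of the CRC-style main above (the script name and all paths are assumptions; the flags come from the parser):

python CRC_main.py -e ./rose/SAMPLE_AllEnhancers.table.txt -b ./bams/h3k27ac.bam -g HG19 -f ./fasta/hg19/ -s ./macs/sample_peaks.bed -x 33 -l 500 -n SAMPLE -o ./crc_out/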
Example #32
0
def tf_edge_delta_out(crc_folder,
                      chip_dataFile,
                      analysis_name,
                      group1_list,
                      group2_list,
                      output=''):
    '''
    calculates changes in BRD4 out-degree at each predicted motif occurrence;
    this is computed at the level of subpeaks
    '''
    crc_folder = utils.formatFolder(crc_folder, False)
    edge_path = '%s%s_EDGE_TABLE.txt' % (crc_folder, analysis_name)

    #make a gff of the edge table
    edge_table = utils.parseTable(edge_path, '\t')
    edge_gff = []
    for line in edge_table[1:]:
        gff_line = [
            line[2],
            '%s_%s' % (line[0], line[1]), '', line[3], line[4], '', '.', '',
            '%s_%s' % (line[0], line[1])
        ]
        edge_gff.append(gff_line)

    edge_gff_path = '%s%s_EDGE_TABLE.gff' % (crc_folder, analysis_name)
    utils.unParseTable(edge_gff, edge_gff_path, '\t')

    #direct the output to the crc folder
    signal_path = '%s%s_EDGE_TABLE_signal.txt' % (crc_folder, analysis_name)

    #get a list of all chip datasets
    all_chip_list = group1_list + group2_list
    if utils.checkOutput(signal_path, 0, 0) == False:
        signal_table_list = pipeline_dfci.map_regions(chip_dataFile,
                                                      [edge_gff_path],
                                                      mappedFolder,
                                                      signalFolder,
                                                      all_chip_list,
                                                      True,
                                                      signal_path,
                                                      extendReadsTo=100)
        print(signal_table_list)
    else:
        print('Found previous signal table at %s' % (signal_path))

    #now bring in the signal table as a dictionary using the locus line as the id
    print('making log2 group1 over group2 signal table at edges')
    signal_table = utils.parseTable(signal_path, '\t')
    signal_dict = defaultdict(float)

    #figure out columns for group1 and group2
    group2_columns = [signal_table[0].index(name) for name in group2_list]
    group1_columns = [signal_table[0].index(name) for name in group1_list]
    group2_signal_vector = []
    group1_signal_vector = []
    for line in signal_table[1:]:
        group2_signal = numpy.mean(
            [float(line[col]) for col in group2_columns])
        group1_signal = numpy.mean(
            [float(line[col]) for col in group1_columns])

        group2_signal_vector.append(group2_signal)
        group1_signal_vector.append(group1_signal)

    group2_median = numpy.median(group2_signal_vector)
    group1_median = numpy.median(group1_signal_vector)

    print('group2 median signal (rpm/bp)')
    print(group2_median)
    print('group1 median signal (rpm/bp)')
    print(group1_median)

    #now that we have the median, we can take edges where at least 1 edge is above the median
    #and both are above zero and generate a new table w/ the fold change

    signal_filtered_path = string.replace(signal_path, '.txt', '_filtered.txt')
    if utils.checkOutput(signal_filtered_path, 0, 0):
        print('Found filtered signal table for edges at %s' %
              (signal_filtered_path))
        signal_table_filtered = utils.parseTable(signal_filtered_path, '\t')
    else:

        signal_table_filtered = [
            signal_table[0] +
            ['GROUP2_MEAN', 'GROUP1_MEAN', 'LOG2_GROUP1_OVER_GROUP2']
        ]
        for line in signal_table[1:]:
            group2_signal = numpy.mean(
                [float(line[col]) for col in group2_columns])
            group1_signal = numpy.mean(
                [float(line[col]) for col in group1_columns])

            if (group2_signal > group2_median or group1_signal > group1_median
                ) and min(group2_signal, group1_signal) > 0:
                delta = numpy.log2(group1_signal / group2_signal)
                new_line = line + [group2_signal, group1_signal, delta]
                signal_table_filtered.append(new_line)

        utils.unParseTable(signal_table_filtered, signal_filtered_path, '\t')

    #now get a list of all TFs in the system
    tf_list = utils.uniquify(
        [line[0].split('_')[0] for line in signal_table_filtered[1:]])
    tf_list.sort()
    print(tf_list)

    out_degree_table = [[
        'TF_NAME', 'EDGE_COUNT', 'DELTA_MEAN', 'DELTA_MEDIAN', 'DELTA_STD',
        'DELTA_SEM'
    ]]

    for tf_name in tf_list:
        print(tf_name)
        edge_vector = [
            float(line[-1]) for line in signal_table_filtered[1:]
            if line[0].split('_')[0] == tf_name
        ]

        edge_count = len(edge_vector)
        delta_mean = round(numpy.mean(edge_vector), 4)
        delta_median = round(numpy.median(edge_vector), 4)
        delta_std = round(numpy.std(edge_vector), 4)
        delta_sem = round(stats.sem(edge_vector), 4)
        tf_out_line = [
            tf_name, edge_count, delta_mean, delta_median, delta_std, delta_sem
        ]
        out_degree_table.append(tf_out_line)

    if output == '':
        #set final output
        output_path = '%s%s_EDGE_DELTA_OUT.txt' % (crc_folder, analysis_name)

    else:
        output_path = output

    utils.unParseTable(out_degree_table, output_path, '\t')
    print(output_path)
    return (output_path)
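
A hedged usage sketch for tf_edge_delta_out (folder layout, data table, and dataset names are hypothetical; note the function also relies on mappedFolder and signalFolder globals, per the body above):

output_path = tf_edge_delta_out(
    './crc/',                            # folder containing <analysis_name>_EDGE_TABLE.txt
    './data_tables/NB_ALL_TABLE.txt',    # hypothetical pipeline_dfci data table
    'NB_MYCN',
    ['BE2C_BRD4', 'KELLY_BRD4'],         # hypothetical group1 dataset names
    ['SHEP21_24HR_BRD4', 'NGP_BRD4'])    # hypothetical group2 dataset names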
Example #33
0
def regionStitching(referenceCollection, name, outFolder, stitchWindow, tssWindow, annotFile, removeTSS=True):
    print('PERFORMING REGION STITCHING')
    # first have to turn bound region file into a locus collection

    # need to make sure this names correctly... each region should have a unique name
    #referenceCollection 

    debugOutput = []
    # filter out all bound regions that overlap the TSS of an ACTIVE GENE
    if removeTSS:

        print('REMOVING TSS FROM REGIONS USING AN EXCLUSION WINDOW OF %sBP' % (tssWindow))
        # first make a locus collection of TSS

        startDict = utils.makeStartDict(annotFile)

        # now makeTSS loci for active genes
        removeTicker = 0
        # this loop makes a locus centered around +/- tssWindow of transcribed genes
        # then adds it to the list tssLoci
        tssLoci = []
        for geneID in startDict.keys():
            tssLoci.append(utils.makeTSSLocus(geneID, startDict, tssWindow, tssWindow))

        # this turns the tssLoci list into a LocusCollection
        # 50 is the internal parameter for LocusCollection and doesn't really matter
        tssCollection = utils.LocusCollection(tssLoci, 50)

        # gives all the loci in referenceCollection
        boundLoci = referenceCollection.getLoci()

        # this loop will check if each bound region is contained by the TSS exclusion zone
        # this will drop out a lot of the promoter only regions that are tiny
        # typical exclusion window is around 2kb
        for locus in boundLoci:
            if len(tssCollection.getContainers(locus, 'both')) > 0:

                # if true, the bound locus overlaps an active gene
                referenceCollection.remove(locus)
                debugOutput.append([locus.__str__(), locus.ID(), 'CONTAINED'])
                removeTicker += 1
        print('REMOVED %s LOCI BECAUSE THEY WERE CONTAINED BY A TSS' % (removeTicker))

    # referenceCollection is now all enriched region loci that don't overlap an active TSS

    if stitchWindow == '':
        print('DETERMINING OPTIMUM STITCHING PARAMETER')
        optCollection = copy.deepcopy(referenceCollection)
        stitchWindow = optimizeStitching(optCollection, name, outFolder, stepSize=500)
    print('USING A STITCHING PARAMETER OF %s' % stitchWindow)
    stitchedCollection = referenceCollection.stitchCollection(stitchWindow, 'both')

    if removeTSS:
        # now replace any stitched region that overlaps more than two distinct TSSs
        # with the original loci that were there
        fixedLoci = []
        tssLoci = []
        for geneID in startDict.keys():
            tssLoci.append(utils.makeTSSLocus(geneID, startDict, 50, 50))

        # this turns the tssLoci list into a LocusCollection
        # 50 is the internal parameter for LocusCollection and doesn't really matter
        tssCollection = utils.LocusCollection(tssLoci, 50)
        removeTicker = 0
        originalTicker = 0
        for stitchedLocus in stitchedCollection.getLoci():
            overlappingTSSLoci = tssCollection.getOverlap(stitchedLocus, 'both')
            tssNames = [startDict[tssLocus.ID()]['name'] for tssLocus in overlappingTSSLoci]
            tssNames = utils.uniquify(tssNames)
            if len(tssNames) > 2:

                # stitchedCollection.remove(stitchedLocus)
                originalLoci = referenceCollection.getOverlap(stitchedLocus, 'both')
                originalTicker += len(originalLoci)
                fixedLoci += originalLoci
                debugOutput.append([stitchedLocus.__str__(), stitchedLocus.ID(), 'MULTIPLE_TSS'])
                removeTicker += 1
            else:
                fixedLoci.append(stitchedLocus)

        print('REMOVED %s STITCHED LOCI BECAUSE THEY OVERLAPPED MULTIPLE TSSs' % (removeTicker))
        print('ADDED BACK %s ORIGINAL LOCI' % (originalTicker))
        fixedCollection = utils.LocusCollection(fixedLoci, 50)
        return fixedCollection, debugOutput, stitchWindow
    else:
        return stitchedCollection, debugOutput, stitchWindow
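
A hedged usage sketch for regionStitching (paths are hypothetical; passing stitchWindow='' triggers the optimization branch instead of a fixed window):

stitchedCollection, debugOutput, stitchWindow = regionStitching(
    referenceCollection,                   # utils.LocusCollection of enriched regions
    'SAMPLE_PEAKS', './rose_out/',
    stitchWindow=12500, tssWindow=2500,
    annotFile='./annotation/hg19_refseq.ucsc', removeTSS=True)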
Example #34
0
def regionStitching(referenceCollection,
                    name,
                    outFolder,
                    stitchWindow,
                    tssWindow,
                    annotFile,
                    removeTSS=True):
    print('PERFORMING REGION STITCHING')
    # first have to turn bound region file into a locus collection

    # need to make sure this names correctly... each region should have a unique name
    #referenceCollection

    debugOutput = []
    # filter out all bound regions that overlap the TSS of an ACTIVE GENE
    if removeTSS:

        print('REMOVING TSS FROM REGIONS USING AN EXCLUSION WINDOW OF %sBP' %
              (tssWindow))
        # first make a locus collection of TSS

        startDict = utils.makeStartDict(annotFile)

        # now makeTSS loci for active genes
        removeTicker = 0
        # this loop makes a locus centered around +/- tssWindow of transcribed genes
        # then adds it to the list tssLoci
        tssLoci = []
        for geneID in startDict.keys():
            tssLoci.append(
                utils.makeTSSLocus(geneID, startDict, tssWindow, tssWindow))

        # this turns the tssLoci list into a LocusCollection
        # 50 is the internal parameter for LocusCollection and doesn't really matter
        tssCollection = utils.LocusCollection(tssLoci, 50)

        # gives all the loci in referenceCollection
        boundLoci = referenceCollection.getLoci()

        # this loop will check if each bound region is contained by the TSS exclusion zone
        # this will drop out a lot of the promoter only regions that are tiny
        # typical exclusion window is around 2kb
        for locus in boundLoci:
            if len(tssCollection.getContainers(locus, 'both')) > 0:

                # if true, the bound locus overlaps an active gene
                referenceCollection.remove(locus)
                debugOutput.append([locus.__str__(), locus.ID(), 'CONTAINED'])
                removeTicker += 1
        print('REMOVED %s LOCI BECAUSE THEY WERE CONTAINED BY A TSS' %
              (removeTicker))

    # referenceCollection is now all enriched region loci that don't overlap an active TSS

    if stitchWindow == '':
        print('DETERMINING OPTIMUM STITCHING PARAMETER')
        optCollection = copy.deepcopy(referenceCollection)
        stitchWindow = optimizeStitching(optCollection,
                                         name,
                                         outFolder,
                                         stepSize=500)
    print('USING A STITCHING PARAMETER OF %s' % stitchWindow)
    stitchedCollection = referenceCollection.stitchCollection(
        stitchWindow, 'both')

    if removeTSS:
        # now replace any stitched region that overlaps more than two distinct TSSs
        # with the original loci that were there
        fixedLoci = []
        tssLoci = []
        for geneID in startDict.keys():
            tssLoci.append(utils.makeTSSLocus(geneID, startDict, 50, 50))

        # this turns the tssLoci list into a LocusCollection
        # 50 is the internal parameter for LocusCollection and doesn't really matter
        tssCollection = utils.LocusCollection(tssLoci, 50)
        removeTicker = 0
        originalTicker = 0
        for stitchedLocus in stitchedCollection.getLoci():
            overlappingTSSLoci = tssCollection.getOverlap(
                stitchedLocus, 'both')
            tssNames = [
                startDict[tssLocus.ID()]['name']
                for tssLocus in overlappingTSSLoci
            ]
            tssNames = utils.uniquify(tssNames)
            if len(tssNames) > 2:

                # stitchedCollection.remove(stitchedLocus)
                originalLoci = referenceCollection.getOverlap(
                    stitchedLocus, 'both')
                originalTicker += len(originalLoci)
                fixedLoci += originalLoci
                debugOutput.append([
                    stitchedLocus.__str__(),
                    stitchedLocus.ID(), 'MULTIPLE_TSS'
                ])
                removeTicker += 1
            else:
                fixedLoci.append(stitchedLocus)

        print(
            'REMOVED %s STITCHED LOCI BECAUSE THEY OVERLAPPED MULTIPLE TSSs' %
            (removeTicker))
        print('ADDED BACK %s ORIGINAL LOCI' % (originalTicker))
        fixedCollection = utils.LocusCollection(fixedLoci, 50)
        return fixedCollection, debugOutput, stitchWindow
    else:
        return stitchedCollection, debugOutput, stitchWindow
Example #35
0
def makePeakTable(paramDict,
                  splitGFFPath,
                  averageTablePath,
                  startDict,
                  geneList,
                  genomeDirectory,
                  tads_path=''):
    '''
    makes the final peak table with ebox info
    '''

    peakTable = [[
        'REGION_ID', 'CHROM', 'START', 'STOP', 'LENGTH', 'TSS', 'CPG',
        'CPG_FRACTION', 'GC_FREQ', 'SIGNAL', 'CANON_EBOX_COUNT',
        'NON_CANON_EBOX_COUNT', 'TOTAL_EBOX_COUNT', 'OVERLAPPING_GENES',
        'PROXIMAL_GENES'
    ]]

    print('LOADING PEAK REGIONS')
    peakGFF = utils.parseTable(splitGFFPath, '\t')

    print('LOADING BINDING DATA')
    signalTable = utils.parseTable(averageTablePath, '\t')

    print('LOADING CPG ISLANDS')
    cpgBed = utils.parseTable(paramDict['cpgPath'], '\t')
    cpgLoci = []
    for line in cpgBed:
        cpgLoci.append(utils.Locus(line[0], line[1], line[2], '.', line[-1]))
    cpgCollection = utils.LocusCollection(cpgLoci, 50)

    print("MAKING TSS COLLECTIONS")
    if len(geneList) == 0:
        geneList = startDict.keys()

    tss_1kb_loci = []
    tss_50kb_loci = []
    for refID in geneList:
        tss_1kb_loci.append(utils.makeTSSLocus(refID, startDict, 1000, 1000))
        tss_50kb_loci.append(utils.makeTSSLocus(refID, startDict, 50000,
                                                50000))

    #make a 1kb flanking and 50kb flanking collection
    tss_1kb_collection = utils.LocusCollection(tss_1kb_loci, 50)
    tss_50kb_collection = utils.LocusCollection(tss_50kb_loci, 50)

    if len(tads_path) > 0:
        print('LOADING TADS FROM %s' % (tads_path))
        tad_collection = utils.importBoundRegion(tads_path, 'tad')
        use_tads = True

        #building a tad dict keyed by tad ID w/ genes in that tad provided
        tad_dict = defaultdict(list)
        for tss_locus in tss_1kb_loci:
            overlapping_tads = tad_collection.getOverlap(tss_locus, 'both')
            for tad_locus in overlapping_tads:
                tad_dict[tad_locus.ID()].append(tss_locus.ID())

    else:
        use_tads = False

    print('CLASSIFYING PEAKS')
    ticker = 0

    no_tad_count = 0
    for i in range(len(peakGFF)):
        if ticker % 1000 == 0:
            print(ticker)
        ticker += 1

        #getting the particulars of the region
        gffLine = peakGFF[i]
        peakID = gffLine[1]
        chrom = gffLine[0]
        start = int(gffLine[3])
        stop = int(gffLine[4])
        lineLocus = utils.Locus(chrom, start, stop, '.', peakID)

        #getting the mapped signal
        signalLine = signalTable[(i + 1)]
        signalVector = [float(x) for x in signalLine[2:]]

        #setting up the new line
        newLine = [peakID, chrom, start, stop, lineLocus.len()]

        #get the tss status from the gff itself (we are able to do this nicely from the split gff code earlier)
        newLine.append(gffLine[7])

        #check cpg status
        if cpgCollection.getOverlap(lineLocus, 'both'):
            newLine.append(1)
        else:
            newLine.append(0)

        #now do fractional cpgOverlap
        overlappingCpGLoci = cpgCollection.getOverlap(lineLocus, 'both')
        overlappingBases = 0
        for locus in overlappingCpGLoci:
            cpgStart = max(locus.start(), lineLocus.start())
            cpgEnd = min(locus.end(), lineLocus.end())
            overlappingBases += (cpgEnd - cpgStart)
        overlapFraction = float(overlappingBases) / lineLocus.len()

        newLine.append(round(overlapFraction, 2))

        #now get the seq
        lineSeq = string.upper(
            utils.fetchSeq(genomeDirectory, chrom, start, stop, True))
        if len(lineSeq) == 0:
            print('ERROR: FETCHED EMPTY SEQUENCE FOR REGION')
            print(lineSeq)
            print(gffLine)
            print(i)
            print(chrom)
            print(start)
            print(stop)
            sys.exit()

        gcFreq = float(lineSeq.count('GC') +
                       lineSeq.count('CG')) / len(lineSeq)
        newLine.append(gcFreq)

        #this is where we add the ChIP-Seq signal
        newLine += signalVector

        eboxMatchList = re.findall('CA..TG', lineSeq)
        if len(eboxMatchList) == 0:
            newLine += [0] * 3
        else:
            totalCount = len(eboxMatchList)
            canonCount = eboxMatchList.count('CACGTG')
            otherCount = totalCount - canonCount
            newLine += [canonCount, otherCount, totalCount]
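        # e.g. a hypothetical lineSeq 'CACGTGAACAGCTG' matches 'CACGTG' and
        # 'CAGCTG', giving canonCount=1, otherCount=1, totalCount=2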

        #now find the overlapping and proximal genes
        #here an overlapping gene is one whose tss 1kb locus overlaps the peak

        if use_tads:

            tad_loci = tad_collection.getOverlap(lineLocus, 'both')

            tad_id_list = [tad_locus.ID() for tad_locus in tad_loci]
            tad_genes = []
            for tad_id in tad_id_list:
                tad_genes += tad_dict[tad_id]
            if len(tad_genes) == 0:
                #print('no tad for this region')
                #print(gffLine)
                no_tad_count += 1
        else:
            tad_genes = []

        if len(tad_genes) > 0:
            overlappingGenes = [
                startDict[locus.ID()]['name']
                for locus in tss_1kb_collection.getOverlap(lineLocus, 'both')
                if tad_genes.count(locus.ID()) > 0
            ]
            proximalGenes = [
                startDict[locus.ID()]['name']
                for locus in tss_50kb_collection.getOverlap(lineLocus, 'both')
                if tad_genes.count(locus.ID()) > 0
            ]
            # print('linked peak to tad genes')
            # print([startDict[x]['name'] for x in tad_genes])
            # print(tad_id_list)
            # print(gffLine)
            # print(overlappingGenes)
            # print(proximalGenes)
        else:
            overlappingGenes = [
                startDict[locus.ID()]['name']
                for locus in tss_1kb_collection.getOverlap(lineLocus, 'both')
            ]
            proximalGenes = [
                startDict[locus.ID()]['name']
                for locus in tss_50kb_collection.getOverlap(lineLocus, 'both')
            ]

        overlappingGenes = utils.uniquify(overlappingGenes)
        #a proximal gene is one whose tss 50kb locus overlaps the peak
        #overlap takes priority over proximal
        proximalGenes = [
            gene for gene in proximalGenes if overlappingGenes.count(gene) == 0
        ]
        proximalGenes = utils.uniquify(proximalGenes)

        overlappingString = string.join(overlappingGenes, ',')
        proximalString = string.join(proximalGenes, ',')

        newLine += [overlappingString, proximalString]

        peakTable.append(newLine)

    print('Out of %s regions, %s could not be assigned to a tad' %
          (len(peakTable), no_tad_count))
    return peakTable
Example #36
def mapBamToGFF(bamFile,
                gff,
                sense='.',
                extension=200,
                rpm=False,
                clusterGram=None,
                matrix=None):
    '''maps reads from a bam to a gff'''

    #creating a new gff to output
    newGFF = []
    #reading in the bam
    bam = utils.Bam(bamFile)

    #getting RPM normalization
    if rpm:
        MMR = round(float(bam.getTotalReads('mapped')) / 1000000, 4)
    else:
        MMR = 1

    print('using a MMR value of %s' % (MMR))

    #creating a sense translation table for flipping strand
    senseTrans = string.maketrans('-+.', '+-+')
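    # e.g. string.translate('-', senseTrans) == '+' and
    # string.translate('.', senseTrans) == '+' (explicit strands are flipped, '.' maps to '+')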

    #reading in the gff
    if type(gff) == str:
        gff = utils.parseTable(gff, '\t')

    #setting up a clustergram table
    if clusterGram:
        binSize = int(clusterGram)
        binSizeList = []
        #now go through each line of the gff and make sure they're all the same length
        for i in range(0, len(gff), 1):
            line = gff[i]
            gffLocus = utils.Locus(line[0], int(line[3]), int(line[4]),
                                   line[6], line[1])
            binSizeList.append(gffLocus.len() / binSize)
        binSizeList = utils.uniquify(binSizeList)
        if len(binSizeList) > 1:
            print(
                'WARNING: lines in gff are of different length. Output clustergram will have variable row length'
            )
        newGFF.append(['GENE_ID', 'locusLine'] + [
            str(x * binSize) + '_' + bamFile.split('/')[-1]
            for x in range(1,
                           max(binSizeList) + 1, 1)
        ])

    #setting up a matrix table
    if matrix:
        newGFF.append(['GENE_ID', 'locusLine'] + [
            'bin_' + str(n) + '_' + bamFile.split('/')[-1]
            for n in range(1,
                           int(matrix) + 1, 1)
        ])
        nBin = int(matrix)

    # Try to use the bamliquidator script on cluster, otherwise fail over to local (in path), otherwise fail.
    bamliquidatorString = '/usr/bin/bamliquidator'
    if not os.path.isfile(bamliquidatorString):
        bamliquidatorString = './bamliquidator'
        if not os.path.isfile(bamliquidatorString):
            raise ValueError('bamliquidator not found in path')

    #getting and processing reads for gff lines
    ticker = 0
    print('Number of lines processed')
    for line in gff:
        line = line[0:9]
        if ticker % 100 == 0:
            print(ticker)
        ticker += 1
        gffLocus = utils.Locus(line[0], int(line[3]), int(line[4]), line[6],
                               line[1])

        #get the nBin and binSize
        if clusterGram:
            nBin = gffLocus.len() / int(clusterGram)
            binSize = int(clusterGram)
        if matrix:
            nBin = int(matrix)
            binSize = gffLocus.len() / nBin
            #some regions will be too short to get info on
            if binSize == 0:
                clusterLine = [gffLocus.ID(), str(gffLocus)] + ['NA'] * nBin
                newGFF.append(clusterLine)
                continue

        #flippy flip if sense is negative
        if sense == '-':
            bamSense = string.translate(gffLocus.sense(), senseTrans)
        elif sense == '+':
            bamSense = gffLocus.sense()
        else:
            bamSense = '.'
        #using the bamLiquidator to get the readstring
        #print('using nBin of %s' % nBin)
        bamCommand = "%s %s %s %s %s %s %s %s" % (
            bamliquidatorString, bamFile, line[0], gffLocus.start(),
            gffLocus.end(), bamSense, nBin, extension)
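        # the assembled command looks like (hypothetical values):
        # /usr/bin/bamliquidator sample.bam chr1 1000 2000 . 200 200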
        #print(bamCommand)
        getReads = subprocess.Popen(bamCommand,
                                    stdin=subprocess.PIPE,
                                    stderr=subprocess.PIPE,
                                    stdout=subprocess.PIPE,
                                    shell=True)
        readString, stderr = getReads.communicate()
        if stderr:
            print("STDERR out: %s" % (stderr))
        denList = readString.split('\n')[:-1]
        #print("denlist is: %s" % denList)
        #flip the denList if the actual gff region is -
        if gffLocus.sense() == '-':
            denList = denList[::-1]

        #converting from units of total bp of read sequence per bin to rpm/bp
        denList = [round(float(x) / binSize / MMR, 4) for x in denList]

        clusterLine = [gffLocus.ID(), str(gffLocus)] + denList
        newGFF.append(clusterLine)

    return newGFF
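
# Usage sketch (hypothetical paths), assuming the utils module used above:
# bin each gff region into 200 fixed bins of RPM-normalized read density.
# matrixGFF = mapBamToGFF('/path/to/sample.bam', '/path/to/regions.gff',
#                         rpm=True, matrix=200)
# utils.unParseTable(matrixGFF, '/path/to/sample_matrix.txt', '\t')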
Example #37
def loadAnnotFile(genome, window, geneList=[], skip_cache=False):
    """
    load in the annotation and create a startDict and tss collection for a set of refseq IDs a given genome
    """
    genomeDict = {
        'HG18': 'annotation/hg18_refseq.ucsc',
        'MM9': 'annotation/mm9_refseq.ucsc',
        'MM10': 'annotation/mm10_refseq.ucsc',
        'HG19': 'annotation/hg19_refseq.ucsc',
        'HG19_RIBO': 'annotation/hg19_refseq.ucsc',
        'RN4': 'annotation/rn4_refseq.ucsc',
        'RN6': 'annotation/rn6_refseq.ucsc',
        'HG38': 'annotation/hg38_refseq.ucsc',
    }

    genomeDirectoryDict = {
        'HG19':
        '/storage/cylin/grail/genomes/Homo_sapiens/UCSC/hg19/Sequence/Chromosomes/',
        'RN6':
        '/storage/cylin/grail/genomes/Rattus_norvegicus/UCSC/rn6/Sequence/Chromosomes/',
        'MM9':
        '/storage/cylin/grail/genomes/Mus_musculus/UCSC/mm9/Sequence/Chromosomes/',
        'MM10':
        '/storage/cylin/grail/genomes/Mus_musculus/UCSC/mm10/Sequence/Chromosomes/',
        'HG38':
        '/storage/cylin/grail/genomes/Homo_sapiens/UCSC/hg38/Sequence/Chromosomes/',
    }

    mouse_convert_file = '%s/annotation/HMD_HumanPhenotype.rpt' % (whereAmI)

    #making a dictionary for mouse to human conversion
    mouse_convert_dict = defaultdict(str)

    mouse_convert_table = utils.parseTable(mouse_convert_file, '\t')
    for line in mouse_convert_table:
        mouse_convert_dict[line[4]] = line[0]

    genomeDirectory = genomeDirectoryDict[string.upper(genome)]

    #making a chrom_dict that is a list of all chroms with sequence
    chrom_list = utils.uniquify([
        name.split('.')[0] for name in os.listdir(genomeDirectory)
        if len(name) > 0
    ])

    annotFile = whereAmI + '/' + genomeDict[string.upper(genome)]

    if not skip_cache:
        # Try loading from a cache, if the crc32 matches
        annotPathHash = zlib.crc32(
            annotFile) & 0xFFFFFFFF  # hash the full path to the annotation file
        annotFileHash = zlib.crc32(open(annotFile, "rb").read()) & 0xFFFFFFFF

        cache_file_name = "%s.%s.%s.cache" % (genome, annotPathHash,
                                              annotFileHash)

        cache_file_path = '%s/%s' % (tempfile.gettempdir(), cache_file_name)
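        # e.g. cache_file_path = '/tmp/HG19.1639871234.987654321.cache'
        # (hypothetical hash values)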

        if os.path.isfile(cache_file_path):
            # Cache exists! Load it!
            try:
                print('\tLoading genome data from cache.')
                with open(cache_file_path, 'rb') as cache_fh:
                    cached_data = cPickle.load(cache_fh)
                    print('\tCache loaded.')
                return cached_data
            except (IOError, cPickle.UnpicklingError):
                # Pickle corrupt? Let's get rid of it.
                print('\tWARNING: Cache corrupt or unreadable. Ignoring.')
        else:
            print('\tNo cache exists: Loading annotation (slow).')

    # We're still here, so either caching was disabled, or the cache doesn't exist

    startDict = utils.makeStartDict(annotFile, geneList)
    tssLoci = []
    if geneList == []:
        geneList = startDict.keys()
    for gene in geneList:
        tssLoci.append(utils.makeTSSLocus(gene, startDict, window, window))

    tssCollection = utils.LocusCollection(tssLoci, 50)

    if not skip_cache:
        print('Writing cache for the first time.')
        with open(cache_file_path, 'wb') as cache_fh:
            cPickle.dump((startDict, tssCollection), cache_fh,
                         cPickle.HIGHEST_PROTOCOL)

    return startDict, tssCollection, genomeDirectory, chrom_list, mouse_convert_dict
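
# Usage sketch (hypothetical window), assuming the annotation and genome paths
# above exist on disk:
# startDict, tssCollection, genomeDirectory, chrom_list, mouse_convert_dict = \
#     loadAnnotFile('HG19', window=1000)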
Example #38
def main():

    '''
    using argparse

    '''

    parser = argparse.ArgumentParser(usage='%(prog)s -i DATAFILE -1 GROUP1_NAMES -2 GROUP2_NAMES')

    # required flags
    parser.add_argument("-d", "--data_table", dest="data_table", type=str,
                      help="input a data table with all datasets to be analyzed", required=True)
    parser.add_argument("-1", "--group1", dest="group1", type=str,
                      help="input a comma separated list of all datasets in group1", required=True)
    parser.add_argument("-2", "--group2", dest="group2", type=str,
                      help="input a comma separated list of all datasets in group2", required=True)


    #optional input override
    parser.add_argument("-i", "--input", dest="input", type=str,
                        help="input a gff of regions to analyze", required=False)


    #optional arguments
    parser.add_argument("-n", "--name", dest="name", type=str,
                      help="specify a name for the analysis. Default is drawn from the data table name", required=False)

    parser.add_argument("--group1-name", dest="group1_name", default='GROUP1',type=str,
                      help="Enter a name for group1.  Default is 'GROUP1'", required=False)
    parser.add_argument("--group2-name", dest="group2_name", default='GROUP2',type=str,
                      help="Enter a name for group2.  Default is 'GROUP2'", required=False)

    parser.add_argument("-a", "--activity", dest="activity", type=str,default='',
                      help="a table with active gene names in the first column", required=False)
    parser.add_argument("-t", "--tss", dest="tss", type=int,default=2500,
                      help="Specify a TSS exclusion distance. Default is 2500", required=False)
    parser.add_argument("-s", "--stitch", dest="stitch", type=int,default=None,
                      help="Specify a stitching distance. Default is auto stitching", required=False)



    parser.add_argument("-o", "--output", dest="output", default='./',type=str,
                      help="Enter the output folder. Default is the current working directory", required=False)

    parser.add_argument("--log", dest="log", default='',type=str,
                      help="Enter a path to log output", required=False)



#     # DEBUG OPTION TO SAVE TEMP FILES
#     parser.add_argument("--scale", dest="scale", default='',
#                       help="Enter a comma separated list of scaling factors for your bams. Default is none")
#     parser.add_argument("--save-temp", dest="save", action='store_true', default=False,
#                       help="If flagged will save temporary files made by bamPlot")
#     parser.add_argument("--bed", dest="bed",
#                       help="Add a space-delimited list of bed files to plot")
#     parser.add_argument("--multi-page", dest="multi", action='store_true', default=False,
#                       help="If flagged will create a new pdf for each region")

    args = parser.parse_args()



    #now we can begin to parse the arguments
    
    #=====================================================================================
    #===============================I. PARSING ARGUMENTS==================================
    #=====================================================================================
    #pulling in the data table
    data_file = os.path.abspath(args.data_table)
    dataDict = pipeline_dfci.loadDataTable(data_file)

    #setting naming conventions
    if not args.name:
        analysis_name = data_file.split('/')[-1].split('.')[0]
    else:
        analysis_name = args.name

    #getting the optional input gff
    if args.input:
        inputGFF = args.input
    else:
        inputGFF = ''

    #getting group names
    group1_name = args.group1_name
    group2_name = args.group2_name

    #getting group1 
    group1_string = args.group1
    group1_list = [name for name in string.split(group1_string,',') if len(name) > 0]

    #getting group2
    group2_string = args.group2
    group2_list = [name for name in string.split(group2_string,',') if len(name) > 0]

    #checking that all datasets are in the data table
    for name in group1_list + group2_list:
        if name not in dataDict:
            print('ERROR: DATASET %s NOT FOUND IN DATA TABLE %s. EXITING NOW' % (name,data_file))
            sys.exit()

    #loading in the genome object from the data table
    genome_list = utils.uniquify([dataDict[name]['genome'] for name in group1_list + group2_list])
    if len(genome_list) > 1:
        print('ERROR: ATTEMPTING TO ANALYZE DATASETS FROM MULTIPLE GENOME BUILDS. EXITING NOW.')
        sys.exit()

    
    #the load genome function has an assertion test to make sure the genome is supported
    genome = loadGenome(genome_list[0])

    
    parent_folder = utils.formatFolder(args.output,True)
    output_folder = utils.formatFolder(parent_folder + analysis_name,True)


    #these are the user defined optional arguments
    tss = int(args.tss)

    stitch = args.stitch
    print('stitch')
    print(stitch)

    
    #list of active genes to constrain analysis
    if len(args.activity) == 0:
        #assumes all genes are active unless told otherwise
        #activity_path,activity_table = getActivity() # fix this function
        print('using all active genes')
        activity_path = ''
        activity_table = []
    else:
        activity_path = args.activity
        activity_table = utils.parseTable(activity_path,'\t')




    print('\n\n#======================================\n#===========I. DATA SUMMARY============\n#======================================\n')

    print('Analyzing datasets described in %s\n' % (data_file))

    print('Name for the analysis: %s\n' % (analysis_name))
    print('Using genome: %s\n' % (genome.name()))


    
    print('%s datasets: %s\n' % (group1_name,group1_string))
    print('%s datasets: %s\n' % (group2_name,group2_string))

    if len(activity_path) > 0:
        print('Identified %s active genes in the analysis using %s as a list of active genes' % (len(activity_table),activity_path))
    else:
        print('Identified %s active genes in the analysis using aggregate data from %s and %s' % (len(activity_table),group1_name,group2_name))
    print('Writing output to: %s\n' % (output_folder))


    #=====================================================================================
    #======================II. DEFINING CIS-REGULATORY ELEMENTS===========================
    #=====================================================================================


    print('\n\n#======================================\n#=II. MAPPING CIS-REGULATORY ELEMENTS==\n#======================================\n')



    #crc_wrapper will act at the group level and not consider individual datasets
    #since a data table is used as the input, the code will rely heavily on pipeline_dfci
    #embedded tools

    #1. first we need to run meta rose using default parameters and check the output
    #exists for each group

    meta_rose_folder = utils.formatFolder(output_folder + 'meta_rose/',True)

    group1_output = '%s%s/%s_AllEnhancers.table.txt' % (meta_rose_folder,group1_name,group1_name)

    group2_output = '%s%s/%s_AllEnhancers.table.txt' % (meta_rose_folder,group2_name,group2_name)
    #print(group1_output)
    #print(group2_output)

    #for each output check to see if they exist
    #if not launch

    try:
        foo = open(group1_output,'r')
    except IOError:
        print('No META_ROSE output found for %s. Running META_ROSE now' % (group1_name))
        launchMetaRose(group1_name,group1_list,meta_rose_folder,genome,data_file,stitch,tss)
        
    try:
        foo = open(group2_output,'r')
    except IOError:
        print('No META_ROSE output found for %s. Running META_ROSE now' % (group2_name))
        launchMetaRose(group2_name,group2_list,meta_rose_folder,genome,data_file,stitch,tss)



    #now check for completion
    if utils.checkOutput(group1_output,1,10):
        print('META_ROSE finished for %s' % (group1_name))
    else:
        print('META_ROSE timed out for %s. EXITING NOW.' % (group1_name))
        sys.exit()

    if utils.checkOutput(group2_output,1,10):
        print('META_ROSE finished for %s' % (group2_name))
    else:
        print('META_ROSE timed out for %s. EXITING NOW.' % (group2_name))
        sys.exit()


    #Meta rose does not give all regions that are SE in at least one sample
    #and can be blown out by amplicons etc...
    #sooo we need to run clustering to generate a good input gff
    #ideally we just rewrite dynamic meta to run off of clustering output
    #until we do that let's just overwrite w/ an input gff
    

    print('Comparing cis-regulatory landscapes of %s and %s' % (group1_name,group2_name))
    dynamic_rose_folder = utils.formatFolder(output_folder + 'dynamic_meta_rose/',True)

    #here we will use the rank table as the primary output
    dynamic_rose_output = '%soutput/%s_%s_%s_merged_MERGED_SUPERS_RANK_TABLE.txt' % (dynamic_rose_folder,genome.name(),group1_name,group2_name)
    
    try:
        foo = open(dynamic_rose_output,'r')
    except IOError:
        print('No DYNAMIC_ROSE output found for %s. Running DYNAMIC_ROSE now' % (analysis_name))
        launchDynamicRose(analysis_name,group1_name,group2_name,group1_list,group2_list,meta_rose_folder,dynamic_rose_folder,genome,data_file,activity_path,inputGFF)

    if utils.checkOutput(dynamic_rose_output,1,10):
        print('DYNAMIC_ROSE finished for %s' % (analysis_name))
    else:
        print('DYNAMIC_ROSE analysis timed out for %s. EXITING NOW.' % (analysis_name))
        sys.exit()




    #=====================================================================================
    #======================III. IDENTIFYING TF NODES IN NETWORK===========================
    #=====================================================================================


    print('\n\n#======================================\n#===III. RUNNING CIRCUITRY ANALYSIS====\n#======================================\n')




    #now we want to call circuitry on each group... ok to have different subpeaks and motif calls
    #if as a first approximation we weight by the overall enhancer




    crc_folder = utils.formatFolder('%scrc/' % (output_folder),True)



    #for all
    all_crc_folder = utils.formatFolder('%s%s' % (crc_folder,analysis_name),True)
    launchCRC(data_file,genome,dynamic_rose_output,analysis_name,group1_list+group2_list,all_crc_folder,activity_path)



    #for group1
    group1_crc_folder = utils.formatFolder('%s%s' % (crc_folder,group1_name),True)
    launchCRC(data_file,genome,dynamic_rose_output,group1_name,group1_list,group1_crc_folder,activity_path)

    #for group2
    group2_crc_folder = utils.formatFolder('%s%s' % (crc_folder,group2_name),True)
    launchCRC(data_file,genome,dynamic_rose_output,group2_name,group2_list,group2_crc_folder,activity_path)
Example #39
def main():
    '''
    main run call
    '''
    debug = False

    from optparse import OptionParser
    usage = "usage: %prog [options] -g [GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]"
    parser = OptionParser(usage=usage)
    # required flags
    parser.add_option(
        "-i",
        "--i",
        dest="input",
        nargs=1,
        default=None,
        help="Enter a .gff or .bed file of binding sites used to make enhancers"
    )
    parser.add_option("-r",
                      "--rankby",
                      dest="rankby",
                      nargs=1,
                      default=None,
                      help="bamfile to rank enhancer by")
    parser.add_option("-o",
                      "--out",
                      dest="out",
                      nargs=1,
                      default=None,
                      help="Enter an output folder")
    parser.add_option("-g",
                      "--genome",
                      dest="genome",
                      nargs=1,
                      default=None,
                      help="Enter the genome build (MM9,MM8,HG18,HG19)")

    # optional flags
    parser.add_option(
        "-b",
        "--bams",
        dest="bams",
        nargs=1,
        default=None,
        help="Enter a comma separated list of additional bam files to map to")
    parser.add_option("-c",
                      "--control",
                      dest="control",
                      nargs=1,
                      default=None,
                      help="bamfile to rank enhancer by")
    parser.add_option(
        "-s",
        "--stitch",
        dest="stitch",
        nargs=1,
        default='',
        help=
        "Enter a max linking distance for stitching. Default will determine optimal stitching parameter"
    )
    parser.add_option(
        "-t",
        "--tss",
        dest="tss",
        nargs=1,
        default=0,
        help="Enter a distance from TSS to exclude. 0 = no TSS exclusion")

    parser.add_option(
        "--mask",
        dest="mask",
        nargs=1,
        default=None,
        help=
        "Mask a set of regions from analysis.  Provide a .bed or .gff of masking regions"
    )

    # RETRIEVING FLAGS
    (options, args) = parser.parse_args()

    if not options.input or not options.rankby or not options.out or not options.genome:
        print('hi there')
        parser.print_help()
        exit()

    # making the out folder if it doesn't exist
    outFolder = utils.formatFolder(options.out, True)

    # figuring out folder schema
    gffFolder = utils.formatFolder(outFolder + 'gff/', True)
    mappedFolder = utils.formatFolder(outFolder + 'mappedGFF/', True)

    # GETTING INPUT FILE
    if options.input.split('.')[-1] == 'bed':
        # CONVERTING A BED TO GFF
        inputGFFName = options.input.split('/')[-1][0:-4]
        inputGFFFile = '%s%s.gff' % (gffFolder, inputGFFName)
        utils.bedToGFF(options.input, inputGFFFile)
    elif options.input.split('.')[-1] == 'gff':
        # COPY THE INPUT GFF TO THE GFF FOLDER
        inputGFFFile = options.input
        os.system('cp %s %s' % (inputGFFFile, gffFolder))

    else:
        print(
            'WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT'
        )
        # COPY THE INPUT GFF TO THE GFF FOLDER
        inputGFFFile = options.input
        os.system('cp %s %s' % (inputGFFFile, gffFolder))

    # GETTING THE LIST OF BAMFILES TO PROCESS
    if options.control:
        bamFileList = [options.rankby, options.control]

    else:
        bamFileList = [options.rankby]

    if options.bams:
        bamFileList += options.bams.split(',')
        #bamFileList = utils.uniquify(bamFileList) # makes sad when you have the same control bam over and over again
    # optional args

    # Stitch parameter
    if options.stitch == '':
        stitchWindow = ''
    else:
        stitchWindow = int(options.stitch)

    # tss options
    tssWindow = int(options.tss)
    if tssWindow != 0:
        removeTSS = True
    else:
        removeTSS = False

    # GETTING THE BOUND REGION FILE USED TO DEFINE ENHANCERS
    print('USING %s AS THE INPUT GFF' % (inputGFFFile))
    inputName = inputGFFFile.split('/')[-1].split('.')[0]

    # GETTING THE GENOME
    genome = options.genome
    print('USING %s AS THE GENOME' % genome)

    # GETTING THE CORRECT ANNOT FILE
    cwd = os.getcwd()
    genomeDict = {
        'HG18': '%s/annotation/hg18_refseq.ucsc' % (cwd),
        'MM9': '%s/annotation/mm9_refseq.ucsc' % (cwd),
        'HG19': '%s/annotation/hg19_refseq.ucsc' % (cwd),
        'MM8': '%s/annotation/mm8_refseq.ucsc' % (cwd),
        'MM10': '%s/annotation/mm10_refseq.ucsc' % (cwd),
        'RN4': '%s/annotation/rn4_refseq.ucsc' % (cwd),
        'RN6': '%s/annotation/rn6_refseq.ucsc' % (cwd),
    }

    annotFile = genomeDict[genome.upper()]

    # MAKING THE START DICT
    print('MAKING START DICT')
    startDict = utils.makeStartDict(annotFile)

    #GET CHROMS FOUND IN THE BAMS
    print('GETTING CHROMS IN BAMFILES')
    bamChromList = getBamChromList(bamFileList)
    print("USING THE FOLLOWING CHROMS")
    print(bamChromList)

    #LOADING IN THE GFF AND FILTERING BY CHROM
    print('LOADING AND FILTERING THE GFF')
    inputGFF = filterGFF(inputGFFFile, bamChromList)
    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print('LOADING IN GFF REGIONS')
    referenceCollection = utils.gffToLocusCollection(inputGFF)

    print('CHECKING REFERENCE COLLECTION:')
    checkRefCollection(referenceCollection)

    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    if options.mask:
        maskFile = options.mask
        # if it's a bed file
        if maskFile.split('.')[-1].upper() == 'BED':
            maskGFF = utils.bedToGFF(maskFile)
        elif maskFile.split('.')[-1].upper() == 'GFF':
            maskGFF = utils.parseTable(maskFile, '\t')
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            sys.exit()
        maskCollection = utils.gffToLocusCollection(maskGFF)

        # now mask the reference loci
        referenceLoci = referenceCollection.getLoci()
        filteredLoci = [
            locus for locus in referenceLoci
            if len(maskCollection.getOverlap(locus, 'both')) == 0
        ]
        print("FILTERED OUT %s LOCI THAT WERE MASKED IN %s" %
              (len(referenceLoci) - len(filteredLoci), maskFile))
        referenceCollection = utils.LocusCollection(filteredLoci, 50)

    # NOW STITCH REGIONS
    print('STITCHING REGIONS TOGETHER')
    stitchedCollection, debugOutput, stitchWindow = regionStitching(
        referenceCollection, inputName, outFolder, stitchWindow, tssWindow,
        annotFile, removeTSS)

    # NOW MAKE A STITCHED COLLECTION GFF
    print('MAKING GFF FROM STITCHED COLLECTION')
    stitchedGFF = utils.locusCollectionToGFF(stitchedCollection)
    # making sure start/stop ordering are correct
    for i in range(len(stitchedGFF)):

        line = stitchedGFF[i]
        start = int(line[3])
        stop = int(line[4])
        if start > stop:
            line[3] = stop
            line[4] = start

    print(stitchWindow)
    print(type(stitchWindow))
    if not removeTSS:
        stitchedGFFFile = '%s%s_%sKB_STITCHED.gff' % (gffFolder, inputName,
                                                      str(stitchWindow / 1000))
        stitchedGFFName = '%s_%sKB_STITCHED' % (inputName,
                                                str(stitchWindow / 1000))
        debugOutFile = '%s%s_%sKB_STITCHED.debug' % (gffFolder, inputName,
                                                     str(stitchWindow / 1000))
    else:
        stitchedGFFFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.gff' % (
            gffFolder, inputName, str(stitchWindow / 1000))
        stitchedGFFName = '%s_%sKB_STITCHED_TSS_DISTAL' % (
            inputName, str(stitchWindow / 1000))
        debugOutFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.debug' % (
            gffFolder, inputName, str(stitchWindow / 1000))

    # WRITING DEBUG OUTPUT TO DISK

    if debug:
        print('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile))
        utils.unParseTable(debugOutput, debugOutFile, '\t')

    # WRITE THE GFF TO DISK
    print('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile))
    utils.unParseTable(stitchedGFF, stitchedGFFFile, '\t')

    # SETTING UP THE OVERALL OUTPUT FILE
    outputFile1 = outFolder + stitchedGFFName + '_ENHANCER_REGION_MAP.txt'
    print('OUTPUT WILL BE WRITTEN TO  %s' % (outputFile1))

    # MAPPING TO THE NON STITCHED (ORIGINAL GFF)
    # MAPPING TO THE STITCHED GFF

    # Try to use the bamliquidator_batch.py script on cluster, otherwise fail over to local (in path), otherwise fail.
    bamliquidator_path = 'bamliquidator_batch.py'

    bamFileListUnique = list(bamFileList)
    bamFileListUnique = utils.uniquify(bamFileListUnique)
    #prevent redundant mapping
    print("MAPPING TO THE FOLLOWING BAMS:")
    print(bamFileListUnique)
    for bamFile in bamFileListUnique:

        bamFileName = bamFile.split('/')[-1]

        # MAPPING TO THE STITCHED GFF
        mappedOut1Folder = '%s%s_%s_MAPPED' % (mappedFolder, stitchedGFFName,
                                               bamFileName)
        mappedOut1File = '%s%s_%s_MAPPED/matrix.txt' % (
            mappedFolder, stitchedGFFName, bamFileName)
        if utils.checkOutput(mappedOut1File, 0.2, 0.2):
            print("FOUND %s MAPPING DATA FOR BAM: %s" %
                  (stitchedGFFFile, mappedOut1File))
        else:
            cmd1 = bamliquidator_path + " --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % (
                stitchedGFFFile, mappedOut1Folder, bamFile)
            print(cmd1)

            os.system(cmd1)
            if utils.checkOutput(mappedOut1File, 0.2, 5):
                print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" %
                      (stitchedGFFFile, bamFileName))
            else:
                print("ERROR: FAILED TO MAP %s FROM BAM: %s" %
                      (stitchedGFFFile, bamFileName))
                sys.exit()

    print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS')
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    mapCollection(stitchedCollection,
                  referenceCollection,
                  bamFileList,
                  mappedFolder,
                  outputFile1,
                  refName=stitchedGFFName)

    print('CALLING AND PLOTTING SUPER-ENHANCERS')

    if options.control:

        rankbyName = options.rankby.split('/')[-1]
        controlName = options.control.split('/')[-1]
        cmd = 'R --no-save %s %s %s %s < ROSE2_callSuper.R' % (
            outFolder, outputFile1, inputName, controlName)

    else:
        rankbyName = options.rankby.split('/')[-1]
        controlName = 'NONE'
        cmd = 'R --no-save %s %s %s %s < ROSE2_callSuper.R' % (
            outFolder, outputFile1, inputName, controlName)
    print(cmd)

    os.system(cmd)

    # calling the gene mapper
    time.sleep(20)
    superTableFile = "%s_SuperEnhancers.table.txt" % (inputName)
    if options.control:
        cmd = "python ROSE2_geneMapper.py -g %s -r %s -c %s -i %s%s &" % (
            genome, options.rankby, options.control, outFolder, superTableFile)
    else:
        cmd = "python ROSE2_geneMapper.py -g %s -r %s -i %s%s &" % (
            genome, options.rankby, outFolder, superTableFile)
    os.system(cmd)

    stretchTableFile = "%s_StretchEnhancers.table.txt" % (inputName)
    if options.control:
        cmd = "python ROSE2_geneMapper.py -g %s -r %s -c %s -i %s%s &" % (
            genome, options.rankby, options.control, outFolder,
            stretchTableFile)
    else:
        cmd = "python ROSE2_geneMapper.py -g %s -r %s -i %s%s &" % (
            genome, options.rankby, outFolder, stretchTableFile)
    os.system(cmd)

    superStretchTableFile = "%s_SuperStretchEnhancers.table.txt" % (inputName)
    if options.control:
        cmd = "python ROSE2_geneMapper.py -g %s -r %s -c %s -i %s%s &" % (
            genome, options.rankby, options.control, outFolder,
            superStretchTableFile)
    else:
        cmd = "python ROSE2_geneMapper.py -g %s -r %s -i %s%s &" % (
            genome, options.rankby, outFolder, superStretchTableFile)
    os.system(cmd)
Example #40
def findCanidateTFs(annotationFile, enhancerLoci, expressedNM, expressionDictNM,
                    bamFile, TFlist, refseqToNameDict, projectFolder, projectName, promoter):
    '''
    Assign each Super-Enhancer to the closest active TSS to its center
    Return a dictionary keyed by TF that points to a list of loci
    '''

    print 'FINDING CANDIDATE TFs'

    enhancerAssignment = []
    TFtoEnhancerDict = defaultdict(list)

    startDict = utils.makeStartDict(annotationFile)    

    tssLoci = []
    for gene in expressedNM:
        tssLoci.append(utils.makeTSSLocus(gene,startDict,1000,1000))
    tssCollection = utils.LocusCollection(tssLoci,50)    


    # Loop through enhancers
    for enhancer in enhancerLoci:
        

        # If the enhancer overlaps a TSS, save it
        overlappingLoci = tssCollection.getOverlap(enhancer, 'both')
        overlappingGenes =[]
        for overlapLocus in overlappingLoci:
            overlappingGenes.append(overlapLocus.ID())

        # Find all gene TSS within 100 kb
        proximalLoci = tssCollection.getOverlap(utils.makeSearchLocus(enhancer,100000,100000),'both')
        proximalGenes =[]
        for proxLocus in proximalLoci:
            proximalGenes.append(proxLocus.ID())
        
        # If no genes are within 100 kb, find the closest active gene
        closestGene = ''
        if len(overlappingGenes) == 0 and len(proximalGenes) == 0:
        
            distalLoci = tssCollection.getOverlap(utils.makeSearchLocus(enhancer,1000000,1000000),'both')
            distalGenes =[]
            for distalLocus in distalLoci:
                distalGenes.append(distalLocus.ID())

            enhancerCenter = (int(enhancer.start()) + int(enhancer.end())) / 2
            distList = [abs(enhancerCenter - startDict[geneID]['start'][0])
                        for geneID in distalGenes]
            if distList:
                closestGene = distalGenes[distList.index(min(distList))]


        overlappingGenes = utils.uniquify(overlappingGenes)
        proximalGenes = utils.uniquify(proximalGenes)
        for refID in overlappingGenes:
            if proximalGenes.count(refID) == 1:
                proximalGenes.remove(refID)
 

        # If a TSS overlaps an enhancer, assign them together
        if overlappingGenes:
            for gene in overlappingGenes:
                if gene in TFlist:
                    TFtoEnhancerDict[gene].append(enhancer)
                    enhancerAssignment.append([gene, enhancer.chr(), enhancer.start(), enhancer.end(), enhancer.ID()])
                
        # Otherwise, assign the enhancer to the most active gene in 100 kb
        elif not overlappingGenes and proximalGenes:
            highestGene = ''
            highestActivity = 0
            for gene in proximalGenes:
                if expressionDictNM[gene] > highestActivity:
                    highestActivity = expressionDictNM[gene]
                    highestGene = gene
            if highestGene in TFlist:
                #use the most active gene, not the last gene visited by the loop above
                TFtoEnhancerDict[highestGene].append(enhancer)
                enhancerAssignment.append([highestGene, enhancer.chr(), enhancer.start(), enhancer.end(), enhancer.ID()])
            
        elif not overlappingGenes and not proximalGenes and closestGene:
            if closestGene in TFlist:
                gene = closestGene
                TFtoEnhancerDict[gene].append(enhancer)
                enhancerAssignment.append([gene, enhancer.chr(), enhancer.start(), enhancer.end(), enhancer.ID()])

    # Add the promoter if it's not contained in the super
    if promoter:
        for gene in TFtoEnhancerDict.keys():
            #use a distinct name so the promoter flag isn't overwritten
            promoterLocus = utils.Locus(startDict[gene]['chr'], int(startDict[gene]['start'][0]) - 2000,
                                        int(startDict[gene]['start'][0]) + 2000, startDict[gene]['sense'])
            overlapBool = False
            for enhancer in TFtoEnhancerDict[gene]:
                if promoterLocus.overlaps(enhancer):
                    overlapBool = True
            if not overlapBool:
                TFtoEnhancerDict[gene].append(promoterLocus)

    seAssignmentFile = projectFolder + projectName + '_ENHANCER_ASSIGNMENT.txt'
    utils.unParseTable(enhancerAssignment, seAssignmentFile, '\t')

    return TFtoEnhancerDict
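
# Usage sketch (hypothetical inputs), assuming the variables are prepared by
# the surrounding pipeline:
# TFtoEnhancerDict = findCanidateTFs(annotationFile, seLoci, expressedNM,
#                                    expressionDictNM, bamFile, TFlist,
#                                    refseqToNameDict, projectFolder,
#                                    projectName, promoter=True)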
Example #41
def main():
    '''
    main run call
    '''
    debug = False

    from optparse import OptionParser
    usage = "usage: %prog [options] -g [GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]"
    parser = OptionParser(usage=usage)
    # required flags
    parser.add_option(
        "-i",
        "--i",
        dest="input",
        nargs=1,
        default=None,
        help=
        "Enter a comma separated list of .gff or .bed file of binding sites used to make enhancers"
    )
    parser.add_option("-r",
                      "--rankby",
                      dest="rankby",
                      nargs=1,
                      default=None,
                      help="Enter a comma separated list of bams to rank by")
    parser.add_option("-o",
                      "--out",
                      dest="out",
                      nargs=1,
                      default=None,
                      help="Enter an output folder")
    parser.add_option("-g",
                      "--genome",
                      dest="genome",
                      nargs=1,
                      default=None,
                      help="Enter the genome build (MM9,MM8,HG18,HG19)")

    # optional flags
    parser.add_option(
        "-n",
        "--name",
        dest="name",
        nargs=1,
        default=None,
        help="Provide a name for the analysis otherwise ROSE will guess")
    parser.add_option(
        "-c",
        "--control",
        dest="control",
        nargs=1,
        default=None,
        help=
        "Enter a comma separated list of control bams. Can either provide a single control bam for all rankby bams, or provide a control bam for each individual bam"
    )
    parser.add_option(
        "-s",
        "--stitch",
        dest="stitch",
        nargs=1,
        default='',
        help=
        "Enter a max linking distance for stitching. Default will determine optimal stitching parameter"
    )
    parser.add_option(
        "-t",
        "--tss",
        dest="tss",
        nargs=1,
        default=0,
        help="Enter a distance from TSS to exclude. 0 = no TSS exclusion")

    parser.add_option(
        "--mask",
        dest="mask",
        nargs=1,
        default=None,
        help=
        "Mask a set of regions from analysis.  Provide a .bed or .gff of masking regions"
    )

    # RETRIEVING FLAGS
    (options, args) = parser.parse_args()

    if not options.input or not options.rankby or not options.out or not options.genome:
        print('hi there')
        parser.print_help()
        exit()

    # making the out folder if it doesn't exist
    outFolder = utils.formatFolder(options.out, True)

    # figuring out folder schema
    gffFolder = utils.formatFolder(outFolder + 'gff/', True)
    mappedFolder = utils.formatFolder(outFolder + 'mappedGFF/', True)

    # GETTING INPUT FILE(s)

    inputList = [
        inputFile for inputFile in options.input.split(',')
        if len(inputFile) > 1
    ]

    #converting all input files into GFFs and moving into the GFF folder
    inputGFFList = []
    for inputFile in inputList:
        if inputFile.split('.')[-1] == 'bed':
            # CONVERTING A BED TO GFF
            inputGFFName = inputFile.split('/')[-1][
                0:-4]  #strips the last 4 characters i.e. '.bed'
            inputGFFFile = '%s%s.gff' % (gffFolder, inputGFFName)
            utils.bedToGFF(inputFile, inputGFFFile)
        elif inputFile.split('.')[-1] == 'gff':
            # COPY THE INPUT GFF TO THE GFF FOLDER

            os.system('cp %s %s' % (inputFile, gffFolder))
            inputGFFFile = '%s%s' % (gffFolder, inputFile.split('/')[-1])

        else:
            print(
                'WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT'
            )
            # COPY THE INPUT GFF TO THE GFF FOLDER
            os.system('cp %s %s' % (inputFile, gffFolder))
            inputGFFFile = '%s%s' % (gffFolder, inputFile.split('/')[-1])
        inputGFFList.append(inputGFFFile)

    # GETTING THE LIST OF BAMFILES TO PROCESS
    #either same number of bams for rankby and control
    #or only 1 control #or none!
    #bamlist should be all rankby bams followed by control bams

    bamFileList = []
    if options.control:
        controlBamList = [
            bam for bam in options.control.split(',') if len(bam) > 0
        ]
        rankbyBamList = [
            bam for bam in options.rankby.split(',') if len(bam) > 0
        ]

        if len(controlBamList) == len(rankbyBamList):
            #case where an equal number of backgrounds are given
            bamFileList = rankbyBamList + controlBamList
        elif len(controlBamList) == 1:
            #case where a universal background is applied
            bamFileList = rankbyBamList + controlBamList * len(rankbyBamList)
        else:
            print(
                'ERROR: EITHER PROVIDE A SINGLE CONTROL BAM FOR ALL SAMPLES, OR ONE CONTROL BAM FOR EACH SAMPLE'
            )
            sys.exit()
    else:
        bamFileList = [
            bam for bam in options.rankby.split(',') if len(bam) > 0
        ]

    # Stitch parameter
    if options.stitch == '':
        stitchWindow = ''
    else:
        stitchWindow = int(options.stitch)

    # tss options
    tssWindow = int(options.tss)
    if tssWindow != 0:
        removeTSS = True
    else:
        removeTSS = False

    # GETTING THE GENOME
    genome = string.upper(options.genome)
    print('USING %s AS THE GENOME' % (genome))

    # GETTING THE CORRECT ANNOT FILE

    genomeDict = {
        'HG18': '%s/annotation/hg18_refseq.ucsc' % (pipeline_dir),
        'MM9': '%s/annotation/mm9_refseq.ucsc' % (pipeline_dir),
        'HG19': '%s/annotation/hg19_refseq.ucsc' % (pipeline_dir),
        'MM8': '%s/annotation/mm8_refseq.ucsc' % (pipeline_dir),
        'MM10': '%s/annotation/mm10_refseq.ucsc' % (pipeline_dir),
        'RN4': '%s/annotation/rn4_refseq.ucsc' % (pipeline_dir),
    }

    try:
        annotFile = genomeDict[genome.upper()]
    except KeyError:
        print('ERROR: UNSUPPORTED GENOME TYPE %s' % (genome))
        sys.exit()

    #FINDING THE ANALYSIS NAME
    if options.name:
        inputName = options.name
    else:
        inputName = inputGFFList[0].split('/')[-1].split('.')[0]
    print('USING %s AS THE ANALYSIS NAME' % (inputName))

    print('FORMATTING INPUT REGIONS')
    # MAKING THE RAW INPUT FILE FROM THE INPUT GFFs
    #use a simpler unique region naming system
    if len(inputGFFList) == 1:
        inputGFF = utils.parseTable(inputGFFList[0], '\t')
    else:
        inputLoci = []
        for gffFile in inputGFFList:
            print('\tprocessing %s' % (gffFile))
            gff = utils.parseTable(gffFile, '\t')
            gffCollection = utils.gffToLocusCollection(gff, 50)
            inputLoci += gffCollection.getLoci()

        inputCollection = utils.LocusCollection(inputLoci, 50)
        inputCollection = inputCollection.stitchCollection(
        )  # stitches to produce unique regions

        inputGFF = utils.locusCollectionToGFF(inputCollection)

    formattedGFF = []
    #now number things appropriately
    for i, line in enumerate(inputGFF):

        #use the coordinates to make a new id inputname_chr_sense_start_stop
        chrom = line[0]
        coords = [int(line[3]), int(line[4])]
        sense = line[6]

        lineID = '%s_%s' % (inputName, str(i + 1))  #1 indexing

        newLine = [
            chrom, lineID, lineID,
            min(coords),
            max(coords), '', sense, '', lineID
        ]
        formattedGFF.append(newLine)
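    # each formatted line looks like (hypothetical values):
    # ['chr1', 'MYNAME_1', 'MYNAME_1', 10000, 11000, '', '+', '', 'MYNAME_1']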

    #name of the master input gff file
    masterGFFFile = '%s%s_%s_ALL_-0_+0.gff' % (gffFolder, string.upper(genome),
                                               inputName)
    utils.unParseTable(formattedGFF, masterGFFFile, '\t')

    print('USING %s AS THE INPUT GFF' % (masterGFFFile))

    # MAKING THE START DICT
    print('MAKING START DICT')
    startDict = utils.makeStartDict(annotFile)

    #GET CHROMS FOUND IN THE BAMS
    print('GETTING CHROMS IN BAMFILES')
    bamChromList = getBamChromList(bamFileList)
    print("USING THE FOLLOWING CHROMS")
    print(bamChromList)

    #LOADING IN THE GFF AND FILTERING BY CHROM
    print('LOADING AND FILTERING THE GFF')
    inputGFF = filterGFF(masterGFFFile, bamChromList)
    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print('LOADING IN GFF REGIONS')
    referenceCollection = utils.gffToLocusCollection(inputGFF)

    print('CHECKING REFERENCE COLLECTION:')
    checkRefCollection(referenceCollection)

    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    if options.mask:
        maskFile = options.mask
        # if it's a bed file
        if maskFile.split('.')[-1].upper() == 'BED':
            maskGFF = utils.bedToGFF(maskFile)
        elif maskFile.split('.')[-1].upper() == 'GFF':
            maskGFF = utils.parseTable(maskFile, '\t')
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            sys.exit()
        maskCollection = utils.gffToLocusCollection(maskGFF)

        # now mask the reference loci
        referenceLoci = referenceCollection.getLoci()
        filteredLoci = [
            locus for locus in referenceLoci
            if len(maskCollection.getOverlap(locus, 'both')) == 0
        ]
        print("FILTERED OUT %s LOCI THAT WERE MASKED IN %s" %
              (len(referenceLoci) - len(filteredLoci), maskFile))
        referenceCollection = utils.LocusCollection(filteredLoci, 50)

    # NOW STITCH REGIONS
    print('STITCHING REGIONS TOGETHER')
    stitchedCollection, debugOutput, stitchWindow = regionStitching(
        referenceCollection, inputName, outFolder, stitchWindow, tssWindow,
        annotFile, removeTSS)

    # NOW MAKE A STITCHED COLLECTION GFF
    print('MAKING GFF FROM STITCHED COLLECTION')
    stitchedGFF = utils.locusCollectionToGFF(stitchedCollection)

    print(stitchWindow)
    print(type(stitchWindow))
    if not removeTSS:
        stitchedGFFFile = '%s%s_%sKB_STITCHED.gff' % (gffFolder, inputName,
                                                      str(stitchWindow / 1000))
        stitchedGFFName = '%s_%sKB_STITCHED' % (inputName,
                                                str(stitchWindow / 1000))
        debugOutFile = '%s%s_%sKB_STITCHED.debug' % (gffFolder, inputName,
                                                     str(stitchWindow / 1000))
    else:
        stitchedGFFFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.gff' % (
            gffFolder, inputName, str(stitchWindow / 1000))
        stitchedGFFName = '%s_%sKB_STITCHED_TSS_DISTAL' % (
            inputName, str(stitchWindow / 1000))
        debugOutFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.debug' % (
            gffFolder, inputName, str(stitchWindow / 1000))

    # WRITING DEBUG OUTPUT TO DISK

    if debug:
        print('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile))
        utils.unParseTable(debugOutput, debugOutFile, '\t')

    # WRITE THE GFF TO DISK
    print('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile))
    utils.unParseTable(stitchedGFF, stitchedGFFFile, '\t')

    # SETTING UP THE OVERALL OUTPUT FILE
    outputFile1 = outFolder + stitchedGFFName + '_ENHANCER_REGION_MAP.txt'
    print('OUTPUT WILL BE WRITTEN TO  %s' % (outputFile1))

    # MAPPING TO THE NON STITCHED (ORIGINAL GFF)
    # MAPPING TO THE STITCHED GFF

    # Try to use the bamliquidator_batch.py script on cluster, otherwise fail over to local (in path), otherwise fail.
    bamliquidator_path = 'bamliquidator_batch.py'

    bamFileListUnique = list(bamFileList)
    bamFileListUnique = utils.uniquify(bamFileListUnique)
    #prevent redundant mapping
    print("MAPPING TO THE FOLLOWING BAMS:")
    print(bamFileListUnique)
    for bamFile in bamFileListUnique:

        bamFileName = bamFile.split('/')[-1]

        # MAPPING TO THE STITCHED GFF
        mappedOut1Folder = '%s%s_%s_MAPPED' % (mappedFolder, stitchedGFFName,
                                               bamFileName)
        mappedOut1File = '%s%s_%s_MAPPED/matrix.txt' % (
            mappedFolder, stitchedGFFName, bamFileName)
        if utils.checkOutput(mappedOut1File, 0.2, 0.2):
            print("FOUND %s MAPPING DATA FOR BAM: %s" %
                  (stitchedGFFFile, mappedOut1File))
        else:
            cmd1 = bamliquidator_path + " --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % (
                stitchedGFFFile, mappedOut1Folder, bamFile)
            print(cmd1)

            os.system(cmd1)
            if utils.checkOutput(mappedOut1File, 0.2, 5):
                print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" %
                      (stitchedGFFFile, bamFileName))
            else:
                print("ERROR: FAILED TO MAP %s FROM BAM: %s" %
                      (stitchedGFFFile, bamFileName))
                sys.exit()

    print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS')
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    mapCollection(stitchedCollection,
                  referenceCollection,
                  bamFileList,
                  mappedFolder,
                  outputFile1,
                  refName=stitchedGFFName)

    print('FINDING AVERAGE SIGNAL AMONGST BAMS')
    metaOutputFile = collapseRegionMap(outputFile1,
                                       inputName + '_MERGED_SIGNAL',
                                       controlBams=options.control)

    #now try the merging

    print('CALLING AND PLOTTING SUPER-ENHANCERS')

    rankbyName = inputName + '_MERGED_SIGNAL'
    controlName = 'NONE'
    cmd = 'Rscript %sROSE2_callSuper.R %s %s %s %s' % (
        pipeline_dir, outFolder, metaOutputFile, inputName, controlName)
    print(cmd)

    os.system(cmd)

    # calling the gene mapper
    print('CALLING GENE MAPPING')

    superTableFile = "%s_SuperEnhancers.table.txt" % (inputName)

    #for now don't use ranking bam to call top genes
    cmd = "python %sROSE2_geneMapper.py -g %s -i %s%s -f" % (
        pipeline_dir, genome, outFolder, superTableFile)
    print(cmd)
    os.system(cmd)

    stretchTableFile = "%s_StretchEnhancers.table.txt" % (inputName)

    cmd = "python %sROSE2_geneMapper.py -g %s -i %s%s -f" % (
        pipeline_dir, genome, outFolder, stretchTableFile)
    print(cmd)
    os.system(cmd)

    superStretchTableFile = "%s_SuperStretchEnhancers.table.txt" % (inputName)

    cmd = "python %sROSE2_geneMapper.py -g %s -i %s%s -f" % (
        pipeline_dir, genome, outFolder, superStretchTableFile)
    os.system(cmd)
Example #42
def main():

    from optparse import OptionParser

    usage = "usage: %prog [options] -e [ENHANCER_FILE] -b [BAM_FILE] -g [GENOME] -o [OUTPUTFOLDER] -n [NAME]" 
    parser = OptionParser(usage = usage)

    #required flags                                                                                                                                        
    parser.add_option("-e","--enhancer_file", dest="enhancers",nargs = 1, default=None,
                      help = "Provide a ROSE generated enhancer table (_AllEnhancers.table.txt)")
    parser.add_option("-b","--bam",dest="bam",nargs =1, default = None,
                      help = "Provide a bam that corresponds to the super enhancer table")
    parser.add_option("-g","--genome",dest="genome",nargs =1, default = None,
                      help = "Provide the build of the genome to be used for the analysis. Currently supports HG19, HG18 and MM9")
    parser.add_option("-o","--output",dest="output",nargs =1, default = None,
                      help = "Enter an output folder")
    parser.add_option("-n","--name",dest="name",nargs =1, default = None,
                      help = "Provide a name for the job")

    #additional options                                                                                                               

    parser.add_option("-s","--subpeaks", dest="subpeaks",nargs=1,default=None,
                      help = "Enter a BED file of regions to search for motifs")                     
    parser.add_option("-x","--expCutoff", dest="expCutoff",nargs=1,default=33,
                      help = "Enter the expression cutoff to be used to define canidate TFs")
    parser.add_option("-l","--extension-length", dest="extension",nargs = 1, default=100,
                      help = "Enter the length to extend subpeak regions for motif finding")
    parser.add_option("-B","--background", dest="background",nargs = 1, default=None,
                      help = "Provide a background BAM file")
    parser.add_option("-a","--activity", dest="activity",nargs = 1, default=None,
                      help = "A table with refseq in the first column and activity (expression or promoter acetylation) in second")
    parser.add_option("-E","--enhancer_number", dest="Enumber",nargs = 1, default='super',
                      help = "Enter the number of top ranked enhancers to include in the anlaysis. Default is all super-enhancers")
    parser.add_option("-N", "--number", dest="number",nargs = 1, default=2,
                      help = "Enter the number of motifs required to assign a binding event")     #I have modified the destination of -N option so that it is different from the destination of -E option
    parser.add_option("--promoter", dest="promoter",nargs = 1, default=False,
                      help = "Enter True if the promoters should be included in the analysis")
    parser.add_option("--motifs", dest="motifs",nargs = 1, default=False,
                      help = "Enter an alternative PWM file for the analysis")
    parser.add_option("-t","--tfs", dest="tfs",nargs=1,default=None,
                      help = "Enter additional TFs (comma separated) to be used in the bindinf analysis")
    parser.add_option("-u","--ucsc", dest="is_ucsc", action='store_true', default=False,
                      help = "If set, use the ucsc folders or files with chromosome names as chr1, chr2, etc.")

    (options,args) = parser.parse_args()

    print(options)

    if options.enhancers and options.genome and options.output and options.name:

        ###
        # Define all global file names
        ###
        
        # motifConvertFile is needed downstream (findMotifs, buildGraph) regardless
        # of whether a custom PWM file is supplied, so set it unconditionally
        motifConvertFile = '/home/rad/users/gaurav/projects/ctrc/scripts/CLL_TFnetworks_2018/annotations/MotifDictionary.txt'
        if options.motifs:
            motifDatabaseFile = options.motifs
        else:
            motifDatabaseFile = '/home/rad/users/gaurav/projects/ctrc/scripts/CLL_TFnetworks_2018/annotations/VertebratePWMs.txt'


        # User input files
        enhancerFile = options.enhancers
        enhancerTable = utils.parseTable(enhancerFile, '\t')

        if options.bam:
            bamFile = options.bam
            bam = utils.Bam(bamFile)

        if options.background:
            background = options.background
        else:
            background = None

        
        genome = options.genome
        genome = upper(genome)
        if genome == 'HG19':
            genomeDirectory = '/home/rad/packages/data/fasta/human/hg19/chromosomes/'
            annotationFile = '/home/rad/users/gaurav/projects/ctrc/scripts/pipeline/annotation/hg19_refseq.ucsc'
            TFfile = '/home/rad/users/gaurav/projects/ctrc/scripts/CLL_TFnetworks_2018/annotations/TFlist_NMid_hg19.txt'

        if genome == 'HG18':
            genomeDirectory = '/grail/genomes/Homo_sapiens/human_gp_mar_06_no_random/fasta/'
            annotationFile = '/ark/home/cl512/src/pipeline/annotation/hg18_refseq.ucsc'
            TFfile = '/home/rad/users/gaurav/projects/ctrc/scripts/CLL_TFnetworks_2018/annotations/TFlist_NMid_hg19.txt'

        if genome == 'MM9':
            genomeDirectory = '/grail/genomes/Mus_musculus/UCSC/mm9/Sequence/Chromosomes/'
            annotationFile = '/home/rad/users/gaurav/projects/ctrc/scripts/pipeline/annotation/mm9_refseq.ucsc'
            TFfile = '/home/rad/users/gaurav/projects/ctrc/scripts/CLL_TFnetworks_2018/annotations/TFlist_NMid_mm9.txt'
        
        if genome == 'MM10':
            TFfile = '/home/rad/users/gaurav/projects/ctrc/scripts/CLL_TFnetworks_2018/annotations/TFlist_NMid_mm10.txt'
            if options.is_ucsc:
                genomeDirectory = '/home/rad/packages/data/fasta/mouse/mm10/ucsc_chromosomes/'
                annotationFile = '/home/rad/users/gaurav/projects/ctrc/scripts/pipeline/annotation/ucsc/mm10_refseq.ucsc'
            else:
                genomeDirectory = '/home/rad/packages/data/fasta/mouse/mm10/chromosomes/'
                annotationFile = '/home/rad/users/gaurav/projects/ctrc/scripts/pipeline/annotation/mm10_refseq.ucsc'

        TFtable = utils.parseTable(TFfile, '\t')
        TFlist = [line[0] for line in TFtable]
        TFlistGene = [line[1] for line in TFtable]

        projectFolder = options.output
        projectName = options.name

        if options.subpeaks:
            subpeakFile = options.subpeaks
        else: subpeakFile = None

        refseqToNameDict = {}
        expressionFile = options.activity
        
        if expressionFile:
            expressionTable = utils.parseTable(expressionFile, '\t')

        else:
            expressionTable = calculatePromoterActivity(annotationFile, bamFile, projectName, projectFolder, refseqToNameDict, background) 

        expCutoff = int(options.expCutoff)
        constExtension = int(options.extension)

        # options.Enumber is either 'super' (the default) or the number of top-ranked enhancers to use
        enhancerNumber = options.Enumber

        promoter = options.promoter
        additionalTFs = options.tfs
        number = options.number

        annotTable = utils.parseTable(annotationFile, '\t')
        for line in annotTable:
            gid = line[1]
            genename = upper(line[12])
            refseqToNameDict[gid] = genename    

        ###
        # Now run all the functions
        ###

        enhancerLoci = createEnhancerLoci(enhancerTable, enhancerNumber)
        expressedNM, expressionDictNM = createExpressionDict(annotationFile, projectFolder, projectName, refseqToNameDict, expCutoff,expressionFile)
        TFtoEnhancerDict = findCanidateTFs(annotationFile, enhancerLoci, expressedNM, expressionDictNM,  bamFile, TFlist, refseqToNameDict, projectFolder, projectName, promoter)

        formatOutput(TFtoEnhancerDict, refseqToNameDict, projectName, projectFolder)
        canidateGenes = [upper(refseqToNameDict[x]) for x in TFtoEnhancerDict.keys()]
        
        if additionalTFs:
            for tf in additionalTFs.split(','):
                canidateGenes.append(tf)
        canidateGenes = utils.uniquify(canidateGenes)

        print(canidateGenes)

        if subpeakFile is None:
            subpeakFile = findValleys(TFtoEnhancerDict, bamFile, projectName, projectFolder, cutoff = 0.2)
            
        generateSubpeakFASTA(TFtoEnhancerDict, subpeakFile, genomeDirectory, projectName, projectFolder, constExtension)        
        subpeakFile = projectFolder + projectName + '_SUBPEAKS.fa'

        findMotifs(canidateGenes, projectFolder, projectName, motifConvertFile, motifDatabaseFile)
        graph = buildGraph(projectFolder, projectName, motifConvertFile, refseqToNameDict, canidateGenes)
        formatNetworkOutput(graph, projectFolder, projectName, canidateGenes)
        

    else:
        parser.print_help()
        sys.exit()
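The entry point above is a standard optparse flow: declare flags, parse, then gate on the required ones. A minimal, self-contained sketch of the same pattern (the flag and filename here are illustrative, not taken from the pipeline):

from optparse import OptionParser

parser = OptionParser(usage = "usage: %prog [options] -e [ENHANCER_FILE]")
parser.add_option("-e","--enhancer_file", dest="enhancers", default=None,
                  help = "Provide a ROSE generated enhancer table")
(options, args) = parser.parse_args(['-e', 'enhancers.txt'])  # parse a hypothetical argv
assert options.enhancers == 'enhancers.txt'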
Example #43
0
def geneToEnhancerDict(genome, enhancer_file, activity_path):
    '''
    Assign each enhancer to genes using the gene-assignment columns of a ROSE output table.
    Returns enhancer- and gene-level tables plus a dictionary keyed by gene name
    that points to a list of enhancer loci.
    '''
    print('Identifying enhancers and target genes from %s' %(enhancer_file))
    #should this do gene assignment????
    #for now assume gene assignment has been done
    #can later toggle to do gene assignment

    #first load the TF lists

    tf_table = utils.parseTable(genome.returnFeature('tf_file'), '\t')

    motif_table = utils.parseTable(genome.returnFeature('motif_convert'),'\t')
    #this gives all tfs that have a motif
    motif_tfs = utils.uniquify([line[1] for line in motif_table])

    #intersect w/ the activity table
    if len(activity_path) > 0:
        activity_table = utils.parseTable(activity_path,'\t')
        #figure out the right column for actual gene names
        #(the first column whose value is not an NM/NR refseq ID)
        for i in range(len(activity_table[0])):
            if activity_table[0][i][0:2] != 'NM' and activity_table[0][i][0:2] != 'NR': #assumes refseq
                gene_col = i
                break
        print('using column %s of %s gene activity table for common names' % (gene_col + 1, activity_path))


        active_gene_list = [string.upper(line[gene_col]) for line in activity_table]

        tf_list_refseq = [line[0] for line in tf_table if active_gene_list.count(line[1]) > 0 and motif_tfs.count(line[1]) > 0]
        tf_list_name = utils.uniquify([line[1] for line in tf_table if active_gene_list.count(line[1]) > 0 and motif_tfs.count(line[1]) > 0])
    else:
        tf_list_refseq = [line[0] for line in tf_table if motif_tfs.count(line[1]) >0]
        tf_list_name = [line[1] for line in tf_table if motif_tfs.count(line[1]) >0]
    
    print('Identified %s TFs from %s that have motifs' % (len(tf_list_name),genome.returnFeature('tf_file')))

    #keyed by gene with loci objects in the list
    gene_to_enhancer_dict = defaultdict(list)
    enhancer_to_gene_dict = defaultdict(list)

    #assuming id,chrom,start,stop w/ gene names in the last 3 columns per standard ROSE output
    enhancer_table = utils.parseTable(enhancer_file,'\t')
    print('Analyzing %s cis-regulatory regions' % (len(enhancer_table)))

    #now let's make the enhancer table by region and then by gene
    enhancerTable = [['ENHANCER_ID','CHROM','START','STOP','GENE_LIST']]
    enhancerTFTable = [['ENHANCER_ID','CHROM','START','STOP','GENE_LIST']]
    geneTable = [['GENE','TF','CHROM','START','STOP','ENHANCER_ID']]
    geneTFTable = [['GENE','CHROM','START','STOP','ENHANCER_ID']]
    geneSummaryTable = [['GENE','TF','ENHANCER_LIST']]

    #will need to track which ones are TFs
    candidate_tf_list = []
    #find the columns for gene assignment
    header = enhancer_table[0]
    header_length = len(enhancer_table[0])
    closest_index = header.index('CLOSEST_GENE')
    proximal_index = header.index('PROXIMAL_GENES')
    overlap_index = header.index('OVERLAP_GENES')
    for line in enhancer_table[1:]:
        if len(line) != header_length: #don't bother trying to figure out lines w/o target genes
            continue
        enhancer_locus = utils.Locus(line[1],line[2],line[3],'.',line[0])
        closest_gene_list = line[closest_index].split(',')
        proximal_gene_list = line[proximal_index].split(',')
        overlap_gene_list = line[overlap_index].split(',')
        all_gene_list = closest_gene_list + proximal_gene_list + overlap_gene_list
        all_gene_list = [string.upper(gene) for gene in all_gene_list]
        
        #restrict to active genes (when an activity table was given) and get a unique TF list
        if len(activity_path) > 0:
            all_gene_list = utils.uniquify([gene for gene in all_gene_list if active_gene_list.count(gene) > 0])
        else:
            all_gene_list = utils.uniquify(all_gene_list)
        candidate_gene_list = utils.uniquify([gene for gene in all_gene_list if tf_list_name.count(gene) > 0])
        if len(all_gene_list) > 0:
            for gene in all_gene_list:

                gene_to_enhancer_dict[gene].append(enhancer_locus)
                enhancer_to_gene_dict[enhancer_locus].append(gene)
            newLine = line[0:4] + [','.join(all_gene_list)]
        else:
            newLine = line[0:4] + ['']
        enhancerTable.append(newLine)
        
        if len(candidate_gene_list) > 0:
            tfLine = line[0:4] + [','.join(candidate_gene_list)]
            enhancerTFTable.append(tfLine)
    

    #now iterate through each gene and list the enhancers
    gene_list = gene_to_enhancer_dict.keys()
    gene_list.sort()
    for gene in gene_list:
        if tf_list_name.count(gene) > 0:
            tf_status = 1
            candidate_tf_list.append(gene)
        else:
            tf_status = 0
        enhancer_loci = gene_to_enhancer_dict[gene]
        enhancerString =','.join([enhancer.ID() for enhancer in enhancer_loci])
        geneSummaryTable.append([gene,tf_status,enhancerString])
        for enhancer in enhancer_loci:
            newLine = [gene,tf_status,enhancer.chr(),enhancer.start(),enhancer.end(),enhancer.ID()]
            geneTable.append(newLine)
            if tf_status == 1:
                newLine = [gene,enhancer.chr(),enhancer.start(),enhancer.end(),enhancer.ID()]
                geneTFTable.append(newLine)

    return geneTable,geneTFTable,enhancerTable,enhancerTFTable,geneSummaryTable,candidate_tf_list,gene_to_enhancer_dict
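The column-detection heuristic near the top of this function picks the first column of the activity table whose value does not carry an NM/NR refseq prefix. A standalone illustration of that rule (the row values are invented):

row = ['NM_002467', 'MYC', '8.73']
gene_col = [i for i, value in enumerate(row)
            if value[0:2] != 'NM' and value[0:2] != 'NR'][0]
# gene_col == 1, i.e. the common-name column holding 'MYC'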
Example #44
0
def finishRankOutput(dataFile, rankOutput, genome, mergeFolder, mergeName, name1, name2, cutOff=1.5, window=100000):

    """
    cleans up the rank output table
    makes a gff of all of the gained/lost supers beyond
    a certain cutoff w/ a window
    makes a list of gained genes and lost genes
    makes a bed of gained loss
    """
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    # making sure window and cutoff are int/float
    cutOff = float(cutOff)
    window = int(window)
    genome = string.upper(genome)

    # make the output folder
    outputFolder = pipeline_dfci.formatFolder(mergeFolder + "output/", True)

    # bring in the old rank table
    rankEnhancerTable = utils.parseTable(rankOutput, "\t")

    # make a new formatted table
    header = rankEnhancerTable[0]
    header[-4] = "DELTA RANK"
    header[-3] = "IS_SUPER"
    formattedRankTable = [header]

    # the gffs
    gainedGFF = []
    lostGFF = []

    gainedWindowGFF = []
    lostWindowGFF = []

    # the beds
    gainedTrackHeader = (
        'track name="%s %s only SEs" description="%s super enhancers that are found only in %s vs %s" itemRGB=On color=255,0,0'
        % (genome, name2, genome, name2, name1)
    )
    gainedBed = [[gainedTrackHeader]]
    conservedTrackHeader = (
        'track name="%s %s and %s SEs" description="%s super enhancers that are found in both %s vs %s" itemRGB=On color=0,0,0'
        % (genome, name1, name2, genome, name1, name2)
    )
    conservedBed = [[conservedTrackHeader]]

    lostTrackHeader = (
        'track name="%s %s only SEs" description="%s super enhancers that are found only in %s vs %s" itemRGB=On color=0,255,0'
        % (genome, name1, genome, name1, name2)
    )
    lostBed = [[lostTrackHeader]]

    # the genes
    geneTable = [
        [
            "GENE",
            "ENHANCER_ID",
            "ENHANCER_CHROM",
            "ENHANCER_START",
            "ENHANCER_STOP",
            header[6],
            header[7],
            header[8],
            "STATUS",
        ]
    ]

    for line in rankEnhancerTable[1:]:
        # fixing the enhancer ID
        line[0] = line[0].replace("_lociStitched", "")
        formattedRankTable.append(line)

        # getting the genes
        geneList = []
        geneList += line[9].split(",")
        geneList += line[10].split(",")
        geneList += line[11].split(",")
        geneList = [x for x in geneList if len(x) > 0]
        geneList = utils.uniquify(geneList)
        geneString = string.join(geneList, ",")

        bedLine = [line[1], line[2], line[3], line[0], line[-4]]

        # for gained
        if float(line[6]) > cutOff:
            gffLine = [line[1], line[0], "", line[2], line[3], "", ".", "", geneString]
            gffWindowLine = [
                line[1],
                line[0],
                "",
                int(line[2]) - window,
                int(line[3]) + window,
                "",
                ".",
                "",
                geneString,
            ]
            gainedGFF.append(gffLine)
            gainedWindowGFF.append(gffWindowLine)
            geneStatus = name2
            gainedBed.append(bedLine)
        # for lost
        elif float(line[6]) < (-1 * cutOff):
            gffLine = [line[1], line[0], "", line[2], line[3], "", ".", "", geneString]
            gffWindowLine = [
                line[1],
                line[0],
                "",
                int(line[2]) - window,
                int(line[3]) + window,
                "",
                ".",
                "",
                geneString,
            ]
            lostGFF.append(gffLine)
            lostWindowGFF.append(gffWindowLine)
            geneStatus = name1
            lostBed.append(bedLine)
        # for conserved
        else:
            geneStatus = "CONSERVED"
            conservedBed.append(bedLine)

        # now fill in the gene Table
        for gene in geneList:
            geneTableLine = [gene, line[0], line[1], line[2], line[3], line[6], line[7], line[8], geneStatus]
            geneTable.append(geneTableLine)

    # concat the bed
    fullBed = gainedBed + conservedBed + lostBed

    # start writing the output
    # there's the two gffs, the bed,the formatted table, the gene table

    # formatted table
    formattedFilename = "%s%s_%s_MERGED_SUPERS_RANK_TABLE.txt" % (outputFolder, genome, mergeName)
    utils.unParseTable(formattedRankTable, formattedFilename, "\t")

    # gffs
    gffFolder = pipeline_dfci.formatFolder(outputFolder + "gff/", True)
    gffFilename_gained = "%s%s_%s_%s_ONLY_SUPERS_-0_+0.gff" % (gffFolder, genome, mergeName, string.upper(name2))
    gffFilenameWindow_gained = "%s%s_%s_%s_ONLY_SUPERS_-%sKB_+%sKB.gff" % (
        gffFolder,
        genome,
        mergeName,
        string.upper(name2),
        window / 1000,
        window / 1000,
    )

    gffFilename_lost = "%s%s_%s_%s_ONLY_SUPERS_-0_+0.gff" % (gffFolder, genome, mergeName, string.upper(name1))
    gffFilenameWindow_lost = "%s%s_%s_%s_ONLY_SUPERS_-%sKB_+%sKB.gff" % (
        gffFolder,
        genome,
        mergeName,
        string.upper(name1),
        window / 1000,
        window / 1000,
    )

    utils.unParseTable(gainedGFF, gffFilename_gained, "\t")
    utils.unParseTable(gainedWindowGFF, gffFilenameWindow_gained, "\t")

    utils.unParseTable(lostGFF, gffFilename_lost, "\t")
    utils.unParseTable(lostWindowGFF, gffFilenameWindow_lost, "\t")

    # bed
    bedFilename = "%s%s_%s_MERGED_SUPERS.bed" % (outputFolder, genome, mergeName)
    utils.unParseTable(fullBed, bedFilename, "\t")

    # geneTable
    geneFilename = "%s%s_%s_MERGED_SUPERS_GENE_TABLE.txt" % (outputFolder, genome, mergeName)
    utils.unParseTable(geneTable, geneFilename, "\t")

    # finally, move all of the plots to the output folder
    cmd = "cp %s%s_ROSE/*.pdf %s%s_%s_MERGED_SUPERS_DELTA.pdf" % (mergeFolder, name1, outputFolder, genome, mergeName)
    os.system(cmd)

    cmd = "cp %s%s_ROSE/*RANK_PLOT.png %s%s_%s_MERGED_SUPERS_RANK_PLOT.png" % (
        mergeFolder,
        name1,
        outputFolder,
        genome,
        mergeName,
    )
    os.system(cmd)

    # now execute the bamPlot_turbo.py commands
    bam1 = dataDict[name1]["bam"]
    bam2 = dataDict[name2]["bam"]
    bamString = "%s,%s" % (bam1, bam2)
    nameString = "%s,%s" % (name1, name2)
    colorString = "0,0,0:100,100,100"

    # change dir
    os.chdir("/ark/home/cl512/pipeline/")

    if len(gainedGFF) > 0:
        # gained command
        plotTitle = "%s_ONLY_SE" % (name2)
        cmd = "python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE" % (
            genome,
            bamString,
            gffFilename_gained,
            outputFolder,
            nameString,
            colorString,
            plotTitle,
        )
        os.system(cmd)

        # gained window command
        plotTitle = "%s_ONLY_SE_%sKB_WINDOW" % (name2, window / 1000)
        cmd = "python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE" % (
            genome,
            bamString,
            gffFilenameWindow_gained,
            outputFolder,
            nameString,
            colorString,
            plotTitle,
        )
        os.system(cmd)

    if len(lostGFF) > 0:
        # lost command
        plotTitle = "%s_ONLY_SE" % (name1)
        cmd = "python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE" % (
            genome,
            bamString,
            gffFilename_lost,
            outputFolder,
            nameString,
            colorString,
            plotTitle,
        )
        os.system(cmd)

        # lost window command
        plotTitle = "%s_ONLY_SE_%sKB_WINDOW" % (name1, window / 1000)
        cmd = "python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE" % (
            genome,
            bamString,
            gffFilenameWindow_lost,
            outputFolder,
            nameString,
            colorString,
            plotTitle,
        )
        os.system(cmd)

    return
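The gained/lost/conserved split above hinges on one signed rank-change column (line[6]) tested against a symmetric cutoff. Reduced to a sketch (not the pipeline's API, just the decision rule):

def classifyRegion(delta, cutOff=1.5, name1='CELL_A', name2='CELL_B'):
    # positive deltas are gained in name2, negative deltas in name1, the rest conserved
    if delta > cutOff:
        return name2
    elif delta < (-1 * cutOff):
        return name1
    else:
        return 'CONSERVED'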
Example #45
0
def findCanidateTFs(genome, enhancer_gff, expressedNM, expressionDictNM,
                    bamFile, TFlist, refseqToNameDict, projectFolder, projectName, promoter):
    '''                                                           
    Assign each Super-Enhancer to the closest active TSS to its center
    Return a dictionary keyed by TF that points to a list of loci 
    '''
    
    #loading in the enhancer gff regions
    enhancer_collection = utils.gffToLocusCollection(enhancer_gff)
    enhancer_loci = enhancer_collection.getLoci()


    #loading in the genome and TF info
    annot_file = genome.returnFeature('annot_file')
    startDict = utils.makeStartDict(annot_file)    

    tf_table = utils.parseTable(genome.returnFeature('tf_file'),'\t')
    refID_list = [line[0] for line in tf_table] #creates a list of all NM IDs for TFs

    #make a collection of all TF TSSs
    tssLoci = []
    for refID in refID_list:
        tssLoci.append(utils.makeTSSLocus(refID,startDict,0,0)) #this is a precise 1 coordinate TSS locus
    tssCollection = utils.LocusCollection(tssLoci,50)    



    enhancerTable = [['ENHANCER_ID','CHROM','START','STOP','GENE_LIST']]

    gene_to_enhancer_dict = defaultdict(list)
    TFtoEnhancerDict = defaultdict(list) #keyed by TF refseq ID -> list of assigned enhancer loci
    enhancerAssignment = [] #rows of [gene, chrom, start, stop, enhancerID]
    # Loop through enhancers
    #all gene names stored by refID
    for enhancer in enhancer_loci:
        

        # If the enhancer overlaps a TSS, save it
        overlapping_loci = tssCollection.getOverlap(enhancer, 'both')
        overlapping_refIDs =[locus.ID() for locus in overlapping_loci]

        # Find all gene TSS within 100 kb
        proximal_loci = tssCollection.getOverlap(utils.makeSearchLocus(enhancer,100000,100000),'both')
        proximal_refIDs =[locus.ID() for locus in proximal_loci]
        
        # If no genes are within 100 kb, find the closest active gene within 1 million bp
        closest_refID = []
        if len(overlapping_refIDs) == 0 and len(proximal_refIDs) == 0:
        
            distal_loci = tssCollection.getOverlap(utils.makeSearchLocus(enhancer,1000000,1000000),'both')
            distal_refIDs =[locus.ID() for locus in distal_loci]

            enhancerCenter = (int(enhancer.start()) + int(enhancer.end())) / 2
            distance_list = [abs(enhancerCenter - startDict[geneID]['start'][0])
                             for geneID in distal_refIDs]
            if len(distance_list) > 0:
                closest_refID = [distal_refIDs[distance_list.index(min(distance_list))]]

        #now we have all potential gene cases
        all_refIDs = overlapping_refIDs + proximal_refIDs + closest_refID
        
        #now we get all names and refIDs
        all_refIDs = utils.uniquify([refID for refID in all_refIDs if len(refID) > 0 ])
        all_names = utils.uniquify([startDict[refID]['name'] for refID in all_refIDs])
        
        #first do enhancer level assignment
        names_string = ','.join(all_names)
        enhancerTable.append([enhancer.ID(),enhancer.chr(),enhancer.start(),enhancer.end(),names_string])

        #now do gene level assignment
        for refID in all_refIDs:
            gene_to_enhancer_dict[refID].append(enhancer.ID())

        #an enhancer can be assigned to multiple genes
        #a promoter can only be assigned to 1 gene
        #promoters don't have enhancerIDs so don't add them yet
        #this should just be an enhancer level table
        #followed by a gene level table



        overlapping_refIDs = utils.uniquify(overlapping_refIDs)
        proximal_refIDs = utils.uniquify(proximal_refIDs)
        for refID in overlapping_refIDs:
            if proximal_refIDs.count(refID) == 1:
                proximal_refIDs.remove(refID)

        # If a TSS overlaps an enhancer, assign them together
        if overlapping_refIDs:
            for gene in overlapping_refIDs:
                if gene in TFlist:
                    TFtoEnhancerDict[gene].append(enhancer)
                    enhancerAssignment.append([gene, enhancer.chr(), enhancer.start(), enhancer.end(), enhancer.ID()])

        # Otherwise, assign the enhancer to the most active gene within 100 kb
        elif not overlapping_refIDs and proximal_refIDs:
            highestGene = ''
            highestActivity = 0
            for gene in proximal_refIDs:
                if expressionDictNM[gene] > highestActivity:
                    highestActivity = expressionDictNM[gene]
                    highestGene = gene
            if highestGene in TFlist:
                TFtoEnhancerDict[highestGene].append(enhancer)
                enhancerAssignment.append([highestGene, enhancer.chr(), enhancer.start(), enhancer.end(), enhancer.ID()])

        # Otherwise fall back to the closest active gene within 1 Mb
        elif not overlapping_refIDs and not proximal_refIDs and len(closest_refID) > 0:
            gene = closest_refID[0]
            if gene in TFlist:
                TFtoEnhancerDict[gene].append(enhancer)
                enhancerAssignment.append([gene, enhancer.chr(), enhancer.start(), enhancer.end(), enhancer.ID()])

    # Add the promoter if it's not already contained in the enhancer
    if promoter:
        for gene in TFtoEnhancerDict.keys():
            # use a separate name so the Locus does not shadow the 'promoter' flag
            promoterLocus = utils.Locus(startDict[gene]['chr'], int(startDict[gene]['start'][0]) - 2000,
                                        int(startDict[gene]['start'][0]) + 2000, startDict[gene]['sense'])
            overlapBool = False
            for enhancer in TFtoEnhancerDict[gene]:
                if promoterLocus.overlaps(enhancer):
                    overlapBool = True
            if not overlapBool:
                TFtoEnhancerDict[gene].append(promoterLocus)

    seAssignmentFile = projectFolder + projectName + '_ENHANCER_ASSIGNMENT.txt'
    utils.unParseTable(enhancerAssignment, seAssignmentFile, '\t')

    return TFtoEnhancerDict
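When an enhancer overlaps no TSS, the function assigns it to the single most active gene within 100 kb. That selection in isolation (IDs and activity values invented):

expressionDictNM = {'NM_000001': 4.2, 'NM_000002': 9.1}
proximal_refIDs = ['NM_000001', 'NM_000002']
highestGene = ''
highestActivity = 0
for refID in proximal_refIDs:
    if expressionDictNM[refID] > highestActivity:
        highestActivity = expressionDictNM[refID]
        highestGene = refID
# highestGene == 'NM_000002'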
Example #46
0
def mapEnhancerToGeneTop(rankByBamFile, controlBamFile, genome, annotFile, enhancerFile, transcribedFile='', uniqueGenes=True, searchWindow=50000, noFormatTable=False):
    '''
    maps genes to enhancers. if uniqueGenes, reduces to gene name only. Otherwise, gives for each refseq
    '''
    startDict = utils.makeStartDict(annotFile)
    enhancerName = enhancerFile.split('/')[-1].split('.')[0]
    enhancerTable = utils.parseTable(enhancerFile, '\t')

    # internal parameter for debugging
    byRefseq = False

    if len(transcribedFile) > 0:
        transcribedTable = utils.parseTable(transcribedFile, '\t')
        transcribedGenes = [line[1] for line in transcribedTable]
    else:
        transcribedGenes = startDict.keys()

    print('MAKING TRANSCRIPT COLLECTION')
    transcribedCollection = utils.makeTranscriptCollection(
        annotFile, 0, 0, 500, transcribedGenes)

    print('MAKING TSS COLLECTION')
    tssLoci = []
    for geneID in transcribedGenes:
        tssLoci.append(utils.makeTSSLocus(geneID, startDict, 0, 0))

    # this turns the tssLoci list into a LocusCollection
    # 50 is the internal parameter for LocusCollection and doesn't really
    # matter
    tssCollection = utils.LocusCollection(tssLoci, 50)

    geneDict = {'overlapping': defaultdict(
        list), 'proximal': defaultdict(list)}

    # dictionaries to hold ranks and superstatus of gene nearby enhancers
    rankDict = defaultdict(list)
    superDict = defaultdict(list)

    # list of all genes that appear in this analysis
    overallGeneList = []

    # find the damn header
    for line in enhancerTable:
        if line[0][0] == '#':
            continue
        else:
            header = line
            break

    if noFormatTable:
        # set up the output tables
        # first by enhancer
        enhancerToGeneTable = [
            header + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE']]

    else:
        # set up the output tables
        # first by enhancer
        enhancerToGeneTable = [
            header[0:9] + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE'] + header[-2:]]

    # next make the gene to enhancer table
    geneToEnhancerTable = [
        ['GENE_NAME', 'REFSEQ_ID', 'PROXIMAL_ENHANCERS', 'ENHANCER_RANKS', 'IS_SUPER', 'ENHANCER_SIGNAL']]

    for line in enhancerTable:
        if line[0][0] == '#' or line[0][0] == 'R':
            continue

        enhancerString = '%s:%s-%s' % (line[1], line[2], line[3])

        enhancerLocus = utils.Locus(line[1], line[2], line[3], '.', line[0])

        # overlapping genes are transcribed genes whose transcript is directly
        # in the stitchedLocus
        overlappingLoci = transcribedCollection.getOverlap(
            enhancerLocus, 'both')
        overlappingGenes = []
        for overlapLocus in overlappingLoci:
            overlappingGenes.append(overlapLocus.ID())

        # proximalGenes are transcribed genes where the tss is within 50kb of
        # the boundary of the stitched loci
        proximalLoci = tssCollection.getOverlap(
            utils.makeSearchLocus(enhancerLocus, searchWindow, searchWindow), 'both')
        proximalGenes = []
        for proxLocus in proximalLoci:
            proximalGenes.append(proxLocus.ID())

        distalLoci = tssCollection.getOverlap(
            utils.makeSearchLocus(enhancerLocus, 1000000, 1000000), 'both')
        distalGenes = []
        for proxLocus in distalLoci:
            distalGenes.append(proxLocus.ID())

        overlappingGenes = utils.uniquify(overlappingGenes)
        proximalGenes = utils.uniquify(proximalGenes)
        distalGenes = utils.uniquify(distalGenes)
        allEnhancerGenes = overlappingGenes + proximalGenes + distalGenes
        # these checks make sure each gene list is unique.
        # technically it is possible for a gene to be overlapping, but not proximal since the
        # gene could be longer than the 50kb window, but we'll let that slide
        # here
        for refID in overlappingGenes:
            if proximalGenes.count(refID) == 1:
                proximalGenes.remove(refID)

        for refID in proximalGenes:
            if distalGenes.count(refID) == 1:
                distalGenes.remove(refID)

        # Now find the closest gene
        if len(allEnhancerGenes) == 0:
            closestGene = ''
        else:
            # get enhancerCenter
            enhancerCenter = (int(line[2]) + int(line[3])) / 2

            # get absolute distance to enhancer center
            distList = [abs(enhancerCenter - startDict[geneID]['start'][0])
                        for geneID in allEnhancerGenes]
            # get the ID and convert to name
            closestGene = startDict[
                allEnhancerGenes[distList.index(min(distList))]]['name']

        # NOW WRITE THE ROW FOR THE ENHANCER TABLE
        if noFormatTable:

            newEnhancerLine = list(line)
            newEnhancerLine.append(
                join(utils.uniquify([startDict[x]['name'] for x in overlappingGenes]), ','))
            newEnhancerLine.append(
                join(utils.uniquify([startDict[x]['name'] for x in proximalGenes]), ','))
            newEnhancerLine.append(closestGene)

        else:
            newEnhancerLine = line[0:9]
            newEnhancerLine.append(
                join(utils.uniquify([startDict[x]['name'] for x in overlappingGenes]), ','))
            newEnhancerLine.append(
                join(utils.uniquify([startDict[x]['name'] for x in proximalGenes]), ','))
            newEnhancerLine.append(closestGene)
            newEnhancerLine += line[-2:]

        enhancerToGeneTable.append(newEnhancerLine)
        # Now grab all overlapping and proximal genes for the gene ordered
        # table

        overallGeneList += overlappingGenes
        for refID in overlappingGenes:
            geneDict['overlapping'][refID].append(enhancerString)
            rankDict[refID].append(int(line[-2]))
            superDict[refID].append(int(line[-1]))

        overallGeneList += proximalGenes
        for refID in proximalGenes:
            geneDict['proximal'][refID].append(enhancerString)
            rankDict[refID].append(int(line[-2]))
            superDict[refID].append(int(line[-1]))

    # End loop through
    # Make table by gene
    print('MAKING ENHANCER ASSOCIATED GENE TSS COLLECTION')
    overallGeneList = utils.uniquify(overallGeneList)

    enhancerGeneCollection = utils.makeTranscriptCollection(
        annotFile, 5000, 5000, 500, overallGeneList)

    enhancerGeneGFF = utils.locusCollectionToGFF(enhancerGeneCollection)

    # dump the gff to file
    enhancerFolder = utils.getParentFolder(enhancerFile)
    gffRootName = "%s_TSS_ENHANCER_GENES_-5000_+5000" % (genome)
    enhancerGeneGFFFile = "%s%s_%s.gff" % (enhancerFolder, enhancerName,gffRootName)
    utils.unParseTable(enhancerGeneGFF, enhancerGeneGFFFile, '\t')

    # now we need to run bamToGFF

    # Try to use the cluster copy of bamliquidator_batch.py; otherwise fall back to a local copy, otherwise fail.
    bamliquidator_path = '/ark/home/jdm/pipeline/bamliquidator_batch.py'
    if not os.path.isfile(bamliquidator_path):
        bamliquidator_path = 'bamliquidator_batch.py'
        if not os.path.isfile(bamliquidator_path):
            raise ValueError('bamliquidator_batch.py not found in path')

    print('MAPPING SIGNAL AT ENHANCER ASSOCIATED GENE TSS')
    # map density at genes in the +/- 5kb tss region
    # first on the rankBy bam
    bamName = rankByBamFile.split('/')[-1]
    mappedRankByFolder = "%s%s_%s_%s/" % (enhancerFolder, enhancerName,gffRootName, bamName)
    mappedRankByFile = "%s%s_%s_%s/matrix.gff" % (enhancerFolder,enhancerName, gffRootName, bamName)
    cmd = 'python ' + bamliquidator_path + ' --sense . -e 200 --match_bamToGFF -r %s -o %s %s' % (enhancerGeneGFFFile, mappedRankByFolder,rankByBamFile)
    print("Mapping rankby bam %s" % (rankByBamFile))
    print(cmd)

    outputRank = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
    outputRank = outputRank.communicate()
    if len(outputRank[0]) > 0:  # test if mapping worked correctly
        print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (enhancerGeneGFFFile, rankByBamFile))
    else:
        print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (enhancerGeneGFFFile, rankByBamFile))
        sys.exit()

    # next on the control bam if it exists
    if len(controlBamFile) > 0:
        controlName = controlBamFile.split('/')[-1]
        mappedControlFolder = "%s%s_%s_%s/" % (
            enhancerFolder, enhancerName,gffRootName, controlName)
        mappedControlFile = "%s%s_%s_%s/matrix.gff" % (
            enhancerFolder, enhancerName,gffRootName, controlName)
        cmd = 'python ' + bamliquidator_path + ' --sense . -e 200 --match_bamToGFF -r %s -o %s %s' % (enhancerGeneGFFFile, mappedControlFolder,controlBamFile)
        print("Mapping control bam %s" % (controlBamFile))
        print(cmd)
        outputControl = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
        outputControl = outputControl.communicate()
        if len(outputControl[0]) > 0:  # test if mapping worked correctly
            print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (enhancerGeneGFFFile, controlBamFile))
        else:
            print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (enhancerGeneGFFFile, controlBamFile))
            sys.exit()

    # now get the appropriate output files
    if len(controlBamFile) > 0:
        print("CHECKING FOR MAPPED OUTPUT AT %s AND %s" %
              (mappedRankByFile, mappedControlFile))
        if utils.checkOutput(mappedRankByFile, 1, 1) and utils.checkOutput(mappedControlFile, 1, 1):
            print('MAKING ENHANCER ASSOCIATED GENE TSS SIGNAL DICTIONARIES')
            signalDict = makeSignalDict(mappedRankByFile, mappedControlFile)
        else:
            print("NO MAPPING OUTPUT DETECTED")
            sys.exit()
    else:
        print("CHECKING FOR MAPPED OUTPUT AT %s" % (mappedRankByFile))
        if utils.checkOutput(mappedRankByFile, 1, 30):
            print('MAKING ENHANCER ASSOCIATED GENE TSS SIGNAL DICTIONARIES')
            signalDict = makeSignalDict(mappedRankByFile)
        else:
            print("NO MAPPING OUTPUT DETECTED")
            sys.exit()

    # use enhancer rank to order

    rankOrder = utils.order([min(rankDict[x]) for x in overallGeneList])

    usedNames = []

    # make a new dict to hold TSS signal by max per geneName
    geneNameSigDict = defaultdict(list)
    print('MAKING GENE TABLE')
    for i in rankOrder:
        refID = overallGeneList[i]
        geneName = startDict[refID]['name']
        if usedNames.count(geneName) > 0 and uniqueGenes == True:
            continue
        else:
            usedNames.append(geneName)

        proxEnhancers = geneDict['overlapping'][
            refID] + geneDict['proximal'][refID]

        superStatus = max(superDict[refID])
        enhancerRanks = join([str(x) for x in rankDict[refID]], ',')

        enhancerSignal = signalDict[refID]
        geneNameSigDict[geneName].append(enhancerSignal)

        newLine = [geneName, refID, join(
            proxEnhancers, ','), enhancerRanks, superStatus, enhancerSignal]
        geneToEnhancerTable.append(newLine)
    print('MAKING ENHANCER TO TOP GENE TABLE')

    if noFormatTable:
        enhancerToTopGeneTable = [
            enhancerToGeneTable[0] + ['TOP_GENE', 'TSS_SIGNAL']]
    else:
        enhancerToTopGeneTable = [enhancerToGeneTable[0][0:12] + [
            'TOP_GENE', 'TSS_SIGNAL'] + enhancerToGeneTable[0][-2:]]

    for line in enhancerToGeneTable[1:]:

        geneList = []
        if noFormatTable:
            geneList += line[-3].split(',')
            geneList += line[-2].split(',')

        else:
            geneList += line[10].split(',')
            geneList += line[11].split(',')

        geneList = utils.uniquify([x for x in geneList if len(x) > 0])
        if len(geneList) > 0:
            try:
                sigVector = [max(geneNameSigDict[x]) for x in geneList]
                maxIndex = sigVector.index(max(sigVector))
                maxGene = geneList[maxIndex]
                maxSig = sigVector[maxIndex]
                if maxSig == 0.0:
                    maxGene = 'NONE'
                    maxSig = 'NONE'
            except ValueError:
                if len(geneList) == 1:
                    maxGene = geneList[0]
                    maxSig = 'NONE'    
                else:
                    maxGene = 'NONE'
                    maxSig = 'NONE'    
        else:
            maxGene = 'NONE'
            maxSig = 'NONE'
        if noFormatTable:
            newLine = line + [maxGene, maxSig]
        else:
            newLine = line[0:12] + [maxGene, maxSig] + line[-2:]
        enhancerToTopGeneTable.append(newLine)

    # resort enhancerToGeneTable
    if noFormatTable:
        return enhancerToGeneTable, enhancerToTopGeneTable, geneToEnhancerTable
    else:
        enhancerOrder = utils.order([int(line[-2])
                                    for line in enhancerToGeneTable[1:]])
        sortedTable = [enhancerToGeneTable[0]]
        sortedTopGeneTable = [enhancerToTopGeneTable[0]]
        for i in enhancerOrder:
            sortedTable.append(enhancerToGeneTable[(i + 1)])
            sortedTopGeneTable.append(enhancerToTopGeneTable[(i + 1)])

        return sortedTable, sortedTopGeneTable, geneToEnhancerTable
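The TOP_GENE column is filled by taking, per enhancer, the candidate gene with the strongest TSS signal. A minimal sketch of that choice, assuming a dictionary of gene names to signal lists (values invented):

geneNameSigDict = {'MYC': [12.0, 3.5], 'GATA3': [7.2]}
geneList = ['MYC', 'GATA3']
sigVector = [max(geneNameSigDict[x]) for x in geneList]
maxGene = geneList[sigVector.index(max(sigVector))]
# maxGene == 'MYC'; a max signal of 0.0 is reported as 'NONE' in the full function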
Example #47
0
def makeGeneTable(peakTable, analysisName):
    '''
    takes the peak table and makes a gene centric table

    '''

    geneDict = {}

    geneTable = [[
        'GENE',
        '%s_TSS_SIGNAL' % (analysisName),
        '%s_DISTAL_SIGNAL' % (analysisName)
    ]]

    #now iterate through the table
    for line in peakTable[1:]:
        regionLength = int(line[4])

        signal = float(line[9]) * regionLength

        #genes where this particular peak overlaps the tss 1kb window
        #case where both overlap and proximal columns are present
        if len(line) == 15:
            overlapGeneList = [
                gene for gene in line[-2].split(',') if len(gene) > 0
            ]
            #debugging guard: halt if the suspicious gene ID '107' shows up
            if overlapGeneList.count('107'):
                print(line)
                sys.exit()
            for overlapGene in overlapGeneList:
                if overlapGene not in geneDict:
                    geneDict[overlapGene] = {'tss': 0.0, 'distal': 0.0}
                #edge case: a region can appear in the overlapping gene list without a true TSS overlap
                if int(line[5]) == 1:
                    geneDict[overlapGene]['tss'] += signal
                else:  #the site falls just outside the promoter or overlaps the gene body; these are rare
                    geneDict[overlapGene]['distal'] += signal

            proximalGeneList = [
                gene for gene in line[-1].split(',') if len(gene) > 0
            ]
            for proximalGene in proximalGeneList:
                if proximalGene not in geneDict:
                    geneDict[proximalGene] = {'tss': 0.0, 'distal': 0.0}
                if int(line[5]) == 0:
                    geneDict[proximalGene]['distal'] += signal
        #where there's just overlap
        if len(line) == 14:
            overlapGeneList = [
                gene for gene in line[-1].split(',') if len(gene) > 0
            ]
            #debugging guard: halt if the suspicious gene ID '107' shows up
            if overlapGeneList.count('107'):
                print(line)
                sys.exit()
            for overlapGene in overlapGeneList:
                if overlapGene not in geneDict:
                    geneDict[overlapGene] = {'tss': 0.0, 'distal': 0.0}
                #edge case: a region can appear in the overlapping gene list without a true TSS overlap
                if int(line[5]) == 1:
                    geneDict[overlapGene]['tss'] += signal
                else:  #the site falls just outside the promoter or overlaps the gene body; these are rare
                    geneDict[overlapGene]['distal'] += signal

    geneList = geneDict.keys()
    geneList = utils.uniquify(geneList)
    geneList.sort()

    for gene in geneList:
        newLine = [gene]
        newLine.append(geneDict[gene]['tss'])
        newLine.append(geneDict[gene]['distal'])
        geneTable.append(newLine)

    return geneTable
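Each peak contributes a length-weighted signal to either a gene's TSS tally or its distal tally, keyed by the TSS-overlap flag in column 5. The accumulation pattern in miniature (names are illustrative):

geneDict = {}

def addSignal(gene, signal, tssOverlap):
    # seed the per-gene buckets on first sight, then add to the right one
    if gene not in geneDict:
        geneDict[gene] = {'tss': 0.0, 'distal': 0.0}
    bucket = 'tss' if tssOverlap else 'distal'
    geneDict[gene][bucket] += signal

addSignal('MYC', 0.5 * 2000, True)  # signal = density * region length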
Example #48
0
def makeNESTable(nes_path_list,names_list,output =''):

    '''
    combines the GSEA NES output from the enhancerPromoter analysis
    creates a table of all represented gene sets
    '''

    if len(nes_path_list) != len(names_list):
        print('please provide the same number of nes table paths and sample names')
        sys.exit()

    #nested dictionary: gsea_dict[pathway][name] = [nes, fdr]
    gsea_dict = defaultdict(dict)

    pathway_list = []
    #iterate once to just get all potential pathways
    for i in range(len(nes_path_list)):

        nes_path = nes_path_list[i]
        nes = utils.parseTable(nes_path,'\t')
        for line in nes[1:]:
            pathway_list.append(line[0])


    pathway_list = utils.uniquify(pathway_list)
    pathway_list.sort()

    #now blank the dictionary w/ default NES of 0 and FDR of 1
    for pathway in pathway_list:
        for name in names_list:
            gsea_dict[pathway][name] = [0.0,1.0]

    #now loop again to fill out properly
    for i in range(len(nes_path_list)):

        nes_path = nes_path_list[i]
        name = names_list[i]
        nes = utils.parseTable(nes_path,'\t')
        
        for line in nes[1:]:
            if line[2] == 'NA':
                continue
            try:
                nes_vector = [float(line[2]),float(line[3])]
            except ValueError:
                print(line)
                print(nes_path)
                print(name)
                sys.exit()
            pathway = line[0]
            gsea_dict[pathway][name] = nes_vector


    #set up the output table
    header = ['PATHWAY']
    for name in names_list:
        header += ['%s_NES' % (name),'%s_FDR' % (name)]

    nes_table = [header]
    for pathway in pathway_list:
        nes_line = [pathway]
        for name in names_list:
            nes_line += gsea_dict[pathway][name]
        nes_table.append(nes_line)

    if len(output) != 0:
        utils.unParseTable(nes_table,output,'\t')
    else:
        return nes_table
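The table is built by pre-seeding every (pathway, sample) cell with a null result and then overwriting the cells that have data, which keeps the output rectangular even when samples cover different gene sets. The same pattern in miniature (names invented):

from collections import defaultdict

gsea_dict = defaultdict(dict)
for pathway in ['EXAMPLE_PATHWAY']:
    for name in ['sampleA', 'sampleB']:
        gsea_dict[pathway][name] = [0.0, 1.0]  # default NES, FDR
gsea_dict['EXAMPLE_PATHWAY']['sampleA'] = [1.8, 0.02]  # observed values overwrite the default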
Example #49
0
def finishRankOutput(dataFile,rankOutput,genome,mergeFolder,mergeName,name1,name2,cutOff=1.5,window = 100000,superOnly=True,plotBam=True):

    '''
    cleans up the rank output table
    makes a gff of all of the gained/lost supers beyond
    a certain cutoff w/ a window
    makes a list of gained genes and lost genes
    makes a bed of gained loss
    '''
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    #making sure window and cutoff are int/float
    cutOff = float(cutOff)
    window = int(window)
    genome = string.upper(genome)

    #make the output folder
    outputFolder =pipeline_dfci.formatFolder(mergeFolder+'output/',True)
    
    #bring in the old rank table
    rankEnhancerTable = utils.parseTable(rankOutput,'\t')
    
    #make a new formatted table
    header = rankEnhancerTable[0]
    header[-4] = 'DELTA RANK'
    header[-3] = 'IS_SUPER'
    formattedRankTable =[header]

    #the gffs
    gainedGFF = []
    lostGFF = []

    gainedWindowGFF = []
    lostWindowGFF = []

    if superOnly:
        enhancerType = 'SUPERS'
    else:
        enhancerType = 'ENHANCERS'

    #the beds
    if superOnly:
        gainedTrackHeader = 'track name="%s %s only SEs" description="%s super enhancers that are found only in %s vs %s" itemRGB=On color=255,0,0' % (genome,name2,genome,name2,name1)
        gainedBed = [[gainedTrackHeader]]
        conservedTrackHeader = 'track name="%s %s and %s SEs" description="%s super enhancers that are found in both %s vs %s" itemRGB=On color=0,0,0' % (genome,name1,name2,genome,name1,name2)
        conservedBed = [[conservedTrackHeader]]

        lostTrackHeader = 'track name="%s %s only SEs" description="%s super enhancers that are found only in %s vs %s" itemRGB=On color=0,255,0' % (genome,name1,genome,name1,name2)
        lostBed = [[lostTrackHeader]]
    else:
        gainedTrackHeader = 'track name="%s %s only enhancers" description="%s enhancers that are found only in %s vs %s" itemRGB=On color=255,0,0' % (genome,name2,genome,name2,name1)
        gainedBed = [[gainedTrackHeader]]
        conservedTrackHeader = 'track name="%s %s and %s enhancers" description="%s enhancers that are found in both %s vs %s" itemRGB=On color=0,0,0' % (genome,name1,name2,genome,name1,name2)
        conservedBed = [[conservedTrackHeader]]

        lostTrackHeader = 'track name="%s %s only enhancers" description="%s enhancers that are found only in %s vs %s" itemRGB=On color=0,255,0' % (genome,name1,genome,name1,name2)
        lostBed = [[lostTrackHeader]]



    #the genes
    geneTable =[['GENE','ENHANCER_ID','ENHANCER_CHROM','ENHANCER_START','ENHANCER_STOP',header[6],header[7],header[8],'STATUS']]

    for line in rankEnhancerTable[1:]:
        #fixing the enhancer ID
        line[0] = line[0].replace('_lociStitched','')
        formattedRankTable.append(line)

        #getting the genes
        geneList = []
        geneList += line[9].split(',')
        geneList += line[10].split(',')
        geneList += line[11].split(',')
        geneList = [x for x in geneList if len(x) >0]
        geneList = utils.uniquify(geneList)
        geneString = string.join(geneList,',')

        bedLine = [line[1],line[2],line[3],line[0],line[-4]]
        
        #for gained
        if float(line[6]) > cutOff:
            gffLine = [line[1],line[0],'',line[2],line[3],'','.','',geneString]
            gffWindowLine = [line[1],line[0],'',int(line[2])-window,int(line[3])+window,'','.','',geneString]
            gainedGFF.append(gffLine)
            gainedWindowGFF.append(gffWindowLine)
            geneStatus = name2
            gainedBed.append(bedLine)
        #for lost
        elif float(line[6]) < (-1 * cutOff):
            gffLine = [line[1],line[0],'',line[2],line[3],'','.','',geneString]
            gffWindowLine = [line[1],line[0],'',int(line[2])-window,int(line[3])+window,'','.','',geneString]
            lostGFF.append(gffLine)
            lostWindowGFF.append(gffWindowLine)
            geneStatus = name1
            lostBed.append(bedLine)
        #for conserved
        else:
            geneStatus = 'CONSERVED'
            conservedBed.append(bedLine)

        #now fill in the gene Table
        for gene in geneList:
            geneTableLine = [gene,line[0],line[1],line[2],line[3],line[6],line[7],line[8],geneStatus]
            geneTable.append(geneTableLine)

    #concat the bed
    fullBed = gainedBed + conservedBed + lostBed
            
    #start writing the output
    #there's the two gffs, the bed,the formatted table, the gene table
    
    
    #formatted table
    formattedFilename = "%s%s_%s_MERGED_%s_RANK_TABLE.txt" % (outputFolder,genome,mergeName,enhancerType)
    utils.unParseTable(formattedRankTable,formattedFilename,'\t')

    #gffs
    gffFolder = pipeline_dfci.formatFolder(outputFolder+'gff/',True)
    gffFilename_gained = "%s%s_%s_%s_ONLY_%s_-0_+0.gff" % (gffFolder,genome,mergeName,string.upper(name2),enhancerType)
    gffFilenameWindow_gained = "%s%s_%s_%s_ONLY_%s_-%sKB_+%sKB.gff" % (gffFolder,genome,mergeName,string.upper(name2),enhancerType,window/1000,window/1000)

    gffFilename_lost = "%s%s_%s_%s_ONLY_%s_-0_+0.gff" % (gffFolder,genome,mergeName,string.upper(name1),enhancerType)
    gffFilenameWindow_lost = "%s%s_%s_%s_ONLY_%s_-%sKB_+%sKB.gff" % (gffFolder,genome,mergeName,string.upper(name1),enhancerType,window/1000,window/1000)

    utils.unParseTable(gainedGFF,gffFilename_gained,'\t')
    utils.unParseTable(gainedWindowGFF,gffFilenameWindow_gained,'\t')
            
    utils.unParseTable(lostGFF,gffFilename_lost,'\t')
    utils.unParseTable(lostWindowGFF,gffFilenameWindow_lost,'\t')
    
    #bed
    bedFilename = "%s%s_%s_MERGED_%s.bed" % (outputFolder,genome,mergeName,enhancerType)
    utils.unParseTable(fullBed,bedFilename,'\t')

    #geneTable
    geneFilename = "%s%s_%s_MERGED_%s_GENE_TABLE.txt" % (outputFolder,genome,mergeName,enhancerType)
    utils.unParseTable(geneTable,geneFilename,'\t')

    #finally, move all of the plots to the output folder
    cmd = "cp %s%s_ROSE/*.pdf %s%s_%s_MERGED_%s_DELTA.pdf" % (mergeFolder,name1,outputFolder,genome,mergeName,enhancerType)
    os.system(cmd)

    cmd = "cp %s%s_ROSE/*RANK_PLOT.png %s%s_%s_MERGED_%s_RANK_PLOT.png" % (mergeFolder,name1,outputFolder,genome,mergeName,enhancerType)
    os.system(cmd)

    #now execute the bamPlot_turbo.py commands
    if plotBam:
        bam1 = dataDict[name1]['bam']
        bam2 = dataDict[name2]['bam']
        bamString = "%s,%s" % (bam1,bam2)
        nameString = "%s,%s" % (name1,name2)
        colorString = "0,0,0:100,100,100"

        #change dir
        os.chdir(pipelineDir)
    
        if len(gainedGFF) > 0:
            #gained command
            plotTitle = "%s_ONLY_SE" % (name2)
            cmd = 'python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE' % (genome,bamString,gffFilename_gained,outputFolder,nameString,colorString,plotTitle)
            os.system(cmd)

            #gained window command
            plotTitle = "%s_ONLY_SE_%sKB_WINDOW" % (name2,window/1000)
            cmd = 'python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE' % (genome,bamString,gffFilenameWindow_gained,outputFolder,nameString,colorString,plotTitle)
            os.system(cmd)

        if len(lostGFF) > 0:
            #lost command
            plotTitle = "%s_ONLY_SE" % (name1)
            cmd = 'python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE' % (genome,bamString,gffFilename_lost,outputFolder,nameString,colorString,plotTitle)
            os.system(cmd)

            #lost window command
            plotTitle = "%s_ONLY_SE_%sKB_WINDOW" % (name1,window/1000)
            cmd = 'python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE' % (genome,bamString,gffFilenameWindow_lost,outputFolder,nameString,colorString,plotTitle)
            os.system(cmd)


    return
Example #50
0
def finishRankOutput(dataFile,statOutput,diffOutput,genome,mergeFolder,mergeName,name1,name2,namesList1,namesList2,cutOff=1.0,window = 100000,superOnly=True,plotBam=True):

    '''
    cleans up the rank output table
    makes a gff of all of the gained/lost supers beyond
    a certain cutoff w/ a window
    makes a list of gained genes and lost genes
    makes a bed of gained loss
    '''
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    #making sure window and cutoff are int/float
    cutOff = float(cutOff)
    window = int(window)
    genome = string.upper(genome)

    #make the output folder
    outputFolder =pipeline_dfci.formatFolder(mergeFolder+'output/',True)
    
    #bring in the old rank table
    rankEnhancerTable = utils.parseTable(statOutput,'\t')
    
    #make a new formatted table
    header = rankEnhancerTable[0]
    formattedRankTable =[header]

    #the gffs
    gainedGFF = []
    lostGFF = []

    gainedWindowGFF = []
    lostWindowGFF = []

    if superOnly:
        enhancerType = 'SUPERS'
    else:
        enhancerType = 'ENHANCERS'

    #the beds
    if superOnly:
        gainedTrackHeader = 'track name="%s %s only SEs" description="%s super enhancers that are found only in %s vs %s" itemRGB=On color=255,0,0' % (genome,name2,genome,name2,name1)
        gainedBed = [[gainedTrackHeader]]
        conservedTrackHeader = 'track name="%s %s and %s SEs" description="%s super enhancers that are found in both %s vs %s" itemRGB=On color=0,0,0' % (genome,name1,name2,genome,name1,name2)
        conservedBed = [[conservedTrackHeader]]

        lostTrackHeader = 'track name="%s %s only SEs" description="%s super enhancers that are found only in %s vs %s" itemRGB=On color=0,255,0' % (genome,name1,genome,name1,name2)
        lostBed = [[lostTrackHeader]]
    else:
        gainedTrackHeader = 'track name="%s %s only enhancers" description="%s enhancers that are found only in %s vs %s" itemRGB=On color=255,0,0' % (genome,name2,genome,name2,name1)
        gainedBed = [[gainedTrackHeader]]
        conservedTrackHeader = 'track name="%s %s and %s enhancers" description="%s enhancers that are found in both %s vs %s" itemRGB=On color=0,0,0' % (genome,name1,name2,genome,name1,name2)
        conservedBed = [[conservedTrackHeader]]

        lostTrackHeader = 'track name="%s %s only enhancers" description="%s enhancers that are found only in %s vs %s" itemRGB=On color=0,255,0' % (genome,name1,genome,name1,name2)
        lostBed = [[lostTrackHeader]]



    #the genes
    geneTable =[['GENE','ENHANCER_ID','ENHANCER_CHROM','ENHANCER_START','ENHANCER_STOP',header[6],header[7],header[8],'STATUS']]
    headerLength = len(rankEnhancerTable[0])
    for line in rankEnhancerTable[1:]:
        #fix line lengths
        if len(line) != headerLength:
            line += ['']*(headerLength-len(line))

        #fixing the enhancer ID
        line[0] = line[0].replace('_lociStitched','')
        formattedRankTable.append(line)

        #getting the genes
        geneList = []
        geneList += line[-1].split(',')
        geneList += line[-2].split(',')
        geneList += line[-3].split(',')
        geneList = [x for x in geneList if len(x) >0]
        geneList = utils.uniquify(geneList)
        geneString = string.join(geneList,',')

        bedLine = [line[1],line[2],line[3],line[0],line[-4]]
        
        #for gained
        #this applies both the statistical test chosen (default FDR <= 0.05) and the cutoff
        #the cutoff is hard-wired, but an option to change the test could be added
        #stats are done in the R script; FDR correction can behave poorly when no regions are called differential
        
        if float(line[-8]) > cutOff and int(line[-4]) == 1:

            gffLine = [line[1],line[0],'',line[2],line[3],'','.','',geneString]
            gffWindowLine = [line[1],line[0],'',int(line[2])-window,int(line[3])+window,'','.','',geneString]
            gainedGFF.append(gffLine)
            gainedWindowGFF.append(gffWindowLine)
            geneStatus = name2
            gainedBed.append(bedLine)
        #for lost
        elif float(line[-8]) < (-1 * cutOff) and int(line[-4]) == 1:
            gffLine = [line[1],line[0],'',line[2],line[3],'','.','',geneString]
            gffWindowLine = [line[1],line[0],'',int(line[2])-window,int(line[3])+window,'','.','',geneString]
            lostGFF.append(gffLine)
            lostWindowGFF.append(gffWindowLine)
            geneStatus = name1
            lostBed.append(bedLine)
        #for conserved
        else:
            geneStatus = 'UNCHANGED'
            conservedBed.append(bedLine)

        #now fill in the gene Table
        for gene in geneList:
            geneTableLine = [gene,line[0],line[1],line[2],line[3],line[6],line[7],line[8],geneStatus]
            geneTable.append(geneTableLine)

    #concat the bed
    fullBed = gainedBed + conservedBed + lostBed
            
    #start writing the output
    #there's the two gffs, the bed,the formatted table, the gene table
    
    
    #formatted table
    formattedFilename = "%s%s_%s_MERGED_%s_RANK_TABLE.txt" % (outputFolder,genome,mergeName,enhancerType)
    utils.unParseTable(formattedRankTable,formattedFilename,'\t')

    #formatted diff table
    #possible that no genes are differential
    rankEnhancerDiffTable = utils.parseTable(diffOutput,'\t')

    #make a new formatted table
    header = rankEnhancerDiffTable[0]
    formattedRankDiffTable = [header]

    for line in rankEnhancerDiffTable[1:]:
        #fixing the enhancer ID
        line[0] = line[0].replace('_lociStitched','')
        formattedRankDiffTable.append(line)


    formattedDiffFilename = "%s%s_%s_MERGED_%s_RANK_DIFF_TABLE.txt" % (outputFolder,genome,mergeName,enhancerType)
    utils.unParseTable(formattedRankDiffTable,formattedDiffFilename,'\t')

    #gffs
    gffFolder = pipeline_dfci.formatFolder(outputFolder+'gff/',True)
    gffFilename_gained = "%s%s_%s_%s_ONLY_%s_-0_+0.gff" % (gffFolder,genome,mergeName,string.upper(name2),enhancerType)
    gffFilenameWindow_gained = "%s%s_%s_%s_ONLY_%s_-%sKB_+%sKB.gff" % (gffFolder,genome,mergeName,string.upper(name2),enhancerType,window/1000,window/1000)

    gffFilename_lost = "%s%s_%s_%s_ONLY_%s_-0_+0.gff" % (gffFolder,genome,mergeName,string.upper(name1),enhancerType)
    gffFilenameWindow_lost = "%s%s_%s_%s_ONLY_%s_-%sKB_+%sKB.gff" % (gffFolder,genome,mergeName,string.upper(name1),enhancerType,window/1000,window/1000)
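    #filenames follow the pipeline convention: "-0_+0" marks exact coordinates, "-NKB_+NKB" marks padded windows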

    utils.unParseTable(gainedGFF,gffFilename_gained,'\t')
    utils.unParseTable(gainedWindowGFF,gffFilenameWindow_gained,'\t')
            
    utils.unParseTable(lostGFF,gffFilename_lost,'\t')
    utils.unParseTable(lostWindowGFF,gffFilenameWindow_lost,'\t')
    
    #bed
    bedFilename = "%s%s_%s_MERGED_%s.bed" % (outputFolder,genome,mergeName,enhancerType)
    utils.unParseTable(fullBed,bedFilename,'\t')

    #geneTable
    geneFilename = "%s%s_%s_MERGED_%s_GENE_TABLE.txt" % (outputFolder,genome,mergeName,enhancerType)
    utils.unParseTable(geneTable,geneFilename,'\t')

    #finally, move all of the plots to the output folder
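    #the plots live in the first sample's ROSE output folder and get renamed with the merged analysis prefix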
    cmd = "cp %s%s_ROSE/*DELTA*.pdf %s%s_%s_MERGED_%s_DELTA.pdf" % (mergeFolder,namesList1[0],outputFolder,genome,mergeName,enhancerType)
    os.system(cmd)

    cmd = "cp %s%s_ROSE/*REGION_GAINED*.pdf %s%s_%s_MERGED_%s_REGION_GAINED.pdf" % (mergeFolder,namesList1[0],outputFolder,genome,mergeName,enhancerType)
    os.system(cmd)

    cmd = "cp %s%s_ROSE/*REGION_LOST*.pdf %s%s_%s_MERGED_%s_REGION_LOST.pdf" % (mergeFolder,namesList1[0],outputFolder,genome,mergeName,enhancerType)
    os.system(cmd)

    cmd = "cp %s%s_ROSE/*REGION_LOST*.pdf %s%s_%s_MERGED_%s_REGION_UNCHANGED.pdf" % (mergeFolder,namesList1[0],outputFolder,genome,mergeName,enhancerType)
    os.system(cmd)


    cmd = "cp %s%s_ROSE/*RANK_PLOT.png %s%s_%s_MERGED_%s_RANK_PLOT.png" % (mergeFolder,namesList1[0],outputFolder,genome,mergeName,enhancerType)
    os.system(cmd)

    #now execute the bamPlot_turbo.py commands
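    #bamPlot_turbo takes comma-delimited bam (-b) and name (-n) lists and a colon-delimited color list (-c)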
    if plotBam:
        bamList1 = [dataDict[name]['bam'] for name in namesList1]
        bamList2 = [dataDict[name]['bam'] for name in namesList2]
        bamList = bamList1 + bamList2
        bamString = string.join(bamList,',')
        
        nameList = [name1]*len(namesList1) + [name2]*len(namesList2)
        nameString = string.join(nameList,',')

        #use each group's own color if the two differ; otherwise default to black vs. gray
        if dataDict[namesList1[0]]['color'] != dataDict[namesList2[0]]['color']:
            colorList = [dataDict[namesList1[0]]['color']]*len(namesList1) + [dataDict[namesList2[0]]['color']]*len(namesList2)
        else:
            colorList = ['0,0,0']*len(namesList1) + ['100,100,100']*len(namesList2)
        colorString = string.join(colorList,':')

        if len(gainedGFF) > 0:
            #gained command
            plotTitle = "%s_ONLY_SE" % (name2)
            cmd = 'python %sbamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MERGE' % (pipelineDir,genome,bamString,gffFilename_gained,outputFolder,nameString,colorString,plotTitle)
            os.system(cmd)

            #gained window command
            plotTitle = "%s_ONLY_SE_%sKB_WINDOW" % (name2,window/1000)
            cmd = 'python %sbamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MERGE' % (pipelineDir,genome,bamString,gffFilenameWindow_gained,outputFolder,nameString,colorString,plotTitle)
            os.system(cmd)

        if len(lostGFF) > 0:
            #lost command
            plotTitle = "%s_ONLY_SE" % (name1)
            cmd = 'python %sbamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MERGE' % (pipelineDir,genome,bamString,gffFilename_lost,outputFolder,nameString,colorString,plotTitle)
            os.system(cmd)

            #lost window command
            plotTitle = "%s_ONLY_SE_%sKB_WINDOW" % (name1,window/1000)
            cmd = 'python %sbamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MERGE' % (pipelineDir,genome,bamString,gffFilenameWindow_lost,outputFolder,nameString,colorString,plotTitle)
            os.system(cmd)


    return