Пример #1
0
def run_macs(dataFile):
    '''
    Runs MACS 1.4.2 peak calling on every ChIP dataset in the data table
    (controls whose names contain 'WCE' or 'INPUT' are excluded), waits for
    each dataset's summit file to appear, then formats the MACS output.

    dataFile: path to a pipeline_dfci-format data table.
    Exits the interpreter if any dataset's peak calling times out.
    '''
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    #controls ('WCE'/'INPUT') are not peak-called
    namesList = [name for name in dataDict.keys() if name.upper().count('WCE') == 0 and name.upper().count('INPUT') == 0]
    namesList.sort()
    print(namesList)
    pipeline_dfci.callMacs(dataFile,macsFolder,namesList,overwrite=False,pvalue='1e-9')
    os.chdir(projectFolder) # the silly call macs script has to change into the output dir
    #so this takes us back to the project folder

    #to check for completeness, wait for every summit file to show up.
    #the original reloaded the data table inside a while loop and toggled a
    #flag on the first hit; a single pass over the (unchanged) name list is
    #equivalent: every file must appear or we exit.
    for name in namesList:
        peak_path = '%s%s/%s_summits.bed' % (macsFolder,name,name)
        print('searching for %s' % (peak_path))
        if utils.checkOutput(peak_path,1,180):
            print('found %s' % (peak_path))
        else:
            print('Error: peak calling timed out')
            sys.exit()

    #now format the macs output
    print('formatting macs output')
    pipeline_dfci.formatMacsOutput(dataFile,macsFolder,macsEnrichedFolder,wiggleFolder,wigLink ='',useBackground=True)
    print('Finished running Macs 1.4.2')
Пример #2
0
def wrapGeneMapper(data_file,names_list=None,launch=True):

    '''
    runs ROSE2 GENE MAPPER on the AllEnhancers table

    data_file: pipeline_dfci data table path
    names_list: datasets to map; defaults to all H3K27AC datasets
    launch: when True, immediately run each generated .sh script
    '''
    #avoid a shared mutable default argument
    if names_list is None:
        names_list = []

    data_dict = pipeline_dfci.loadDataTable(data_file)
    parent_rose_folder = utils.formatFolder('%srose_final' % (projectFolder),False)

    if len(names_list) == 0:
        names_list = [name for name in data_dict.keys() if name.upper().count('H3K27AC') == 1]

    #find each individual all enhancer table and then call the mapper via an .sh script
    for name in names_list:
        print(name)
        dataset_rose_folder = utils.formatFolder('%s%s_ROSE' % (parent_rose_folder,name),False)
        all_enhancer_path = '%s%s_peaks_AllEnhancers.table.txt' % (dataset_rose_folder,name)

        mapper_bash_path = '%s%s_geneMapper.sh' % (dataset_rose_folder,name)
        #with-block guarantees the script is flushed and closed before launch
        with open(mapper_bash_path,'w') as mapper_bash_file:
            mapper_bash_file.write('#!/usr/bin/bash\n\n\n\n')
            mapper_bash_file.write('#Running ROSE2 GENE MAPPER ON %s ALL ENHANCERS OUTPUT\n\n' % (name))
            mapper_cmd = 'python %sROSE2_geneMapper.py -g %s -i %s -f -w 100000' % (pipeline_dir,genome,all_enhancer_path)
            mapper_bash_file.write(mapper_cmd+'\n')
        print('wrote gene mapper command to %s' % (mapper_bash_path))
        if launch:
            os.system('bash %s' % mapper_bash_path)
Пример #3
0
def makeBoxPlot(dataFile,set_name,gff_name,names_list=None):

    '''
    wrapping the boxplot script

    dataFile: pipeline_dfci data table path
    set_name: dataset-name substring used both for the default name filter
        and the plot name
    gff_name: gff prefix of the precomputed signal table
    names_list: datasets to plot; defaults to all datasets whose name
        contains set_name
    '''
    #avoid a shared mutable default argument
    if names_list is None:
        names_list = []

    boxplot_script_path = '%sr_scripts/4_chiprx_plots.R' % (projectFolder)
    scale_table_path = '%sHG19_SHEP21_CHIPRX_SCALE_FACTORS.txt' % (tableFolder)

    dataDict = pipeline_dfci.loadDataTable(dataFile)
    dataFile_name = dataFile.split('/')[-1].split('.')[0]
    if len(names_list) == 0:
        names_list = [name for name in dataDict.keys() if name.count(set_name) > 0]
        names_list.sort()

    background_list = [dataDict[name]['background'] for name in names_list]
    names_string = ','.join(names_list)
    background_string = ','.join(background_list)

    #signal table is assumed to be precomputed by the mapping step
    signal_table_path = '%sHG19_%s_%s_SIGNAL.txt' % (signalFolder,gff_name,dataFile_name)

    plot_name = '%s_%s' % (gff_name,set_name)
    r_cmd = 'Rscript %s %s %s %s %s %s %s' % (boxplot_script_path,signal_table_path,scale_table_path,names_string,background_string,plot_name,projectFolder)
    print(r_cmd)

    os.system(r_cmd)
Пример #4
0
def launchCRC(data_file,genome,dynamic_rose_output,group_name,group_list,crc_folder,activity_path):

    '''
    launches CRC analysis on all bams in a group w/ subpeaks
    #how do we get subpeaks piped through?

    genome: object exposing .name() for the CRC -g argument
    group_list: dataset names whose bams are handed to CRC3
    activity_path: optional activity table; appended when non-empty
    '''
    dataDict = pipeline_dfci.loadDataTable(data_file)
    bam_string = ','.join([dataDict[name]['bam'] for name in group_list])

    #set up the crc command
    crc_cmd = 'python CRC3.py -e %s -b %s -g %s -o %s -n %s' % (dynamic_rose_output,bam_string,genome.name(),crc_folder,group_name)

    if len(activity_path) > 0:
        crc_cmd += ' --activity %s' % (activity_path)
    bash_path = '%s%s_crc.sh' % (crc_folder,group_name)
    #with-block guarantees the script hits disk before it is launched
    with open(bash_path,'w') as bash_file:
        bash_file.write('#!/usr/bin/bash\n\n')
        bash_file.write('cd %s\n\n' % (whereAmI))
        bash_file.write(crc_cmd)
        bash_file.write('\n\n')

    print('Wrote CRC command for %s to %s' % (group_name,bash_path))
    print('Launching CRC')
    os.system('bash %s' % (bash_path))
Пример #5
0
def callRoseMerged(dataFile,mergedGFFFile,name1,name2,parentFolder,namesList1,namesList2,useBackground=False):

    '''
    makes a rose call for the merged supers

    ranks off the first dataset of namesList1 only; every dataset from both
    groups (plus backgrounds when useBackground is set) rides along in the
    extra map. exits if useBackground is set and any dataset lacks one.
    '''

    #use the first column as a dummy, then load everything up into the extra map
    #
    roseBashFile = '%s%s_%s_rose.sh' % (parentFolder,name1,name2)
    dataDict = pipeline_dfci.loadDataTable(dataFile)

    #just set the first dataset of namesList1 so the code can run
    #all of the data will be in the extramap
    namesList = [namesList1[0]]

    if useBackground:
        #first check that all datasets have a background
        backgroundList = []
        for name in namesList1 + namesList2:
            backgroundName = dataDict[name]['background']
            #dict.has_key was removed in python 3; 'in' works in both 2 and 3
            if backgroundName in dataDict:
                backgroundList.append(backgroundName)
            else:
                print("ERROR: No background dataset found for %s incompatible with --use-background flag" % (name))
                sys.exit()
        extraMap = namesList1 + namesList2 + backgroundList
    else:

        extraMap = namesList1 + namesList2


    return pipeline_dfci.callRose2(dataFile,'',parentFolder,namesList,extraMap,mergedGFFFile,tss=0,stitch=0,bashFileName=roseBashFile,mask='',useBackground=False) #don't want additional background correction from the pipeline wrapper of rose
Пример #6
0
def callRoseMerged(dataFile, mergedGFFFile, name1, name2, parentFolder):
    '''
    makes a rose call for the merged supers

    ranks off name1; name2 (plus its background when both datasets have
    one) rides along in the extra map. exits if exactly one of the two
    datasets has a background.
    '''

    dataDict = pipeline_dfci.loadDataTable(dataFile)

    backgroundName1 = dataDict[name1]['background']
    backgroundName2 = dataDict[name2]['background']
    #dict.has_key was removed in python 3; 'in' works in both 2 and 3
    if backgroundName1 in dataDict and backgroundName2 in dataDict:
        hasBackground = True
    elif backgroundName1 not in dataDict and backgroundName2 not in dataDict:
        hasBackground = False
    else:
        print("ERROR: Only 1 dataset has a background file. This is a very very bad idea")
        sys.exit()

    if hasBackground:
        namesList = [name1]
        extraMap = [name2, dataDict[name2]['background']]
    else:
        namesList = [name1]
        extraMap = [name2]

    return pipeline_dfci.callRose2(dataFile,
                                   '',
                                   parentFolder,
                                   namesList,
                                   extraMap,
                                   mergedGFFFile,
                                   tss=0,
                                   stitch=0)
Пример #7
0
def make_nb_active_gene_lists(nb_all_chip_dataFile):

    '''
    Derives NB active gene lists from H3K27AC promoter enrichment:
    one union list across the four NB lines, then one list per dataset.
    '''
    pipeline_dfci.makeGeneGFFs(annotFile,gffFolder,species=genome.upper())

    data_dict = pipeline_dfci.loadDataTable(nb_all_chip_dataFile)
    set_name = 'NB_TSS_H3K27AC'
    gff_list = ['%sHG19_TSS_ALL_-1000_+1000.gff' % (gffFolder)]
    cell_type_list = ['BE2C','KELLY','NGP','SHEP21']
    k27ac_names = [name for name in data_dict.keys() if name.count('H3K27AC') == 1]

    pipeline_dfci.mapEnrichedToGFF(nb_all_chip_dataFile,set_name,gff_list,cell_type_list,macsEnrichedFolder,mappedEnrichedFolder,True,k27ac_names,useBackground=True)

    #same mapped file drives both the union list and the per-dataset lists
    mapped_enriched_file = '%sHG19_TSS_ALL_-1000_+1000/HG19_TSS_ALL_-1000_+1000_NB_TSS_H3K27AC.txt' % (mappedEnrichedFolder)

    #the setList argument defines the overlap logic for promoters:
    #one sub-list per dataset asks for the union of all datasets
    union_set_list = [['BE2C_H3K27AC'],['KELLY_H3K27AC'],['NGP_H3K27AC'],['SHEP21_0HR_H3K27AC_NOSPIKE']]
    union_output = '%sgeneListFolder/HG19_NB_H3K27AC_ACTIVE_UNION.txt' % (projectFolder)
    pipeline_dfci.makeGFFListFile(mapped_enriched_file,union_set_list,union_output,annotFile)

    #one active-gene list per individual NB dataset
    for dataset_name in ['BE2C_H3K27AC','KELLY_H3K27AC','NGP_H3K27AC','SHEP21_0HR_H3K27AC_NOSPIKE']:
        single_output = '%sgeneListFolder/HG19_%s_ACTIVE.txt' % (projectFolder,dataset_name)
        pipeline_dfci.makeGFFListFile(mapped_enriched_file,[[dataset_name]],single_output,annotFile)
Пример #8
0
def callRoseMerged(dataFile,mergedGFFFile,name1,name2,parentFolder):

    '''
    makes a rose call for the merged supers

    ranks off name1 and maps name2 (plus its background when both datasets
    have one) via the extra map; exits if only one side has a background.
    '''

    dataDict = pipeline_dfci.loadDataTable(dataFile)


    backgroundName1 = dataDict[name1]['background']
    backgroundName2 = dataDict[name2]['background']
    #dict.has_key was removed in python 3; 'in' works in both 2 and 3
    if backgroundName1 in dataDict and backgroundName2 in dataDict:
        hasBackground = True
    elif backgroundName1 not in dataDict and backgroundName2 not in dataDict:
        hasBackground = False
    else:
        print("ERROR: Only 1 dataset has a background file. This is a very very bad idea")
        sys.exit()


    if hasBackground:
        namesList = [name1]
        extraMap = [name2,dataDict[name2]['background']]
    else:
        namesList = [name1]
        extraMap = [name2]


    return pipeline_dfci.callRose2(dataFile,'',parentFolder,namesList,extraMap,mergedGFFFile,tss=0,stitch=0)
Пример #9
0
def map_regions(dataFile,gffList,names_list=None):

    '''
    making a normalized binding signal table at all regions

    dataFile: pipeline_dfci data table path
    gffList: gff files to quantify signal over
    names_list: datasets to quantify; defaults to every dataset in the table
    '''
    #avoid a shared mutable default argument
    if names_list is None:
        names_list = []

    #since each bam has different read lengths, important to carefully normalize quantification
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    dataFile_name = dataFile.split('/')[-1].split('.')[0]

    if len(names_list) == 0:
        #list() so the result is sortable on python 3 as well
        names_list = list(dataDict.keys())
    #sorted copy: don't mutate the caller's list in place
    names_list = sorted(names_list)

    for name in names_list:
        bam = utils.Bam(dataDict[name]['bam'])
        read_length = bam.getReadLengths()[0]
        #pad every read out to a uniform 200bp fragment
        bam_extension = 200-read_length
        print('For dataset %s using an extension of %s' % (name,bam_extension))
        pipeline_dfci.mapBamsBatch(dataFile,gffList,mappedFolder,overWrite =False,namesList = [name],extension=bam_extension,rpm=True)

    #want a signal table of all datasets to each gff
    print('Writing signal tables for each gff:')
    for gffFile in gffList:
        gffName = gffFile.split('/')[-1].split('.')[0]
        signal_table_path = '%s%s_%s_SIGNAL.txt' % (signalFolder,gffName,dataFile_name)
        print(signal_table_path)
        pipeline_dfci.makeSignalTable(dataFile,gffFile,mappedFolder,namesList = names_list,medianNorm=False,output =signal_table_path)
Пример #10
0
    def wrap_enhancer_promoter(dataFile,
                               input_path,
                               activity_path,
                               analysis_name,
                               names_list=None,
                               useBackground=True):
        '''
        runs enhancer promoter on everybody with the conserved regions and union of active genes

        names_list: datasets to include; defaults to every dataset in the table
        useBackground: when True the matched background bams are passed via -c
        returns the path of the written .sh script (the script is not launched)
        '''
        #avoid a shared mutable default argument
        if names_list is None:
            names_list = []

        #hard coded paths
        tads_path = '%shESC_domains_hg19.bed' % (bedFolder)

        #setting the output folder
        ep_folder = utils.formatFolder('%senhancerPromoter/' % (projectFolder),
                                       True)

        dataDict = pipeline_dfci.loadDataTable(dataFile)
        if len(names_list) == 0:
            names_list = [name for name in dataDict.keys()]
            names_list.sort()

        bams_list = [dataDict[name]['bam'] for name in names_list]
        bams_string = ' '.join(bams_list)

        background_names = [
            dataDict[name]['background'] for name in names_list
        ]
        background_list = [
            dataDict[background_name]['bam']
            for background_name in background_names
        ]
        background_string = ' '.join(background_list)

        ep_bash_path = '%s%s_enhancer_promoter.sh' % (ep_folder, analysis_name)
        #with-block guarantees the script is flushed and closed
        with open(ep_bash_path, 'w') as ep_bash:
            ep_bash.write('#!/usr/bin/bash\n\n\n')
            ep_bash.write('#enhancer promoter analysis for %s\n\n' %
                          (analysis_name))

            if useBackground:
                python_cmd = 'python %senhancerPromoter.py -b %s -c %s -g %s -i %s -o %s -a %s --name %s --tads %s --top 2000\n\n' % (
                    pipeline_dir, bams_string, background_string, genome.upper(),
                    input_path, ep_folder, activity_path, analysis_name,
                    tads_path)
            else:
                python_cmd = 'python %senhancerPromoter.py -b %s -g %s -i %s -o %s -a %s --name %s --tads %s --top 2000\n\n' % (
                    pipeline_dir, bams_string, genome.upper(), input_path,
                    ep_folder, activity_path, analysis_name, tads_path)

            ep_bash.write(python_cmd)

        return (ep_bash_path)
Пример #11
0
def makeNameDict(dataFile,roseFolder,namesList=None):

    '''
    for each name, check for the presence of an enriched file or allEnhancer table
    these are the files required for enhancer clustering

    returns a dict: name -> {'enrichedFile': path or '', 'enhancerFile': path or ''}
    exits if a dataset has neither file
    '''
    #avoid a shared mutable default argument
    if namesList is None:
        namesList = []

    dataDict = pipeline_dfci.loadDataTable(dataFile)

    #draw the parent folder from the dataFile
    parentFolder = utils.getParentFolder(dataFile)

    #check to see if a rose folder exists already
    if utils.formatFolder(roseFolder,False):
        roseExists = True
        roseFolder = utils.formatFolder(roseFolder,False)
    else:
        roseExists = False
        roseFolder = utils.formatFolder(roseFolder,True)

    #check namesList to see if datasets exist
    if len(namesList) == 0:
        namesList = [name for name in dataDict.keys() if dataDict[name]['background'] != 'NONE']
        #this filters out control WCE datatsets

    #now check that all of the datasets at a minimum have a rose output OR enriched region file

    nameDict = {}
    for name in namesList:

        #initialize both entries up front so the checks below never KeyError
        #(previously 'enhancerFile' was unset when the rose folder was absent)
        nameDict[name] = {'enrichedFile':'','enhancerFile':''}
        #assumes standard folder structure for enriched file
        enrichedFile = "%smacsEnriched/%s" % (parentFolder,dataDict[name]['enrichedMacs'])
        print(enrichedFile)
        try:
            foo = open(enrichedFile,'r')
            foo.close()
            nameDict[name]['enrichedFile'] = enrichedFile
        except IOError:
            nameDict[name]['enrichedFile'] = ''

        #roseOutput looks for standard format rose output
        #need an allEnhancers table and a region table to proceed
        #if the rose folder doesn't exist, don't bother
        if roseExists:

            roseOutputFiles = os.listdir("%s%s_ROSE" % (roseFolder,name))
            allEnhancerFileList = [x for x in roseOutputFiles if x.count("AllEnhancers.table.txt") == 1 and x[0] != '.' ] #no weird hidden or temp files
            if len(allEnhancerFileList) > 0:
                nameDict[name]['enhancerFile'] = "%s%s_ROSE/%s" % (roseFolder,name,allEnhancerFileList[0])

        if nameDict[name]['enhancerFile'] == '' and nameDict[name]['enrichedFile'] =='':
            print("INSUFFICIENT DATA TO RUN ENHANCER ANALYSIS ON %s. PLEASE MAKE SURE ROSE OUTPUT OR MACS ENRICHED REGION PEAKS FILE EXISTS" % (name))
            sys.exit()
    return nameDict
Пример #12
0
def mapMergedGFF(dataFile,nameDict,mergedGFFFile,analysisName,outputFolder,maskFile):

    '''
    calls rose on the mergedGFFFile for all datasets

    ranks off the first dataset in nameDict; every other dataset (plus its
    background when listed) is mapped via the extra map. returns the path
    to the stitched enhancer region map, exiting on failure.
    '''
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    roseParentFolder = "%srose/" % (outputFolder)
    utils.formatFolder(roseParentFolder,True)
    gffName = mergedGFFFile.split('/')[-1].split('.')[0]
    bashFileName = "%srose/%s_roseCall.sh" % (outputFolder,analysisName)
    #namesList is just the first dataset
    #extrmap will have to have all other datasets + their backgrounds

    #sorted() returns a list: dict views have no .sort() on python 3
    namesList = sorted(nameDict.keys())
    extraMap = []
    for name in namesList[1:]:

        if nameDict[name]['background']:
            backgroundName = dataDict[name]['background']
            #dict.has_key was removed in python 3; 'in' works in both 2 and 3
            if backgroundName in dataDict:
                extraMap+=[name,backgroundName]
            else:
                print("ERROR: UNABLE TO FIND LISTED BACKGROUND DATASET %s FOR %s" % (backgroundName,name))
                sys.exit()
        else:
            extraMap+=[name]

    print(extraMap)

    #first check to see if this has already been done
    mergedRegionMap = "%srose/%s_ROSE/%s_0KB_STITCHED_ENHANCER_REGION_MAP.txt" % (outputFolder,namesList[0],gffName)
    print("LOOKING FOR REGION MAP AT %s" % (mergedRegionMap))

    if utils.checkOutput(mergedRegionMap,1,1):
        print("FOUND PREVIOUS REGION MAP")

        return mergedRegionMap


    bashFileName = pipeline_dfci.callRose2(dataFile,'',roseParentFolder,[namesList[0]],extraMap,mergedGFFFile,0,0,bashFileName,mask=maskFile)

    bashCommand = "bash %s" % (bashFileName)
    #announce before launching so the command is visible even if rose hangs
    print("Running enhancer mapping command:\n%s" % (bashCommand))
    os.system(bashCommand)

    if utils.checkOutput(mergedRegionMap,5,60):
        return mergedRegionMap
    else:
        print("UNABLE TO CALL ROSE ENHANCER MAPPING ON CONSENSUS ENHANCER FILE %s.\nEXITING NOW" % (mergedGFFFile))
        sys.exit()
Пример #13
0
def wrapInvasionBox(data_file,
                    region_prefix,
                    set_name,
                    names_list=None,
                    top=5000,
                    scale_path=''):
    '''
    wrapper for the enhancer invasion boxplots

    names_list: dataset names; the first anchors the analysis and the
        first three must have enhancerPromoter PEAK_TABLE outputs.
        defaults to all datasets whose name contains 'MYC'.
    top: number of top regions passed to the R script
    scale_path: optional scale-factor table; 'NONE' is passed when empty
    '''
    #avoid a shared mutable default argument
    if names_list is None:
        names_list = []

    invasion_script = '%sr_scripts/7_enhancer_invasion_plots.R' % (
        projectFolder)

    #set the scale path default
    if len(scale_path) == 0:
        scale_path = 'NONE'

    dataDict = pipeline_dfci.loadDataTable(data_file)
    if len(names_list) == 0:
        names_list = [
            name for name in dataDict.keys() if name.count('MYC') > 0
        ]
        names_list.sort()

    print('running enhancer invasion analysis on:')
    print(names_list)

    print('anchoring analysis on dataset: %s' % (names_list[0]))

    #need to get paths of the three peak tables
    #assumes formatting and naming conventions of the enhancerPromoter folder (see 5_nb_enhancer_promoter.py)
    peak_0_path = '%senhancerPromoter/%s_%s/%s_%s_PEAK_TABLE.txt' % (
        projectFolder, region_prefix, names_list[0], region_prefix,
        names_list[0])
    peak_1_path = '%senhancerPromoter/%s_%s/%s_%s_PEAK_TABLE.txt' % (
        projectFolder, region_prefix, names_list[1], region_prefix,
        names_list[1])
    peak_2_path = '%senhancerPromoter/%s_%s/%s_%s_PEAK_TABLE.txt' % (
        projectFolder, region_prefix, names_list[2], region_prefix,
        names_list[2])

    analysis_name = '%s_%s' % (region_prefix, set_name)
    print(analysis_name)

    sample_string = ','.join(names_list)
    print(sample_string)

    r_cmd = 'Rscript %s %s %s %s %s %s %s %s %s' % (
        invasion_script, peak_0_path, peak_1_path, peak_2_path, analysis_name,
        sample_string, top, projectFolder, scale_path)

    print(r_cmd)
    os.system(r_cmd)
Пример #14
0
def getMedianSignalEnhancer(enhancerFile, name, dataFile):
    '''
    returns the median enhancer signal of a file
    (signal is column 6 of the table, skipping the 6-line header)
    '''
    #loadDataTable is kept for parity with the sibling helpers
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    data_rows = utils.parseTable(enhancerFile, '\t')[6:]

    signal_values = []
    for data_row in data_rows:
        signal_values.append(float(data_row[6]))

    return numpy.median(signal_values)
Пример #15
0
def define_enhancer_landscape(mouse_dataFile, analysisName, namesList=None):
    '''
    define enhancers using h3k27ac in the 3 datasets that look good:
    CG, SCG, THMYCN_139076 using regular ROSE2

    namesList: datasets to use; defaults to all H3K27AC datasets
    returns (bashFileName, region_map_path, namesList)
    '''

    #For SCG baseline
    #no TSS exclusion and no stitching

    #avoid a shared mutable default argument
    if namesList is None:
        namesList = []

    dataDict = pipeline_dfci.loadDataTable(mouse_dataFile)

    if len(namesList) == 0:
        namesList = [
            name for name in dataDict.keys()
            if name.upper().count('H3K27AC') == 1
        ]

    bamFileList = [dataDict[name]['bam'] for name in namesList]
    #str.join replaces string.join, which was removed in python 3
    bamString = ','.join(bamFileList)

    controlBams = [dataDict[name]['background'] for name in namesList]
    controlFileList = [dataDict[name]['bam'] for name in controlBams]
    controlBamString = ','.join(controlFileList)

    bedFileList = [
        macsEnrichedFolder + dataDict[name]['enrichedMacs']
        for name in namesList
    ]
    bedString = ','.join(bedFileList)

    outputFolder = '%s%s/' % (metaRoseFolder, analysisName)
    bashFileName = '%s%s_meta_rose.sh' % (metaRoseFolder, analysisName)

    #with-block guarantees the script is flushed and closed before launch
    with open(bashFileName, 'w') as bashFile:
        bashFile.write('#!/usr/bin/bash\n\n')
        bashFile.write('cd %s\n' % (pipeline_dir))

        metaRoseCmd = 'python %sROSE2_META.py -g mm9 -i %s -r %s -c %s -o %s -n %s' % (
            pipeline_dir, bedString, bamString, controlBamString, outputFolder,
            analysisName)

        bashFile.write(metaRoseCmd + '\n')

    region_map_path = '%s%s/%s_AllEnhancers.table.txt' % (
        metaRoseFolder, analysisName, analysisName)

    #runs only if no output detected
    if not utils.checkOutput(region_map_path, 0, 0):
        print(bashFileName)
        os.system('bash %s' % (bashFileName))
    return bashFileName, region_map_path, namesList
Пример #16
0
def callRoseMerged(dataFile,mergedGFFFile,name1,name2,parentFolder):

    '''
    makes a rose call for the merged supers
    (ranks off name1; name2 and its background ride along in the extra map)
    '''

    dataDict = pipeline_dfci.loadDataTable(dataFile)

    rank_names = [name1]
    extra_names = [name2, dataDict[name2]['background']]

    return pipeline_dfci.callRose(dataFile,'',parentFolder,rank_names,extra_names,mergedGFFFile,tss=0,stitch=0)
Пример #17
0
def getSignalVector(regionFile, name, dataFile):
    '''
    returns the per-region signal column for dataset *name*; the column
    is located by the dataset's bam file name in the table header
    '''
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    regionTable = utils.parseTable(regionFile, '\t')

    bam_basename = dataDict[name]['bam'].split('/')[-1]
    signal_col = regionTable[0].index(bam_basename)

    signalVector = []
    for data_row in regionTable[1:]:
        signalVector.append(float(data_row[signal_col]))

    return signalVector
Пример #18
0
def getMedianSignalEnhancer(enhancerFile,name,dataFile):

    '''
    returns the median enhancer signal of a file
    (signal is column 6; the first 6 rows are header/metadata and skipped)
    '''
    #loadDataTable is kept for parity with the sibling helpers
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    table_body = utils.parseTable(enhancerFile,'\t')[6:]

    signal_column = [float(table_row[6]) for table_row in table_body]

    return numpy.median(signal_column)
Пример #19
0
def getSignalVector(regionFile,name,dataFile):

    '''
    pulls the signal column for dataset *name* out of a tab-delimited
    region table; the column is found via the dataset's bam file name
    in the header row
    '''
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    table_rows = utils.parseTable(regionFile,'\t')
    header_row = table_rows[0]
    body_rows = table_rows[1:]

    bamName = dataDict[name]['bam'].split('/')[-1]
    column_index = header_row.index(bamName)

    return [float(body_row[column_index]) for body_row in body_rows]
Пример #20
0
def getMedianSignal(enhancerFile, name, dataFile):

    """
    returns the median enhancer signal of a file

    when the dataset's background is present in the data table the signal
    is background-subtracted (column 6 minus column 7); the first 6 rows
    are skipped as header/metadata
    """
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    enhancerTable = utils.parseTable(enhancerFile, "\t")

    backgroundName = dataDict[name]["background"]
    #dict.has_key was removed in python 3; 'in' works in both 2 and 3
    if backgroundName in dataDict:
        enhancerVector = [float(line[6]) - float(line[7]) for line in enhancerTable[6:]]
    else:
        enhancerVector = [float(line[6]) for line in enhancerTable[6:]]

    median = numpy.median(enhancerVector)

    return median
Пример #21
0
def summarizeData(dataFile,output ='',namesList=None):

    '''
    writes a summary table (name, total reads, mapped reads, peak count)
    for each dataset in the data table

    output: destination path; defaults to <dataFile>_SUMMARY.txt
    namesList: datasets to summarize; defaults to every dataset
    '''
    #avoid a shared mutable default argument
    if namesList is None:
        namesList = []

    dataDict=pipeline_dfci.loadDataTable(dataFile)

    if len(namesList) == 0:
        namesList = dataDict.keys()

    if len(output) == 0:
        #str.replace works on both python 2 and 3; string.replace() was removed in python 3
        output = dataFile.replace('.txt','_SUMMARY.txt')

    print('WRITING OUTPUT TO %s' % (output))
    readTable = [['NAME','TOTAL_READS','MAPPED_READS','PEAKS']]

    for name in namesList:
        print('GETTING DATA SUMMARY FOR %s' % (name))

        uniqueID = dataDict[name]['uniqueID']

        #TONY fields: '67' = mapped reads, '68' = total raw reads
        #(presumably; confirm against the TONY database schema)
        mappedReads = round(float(pipeline_dfci.getTONYInfo(uniqueID,'67'))/1000000,2)
        totalRaw = pipeline_dfci.getTONYInfo(uniqueID,'68')
        totalRaw = int(totalRaw.split('::')[0])
        totalReads = round(float(totalRaw)/1000000,2)

        #get the peak count only for datasets with called peaks (H3K27AC/ATAC)
        if name.count('H3K27AC') == 1 or name.count('ATAC') == 1:
            peakCollection = utils.importBoundRegion('%s%s' % (macsEnrichedFolder,dataDict[name]['enrichedMacs']),name)
            peakCount = len(peakCollection)
        else:
            peakCount = 'NA'

        newLine = [name,totalReads,mappedReads,peakCount]
        print(newLine)
        readTable.append(newLine)

    utils.unParseTable(readTable,output,'\t')
Пример #22
0
def define_enhancer_landscape(projectFolder,pipeline_dir,nb_all_chip_dataFile):

    '''
    defines the NB enhancer baseline using H3K27ac chips from NGP, KELLY, BE2C, and SHEP21
    enhancers defined using auto optimized stitching of nearby regions
    w/ a 2.5kb tss exclusion
    uses the meta rose code and writes out a .sh file for reproducibility

    returns (bashFileName, region_map_path, namesList); the script is
    written but not launched
    '''

    #For H3K27AC
    #with TSS exclusion and auto stitching

    dataDict = pipeline_dfci.loadDataTable(nb_all_chip_dataFile)
    analysisName = 'NB_H3K27AC'
    namesList = [name for name in dataDict.keys() if name.count('H3K27AC') == 1]

    bamFileList = [dataDict[name]['bam'] for name in namesList]
    #str.join replaces string.join, which was removed in python 3
    bamString = ','.join(bamFileList)

    controlBams = [dataDict[name]['background'] for name in namesList]
    controlFileList = [dataDict[name]['bam'] for name in controlBams]
    controlBamString = ','.join(controlFileList)

    bedFileList = [macsEnrichedFolder + dataDict[name]['enrichedMacs'] for name in namesList]
    bedString = ','.join(bedFileList)

    roseFolder = '%smeta_rose/' % (projectFolder)
    roseFolder = utils.formatFolder(roseFolder,True)

    outputFolder = '%s%s/' % (roseFolder,analysisName)
    bashFileName = '%s%s_meta_rose.sh' % (roseFolder,analysisName)

    #with-block guarantees the script is flushed and closed
    with open(bashFileName,'w') as bashFile:
        bashFile.write('#!/usr/bin/bash\n\n')
        bashFile.write('cd %s\n' % (pipeline_dir))

        metaRoseCmd = 'python %sROSE2_META.py -g hg19 -i %s -r %s -c %s -o %s -n %s -t 2500 --mask %s' % (pipeline_dir,bedString,bamString,controlBamString,outputFolder,analysisName,maskFile)

        bashFile.write(metaRoseCmd + '\n')

    #the 4KB parameter is 
    region_map_path = '%s%s/%s_AllEnhancers.table.txt' % (roseFolder,analysisName,analysisName)
    return bashFileName,region_map_path,namesList
Пример #23
0
def getMedianSignal(enhancerFile, name, dataFile):
    '''
    returns the median enhancer signal of a file

    when the dataset's background is present in the data table the signal
    is background-subtracted (column 6 minus column 7); the first 6 rows
    are skipped as header/metadata
    '''
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    enhancerTable = utils.parseTable(enhancerFile, '\t')

    backgroundName = dataDict[name]['background']
    #dict.has_key was removed in python 3; 'in' works in both 2 and 3
    if backgroundName in dataDict:
        enhancerVector = [
            float(line[6]) - float(line[7]) for line in enhancerTable[6:]
        ]
    else:
        enhancerVector = [float(line[6]) for line in enhancerTable[6:]]

    median = numpy.median(enhancerVector)

    return median
Пример #24
0
    def wrapRose2Meta(data_file,
                      input_path,
                      parent_folder,
                      active_gene_path='',
                      rank_list=None,
                      control_list=None,
                      analysis_name=''):
        '''
        quick wrapper for Rose2Meta

        rank_list: datasets whose bams rank the input regions
        control_list: matched control datasets (bams passed via -c)
        active_gene_path: when non-empty, geneMapper is restricted to this list
        writes (but does not launch) a .sh with the ROSE2_META and
        geneMapper commands and prints its path
        '''
        #avoid shared mutable default arguments
        rank_list = [] if rank_list is None else rank_list
        control_list = [] if control_list is None else control_list

        dataDict = pipeline_dfci.loadDataTable(data_file)
        rank_string = ','.join([dataDict[name]['bam'] for name in rank_list])
        control_string = ','.join(
            [dataDict[name]['bam'] for name in control_list])

        output_folder = utils.formatFolder(
            '%s%s' % (parent_folder, analysis_name), True)
        rose2_meta_cmd = '%s %sROSE2_META.py -g %s -i %s -r %s -c %s -n %s -o %s -s 0 -t 0 --mask %s' % (
            py27_path, pipeline_dir, genome, input_path, rank_string,
            control_string, analysis_name, output_folder, blacklist_path)

        all_enhancer_path = '%s%s_AllEnhancers.table.txt' % (output_folder,
                                                             analysis_name)

        if active_gene_path != '':
            rose2_map_cmd = '%s %sROSE2_geneMapper.py -g %s -i %s -l %s' % (
                py27_path, pipeline_dir, genome, all_enhancer_path,
                active_gene_path)
        else:
            rose2_map_cmd = '%s %sROSE2_geneMapper.py -g %s -i %s' % (
                py27_path, pipeline_dir, genome, all_enhancer_path)

        rose_bash_path = '%s%s_rose2_meta.sh' % (parent_folder, analysis_name)
        #with-block guarantees the script is flushed and closed
        with open(rose_bash_path, 'w') as rose_bash:
            #bash shebang: this is a shell script (was '#!/usr/bin/python',
            #matching none of the other .sh writers in this file)
            rose_bash.write('#!/usr/bin/bash\n\n')
            rose_bash.write('#setting up bamliquidator\n')

            rose_bash.write('\n\n#ROSE2_CMD\n')
            rose_bash.write(rose2_meta_cmd + '\n')
            rose_bash.write(rose2_map_cmd + '\n')

        print('Wrote ROSE2 META CMD to %s' % (rose_bash_path))
Пример #25
0
def map_shep_enhancers(shep_on_dataFile):
    '''
    for enhancers in individual systems defined by k27ac

    writes a rose2 bash script for the three SHEP ON H3K27AC timepoints
    and returns its path
    '''
    dataDict = pipeline_dfci.loadDataTable(shep_on_dataFile)

    #NOTE(review): this full key list is printed for inspection but then
    #replaced by the hard-coded timepoint list below
    all_names = dataDict.keys()
    print(all_names)

    rose_parent = utils.formatFolder('%senhancer_rose' % (projectFolder),True)

    bashFileName = '%senhancer_rose/shep_on_enhancer_rose.sh' %(projectFolder)

    timepoint_names = ['SHEP_0HR_H3K27AC','SHEP_2HR_H3K27AC','SHEP_6HR_H3K27AC']

    pipeline_dfci.callRose2(shep_on_dataFile,macsEnrichedFolder,rose_parent,timepoint_names,[],'',2500,'',bashFileName,maskFile)

    return bashFileName
Пример #26
0
def map_for_heatmap(mouse_dataFile):
    '''
    to make quantification easier, all bams read lengths extended to 200
    '''

    dataDict = pipeline_dfci.loadDataTable(mouse_dataFile)

    #gff windows: +/-5kb around conserved NB MYCN promoters and enhancers
    promoter_gff = '%sMM9_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000.gff' % (
        gffFolder)
    enhancer_gff = '%sMM9_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000.gff' % (
        gffFolder)

    #setting the list of gff's to map
    gff_list = [promoter_gff, enhancer_gff]

    cell_types = ['CG', 'SCG', 'THMYCN1', 'THMYCN2', 'THMYCN']
    datasets_to_map = [
        'CG_H3K27Ac',
        'SCG_H3K27Ac',
        'THMYCN1_H3K27Ac',
        'THMYCN_139423_H3K27Ac',
        'THMYCN_139076_H3K27Ac',
        'THMYCN2_MYCN',
        'THMYCN_139076_MYCN',
        'THMYCN_139423_MYCN',
    ]

    #for the non spike in
    #note, this data is 75bp reads; 75 + 125 extension = 200bp fragments
    pipeline_dfci.mapBams(mouse_dataFile,
                          cell_types,
                          gff_list,
                          mappedFolder,
                          nBin=200,
                          overWrite=False,
                          rpm=True,
                          nameList=datasets_to_map,
                          extension=125)
Пример #27
0
def mapMergedGFF(dataFile,nameDict,mergedGFFFile,analysisName,outputFolder):

    '''
    Calls ROSE on the mergedGFFFile for all datasets.

    The alphabetically first dataset becomes the primary ROSE dataset; every
    other dataset plus its background is mapped through the extra map.
    Returns the path to the 0KB stitched enhancer region map, exiting the
    process if ROSE fails to produce it.
    '''
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    roseParentFolder = "%srose/" % (outputFolder)
    gffName = mergedGFFFile.split('/')[-1].split('.')[0]
    bashFileName = "%srose/%s_roseCall.sh" % (outputFolder,analysisName)

    # sort for determinism: namesList[0] is the primary ROSE dataset and is
    # baked into the region-map path below (matches the later maskFile-aware
    # version of this function, which also sorts)
    namesList = list(nameDict.keys())
    namesList.sort()

    # extra map carries all remaining datasets + their backgrounds
    extraMap = []
    for name in namesList[1:]:
        backgroundName = dataDict[name]['background']
        extraMap+=[name,backgroundName]

    # first check to see if this has already been done
    mergedRegionMap = "%srose/%s_ROSE/%s_0KB_STITCHED_ENHANCER_REGION_MAP.txt" % (outputFolder,namesList[0],gffName)
    if utils.checkOutput(mergedRegionMap,1,1):
        return mergedRegionMap

    bashFileName = pipeline_dfci.callRose(dataFile,'',roseParentFolder,[namesList[0]],extraMap,mergedGFFFile,0,0,bashFileName)

    bashCommand = "bash %s" % (bashFileName)
    # announce the command before blocking on it (previously printed after)
    print("Running enhancer mapping command:\n%s" % (bashCommand))
    os.system(bashCommand)

    # wait for the region map to appear before returning
    if utils.checkOutput(mergedRegionMap,5,60):
        return mergedRegionMap
    else:
        print("UNABLE TO CALL ROSE ENHANCER MAPPING ON CONSENSUS ENHANCER FILE %s.\nEXITING NOW" % (mergedGFFFile))
        sys.exit()
Пример #28
0
def define_myc_landscape(projectFolder,pipeline_dir,shep_on_dataFile):

    '''
    Defines the MYC baseline in the SHEP ON system across the union of all
    time points. Uses the meta ROSE code with no TSS exclusion and no
    stitching, and writes out a .sh file for reproducibility.

    Returns (bash script path, expected region map path, dataset names).
    '''

    dataDict = pipeline_dfci.loadDataTable(shep_on_dataFile)
    analysisName = 'SHEP_ON_MYC'

    # all MYC chip datasets in the table
    namesList = [name for name in dataDict.keys() if name.count('MYC') == 1]

    # comma-joined bam paths for the MYC datasets
    bamString = ','.join([dataDict[name]['bam'] for name in namesList])

    # matching background bams, in the same order
    backgroundNames = [dataDict[name]['background'] for name in namesList]
    controlBamString = ','.join([dataDict[bg]['bam'] for bg in backgroundNames])

    # macs enriched beds for each dataset
    bedString = ','.join([macsEnrichedFolder + dataDict[name]['enrichedMacs']
                          for name in namesList])

    roseFolder = utils.formatFolder('%smeta_rose/' % (projectFolder),True)

    outputFolder = '%s%s/' % (roseFolder,analysisName)
    bashFileName = '%s%s_meta_rose.sh' % (roseFolder,analysisName)

    # write the meta rose command to a bash script (-t 0 -s 0: no TSS
    # exclusion, no stitching)
    metaRoseCmd = 'python %sROSE2_META.py -g hg19 -i %s -r %s -c %s -o %s -n %s -t 0 -s 0 --mask %s' % (pipeline_dir,bedString,bamString,controlBamString,outputFolder,analysisName,maskFile)

    bashFile = open(bashFileName,'w')
    bashFile.write('#!/usr/bin/bash\n\n')
    bashFile.write('cd %s\n' % (pipeline_dir))
    bashFile.write(metaRoseCmd + '\n')
    bashFile.close()

    # this is the expected region map output
    region_map_path = '%s%s/%s_0KB_STITCHED_ENHANCER_REGION_MAP.txt' % (roseFolder,analysisName,analysisName)
    return bashFileName,region_map_path,namesList
Пример #29
0
def plot_mm_genes(mm1s_dataFile, nb_figure_gff_path, bed_string):
    '''
    Plots MM1S tracks (MYC and H3K27AC groups) over the NB figure genes gff.
    '''

    # establish the plot folder and a common plot name prefix
    plotFolder = utils.formatFolder('%sMM1S/' % (genePlotFolder), True)
    plot_prefix = 'HG19_NB_FIGURE_GENES'

    # set the extension so every dataset reaches an effective 200bp read
    # length; read length taken from the first dataset's bam
    dataDict = pipeline_dfci.loadDataTable(mm1s_dataFile)
    first_name = list(dataDict.keys())[0]
    first_bam = utils.Bam(dataDict[first_name]['bam'])
    bam_extension = 200 - first_bam.getReadLengths()[0]
    print('For datasets in %s using an extension of %s' %
          (mm1s_dataFile, bam_extension))

    # one multi-track plot per data group
    for plot_group in ['MYC', 'H3K27AC']:
        plotList = [name for name in dataDict.keys()
                    if name.count(plot_group) > 0]
        plotName = '%s_MM1S_%s' % (plot_prefix, plot_group)
        print(plotName)
        pipeline_dfci.callBatchPlot(mm1s_dataFile,
                                    nb_figure_gff_path,
                                    plotName,
                                    plotFolder,
                                    plotList,
                                    uniform=True,
                                    bed=bed_string,
                                    plotType='MULTIPLE',
                                    extension=bam_extension,
                                    multiPage=False,
                                    debug=False,
                                    nameString='',
                                    rpm=True,
                                    rxGenome='')
Пример #30
0
def launchDynamicRose(analysis_name,group1_name,group2_name,group1_list,group2_list,meta_rose_folder,dynamic_rose_folder,genome,data_file,activity_path,inputGFF):

    '''
    Writes and launches a dynamicEnhancer_meta.py command comparing the meta
    ROSE outputs of group1 vs group2.
    '''

    project_folder = utils.formatFolder(os.path.abspath(utils.getParentFolder(data_file)),False)
    dataDict = pipeline_dfci.loadDataTable(data_file)
    genome_build = genome.name()

    # the two meta rose outputs, joined for the -r flag
    rose_outputs = [utils.formatFolder(meta_rose_folder + group1_name,True),
                    utils.formatFolder(meta_rose_folder + group2_name,True)]
    meta_rose_string = ','.join(rose_outputs)

    # make sure the output folder exists
    dynamic_rose_folder = utils.formatFolder(dynamic_rose_folder,True)

    dynamic_cmd = 'python %sdynamicEnhancer_meta.py -g %s -d %s -r %s -o %s --group1 %s --group2 %s --name1 %s --name2 %s -p -m' % (pipeline_dir,genome_build,data_file,meta_rose_string,dynamic_rose_folder,','.join(group1_list),','.join(group2_list),group1_name,group2_name)

    # optional gff restricting the analyzed input regions
    if len(inputGFF) > 0:
        dynamic_cmd += ' --input %s' % (inputGFF)

    # write the command into a bash script for reproducibility, then run it
    bash_path ='%s%s_dynamic_meta.sh' % (dynamic_rose_folder,analysis_name)
    bash_file = open(bash_path,'w')
    bash_file.write('#!/usr/bin/bash\n\n')
    bash_file.write('cd %s\n\n' % (pipeline_dir))
    bash_file.write(dynamic_cmd)
    bash_file.write('\n\n')
    bash_file.close()

    print('Wrote DYNAMIC_META command for %s to %s' % (analysis_name,bash_path))
    print('Launching DYNAMIC_META_ROSE')
    os.system('bash %s' % (bash_path))
Пример #31
0
def make_summary_table(data_file_list, output, bed_path=''):
    '''
    Exports a table with name, read length, million mapped reads, and peak
    count for every dataset in the given data files.
    '''

    print('WRITING SUMMARY OUTPUT TO %s' % (output))

    if bed_path != '':
        print('COPYING BEDS TO %s' % (bed_path))

    summary_table = [['NAME', 'READ_LENGTH', 'MAPPED_READS', 'PEAKS']]

    for data_file in data_file_list:
        print('GETTING DATA SUMMARY FOR %s' % (data_file))
        dataDict = pipeline_dfci.loadDataTable(data_file)

        for name in sorted(dataDict.keys()):
            print(name)
            uniqueID = dataDict[name]['uniqueID']

            # read length and million mapped reads from the bam itself
            bam = utils.Bam(dataDict[name]['bam'])
            read_length = bam.getReadLengths()[0]
            mmr = round(float(bam.getTotalReads()) / 1000000, 2)

            # peak count from the macs enriched bed; 'NA' if the bed is
            # missing or unreadable
            try:
                peak_path = '%s%s' % (macsEnrichedFolder,
                                      dataDict[name]['enrichedMacs'])
                peakCollection = utils.importBoundRegion(peak_path, name)
                peakCount = len(peakCollection)
            except IOError:
                peakCount = 'NA'

            summary_table.append([name, read_length, mmr, peakCount])

    utils.unParseTable(summary_table, output, '\t')
Пример #32
0
def map_shep_for_heatmap(shep_on_dataFile):

    '''
    Maps SHEP bams (both chiprx and regular chip) onto the conserved MYCN
    region gffs. To make quantification easier, all bam read lengths are
    extended to an effective 200bp.
    '''

    dataDict = pipeline_dfci.loadDataTable(shep_on_dataFile)

    # conserved MYCN gffs (+/-5kb): all / promoter / enhancer regions for
    # the SHEP system, then the same three for SHEP21 0HR nospike
    gffList = ['%sHG19_SHEP_MYCN_CONSERVED_-5kb_+5kb.gff' % (gffFolder),
               '%sHG19_SHEP_MYCN_CONSERVED_PROMOTER_-5kb_+5kb.gff' % (gffFolder),
               '%sHG19_SHEP_MYCN_CONSERVED_ENHANCER_-5kb_+5kb.gff' % (gffFolder),
               '%sHG19_SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_-5kb_+5kb.gff' % (gffFolder),
               '%sHG19_SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_PROMOTER_-5kb_+5kb.gff' % (gffFolder),
               '%sHG19_SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_ENHANCER_-5kb_+5kb.gff' % (gffFolder)]

    cellTypeList = ['SHEP']
    mapList = []  # empty list -> map every dataset

    # for the non spike in; this data is 75bp reads, so extension=125 gives
    # an effective 200bp read length
    pipeline_dfci.mapBams(shep_on_dataFile,
                          cellTypeList,
                          gffList,
                          mappedFolder,
                          nBin=200,
                          overWrite=False,
                          rpm=True,
                          nameList=mapList,
                          extension=125)
Пример #33
0
def launchMetaRose(group_name,group_list,meta_rose_folder,genome,data_file,stitch,tss):

    '''
    Writes and launches a ROSE2_META command over all datasets in group_list.

    stitch: fixed stitch distance passed via -s, or None to let ROSE decide
    tss: tss exclusion window in bp, passed via -t
    '''

    project_folder = utils.formatFolder(os.path.abspath(utils.getParentFolder(data_file)),False)
    macs_folder = '%smacsEnriched/' % (project_folder) #quick hack to make sure input beds are in the right place
    dataDict = pipeline_dfci.loadDataTable(data_file)

    meta_rose_output = utils.formatFolder(meta_rose_folder + group_name,True)

    genome_build = genome.name()

    # comma-joined enriched beds and bams for the group
    input_string = ','.join(['%s%s' % (macs_folder,dataDict[name]['enrichedMacs']) for name in group_list])
    bam_string = ','.join([dataDict[name]['bam'] for name in group_list])

    meta_cmd = 'python %sROSE2_META.py -g %s -i %s -r %s -o %s -n %s -t %s' % (pipeline_dir,genome_build,input_string,bam_string,meta_rose_output,group_name,tss)
    # identity check (not !=) so a stitch of 0 is still passed through
    if stitch is not None:
        meta_cmd += ' -s %s' % (stitch)

    #adding a mask if necessary
    if genome.hasFeature('mask'):
        meta_cmd += ' --mask %s' % (genome.returnFeature('mask'))

    # write the command into a bash script for reproducibility, then run it
    bash_path ='%s%s_meta_rose.sh' % (meta_rose_output,group_name)
    bash_file = open(bash_path,'w')
    bash_file.write('#!/usr/bin/bash\n\n')

    bash_file.write('cd %s\n\n' % (pipeline_dir))
    bash_file.write(meta_cmd)
    bash_file.write('\n\n')

    bash_file.close()

    print('Wrote META_ROSE command for %s to %s' % (group_name,bash_path))
    print('Launching META_ROSE')
    os.system('bash %s' % (bash_path))
Пример #34
0
def map_nb_enhancers(nb_all_chip_dataFile):
    '''
    Calls ROSE2 on the H3K27ac datasets of the individual NB systems to map
    enhancers in each one.

    Writes the ROSE2 call into a .sh script and returns its path.
    '''
    # load table to make sure the data file is valid before writing the script
    dataDict = pipeline_dfci.loadDataTable(nb_all_chip_dataFile)

    parentFolder = '%senhancer_rose' % (projectFolder)
    parentFolder = utils.formatFolder(parentFolder, True)

    bashFileName = '%senhancer_rose/nb_enhancer_rose.sh' % (projectFolder)

    # restrict the analysis to the NB H3K27AC datasets (previously
    # dataDict.keys() was assigned here and immediately overwritten, so the
    # printed list was misleading)
    namesList = [
        'SHEP21_0HR_H3K27AC_NOSPIKE', 'BE2C_H3K27AC', 'KELLY_H3K27AC',
        'NGP_H3K27AC'
    ]
    print(namesList)

    pipeline_dfci.callRose2(nb_all_chip_dataFile, macsEnrichedFolder,
                            parentFolder, namesList, [], '', 2500, '',
                            bashFileName, maskFile)

    return bashFileName
Пример #35
0
def main():
    '''
    Command line wrapper for the cross-sample enhancer comparison pipeline:
    parses options, establishes working files, launches enhancer mapping,
    merges enhancer regions across datasets, quantifies signal, clusters,
    and runs gene mapping on the clustered output.
    '''

    from optparse import OptionParser

    usage = "usage: %prog [options] -d [DATA_FILE] -i [INPUT_LIST] -r [ROSE_FOLDER] -o [OUTPUTFOLDER]"
    parser = OptionParser(usage=usage)
    #required flags
    parser.add_option("-d",
                      "--data",
                      dest="data",
                      nargs=1,
                      default=None,
                      help="Enter a data file for datasets to be processed")
    parser.add_option("-o",
                      "--output",
                      dest="output",
                      nargs=1,
                      default=None,
                      help="specify an output folder to write results to")

    #additional options
    parser.add_option(
        "-i",
        "--input",
        dest="input",
        nargs=1,
        default=None,
        help=
        "Enter a comma separated list of names to analyze. Default will be all datasets"
    )

    parser.add_option("-n",
                      "--name",
                      dest="name",
                      nargs=1,
                      default=None,
                      help="Enter a name for the analysis")

    parser.add_option("-r",
                      "--rose",
                      dest="rose",
                      nargs=1,
                      default=None,
                      help="Enter a folder to detect or write rose output")

    parser.add_option(
        "-a",
        "--all",
        dest="all",
        action='store_true',
        default=False,
        help="flag to run analysis on ALL enhancers (this is much slower)")
    parser.add_option(
        "-s",
        "--stitch",
        dest="stitch",
        nargs=1,
        default='',
        help=
        "specify a fixed stitch distance for all datasets, otherwise will compute stitching automatically on each dataset"
    )
    parser.add_option(
        "-e",
        "--enhancer-type",
        dest="enhancer_type",
        nargs=1,
        default='super',
        help="specify type of enhancer to analyze: super, stretch, superStretch"
    )

    parser.add_option("-t",
                      "--tss",
                      dest="tss",
                      nargs=1,
                      default=2500,
                      help="specify a tss exclusion window. default is 2500bp")

    parser.add_option(
        "--mask",
        dest="mask",
        nargs=1,
        default=None,
        help=
        'Create a mask set of regions to filter out of analysis. must be .bed or .gff format'
    )

    (options, args) = parser.parse_args()

    print(options)
    print(args)

    # -d and -o are the minimum required arguments
    if options.data and options.output:

        #check to see if minimum arguments are met

        #pull in arguments

        #pull in the datafile and create a datadict
        dataFile = options.data

        #now the output folder
        outputFolder = utils.formatFolder(
            options.output, True)  #check and create the output folder
        #now the rose folder
        if options.rose:
            roseFolder = options.rose
        else:
            roseFolder = "%srose/" % (outputFolder)

        if options.input:
            namesList = options.input.split(',')
        else:
            namesList = []

        #get the genome
        # taken from the first dataset; assumes all datasets share one genome
        dataDict = pipeline_dfci.loadDataTable(dataFile)
        genome = dataDict[dataDict.keys()[0]]['genome']

        #check if using only supers
        if options.all:
            superOnly = False
        else:
            superOnly = True

        #get the anlysis name
        if options.name:
            analysisName = options.name
        else:
            analysisName = "enhancers"

        #check for a stitching parameter
        if len(str(options.stitch)) > 0:
            stitch = str(options.stitch)
        else:
            stitch = ''

        #check for the tss parameter
        tssDistance = int(options.tss)

        #check enhancer type
        enhancerType = string.lower(options.enhancer_type)
        if ['super', 'superstretch', 'stretch'].count(enhancerType) == 0:
            print("ERROR: unsupported enhancer type %s" % (enhancerType))
            sys.exit()

        #see if there's a mask
        if options.mask:
            maskFile = options.mask
        else:
            maskFile = ''

        #=====================================================
        #=================SUMMARIZE INPUTS====================
        #=====================================================

        print "WORKING IN GENOME %s" % (genome)
        print "DRAWING DATA FROM %s AND ROSE FOLDER %s" % (dataFile,
                                                           roseFolder)
        print "USING %s AS THE OUTPUT FOLDER" % (outputFolder)

        #=====================================================
        #==============ESTABLISH ALL WORKING FILES============
        #=====================================================

        print "\n\n\nESTABLISHING WORKING FILES"
        nameDict = makeNameDict(dataFile, roseFolder, namesList, enhancerType)

        print nameDict

        print "STARTING ANALYSIS ON THE FOLLOWING DATASETS:"
        print nameDict.keys()

        # warn (but do not stop) on datasets lacking ROSE output
        for name in nameDict.keys():
            if len(nameDict[name]['enhancerFile']) == 0:
                print("NO ROSE OUTPUT FOR %s" % (name))

        #sys.exit()
        #=====================================================
        #==============LAUNCH ENHANCER MAPPING================
        #=====================================================

        print "\n\n\nLAUNCHING ENHANCER MAPPING (IF NECESSARY)"
        nameDict = launchEnhancerMapping(dataFile, nameDict, outputFolder,
                                         roseFolder, stitch, tssDistance,
                                         enhancerType, maskFile)
        print nameDict

        #sys.exit()

        #=====================================================
        #====================GET MEDIAN SIGNAL================
        #=====================================================

        print "\n\n\nGETTING MEDIAN ENHANCER SIGNAL FROM EACH SAMPLE"
        medianDict = makeMedianDict(nameDict)

        print medianDict
        #sys.exit()
        #=====================================================
        #====================MERGING ENHANCERS================
        #=====================================================

        print "\n\n\nIDENTIFYING CONSENSUS ENHANCER REGIONS"

        mergedGFFFile = "%s%s_%s_-0_+0.gff" % (outputFolder, genome,
                                               analysisName)
        mergedGFFFile = mergeCollections(nameDict, analysisName, mergedGFFFile,
                                         superOnly)

        #sys.exit()

        #=====================================================
        #===============MAP TO MERGED REGIONS=================
        #=====================================================

        print "\n\n\nMAPPING DATA TO CONSENSUS ENHANCER REGIONS"
        mergedRegionMap = mapMergedGFF(dataFile, nameDict, mergedGFFFile,
                                       analysisName, outputFolder, maskFile)

        #=====================================================
        #==============CORRECT FOR MEDIAN SIGNAL==============
        #=====================================================

        print "\n\n\nCREATING ENHANCER SIGNAL TABLE"
        signalTableFile = makeEnhancerSignalTable(nameDict, mergedRegionMap,
                                                  medianDict, analysisName,
                                                  genome, outputFolder)

        #=====================================================
        #===============CALL CLUSTERING R SCRIPT==============
        #=====================================================

        print "\n\n\nGENERATING CLUSTERING OUTPUT"
        clusterTableFile = callRScript(genome, outputFolder, analysisName,
                                       signalTableFile)
        #output should be
        #png of cluster gram with rows as genes
        #png of cluster gram of samples w/ tree
        #ordered table w/ cluster assignment
        #similarity matrix for samples
        #sys.exit()
        #=====================================================
        #=============GENE MAPPING BY CLUSTER=================
        #=====================================================

        # NOTE(review): hard-coded pipeline install path; breaks on other hosts
        os.chdir('/ark/home/cl512/pipeline/')
        cmd = 'python /ark/home/cl512/pipeline/ROSE2_geneMapper.py -g %s -i %s' % (
            genome, clusterTableFile)
        os.system(cmd)

        print "FINISHED"

    else:
        # missing required arguments: show usage and bail
        parser.print_help()
        sys.exit()
Пример #36
0
def mapMergedGFF(dataFile, nameDict, mergedGFFFile, analysisName, outputFolder,
                 maskFile):
    '''
    calls rose on the mergedGFFFile for all datasets

    The alphabetically first dataset is the primary ROSE dataset; all other
    datasets (plus their backgrounds, where listed) are mapped via extraMap.
    Returns the path to the 0KB stitched enhancer region map, exiting the
    process if ROSE fails to produce it.
    '''
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    roseParentFolder = "%srose/" % (outputFolder)
    utils.formatFolder(roseParentFolder, True)
    gffName = mergedGFFFile.split('/')[-1].split('.')[0]
    bashFileName = "%srose/%s_roseCall.sh" % (outputFolder, analysisName)
    #namesList is just the first dataset
    #extrmap will have to have all other datasets + their backgrounds

    # sorted so namesList[0] (the primary dataset) is deterministic
    namesList = nameDict.keys()
    namesList.sort()
    extraMap = []
    for name in namesList[1:]:

        # NOTE(review): the background flag is read from nameDict but the
        # background *name* comes from dataDict -- confirm the two stay in sync
        if nameDict[name]['background']:
            backgroundName = dataDict[name]['background']
            if dataDict.has_key(backgroundName):
                extraMap += [name, backgroundName]
            else:
                print "ERROR: UNABLE TO FIND LISTED BACKGROUND DATASET %s FOR %s" % (
                    backgroundName, name)
                sys.exit()
        else:
            extraMap += [name]

    print extraMap

    #first check to see if this has already been done
    mergedRegionMap = "%srose/%s_ROSE/%s_0KB_STITCHED_ENHANCER_REGION_MAP.txt" % (
        outputFolder, namesList[0], gffName)
    print("LOOKING FOR REGION MAP AT %s" % (mergedRegionMap))

    if utils.checkOutput(mergedRegionMap, 1, 1):
        print("FOUND PREVIOUS REGION MAP")

        return mergedRegionMap

    bashFileName = pipeline_dfci.callRose2(dataFile,
                                           '',
                                           roseParentFolder, [namesList[0]],
                                           extraMap,
                                           mergedGFFFile,
                                           0,
                                           0,
                                           bashFileName,
                                           mask=maskFile)

    # run the generated script synchronously, then wait for the region map
    bashCommand = "bash %s" % (bashFileName)
    os.system(bashCommand)
    print "Running enhancer mapping command:\n%s" % (bashCommand)

    if utils.checkOutput(mergedRegionMap, 5, 60):
        return mergedRegionMap
    else:
        print "UNABLE TO CALL ROSE ENHANCER MAPPING ON CONSENSUS ENHANCER FILE %s.\nEXITING NOW" % (
            mergedGFFFile)
        sys.exit()
Пример #37
0
def finishRankOutput(dataFile, rankOutput, genome, mergeFolder, mergeName, name1, name2, cutOff=1.5, window=100000):

    """
    cleans up the rank output table
    makes a gff of all of the gained/lost supers beyond
    a certain cutoff w/ a window
    makes a list of gained genes and lost genes
    makes a bed of gained loss

    rankOutput: path to the rank table produced upstream
    cutOff: absolute rank-delta threshold for calling gained/lost
    window: bp added on each side for the "window" gffs and plots
    Also copies the delta/rank plots and launches bamPlot_turbo for the
    gained and lost regions.
    """
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    # making sure window and cutoff are int/float
    cutOff = float(cutOff)
    window = int(window)
    genome = string.upper(genome)

    # make the output folder
    outputFolder = pipeline_dfci.formatFolder(mergeFolder + "output/", True)

    # bring in the old rank table
    rankEnhancerTable = utils.parseTable(rankOutput, "\t")

    # make a new formatted table
    header = rankEnhancerTable[0]
    header[-4] = "DELTA RANK"
    header[-3] = "IS_SUPER"
    formattedRankTable = [header]

    # the gffs
    gainedGFF = []
    lostGFF = []

    gainedWindowGFF = []
    lostWindowGFF = []

    # the beds
    gainedTrackHeader = (
        'track name="%s %s only SEs" description="%s super enhancers that are found only in %s vs %s" itemRGB=On color=255,0,0'
        % (genome, name2, genome, name2, name1)
    )
    gainedBed = [[gainedTrackHeader]]
    conservedTrackHeader = (
        'track name="%s %s and %s SEs" description="%s super enhancers that are found in both %s vs %s" itemRGB=On color=0,0,0'
        % (genome, name1, name2, genome, name1, name2)
    )
    conservedBed = [[conservedTrackHeader]]

    lostTrackHeader = (
        'track name="%s %s only SEs" description="%s super enhancers that are found only in %s vs %s" itemRGB=On color=0,255,0'
        % (genome, name1, genome, name1, name2)
    )
    lostBed = [[lostTrackHeader]]

    # the genes
    geneTable = [
        [
            "GENE",
            "ENHANCER_ID",
            "ENHANCER_CHROM",
            "ENHANCER_START",
            "ENHANCER_STOP",
            header[6],
            header[7],
            header[8],
            "STATUS",
        ]
    ]

    # classify every enhancer row as gained / lost / conserved by its delta
    for line in rankEnhancerTable[1:]:
        # fixing the enhancer ID
        line[0] = line[0].replace("_lociStitched", "")
        formattedRankTable.append(line)

        # getting the genes
        # columns 9-11 hold comma-joined gene lists; merge and de-dup them
        geneList = []
        geneList += line[9].split(",")
        geneList += line[10].split(",")
        geneList += line[11].split(",")
        geneList = [x for x in geneList if len(x) > 0]
        geneList = utils.uniquify(geneList)
        geneString = string.join(geneList, ",")

        bedLine = [line[1], line[2], line[3], line[0], line[-4]]

        # for gained
        if float(line[6]) > cutOff:
            gffLine = [line[1], line[0], "", line[2], line[3], "", ".", "", geneString]
            gffWindowLine = [
                line[1],
                line[0],
                "",
                int(line[2]) - window,
                int(line[3]) + window,
                "",
                ".",
                "",
                geneString,
            ]
            gainedGFF.append(gffLine)
            gainedWindowGFF.append(gffWindowLine)
            geneStatus = name2
            gainedBed.append(bedLine)
        # for lost
        elif float(line[6]) < (-1 * cutOff):
            gffLine = [line[1], line[0], "", line[2], line[3], "", ".", "", geneString]
            gffWindowLine = [
                line[1],
                line[0],
                "",
                int(line[2]) - window,
                int(line[3]) + window,
                "",
                ".",
                "",
                geneString,
            ]
            lostGFF.append(gffLine)
            lostWindowGFF.append(gffWindowLine)
            geneStatus = name1
            lostBed.append(bedLine)
        # for conserved
        else:
            geneStatus = "CONSERVED"
            conservedBed.append(bedLine)

        # now fill in the gene Table
        for gene in geneList:
            geneTableLine = [gene, line[0], line[1], line[2], line[3], line[6], line[7], line[8], geneStatus]
            geneTable.append(geneTableLine)

    # concat the bed
    fullBed = gainedBed + conservedBed + lostBed

    # start writing the output
    # there's the two gffs, the bed,the formatted table, the gene table

    # formatted table
    formattedFilename = "%s%s_%s_MERGED_SUPERS_RANK_TABLE.txt" % (outputFolder, genome, mergeName)
    utils.unParseTable(formattedRankTable, formattedFilename, "\t")

    # gffs
    gffFolder = pipeline_dfci.formatFolder(outputFolder + "gff/", True)
    gffFilename_gained = "%s%s_%s_%s_ONLY_SUPERS_-0_+0.gff" % (gffFolder, genome, mergeName, string.upper(name2))
    gffFilenameWindow_gained = "%s%s_%s_%s_ONLY_SUPERS_-%sKB_+%sKB.gff" % (
        gffFolder,
        genome,
        mergeName,
        string.upper(name2),
        window / 1000,
        window / 1000,
    )

    gffFilename_lost = "%s%s_%s_%s_ONLY_SUPERS_-0_+0.gff" % (gffFolder, genome, mergeName, string.upper(name1))
    gffFilenameWindow_lost = "%s%s_%s_%s_ONLY_SUPERS_-%sKB_+%sKB.gff" % (
        gffFolder,
        genome,
        mergeName,
        string.upper(name1),
        window / 1000,
        window / 1000,
    )

    utils.unParseTable(gainedGFF, gffFilename_gained, "\t")
    utils.unParseTable(gainedWindowGFF, gffFilenameWindow_gained, "\t")

    utils.unParseTable(lostGFF, gffFilename_lost, "\t")
    utils.unParseTable(lostWindowGFF, gffFilenameWindow_lost, "\t")

    # bed
    bedFilename = "%s%s_%s_MERGED_SUPERS.bed" % (outputFolder, genome, mergeName)
    utils.unParseTable(fullBed, bedFilename, "\t")

    # geneTable
    geneFilename = "%s%s_%s_MERGED_SUPERS_GENE_TABLE.txt" % (outputFolder, genome, mergeName)
    utils.unParseTable(geneTable, geneFilename, "\t")

    # finally, move all of the plots to the output folder
    cmd = "cp %s%s_ROSE/*.pdf %s%s_%s_MERGED_SUPERS_DELTA.pdf" % (mergeFolder, name1, outputFolder, genome, mergeName)
    os.system(cmd)

    cmd = "cp %s%s_ROSE/*RANK_PLOT.png %s%s_%s_MERGED_SUPERS_RANK_PLOT.png" % (
        mergeFolder,
        name1,
        outputFolder,
        genome,
        mergeName,
    )
    os.system(cmd)

    # now execute the bamPlot_turbo.py commands
    bam1 = dataDict[name1]["bam"]
    bam2 = dataDict[name2]["bam"]
    bamString = "%s,%s" % (bam1, bam2)
    nameString = "%s,%s" % (name1, name2)
    colorString = "0,0,0:100,100,100"

    # change dir
    # NOTE(review): hard-coded pipeline install path; breaks on other hosts
    os.chdir("/ark/home/cl512/pipeline/")

    if len(gainedGFF) > 0:
        # gained command
        plotTitle = "%s_ONLY_SE" % (name2)
        cmd = "python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE" % (
            genome,
            bamString,
            gffFilename_gained,
            outputFolder,
            nameString,
            colorString,
            plotTitle,
        )
        os.system(cmd)

        # gained window command
        plotTitle = "%s_ONLY_SE_%sKB_WINDOW" % (name2, window / 1000)
        cmd = "python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE" % (
            genome,
            bamString,
            gffFilenameWindow_gained,
            outputFolder,
            nameString,
            colorString,
            plotTitle,
        )
        os.system(cmd)

    if len(lostGFF) > 0:
        # lost command
        plotTitle = "%s_ONLY_SE" % (name1)
        cmd = "python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE" % (
            genome,
            bamString,
            gffFilename_lost,
            outputFolder,
            nameString,
            colorString,
            plotTitle,
        )
        os.system(cmd)

        # lost command
        plotTitle = "%s_ONLY_SE_%sKB_WINDOW" % (name1, window / 1000)
        cmd = "python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE" % (
            genome,
            bamString,
            gffFilenameWindow_lost,
            outputFolder,
            nameString,
            colorString,
            plotTitle,
        )
        os.system(cmd)

    return
Пример #38
0
import pipeline_dfci
import os
import time
import string

# ================================================================================
# ============================GLOBAL PARAMETERS===================================
# ================================================================================

# add locations of files and global parameters in this section


# data table of all datasets in the SE comparison and the reference genome build
dataFile = "/home/clin/projects/131106_seComp/SE_TABLE_FORMATTED.txt"
genome = "hg18"

# data dict loaded once at import time; shared by the functions below
dataDict = pipeline_dfci.loadDataTable(dataFile)

# ================================================================================
# ===================================CLASSES======================================
# ================================================================================

# user defined classes here

# ================================================================================
# =================================FUNCTIONS======================================
# ================================================================================

# write your specific functions here


def makeSECollection(enhancerFile, name, top=0):
Пример #39
0
def main():

    '''
    command line entry point for the cross-sample enhancer clustering pipeline.

    workflow: parse options -> locate or launch ROSE output for each dataset ->
    compute per-sample median enhancer signal -> merge enhancer regions across
    samples into consensus regions -> map read signal to the merged regions ->
    build a median-corrected signal table -> cluster samples via an R script ->
    map clusters to genes with ROSE2_geneMapper.

    Python 2 script: uses print statements, optparse, and dict.keys() indexing.
    Relies on module-level helpers (makeNameDict, launchEnhancerMapping,
    makeMedianDict, mergeCollections, mapMergedGFF, makeEnhancerSignalTable,
    callRScript) plus the pipeline_dfci/utils project modules.
    '''

    from optparse import OptionParser

    usage = "usage: %prog [options] -d [DATA_FILE] -i [INPUT_LIST] -r [ROSE_FOLDER] -o [OUTPUTFOLDER]"
    parser = OptionParser(usage = usage)
    #required flags
    parser.add_option("-d","--data", dest="data",nargs = 1, default=None,
                      help = "Enter a data file for datasets to be processed")
    parser.add_option("-o","--output",dest="output",nargs =1, default = None,
                      help = "specify an output folder to write results to")

    #additional options
    parser.add_option("-i","--input", dest="input",nargs = 1, default=None,
                      help = "Enter a comma separated list of names to analyze. Default will be all datasets")

    parser.add_option("-n","--name", dest="name",nargs=1,default=None,
                      help = "Enter a name for the analysis")

    parser.add_option("-r","--rose", dest="rose",nargs = 1, default=None,
                      help = "Enter a folder to detect or write rose output")

    parser.add_option("-a","--all", dest="all",action = 'store_true', default=False,
                      help = "flag to run analysis on ALL enhancers (this is much slower)")
    parser.add_option("-s","--stitch", dest="stitch",nargs = 1, default='',
                      help = "specify a fixed stitch distance for all datasets, otherwise will compute stitching automatically on each dataset")
    parser.add_option("-e","--enhancer-type", dest="enhancer_type",nargs = 1,default='super',
                      help = "specify type of enhancer to analyze: super, stretch, superStretch")

    parser.add_option("-t","--tss", dest="tss",nargs = 1, default=2500,
                      help = "specify a tss exclusion window. default is 2500bp")

    parser.add_option("--mask",dest="mask",nargs=1,default=None,
                      help = 'Create a mask set of regions to filter out of analysis. must be .bed or .gff format')


    (options,args) = parser.parse_args()

    print(options)
    print(args)

    if options.data and options.output:

        #check to see if minimum arguments are met

        #pull in arguments

        #pull in the datafile and create a datadict
        dataFile = options.data

        #now the output folder
        outputFolder = utils.formatFolder(options.output,True) #check and create the output folder
        #now the rose folder
        if options.rose:
            roseFolder = options.rose
        else:
            roseFolder = "%srose/" % (outputFolder)

        if options.input:
            namesList = options.input.split(',')
        else:
            namesList = []

        #get the genome
        dataDict = pipeline_dfci.loadDataTable(dataFile)
        #NOTE(review): assumes every dataset in the table shares one genome
        #build -- only the first (arbitrary) key is inspected
        genome = dataDict[dataDict.keys()[0]]['genome']

        #check if using only supers
        if options.all:
            superOnly = False
        else:
            superOnly = True

        #get the anlysis name
        if options.name:
            analysisName = options.name
        else:
            analysisName = "enhancers"

        #check for a stitching parameter
        #default stitch is '' (empty), so auto-stitching is used unless -s is given
        if len(str(options.stitch)) > 0:
            stitch = str(options.stitch)
        else:
            stitch = ''

        #check for the tss parameter
        tssDistance = int(options.tss)

        #check enhancer type
        enhancerType = string.lower(options.enhancer_type)
        if ['super','superstretch','stretch'].count(enhancerType) == 0:
            print("ERROR: unsupported enhancer type %s" % (enhancerType))
            sys.exit()


        #see if there's a mask
        if options.mask:
            maskFile = options.mask
        else:
            maskFile = ''

        #=====================================================
        #=================SUMMARIZE INPUTS====================
        #=====================================================

        print "WORKING IN GENOME %s" % (genome)
        print "DRAWING DATA FROM %s AND ROSE FOLDER %s" % (dataFile,roseFolder)
        print "USING %s AS THE OUTPUT FOLDER" % (outputFolder)

        #=====================================================
        #==============ESTABLISH ALL WORKING FILES============
        #=====================================================

        print "\n\n\nESTABLISHING WORKING FILES"
        nameDict = makeNameDict(dataFile,roseFolder,namesList,enhancerType)


        print nameDict

        print "STARTING ANALYSIS ON THE FOLLOWING DATASETS:"
        print nameDict.keys()

        #warn (but do not exit) about datasets missing ROSE output; mapping is
        #launched for them in the next step
        for name in nameDict.keys():
            if len(nameDict[name]['enhancerFile']) == 0:
                print("NO ROSE OUTPUT FOR %s" % (name))

        #sys.exit()
        #=====================================================
        #==============LAUNCH ENHANCER MAPPING================
        #=====================================================

        print "\n\n\nLAUNCHING ENHANCER MAPPING (IF NECESSARY)"
        nameDict = launchEnhancerMapping(dataFile,nameDict,outputFolder,roseFolder,stitch,tssDistance,enhancerType,maskFile)
        print nameDict

        #sys.exit()

        #=====================================================
        #====================GET MEDIAN SIGNAL================
        #=====================================================

        print "\n\n\nGETTING MEDIAN ENHANCER SIGNAL FROM EACH SAMPLE"
        medianDict = makeMedianDict(nameDict)

        print medianDict
        #sys.exit()
        #=====================================================
        #====================MERGING ENHANCERS================
        #=====================================================

        print "\n\n\nIDENTIFYING CONSENSUS ENHANCER REGIONS"

        mergedGFFFile = "%s%s_%s_-0_+0.gff" % (outputFolder,genome,analysisName)
        mergedGFFFile = mergeCollections(nameDict,analysisName,mergedGFFFile,superOnly)

        #sys.exit()

        #=====================================================
        #===============MAP TO MERGED REGIONS=================
        #=====================================================

        print "\n\n\nMAPPING DATA TO CONSENSUS ENHANCER REGIONS"
        mergedRegionMap = mapMergedGFF(dataFile,nameDict,mergedGFFFile,analysisName,outputFolder,maskFile)

        #=====================================================
        #==============CORRECT FOR MEDIAN SIGNAL==============
        #=====================================================

        print "\n\n\nCREATING ENHANCER SIGNAL TABLE"
        signalTableFile = makeEnhancerSignalTable(nameDict,mergedRegionMap,medianDict,analysisName,genome,outputFolder)

        #=====================================================
        #===============CALL CLUSTERING R SCRIPT==============
        #=====================================================

        print "\n\n\nGENERATING CLUSTERING OUTPUT"
        clusterTableFile = callRScript(genome,outputFolder,analysisName,signalTableFile)
        #output should be
        #png of cluster gram with rows as genes
        #png of cluster gram of samples w/ tree
        #ordered table w/ cluster assignment
        #similarity matrix for samples
        #sys.exit()
        #=====================================================
        #=============GENE MAPPING BY CLUSTER=================
        #=====================================================

        #NOTE(review): hard-coded pipeline path; the chdir is presumably
        #required by ROSE2_geneMapper.py to find its companion files -- confirm
        os.chdir('/ark/home/cl512/pipeline/')
        cmd = 'python /ark/home/cl512/pipeline/ROSE2_geneMapper.py -g %s -i %s' % (genome,clusterTableFile)
        os.system(cmd)

        print "FINISHED"


    else:
        #minimum arguments (-d and -o) not supplied
        parser.print_help()
        sys.exit()
Пример #40
0
def finishRankOutput(dataFile,statOutput,diffOutput,genome,mergeFolder,mergeName,name1,name2,namesList1,namesList2,cutOff=1.0,window = 100000,superOnly=True,plotBam=True):

    '''
    cleans up the rank output table
    makes a gff of all of the gained/lost supers beyond
    a certain cutoff w/ a window
    makes a list of gained genes and lost genes
    makes a bed of gained loss

    dataFile    -- pipeline_dfci data table (used for bam paths and track colors)
    statOutput  -- path to the rank/statistics table produced upstream (R script)
    diffOutput  -- path to the differential table produced upstream
    genome      -- genome build string (upper-cased below)
    mergeFolder -- working folder of the merge run; an output/ subfolder is created
    mergeName   -- name of the merged analysis, used in output filenames
    name1/name2 -- display names of the two groups (name1 = lost, name2 = gained)
    namesList1/namesList2 -- dataset names in each group
    cutOff      -- absolute stat cutoff for calling gained/lost (default 1.0)
    window      -- bp padding for the windowed gffs (default 100kb)
    superOnly   -- labels outputs SUPERS vs ENHANCERS
    plotBam     -- if True, shells out to bamPlot_turbo.py on gained/lost regions

    Python 2 code (print statements, string module). Writes gff/bed/table files
    and copies plots; returns None.
    '''
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    #making sure window and cutoff are int/float
    cutOff = float(cutOff)
    window = int(window)
    genome = string.upper(genome)

    #make the output folder
    outputFolder =pipeline_dfci.formatFolder(mergeFolder+'output/',True)

    #bring in the old rank table
    rankEnhancerTable = utils.parseTable(statOutput,'\t')

    #make a new formatted table
    header = rankEnhancerTable[0]
    formattedRankTable =[header]

    #the gffs
    gainedGFF = []
    lostGFF = []

    gainedWindowGFF = []
    lostWindowGFF = []

    if superOnly:
        enhancerType = 'SUPERS'
    else:
        enhancerType = 'ENHANCERS'

    #the beds
    if superOnly:
        gainedTrackHeader = 'track name="%s %s only SEs" description="%s super enhancers that are found only in %s vs %s" itemRGB=On color=255,0,0' % (genome,name2,genome,name2,name1)
        gainedBed = [[gainedTrackHeader]]
        conservedTrackHeader = 'track name="%s %s and %s SEs" description="%s super enhancers that are found in both %s vs %s" itemRGB=On color=0,0,0' % (genome,name1,name2,genome,name1,name2)
        conservedBed = [[conservedTrackHeader]]

        lostTrackHeader = 'track name="%s %s only SEs" description="%s super enhancers that are found only in %s vs %s" itemRGB=On color=0,255,0' % (genome,name1,genome,name1,name2)
        lostBed = [[lostTrackHeader]]
    else:
        gainedTrackHeader = 'track name="%s %s only enhancers" description="%s enhancers that are found only in %s vs %s" itemRGB=On color=255,0,0' % (genome,name2,genome,name2,name1)
        gainedBed = [[gainedTrackHeader]]
        conservedTrackHeader = 'track name="%s %s and %s enhancers" description="%s enhancers that are found in both %s vs %s" itemRGB=On color=0,0,0' % (genome,name1,name2,genome,name1,name2)
        conservedBed = [[conservedTrackHeader]]

        lostTrackHeader = 'track name="%s %s only enhancers" description="%s enhancers that are found only in %s vs %s" itemRGB=On color=0,255,0' % (genome,name1,genome,name1,name2)
        lostBed = [[lostTrackHeader]]



    #the genes
    geneTable =[['GENE','ENHANCER_ID','ENHANCER_CHROM','ENHANCER_START','ENHANCER_STOP',header[6],header[7],header[8],'STATUS']]
    headerLength = len(rankEnhancerTable[0])
    #positional column conventions: line[0]=enhancer ID, line[1..3]=chrom/start/stop,
    #line[-1..-3]=gene name columns, line[-4]=0/1 significance flag from the R
    #script, line[-8]=signed stat (fold change) -- NOTE(review): the negative
    #indices are assumptions inferred from usage below; confirm against the
    #upstream R output format
    for line in rankEnhancerTable[1:]:
        #fix line lengths
        if len(line) != headerLength:
            line += ['']*(headerLength-len(line))

        #fixing the enhancer ID
        line[0] = line[0].replace('_lociStitched','')
        formattedRankTable.append(line)

        #getting the genes
        geneList = []
        geneList += line[-1].split(',')
        geneList += line[-2].split(',')
        geneList += line[-3].split(',')
        geneList = [x for x in geneList if len(x) >0]
        geneList = utils.uniquify(geneList)
        geneString = string.join(geneList,',')

        bedLine = [line[1],line[2],line[3],line[0],line[-4]]

        #for gained
        #this applies both the statistical test chosen (default fdr <= 0.05) and the cutoff
        #the cutoff is hard wired, but we can add an option to change the test
        #stats are done in the R script. FDR norm can kinda suck if no genes are considered diff
        #print(line)

        if float(line[-8]) > cutOff and int(line[-4]) == 1:

            gffLine = [line[1],line[0],'',line[2],line[3],'','.','',geneString]
            gffWindowLine = [line[1],line[0],'',int(line[2])-window,int(line[3])+window,'','.','',geneString]
            gainedGFF.append(gffLine)
            gainedWindowGFF.append(gffWindowLine)
            geneStatus = name2
            gainedBed.append(bedLine)
        #for lost
        elif float(line[-8]) < (-1 * cutOff) and int(line[-4]) == 1:
            gffLine = [line[1],line[0],'',line[2],line[3],'','.','',geneString]
            gffWindowLine = [line[1],line[0],'',int(line[2])-window,int(line[3])+window,'','.','',geneString]
            lostGFF.append(gffLine)
            lostWindowGFF.append(gffWindowLine)
            geneStatus = name1
            lostBed.append(bedLine)
        #for conserved
        else:
            geneStatus = 'UNCHANGED'
            conservedBed.append(bedLine)

        #now fill in the gene Table
        for gene in geneList:
            geneTableLine = [gene,line[0],line[1],line[2],line[3],line[6],line[7],line[8],geneStatus]
            geneTable.append(geneTableLine)

    #concat the bed
    fullBed = gainedBed + conservedBed + lostBed

    #start writing the output
    #there's the two gffs, the bed,the formatted table, the gene table


    #formatted table
    formattedFilename = "%s%s_%s_MERGED_%s_RANK_TABLE.txt" % (outputFolder,genome,mergeName,enhancerType)
    utils.unParseTable(formattedRankTable,formattedFilename,'\t')

    #formatted diff table
    #possible that no genes are differential
    rankEnhancerDiffTable = utils.parseTable(diffOutput,'\t')


    #make a new formatted table
    header = rankEnhancerDiffTable[0]
    formattedRankDiffTable =[header]

    for line in rankEnhancerDiffTable[1:]:
        #fixing the enhancer ID
        line[0] = line[0].replace('_lociStitched','')
        formattedRankDiffTable.append(line)


    formattedDiffFilename = "%s%s_%s_MERGED_%s_RANK_DIFF_TABLE.txt" % (outputFolder,genome,mergeName,enhancerType)
    utils.unParseTable(formattedRankDiffTable,formattedDiffFilename,'\t')



    #gffs
    gffFolder = pipeline_dfci.formatFolder(outputFolder+'gff/',True)
    gffFilename_gained = "%s%s_%s_%s_ONLY_%s_-0_+0.gff" % (gffFolder,genome,mergeName,string.upper(name2),enhancerType)
    gffFilenameWindow_gained = "%s%s_%s_%s_ONLY_%s_-%sKB_+%sKB.gff" % (gffFolder,genome,mergeName,string.upper(name2),enhancerType,window/1000,window/1000)

    gffFilename_lost = "%s%s_%s_%s_ONLY_%s_-0_+0.gff" % (gffFolder,genome,mergeName,string.upper(name1),enhancerType)
    gffFilenameWindow_lost = "%s%s_%s_%s_ONLY_%s_-%sKB_+%sKB.gff" % (gffFolder,genome,mergeName,string.upper(name1),enhancerType,window/1000,window/1000)

    utils.unParseTable(gainedGFF,gffFilename_gained,'\t')
    utils.unParseTable(gainedWindowGFF,gffFilenameWindow_gained,'\t')

    utils.unParseTable(lostGFF,gffFilename_lost,'\t')
    utils.unParseTable(lostWindowGFF,gffFilenameWindow_lost,'\t')

    #bed
    bedFilename = "%s%s_%s_MERGED_%s.bed" % (outputFolder,genome,mergeName,enhancerType)
    utils.unParseTable(fullBed,bedFilename,'\t')

    #geneTable
    geneFilename = "%s%s_%s_MERGED_%s_GENE_TABLE.txt" % (outputFolder,genome,mergeName,enhancerType)
    utils.unParseTable(geneTable,geneFilename,'\t')

    #finally, move all of the plots to the output folder
    cmd = "cp %s%s_ROSE/*DELTA*.pdf %s%s_%s_MERGED_%s_DELTA.pdf" % (mergeFolder,namesList1[0],outputFolder,genome,mergeName,enhancerType)
    os.system(cmd)

    cmd = "cp %s%s_ROSE/*REGION_GAINED*.pdf %s%s_%s_MERGED_%s_REGION_GAINED.pdf" % (mergeFolder,namesList1[0],outputFolder,genome,mergeName,enhancerType)
    os.system(cmd)

    cmd = "cp %s%s_ROSE/*REGION_LOST*.pdf %s%s_%s_MERGED_%s_REGION_LOST.pdf" % (mergeFolder,namesList1[0],outputFolder,genome,mergeName,enhancerType)
    os.system(cmd)

    #NOTE(review): this copies the *REGION_LOST* pdf under the UNCHANGED output
    #name -- looks like a copy/paste bug (probably meant *REGION_UNCHANGED* or
    #similar); left as-is pending confirmation of the ROSE output filename
    cmd = "cp %s%s_ROSE/*REGION_LOST*.pdf %s%s_%s_MERGED_%s_REGION_UNCHANGED.pdf" % (mergeFolder,namesList1[0],outputFolder,genome,mergeName,enhancerType)
    os.system(cmd)


    cmd = "cp %s%s_ROSE/*RANK_PLOT.png %s%s_%s_MERGED_%s_RANK_PLOT.png" % (mergeFolder,namesList1[0],outputFolder,genome,mergeName,enhancerType)
    os.system(cmd)

    #now execute the bamPlot_turbo.py commands
    if plotBam:


        bamList1 = [dataDict[name]['bam'] for name in namesList1]
        bamList2 = [dataDict[name]['bam'] for name in namesList2]
        bamList = bamList1 + bamList2
        bamString = string.join(bamList,',')

        nameList = [name1]*len(namesList1) + [name2]*len(namesList2)
        nameString = string.join(nameList,',')
        print(namesList1[0])
        print(namesList2[0])

        print(namesList1)
        print(namesList2)
        print(dataDict[namesList1[0]]['color'])
        #use the data-table colors if the two groups differ; otherwise fall
        #back to black vs grey so the groups remain distinguishable
        if dataDict[namesList1[0]]['color'] != dataDict[namesList2[0]]['color']:
            colorList = [dataDict[namesList1[0]]['color']]*len(namesList1) + [dataDict[namesList2[0]]['color']]*len(namesList2)
        else:
            colorList = ['0,0,0']*len(namesList1) + ['100,100,100']*len(namesList2)
        colorString = string.join(colorList,':')

        #change dir


        if len(gainedGFF) > 0:
            #gained command
            plotTitle = "%s_ONLY_SE" % (name2)
            cmd = 'python %sbamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MERGE' % (pipelineDir,genome,bamString,gffFilename_gained,outputFolder,nameString,colorString,plotTitle)
            os.system(cmd)

            #gained window command
            plotTitle = "%s_ONLY_SE_%sKB_WINDOW" % (name2,window/1000)
            cmd = 'python %sbamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MERGE' % (pipelineDir,genome,bamString,gffFilenameWindow_gained,outputFolder,nameString,colorString,plotTitle)
            os.system(cmd)

        if len(lostGFF) > 0:
            #lost command
            plotTitle = "%s_ONLY_SE" % (name1)
            cmd = 'python %sbamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MERGE' % (pipelineDir,genome,bamString,gffFilename_lost,outputFolder,nameString,colorString,plotTitle)
            os.system(cmd)

            #lost command
            plotTitle = "%s_ONLY_SE_%sKB_WINDOW" % (name1,window/1000)
            cmd = 'python %sbamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MERGE' % (pipelineDir,genome,bamString,gffFilenameWindow_lost,outputFolder,nameString,colorString,plotTitle)
            os.system(cmd)


    return
Пример #41
0
def main():

    '''
    command line entry point for the dynamic enhancer / CRC comparison pipeline.

    workflow:
        I.   parse arguments and summarize inputs
        II.  run META_ROSE on each group, then DYNAMIC_ROSE on the pair, to map
             and compare cis-regulatory elements
        III. run core regulatory circuitry (CRC) analysis on the combined set
             and on each group individually

    Relies on module-level helpers (loadGenome, launchMetaRose,
    launchDynamicRose, launchCRC) and the pipeline_dfci/utils project modules.
    The --log flag is parsed but not used in this function.
    '''

    parser = argparse.ArgumentParser(usage='%(prog)s -i DATAFILE -1 GROUP1_NAMES -2 GROUP2_NAMES')

    # required flags
    parser.add_argument("-d", "--data_table", dest="data_table", type=str,
                      help="input a data table with all datasets to be analyzed", required=True)
    parser.add_argument("-1", "--group1", dest="group1", type=str,
                      help="input a comma separated list of all datasets in group1", required=True)
    parser.add_argument("-2", "--group2", dest="group2", type=str,
                      help="input a comma separated list of all datasets in group2", required=True)


    #optional input override
    parser.add_argument("-i", "--input", dest="input", type=str,
                        help="input a gff of regions to analyze", required=False)


    #optional arguments
    parser.add_argument("-n", "--name", dest="name", type=str,
                      help="specify a name for the analysis. Default is drawn from the data table name", required=False)

    parser.add_argument("--group1-name", dest="group1_name", default='GROUP1',type=str,
                      help="Enter a name for group1.  Default is 'GROUP1'", required=False)
    parser.add_argument("--group2-name", dest="group2_name", default='GROUP2',type=str,
                      help="Enter a name for group2.  Default is 'GROUP2'", required=False)

    parser.add_argument("-a", "--activity", dest="activity", type=str,default='',
                      help="a table with active gene names in the first column", required=False)
    parser.add_argument("-t", "--tss", dest="tss", type=int,default=2500,
                      help="Specify a TSS exclusion distance. Default is 2500", required=False)
    parser.add_argument("-s", "--stitch", dest="stitch", type=int,default=None,
                      help="Specify a stitching distance. Default is auto stitching", required=False)

    parser.add_argument("-o", "--output", dest="output", default='./',type=str,
                      help="Enter the output folder. Default is the current working directory", required=False)

    parser.add_argument("--log", dest="log", default='',type=str,
                      help="Enter a path to log output", required=False)

    args = parser.parse_args()


    #=====================================================================================
    #===============================I. PARSING ARGUMENTS==================================
    #=====================================================================================

    #pulling in the data table
    data_file = os.path.abspath(args.data_table)
    dataDict = pipeline_dfci.loadDataTable(data_file)

    #setting naming conventions: default analysis name is the data table basename
    if not args.name:
        analysis_name = data_file.split('/')[-1].split('.')[0]
    else:
        analysis_name = args.name

    #getting the optional input gff (overrides the meta-rose-derived regions)
    if args.input:
        inputGFF = args.input
    else:
        inputGFF = ''

    #getting group names
    group1_name = args.group1_name
    group2_name = args.group2_name

    #getting group membership; empty tokens from stray commas are dropped
    group1_string = args.group1
    group1_list = [name for name in group1_string.split(',') if len(name) > 0]

    group2_string = args.group2
    group2_list = [name for name in group2_string.split(',') if len(name) > 0]

    #checking that all datasets are in the data table
    for name in group1_list + group2_list:
        if name not in dataDict:
            print('ERROR: DATASET %s NOT FOUND IN DATA TABLE %s. EXITING NOW' % (name,data_file))
            sys.exit()

    #loading in the genome object from the data table; all datasets must share
    #a single genome build
    genome_list = utils.uniquify([dataDict[name]['genome'] for name in group1_list + group2_list])
    if len(genome_list) > 1:
        print('ERROR: ATTEMPTING TO ANALYZE DATASETS FROM MULTIPLE GENOME BUILDS. EXITING NOW.')
        sys.exit()

    #the load genome function has an assertion test to make sure the genome is supported
    genome = loadGenome(genome_list[0])

    parent_folder = utils.formatFolder(args.output,True)
    output_folder = utils.formatFolder(parent_folder + analysis_name,True)

    #these are the user defined optional arguments
    tss = int(args.tss)

    stitch = args.stitch
    print('stitch')
    print(stitch)

    #list of active genes to constrain analysis
    #BUGFIX: activity_path/activity_table must be bound on BOTH paths --
    #previously they were only assigned when --activity was supplied, so the
    #summary block below raised a NameError on the (default) empty path
    if len(args.activity) == 0:
        #assumes all genes are active unless told otherwise
        #activity_path,activity_table = getActivity() # fix this function
        print('using all active genes')
        activity_path = ''
        activity_table = []
    else:
        activity_path = args.activity
        activity_table = utils.parseTable(activity_path,'\t')


    print('\n\n#======================================\n#===========I. DATA SUMMARY============\n#======================================\n')

    print('Analyzing datasets described in %s\n' % (data_file))

    print('Name for the analysis: %s\n' % (analysis_name))
    print('Using genome: %s\n' % (genome.name()))

    print('%s datasets: %s\n' % (group1_name,group1_string))
    print('%s datasets: %s\n' % (group2_name,group2_string))

    if len(activity_path) > 0:
        print('Identified %s active genes in the analysis using %s as a list of active genes' % (len(activity_table),activity_path))
    else:
        print('Identified %s active genes in the analysis using aggregate data from %s and %s' % (len(activity_table),group1_name,group2_name))
    print('Writing output to: %s\n' % (output_folder))


    #=====================================================================================
    #======================II. DEFINING CIS-REGULATORY ELEMENTS===========================
    #=====================================================================================


    print('\n\n#======================================\n#=II. MAPPING CIS-REGULATORY ELEMENTS==\n#======================================\n')

    #crc_wrapper will act at the group level and not consider individual datasets
    #since a data table is used as the input, the code will rely heavily on pipeline_dfci
    #embedded tools

    #1. first we need to run meta rose using default parameters and check the output
    #exists for each group

    meta_rose_folder = utils.formatFolder(output_folder + 'meta_rose/',True)

    group1_output = '%s%s/%s_AllEnhancers.table.txt' % (meta_rose_folder,group1_name,group1_name)
    group2_output = '%s%s/%s_AllEnhancers.table.txt' % (meta_rose_folder,group2_name,group2_name)

    #for each output check to see if they exist; if not launch
    #(existence probe via open; handles are closed to avoid leaking descriptors)
    try:
        foo = open(group1_output,'r')
        foo.close()
    except IOError:
        print('No META_ROSE output found for %s. Running META_ROSE now' % (group1_name))
        launchMetaRose(group1_name,group1_list,meta_rose_folder,genome,data_file,stitch,tss)

    try:
        foo = open(group2_output,'r')
        foo.close()
    except IOError:
        print('No META_ROSE output found for %s. Running META_ROSE now' % (group2_name))
        launchMetaRose(group2_name,group2_list,meta_rose_folder,genome,data_file,stitch,tss)


    #now check for completion
    if utils.checkOutput(group1_output,1,10):
        print('META_ROSE finished for %s' % (group1_name))
    else:
        print('META_ROSE timed out for %s. EXITING NOW.' % (group1_name))
        sys.exit()

    if utils.checkOutput(group2_output,1,10):
        print('META_ROSE finished for %s' % (group2_name))
    else:
        print('META_ROSE timed out for %s. EXITING NOW.' % (group2_name))
        sys.exit()


    #Meta rose does not give all regions that are SE in at least one sample
    #and can be blown out by amplicons etc...
    #sooo we need to run clustering to generate a good input gff
    #ideally we just rewrite dynamic meta to run off of clustering output
    #until we do that let's just overwrite w/ an input gff

    print('Comparing cis-regulatory landscapes of %s and %s' % (group1_name,group2_name))
    dynamic_rose_folder = utils.formatFolder(output_folder + 'dynamic_meta_rose/',True)

    #here we will use the rank table as the primary output
    dynamic_rose_output = '%soutput/%s_%s_%s_merged_MERGED_SUPERS_RANK_TABLE.txt' % (dynamic_rose_folder,genome.name(),group1_name,group2_name)

    try:
        foo = open(dynamic_rose_output,'r')
        foo.close()
    except IOError:
        print('No DYNAMIC_ROSE output found for %s. Running DYNAMIC_ROSE now' % (analysis_name))
        launchDynamicRose(analysis_name,group1_name,group2_name,group1_list,group2_list,meta_rose_folder,dynamic_rose_folder,genome,data_file,activity_path,inputGFF)

    if utils.checkOutput(dynamic_rose_output,1,10):
        print('DYNAMIC_ROSE finsihed for %s' % (analysis_name))
    else:
        print('DYNAMIC_ROSE analysis timed out for %s. EXITING NOW.' % (analysis_name))
        sys.exit()


    #=====================================================================================
    #======================III. IDENTIFYING TF NODES IN NETWORK===========================
    #=====================================================================================


    print('\n\n#======================================\n#===III. RUNNING CIRCUITRY ANALYSIS====\n#======================================\n')

    #now we want to call circuitry on each group... ok to have different subpeaks and motif calls
    #if as a first approximation we weight by the overall enhancer

    crc_folder = utils.formatFolder('%scrc/' % (output_folder),True)

    #for all
    all_crc_folder = utils.formatFolder('%s%s' % (crc_folder,analysis_name),True)
    launchCRC(data_file,genome,dynamic_rose_output,analysis_name,group1_list+group2_list,all_crc_folder,activity_path)

    #for group1
    group1_crc_folder = utils.formatFolder('%s%s' % (crc_folder,group1_name),True)
    launchCRC(data_file,genome,dynamic_rose_output,group1_name,group1_list,group1_crc_folder,activity_path)

    #for group2
    group2_crc_folder = utils.formatFolder('%s%s' % (crc_folder,group2_name),True)
    launchCRC(data_file,genome,dynamic_rose_output,group2_name,group2_list,group2_crc_folder,activity_path)
Пример #42
0
def makeNameDict(dataFile,roseFolder,namesList=[],enhancerType='super'):

    '''
    for each name, check for the presence of an enriched file or  allEnhancer table
    these are the files required for enhancer clustering

    returns a dict keyed by dataset name, each entry containing:
        'background'   -- True if the dataset's background sample is in the data table
        'enrichedFile' -- path to the macs enriched peaks file, or '' if absent
        'enhancerFile' -- path to the ROSE AllEnhancers table, or '' if absent
    exits via sys.exit() if a dataset has neither file.

    namesList defaults to all non-WCE/non-INPUT datasets in the data table.
    (the mutable [] default is safe here: the list is only reassigned, never
    mutated in place)

    Python 2 code: uses print statements, string.upper and dict.has_key.
    '''

    dataDict = pipeline_dfci.loadDataTable(dataFile)
    
    #draw the parent folder from the dataFile
    parentFolder = utils.getParentFolder(dataFile)
    print "Using %s as the parent folder" % (parentFolder)

    #check to see if a rose folder exists already
    #(formatFolder with create=False returns falsy when the folder is missing)
    if utils.formatFolder(roseFolder,False):
        roseExists = True
        roseFolder = utils.formatFolder(roseFolder,False)
    else:
        roseExists = False
        roseFolder = utils.formatFolder(roseFolder,True)

    #check namesList to see if datasets exist
    if len(namesList) == 0:


        namesList = [name for name in dataDict.keys() if string.upper(name).count('WCE') ==0 and string.upper(name).count('INPUT') == 0 ]
        #if no namesList is given, this filters out WCE 

    #now check that all of the datasets at a minimum have a rose output OR enriched region file

    nameDict = {}
    for name in namesList:
        
        nameDict[name] = {}

        #check if each dataset has a background

        backgroundName = dataDict[name]['background']
        if dataDict.has_key(backgroundName):
            nameDict[name]['background'] = True
        else:
            nameDict[name]['background'] = False

        #assumes standard folder structure for enriched file
        enrichedFile = "%smacsEnriched/%s" % (parentFolder,dataDict[name]['enrichedMacs'])
        
        print "Looking for macs output at %s" % (enrichedFile)

        #existence probe: open succeeds -> record the path, IOError -> record ''
        try:
            foo = open(enrichedFile,'r')
            foo.close()
            nameDict[name]['enrichedFile'] = enrichedFile
        except IOError:
            nameDict[name]['enrichedFile'] = ''

        #roseOutput looks for standard format rose output
        #need an allEnhancers table and a region table to proceed
        #if the rose folder doesn't exist, don't bother
        if roseExists:
            try:
                roseOutputFiles = os.listdir("%s%s_ROSE" % (roseFolder,name))
                #pick the filename pattern matching the requested enhancer type
                if enhancerType == 'super':
                    enhancerString = 'AllEnhancers.table.txt'
                if enhancerType == 'stretch':
                    enhancerString = 'AllEnhancers_Length.table.txt'
                if enhancerType == 'superstretch':
                    enhancerString = 'AllEnhancers_SuperStretch.table.txt'

                allEnhancerFileList = [x for x in roseOutputFiles if x.count(enhancerString) == 1 and x[0] != '.' ] #no weird hidden or temp files
                if len(allEnhancerFileList) > 0:
                    nameDict[name]['enhancerFile'] = "%s%s_ROSE/%s" % (roseFolder,name,allEnhancerFileList[0])
                else:
                    nameDict[name]['enhancerFile'] = ''
            except OSError:
                #dataset has no <name>_ROSE directory
                nameDict[name]['enhancerFile']=''
        else:
            nameDict[name]['enhancerFile'] = ''
        
        #NOTE(review): "ENAHNCER" typo in the runtime message below -- left
        #untouched because it is program output, not a comment
        if nameDict[name]['enhancerFile'] == '' and nameDict[name]['enrichedFile'] =='':
            print "INSUFFICIENT DATA TO RUN ENAHNCER ANALYSIS ON %s. PLEASE MAKE SURE ROSE OUTPUT OR MACS ENRICHED REGION PEAKS FILE EXISTS" % (name)
            print nameDict[name]
            sys.exit()
    return nameDict
Пример #43
0
def makeNameDict(dataFile, roseFolder, namesList=[], enhancerType='super'):
    '''
    for each name, check for the presence of an enriched file or  allEnhancer table
    these are the files required for enhancer clustering
    '''

    dataDict = pipeline_dfci.loadDataTable(dataFile)

    #draw the parent folder from the dataFile
    parentFolder = utils.getParentFolder(dataFile)
    print "Using %s as the parent folder" % (parentFolder)

    #check to see if a rose folder exists already
    if utils.formatFolder(roseFolder, False):
        roseExists = True
        roseFolder = utils.formatFolder(roseFolder, False)
    else:
        roseExists = False
        roseFolder = utils.formatFolder(roseFolder, True)

    #check namesList to see if datasets exist
    if len(namesList) == 0:

        namesList = [
            name for name in dataDict.keys()
            if string.upper(name).count('WCE') == 0
            and string.upper(name).count('INPUT') == 0
        ]
        #if no namesList is given, this filters out WCE

    #now check that all of the datasets at a minimum have a rose output OR enriched region file

    nameDict = {}
    for name in namesList:

        nameDict[name] = {}

        #check if each dataset has a background

        backgroundName = dataDict[name]['background']
        if dataDict.has_key(backgroundName):
            nameDict[name]['background'] = True
        else:
            nameDict[name]['background'] = False

        #assumes standard folder structure for enriched file
        enrichedFile = "%smacsEnriched/%s" % (parentFolder,
                                              dataDict[name]['enrichedMacs'])

        print "Looking for macs output at %s" % (enrichedFile)

        try:
            foo = open(enrichedFile, 'r')
            foo.close()
            nameDict[name]['enrichedFile'] = enrichedFile
        except IOError:
            nameDict[name]['enrichedFile'] = ''

        #roseOutput looks for standard format rose output
        #need an allEnhancers table and a region table to proceed
        #if the rose folder doesn't exist, don't bother
        if roseExists:
            try:
                roseOutputFiles = os.listdir("%s%s_ROSE" % (roseFolder, name))
                if enhancerType == 'super':
                    enhancerString = 'AllEnhancers.table.txt'
                if enhancerType == 'stretch':
                    enhancerString = 'AllEnhancers_Length.table.txt'
                if enhancerType == 'superstretch':
                    enhancerString = 'AllEnhancers_SuperStretch.table.txt'

                allEnhancerFileList = [
                    x for x in roseOutputFiles
                    if x.count(enhancerString) == 1 and x[0] != '.'
                ]  #no weird hidden or temp files
                if len(allEnhancerFileList) > 0:
                    nameDict[name]['enhancerFile'] = "%s%s_ROSE/%s" % (
                        roseFolder, name, allEnhancerFileList[0])
                else:
                    nameDict[name]['enhancerFile'] = ''
            except OSError:
                nameDict[name]['enhancerFile'] = ''
        else:
            nameDict[name]['enhancerFile'] = ''

        if nameDict[name]['enhancerFile'] == '' and nameDict[name][
                'enrichedFile'] == '':
            print "INSUFFICIENT DATA TO RUN ENAHNCER ANALYSIS ON %s. PLEASE MAKE SURE ROSE OUTPUT OR MACS ENRICHED REGION PEAKS FILE EXISTS" % (
                name)
            print nameDict[name]
            sys.exit()
    return nameDict
Пример #44
0
#pipeline_dfci.makePipelineTable(sampleTableFile,dirPath,bamPath,dataFile)

#dataDict = pipeline_dfci.loadDataTable(dataFile)

#namesList = dataDict.keys()

#print(namesList)

#==========================================================================
#=======================LOADING DATA ANNOTATION============================
#==========================================================================

##THIS SECTION LOADS A DATA TABLE.  MUST BE UNCOMMENTED FOR REST OF CODE TO WORK

#LOADING THE DATA TABLE
#dataFile is expected to be defined earlier in this script (module-level config)
dataDict = pipeline_dfci.loadDataTable(dataFile)
print(dataDict.keys())

#prints a per-dataset summary of the data table to stdout
pipeline_dfci.summary(dataFile)

#==========================================================================
#==========================CALLING BOWTIE==================================
#==========================================================================

##THIS SECTION CALLS BOWTIE ON RAW READ FILES TO GENERATE SORTED AND INDEXED BAMS IN THE BAM FOLDER

#namesList = []  <- fill this in if you want to only map a subset of the data. otherwise leave blank

##SET LAUNCH TO False to debug
#pipeline_dfci.makeBowtieBashJobs(dataFile,namesList,launch=True)
Пример #45
0
def mergeRoseSignal(dataFile,roseOutput,roseDict1,roseDict2,name1,name2,namesList1,namesList2,useBackground,medianScale):

    '''
    takes the rose output and merges signal

    dataFile: pipeline data table (used to look up each sample's background)
    roseOutput: path to a ROSE *MAP.txt table; columns 0-5 describe the
        region, signal columns follow -- assumed ordered as namesList1
        samples, then namesList2 samples, then (if useBackground) the
        matching background columns in the same order -- TODO confirm
        against the mapping step that produced roseOutput
    roseDict1, roseDict2: dicts providing a 'RegionMap' table per group,
        used only when medianScale is True
    name1, name2: labels for the two merged signal columns
    useBackground: if True, subtract background signal per sample and zero
        out negatives before averaging
    medianScale: if True, divide each sample column of the NORM table by
        that sample's median region signal

    Writes *MAP_MERGED.txt (one averaged column per group) and
    *MAP_NORM.txt (one column per sample) next to roseOutput and returns
    (output_merged, output_norm).
    '''
    print(roseOutput)
    initialMap = utils.parseTable(roseOutput,'\t')
    print(len(initialMap))
    #derive output paths from the input MAP.txt path
    output_merged = string.replace(roseOutput,'MAP.txt','MAP_MERGED.txt')
    output_norm = string.replace(roseOutput,'MAP.txt','MAP_NORM.txt')

    #one column for each signal
    #column indices into the signal portion of each row (see layout above)
    name1Columns = range(0,len(namesList1),1)
    name2Columns = range(len(namesList1),len(namesList1+namesList2),1)
    if useBackground:
        name1BackgroundColumns = range(len(namesList1 +namesList2),len(namesList1 + namesList2 + namesList1),1)
        name2BackgroundColumns = range(len(namesList1 +namesList2+namesList1),len(namesList1 + namesList2 + namesList1 + namesList2),1)
    
    mergedMap = [initialMap[0][0:6] + ['%s_SIGNAL' % (name1),'%s_SIGNAL' % (name2)]]
    normMap = [initialMap[0][0:6] + namesList1 + namesList2]
    for line in initialMap[1:]: 

        signalVector = [float(x) for x in line[7:]]     #we ignore the 6th column
        if useBackground:
            #per-sample background subtraction, clamped at zero, then averaged
            name1Vector = [signalVector[i] for i in name1Columns]
            name1BackgroundVector = [signalVector[i] for i in name1BackgroundColumns]
            name1NormVector = numpy.subtract(name1Vector,name1BackgroundVector).tolist()
            #now zero out any negatives
            name1NormVector = [max(0,signal) for signal in name1NormVector]
            name1Signal = numpy.mean(name1NormVector)

            name2Vector = [signalVector[i] for i in name2Columns]
            name2BackgroundVector = [signalVector[i] for i in name2BackgroundColumns]
            name2NormVector = numpy.subtract(name2Vector,name2BackgroundVector).tolist()
            #now zero out any negatives
            name2NormVector = [max(0,signal) for signal in name2NormVector]
            name2Signal = numpy.mean(name2NormVector)
            
        else:
            name1Vector = [signalVector[i] for i in name1Columns]
            name1Signal = numpy.mean(name1Vector)

            name2Vector = [signalVector[i] for i in name2Columns]
            name2Signal = numpy.mean(name2Vector)
        mergeLine = line[0:6] + [name1Signal,name2Signal]
        mergedMap.append(mergeLine)
        #NOTE: when useBackground is True these vectors are the raw (not
        #background-subtracted) per-sample values
        normLine = line[0:6] + name1Vector + name2Vector
        normMap.append(normLine)


    if medianScale:
        #now we basically have to do the same thing to the region map for each one

        #this must have the correct name/background relationships as the original rose
        dataDict = pipeline_dfci.loadDataTable(dataFile) 
        medianDict = defaultdict(float)
        #can do this for each region map
        regionMap1 = roseDict1['RegionMap']
        regionMap2 = roseDict2['RegionMap']
        print(regionMap1)
        print(regionMap2)
        
        #median signal per sample (background-subtracted if requested)
        for name in namesList1:
            signalVector = getSignalVector(regionMap1,name,dataFile)
            if useBackground:
                backgroundName = dataDict[name]['background']
                backgroundVector = getSignalVector(regionMap1,backgroundName,dataFile)
                normVector = numpy.subtract(signalVector,backgroundVector).tolist()
                medianDict[name] = numpy.median(normVector)
            else:
                medianDict[name] = numpy.median(signalVector)

        #for second namesList must use regionMap2
        for name in namesList2:
            signalVector = getSignalVector(regionMap2,name,dataFile)
            if useBackground:
                backgroundName = dataDict[name]['background']
                backgroundVector = getSignalVector(regionMap2,backgroundName,dataFile)
                normVector = numpy.subtract(signalVector,backgroundVector).tolist()
                medianDict[name] = numpy.median(normVector)
            else:
                medianDict[name] = numpy.median(signalVector)
    
        #so here we only need to adjust the normMap
        #divides each sample column in place by that sample's median signal
        for name in namesList1 + namesList2:

            medianSignal = medianDict[name]
            col = normMap[0].index(name)
            for row in range(1,len(normMap)):
                signal = float(normMap[row][col])
                normMap[row][col] = float(signal)/float(medianSignal)
        print(medianDict)


    utils.unParseTable(mergedMap,output_merged,'\t')
    utils.unParseTable(normMap,output_norm,'\t')

    


    return output_merged,output_norm
Пример #46
0
def finishRankOutput(dataFile,rankOutput,genome,mergeFolder,mergeName,name1,name2,cutOff=1.5,window = 100000,superOnly=True,plotBam=True):

    '''
    cleans up the rank output table
    makes a gff of all of the gained/lost supers beyond
    a certain cutoff w/ a window
    makes a list of gained genes and lost genes
    makes a bed of gained loss

    dataFile: pipeline data table (used to find bams for plotting)
    rankOutput: path to the rank-change table produced upstream; column 6
        is compared against cutOff to call gained/lost regions
        (presumably a delta/fold-change value -- confirm against the
        upstream rank step), and the 4th-from-last column is labeled
        'DELTA RANK'
    genome: genome build name (uppercased for output naming)
    mergeFolder: merge analysis folder; 'output/' is created inside it
    mergeName: analysis name used in all output filenames
    name1, name2: the two conditions; name2 is the "gained" direction
    cutOff: threshold on column 6 for calling gained (> cutOff) or lost
        (< -cutOff) regions
    window: bp added on each side for the windowed gff outputs
    superOnly: if True, label outputs as SUPERS; else ENHANCERS
    plotBam: if True, shell out to bamPlot_turbo.py for gained/lost regions

    Side effects only (writes tables/gffs/beds, copies plots, runs
    bamPlot_turbo.py); returns None.
    '''
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    #making sure window and cutoff are int/float
    cutOff = float(cutOff)
    window = int(window)
    genome = string.upper(genome)

    #make the output folder
    outputFolder =pipeline_dfci.formatFolder(mergeFolder+'output/',True)
    
    #bring in the old rank table
    rankEnhancerTable = utils.parseTable(rankOutput,'\t')
    
    #make a new formatted table
    header = rankEnhancerTable[0]
    header[-4] = 'DELTA RANK'
    header[-3] = 'IS_SUPER'
    formattedRankTable =[header]

    #the gffs
    gainedGFF = []
    lostGFF = []

    gainedWindowGFF = []
    lostWindowGFF = []

    if superOnly:
        enhancerType = 'SUPERS'
    else:
        enhancerType = 'ENHANCERS'

    #the beds
    if superOnly:
        gainedTrackHeader = 'track name="%s %s only SEs" description="%s super enhancers that are found only in %s vs %s" itemRGB=On color=255,0,0' % (genome,name2,genome,name2,name1)
        gainedBed = [[gainedTrackHeader]]
        conservedTrackHeader = 'track name="%s %s and %s SEs" description="%s super enhancers that are found in both %s vs %s" itemRGB=On color=0,0,0' % (genome,name1,name2,genome,name1,name2)
        conservedBed = [[conservedTrackHeader]]

        lostTrackHeader = 'track name="%s %s only SEs" description="%s super enhancers that are found only in %s vs %s" itemRGB=On color=0,255,0' % (genome,name1,genome,name1,name2)
        lostBed = [[lostTrackHeader]]
    else:
        gainedTrackHeader = 'track name="%s %s only enhancers" description="%s enhancers that are found only in %s vs %s" itemRGB=On color=255,0,0' % (genome,name2,genome,name2,name1)
        gainedBed = [[gainedTrackHeader]]
        conservedTrackHeader = 'track name="%s %s and %s enhancers" description="%s enhancers that are found in both %s vs %s" itemRGB=On color=0,0,0' % (genome,name1,name2,genome,name1,name2)
        conservedBed = [[conservedTrackHeader]]

        lostTrackHeader = 'track name="%s %s only enhancers" description="%s enhancers that are found only in %s vs %s" itemRGB=On color=0,255,0' % (genome,name1,genome,name1,name2)
        lostBed = [[lostTrackHeader]]



    #the genes
    geneTable =[['GENE','ENHANCER_ID','ENHANCER_CHROM','ENHANCER_START','ENHANCER_STOP',header[6],header[7],header[8],'STATUS']]

    for line in rankEnhancerTable[1:]:
        #fixing the enhancer ID
        line[0] = line[0].replace('_lociStitched','')
        formattedRankTable.append(line)

        #getting the genes (columns 9-11 hold comma-joined gene lists)
        geneList = []
        geneList += line[9].split(',')
        geneList += line[10].split(',')
        geneList += line[11].split(',')
        geneList = [x for x in geneList if len(x) >0]
        geneList = utils.uniquify(geneList)
        geneString = string.join(geneList,',')

        #bed line: chrom, start, stop, enhancer ID, DELTA RANK score
        bedLine = [line[1],line[2],line[3],line[0],line[-4]]
        
        #for gained
        if float(line[6]) > cutOff:
            gffLine = [line[1],line[0],'',line[2],line[3],'','.','',geneString]
            gffWindowLine = [line[1],line[0],'',int(line[2])-window,int(line[3])+window,'','.','',geneString]
            gainedGFF.append(gffLine)
            gainedWindowGFF.append(gffWindowLine)
            geneStatus = name2
            gainedBed.append(bedLine)
        #for lost
        elif float(line[6]) < (-1 * cutOff):
            gffLine = [line[1],line[0],'',line[2],line[3],'','.','',geneString]
            gffWindowLine = [line[1],line[0],'',int(line[2])-window,int(line[3])+window,'','.','',geneString]
            lostGFF.append(gffLine)
            lostWindowGFF.append(gffWindowLine)
            geneStatus = name1
            lostBed.append(bedLine)
        #for conserved
        else:
            geneStatus = 'CONSERVED'
            conservedBed.append(bedLine)

        #now fill in the gene Table
        for gene in geneList:
            geneTableLine = [gene,line[0],line[1],line[2],line[3],line[6],line[7],line[8],geneStatus]
            geneTable.append(geneTableLine)

    #concat the bed
    fullBed = gainedBed + conservedBed + lostBed
            
    #start writing the output
    #there's the two gffs, the bed,the formatted table, the gene table
    
    
    #formatted table
    formattedFilename = "%s%s_%s_MERGED_%s_RANK_TABLE.txt" % (outputFolder,genome,mergeName,enhancerType)
    utils.unParseTable(formattedRankTable,formattedFilename,'\t')

    #gffs
    #NOTE: window/1000 relies on Python 2 integer division for the KB label
    gffFolder = pipeline_dfci.formatFolder(outputFolder+'gff/',True)
    gffFilename_gained = "%s%s_%s_%s_ONLY_%s_-0_+0.gff" % (gffFolder,genome,mergeName,string.upper(name2),enhancerType)
    gffFilenameWindow_gained = "%s%s_%s_%s_ONLY_%s_-%sKB_+%sKB.gff" % (gffFolder,genome,mergeName,string.upper(name2),enhancerType,window/1000,window/1000)

    gffFilename_lost = "%s%s_%s_%s_ONLY_%s_-0_+0.gff" % (gffFolder,genome,mergeName,string.upper(name1),enhancerType)
    gffFilenameWindow_lost = "%s%s_%s_%s_ONLY_%s_-%sKB_+%sKB.gff" % (gffFolder,genome,mergeName,string.upper(name1),enhancerType,window/1000,window/1000)

    utils.unParseTable(gainedGFF,gffFilename_gained,'\t')
    utils.unParseTable(gainedWindowGFF,gffFilenameWindow_gained,'\t')
            
    utils.unParseTable(lostGFF,gffFilename_lost,'\t')
    utils.unParseTable(lostWindowGFF,gffFilenameWindow_lost,'\t')
    
    #bed
    bedFilename = "%s%s_%s_MERGED_%s.bed" % (outputFolder,genome,mergeName,enhancerType)
    utils.unParseTable(fullBed,bedFilename,'\t')

    #geneTable
    geneFilename = "%s%s_%s_MERGED_%s_GENE_TABLE.txt" % (outputFolder,genome,mergeName,enhancerType)
    utils.unParseTable(geneTable,geneFilename,'\t')

    #finally, move all of the plots to the output folder
    cmd = "cp %s%s_ROSE/*.pdf %s%s_%s_MERGED_%s_DELTA.pdf" % (mergeFolder,name1,outputFolder,genome,mergeName,enhancerType)
    os.system(cmd)

    cmd = "cp %s%s_ROSE/*RANK_PLOT.png %s%s_%s_MERGED_%s_RANK_PLOT.png" % (mergeFolder,name1,outputFolder,genome,mergeName,enhancerType)
    os.system(cmd)

    #now execute the bamPlot_turbo.py commands
    if plotBam:
        bam1 = dataDict[name1]['bam']
        bam2 = dataDict[name2]['bam']
        bamString = "%s,%s" % (bam1,bam2)
        nameString = "%s,%s" % (name1,name2)
        colorString = "0,0,0:100,100,100"

        #change dir
        #NOTE(review): pipelineDir is a module-level global -- confirm it is
        #set before this function runs
        os.chdir(pipelineDir)
    
        if len(gainedGFF) > 0:
            #gained command
            plotTitle = "%s_ONLY_SE" % (name2)
            cmd = 'python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE' % (genome,bamString,gffFilename_gained,outputFolder,nameString,colorString,plotTitle)
            os.system(cmd)

            #gained window command
            plotTitle = "%s_ONLY_SE_%sKB_WINDOW" % (name2,window/1000)
            cmd = 'python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE' % (genome,bamString,gffFilenameWindow_gained,outputFolder,nameString,colorString,plotTitle)
            os.system(cmd)

        if len(lostGFF) > 0:
            #lost command
            plotTitle = "%s_ONLY_SE" % (name1)
            cmd = 'python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE' % (genome,bamString,gffFilename_lost,outputFolder,nameString,colorString,plotTitle)
            os.system(cmd)

            #lost command
            plotTitle = "%s_ONLY_SE_%sKB_WINDOW" % (name1,window/1000)
            cmd = 'python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE' % (genome,bamString,gffFilenameWindow_lost,outputFolder,nameString,colorString,plotTitle)
            os.system(cmd)


    return
Пример #47
0
def main():

    '''
    Command-line entry point for the enhancer clustering pipeline.

    Parses options (-d data file and -o output folder are required), then
    runs the full sequence: establish working files, launch enhancer
    mapping, compute median signal, merge enhancer regions, map data to
    the merged regions, build the signal table, call the clustering R
    script, and finally run ROSE gene mapping on the cluster table.
    Prints usage and exits if required options are missing.
    '''

    from optparse import OptionParser

    usage = "usage: %prog [options] -d [DATA_FILE] -n [NAMES_LIST] -r [ROSE_FOLDER] -o [OUTPUTFOLDER]"
    parser = OptionParser(usage = usage)
    #required flags
    parser.add_option("-d","--data", dest="data",nargs = 1, default=None,
                      help = "Enter a data file for datasets to be processed")
    parser.add_option("-o","--output",dest="output",nargs =1, default = None,
                      help = "specify an output folder to write results to")

    #additional options
    parser.add_option("-i","--input", dest="input",nargs = 1, default=None,
                      help = "Enter a comma separated list of names to analyze. Default will be all datasets")

    parser.add_option("-n","--name", dest="name",nargs=1,default=None,
                      help = "Enter a name for the analysis")

    parser.add_option("-r","--rose", dest="rose",nargs = 1, default=None,
                      help = "Enter a folder to detect or write rose output")

    parser.add_option("-a","--all", dest="all",action = 'store_true', default=False,
                      help = "flag to run analysis on ALL enhancers (this is much slower)")

    (options,args) = parser.parse_args()

    print(options)
    print(args)
    
    if options.data and options.output:

        #check to see if minimum arguments are met

        #pull in arguments
        
        #pull in the datafile and create a datadict
        dataFile = options.data




        #now the output folder
        outputFolder = utils.formatFolder(options.output,True) #check and create the output folder
        #now the rose folder
        if options.rose:
            roseFolder = options.rose
        else:
            #default: a rose/ subfolder inside the output folder
            roseFolder = "%srose/" % (outputFolder)

        if options.input:
            namesList = options.input.split(',')
        else:
            namesList = []

        #get the genome from an arbitrary dataset entry (assumes all
        #datasets in the table share one genome)
        dataDict = pipeline_dfci.loadDataTable(dataFile)
        genome = dataDict[dataDict.keys()[0]]['genome']

        #check if using only supers
        if options.all:
            superOnly = False
        else:
            superOnly = True

        #get the anlysis name
        if options.name:
            analysisName = options.name
        else:
            analysisName = "enhancers"

        #=====================================================
        #=================SUMMARIZE INPUTS====================
        #=====================================================
        
        print "WORKING IN GENOME %s" % (genome)
        print "DRAWING DATA FROM %s AND ROSE FOLDER %s" % (dataFile,roseFolder)
        print "USING %s AS THE OUTPUT FOLDER" % (outputFolder)
        print "STARTING ANALYSIS ON THE FOLLOWING DATASETS:"
        print namesList

        #=====================================================
        #==============ESTABLISH ALL WORKING FILES============
        #=====================================================

        print "\n\n\nESTABLISHING WORKING FILES"
        nameDict = makeNameDict(dataFile,roseFolder,namesList)

            
        print nameDict
        
        #=====================================================
        #==============LAUNCH ENHANCER MAPPING================
        #=====================================================
        
        print "\n\n\nLAUNCHING ENHANCER MAPPING (IF NECESSARY)"
        nameDict = launchEnhancerMapping(dataFile,nameDict,outputFolder)
        print nameDict


        #=====================================================
        #====================GET MEDIAN SIGNAL================
        #=====================================================
        
        print "\n\n\nGETTING MEDIAN ENHANCER SIGNAL FROM EACH SAMPLE"
        medianDict = makeMedianDict(nameDict)

        print medianDict
        
        #=====================================================
        #====================MERGING ENHANCERS================
        #=====================================================
        
        print "\n\n\nIDENTIFYING CONSENSUS ENHANCER REGIONS"
        mergedGFFFile = "%s%s_%s_-0_+0.gff" % (outputFolder,genome,analysisName)
        mergeCollections(nameDict,analysisName,mergedGFFFile,superOnly)


        #=====================================================
        #===============MAP TO MERGED REGIONS=================
        #=====================================================

        print "\n\n\nMAPPING DATA TO CONSENSUS ENHANCER REGIONS"
        mergedRegionMap = mapMergedGFF(dataFile,nameDict,mergedGFFFile,analysisName,outputFolder)
        
        #=====================================================
        #==============CORRECT FOR MEDIAN SIGNAL==============
        #=====================================================

        print "\n\n\nCREATING ENHANCER SIGNAL TABLE"
        signalTableFile = makeEnhancerSignalTable(mergedRegionMap,medianDict,analysisName,genome,outputFolder)
        #=====================================================
        #===============CALL CLUSTERING R SCRIPT==============
        #=====================================================

        print "\n\n\nGENERATING CLUSTERING OUTPUT"
        clusterTableFile = callRScript(genome,outputFolder,analysisName,signalTableFile)
        #output should be
        #png of cluster gram with rows as genes
        #png of cluster gram of samples w/ tree
        #ordered table w/ cluster assignment
        #similarity matrix for samples

        #=====================================================
        #=============GENE MAPPING BY CLUSTER=================
        #=====================================================

        #NOTE(review): hard-coded cluster-specific path; this step only
        #works on the original /ark deployment
        os.chdir('/ark/home/cl512/rose/')
        cmd = 'python /ark/home/cl512/rose/ROSE_geneMapper.py -g %s -i %s' % (genome,clusterTableFile)
        os.system(cmd)

        print "FINISHED"


    else:
        parser.print_help()
        sys.exit()