Exemplo n.º 1
0
def main():
    '''
    Entry point for the project analysis.

    Moves into the project folder, runs pipeline_dfci.summary on the ChIP
    and ATAC data tables, then hands the MM1S H3K27AC and MED1 datasets to
    pipeline_dfci.callRose2.

    Relies on module-level names not defined in this function: projectName,
    projectFolder, chip_data_file and atac_data_file.
    '''
    rule = '#======================================================================'

    def announce(title):
        # a section header is: blank lines, rule, title, rule, blank lines
        for text in ('\n\n', rule, title, rule, '\n\n'):
            print(text)

    print('main analysis for project %s' % (projectName))

    print('changing directory to project folder')
    os.chdir(projectFolder)

    announce(
        '#==================I. LOADING DATA ANNOTATION TABLES==================='
    )

    # per the original author's note, summary() sanity checks a data table
    # and confirms the bam and .bai files are accessible
    for data_table in (chip_data_file, atac_data_file):  # chip first, then atac
        pipeline_dfci.summary(data_table)

    announce(
        '#==========================II. CALLING ROSE2==========================='
    )

    # folder with macs peak output beds
    peak_folder = '%smacsEnriched/' % (projectFolder)

    # folder that will hold the ROSE2 output (second argument True as in
    # every other formatFolder call in this file)
    rose_folder = utils.formatFolder('%srose/' % (projectFolder), True)

    # ROSE2 is called on H3K27AC and MED1 defined enhancers
    enhancer_marks = ['MM1S_H3K27AC', 'MM1S_MED1']

    rose_script = '%sMM1S_ROSE_CALLS.sh' % (rose_folder)
    blacklist_bed = '%sgenomes/Homo_sapiens/UCSC/hg19/Annotation/Masks/hg19_encode_blacklist.bed' % (
        projectFolder)

    pipeline_dfci.callRose2(chip_data_file,
                            peak_folder,
                            rose_folder,
                            enhancer_marks,
                            extraMap=[],
                            inputFile='',
                            tss=2500,
                            stitch=12500,
                            bashFileName=rose_script,
                            mask=blacklist_bed,
                            useBackground=True)
Exemplo n.º 2
0
def callRoseMerged(dataFile, mergedGFFFile, name1, name2, parentFolder):
    '''
    Makes a ROSE2 call for the merged supers.

    name1 is passed to callRose2 as the only entry of the names list and
    name2 goes into the extra map. When both datasets list a background
    that is present in the data table, name2's background is appended to
    the extra map as well; when neither does, no background is mapped.

    Parameters:
        dataFile: data table path, loaded with pipeline_dfci.loadDataTable.
        mergedGFFFile: passed to callRose2 as its input file.
        name1, name2: dataset names; both must be keys of the data table.
        parentFolder: passed through to callRose2.

    Returns:
        Whatever pipeline_dfci.callRose2 returns (called with tss=0 and
        stitch=0).

    Exits with a non-zero status when exactly one of the two datasets has
    a background in the data table.
    '''

    dataDict = pipeline_dfci.loadDataTable(dataFile)

    backgroundName1 = dataDict[name1]['background']
    backgroundName2 = dataDict[name2]['background']

    # 'in' replaces dict.has_key, which no longer exists in Python 3
    hasBackground1 = backgroundName1 in dataDict
    hasBackground2 = backgroundName2 in dataDict

    if hasBackground1 != hasBackground2:
        print("ERROR: Only 1 dataset has a background file. This is a very very bad idea")
        # non-zero status so calling scripts can see the failure
        sys.exit(1)

    namesList = [name1]
    extraMap = [name2]
    if hasBackground2:
        extraMap.append(backgroundName2)

    return pipeline_dfci.callRose2(dataFile,
                                   '',
                                   parentFolder,
                                   namesList,
                                   extraMap,
                                   mergedGFFFile,
                                   tss=0,
                                   stitch=0)
Exemplo n.º 3
0
def callRoseMerged(dataFile,mergedGFFFile,name1,name2,parentFolder):

    '''
    Makes a ROSE2 call for the merged supers.

    name1 is handed to callRose2 as the single dataset in the names list;
    name2 is placed in the extra map, followed by its background when both
    datasets have a background present in the data table.

    Parameters:
        dataFile: data table path, loaded with pipeline_dfci.loadDataTable.
        mergedGFFFile: passed to callRose2 as its input file.
        name1, name2: dataset names; both must be keys of the data table.
        parentFolder: passed through to callRose2.

    Returns:
        The return value of pipeline_dfci.callRose2 (tss=0, stitch=0).

    Exits with a non-zero status if only one of the two datasets has a
    background in the data table.
    '''

    dataDict = pipeline_dfci.loadDataTable(dataFile)

    backgroundName1 = dataDict[name1]['background']
    backgroundName2 = dataDict[name2]['background']

    # membership test instead of dict.has_key (removed in Python 3)
    backgroundsFound = [backgroundName1 in dataDict, backgroundName2 in dataDict]

    if any(backgroundsFound) and not all(backgroundsFound):
        print("ERROR: Only 1 dataset has a background file. This is a very very bad idea")
        # report the failure through the exit status as well
        sys.exit(1)

    hasBackground = all(backgroundsFound)

    namesList = [name1]
    if hasBackground:
        extraMap = [name2,backgroundName2]
    else:
        extraMap = [name2]

    return pipeline_dfci.callRose2(dataFile,'',parentFolder,namesList,extraMap,mergedGFFFile,tss=0,stitch=0)
Exemplo n.º 4
0
def callRoseMerged(dataFile,mergedGFFFile,name1,name2,parentFolder,namesList1,namesList2,useBackground=False):

    '''
    Makes a ROSE2 call for the merged supers across two groups of datasets.

    The first dataset of namesList1 is used as a dummy entry for the names
    list so that callRose2 can run; every dataset of both groups (and, with
    useBackground, every background) is loaded into the extra map.

    Parameters:
        dataFile: data table path, loaded with pipeline_dfci.loadDataTable.
        mergedGFFFile: passed to callRose2 as its input file.
        name1, name2: group names, used only to build the bash file name
            '<parentFolder><name1>_<name2>_rose.sh'.
        parentFolder: passed through to callRose2.
        namesList1, namesList2: dataset names of the two groups;
            namesList1 must not be empty.
        useBackground: when True, every dataset must list a background
            that is itself a key of the data table.

    Returns:
        The return value of pipeline_dfci.callRose2.

    Exits with a non-zero status if useBackground is set and a dataset has
    no background in the data table.
    '''

    roseBashFile = '%s%s_%s_rose.sh' % (parentFolder,name1,name2)
    dataDict = pipeline_dfci.loadDataTable(dataFile)

    #just set the first dataset of namesList1 so the code can run
    #all of the data will be in the extramap
    namesList = [namesList1[0]]

    allNames = namesList1 + namesList2
    extraMap = list(allNames)

    if useBackground:
        #every dataset needs a background; they follow the datasets in the extra map
        for name in allNames:
            backgroundName = dataDict[name]['background']
            #'in' replaces dict.has_key, which is gone in Python 3
            if backgroundName not in dataDict:
                print("ERROR: No background dataset found for %s incompatible with --use-background flag" % (name))
                #non-zero status so the failure is visible to callers
                sys.exit(1)
            extraMap.append(backgroundName)

    #useBackground=False here: don't want additional background correction
    #from the pipeline wrapper of rose
    return pipeline_dfci.callRose2(dataFile,'',parentFolder,namesList,extraMap,mergedGFFFile,tss=0,stitch=0,bashFileName=roseBashFile,mask='',useBackground=False)
Exemplo n.º 5
0
def mapMergedGFF(dataFile,nameDict,mergedGFFFile,analysisName,outputFolder,maskFile):

    '''
    Calls rose on the mergedGFFFile for all datasets.

    The alphabetically first dataset of nameDict is passed to callRose2 as
    the names list; every other dataset goes into the extra map, followed
    by its background when nameDict[name]['background'] is truthy.

    Parameters:
        dataFile: data table path, loaded with pipeline_dfci.loadDataTable.
        nameDict: dict keyed by dataset name; each value needs a
            'background' entry.
        mergedGFFFile: gff path; its base name up to the first '.' is used
            in the expected region map file name.
        analysisName: used to name the bash file.
        outputFolder: parent of the 'rose/' folder; the path is built by
            plain concatenation, so it presumably needs a trailing slash.
        maskFile: passed to callRose2 as mask.

    Returns:
        Path of the stitched enhancer region map.

    Exits with a non-zero status when nameDict is empty, when a listed
    background is missing from the data table, or when the region map
    cannot be found after running rose.
    '''
    dataDict= pipeline_dfci.loadDataTable(dataFile)
    roseParentFolder = "%srose/" % (outputFolder)
    utils.formatFolder(roseParentFolder,True)
    gffName = mergedGFFFile.split('/')[-1].split('.')[0]
    bashFileName = "%srose/%s_roseCall.sh" % (outputFolder,analysisName)

    #namesList is just the first dataset
    #extraMap will have to have all other datasets + their backgrounds
    #sorted() works on Python 2 and 3 (dict.keys() has no .sort() in Python 3)
    namesList = sorted(nameDict.keys())
    if not namesList:
        print("ERROR: NO DATASETS PROVIDED FOR ENHANCER MAPPING")
        sys.exit(1)

    extraMap = []
    for name in namesList[1:]:
        if not nameDict[name]['background']:
            extraMap.append(name)
            continue

        backgroundName = dataDict[name]['background']
        if backgroundName not in dataDict:
            print("ERROR: UNABLE TO FIND LISTED BACKGROUND DATASET %s FOR %s" % (backgroundName,name))
            sys.exit(1)
        extraMap += [name,backgroundName]

    print(extraMap)

    #first check to see if this has already been done
    mergedRegionMap = "%srose/%s_ROSE/%s_0KB_STITCHED_ENHANCER_REGION_MAP.txt" % (outputFolder,namesList[0],gffName)
    print("LOOKING FOR REGION MAP AT %s" % (mergedRegionMap))

    if utils.checkOutput(mergedRegionMap,1,1):
        print("FOUND PREVIOUS REGION MAP")
        return mergedRegionMap

    bashFileName = pipeline_dfci.callRose2(dataFile,'',roseParentFolder,[namesList[0]],extraMap,mergedGFFFile,0,0,bashFileName,mask=maskFile)

    #announce the command before running it so the log reads in order
    bashCommand = "bash %s" % (bashFileName)
    print("Running enhancer mapping command:\n%s" % (bashCommand))
    os.system(bashCommand)

    if utils.checkOutput(mergedRegionMap,5,60):
        return mergedRegionMap

    print("UNABLE TO CALL ROSE ENHANCER MAPPING ON CONSENSUS ENHANCER FILE %s.\nEXITING NOW" % (mergedGFFFile))
    sys.exit(1)
Exemplo n.º 6
0
def map_shep_enhancers(shep_on_dataFile):
    '''
    Sets up ROSE2 for the SHEP 0HR, 2HR and 6HR time points, with
    enhancers in each individual system defined by k27ac.

    Prints the dataset names found in the data table, makes sure the
    enhancer_rose folder under projectFolder is formatted, and calls
    pipeline_dfci.callRose2 on the three H3K27AC datasets.

    Relies on the module-level names projectFolder, macsEnrichedFolder
    and maskFile.

    Returns the bash file path that was handed to callRose2.
    '''
    # list every dataset in the table; only the k27ac ones below are used
    print(pipeline_dfci.loadDataTable(shep_on_dataFile).keys())

    roseFolder = utils.formatFolder('%senhancer_rose' % (projectFolder), True)
    roseBashPath = '%senhancer_rose/shep_on_enhancer_rose.sh' % (projectFolder)

    k27acNames = [
        'SHEP_0HR_H3K27AC',
        'SHEP_2HR_H3K27AC',
        'SHEP_6HR_H3K27AC',
    ]

    pipeline_dfci.callRose2(shep_on_dataFile, macsEnrichedFolder, roseFolder,
                            k27acNames, [], '', 2500, '', roseBashPath,
                            maskFile)

    return roseBashPath
Exemplo n.º 7
0
def map_nb_enhancers(nb_all_chip_dataFile):
    '''
    Sets up ROSE2 for the SHEP21, BE2C, KELLY and NGP H3K27AC datasets,
    with enhancers in each individual system defined by k27ac.

    Prints the dataset names found in the data table, makes sure the
    enhancer_rose folder under projectFolder is formatted, and calls
    pipeline_dfci.callRose2 on the four H3K27AC datasets.

    Relies on the module-level names projectFolder, macsEnrichedFolder
    and maskFile.

    Returns the bash file path that was handed to callRose2.
    '''
    # list every dataset in the table; only the k27ac ones below are used
    print(pipeline_dfci.loadDataTable(nb_all_chip_dataFile).keys())

    roseFolder = utils.formatFolder('%senhancer_rose' % (projectFolder), True)
    roseBashPath = '%senhancer_rose/nb_enhancer_rose.sh' % (projectFolder)

    k27acNames = [
        'SHEP21_0HR_H3K27AC_NOSPIKE',
        'BE2C_H3K27AC',
        'KELLY_H3K27AC',
        'NGP_H3K27AC',
    ]

    pipeline_dfci.callRose2(nb_all_chip_dataFile, macsEnrichedFolder,
                            roseFolder, k27acNames, [], '', 2500, '',
                            roseBashPath, maskFile)

    return roseBashPath
Exemplo n.º 8
0
def mapMergedGFF(dataFile, nameDict, mergedGFFFile, analysisName, outputFolder,
                 maskFile):
    '''
    Calls rose on the mergedGFFFile for all datasets.

    The alphabetically first dataset of nameDict becomes the names list
    for callRose2; the remaining datasets are placed in the extra map,
    each followed by its background when nameDict[name]['background'] is
    truthy.

    Parameters:
        dataFile: data table path, loaded with pipeline_dfci.loadDataTable.
        nameDict: dict keyed by dataset name; each value needs a
            'background' entry.
        mergedGFFFile: gff path; its base name up to the first '.' is used
            in the expected region map file name.
        analysisName: used to name the bash file.
        outputFolder: parent of the 'rose/' folder; the path is built by
            plain concatenation, so it presumably needs a trailing slash.
        maskFile: passed to callRose2 as mask.

    Returns:
        Path of the stitched enhancer region map.

    Exits with a non-zero status when nameDict is empty, when a listed
    background is missing from the data table, or when the region map
    cannot be found after running rose.
    '''
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    roseParentFolder = "%srose/" % (outputFolder)
    utils.formatFolder(roseParentFolder, True)
    gffName = mergedGFFFile.split('/')[-1].split('.')[0]
    bashFileName = "%srose/%s_roseCall.sh" % (outputFolder, analysisName)

    #namesList is just the first dataset
    #extraMap will have to have all other datasets + their backgrounds
    #sorted() works on Python 2 and 3 (dict.keys() has no .sort() in Python 3)
    namesList = sorted(nameDict.keys())
    if not namesList:
        print("ERROR: NO DATASETS PROVIDED FOR ENHANCER MAPPING")
        sys.exit(1)

    firstName = namesList[0]
    extraMap = []
    for name in namesList[1:]:
        extraMap.append(name)
        if nameDict[name]['background']:
            backgroundName = dataDict[name]['background']
            #'in' replaces dict.has_key, which is gone in Python 3
            if backgroundName not in dataDict:
                print("ERROR: UNABLE TO FIND LISTED BACKGROUND DATASET %s FOR %s" % (
                    backgroundName, name))
                sys.exit(1)
            extraMap.append(backgroundName)

    print(extraMap)

    #first check to see if this has already been done
    mergedRegionMap = "%srose/%s_ROSE/%s_0KB_STITCHED_ENHANCER_REGION_MAP.txt" % (
        outputFolder, firstName, gffName)
    print("LOOKING FOR REGION MAP AT %s" % (mergedRegionMap))

    if utils.checkOutput(mergedRegionMap, 1, 1):
        print("FOUND PREVIOUS REGION MAP")
        return mergedRegionMap

    bashFileName = pipeline_dfci.callRose2(dataFile,
                                           '',
                                           roseParentFolder, [firstName],
                                           extraMap,
                                           mergedGFFFile,
                                           0,
                                           0,
                                           bashFileName,
                                           mask=maskFile)

    #announce the command before running it so the log reads in order
    bashCommand = "bash %s" % (bashFileName)
    print("Running enhancer mapping command:\n%s" % (bashCommand))
    os.system(bashCommand)

    if utils.checkOutput(mergedRegionMap, 5, 60):
        return mergedRegionMap

    print("UNABLE TO CALL ROSE ENHANCER MAPPING ON CONSENSUS ENHANCER FILE %s.\nEXITING NOW" % (
        mergedGFFFile))
    sys.exit(1)
Exemplo n.º 9
0
def launchEnhancerMapping(dataFile,
                          nameDict,
                          outputFolder,
                          roseFolder,
                          stitch,
                          tssDistance,
                          enhancerType,
                          maskFile=''):
    '''
    Launches enhancer mapping if needed from enriched region files.

    Every dataset in nameDict whose 'enhancerFile' is '' gets a rose call
    launched in the background; the function then looks for each launched
    dataset's enhancer table and records its path in nameDict.

    Parameters:
        dataFile: data table path, passed to pipeline_dfci.callRose2.
        nameDict: dict keyed by dataset name; each value needs
            'enhancerFile' and 'enrichedFile' entries. Updated in place.
        outputFolder: not used by this function.
        roseFolder: folder for rose output, run through utils.formatFolder.
        stitch, tssDistance: passed to callRose2.
        enhancerType: 'super', 'stretch' or 'superstretch'; selects which
            enhancer table file name to look for.
        maskFile: passed to callRose2 as mask.

    Returns:
        nameDict, with 'enhancerFile' filled in for the launched datasets.

    Raises:
        ValueError: if rose has to be called and enhancerType is not one
            of the three supported values.

    Exits with a non-zero status when a dataset's rose folder holds no
    visible files.
    '''

    namesList = list(nameDict.keys())

    #check to see if everything is good, if so return the dict and call it a day
    if all(len(nameDict[x]['enhancerFile']) > 0 for x in namesList):
        print("ENHANCER FILE OUTPUT FOUND FOR ALL DATASETS")
        return nameDict

    #define the enhancer type before launching anything, so that a bad
    #value fails here instead of as an unbound variable after the rose
    #jobs are already running
    enhancerStringDict = {
        'super': 'AllEnhancers.table.txt',
        'stretch': 'AllEnhancers_Length.table.txt',
        'superstretch': 'AllEnhancers_SuperStretch.table.txt',
    }
    if enhancerType not in enhancerStringDict:
        raise ValueError(
            "enhancerType must be one of 'super', 'stretch' or 'superstretch', got %r"
            % (enhancerType, ))
    enhancerString = enhancerStringDict[enhancerType]

    #if not, have to call rose
    roseOutputFolder = utils.formatFolder(roseFolder, True)

    queueList = []
    for name in namesList:

        #check to see if we need to call rose
        if nameDict[name]['enhancerFile'] != '':
            continue

        #get the enriched file
        enrichedFile = nameDict[name]['enrichedFile']
        #call rose
        print("CALLING ROSE FOR %s" % (name))
        bashFileName = pipeline_dfci.callRose2(dataFile,
                                               '',
                                               roseOutputFolder, [name],
                                               [],
                                               enrichedFile,
                                               tssDistance,
                                               stitch,
                                               mask=maskFile)
        print(bashFileName)
        #trailing '&' runs the script in the background
        os.system('bash %s &' % (bashFileName))
        #add name to queue list
        queueList.append(name)

    #now check for completion of datasets
    for name in queueList:

        #check for the AllEnhancers table
        enhancerFile = "%s%s_ROSE/%s_peaks_%s" % (roseOutputFolder, name, name,
                                                  enhancerString)

        print("CHECKING FOR %s ROSE OUTPUT IN %s" % (name, enhancerFile))
        if utils.checkOutput(enhancerFile, 1, 10):
            print("FOUND ENHANCER OUTPUT FOR %s" % (name))
            nameDict[name]['enhancerFile'] = enhancerFile
            continue

        #try finding it w/ a different name
        #os.listdir raises if the dataset's rose folder does not exist
        datasetRoseFolder = "%s%s_ROSE/" % (roseOutputFolder, name)
        roseFileList = [x for x in os.listdir(datasetRoseFolder)
                        if x[0] != '.']  #no hidden files
        if len(roseFileList) == 0:
            print("No files found in %s" % (datasetRoseFolder))
            sys.exit(1)
        enhancerFile = getFile(enhancerString, roseFileList, datasetRoseFolder)
        nameDict[name]['enhancerFile'] = enhancerFile

    return nameDict
Exemplo n.º 10
0
def launchEnhancerMapping(dataFile,nameDict,outputFolder,roseFolder,stitch,tssDistance,enhancerType,maskFile=''):

    '''
    Launches enhancer mapping if needed from enriched region files.

    Every dataset in nameDict whose 'enhancerFile' is '' gets a rose call
    launched in the background; the function then looks for each launched
    dataset's enhancer table and records its path in nameDict.

    Parameters:
        dataFile: data table path, passed to pipeline_dfci.callRose2.
        nameDict: dict keyed by dataset name; each value needs
            'enhancerFile' and 'enrichedFile' entries. Updated in place.
        outputFolder: not used by this function.
        roseFolder: folder for rose output, run through utils.formatFolder.
        stitch, tssDistance: passed to callRose2.
        enhancerType: 'super', 'stretch' or 'superstretch'; selects which
            enhancer table file name to look for.
        maskFile: passed to callRose2 as mask.

    Returns:
        nameDict, with 'enhancerFile' filled in for the launched datasets.

    Raises:
        ValueError: if rose has to be called and enhancerType is not one
            of the three supported values.

    Exits with a non-zero status when a dataset's rose folder holds no
    visible files.
    '''

    namesList = list(nameDict.keys())

    #check to see if everything is good, if so return the dict and call it a day
    if all(len(nameDict[x]['enhancerFile']) > 0 for x in namesList):
        print("ENHANCER FILE OUTPUT FOUND FOR ALL DATASETS")
        return nameDict

    #define the enhancer type up front: an unknown value used to surface
    #as an unbound variable only after the rose jobs had been launched
    enhancerStringDict = {
        'super': 'AllEnhancers.table.txt',
        'stretch': 'AllEnhancers_Length.table.txt',
        'superstretch': 'AllEnhancers_SuperStretch.table.txt',
        }
    if enhancerType not in enhancerStringDict:
        raise ValueError("enhancerType must be one of 'super', 'stretch' or 'superstretch', got %r" % (enhancerType,))
    enhancerString = enhancerStringDict[enhancerType]

    #if not, have to call rose
    roseOutputFolder = utils.formatFolder(roseFolder,True)

    #datasets still missing an enhancer file
    queueList = [name for name in namesList if nameDict[name]['enhancerFile'] == '']

    for name in queueList:
        #get the enriched file
        enrichedFile = nameDict[name]['enrichedFile']
        #call rose
        print("CALLING ROSE FOR %s" % (name))
        bashFileName = pipeline_dfci.callRose2(dataFile,'',roseOutputFolder,[name],[],enrichedFile,tssDistance,stitch,mask=maskFile)
        print(bashFileName)
        #trailing '&' runs the script in the background
        os.system('bash %s &' % (bashFileName))

    #now check for completion of datasets
    for name in queueList:

        #check for the AllEnhancers table
        enhancerFile = "%s%s_ROSE/%s_peaks_%s" % (roseOutputFolder,name,name,enhancerString)

        print("CHECKING FOR %s ROSE OUTPUT IN %s" % (name,enhancerFile))
        if utils.checkOutput(enhancerFile,1,10):
            print("FOUND ENHANCER OUTPUT FOR %s" % (name))
        else:
            #try finding it w/ a different name
            #os.listdir raises if the dataset's rose folder does not exist
            datasetRoseFolder = "%s%s_ROSE/" % (roseOutputFolder,name)
            roseFileList = [x for x in os.listdir(datasetRoseFolder) if x[0] != '.'] #no hidden files
            if len(roseFileList) == 0:
                print("No files found in %s" % (datasetRoseFolder))
                sys.exit(1)
            enhancerFile = getFile(enhancerString,roseFileList,datasetRoseFolder)

        nameDict[name]['enhancerFile'] = enhancerFile

    return nameDict
Exemplo n.º 11
0
def wrapDRose(dataFile, name1, name2, analysis_name):
    '''
    Wraps the delta rose analysis, using rose w/ 0 tss and 0 stitch.

    Steps:
      1. Look for the AllEnhancers tables of name1 and name2 under
         <projectFolder>twist1_rose/; if either is missing, set up a
         ROSE2 call through pipeline_dfci.callRose2.
      2. Write a bash script that runs dynamicEnhancer.py on the two rose
         folders and run it unless the merged rank table already exists.
      3. Convert the rank table into a gff written to gffFolder.

    Relies on the module-level names projectFolder, macsEnrichedFolder,
    maskFile, pipeline_dir, genome and gffFolder.

    Parameters:
        dataFile: data table path passed to callRose2 and dynamicEnhancer.py.
        name1, name2: the two dataset names to compare.
        analysis_name: used to name the bash files, the dynamic output
            folder and the rank gff.

    Returns:
        Path of the rank gff that was written.

    Exits with a non-zero status when the rank table cannot be found
    after running the dynamic script.
    '''

    #first call rose
    parentFolder = utils.formatFolder('%stwist1_rose/' % (projectFolder), True)

    #determine what the eventual output will look like
    enhancer_path_1 = '%s%s_ROSE/%s_peaks_AllEnhancers.table.txt' % (
        parentFolder, name1, name1)
    enhancer_path_2 = '%s%s_ROSE/%s_peaks_AllEnhancers.table.txt' % (
        parentFolder, name2, name2)

    if utils.checkOutput(enhancer_path_1, 0.1, 0.1) and utils.checkOutput(
            enhancer_path_2, 0.1, 0.1):
        print('Found ROSE2 output for %s and %s in %s' %
              (name1, name2, parentFolder))
    else:
        #the dataset names were missing from this message, so it used to
        #print the literal '%s' placeholders
        print('Running ROSE2 on %s and %s with -t 0 and -s 0 parameters' %
              (name1, name2))
        bashFileName = '%s%s_rose.sh' % (parentFolder, analysis_name)
        pipeline_dfci.callRose2(dataFile, macsEnrichedFolder, parentFolder,
                                [name1, name2], [], '', 0, 0, bashFileName,
                                maskFile, True)

        #NOTE(review): bashFileName is not executed here (the os.system
        #call was commented out in the original) -- presumably the ROSE2
        #script has to be run separately; confirm before relying on the
        #dynamic step below

    #next run dynamic rose

    dynamicFolder = utils.formatFolder('%sdynamic_rose/' % (projectFolder),
                                       True)
    rose_folder_1 = '%s%s_ROSE/' % (parentFolder, name1)
    rose_folder_2 = '%s%s_ROSE/' % (parentFolder, name2)
    bashFileName = '%s%s_dynamic.sh' % (dynamicFolder, analysis_name)

    dynamic_cmd = 'python %sdynamicEnhancer.py -g %s -d %s -n %s,%s -r %s,%s -o %s%s/ -a' % (
        pipeline_dir, genome, dataFile, name1, name2, rose_folder_1,
        rose_folder_2, dynamicFolder, analysis_name)

    #the script is rewritten on every call, even if it ends up not being run;
    #the context manager closes the file even if a write fails
    with open(bashFileName, 'w') as bashFile:
        bashFile.write('#!/usr/bin/bash\n\n')
        bashFile.write('#dynamic rose on twist datasets for %s\n\n' %
                       (analysis_name))
        bashFile.write(dynamic_cmd + '\n\n')

    rank_path = '%s%s/output/%s_%s_%s_merged_MERGED_ENHANCERS_RANK_TABLE.txt' % (
        dynamicFolder, analysis_name, genome.upper(), name1, name2)

    print(rank_path)
    if not utils.checkOutput(rank_path, 0.1, 0.1):
        #only run if you can't find the terminal output
        print('Running dynamic rose from %s' % (bashFileName))
        os.system('bash %s' % (bashFileName))

    if not utils.checkOutput(rank_path, 1, 30):
        print(
            'Error: operation timed out. Cannot find expected dynamic output at %s'
            % (rank_path))
        #non-zero status so calling scripts can see the failure
        sys.exit(1)

    print('Found dynamic rose output at %s' % (rank_path))

    #skip the header row and rearrange the first four columns of each
    #rank table row into a 9 column gff line
    rank_table = utils.parseTable(rank_path, '\t')
    rank_gff = [[
        line[1], line[0], '', line[2], line[3], '', '.', '', line[0]
    ] for line in rank_table[1:]]

    rank_gff_path = '%s%s_%s_RANK.gff' % (gffFolder, genome.upper(),
                                          analysis_name)
    print('writing rank table as a gff to %s' % (rank_gff_path))
    utils.unParseTable(rank_gff, rank_gff_path, '\t')
    return rank_gff_path