Пример #1
0
def mapMergedGFF(dataFile,nameDict,mergedGFFFile,analysisName,outputFolder,maskFile):

    '''
    Calls ROSE2 on the merged GFF for all datasets.

    The first dataset (after sorting names) is the primary ROSE input; every
    other dataset — plus its background where one is listed — is passed as an
    extra bam to map.  Returns the path of the stitched enhancer region map,
    returning early if a previous run already produced it; exits on failure.
    '''
    dataDict= pipeline_dfci.loadDataTable(dataFile)
    roseParentFolder = "%srose/" % (outputFolder)
    utils.formatFolder(roseParentFolder,True)
    gffName = mergedGFFFile.split('/')[-1].split('.')[0]
    bashFileName = "%srose/%s_roseCall.sh" % (outputFolder,analysisName)
    #namesList is just the first dataset
    #extrmap will have to have all other datasets + their backgrounds




    namesList = nameDict.keys()
    namesList.sort()
    extraMap = []
    for name in namesList[1:]:
        
        # NOTE(review): the truthiness test reads nameDict but the background
        # name itself comes from dataDict — confirm nameDict entries carry a
        # 'background' key, otherwise this line raises KeyError
        if nameDict[name]['background']:
            backgroundName = dataDict[name]['background']
            if dataDict.has_key(backgroundName):
                extraMap+=[name,backgroundName]
            else:
                print "ERROR: UNABLE TO FIND LISTED BACKGROUND DATASET %s FOR %s" % (backgroundName,name)
                sys.exit()
        else:
            extraMap+=[name]

    print extraMap
    
    #first check to see if this has already been done
    mergedRegionMap = "%srose/%s_ROSE/%s_0KB_STITCHED_ENHANCER_REGION_MAP.txt" % (outputFolder,namesList[0],gffName)
    print("LOOKING FOR REGION MAP AT %s" % (mergedRegionMap))

    if utils.checkOutput(mergedRegionMap,1,1):
        print("FOUND PREVIOUS REGION MAP")

        return mergedRegionMap


    
    bashFileName = pipeline_dfci.callRose2(dataFile,'',roseParentFolder,[namesList[0]],extraMap,mergedGFFFile,0,0,bashFileName,mask=maskFile) 
    
    bashCommand = "bash %s" % (bashFileName)
    os.system(bashCommand)
    print "Running enhancer mapping command:\n%s" % (bashCommand)

    # poll for the region map after the ROSE run (interval/timeout semantics
    # per utils.checkOutput — presumably minutes; confirm against utils)
    if utils.checkOutput(mergedRegionMap,5,60):
        return mergedRegionMap
    else:
        print "UNABLE TO CALL ROSE ENHANCER MAPPING ON CONSENSUS ENHANCER FILE %s.\nEXITING NOW" % (mergedGFFFile)
        sys.exit()
Пример #2
0
def mapBams(bamFileList,splitGFFPath,analysisName,mappedFolder):

    '''
    Maps each bam in bamFileList onto splitGFFPath with bamliquidator and
    assembles one combined signal table.

    Skips mapping for any bam whose matrix.txt output already exists, exits
    if mapping fails, and returns the signal table as a list of rows:
    header, then one row per region with one signal column per bam.
    '''
    print("MAPPING TO THE FOLLOWING BAMS:")

    for bamFile in bamFileList:
        print(bamFile)
        bamFileName = bamFile.split('/')[-1]

        # MAPPING TO THE STITCHED GFF
        mappedOut1Folder = '%s%s_%s_MAPPED' % (mappedFolder, analysisName, bamFileName)
        mappedOut1File = '%s%s_%s_MAPPED/matrix.txt' % (mappedFolder, analysisName, bamFileName)
        if utils.checkOutput(mappedOut1File, 0.2, 0.2):
            print("FOUND %s MAPPING DATA FOR BAM: %s" % (splitGFFPath, mappedOut1File))
        else:
            cmd1 = bamliquidator_path + " --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % (splitGFFPath, mappedOut1Folder, bamFile)
            print(cmd1)

            os.system(cmd1)
            if utils.checkOutput(mappedOut1File,0.2,5):
                print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (splitGFFPath, bamFileName))
            else:
                print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (splitGFFPath, bamFileName))
                sys.exit()

    print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS')

    #now we make a signal table
    #set up the table using the first bam
    if len(bamFileList) > 1:

        #set up the first pass at the table
        signalTable = [['REGION_ID','locusLine'] + [name.split('/')[-1] for name in bamFileList]]
        bamFileName = bamFileList[0].split('/')[-1]
        mappedTable = utils.parseTable( '%s%s_%s_MAPPED/matrix.txt' % (mappedFolder, analysisName, bamFileName),'\t')
        for i in range(1,len(mappedTable)):
            signalTable.append(mappedTable[i])

        #append the signal column of every remaining bam to each region row
        for bamFile in bamFileList[1:]:
            bamFileName = bamFile.split('/')[-1]

            mappedTable = utils.parseTable( '%s%s_%s_MAPPED/matrix.txt' % (mappedFolder, analysisName, bamFileName),'\t')

            #bugfix: iterate over the table's ROWS — the original used
            #len(mappedTable[i]) (a row's column count, with a stale i),
            #which dropped most regions
            for i in range(1,len(mappedTable)):
                mapSignal = mappedTable[i][2]
                signalTable[i].append(mapSignal)
    else:
        #single bam: its matrix already is the signal table
        bamFileName = bamFileList[0].split('/')[-1]
        signalTable = utils.parseTable( '%s%s_%s_MAPPED/matrix.txt' % (mappedFolder, analysisName, bamFileName),'\t')

    return(signalTable)
def run_bash(bash_path, output_path, maxWait=30):
    '''
    Execute a bash script unless its expected output already exists, then
    wait for the output file to appear.
    '''
    # prior output means there is nothing to do
    if utils.checkOutput(output_path, 0, 0):
        print('found prior output for %s at %s' % (bash_path, output_path))
        return

    print('running bash script %s' % (bash_path))
    os.system('bash %s' % (bash_path))
    if utils.checkOutput(output_path, 1, 30):
        print('run completed, output detected for %s at %s' %
              (bash_path, output_path))
Пример #4
0
def makePeakGFFs(peak_path_list):

    '''
    Makes stitched gffs for all MYC-bound TSS and distal regions across all
    datasets.

    Each peak table row is classified as TSS or distal by column 5 (0 means
    distal); loci within 50bp are stitched.  Returns the pair
    (tss_gff_path, distal_gff_path), short-circuiting if both files already
    exist.
    '''

    #setting the output
    tss_gff_path = '%sHG19_MYC_TSS_REGIONS_-0_+0.gff' % (gffFolder)
    distal_gff_path = '%sHG19_MYC_DISTAL_REGIONS_-0_+0.gff' % (gffFolder)

    #check to see if already done
    if utils.checkOutput(tss_gff_path,0.1,0.1) and utils.checkOutput(distal_gff_path,0.1,0.1):
        print('OUTPUT FOUND AT %s and %s' % (tss_gff_path,distal_gff_path))
        return tss_gff_path,distal_gff_path

    #empty loci lists to hold everything
    tss_loci = []
    distal_loci = []

    for peak_path in peak_path_list:
        print('processing %s' % (peak_path))

        peak_table=  utils.parseTable(peak_path,'\t')

        for line in peak_table[1:]:
            peak_locus = utils.Locus(line[1],line[2],line[3],'.')
            #column 5 flags TSS overlap; 0 means the peak is distal
            if int(line[5]) == 0:
                distal_loci.append(peak_locus)
            else:
                tss_loci.append(peak_locus)

    #now combine the loci
    print('stitching loci')
    distal_collection = utils.LocusCollection(distal_loci,50)
    tss_collection = utils.LocusCollection(tss_loci,50)

    stitched_distal_collection = distal_collection.stitchCollection()
    stitched_tss_collection = tss_collection.stitchCollection()

    #now make the gffs
    #bugfix: write the STITCHED collections — the originals were previously
    #passed to locusCollectionToGFF, silently discarding the stitching step
    distal_gff= utils.locusCollectionToGFF(stitched_distal_collection)
    tss_gff= utils.locusCollectionToGFF(stitched_tss_collection)

    #now write to disk
    utils.unParseTable(distal_gff,distal_gff_path,'\t')
    utils.unParseTable(tss_gff,tss_gff_path,'\t')

    return tss_gff_path,distal_gff_path
Пример #5
0
def callR_GSEA(class1TablePath, class2TablePath, outputFolder, analysisName,
               top):
    '''
    Writes a bash wrapper that runs the GSEA R script, executes it, and
    waits for the NES output file to appear.  Exits if the output never
    shows up.
    '''

    rBashFilePath = '%s%s_R_gsea.sh' % (outputFolder, analysisName)
    gseaCmd = 'Rscript %senhancerPromoter_gsea.R %s %s %s %s %s' % (
        pipeline_dir, class1TablePath, class2TablePath, outputFolder,
        analysisName, top)

    with open(rBashFilePath, 'w') as script:
        script.write('#!/usr/bin/bash\n\n')
        script.write(gseaCmd)

    print('writing R plotting command to disk and calling %s' %
          (rBashFilePath))
    os.system('bash %s' % (rBashFilePath))

    #now check for the nes output
    nesPath = '%s%s_top_%s_nes.txt' % (outputFolder, analysisName, top)

    if not utils.checkOutput(nesPath, 0.5, 5):
        print('ERROR: UNABLE TO SUCCESFULLY DETECT R SCRIPT OUTPUT AT %s' %
              (nesPath))
        sys.exit()
Пример #6
0
def CarRegTwoOptions(_actor, _failed=False):
    '''
    Register a car account and verify both the contract state and the
    expected console message (fresh vs. repeated registration).
    '''
    writeDataBase({'key': privKeys[_actor['account']]}, carFile)

    _ret = runCmd([CarExe, '--reg'], _miner=True)

    if not _ret[0]:
        print("The command executed too long or there is nothing on output")
        return False

    try:
        registered = mgmtContract.functions.cars(_actor["account"]).call()
    except ValueError as error:
        printError("Cannot call cars()", [error])
        return False

    if not registered:
        printError("Got 'False' from contract", _ret[1])
        return False

    expected = ['Already registered'] if _failed else ['Registered successfully']
    return checkOutput(_ret[1], expected)
Пример #7
0
def run_macs(dataFile):
    '''
    Runs MACS 1.4.2 peak calling on every non-control dataset in the data
    table, waits for the summit files, then formats the MACS output.
    '''
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    #controls (whole-cell extract / input) are excluded from peak calling
    namesList = [name for name in dataDict.keys() if name.upper().count('WCE') ==0 and name.upper().count('INPUT') == 0]
    namesList.sort()
    print(namesList)
    pipeline_dfci.callMacs(dataFile,macsFolder,namesList,overwrite=False,pvalue='1e-9')
    os.chdir(projectFolder) # the silly call macs script has to change into the output dir
    #so this takes us back to the project folder

    #to check for completeness, we will try to find all of the peak files
    peak_calling_done = False
    while not peak_calling_done:
        dataDict = pipeline_dfci.loadDataTable(dataFile)
        namesList = [name for name in dataDict.keys() if name.upper().count('WCE') ==0 and name.upper().count('INPUT') == 0]
        for name in namesList:
            peak_path = '%s%s/%s_summits.bed' % (macsFolder,name,name)
            print('searching for %s' % (peak_path))
            if utils.checkOutput(peak_path,1,180):
                # NOTE(review): the done flag is set on the FIRST dataset found,
                # but the for loop still blocks on (or exits over) every
                # remaining dataset, so the while loop effectively runs once —
                # confirm this is intended
                peak_calling_done =True
                print('found %s' % (peak_path))
                continue
            else:
                print('Error: peak calling timed out')
                sys.exit()
    
    #now format the macs output
    print('formatting macs output')
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    namesList = [name for name in dataDict.keys() if name.upper().count('WCE') ==0 and name.upper().count('INPUT') == 0]
    pipeline_dfci.formatMacsOutput(dataFile,macsFolder,macsEnrichedFolder,wiggleFolder,wigLink ='',useBackground=True)
    print('Finished running Macs 1.4.2')
Пример #8
0
def ScenterRegTwoOptions(_actor, _failed=False):
    '''
    Register a service center and verify both the contract state and the
    expected console message (fresh vs. repeated registration).
    '''
    setAcc(_actor, scenterFile)

    _ret = runCmd([ScenterExe, '--reg'], _miner=True)

    if not _ret[0]:
        print("The command executed too long or there is nothing on output")
        return False

    try:
        registered = mgmtContract.functions.serviceCenters(_actor["account"]).call()
    except ValueError as error:
        printError("Cannot call serviceCenters()", [error])
        return False

    if not registered:
        printError("Got 'False' from contract", _ret[1])
        return False

    expected = ['Already registered'] if _failed else ['Registered successfully']
    return checkOutput(_ret[1], expected)
Пример #9
0
def callMergeSupers(dataFile,superFile1,superFile2,name1,name2,mergeName,genome,parentFolder):

    '''
    this is the main run function for the script
    all of the work should occur here, but no functions should be defined here

    Merges the two super-enhancer files into a single GFF, runs ROSE on the
    merged regions, and returns the ENHANCER_TO_GENE output path.  Exits if
    the ROSE call never produces output.
    '''
    mergedGFFFile = '%s%s_%s_MERGED_REGIONS_-0_+0.gff' % (parentFolder,string.upper(genome),mergeName)    

    #check to make sure this hasn't been done yet
    roseOutput = "%s%s_ROSE/%s_%s_MERGED_REGIONS_-0_+0_SuperEnhancers_ENHANCER_TO_GENE.txt" % (parentFolder,name1,string.upper(genome),mergeName)

    # EAFP: a readable table means a prior run already produced the output
    try:
        foo = utils.parseTable(roseOutput,'\t')
        print "ROSE OUTPUT ALREADY FOUND HERE %s" % (roseOutput)
        return roseOutput
    except IOError:
        
        print "MERGING ENHANCER REGIONS FROM %s and %s" % (superFile1,superFile2)
        mergedGFF = mergeCollections(superFile1,superFile2,name1,name2,mergedGFFFile)

        #call rose on the merged shit    


        roseBashFile = callRoseMerged(dataFile,mergedGFF,name1,name2,parentFolder)
        print('i can has rose bash file %s' % (roseBashFile))

        #run the bash command
        os.system('bash %s' % (roseBashFile))

        #check for and return output
        if utils.checkOutput(roseOutput,1,30):
            return roseOutput
        else:
            print "ERROR: ROSE CALL ON MERGED REGIONS FAILED"
            sys.exit()
Пример #10
0
def callRWaterfall(geneTablePath,outputFolder,analysisName,top):

    '''
    Writes a bash wrapper that runs the waterfall R script, executes it,
    and waits for the .cls output file.  Exits if the output never appears.
    '''

    rBashFilePath = '%s%s_R_plotting.sh' % (outputFolder,analysisName)
    plotCmd = 'R --no-save %s %s %s %s < %s/enhancerPromoter_waterfall.R' % (geneTablePath,outputFolder,analysisName,top,whereAmI)

    with open(rBashFilePath,'w') as script:
        script.write('#!/usr/bin/bash\n\n')
        script.write(plotCmd)

    print('writing R plotting command to disk and calling %s' %(rBashFilePath))
    os.system('bash %s' % (rBashFilePath))

    #now check for the .cls output
    clsPath = '%s%s_top_%s.cls' % (outputFolder,analysisName,top)

    if not utils.checkOutput(clsPath,0.5,5):
        print('ERROR: UNABLE TO SUCCESFULLY DETECT R SCRIPT OUTPUT AT %s' % (clsPath))
        sys.exit()
Пример #11
0
def VendorRegisterNotUniqueAddressFail(_actor, _name, _value):
    '''
    Attempt to register a vendor from an already-used address and verify
    the command fails without moving any funds.
    '''
    walletAddress = mgmtContract.functions.walletContract().call()
    balance_before = w3.eth.getBalance(walletAddress)

    _ret = runCmdWithActor([VendorExe, '--reg', _name, str(_value)], _actor)

    if not _ret[0]:
        print("The command executed too long")
        return False

    balance_after = w3.eth.getBalance(walletAddress)
    if balance_after - balance_before != 0:
        printError("Wallet balance was changed", _ret[1])
        return False

    # the deposit query is kept as a contract-availability sanity check
    try:
        deposit = mgmtContract.functions.vendorDeposit(
            _actor['account']).call()
    except ValueError as error:
        printError("Cannot call vendorDeposit()", [error])
        return False

    return checkOutput(_ret[1], ['Failed. The vendor address already used.'])
Пример #12
0
def callMergeSupers(dataFile,superFile1,superFile2,name1,name2,mergeName,genome,parentFolder,namesList1,namesList2,useBackground,inputGFF=''):

    '''
    this is the main run function for the script
    all of the work should occur here, but no functions should be defined here

    Merges the two super-enhancer files into a single GFF, runs ROSE on the
    merged regions, and returns the stitched enhancer region map path.
    Falls back to scanning the ROSE output folder for an
    _ENHANCER_REGION_MAP.txt file if the expected name is not found.
    '''
    mergedGFFFile = '%s%s_%s_MERGED_REGIONS_-0_+0.gff' % (parentFolder,string.upper(genome),mergeName)    

    #check to make sure this hasn't been done yet


    
    roseOutput = "%s%s_ROSE/%s_%s_MERGED_REGIONS_-0_+0_0KB_STITCHED_ENHANCER_REGION_MAP.txt" % (parentFolder,namesList1[0],string.upper(genome),mergeName)

    if utils.checkOutput(roseOutput,.1,.1):
        
        print "ROSE OUTPUT ALREADY FOUND HERE %s" % (roseOutput)
        return roseOutput
    else:
        print("NO MERGED ROSE OUTPUT FOUND")
        print "MERGING ENHANCER REGIONS FROM %s and %s" % (superFile1,superFile2)

        mergedGFF = mergeCollections(superFile1,superFile2,name1,name2,mergedGFFFile,inputGFF)
        print('just merged gff')
        print(mergedGFF)
        #call rose on the merged regions
        roseBashFile = callRoseMerged(dataFile,mergedGFF,name1,name2,parentFolder,namesList1,namesList2,useBackground)
        print('merged rose bash file %s' % (roseBashFile))

        #run the bash command
        os.system('bash %s' % (roseBashFile))

        #check for and return output
        if utils.checkOutput(roseOutput,1,10):
            return roseOutput
        else:
            #try finding it w/ a different name
            #this will bug out if nothing is there
            roseFolder = "%s%s_ROSE/" % (parentFolder,namesList1[0])
            roseFileList = [x for x in os.listdir(roseFolder) if x[0] != '.'] #no hidden files
            if len(roseFileList) == 0:
                print "No files found in %s" % (roseFolder)
                sys.exit()

            roseOutput= getFile('_ENHANCER_REGION_MAP.txt',roseFileList,roseFolder)
            return roseOutput
Пример #13
0
def detectGSEAOutput(analysisName, outputFolder, top):
    '''
    Polls for the timestamped folder GSEA creates when it finishes, then
    locates and returns the (promoter, distal) .xls report paths inside it.

    Exits if multiple candidate folders exist or if no folder appears
    within 30 polls (10s sleep between empty polls).
    '''

    #first figure out the friggin output folder
    gseaParentFolder = '%sgsea_top_%s_c2/' % (outputFolder, top)

    candidateFolder = ''
    for i in range(30):
        folderList = os.listdir(gseaParentFolder)

        candidateFolderList = [
            folder for folder in folderList
            if folder.count('%s_top_%s.Gsea' % (analysisName, top)) == 1
        ]
        if len(candidateFolderList) > 1:
            print(
                'ERROR: MULTIPLE GSEA OUTPUT FOLDERS DETECTED FOR %s WITH TOP %s GENES'
                % (analysisName, string.upper(str(top))))
            sys.exit()
        elif len(candidateFolderList) == 0:
            time.sleep(10)
        elif len(candidateFolderList) == 1:
            candidateFolder = '%sgsea_top_%s_c2/%s/' % (outputFolder, top,
                                                        candidateFolderList[0])
            #bugfix: stop polling once found (previously looped all 30 times)
            break

    #bugfix: previously an absent folder left candidateFolder undefined and
    #raised NameError below instead of a clear error
    if not candidateFolder:
        print('ERROR: NO GSEA OUTPUT FOLDER DETECTED FOR %s WITH TOP %s GENES'
              % (analysisName, string.upper(str(top))))
        sys.exit()

    print('USING %s AS CANDIDATE GSEA FOLDER' % (candidateFolder))
    #GSEA suffixes the folder name with a timestamp; strip trailing '/'
    timeStamp = candidateFolder.split('.')[-1][:-1]
    print(timeStamp)
    #now that you have the candidate folder find the friggen xls files
    #for promoter
    promoterTablePath = '%sgsea_report_for_PROMOTER_%s.xls' % (candidateFolder,
                                                               timeStamp)
    distalTablePath = '%sgsea_report_for_DISTAL_%s.xls' % (candidateFolder,
                                                           timeStamp)
    print(promoterTablePath)
    print(distalTablePath)
    #now check em
    if utils.checkOutput(promoterTablePath, 0.5, 30):
        print('FOUND PROMOTER OUTPUT AT %s' % (promoterTablePath))
        if utils.checkOutput(distalTablePath, 0.5, 30):
            print('FOUND DISTAL OUTPUT AT %s' % (distalTablePath))
            return promoterTablePath, distalTablePath
    else:
        print('ERROR: UNABLE TO FIND GSEA OUTPUT')
Пример #14
0
def testVendorGetRegFee():
    '''
    Query the vendor registration fee and verify the reported value
    matches the expected module-level fee.
    '''
    _ret = runCmd([VendorExe, '--regfee'])

    if not _ret[0]:
        print("The command executed too long or there is nothing on output")
        return False

    expected = [f"Vendor registration fee: {currentRegFee}"]
    return checkOutput(_ret[1], expected)
Пример #15
0
def mapMergedGFF(dataFile,nameDict,mergedGFFFile,analysisName,outputFolder):

    '''
    Calls ROSE on the merged GFF for all datasets.

    The first dataset in nameDict (dict order — NOTE(review): unlike the
    masked variant of this function, namesList is NOT sorted here; confirm
    intended) is the primary ROSE input; every other dataset plus its
    background is passed as an extra bam to map.  Returns the stitched
    enhancer region map path or exits on failure.
    '''
    dataDict= pipeline_dfci.loadDataTable(dataFile)
    roseParentFolder = "%srose/" % (outputFolder)
    gffName = mergedGFFFile.split('/')[-1].split('.')[0]
    bashFileName = "%srose/%s_roseCall.sh" % (outputFolder,analysisName)
    #namesList is just the first dataset
    #extrmap will have to have all other datasets + their backgrounds




    namesList = nameDict.keys()
    extraMap = []
    for name in namesList[1:]:
        
        # assumes every dataset row lists a background — no missing-key
        # handling here, unlike the masked variant
        backgroundName = dataDict[name]['background']
        extraMap+=[name,backgroundName]


    #first check to see if this has already been done
    mergedRegionMap = "%srose/%s_ROSE/%s_0KB_STITCHED_ENHANCER_REGION_MAP.txt" % (outputFolder,namesList[0],gffName)
    if utils.checkOutput(mergedRegionMap,1,1):
        return mergedRegionMap



    bashFileName = pipeline_dfci.callRose(dataFile,'',roseParentFolder,[namesList[0]],extraMap,mergedGFFFile,0,0,bashFileName) 
    
    bashCommand = "bash %s" % (bashFileName)
    os.system(bashCommand)
    print "Running enhancer mapping command:\n%s" % (bashCommand)

    # poll for the region map after the ROSE run
    if utils.checkOutput(mergedRegionMap,5,60):
        return mergedRegionMap
    else:
        print "UNABLE TO CALL ROSE ENHANCER MAPPING ON CONSENSUS ENHANCER FILE %s.\nEXITING NOW" % (mergedGFFFile)
        sys.exit()
Пример #16
0
def VendorGetBatteryFee(_actor):
    '''
    Query the per-battery production fee as _actor and verify the
    reported value.
    '''
    _ret = runCmdWithActor([VendorExe, '--batfee'], _actor)

    if not _ret[0]:
        print("The command executed too long or there is nothing on output")
        return False

    expected = [f"Production fee per one battery: {currentBatFee}"]
    return checkOutput(_ret[1], expected)
Пример #17
0
def VendorDepositWrongAccount(_actor):
    '''
    Query the deposit from an unregistered account and verify the
    expected rejection message.
    '''
    _ret = runCmdWithActor([VendorExe, '--deposit'], _actor)

    if not _ret[0]:
        print("The command executed too long or there is nothing on output")
        return False

    return checkOutput(_ret[1], ['Vendor account is not registered.'])
Пример #18
0
def define_enhancer_landscape(mouse_dataFile, analysisName, namesList=None):
    '''
    define enhancers using h3k27ac in the 3 datasets that look good:
    CG, SCG, THMYCN_139076 using regular ROSE2

    Builds and (when no prior output exists) runs a ROSE2_META command over
    the H3K27AC datasets.  Returns (bashFileName, region_map_path,
    namesList).
    '''

    #For SCG baseline
    #no TSS exclusion and no stitching

    dataDict = pipeline_dfci.loadDataTable(mouse_dataFile)

    #fix: avoid a mutable default argument; an empty/None namesList means
    #"auto-select all H3K27AC datasets" (backward-compatible with callers
    #passing an explicit list or nothing)
    if not namesList:
        namesList = [
            name for name in dataDict.keys()
            if name.upper().count('H3K27AC') == 1
        ]

    bamFileList = [dataDict[name]['bam'] for name in namesList]
    bamString = string.join(bamFileList, ',')

    controlBams = [dataDict[name]['background'] for name in namesList]
    controlFileList = [dataDict[name]['bam'] for name in controlBams]
    controlBamString = string.join(controlFileList, ',')

    bedFileList = [
        macsEnrichedFolder + dataDict[name]['enrichedMacs']
        for name in namesList
    ]
    bedString = string.join(bedFileList, ',')

    outputFolder = '%s%s/' % (metaRoseFolder, analysisName)
    bashFileName = '%s%s_meta_rose.sh' % (metaRoseFolder, analysisName)

    metaRoseCmd = 'python %sROSE2_META.py -g mm9 -i %s -r %s -c %s -o %s -n %s' % (
        pipeline_dir, bedString, bamString, controlBamString, outputFolder,
        analysisName)

    #use a context manager so the script is flushed/closed before running
    with open(bashFileName, 'w') as bashFile:
        bashFile.write('#!/usr/bin/bash\n\n')
        bashFile.write('cd %s\n' % (pipeline_dir))
        bashFile.write(metaRoseCmd + '\n')

    region_map_path = '%s%s/%s_AllEnhancers.table.txt' % (
        metaRoseFolder, analysisName, analysisName)

    #runs only if no output detected
    if not utils.checkOutput(region_map_path, 0, 0):
        print(bashFileName)
        os.system('bash %s' % (bashFileName))
    return bashFileName, region_map_path, namesList
Пример #19
0
def launchEnhancerMapping(dataFile,nameDict,outputFolder,roseFolder,maskFile=''):

    '''
    launches enhancer mapping if needed from enriched region files

    For every dataset lacking an enhancerFile entry, launches ROSE in the
    background, then blocks until each launched run produces its
    AllEnhancers table.  Returns nameDict with enhancerFile paths filled in;
    exits if any run fails to produce output.
    '''

    namesList = nameDict.keys()

    #check to see if everything is good, if so return True and call it a day
    if len([x for x in namesList if len(nameDict[x]['enhancerFile']) > 0]) == len(namesList):
        print "ENHANCER FILE OUTPUT FOUND FOR ALL DATASETS"
        return nameDict

    #if not, have to call rose
    
    roseOutputFolder = utils.formatFolder(roseFolder,True)
    
    queueList =[]
    for name in namesList:

        #check to see if we need to call rose
        if nameDict[name]['enhancerFile'] == '':
     
            #get the enriched file
            enrichedFile = nameDict[name]['enrichedFile']
            #call rose
            print "CALLING ROSE FOR %s" % (name)
            bashFileName = pipeline_dfci.callRose(dataFile,'',roseOutputFolder,[name],[],enrichedFile,mask=maskFile)
            print bashFileName
            # '&' launches the run in the background so all datasets run
            # concurrently; completion is polled below
            os.system('bash %s &' % (bashFileName))
            #add name to queue list
            queueList.append(name)

    #now check for completion of datasets
            
    for name in queueList:

        #check for the AllEnhancers table
        enhancerFile = "%s%s_ROSE/%s_peaks_AllEnhancers.table.txt" % (roseOutputFolder,name,name)
        

        print "CHECKING FOR %s ROSE OUTPUT IN %s" % (name,enhancerFile)
        if utils.checkOutput(enhancerFile,5,60):
            
            print "FOUND ENHANCER OUTPUT FOR %s" % (name)
            nameDict[name]['enhancerFile'] = enhancerFile
        else:
            print "UNABLE TO FIND ENHANCER OUTPUT FOR %s. QUITTING NOW" % (name)
            sys.exit()

    return nameDict
Пример #20
0
def testSetupCreateManagementContract(_actor, _fee):
    '''
    Deploy the management contract with the given battery fee and verify
    the reported contract addresses.

    Updates the module-level mgmtContract handle and the current fee
    globals; returns True only when the console output matches the
    deployed addresses.
    '''
    global mgmtContract, currentBatFee, currentRegFee
    currentBatFee = _fee
    currentRegFee = 1000 * currentBatFee

    _cmd = [SetupExe, '--setup', str(currentBatFee)]
    _ret = runCmdWithActor(_cmd, _actor)

    retval = False

    if not _ret[0]:
        print("The command executed too long")
        return retval

    db = openDataBase()
    if not ('mgmtContract' in db):
        printError('Database was not found or it is in incorrect format',
                   _ret[1])
    else:
        mgmtContractAddress = db['mgmtContract']
        mgmtContract = ManagementContract(w3, mgmtContractAddress)

        try:
            batContractAddress = mgmtContract.functions.batteryManagement(
            ).call()
        except ValueError as error:
            printError("Cannot call batteryManagement()", [error])
            return retval

        batContract = BatteryContract(w3, batContractAddress)

        try:
            erc20ContractAddress = batContract.functions.erc20().call()
        except ValueError as error:
            printError("Cannot call erc20()", [error])
            return retval

        try:
            walletContractAddress = mgmtContract.functions.walletContract(
            ).call()
        except ValueError as error:
            #bugfix: this message previously said "Cannot call erc20()"
            #(copy-paste from the block above)
            printError("Cannot call walletContract()", [error])
            return retval

        forma = 'Management contract: {}\nWallet contract: {}\nCurrency contract: {}'
        forma = forma.format(mgmtContractAddress, walletContractAddress,
                             erc20ContractAddress).split('\n')

        retval = checkOutput(_ret[1], forma)
    return retval
Пример #21
0
def CarGetAccount(_actor):
    '''
    Query the car account address and verify it matches the actor's
    account.
    '''
    writeDataBase({'key': privKeys[_actor['account']]}, carFile)

    _ret = runCmd([CarExe, "--account"])

    if not _ret[0]:
        print("The command executed too long or there is nothing on output")
        return False

    return checkOutput(_ret[1], [_actor['account']])
Пример #22
0
def callMergeSupers(dataFile, superFile1, superFile2, name1, name2, mergeName,
                    genome, parentFolder):
    '''
    this is the main run function for the script
    all of the work should occur here, but no functions should be defined here
    '''
    mergedGFFFile = '%s%s_%s_MERGED_REGIONS_-0_+0.gff' % (
        parentFolder, string.upper(genome), mergeName)

    #check to make sure this hasn't been done yet
    roseOutput = "%s%s_ROSE/%s_%s_MERGED_REGIONS_-0_+0_SuperEnhancers_ENHANCER_TO_GENE.txt" % (
        parentFolder, name1, string.upper(genome), mergeName)

    try:
        foo = utils.parseTable(roseOutput, '\t')
        print "ROSE OUTPUT ALREADY FOUND HERE %s" % (roseOutput)
        return roseOutput
    except IOError:

        print "MERGING ENHANCER REGIONS FROM %s and %s" % (superFile1,
                                                           superFile2)
        mergedGFF = mergeCollections(superFile1, superFile2, name1, name2,
                                     mergedGFFFile)

        #call rose on the merged shit

        roseBashFile = callRoseMerged(dataFile, mergedGFF, name1, name2,
                                      parentFolder)
        print('i can has rose bash file %s' % (roseBashFile))

        #run the bash command
        os.system('bash %s' % (roseBashFile))

        #check for and return output
        if utils.checkOutput(roseOutput, 1, 10):
            return roseOutput
        else:
            #try finding it w/ a different name
            #this will bug out if nothing is there
            roseFolder = "%s%s_ROSE/" % (parentFolder, name1)
            roseFileList = [x for x in os.listdir(roseFolder)
                            if x[0] != '.']  #no hidden files
            if len(roseFileList) == 0:
                print "No files found in %s" % (roseFolder)
                sys.exit()

            enhancerToGeneFile = getFile(
                '_SuperEnhancers_ENHANCER_TO_GENE.txt', roseFileList,
                roseFolder)
Пример #23
0
def CarNewAccount():
    '''
    Create a fresh car account and verify the reported public key
    matches the stored private key.
    '''
    _ret = runCmd([CarExe, "--new"])

    if not _ret[0]:
        print("The command executed too long or there is nothing on output")
        return False

    data = openDataBase(carFile)
    if 'key' not in data:
        printError("Incorrect database", _ret[1])
        return False

    return checkOutput(_ret[1], [privtopub(data['key'])])
Пример #24
0
def callMergeSupers(dataFile, superFile1, superFile2, name1, name2, mergeName, genome, parentFolder):

    """
    this is the main run function for the script
    all of the work should occur here, but no functions should be defined here
    """
    mergedGFFFile = "%s%s_%s_MERGED_REGIONS_-0_+0.gff" % (parentFolder, string.upper(genome), mergeName)

    # check to make sure this hasn't been done yet
    roseOutput = "%s%s_ROSE/%s_%s_MERGED_REGIONS_-0_+0_SuperEnhancers_ENHANCER_TO_GENE.txt" % (
        parentFolder,
        name1,
        string.upper(genome),
        mergeName,
    )

    try:
        foo = utils.parseTable(roseOutput, "\t")
        print "ROSE OUTPUT ALREADY FOUND HERE %s" % (roseOutput)
        return roseOutput
    except IOError:

        print "MERGING ENHANCER REGIONS FROM %s and %s" % (superFile1, superFile2)
        mergedGFF = mergeCollections(superFile1, superFile2, name1, name2, mergedGFFFile)

        # call rose on the merged shit

        roseBashFile = callRoseMerged(dataFile, mergedGFF, name1, name2, parentFolder)
        print ("i can has rose bash file %s" % (roseBashFile))

        # run the bash command
        os.system("bash %s" % (roseBashFile))

        # check for and return output
        if utils.checkOutput(roseOutput, 1, 10):
            return roseOutput
        else:
            # try finding it w/ a different name
            # this will bug out if nothing is there
            roseFolder = "%s%s_ROSE/" % (parentFolder, name1)
            roseFileList = [x for x in os.listdir(roseFolder) if x[0] != "."]  # no hidden files
            if len(roseFileList) == 0:
                print "No files found in %s" % (roseFolder)
                sys.exit()

            enhancerToGeneFile = getFile("_SuperEnhancers_ENHANCER_TO_GENE.txt", roseFileList, roseFolder)
Пример #25
0
def VendorDeposit(_actor):
    '''
    Query the vendor deposit as _actor and verify the reported amount
    matches the contract state.
    '''
    _ret = runCmdWithActor([VendorExe, '--deposit'], _actor)

    if not _ret[0]:
        print("The command executed too long or there is nothing on output")
        return False

    try:
        cur_deposit = mgmtContract.functions.vendorDeposit(
            _actor['account']).call()
    except ValueError as error:
        printError("Cannot call vendorDeposit()", [error])
        return False

    expected = ['Deposit: {}'.format(w3.fromWei(cur_deposit, 'ether'))]
    return checkOutput(_ret[1], expected)
Пример #26
0
def VendorGetBatteryFeeAfterChange(_setfeeactor, _actor, _newfee):
    '''
    Change the battery fee as _setfeeactor, then query it as _actor and
    verify the reported value.

    NOTE(review): the expected string interpolates the module-level
    currentBatFee rather than _newfee — confirm the global is updated
    elsewhere before this check.
    '''
    _ret = runCmdWithActor([SetupExe, '--setfee', str(_newfee)], _setfeeactor)

    if not _ret[0]:
        print("The command executed too long or there is nothing on output")
        return False

    _ret = runCmdWithActor([VendorExe, '--batfee'], _actor, True)

    if not _ret[0]:
        print("The command executed too long or there is nothing on output")
        return False

    return checkOutput(_ret[1],
                       [f"Production fee per one battery: {currentBatFee}"])
Пример #27
0
def VendorRegisterNewVendorSuccess(_actor, _name, _value):
    '''
    Register a new vendor and verify success end to end: the wallet
    contract balance grows by the deposited value, the on-chain deposit
    matches, and the CLI prints the assigned vendor ID.
    Returns True on full success, False otherwise.
    '''
    walletContractAddress = mgmtContract.functions.walletContract().call()
    prev_balance = w3.eth.getBalance(walletContractAddress)

    cmd = [VendorExe, '--reg', _name, str(_value)]
    _ret = runCmdWithActor(cmd, _actor)

    retval = False

    if not _ret[0]:
        print("The command executed too long")
        return retval

    cur_balance = w3.eth.getBalance(walletContractAddress)
    wei_value = w3.toWei(_value, 'ether')

    # the deposit should have moved into the wallet contract
    if not (cur_balance - prev_balance == wei_value):
        printError("Wallet balance was not changed", _ret[1])
        return retval

    try:
        deposit = mgmtContract.functions.vendorDeposit(
            _actor['account']).call()
    except ValueError as error:
        printError("Cannot call vendorDeposit()", [error])
        return retval

    if deposit != wei_value:
        printError("Deposit differs", _ret[1])
        return retval

    try:
        vendId = mgmtContract.functions.vendorId(_actor['account']).call()
    except ValueError as error:
        # fixed copy-paste bug: this branch calls vendorId(), so report that
        printError("Cannot call vendorId()", [error])
        return retval

    # CLI prints the vendor ID as hex without the leading '0x'
    forma = 'Success.\nVendor ID: {}'.format(str(w3.toHex(vendId))[2:])
    forma = forma.split('\n')
    retval = checkOutput(_ret[1], forma)

    return retval
Пример #28
0
def callRScript(genome, outputFolder, analysisName, signalTableFile):
    '''
    calls the R script to do clustering and heatmap
    '''

    clusterTable = "%s%s_%s_clusterTable.txt" % (outputFolder, genome,
                                                 analysisName)

    rCmd = 'R --no-save %s %s %s %s < /ark/home/cl512/pipeline/clusterEnhancer.R' % (
        genome, outputFolder, analysisName, signalTableFile)
    print("Calling command %s" % rCmd)

    os.system(rCmd)

    print "Checking for cluster table output at %s" % (clusterTable)
    if utils.checkOutput(clusterTable, 1, 30):

        return clusterTable
    else:
        print "ERROR: CLUSTERING TABLE FAILED TO GENERATE"
        sys.exit()
Пример #29
0
def callRScript(genome,outputFolder,analysisName,signalTableFile):

    '''
    calls the R script to do clustering and heatmap
    '''
            
            
    clusterTable = "%s%s_%s_clusterTable.txt" % (outputFolder,genome,analysisName)

    rCmd = 'R --no-save %s %s %s %s < /ark/home/cl512/pipeline/clusterEnhancer.R' % (genome,outputFolder,analysisName,signalTableFile)
    print("Calling command %s" % rCmd)

    os.system(rCmd)

    print "Checking for cluster table output at %s" % (clusterTable)
    if utils.checkOutput(clusterTable,1,30):

        return clusterTable
    else:
        print "ERROR: CLUSTERING TABLE FAILED TO GENERATE"
        sys.exit()
Пример #30
0
def VendorOwnerOneBattery(_actor, _failed=False):
    '''
    Attempt to transfer ownership of the tracked battery to _actor and
    verify both the CLI output and the resulting on-chain state.
    With _failed=True the transfer is expected to be rejected.
    Returns True when output and state agree with the expectation.
    '''
    global exist_batId, batContract
    new_owner = _actor['account']
    bat_id = exist_batId
    result = runCmdWithActor(
        [VendorExe, '--owner', delHexPrefix(w3.toHex(bat_id)), new_owner],
        ownerVendor)

    if not result[0]:
        print(result[1])
        print("The command executed too long or there is nothing on output")
        return False

    try:
        vendor_addr = batContract.functions.vendorOf(bat_id).call()
    except ValueError as error:
        printError("Cannot call vendorOf()", [error])
        return False

    try:
        owner_addr = batContract.functions.ownerOf(bat_id).call()
    except ValueError as error:
        printError("Cannot call ownerOf()", [error])
        return False

    zero_addr = '0x' + '0' * 40
    if _failed:
        expected = ['Failed. Not allowed to change ownership.']
        # on a rejected transfer the ownership must NOT have changed
        state_ok = (vendor_addr != zero_addr) and (new_owner != owner_addr)
    else:
        expected = ['Success']
        # on success the new owner must be recorded on-chain
        state_ok = (vendor_addr != zero_addr) and (new_owner == owner_addr)

    if not state_ok:
        printError("Incorrect info in Battery management contract",
                   result[1])
        return False

    return checkOutput(result[1], expected)
Пример #31
0
def testVendorNewAccount():
    '''
    Create a new account through the vendor tool and confirm that the node
    now lists exactly the new account and that the tool printed its address.
    Returns True on success, False otherwise.
    '''
    accounts_before = w3.eth.accounts

    ret = runCmd([VendorExe, "--new", "'New_password'"])

    if not ret[0]:
        print("The command executed too long")
        return False

    accounts_after = w3.eth.accounts

    # symmetric difference isolates the freshly created account
    created = set(accounts_before) ^ set(accounts_after)
    if not created:
        printError("New account not found on the node", ret[1])
        return False

    account = list(created)[0]
    return checkOutput(ret[1], [account])
Пример #32
0
def ScenterNewAccount():
    '''
    Create a new account through the service-center tool and confirm that
    the node gained one account whose address appears in the tool output.
    Returns True on success, False otherwise.
    '''
    accounts_before = w3.eth.accounts

    _ret = runCmd([ScenterExe, "--new", "'New_password'"])

    if not _ret[0]:
        print("The command executed too long or there is nothing on output")
        return False

    accounts_after = w3.eth.accounts

    # symmetric difference isolates the freshly created account
    created = set(accounts_before) ^ set(accounts_after)
    if not created:
        printError("New account not found on the node", _ret[1])
        return False

    account = list(created)[0]
    return checkOutput(_ret[1], [account])
Пример #33
0
def make_probe_to_gene_dict(annotFile, array_1_path, array_2_path):
    '''
    keyed by probe ID w/ gene as value

    Builds a dict mapping array probe IDs to refseq common gene names,
    keeping only probes whose gene name appears in the annotation file.
    The result is cached as a pickle under the project folder and loaded
    directly on subsequent calls.
    '''
    #see if it already exists
    pickle_path = '%soberthuer_outcome/probe_dict.pkl' % (projectFolder)
    if utils.checkOutput(pickle_path, 0, 0):
        print('loading previously made probe dict at %s' % (pickle_path))
        # with-block ensures the file handle is closed after loading
        with open(pickle_path, 'rb') as pickle_file:
            probe_gene_dict = pickle.load(pickle_file)
        return probe_gene_dict

    #we want to intersect refseq common names w/ the array
    startDict = utils.makeStartDict(annotFile)

    # a set gives O(1) membership tests; the previous list.count() scan was
    # O(n) per probe (and the set also de-duplicates, so uniquify is moot)
    ref_name_set = set(startDict[refID]['name'] for refID in startDict.keys())
    probe_gene_dict = {}

    array_1 = utils.parseTable(array_1_path, '\t')
    array_2 = utils.parseTable(array_2_path, '\t')
    for line in array_1 + array_2:
        if len(line) < 5:
            # skip malformed/short rows that lack a probe ID column
            continue
        probe_id = line[4]
        name = line[-1]

        if name in ref_name_set:
            probe_gene_dict[probe_id] = name

    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(probe_gene_dict, pickle_file)
    return probe_gene_dict
Пример #34
0
def SetupSetFeeTwoOptions(_actor, _fee, _failed=False):
    '''
    Set the battery fee via the setup tool and verify the contract state
    and the tool's output.  With _failed=True the output is expected to
    report missing permissions.  Refreshes the module-level fee trackers
    from the contract in either case.  Returns True/False.
    '''
    global currentBatFee, currentRegFee

    result = runCmdWithActor([SetupExe, '--setfee', str(_fee)], _actor)

    if not result[0]:
        print("The command executed too long")
        return False

    if len(result[1]) == 0:
        printError("No output")
        return False

    requested_fee_wei = w3.toWei(_fee, 'ether')

    try:
        actual_fee_wei = mgmtContract.functions.batteryFee().call()
    except ValueError as error:
        printError("Cannot call batteryFee()", [error])
        return False

    # keep the global trackers in sync with on-chain state
    currentBatFee = w3.fromWei(actual_fee_wei, 'ether')
    currentRegFee = Decimal.normalize(1000 * currentBatFee)

    if requested_fee_wei != actual_fee_wei:
        printError("Fee was not set", result[1])
        return False

    if _failed:
        expected = ['No permissions to change the service fee']
    else:
        expected = ['Updated successfully']

    return checkOutput(result[1], expected)
Пример #35
0
def VendorRegisterInsufficientFundsFail(_name, _value):
    '''
    Attempt vendor registration from a freshly created (unfunded) account
    and verify that it fails: wallet balance unchanged, deposit stays zero,
    and the expected failure message is printed.  Returns True/False.
    '''
    wallet_address = mgmtContract.functions.walletContract().call()
    balance_before = w3.eth.getBalance(wallet_address)

    password = "******"
    actor = {"account": w3.personal.newAccount(password), "password": password}

    result = runCmdWithActor([VendorExe, '--reg', _name, str(_value)], actor)

    if not result[0]:
        print("The command executed too long")
        return False

    balance_after = w3.eth.getBalance(wallet_address)

    # a failed registration must not move any funds into the wallet
    if balance_after - balance_before != 0:
        printError("Wallet balance was changed", result[1])
        return False

    try:
        deposit = mgmtContract.functions.vendorDeposit(actor['account']).call()
    except ValueError as error:
        printError("Cannot call vendorDeposit()", [error])
        return False

    if deposit != 0:
        print(deposit)
        printError("Vendor was registered. Deposit is not equal to zero",
                   result[1])
        return False

    return checkOutput(result[1], ['Failed. No enough funds to deposit.'])
Пример #36
0
def main():
    """
    main run function

    Parses command-line arguments, loads the plotting regions (gff/bed or a
    single coordinate string), maps the given bams over them, and shells out
    to R to render the plots.  Exits on invalid input.
    """

    #usage = "usage: %prog [options] -g [GENOME] -b [SORTED BAMFILE(S)] -i [INPUTFILE] -o [OUTPUTFOLDER]"
    parser = argparse.ArgumentParser(usage='%(prog)s [options]')

    # required flags
    parser.add_argument("-b", "--bam", dest="bam", nargs='*',
                        help="Enter a comma separated list of .bam files to be processed.", required=True)
    parser.add_argument("-i", "--input", dest="input", type=str,
                        help="Enter .gff or genomic region e.g. chr1:+:1-1000.", required=True)
    parser.add_argument("-g", "--genome", dest="genome", type=str,
                        help="specify a genome, HG18,HG19,MM8,MM9,MM10 are currently supported", required=True)

    # output flag
    parser.add_argument("-o", "--output", dest="output", type=str,
                        help="Enter the output folder.", required=True)
    # additional options
    parser.add_argument("--stretch-input", dest="stretch_input", default=None, type=int,
                        help="Stretch the input regions to a minimum length in bp, e.g. 10000 (for 10kb)")
    parser.add_argument("-c", "--color", dest="color", default=None,
                        help="Enter a colon separated list of colors e.g. 255,0,0:255,125,0, default samples the rainbow")
    parser.add_argument("-s", "--sense", dest="sense", default='both',
                        help="Map to '+','-' or 'both' strands. Default maps to both.")
    parser.add_argument("-e", "--extension", dest="extension", default=200,
                        help="Extends reads by n bp. Default value is 200bp")
    parser.add_argument("-r", "--rpm", dest="rpm", action='store_true', default=False,
                        help="Normalizes density to reads per million (rpm) Default is False")
    parser.add_argument("-y", "--yScale", dest="yScale", default="relative",
                        help="Choose either relative or uniform y axis scaling. options = 'relative,uniform' Default is relative scaling")
    parser.add_argument("-n", "--names", dest="names", default=None,
                        help="Enter a comma separated list of names for your bams")
    parser.add_argument("-p", "--plot", dest="plot", default="MULTIPLE",
                        help="Choose either all lines on a single plot or multiple plots. options = 'SINGLE,MULTIPLE,MERGE'")
    parser.add_argument("-t", "--title", dest="title", default='',
                        help="Specify a title for the output plot(s), default will be the coordinate region")

    # DEBUG OPTION TO SAVE TEMP FILES
    parser.add_argument("--scale", dest="scale", default='',
                        help="Enter a comma separated list of scaling factors for your bams. Default is none")
    parser.add_argument("--save-temp", dest="save", action='store_true', default=False,
                        help="If flagged will save temporary files made by bamPlot")
    parser.add_argument("--bed", dest="bed",
                        help="Add a space-delimited list of bed files to plot")
    parser.add_argument("--multi-page", dest="multi", action='store_true', default=False,
                        help="If flagged will create a new pdf for each region")

    args = parser.parse_args()

    print(args)

    if args.bam and args.input and args.genome and args.output:

        # Support a legacy mode where a ',' delimited multiple files
        bamFileList = args.bam
        if len(args.bam) == 1:
            bamFileList = args.bam[0].split(',')

        # Make sure these are actually files & readable (!)
        for filename in bamFileList:
            assert(os.access(filename, os.R_OK))

        # bringing in any beds
        if args.bed:
            bedFileList = args.bed
            if type(bedFileList) == str:
                bedFileList = args.bed.split(',')
            print(bedFileList)
            bedCollection = makeBedCollection(bedFileList)
        else:
            bedCollection = utils.LocusCollection([], 50)

        # Load the input for graphing. One of:
        # - A .gff
        # - A .bed
        # - a specific input region (e.g. chr10:.:93150000-93180000)

        valid_sense_options = {'+', '-', '.'}
        if os.access(args.input, os.R_OK):
            if args.input.endswith('.bed'):
                # Uniquely graph every input of this bed
                parsed_input_bed = utils.parseTable(args.input, '\t')
                gffName = os.path.basename(args.input)  # Graph title
                gff = None
                try:
                    if parsed_input_bed[0][5] in valid_sense_options:
                        # This .bed might have a sense parameter
                        gff = [[e[0], '', args.input, e[1], e[2], '', e[5], '', ''] for e in parsed_input_bed]
                except IndexError:
                    pass

                if gff is None:
                    print("Your bed doesn't have a valid sense parameter. Defaulting to both strands, '.'")
                    # We only take chr/start/stop and ignore everything else.
                    gff = [[e[0], '', args.input, e[1], e[2], '', '.', '', ''] for e in parsed_input_bed]
            else:
                # Default to .gff, since that's the original behavior
                gff = utils.parseTable(args.input, '\t')
                gffName = args.input.split('/')[-1].split('.')[0]
        else:
            # means a coordinate line has been given e.g. chr1:+:1-100
            chromLine = args.input.split(':')
            try:
                chrom = chromLine[0]
                sense = chromLine[1]
            except IndexError:
                print('Invalid input line or inaccessible file. Try: chr1:.:1-5000')
                exit()
            assert(sense in valid_sense_options)
            [start, end] = chromLine[2].split('-')
            if chrom[0:3] != 'chr':
                print('ERROR: UNRECOGNIZED GFF OR CHROMOSOME LINE INPUT')
                exit()
            gffLine = [chrom, '', args.input, start, end, '', sense, '', '']
            gffName = "%s_%s_%s_%s" % (chrom, sense, start, end)
            gff = [gffLine]

        # Consider stretching the regions to a fixed minimum size
        if args.stretch_input:
            print('Stretching inputs to a minimum of: %d bp' % (args.stretch_input))
            minLength = args.stretch_input
            stretchGff = []
            for e in gff:
                difference = int(e[4]) - int(e[3])
                if difference < minLength:
                    pad = int((minLength - difference) / 2)
                    stretchGff.append([e[0], e[1], e[2], int(e[3])-pad, int(e[4])+pad, e[5], e[6], e[7], e[8]])
                else:
                    stretchGff.append(e)

            gff = stretchGff

        # Sanity test the gff object
        assert(all([e[6] in valid_sense_options for e in gff]))  # All strands are sane
        #assert(all([int(e[3]) < int(e[4]) for e in gff]))  # All start/stops are ordered

        # bring in the genome
        genome = args.genome.upper()
        if ['HG18', 'HG19', 'HG19_RIBO','HG38','MM9', 'MM10', 'RN4','RN6'].count(genome) == 0:
            print('ERROR: UNSUPPORTED GENOME TYPE %s. USE HG19,HG18, RN4, MM9, or MM10' % (genome))
            parser.print_help()
            exit()

        # bring in the rest of the options

        # output
        rootFolder = args.output
        if rootFolder[-1] != '/':
            rootFolder += '/'
        try:
            os.listdir(rootFolder)
        except OSError:
            print('ERROR: UNABLE TO FIND OUTPUT DIRECTORY %s' % (rootFolder))
            exit()

        # Get analysis title
        if len(args.title) == 0:
            title = gffName
        else:
            title = args.title

        # make a temp folder
        tempFolder = rootFolder + title + '/'
        print("CREATING TEMP FOLDER %s" % (tempFolder))
        pipeline_dfci.formatFolder(tempFolder, create=True)

        # colors
        if args.color:
            colorList = args.color.split(':')
            colorList = [x.split(',') for x in colorList]
            if len(colorList) < len(bamFileList):
                print('WARNING: FEWER COLORS THAN BAMS SPECIFIED. COLORS WILL BE RECYCLED')
                # recycling the color list
                # // keeps the repeat count an int under python3 true division
                colorList += colorList * (len(bamFileList) // len(colorList))
                colorList = colorList[0:len(bamFileList)]

        else:
            # cycles through the colors of the rainbow
            colorList = tasteTheRainbow(len(bamFileList))

        # sense
        sense = args.sense

        extension = int(args.extension)

        rpm = args.rpm

        scale = args.scale

        yScale = args.yScale.upper()

        # names
        if args.names:
            names = args.names.split(',')

            if len(names) != len(bamFileList):
                print('ERROR: NUMBER OF NAMES AND NUMBER OF BAMS DO NOT CORRESPOND')
                parser.print_help()
                exit()
        else:
            names = [x.split('/')[-1] for x in bamFileList]

        # plot style
        plotStyle = args.plot.upper()
        if ['SINGLE', 'MULTIPLE','MERGE'].count(plotStyle) == 0:
            print('ERROR: PLOT STYLE %s NOT AN OPTION' % (plotStyle))
            parser.print_help()
            exit()

        # now run!
        summaryTableFileName = makeBamPlotTables(gff, genome, bamFileList, colorList, nBins, sense, extension, rpm, tempFolder, names, title, bedCollection,scale)
        print ("%s is the summary table" % (summaryTableFileName))

        #running the R command to plot
        multi = args.multi
        outFile = "%s%s_plots.pdf" % (rootFolder, title)
        rCmd = callRPlot(summaryTableFileName, outFile, yScale, plotStyle,multi)

        # open a bash file
        bashFileName = "%s%s_Rcmd.sh" % (tempFolder, title)
        bashFile = open(bashFileName, 'w')
        bashFile.write('#!/usr/bin/bash\n')
        bashFile.write(rCmd)
        bashFile.close()
        print("Wrote R command to %s" % (bashFileName))
        os.system("bash %s" % (bashFileName))

        # delete temp files
        if not args.save:
            if utils.checkOutput(outFile, 1, 10):
                # This is super dangerous (!). Add some sanity checks.
                assert(" " not in tempFolder)
                # fixed: was `tempFolder is not "/"` — identity comparison
                # with a string literal, which is not a reliable equality test
                assert(tempFolder != "/")
                removeCommand = "rm -rf %s" % (tempFolder)
                print(removeCommand)
                os.system(removeCommand)
            else:
                print("ERROR: NO OUTPUT FILE %s DETECTED" % (outFile))

    else:
        parser.print_help()
        sys.exit()
Пример #37
0
def main():



    '''
    main run function
    '''

    from optparse import OptionParser

    usage = "usage: %prog [options] -g [GENOME] -d [DATAFILE] {-r [ROSE_FOLDERS] | -i [INPUT_GFF]} -o [OUTPUT_FOLDER] --group1 [GROUP1_NAMES] --group2 [GROUP2_NAMES] --name1 [GROUP1_NAME] --name2 [GROUP2_NAME]"
    parser = OptionParser(usage = usage)
    #required flags
    parser.add_option("-g","--genome", dest="genome",nargs = 1, default=None,
                      help = "Enter the genome build (HG18,HG19,MM9,RN4) for the project")
    parser.add_option("-d","--data", dest="data",nargs = 1, default=None,
                      help = "Enter the data file for the project")
    parser.add_option("-o","--output", dest="output",nargs = 1, default=None,
                      help = "Enter the output folder for the project")
    parser.add_option("--group1", dest="group1",nargs = 1, default=None,
                      help = "Enter a comma separated list of dataset names associated with the first group")
    parser.add_option("--group2", dest="group2",nargs = 1, default=None,
                      help = "Enter a comma separated list of dataset names associated with the second group")
    parser.add_option("--name1", dest="name1",nargs = 1, default=None,
                      help = "Enter a name for the first group of datasets")
    parser.add_option("--name2", dest="name2",nargs = 1, default=None,
                      help = "Enter a name for the second group of datasets")

    #the input options
    parser.add_option("-r","--rose", dest="rose",nargs = 1, default=None,
                      help = "Enter a comma separated list of meta rose folders")

    #optional input to supercede the meta rose (this is kinda sad but will fix later)
    #should have had this code run clustering from the get go
    parser.add_option("-i","--input", dest="input",nargs = 1, default=None,
                      help = "enter a gff, bed or table of regions to perform dyanmic analysis on")




    #additional options
    parser.add_option("-p","--plot", dest="plot",action = 'store_true', default=False,
                      help = "If flagged, will plot differential regions")
    parser.add_option("-a","--all", dest="all",action = 'store_true', default=False,
                      help = "If flagged, will run analysis for all enhancers and not just supers.")
    parser.add_option("-m","--median", dest="median",action = 'store_true', default=False,
                      help = "If flagged, will use median enhancer scaling")
    parser.add_option("-e","--enhancer-type", dest="enhancer_type",nargs = 1,default='super',
                      help = "specify type of enhancer to analyze: super, stretch, superStretch")
    parser.add_option("--use-background", dest="background",action = 'store_true',default=False,
                      help = "If flagged will use background datasets as in data table")

    (options,args) = parser.parse_args()

    print(options)
    print(args)
    
    
    requiredArgs = [options.genome,options.data,options.rose,options.output,options.group1,options.group2,options.name1,options.name2]
    

    try:
        assert(all(requiredArgs))
    except AssertionError:
        parser.print_help()
        sys.exit()

    #now the main run of the function

    #getting the genoe and data file
    genome = string.upper(options.genome)
    dataFile = options.data

    #getting the rose folders
    roseFolderString = options.rose
    [roseFolder1,roseFolder2] = roseFolderString.split(',')
    parentFolder = utils.formatFolder(options.output,True)

    #getting the analysis names
    name1 = options.name1
    name2 = options.name2
    mergeName = "%s_%s_merged" % (name1,name2)

    #getting the datasets names associated with each group
    namesList1 = options.group1.split(',')
    namesList2 = options.group2.split(',')

    #options for background corection
    useBackground = options.background

    #option for median scaling
    medianScale = options.median

    #option for an overriding set of input regions
    if options.input != None:
        #for now only works w/ gffs
        print('Using %s as a set of predifined input regions' % (options.input))
        inputGFF = options.input
    else:
        inputGFF= ''
    

    plotBam = options.plot
    if options.all:
        superOnly = False
    else:
        superOnly = True

    if superOnly and plotBam:
        print "Running dynamic enhancer analysis on all super enhancers in %s and %s and plotting output to %s" % (name1,name2,parentFolder)
    if superOnly and not plotBam:
        print "Running dynamic enhancer analysis on all super enhancers in %s and %s and writing output to %s" % (name1,name2,parentFolder)
    if not superOnly and plotBam:
        print "Running dynamic enhancer analysis on all enhancers in %s and %s and plotting output to %s. WARNING: Plotting all differential enhancers could take a while" % (name1,name2,parentFolder)
    if not superOnly and not plotBam:
        print "Running dynamic enhancer analysis on all enhancers in %s and %s and writing output to %s." % (name1,name2,parentFolder)

    #part 1
    print "PART1: analyzing ROSE output from %s and %s" % (name1,name2)
    #start with the all enhancer tables from the initial rose calls

    roseFolder1 = pipeline_dfci.formatFolder(roseFolder1,False)
    roseFolder2 = pipeline_dfci.formatFolder(roseFolder2,False)

    roseDict1 = makeRoseDict(roseFolder1)
    roseDict2 = makeRoseDict(roseFolder2)

    #choosing the type of enhancer to analyze
    enhancerCallType = string.lower(options.enhancer_type)
    if superOnly:
        print("ANALYZING ENHANCER TYPE: %s" % (string.upper(enhancerCallType)))
    superFile1 = roseDict1[enhancerCallType]
    superFile2 = roseDict2[enhancerCallType]

    allFile1 = roseDict1['AllEnhancer']
    allFile2 = roseDict2['AllEnhancer']
    
    regionFile1 = roseDict1['RegionMap']
    regionFile2 = roseDict1['RegionMap']

    #this is where we can toggle either using meta rose or clustering
    print('\tMERGING ENHANCERS AND CALLING ROSE')
    if superOnly:
        if len(superFile1) ==0:
            print "ERROR: UNABLE TO FIND %s FILES IN %s" % (enhancerCallType,roseFolder1)
            sys.exit()
        if len(superFile2) == 0:
            print "ERROR: UNABLE TO FIND %s FILES IN %s" % (enhancerCallType,roseFolder2)
            sys.exit()
        roseOutput = callMergeSupers(dataFile,superFile1,superFile2,name1,name2,mergeName,genome,parentFolder,namesList1,namesList2,useBackground,inputGFF)

    else:
        print('doing it right')
        print(allFile1)
        print(allFile2)

        roseOutput = callMergeSupers(dataFile,allFile1,allFile2,name1,name2,mergeName,genome,parentFolder,namesList1,namesList2,useBackground,inputGFF)
        print('this is rose output')
        print(roseOutput)
    print('\tMERGING ROSE OUTPUT')

    mergedRoseOutput,normRoseOutput = mergeRoseSignal(dataFile,roseOutput,roseDict1,roseDict2,name1,name2,namesList1,namesList2,useBackground,medianScale)
    


    print('\tCALCULATING ENHANCER DELTA AND MAKING PLOTS')

    #part2 is the R script
    mergedGFFFile = '%s%s_%s_MERGED_REGIONS_-0_+0.gff' % (parentFolder,string.upper(genome),mergeName)    
    rcmd = callDeltaRScript(mergedGFFFile,parentFolder,dataFile,name1,name2,allFile1,allFile2,medianScale,namesList1)
    print(rcmd) 
    os.system(rcmd)

    time.sleep(5)
    callRoseGeneMapper(mergedGFFFile,genome,parentFolder,namesList1)

    #rank the genes


    #part 3
    #rank the delta
    print "PART 3: assinging ranks to differential enhancers"
    print('\tASSIGNING SUPER RANK TO MERGED ENHANCERS')

    gffName = '%s_%s_MERGED_REGIONS_-0_+0' % (string.upper(genome),mergeName)
    enhancerToGeneFile = "%s%s_ROSE/%s_0KB_STITCHED_ENHANCER_DELTA_MERGED_ENHANCER_TO_GENE_100KB.txt" % (parentFolder,namesList1[0],gffName)
    if utils.checkOutput(enhancerToGeneFile):
        rankOutput = "%s%s_ROSE/%s_0KB_STITCHED_ENHANCER_DELTA_MERGED_ENHANCER_TO_GENE_100KB_RANK.txt" % (parentFolder,namesList1[0],gffName)
        assignEnhancerRank(enhancerToGeneFile,allFile1,allFile2,name1,name2,rankOutput)
    else:
        print('ERROR: DELTA SCRIPT OR ROSE GENE MAPPER FAILED TO RUN')
        sys.exit()

    #make the rank plot
    print('MAKING RANK PLOTS')
    if utils.checkOutput(rankOutput):
        print('checking for rank output %s' % (rankOutput))
        rcmd = callRankRScript(rankOutput,name1,name2,superFile1,superFile2)
        print(rcmd)
        os.system(rcmd)
    else:
        print('ERROR: RANK PLOT SCRIPT FAILED TO RUN')
        sys.exit()

    print('MAKING REGION SIGNAL PLOTS AND FINDING DIFFERENTIAL REGIONS')
    if utils.checkOutput(normRoseOutput):
        print('checking for %s' % (normRoseOutput))
        rcmd = callRegionPlotRScript(normRoseOutput,name1,name2,namesList1,namesList2)
        print(rcmd)
        os.system(rcmd)
    else:
        print('ERROR: REGION PLOT SCRIPT FAILED TO RUN')
        sys.exit()

    #NOW MAP GENES
    print('mapping genes to differential enhancers')
    statOutput,diffOutput = callRoseGeneMapper_stats(mergedGFFFile,genome,parentFolder,namesList1)



    if utils.checkOutput(statOutput):
        print('checking for gene mapping output %s' % (statOutput))
        print('FINISHED WITH GENE MAPPING')
    else:
        print('GENE MAPPING FAILED')
        sys.exit()

    print('FINISHING OUTPUT')
    
    finishRankOutput(dataFile,statOutput,diffOutput,genome,parentFolder,mergeName,name1,name2,namesList1,namesList2,1.0,100000,superOnly,plotBam)
Пример #38
0
def main():
    '''
    main run call

    Meta-ROSE wrapper: converts one or more region files (.bed/.gff) into a
    single merged, stitched GFF, maps every rankby/control bam to the
    stitched regions with bamliquidator, averages the signal across bams,
    calls super-enhancers on the merged signal, and finally runs the ROSE2
    gene mapper on the resulting enhancer tables.
    '''
    debug = False

    from optparse import OptionParser
    usage = "usage: %prog [options] -g [GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]"
    parser = OptionParser(usage=usage)
    # required flags
    parser.add_option("-i", "--i", dest="input", nargs=1, default=None,
                      help="Enter a comma separated list of .gff or .bed file of binding sites used to make enhancers")
    parser.add_option("-r", "--rankby", dest="rankby", nargs=1, default=None,
                      help="Enter a comma separated list of bams to rank by")
    parser.add_option("-o", "--out", dest="out", nargs=1, default=None,
                      help="Enter an output folder")
    parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None,
                      help="Enter the genome build (MM9,MM8,HG18,HG19)")

    # optional flags
    parser.add_option("-n", "--name", dest="name", nargs=1, default=None,
                      help="Provide a name for the analysis otherwise ROSE will guess")
    parser.add_option("-c", "--control", dest="control", nargs=1, default=None,
                      help="Enter a comma separated list of control bams. Can either provide a single control bam for all rankby bams, or provide a control bam for each individual bam")
    parser.add_option("-s", "--stitch", dest="stitch", nargs=1, default='',
                      help="Enter a max linking distance for stitching. Default will determine optimal stitching parameter")
    parser.add_option("-t", "--tss", dest="tss", nargs=1, default=0,
                      help="Enter a distance from TSS to exclude. 0 = no TSS exclusion")

    parser.add_option("--mask", dest="mask", nargs=1, default=None,
                      help="Mask a set of regions from analysis.  Provide a .bed or .gff of masking regions")

    # RETRIEVING FLAGS
    (options, args) = parser.parse_args()

    if not options.input or not options.rankby or not options.out or not options.genome:
        # BUGFIX: removed leftover debug print('hi there') before the help text
        parser.print_help()
        exit()

    # making the out folder if it doesn't exist
    outFolder = utils.formatFolder(options.out, True)

    # figuring out folder schema
    gffFolder = utils.formatFolder(outFolder + 'gff/', True)
    mappedFolder = utils.formatFolder(outFolder + 'mappedGFF/', True)

    # GETTING INPUT FILE(s)

    inputList = [inputFile for inputFile in options.input.split(',') if len(inputFile) > 1]

    #converting all input files into GFFs and moving into the GFF folder
    inputGFFList = []
    for inputFile in inputList:
        if inputFile.split('.')[-1] == 'bed':
            # CONVERTING A BED TO GFF
            inputGFFName = inputFile.split('/')[-1][0:-4] #strips the last 4 characters i.e. '.bed'
            inputGFFFile = '%s%s.gff' % (gffFolder, inputGFFName)
            utils.bedToGFF(inputFile, inputGFFFile)
        elif inputFile.split('.')[-1] == 'gff':
            # BUGFIX: the original tested options.input (the full comma
            # separated string) instead of the current inputFile, so .gff
            # files in a mixed list were misrouted to the fallback branch
            # COPY THE INPUT GFF TO THE GFF FOLDER
            os.system('cp %s %s' % (inputFile, gffFolder))
            inputGFFFile = '%s%s' % (gffFolder,inputFile.split('/')[-1])

        else:
            print('WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT')
            # COPY THE INPUT GFF TO THE GFF FOLDER
            os.system('cp %s %s' % (inputFile, gffFolder))
            inputGFFFile = '%s%s' % (gffFolder,inputFile.split('/')[-1])
        inputGFFList.append(inputGFFFile)


    # GETTING THE LIST OF BAMFILES TO PROCESS
    #either same number of bams for rankby and control
    #or only 1 control #or none!
    #bamlist should be all rankby bams followed by control bams


    bamFileList = []
    if options.control:
        controlBamList = [bam for bam in options.control.split(',') if len(bam) >0]
        rankbyBamList = [bam for bam in options.rankby.split(',') if len(bam) >0]

        if len(controlBamList) == len(rankbyBamList):
            #case where an equal number of backgrounds are given
            bamFileList = rankbyBamList + controlBamList
        elif len(controlBamList) == 1:
            #case where a universal background is applied
            bamFileList = rankbyBamList + controlBamList*len(rankbyBamList)
        else:
            print('ERROR: EITHER PROVIDE A SINGLE CONTROL BAM FOR ALL SAMPLES, OR ONE CONTROL BAM FOR EACH SAMPLE')
            sys.exit()
    else:
        bamFileList = [bam for bam in options.rankby.split(',') if len(bam) > 0]


    # Stitch parameter: '' lets regionStitching pick the optimal window
    if options.stitch == '':
        stitchWindow = ''
    else:
        stitchWindow = int(options.stitch)

    # tss options
    tssWindow = int(options.tss)
    if tssWindow != 0:
        removeTSS = True
    else:
        removeTSS = False


    # GETTING THE GENOME
    genome = string.upper(options.genome)
    print('USING %s AS THE GENOME' % (genome))

    # GETTING THE CORRECT ANNOT FILE

    genomeDict = {
        'HG18': '%s/annotation/hg18_refseq.ucsc' % (pipeline_dir),
        'MM9': '%s/annotation/mm9_refseq.ucsc' % (pipeline_dir),
        'HG19': '%s/annotation/hg19_refseq.ucsc' % (pipeline_dir),
        'MM8': '%s/annotation/mm8_refseq.ucsc' % (pipeline_dir),
        'MM10': '%s/annotation/mm10_refseq.ucsc' % (pipeline_dir),
        'RN4': '%s/annotation/rn4_refseq.ucsc' % (pipeline_dir),
    }

    try:
        annotFile = genomeDict[genome]
    except KeyError:
        print('ERROR: UNSUPPORTED GENOMES TYPE %s' % (genome))
        sys.exit()


    #FINDING THE ANALYSIS NAME
    if options.name:
        inputName = options.name
    else:
        inputName = inputGFFList[0].split('/')[-1].split('.')[0]
    print('USING %s AS THE ANALYSIS NAME' % (inputName))


    print('FORMATTING INPUT REGIONS')
    # MAKING THE RAW INPUT FILE FROM THE INPUT GFFs
    #use a simpler unique region naming system
    if len(inputGFFList) == 1:
        inputGFF = utils.parseTable(inputGFFList[0],'\t')
    else:
        # multiple inputs: pool all loci and stitch them into unique regions
        inputLoci = []
        for gffFile in inputGFFList:
            print('\tprocessing %s' % (gffFile))
            gff = utils.parseTable(gffFile,'\t')
            gffCollection = utils.gffToLocusCollection(gff,50)
            inputLoci += gffCollection.getLoci()


        inputCollection = utils.LocusCollection(inputLoci,50)
        inputCollection = inputCollection.stitchCollection() # stitches to produce unique regions

        inputGFF = utils.locusCollectionToGFF(inputCollection)

    formattedGFF = []
    #now number things appropriately
    for i,line in enumerate(inputGFF):

        #use the coordinates to make a new id inputname_chr_sense_start_stop
        chrom = line[0]
        coords = [int(line[3]) ,int(line[4])]
        sense = line[6]

        lineID = '%s_%s' % (inputName,str(i+1)) #1 indexing

        # min/max guards against start/stop being swapped in the input
        newLine = [chrom,lineID,lineID,min(coords),max(coords),'',sense,'',lineID]
        formattedGFF.append(newLine)

    #name of the master input gff file
    masterGFFFile = '%s%s_%s_ALL_-0_+0.gff' % (gffFolder,string.upper(genome),inputName)
    utils.unParseTable(formattedGFF,masterGFFFile,'\t')

    print('USING %s AS THE INPUT GFF' % (masterGFFFile))


    # MAKING THE START DICT
    # NOTE(review): startDict is not referenced again in this function;
    # presumably kept for parity with other pipeline mains — confirm
    print('MAKING START DICT')
    startDict = utils.makeStartDict(annotFile)

    #GET CHROMS FOUND IN THE BAMS
    print('GETTING CHROMS IN BAMFILES')
    bamChromList = getBamChromList(bamFileList)
    print("USING THE FOLLOWING CHROMS")
    print(bamChromList)

    #LOADING IN THE GFF AND FILTERING BY CHROM
    print('LOADING AND FILTERING THE GFF')
    inputGFF = filterGFF(masterGFFFile,bamChromList)
    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print('LOADING IN GFF REGIONS')
    referenceCollection = utils.gffToLocusCollection(inputGFF)

    print('CHECKING REFERENCE COLLECTION:')
    checkRefCollection(referenceCollection)


    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    if options.mask:
        maskFile = options.mask
        # if it's a bed file
        if maskFile.split('.')[-1].upper() == 'BED':
            maskGFF = utils.bedToGFF(maskFile)
        elif maskFile.split('.')[-1].upper() == 'GFF':
            maskGFF = utils.parseTable(maskFile, '\t')
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            sys.exit()
        maskCollection = utils.gffToLocusCollection(maskGFF)

        # now mask the reference loci
        referenceLoci = referenceCollection.getLoci()
        filteredLoci = [locus for locus in referenceLoci if len(maskCollection.getOverlap(locus, 'both')) == 0]
        print("FILTERED OUT %s LOCI THAT WERE MASKED IN %s" % (len(referenceLoci) - len(filteredLoci), maskFile))
        referenceCollection = utils.LocusCollection(filteredLoci, 50)

    # NOW STITCH REGIONS
    print('STITCHING REGIONS TOGETHER')
    stitchedCollection, debugOutput, stitchWindow = regionStitching(referenceCollection, inputName, outFolder, stitchWindow, tssWindow, annotFile, removeTSS)

    # NOW MAKE A STITCHED COLLECTION GFF
    print('MAKING GFF FROM STITCHED COLLECTION')
    stitchedGFF = utils.locusCollectionToGFF(stitchedCollection)

    print(stitchWindow)
    print(type(stitchWindow))
    if not removeTSS:
        stitchedGFFFile = '%s%s_%sKB_STITCHED.gff' % (gffFolder, inputName, str(stitchWindow / 1000))
        stitchedGFFName = '%s_%sKB_STITCHED' % (inputName, str(stitchWindow / 1000))
        debugOutFile = '%s%s_%sKB_STITCHED.debug' % (gffFolder, inputName, str(stitchWindow / 1000))
    else:
        stitchedGFFFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.gff' % (gffFolder, inputName, str(stitchWindow / 1000))
        stitchedGFFName = '%s_%sKB_STITCHED_TSS_DISTAL' % (inputName, str(stitchWindow / 1000))
        debugOutFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.debug' % (gffFolder, inputName, str(stitchWindow / 1000))

    # WRITING DEBUG OUTPUT TO DISK

    if debug:
        print('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile))
        utils.unParseTable(debugOutput, debugOutFile, '\t')

    # WRITE THE GFF TO DISK
    print('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile))
    utils.unParseTable(stitchedGFF, stitchedGFFFile, '\t')

    # SETTING UP THE OVERALL OUTPUT FILE
    outputFile1 = outFolder + stitchedGFFName + '_ENHANCER_REGION_MAP.txt'
    print('OUTPUT WILL BE WRITTEN TO  %s' % (outputFile1))


    # MAPPING TO THE STITCHED GFF
    # uses the module-level bamliquidator_path; map each unique bam once
    bamFileListUnique = utils.uniquify(list(bamFileList))
    #prevent redundant mapping
    print("MAPPING TO THE FOLLOWING BAMS:")
    print(bamFileListUnique)
    for bamFile in bamFileListUnique:

        bamFileName = bamFile.split('/')[-1]

        # MAPPING TO THE STITCHED GFF
        mappedOut1Folder = '%s%s_%s_MAPPED' % (mappedFolder, stitchedGFFName, bamFileName)
        mappedOut1File = '%s%s_%s_MAPPED/matrix.txt' % (mappedFolder, stitchedGFFName, bamFileName)
        if utils.checkOutput(mappedOut1File, 0.2, 0.2):
            print("FOUND %s MAPPING DATA FOR BAM: %s" % (stitchedGFFFile, mappedOut1File))
        else:
            cmd1 = bamliquidator_path + " --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % (stitchedGFFFile, mappedOut1Folder, bamFile)
            print(cmd1)

            os.system(cmd1)
            if utils.checkOutput(mappedOut1File,0.2,5):
                print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (stitchedGFFFile, bamFileName))
            else:
                print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (stitchedGFFFile, bamFileName))
                sys.exit()

    print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS')
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    mapCollection(stitchedCollection, referenceCollection, bamFileList, mappedFolder, outputFile1, refName=stitchedGFFName)


    print('FINDING AVERAGE SIGNAL AMONGST BAMS')
    metaOutputFile = collapseRegionMap(outputFile1,inputName + '_MERGED_SIGNAL',controlBams=options.control)

    #now try the merging

    print('CALLING AND PLOTTING SUPER-ENHANCERS')

    # rank on the merged signal column with no separate control track
    controlName = 'NONE'
    cmd = 'Rscript %sROSE2_callSuper.R %s %s %s %s' % (pipeline_dir,outFolder, metaOutputFile, inputName, controlName)
    print(cmd)

    os.system(cmd)


    # calling the gene mapper
    print('CALLING GENE MAPPING')

    superTableFile = "%s_SuperEnhancers.table.txt" % (inputName)

    #for now don't use ranking bam to call top genes
    cmd = "python %sROSE2_geneMapper.py -g %s -i %s%s -f" % (pipeline_dir,genome, outFolder, superTableFile)
    print(cmd)
    os.system(cmd)


    stretchTableFile = "%s_StretchEnhancers.table.txt" % (inputName)

    cmd = "python %sROSE2_geneMapper.py -g %s -i %s%s -f" % (pipeline_dir,genome, outFolder, stretchTableFile)
    print(cmd)
    os.system(cmd)


    superStretchTableFile = "%s_SuperStretchEnhancers.table.txt" % (inputName)

    cmd = "python %sROSE2_geneMapper.py -g %s -i %s%s -f" % (pipeline_dir,genome, outFolder, superStretchTableFile)
    os.system(cmd)
Пример #39
0
def main():
    '''
    main run call

    Single-input ROSE2 entry point: takes one region file (.bed/.gff), one
    rankby bam (plus optional control and extra bams), stitches the regions,
    maps every bam to both the stitched and original GFFs via
    bamliquidator_batch, calls super-enhancers with ROSE2_callSuper.R, and
    runs ROSE2_geneMapper.py on the resulting enhancer tables.
    '''
    debug = False

    from optparse import OptionParser
    usage = "usage: %prog [options] -g [GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]"
    parser = OptionParser(usage=usage)
    # required flags
    parser.add_option("-i", "--i", dest="input", nargs=1, default=None,
                      help="Enter a .gff or .bed file of binding sites used to make enhancers")
    parser.add_option("-r", "--rankby", dest="rankby", nargs=1, default=None,
                      help="bamfile to rank enhancer by")
    parser.add_option("-o", "--out", dest="out", nargs=1, default=None,
                      help="Enter an output folder")
    parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None,
                      help="Enter the genome build (MM9,MM8,HG18,HG19)")

    # optional flags
    parser.add_option("-b", "--bams", dest="bams", nargs=1, default=None,
                      help="Enter a comma separated list of additional bam files to map to")
    parser.add_option("-c", "--control", dest="control", nargs=1, default=None,
                      help="bamfile to rank enhancer by")
    parser.add_option("-s", "--stitch", dest="stitch", nargs=1, default='',
                      help="Enter a max linking distance for stitching. Default will determine optimal stitching parameter")
    parser.add_option("-t", "--tss", dest="tss", nargs=1, default=0,
                      help="Enter a distance from TSS to exclude. 0 = no TSS exclusion")

    parser.add_option("--mask", dest="mask", nargs=1, default=None,
                      help="Mask a set of regions from analysis.  Provide a .bed or .gff of masking regions")

    # RETRIEVING FLAGS
    (options, args) = parser.parse_args()

    # all four required flags must be present; otherwise show help and quit
    # NOTE(review): the 'hi there' print below looks like leftover debug output
    if not options.input or not options.rankby or not options.out or not options.genome:
        print('hi there')
        parser.print_help()
        exit()

    # making the out folder if it doesn't exist
    outFolder = utils.formatFolder(options.out, True)

    # figuring out folder schema
    gffFolder = utils.formatFolder(outFolder + 'gff/', True)
    mappedFolder = utils.formatFolder(outFolder + 'mappedGFF/', True)

    # GETTING INPUT FILE
    if options.input.split('.')[-1] == 'bed':
        # CONVERTING A BED TO GFF
        inputGFFName = options.input.split('/')[-1][0:-4]  # strip '.bed'
        inputGFFFile = '%s%s.gff' % (gffFolder, inputGFFName)
        utils.bedToGFF(options.input, inputGFFFile)
    elif options.input.split('.')[-1] == 'gff':
        # COPY THE INPUT GFF TO THE GFF FOLDER
        # NOTE(review): inputGFFFile keeps pointing at the original path,
        # not the copy in gffFolder — downstream commands use the original
        inputGFFFile = options.input
        os.system('cp %s %s' % (inputGFFFile, gffFolder))

    else:
        print('WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT')
        # COPY THE INPUT GFF TO THE GFF FOLDER
        inputGFFFile = options.input
        os.system('cp %s %s' % (inputGFFFile, gffFolder))

    # GETTING THE LIST OF BAMFILES TO PROCESS
    # rankby bam first, then the optional control, then any extra -b bams
    if options.control:
        bamFileList = [options.rankby, options.control]

    else:
        bamFileList = [options.rankby]

    if options.bams:
        bamFileList += options.bams.split(',')
        bamFileList = utils.uniquify(bamFileList)
    # optional args

    # Stitch parameter: '' lets regionStitching determine the optimal window
    if options.stitch == '':
        stitchWindow = ''
    else:
        stitchWindow = int(options.stitch)

    # tss options
    tssWindow = int(options.tss)
    if tssWindow != 0:
        removeTSS = True
    else:
        removeTSS = False

    # GETTING THE BOUND REGION FILE USED TO DEFINE ENHANCERS
    print('USING %s AS THE INPUT GFF' % (inputGFFFile))
    inputName = inputGFFFile.split('/')[-1].split('.')[0]

    # GETTING THE GENOME
    genome = options.genome
    print('USING %s AS THE GENOME' % genome)

    # GETTING THE CORRECT ANNOT FILE
    # annotation files are looked up relative to the current working directory
    cwd = os.getcwd()
    genomeDict = {
        'HG18': '%s/annotation/hg18_refseq.ucsc' % (cwd),
        'MM9': '%s/annotation/mm9_refseq.ucsc' % (cwd),
        'HG19': '%s/annotation/hg19_refseq.ucsc' % (cwd),
        'MM8': '%s/annotation/mm8_refseq.ucsc' % (cwd),
        'MM10': '%s/annotation/mm10_refseq.ucsc' % (cwd),
    }

    # NOTE(review): raises a bare KeyError for unsupported genomes — no
    # friendly error message here, unlike the sibling main()
    annotFile = genomeDict[genome.upper()]

    # MAKING THE START DICT
    print('MAKING START DICT')
    startDict = utils.makeStartDict(annotFile)

    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print('LOADING IN GFF REGIONS')
    referenceCollection = utils.gffToLocusCollection(inputGFFFile)

    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    if options.mask:
        maskFile = options.mask
        # if it's a bed file
        if maskFile.split('.')[-1].upper() == 'BED':
            maskGFF = utils.bedToGFF(maskFile)
        elif maskFile.split('.')[-1].upper() == 'GFF':
            maskGFF = utils.parseTable(maskFile, '\t')
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            sys.exit()
        maskCollection = utils.gffToLocusCollection(maskGFF)

        # now mask the reference loci
        # keep only loci with zero overlap against the mask collection
        referenceLoci = referenceCollection.getLoci()
        filteredLoci = [locus for locus in referenceLoci if len(maskCollection.getOverlap(locus, 'both')) == 0]
        print("FILTERED OUT %s LOCI THAT WERE MASKED IN %s" % (len(referenceLoci) - len(filteredLoci), maskFile))
        referenceCollection = utils.LocusCollection(filteredLoci, 50)

    # NOW STITCH REGIONS
    print('STITCHING REGIONS TOGETHER')
    stitchedCollection, debugOutput, stitchWindow = regionStitching(inputGFFFile, inputName, outFolder, stitchWindow, tssWindow, annotFile, removeTSS)

    # NOW MAKE A STITCHED COLLECTION GFF
    print('MAKING GFF FROM STITCHED COLLECTION')
    stitchedGFF = utils.locusCollectionToGFF(stitchedCollection)
    # making sure start/stop ordering are correct
    for i in range(len(stitchedGFF)):

        line = stitchedGFF[i]
        start = int(line[3])
        stop = int(line[4])
        if start > stop:
            line[3] = stop
            line[4] = start

    print(stitchWindow)
    print(type(stitchWindow))
    if not removeTSS:
        stitchedGFFFile = '%s%s_%sKB_STITCHED.gff' % (gffFolder, inputName, str(stitchWindow / 1000))
        stitchedGFFName = '%s_%sKB_STITCHED' % (inputName, str(stitchWindow / 1000))
        debugOutFile = '%s%s_%sKB_STITCHED.debug' % (gffFolder, inputName, str(stitchWindow / 1000))
    else:
        stitchedGFFFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.gff' % (gffFolder, inputName, str(stitchWindow / 1000))
        stitchedGFFName = '%s_%sKB_STITCHED_TSS_DISTAL' % (inputName, str(stitchWindow / 1000))
        debugOutFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.debug' % (gffFolder, inputName, str(stitchWindow / 1000))

    # WRITING DEBUG OUTPUT TO DISK

    if debug:
        print('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile))
        utils.unParseTable(debugOutput, debugOutFile, '\t')

    # WRITE THE GFF TO DISK
    print('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile))
    utils.unParseTable(stitchedGFF, stitchedGFFFile, '\t')

    # SETTING UP THE OVERALL OUTPUT FILE
    outputFile1 = outFolder + stitchedGFFName + '_ENHANCER_REGION_MAP.txt'
    print('OUTPUT WILL BE WRITTEN TO  %s' % (outputFile1))

    # MAPPING TO THE NON STITCHED (ORIGINAL GFF)
    # MAPPING TO THE STITCHED GFF

    # Try to use the bamliquidatior_path.py script on cluster, otherwise, failover to local (in path), otherwise fail.
    bamliquidator_path = '/ark/home/jdm/pipeline/bamliquidator_batch.py'
    if not os.path.isfile(bamliquidator_path):
        bamliquidator_path = 'bamliquidator_batch.py'
        if not os.path.isfile(bamliquidator_path):
            raise ValueError('bamliquidator_batch.py not found in path')

    for bamFile in bamFileList:

        bamFileName = bamFile.split('/')[-1]

        # MAPPING TO THE STITCHED GFF
        # NOTE(review): expects matrix.gff here, while the sibling main()
        # checks for matrix.txt — confirm against bamliquidator output naming
        mappedOut1Folder = '%s%s_%s_MAPPED' % (mappedFolder, stitchedGFFName, bamFileName)
        mappedOut1File = '%s%s_%s_MAPPED/matrix.gff' % (mappedFolder, stitchedGFFName, bamFileName)
        if utils.checkOutput(mappedOut1File, 0.2, 0.2):
            print("FOUND %s MAPPING DATA FOR BAM: %s" % (stitchedGFFFile, mappedOut1File))
        else:
            cmd1 = "python " + bamliquidator_path + " --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % (stitchedGFFFile, mappedOut1Folder, bamFile)
            print(cmd1)

            # success is inferred from non-empty stdout of the subprocess
            output1 = subprocess.Popen(cmd1, stdout=subprocess.PIPE, shell=True)
            output1 = output1.communicate()
            if len(output1[0]) > 0:  # test if mapping worked correctly
                print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (stitchedGFFFile, bamFileName))
            else:
                print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (stitchedGFFFile, bamFileName))
                sys.exit()

        # MAPPING TO THE ORIGINAL GFF
        mappedOut2Folder = '%s%s_%s_MAPPED' % (mappedFolder, inputName, bamFileName)
        mappedOut2File = '%s%s_%s_MAPPED/matrix.gff' % (mappedFolder, inputName, bamFileName)
        if utils.checkOutput(mappedOut2File, 0.2, 0.2):
            print("FOUND %s MAPPING DATA FOR BAM: %s" % (stitchedGFFFile, mappedOut2File))
        else:
            cmd2 = "python " + bamliquidator_path + " --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % (inputGFFFile, mappedOut2Folder, bamFile)
            print(cmd2)

            output2 = subprocess.Popen(cmd2, stdout=subprocess.PIPE, shell=True)
            output2 = output2.communicate()
            if len(output2[0]) > 0:  # test if mapping worked correctly
                print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (inputGFFFile, bamFileName))
            else:
                print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (inputGFFFile, bamFileName))
                sys.exit()

    print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS')
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    mapCollection(stitchedCollection, referenceCollection, bamFileList, mappedFolder, outputFile1, refName=stitchedGFFName)

    print('CALLING AND PLOTTING SUPER-ENHANCERS')

    # the R call is identical in both branches except for controlName
    if options.control:

        rankbyName = options.rankby.split('/')[-1]
        controlName = options.control.split('/')[-1]
        cmd = 'R --no-save %s %s %s %s < ROSE2_callSuper.R' % (outFolder, outputFile1, inputName, controlName)

    else:
        rankbyName = options.rankby.split('/')[-1]
        controlName = 'NONE'
        cmd = 'R --no-save %s %s %s %s < ROSE2_callSuper.R' % (outFolder, outputFile1, inputName, controlName)
    print(cmd)

    os.system(cmd)

    # calling the gene mapper
    # presumably waiting for the R script's output files to land — confirm
    time.sleep(20)
    superTableFile = "%s_SuperEnhancers.table.txt" % (inputName)
    if options.control:
        cmd = "python ROSE2_geneMapper.py -g %s -r %s -c %s -i %s%s" % (genome, options.rankby, options.control, outFolder, superTableFile)
    else:
        cmd = "python ROSE2_geneMapper.py -g %s -r %s -i %s%s" % (genome, options.rankby, outFolder, superTableFile)
    os.system(cmd)


    stretchTableFile = "%s_StretchEnhancers.table.txt" % (inputName)
    if options.control:
        cmd = "python ROSE2_geneMapper.py -g %s -r %s -c %s -i %s%s" % (genome, options.rankby, options.control, outFolder, stretchTableFile)
    else:
        cmd = "python ROSE2_geneMapper.py -g %s -r %s -i %s%s" % (genome, options.rankby, outFolder, stretchTableFile)
    os.system(cmd)


    superStretchTableFile = "%s_SuperStretchEnhancers.table.txt" % (inputName)
    if options.control:
        cmd = "python ROSE2_geneMapper.py -g %s -r %s -c %s -i %s%s" % (genome, options.rankby, options.control, outFolder, superStretchTableFile)
    else:
        cmd = "python ROSE2_geneMapper.py -g %s -r %s -i %s%s" % (genome, options.rankby, outFolder, superStretchTableFile)
    os.system(cmd)
Пример #40
0
def launchEnhancerMapping(dataFile,nameDict,outputFolder,roseFolder,stitch,tssDistance,enhancerType,maskFile=''):

    '''
    launches enhancer mapping if needed from enriched region files
    '''

    namesList = nameDict.keys()

    #check to see if everything is good, if so return True and call it a day
    if len([x for x in namesList if len(nameDict[x]['enhancerFile']) > 0]) == len(namesList):
        print "ENHANCER FILE OUTPUT FOUND FOR ALL DATASETS"
        return nameDict

    #if not, have to call rose
    
    roseOutputFolder = utils.formatFolder(roseFolder,True)
    
    queueList =[]
    for name in namesList:

        #check to see if we need to call rose
        if nameDict[name]['enhancerFile'] == '':
     
            #get the enriched file
            enrichedFile = nameDict[name]['enrichedFile']
            #call rose
            print "CALLING ROSE FOR %s" % (name)
            bashFileName = pipeline_dfci.callRose2(dataFile,'',roseOutputFolder,[name],[],enrichedFile,tssDistance,stitch,mask=maskFile)
            print bashFileName
            os.system('bash %s &' % (bashFileName))
            #add name to queue list
            queueList.append(name)



    #define the enhancer type
    if enhancerType == 'super':
        enhancerString = 'AllEnhancers.table.txt'
    if enhancerType == 'stretch':
        enhancerString = 'AllEnhancers_Length.table.txt'
    if enhancerType == 'superstretch':
        enhancerString = 'AllEnhancers_SuperStretch.table.txt'



    #now check for completion of datasets
    for name in queueList:

        #check for the AllEnhancers table        
        enhancerFile = "%s%s_ROSE/%s_peaks_%s" % (roseOutputFolder,name,name,enhancerString)
        

        print "CHECKING FOR %s ROSE OUTPUT IN %s" % (name,enhancerFile)
        if utils.checkOutput(enhancerFile,1,10):
            
            print "FOUND ENHANCER OUTPUT FOR %s" % (name)
            nameDict[name]['enhancerFile'] = enhancerFile
        else:

            #try finding it w/ a different name
            #this will bug out if nothing is there
            roseFolder = "%s%s_ROSE/" % (roseOutputFolder,name)
            roseFileList = [x for x in os.listdir(roseFolder) if x[0] != '.'] #no hidden files
            if len(roseFileList) == 0:
                print "No files found in %s" % (roseFolder)
                sys.exit()
            enhancerFile = getFile(enhancerString,roseFileList,roseFolder)
            nameDict[name]['enhancerFile'] = enhancerFile

    return nameDict
Пример #41
0
def main():



    '''
    main run function

    Dynamic enhancer comparison between two datasets: merges the super (or
    all) enhancer calls from two existing ROSE folders, re-calls ROSE on the
    merged regions, computes enhancer deltas and rank plots via R scripts,
    and finishes by writing the ranked differential output.
    '''

    from optparse import OptionParser

    usage = "usage: %prog [options] -g [GENOME] -d [DATAFILE] -n [DATA_NAMES] -r [ROSE_FOLDERS] -o [OUTPUT_FOLDER]"
    parser = OptionParser(usage = usage)
    #required flags
    parser.add_option("-g","--genome", dest="genome",nargs = 1, default=None,
                      help = "Enter the genome build (HG18,HG19,MM9,RN4,RN6) for the project")
    parser.add_option("-d","--data", dest="data",nargs = 1, default=None,
                      help = "Enter the data file for the project")
    parser.add_option("-r","--rose", dest="rose",nargs = 1, default=None,
                      help = "Enter a comma separated list of rose folder")
    parser.add_option("-o","--output", dest="output",nargs = 1, default=None,
                      help = "Enter the output folder for the project")
    parser.add_option("-n","--names", dest="names",nargs = 1, default=None,
                      help = "Enter a comma separated list of names to go with the datasets")


    #additional options
    parser.add_option("-p","--plot", dest="plot",action = 'store_true', default=False,
                      help = "If flagged, will plot differential regions")
    parser.add_option("-a","--all", dest="all",action = 'store_true', default=False,
                      help = "If flagged, will run analysis for all enhancers and not just supers.")
    parser.add_option("-m","--median", dest="median",action = 'store_true', default=False,
                      help = "If flagged, will use median enhancer scaling")
    parser.add_option("-e","--enhancer-type", dest="enhancer_type",nargs = 1,default='super',
                      help = "specify type of enhancer to analyze: super, stretch, superStretch")

    (options,args) = parser.parse_args()

    print(options)
    print(args)

    # all five flags are required; otherwise fall through to print_help below
    if options.genome and options.data and options.rose and options.output and options.names:
        genome = string.upper(options.genome)
        dataFile = options.data

        # exactly two rose folders / two names are expected (comma separated)
        roseFolderString = options.rose
        [roseFolder1,roseFolder2] = roseFolderString.split(',')
        parentFolder = utils.formatFolder(options.output,True)


        nameString = options.names
        [name1,name2] =nameString.split(',')

        mergeName = "%s_%s_merged" % (name1,name2)

        #option for median scaling
        medianScale = options.median

        plotBam = options.plot
        if options.all:
            superOnly = False
        else:
            superOnly = True

        # announce the chosen mode
        if superOnly and plotBam:
            print "Running dynamic enhancer analysis on all super enhancers in %s and %s and plotting output to %s" % (name1,name2,parentFolder)
        if superOnly and not plotBam:
            print "Running dynamic enhancer analysis on all super enhancers in %s and %s and writing output to %s" % (name1,name2,parentFolder)
        if not superOnly and plotBam:
            print "Running dynamic enhancer analysis on all enhancers in %s and %s and plotting output to %s. WARNING: Plotting all differential enhancers could take a while" % (name1,name2,parentFolder)
        if not superOnly and not plotBam:
            print "Running dynamic enhancer analysis on all enhancers in %s and %s and writing output to %s." % (name1,name2,parentFolder)

        #part 1
        print "PART1: analyzing ROSE output from %s and %s" % (name1,name2)
        #start with the all enhancer tables from the initial rose calls

        roseFolder1 = pipeline_dfci.formatFolder(roseFolder1,False)
        roseFolder2 = pipeline_dfci.formatFolder(roseFolder2,False)

        # index the files present in each rose folder by type
        roseDict1 = makeRoseDict(roseFolder1)
        roseDict2 = makeRoseDict(roseFolder2)


        #choosing the type of enhancer to analyze
        enhancerCallType = string.lower(options.enhancer_type)
        if superOnly:
            print("ANALYZING ENHANCER TYPE: %s" % (string.upper(enhancerCallType)))
        superFile1 = roseDict1[enhancerCallType]
        superFile2 = roseDict2[enhancerCallType]

        allFile1 = roseDict1['AllEnhancer']
        allFile2 = roseDict2['AllEnhancer']

        print('\tMERGING ENHANCERS AND CALLING ROSE')
        if superOnly:
            # both datasets must have an enhancer table of the requested type
            if len(superFile1) ==0:
                print "ERROR: UNABLE TO FIND %s FILES IN %s" % (enhancerCallType,roseFolder1)
                sys.exit()
            if len(superFile2) == 0:
                print "ERROR: UNABLE TO FIND %s FILES IN %s" % (enhancerCallType,roseFolder2)
                sys.exit()
            roseOutput = callMergeSupers(dataFile,superFile1,superFile2,name1,name2,mergeName,genome,parentFolder)

        else:

            roseOutput = callMergeSupers(dataFile,allFile1,allFile2,name1,name2,mergeName,genome,parentFolder)



        print('\tCALCULATING ENHANCER DELTA AND MAKING PLOTS')

        #part2 is the R script
        # the merged gff name must match what callMergeSupers wrote to disk
        mergedGFFFile = '%s%s_%s_MERGED_REGIONS_-0_+0.gff' % (parentFolder,string.upper(genome),mergeName)
        rcmd = callDeltaRScript(mergedGFFFile,parentFolder,dataFile,name1,name2,allFile1,allFile2,medianScale)
        print(rcmd)
        os.system(rcmd)

        # presumably waiting for the R script's output to land on disk — confirm
        time.sleep(30)
        callRoseGeneMapper(mergedGFFFile,genome,parentFolder,name1)

        #rank the genes


        #part 3
        #rank the delta
        print "PART 3: assinging ranks to differential enhancers"
        print('\tASSIGNING SUPER RANK TO MERGED ENHANCERS')

        # path is derived from the gene-mapper output naming convention
        gffName = '%s_%s_MERGED_REGIONS_-0_+0' % (string.upper(genome),mergeName)
        enhancerToGeneFile = "%s%s_ROSE/%s_0KB_STITCHED_ENHANCER_DELTA_ENHANCER_TO_GENE_100KB.txt" % (parentFolder,name1,gffName)
        if utils.checkOutput(enhancerToGeneFile):
            rankOutput = "%s%s_ROSE/%s_0KB_STITCHED_ENHANCER_DELTA_ENHANCER_TO_GENE_100KB_RANK.txt" % (parentFolder,name1,gffName)
            assignEnhancerRank(enhancerToGeneFile,allFile1,allFile2,name1,name2,rankOutput)
        else:
            print('ERROR: DELTA SCRIPT OR ROSE GENE MAPPER FAILED TO RUN')
            sys.exit()

        #make the rank plot
        print('MAKING RANK PLOTS')
        if utils.checkOutput(rankOutput):
            rcmd = callRankRScript(rankOutput,name1,name2,superFile1,superFile2)
            print(rcmd)
            os.system(rcmd)
        else:
            print('ERROR: RANK PLOT SCRIPT FAILED TO RUN')
            sys.exit()

        # presumably waiting for the rank plot R script to finish — confirm
        time.sleep(30)

        print('FINISHING OUTPUT')
        finishRankOutput(dataFile,rankOutput,genome,parentFolder,mergeName,name1,name2,1,100000,superOnly,plotBam)
    else:
        parser.print_help()
        sys.exit()
Пример #42
0
def main():



    '''
    main run function
    '''

    from optparse import OptionParser

    usage = "usage: %prog [options] -g [GENOME] -d [DATAFILE] -r [ROSE_FOLDERS] -o [OUTPUT_FOLDER]"
    parser = OptionParser(usage = usage)
    #required flags
    parser.add_option("-g","--genome", dest="genome",nargs = 1, default=None,
                      help = "Enter the genome build (HG18,HG19,MM9) for the project")
    parser.add_option("-d","--data", dest="data",nargs = 1, default=None,
                      help = "Enter the data file for the project")
    parser.add_option("-r","--rose", dest="rose",nargs = 1, default=None,
                      help = "Enter a comma separated list of rose folder")
    parser.add_option("-o","--output", dest="output",nargs = 1, default=None,
                      help = "Enter the output folder for the project")

    #additional options
    parser.add_option("-n","--names", dest="names",nargs = 1, default=None,
                      help = "Enter a comma separated list of names to go with the datasets")
    parser.add_option("-p","--plot", dest="plot",action = 'store_true', default=False,
                      help = "If flagged, will plot differential regions")
    parser.add_option("-a","--all", dest="all",action = 'store_true', default=False,
                      help = "If flagged, will run analysis for all enhancers and not just supers.")

    (options,args) = parser.parse_args()

    print(options)
    print(args)
    
    if options.genome and options.data and options.rose and options.output:
        genome = string.upper(options.genome)
        dataFile = options.data

        roseFolderString = options.rose
        [roseFolder1,roseFolder2] = roseFolderString.split(',')
        parentFolder = utils.formatFolder(options.output,True)
        
        if options.names:
            nameString = options.names
            [name1,name2] =nameString.split(',')
        else:
            name1 = roseFolder1.split('/')[-1]
            name1 = string.replace(name1,'_ROSE','')

            name2 = roseFolder2.split('/')[-1]
            name2 = string.replace(name2,'_ROSE','')

        mergeName = "%s_%s_merged" % (name1,name2)

        plotBam = options.plot
        if options.all:
            superOnly = False
        else:
            superOnly = True

        if superOnly and plotBam:
            print "Running dynamic enhancer analysis on all super enhancers in %s and %s and plotting output to %s" % (name1,name2,parentFolder)
        if superOnly and not plotBam:
            print "Running dynamic enhancer analysis on all super enhancers in %s and %s and writing output to %s" % (name1,name2,parentFolder)
        if not superOnly and plotBam:
            print "Running dynamic enhancer analysis on all enhancers in %s and %s and plotting output to %s. WARNING: Plotting all differential enhancers could take a while" % (name1,name2,parentFolder)
        if not superOnly and not plotBam:
            print "Running dynamic enhancer analysis on all enhancers in %s and %s and writing output to %s." % (name1,name2,parentFolder)

        #part 1
        print "PART1: analyzing ROSE output from %s and %s" % (name1,name2)
        #start with the all enhancer tables from the initial rose calls
        roseFolder1 = pipeline_dfci.formatFolder(roseFolder1,False)
        roseFolder2 = pipeline_dfci.formatFolder(roseFolder2,False)
        superFile1 = '%s%s_peaks_SuperEnhancers.table.txt' % (roseFolder1,name1)
        superFile2 = '%s%s_peaks_SuperEnhancers.table.txt' % (roseFolder2,name2)

        allFile1 = '%s/%s_peaks_AllEnhancers.table.txt' % (roseFolder1,name1)
        allFile2 = '%s/%s_peaks_AllEnhancers.table.txt' % (roseFolder2,name2)

        print('\tMERGING ENHANCERS AND CALLING ROSE')
        if superOnly:
            mergedGFFFile = '%s%s_%s_MERGED_SUPERS_-0_+0.gff' % (parentFolder,string.upper(genome),mergeName)
            #callMergeSupers(dataFile,superFile1,superFile2,name1,name2,mergedGFFFile,parentFolder)

        else:
            mergedGFFFile = '%s%s_%s_MERGED_ENHANCERS_-0_+0.gff' % (parentFolder,string.upper(genome),mergeName)
            #callMergeSupers(dataFile,allFile1,allFile2,name1,name2,mergedGFFFile,parentFolder)


        if superOnly:
            superOutput = "%s%s_ROSE/%s_%s_MERGED_SUPERS_-0_+0_SuperEnhancers_ENHANCER_TO_GENE.txt" % (parentFolder,name1,string.upper(genome),mergeName)
        else:
            superOutput = "%s%s_ROSE/%s_%s_MERGED_ENHANCERS_-0_+0_SuperEnhancers_ENHANCER_TO_GENE.txt" % (parentFolder,name1,string.upper(genome),mergeName)

        print('\tCALCULATING ENHANCER DELTA AND MAKING PLOTS')
        if utils.checkOutput(superOutput):
            #part2 is the R script
            rcmd = callDeltaRScript(mergedGFFFile,parentFolder,name1,name2)
            print(rcmd) 
            os.system(rcmd)
            time.sleep(30)
            callRoseGeneMapper(mergedGFFFile,genome,parentFolder,name1)
        else:
            print('ERROR: ROSE CALL FAILED')
            sys.exit()

        #rank the genes


        #part 3
        #rank the delta
        print "PART 3: assinging ranks to differential enhancers"
        print('\tASSIGNING SUPER RANK TO MERGED ENHANCERS')
        if superOnly:
            gffName = '%s_%s_MERGED_SUPERS_-0_+0' % (string.upper(genome),mergeName)
        else:
            gffName = '%s_%s_MERGED_ENHANCERS_-0_+0' % (string.upper(genome),mergeName)
        enhancerToGeneFile = "%s%s_ROSE/%s_0KB_STITCHED_ENHANCER_DELTA_ENHANCER_TO_GENE_100KB.txt" % (parentFolder,name1,gffName)
        if utils.checkOutput(enhancerToGeneFile):
            rankOutput = "%s%s_ROSE/%s_0KB_STITCHED_ENHANCER_DELTA_ENHANCER_TO_GENE_100KB_RANK.txt" % (parentFolder,name1,gffName)
            assignEnhancerRank(enhancerToGeneFile,allFile1,allFile2,name1,name2,rankOutput)
        else:
            print('ERROR: DELTA SCRIPT OR ROSE GENE MAPPER FAILED TO RUN')
            sys.exit()

        #make the rank plot
        print('MAKING RANK PLOTS')
        if utils.checkOutput(rankOutput):
            rcmd = callRankRScript(rankOutput,name1,name2,superFile1,superFile2)
            print(rcmd)
            os.system(rcmd)
        else:
            print('ERROR: RANK PLOT SCRIPT FAILED TO RUN')
            sys.exit()

        time.sleep(30)

        print('FINISHING OUTPUT')
        finishRankOutput(dataFile,rankOutput,genome,parentFolder,mergeName,name1,name2,1,100000,superOnly,plotBam)
    else:
        parser.print_help()
        exit()
Пример #43
0
def mapEnhancerToGeneTop(rankByBamFile, controlBamFile, genome, annotFile, enhancerFile, transcribedFile='', uniqueGenes=True, searchWindow=50000, noFormatTable=False):
    '''
    Maps genes to enhancers and picks the top gene per enhancer by TSS signal.

    For every enhancer in enhancerFile, finds overlapping genes (transcript
    overlaps the enhancer), proximal genes (TSS within searchWindow bp), and
    the closest gene within 1Mb.  TSS signal (+/-5kb) is mapped from
    rankByBamFile (background subtracted with controlBamFile if provided)
    via bamliquidator, and the highest-signal associated gene is reported
    as the TOP_GENE for each enhancer.

    If uniqueGenes is True, the gene table is reduced to one row per gene
    name; otherwise every refseq ID gets its own row.

    Returns a 3-tuple:
        (enhancerToGeneTable, enhancerToTopGeneTable, geneToEnhancerTable)
    sorted by enhancer rank unless noFormatTable is True.
    '''
    startDict = utils.makeStartDict(annotFile)
    enhancerName = enhancerFile.split('/')[-1].split('.')[0]
    enhancerTable = utils.parseTable(enhancerFile, '\t')

    # restrict to transcribed genes if a list is given, otherwise use all
    # genes in the annotation
    if len(transcribedFile) > 0:
        transcribedTable = utils.parseTable(transcribedFile, '\t')
        transcribedGenes = [line[1] for line in transcribedTable]
    else:
        transcribedGenes = startDict.keys()

    print('MAKING TRANSCRIPT COLLECTION')
    transcribedCollection = utils.makeTranscriptCollection(
        annotFile, 0, 0, 500, transcribedGenes)

    print('MAKING TSS COLLECTION')
    tssLoci = []
    for geneID in transcribedGenes:
        tssLoci.append(utils.makeTSSLocus(geneID, startDict, 0, 0))

    # this turns the tssLoci list into a LocusCollection
    # 50 is the internal parameter for LocusCollection and doesn't really
    # matter
    tssCollection = utils.LocusCollection(tssLoci, 50)

    geneDict = {'overlapping': defaultdict(
        list), 'proximal': defaultdict(list)}

    # dictionaries to hold ranks and superstatus of gene nearby enhancers
    rankDict = defaultdict(list)
    superDict = defaultdict(list)

    # list of all genes that appear in this analysis
    overallGeneList = []

    # find the damn header (first non-comment line)
    for line in enhancerTable:
        if line[0][0] == '#':
            continue
        else:
            header = line
            break

    if noFormatTable:
        # set up the output tables
        # first by enhancer; keep the full original row
        enhancerToGeneTable = [
            header + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE']]

    else:
        # set up the output tables
        # first by enhancer; keep only the first 9 columns plus rank/super
        enhancerToGeneTable = [
            header[0:9] + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE'] + header[-2:]]

    # next make the gene to enhancer table
    geneToEnhancerTable = [
        ['GENE_NAME', 'REFSEQ_ID', 'PROXIMAL_ENHANCERS', 'ENHANCER_RANKS', 'IS_SUPER', 'ENHANCER_SIGNAL']]

    for line in enhancerTable:
        # skip comment lines and header rows (ROSE headers start with 'R')
        if line[0][0] == '#' or line[0][0] == 'R':
            continue

        enhancerString = '%s:%s-%s' % (line[1], line[2], line[3])

        enhancerLocus = utils.Locus(line[1], line[2], line[3], '.', line[0])

        # overlapping genes are transcribed genes whose transcript is directly
        # in the stitchedLocus
        overlappingLoci = transcribedCollection.getOverlap(
            enhancerLocus, 'both')
        overlappingGenes = []
        for overlapLocus in overlappingLoci:
            overlappingGenes.append(overlapLocus.ID())

        # proximalGenes are transcribed genes where the tss is within 50kb of
        # the boundary of the stitched loci
        proximalLoci = tssCollection.getOverlap(
            utils.makeSearchLocus(enhancerLocus, searchWindow, searchWindow), 'both')
        proximalGenes = []
        for proxLocus in proximalLoci:
            proximalGenes.append(proxLocus.ID())

        # distal genes have a TSS within 1Mb; only used for closest-gene calls
        distalLoci = tssCollection.getOverlap(
            utils.makeSearchLocus(enhancerLocus, 1000000, 1000000), 'both')
        distalGenes = []
        for proxLocus in distalLoci:
            distalGenes.append(proxLocus.ID())

        overlappingGenes = utils.uniquify(overlappingGenes)
        proximalGenes = utils.uniquify(proximalGenes)
        distalGenes = utils.uniquify(distalGenes)
        allEnhancerGenes = overlappingGenes + proximalGenes + distalGenes
        # these checks make sure each gene list is unique.
        # technically it is possible for a gene to be overlapping, but not proximal since the
        # gene could be longer than the 50kb window, but we'll let that slide
        # here
        for refID in overlappingGenes:
            if proximalGenes.count(refID) == 1:
                proximalGenes.remove(refID)

        for refID in proximalGenes:
            if distalGenes.count(refID) == 1:
                distalGenes.remove(refID)

        # Now find the closest gene
        if len(allEnhancerGenes) == 0:
            closestGene = ''
        else:
            # get enhancerCenter
            enhancerCenter = (int(line[2]) + int(line[3])) / 2

            # get absolute distance to enhancer center
            distList = [abs(enhancerCenter - startDict[geneID]['start'][0])
                        for geneID in allEnhancerGenes]
            # get the ID and convert to name
            closestGene = startDict[
                allEnhancerGenes[distList.index(min(distList))]]['name']

        # NOW WRITE THE ROW FOR THE ENHANCER TABLE
        if noFormatTable:

            newEnhancerLine = list(line)
            newEnhancerLine.append(
                join(utils.uniquify([startDict[x]['name'] for x in overlappingGenes]), ','))
            newEnhancerLine.append(
                join(utils.uniquify([startDict[x]['name'] for x in proximalGenes]), ','))
            newEnhancerLine.append(closestGene)

        else:
            newEnhancerLine = line[0:9]
            newEnhancerLine.append(
                join(utils.uniquify([startDict[x]['name'] for x in overlappingGenes]), ','))
            newEnhancerLine.append(
                join(utils.uniquify([startDict[x]['name'] for x in proximalGenes]), ','))
            newEnhancerLine.append(closestGene)
            newEnhancerLine += line[-2:]

        enhancerToGeneTable.append(newEnhancerLine)
        # Now grab all overlapping and proximal genes for the gene ordered
        # table

        # line[-2] is the enhancer rank, line[-1] the super-enhancer flag
        overallGeneList += overlappingGenes
        for refID in overlappingGenes:
            geneDict['overlapping'][refID].append(enhancerString)
            rankDict[refID].append(int(line[-2]))
            superDict[refID].append(int(line[-1]))

        overallGeneList += proximalGenes
        for refID in proximalGenes:
            geneDict['proximal'][refID].append(enhancerString)
            rankDict[refID].append(int(line[-2]))
            superDict[refID].append(int(line[-1]))

    # End loop through
    # Make table by gene
    print('MAKING ENHANCER ASSOCIATED GENE TSS COLLECTION')
    overallGeneList = utils.uniquify(overallGeneList)

    enhancerGeneCollection = utils.makeTranscriptCollection(
        annotFile, 5000, 5000, 500, overallGeneList)

    enhancerGeneGFF = utils.locusCollectionToGFF(enhancerGeneCollection)

    # dump the gff to file
    enhancerFolder = utils.getParentFolder(enhancerFile)
    gffRootName = "%s_TSS_ENHANCER_GENES_-5000_+5000" % (genome)
    enhancerGeneGFFFile = "%s%s_%s.gff" % (enhancerFolder, enhancerName,gffRootName)
    utils.unParseTable(enhancerGeneGFF, enhancerGeneGFFFile, '\t')

    # now we need to run bamToGFF

    # Try to use the bamliquidatior_path.py script on cluster, otherwise, failover to local (in path), otherwise fail.
    bamliquidator_path = '/ark/home/jdm/pipeline/bamliquidator_batch.py'
    if not os.path.isfile(bamliquidator_path):
        bamliquidator_path = 'bamliquidator_batch.py'
        if not os.path.isfile(bamliquidator_path):
            raise ValueError('bamliquidator_batch.py not found in path')

    print('MAPPING SIGNAL AT ENHANCER ASSOCIATED GENE TSS')
    # map density at genes in the +/- 5kb tss region
    # first on the rankBy bam
    bamName = rankByBamFile.split('/')[-1]
    mappedRankByFolder = "%s%s_%s_%s/" % (enhancerFolder, enhancerName,gffRootName, bamName)
    mappedRankByFile = "%s%s_%s_%s/matrix.gff" % (enhancerFolder,enhancerName, gffRootName, bamName)
    cmd = 'python ' + bamliquidator_path + ' --sense . -e 200 --match_bamToGFF -r %s -o %s %s' % (enhancerGeneGFFFile, mappedRankByFolder,rankByBamFile)
    print("Mapping rankby bam %s" % (rankByBamFile))
    print(cmd)

    outputRank = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
    outputRank = outputRank.communicate()
    if len(outputRank[0]) > 0:  # test if mapping worked correctly
        print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (enhancerGeneGFFFile, rankByBamFile))
    else:
        print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (enhancerGeneGFFFile, rankByBamFile))
        sys.exit()

    # next on the control bam if it exists
    if len(controlBamFile) > 0:
        controlName = controlBamFile.split('/')[-1]
        mappedControlFolder = "%s%s_%s_%s/" % (
            enhancerFolder, enhancerName,gffRootName, controlName)
        mappedControlFile = "%s%s_%s_%s/matrix.gff" % (
            enhancerFolder, enhancerName,gffRootName, controlName)
        cmd = 'python ' + bamliquidator_path + ' --sense . -e 200 --match_bamToGFF -r %s -o %s %s' % (enhancerGeneGFFFile, mappedControlFolder,controlBamFile)
        print("Mapping control bam %s" % (controlBamFile))
        print(cmd)
        outputControl = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
        outputControl = outputControl.communicate()
        if len(outputControl[0]) > 0:  # test if mapping worked correctly
            print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (enhancerGeneGFFFile, controlBamFile))
        else:
            print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (enhancerGeneGFFFile, controlBamFile))
            sys.exit()

    # now get the appropriate output files
    if len(controlBamFile) > 0:
        print("CHECKING FOR MAPPED OUTPUT AT %s AND %s" %
              (mappedRankByFile, mappedControlFile))
        if utils.checkOutput(mappedRankByFile, 1, 1) and utils.checkOutput(mappedControlFile, 1, 1):
            print('MAKING ENHANCER ASSOCIATED GENE TSS SIGNAL DICTIONARIES')
            signalDict = makeSignalDict(mappedRankByFile, mappedControlFile)
        else:
            print("NO MAPPING OUTPUT DETECTED")
            sys.exit()
    else:
        print("CHECKING FOR MAPPED OUTPUT AT %s" % (mappedRankByFile))
        if utils.checkOutput(mappedRankByFile, 1, 30):
            print('MAKING ENHANCER ASSOCIATED GENE TSS SIGNAL DICTIONARIES')
            signalDict = makeSignalDict(mappedRankByFile)
        else:
            print("NO MAPPING OUTPUT DETECTED")
            sys.exit()

    # use enhancer rank to order; genes with the best (lowest) rank first

    rankOrder = utils.order([min(rankDict[x]) for x in overallGeneList])

    usedNames = []

    # make a new dict to hold TSS signal by max per geneName
    geneNameSigDict = defaultdict(list)
    print('MAKING GENE TABLE')
    for i in rankOrder:
        refID = overallGeneList[i]
        geneName = startDict[refID]['name']
        # in uniqueGenes mode, only keep the best-ranked refseq per gene name
        if usedNames.count(geneName) > 0 and uniqueGenes == True:
            continue
        else:
            usedNames.append(geneName)

        proxEnhancers = geneDict['overlapping'][
            refID] + geneDict['proximal'][refID]

        superStatus = max(superDict[refID])
        enhancerRanks = join([str(x) for x in rankDict[refID]], ',')

        enhancerSignal = signalDict[refID]
        geneNameSigDict[geneName].append(enhancerSignal)

        newLine = [geneName, refID, join(
            proxEnhancers, ','), enhancerRanks, superStatus, enhancerSignal]
        geneToEnhancerTable.append(newLine)
    #utils.unParseTable(geneToEnhancerTable,'/grail/projects/newRose/geneMapper/foo.txt','\t')
    print('MAKING ENHANCER TO TOP GENE TABLE')

    if noFormatTable:
        enhancerToTopGeneTable = [
            enhancerToGeneTable[0] + ['TOP_GENE', 'TSS_SIGNAL']]
    else:
        enhancerToTopGeneTable = [enhancerToGeneTable[0][0:12] + [
            'TOP_GENE', 'TSS_SIGNAL'] + enhancerToGeneTable[0][-2:]]

    for line in enhancerToGeneTable[1:]:

        # collect the overlap + proximal gene names for this enhancer row
        geneList = []
        if noFormatTable:
            geneList += line[-3].split(',')
            geneList += line[-2].split(',')

        else:
            geneList += line[10].split(',')
            geneList += line[11].split(',')

        geneList = utils.uniquify([x for x in geneList if len(x) > 0])
        if len(geneList) > 0:
            try:
                # pick the gene with the strongest TSS signal
                sigVector = [max(geneNameSigDict[x]) for x in geneList]
                maxIndex = sigVector.index(max(sigVector))
                maxGene = geneList[maxIndex]
                maxSig = sigVector[maxIndex]
                if maxSig == 0.0:
                    maxGene = 'NONE'
                    maxSig = 'NONE'
            except ValueError:
                # a gene in geneList had no signal entry (empty max());
                # fall back sensibly
                if len(geneList) == 1:
                    maxGene = geneList[0]
                    maxSig = 'NONE'    
                else:
                    maxGene = 'NONE'
                    maxSig = 'NONE'    
        else:
            maxGene = 'NONE'
            maxSig = 'NONE'
        if noFormatTable:
            newLine = line + [maxGene, maxSig]
        else:
            newLine = line[0:12] + [maxGene, maxSig] + line[-2:]
        enhancerToTopGeneTable.append(newLine)

    # resort enhancerToGeneTable by enhancer rank (column -2)
    if noFormatTable:
        return enhancerToGeneTable, enhancerToTopGeneTable, geneToEnhancerTable
    else:
        enhancerOrder = utils.order([int(line[-2])
                                    for line in enhancerToGeneTable[1:]])
        sortedTable = [enhancerToGeneTable[0]]
        sortedTopGeneTable = [enhancerToTopGeneTable[0]]
        for i in enhancerOrder:
            sortedTable.append(enhancerToGeneTable[(i + 1)])
            sortedTopGeneTable.append(enhancerToTopGeneTable[(i + 1)])

        return sortedTable, sortedTopGeneTable, geneToEnhancerTable
Пример #44
0
def main():
    '''
    Main run method for the enhancer promoter contribution tool.

    Parses arguments, splits input regions into TSS-proximal and distal
    elements, maps bam signal to both, builds per-gene signal tables,
    produces waterfall plots, and runs GSEA on the resulting rankings.
    '''

    parser = argparse.ArgumentParser(usage='%(prog)s [options]')

    # required flags
    parser.add_argument(
        "-b",
        "--bam",
        dest="bam",
        nargs='*',
        help="Enter a space separated list of .bam files for the main factor",
        required=True)
    parser.add_argument("-i",
                        "--input",
                        dest="input",
                        type=str,
                        help="Enter .gff or .bed file of regions to analyze",
                        required=True)
    parser.add_argument(
        "-g",
        "--genome",
        dest="genome",
        type=str,
        help=
        "specify a genome, HG18,HG19,HG38,MM8,MM9,MM10,RN6 are currently supported",
        required=True)

    # output flag
    parser.add_argument("-o",
                        "--output",
                        dest="output",
                        type=str,
                        help="Enter the output folder.",
                        required=True)

    # additional options flags and optional arguments
    parser.add_argument(
        "-a",
        "--activity",
        dest="activity",
        type=str,
        help=
        "specify a table where first column represents a list of active refseq genes",
        required=False)

    parser.add_argument(
        "-c",
        "--control",
        dest="control",
        nargs='*',
        help=
        "Enter a space separated list of .bam files for background. If flagged, will perform background subtraction",
        required=False)
    parser.add_argument(
        "-w",
        "--window",
        dest="window",
        type=int,
        help=
        "Enter a window to define the TSS area +/- the TSS. Default is 1kb",
        required=False,
        default=1000)
    parser.add_argument(
        "--other-bams",
        dest="other",
        nargs='*',
        help="enter a space separated list of other bams to map to",
        required=False)

    parser.add_argument(
        "--name",
        dest="name",
        type=str,
        help=
        "enter a root name for the analysis, otherwise will try to find the name from the input file",
        required=False)

    parser.add_argument(
        "--top",
        dest="top",
        type=int,
        help=
        "Run the analysis on the top N genes by total signal. Default is 5000",
        required=False,
        default=5000)
    parser.add_argument(
        "--tads",
        dest="tads",
        type=str,
        help=
        "Include a .bed of tad regions to restrict enhancer/gene association",
        required=False,
        default=None)

    args = parser.parse_args()

    print(args)

    #minimum arguments needed to proceed
    if args.bam and args.input and args.genome and args.output:

        #=====================================================================================
        #===============================I. PARSING ARGUMENTS==================================
        #=====================================================================================

        print(
            '\n\n#======================================\n#===========I. DATA SUMMARY============\n#======================================\n'
        )

        #top analysis subset
        top = args.top

        #input genome
        genome = args.genome.upper()
        print('PERFORMING ANALYSIS ON %s GENOME BUILD' % (genome))

        #set of bams
        bamFileList = args.bam

        #bring in the input path
        inputPath = args.input

        #try to get the input name or use the name argument
        if args.name:
            analysisName = args.name
        else:
            analysisName = inputPath.split('/')[-1].split('.')[0]

        print('USING %s AS ANALYSIS NAME' % (analysisName))
        #setting up the output folder
        parentFolder = utils.formatFolder(args.output, True)
        outputFolder = utils.formatFolder(
            '%s%s' % (parentFolder, analysisName), True)

        print('WRITING OUTPUT TO %s' % (outputFolder))

        if inputPath.split('.')[-1] == 'bed':
            #type is bed
            print('input in bed format, converting to gff')
            inputGFF = utils.bedToGFF(inputPath)
        else:
            inputGFF = utils.parseTable(inputPath, '\t')

        #the tss window
        window = int(args.window)

        #activity path
        if args.activity:
            activityPath = args.activity
            activityTable = utils.parseTable(activityPath, '\t')

            #try to find the column for refseq id; fail loudly instead of
            #raising a NameError below if no NM_/NR_ column exists
            ref_col = None
            for i in range(len(activityTable[0])):
                if str(activityTable[0][i]).count('NM_') > 0 or str(
                        activityTable[0][i]).count('NR_') > 0:
                    ref_col = i

            if ref_col is None:
                print('ERROR: UNABLE TO FIND A REFSEQ (NM_/NR_) COLUMN IN %s' %
                      (activityPath))
                sys.exit()

            geneList = [line[ref_col] for line in activityTable
                        ]  # this needs to be REFSEQ NM ID
            print('IDENTIFIED %s ACTIVE GENES' % (len(geneList)))

        else:
            geneList = []

        #check if tads are being invoked
        if args.tads:
            print('LOADING TAD LOCATIONS FROM %s' % (args.tads))
            use_tads = True
            tads_path = args.tads
        else:
            use_tads = False
            tads_path = ''

        print('LOADING ANNOTATION DATA FOR GENOME %s' % (genome))

        #important here to define the window
        startDict, tssCollection, genomeDirectory, chrom_list, mouse_convert_dict = loadAnnotFile(
            genome, window, geneList, True)
        #print(tssCollection.getOverlap(utils.Locus('chr5',171387630,171388066,'.')))
        #sys.exit()

        print('FILTERING THE INPUT GFF FOR GOOD CHROMOSOMES')

        print(chrom_list)
        filtered_gff = [
            line for line in inputGFF if chrom_list.count(line[0]) > 0
        ]

        print('%s of INITIAL %s REGIONS ARE IN GOOD CHROMOSOMES' %
              (len(filtered_gff), len(inputGFF)))

        #=====================================================================================
        #================II. IDENTIFYING TSS PROXIMAL AND DISTAL ELEMENTS=====================
        #=====================================================================================

        print(
            '\n\n#======================================\n#==II. MAPPING TO TSS/DISTAL REGIONS===\n#======================================\n'
        )

        #now we need to split the input region
        print('SPLITTING THE INPUT GFF USING A WINDOW OF %s' % (window))
        splitGFF = splitRegions(filtered_gff, tssCollection)
        print(len(filtered_gff))
        print(len(splitGFF))

        splitGFFPath = '%s%s_SPLIT.gff' % (outputFolder, analysisName)
        utils.unParseTable(splitGFF, splitGFFPath, '\t')
        print('WRITING TSS SPLIT GFF OUT TO %s' % (splitGFFPath))

        #now you have to map the bams to the gff
        print('MAPPING TO THE SPLIT GFF')
        mappedFolder = utils.formatFolder('%sbam_mapping' % (outputFolder),
                                          True)

        signalTable = mapBams(bamFileList, splitGFFPath, analysisName,
                              mappedFolder)
        signalTablePath = '%s%s_signal_table.txt' % (outputFolder,
                                                     analysisName)
        utils.unParseTable(signalTable, signalTablePath, '\t')

        if args.control:
            controlBamFileList = args.control
            controlSignalTable = mapBams(controlBamFileList, splitGFFPath,
                                         analysisName, mappedFolder)
            controlSignalTablePath = '%s%s_control_signal_table.txt' % (
                outputFolder, analysisName)
            utils.unParseTable(controlSignalTable, controlSignalTablePath,
                               '\t')

        #now create the background subtracted summarized average table

        print('CREATING AN AVERAGE SIGNAL TABLE')
        averageTable = makeAverageTable(outputFolder,
                                        analysisName,
                                        useBackground=args.control)
        averageTablePath = '%s%s_average_table.txt' % (outputFolder,
                                                       analysisName)
        utils.unParseTable(averageTable, averageTablePath, '\t')

        #now load up all of the cpg and other parameters to make the actual peak table

        #first check if this has already been done
        peakTablePath = '%s%s_PEAK_TABLE.txt' % (outputFolder, analysisName)
        if utils.checkOutput(peakTablePath, 0.1, 0.1):
            print('PEAK TABLE OUTPUT ALREADY EXISTS')
            peakTable = utils.parseTable(peakTablePath, '\t')
        else:
            #NOTE(review): paramDict is not defined in this function --
            #presumably a module-level parameter dictionary; verify
            peakTable = makePeakTable(paramDict, splitGFFPath,
                                      averageTablePath, startDict, geneList,
                                      genomeDirectory, tads_path)
            utils.unParseTable(peakTable, peakTablePath, '\t')

        geneTable = makeGeneTable(peakTable, analysisName)

        geneTablePath = '%s%s_GENE_TABLE.txt' % (outputFolder, analysisName)
        utils.unParseTable(geneTable, geneTablePath, '\t')

        #if mouse, need to convert genes over
        if genome.count('MM') == 1:
            print('CONVERTING MOUSE NAMES TO HUMAN HOMOLOGS FOR GSEA')
            converted_geneTablePath = '%s%s_GENE_TABLE_CONVERTED.txt' % (
                outputFolder, analysisName)

            converted_geneTable = [geneTable[0]]
            for line in geneTable[1:]:
                converted_name = mouse_convert_dict[line[0]]
                if len(converted_name) > 0:
                    converted_geneTable.append([converted_name] + line[1:])

            #write the converted table once, after the full conversion loop
            #(previously rewritten on every converted row, and never written
            #at all if no gene converted)
            utils.unParseTable(converted_geneTable, converted_geneTablePath,
                               '\t')

            geneTablePath = converted_geneTablePath
            geneTable = converted_geneTable

        #=====================================================================================
        #===================================III. PLOTTING ====================================
        #=====================================================================================

        print(
            '\n\n#======================================\n#===III. PLOTTING ENHANCER/PROMOTER===\n#======================================\n'
        )

        #if there are fewer genes in the gene table than the top genes, only run on all
        if len(geneTable) < int(top):
            print(
                'WARNING: ONLY %s GENES WITH SIGNAL AT EITHER PROMOTERS OR ENHANCERS. NOT ENOUGH TO RUN ANALYSIS ON TOP %s'
                % (len(geneTable) - 1, top))
            top = 0
            use_top = False
        else:
            use_top = True

        #now call the R code
        print('CALLING R PLOTTING SCRIPTS')
        callRWaterfall(geneTablePath, outputFolder, analysisName, top)

        #=====================================================================================
        #==================================IV. RUNNING GSEA===================================
        #=====================================================================================

        print(
            '\n\n#======================================\n#============IV. RUNNING GSEA=========\n#======================================\n'
        )

        #now let's call gsea
        print('RUNNING GSEA ON C2')
        callGSEA(outputFolder, analysisName, top, 'enhancer_vs_promoter',
                 use_top)
        callGSEA(outputFolder, analysisName, top, 'total_contribution',
                 use_top)

        if use_top:
            print('DETECTING GSEA OUTPUT FOR TOP %s GENES' % (top))
            #for top by enhancer v promoter metric
            top_promoterTablePath, top_distalTablePath = detectGSEAOutput(
                analysisName, outputFolder, top, 'enhancer_vs_promoter')
            top_signalTablePath, top_backgroundTablePath = detectGSEAOutput(
                analysisName, outputFolder, top, 'total_contribution')

            print('MAKING NES PLOTS FOR TOP %s GENES' % (top))
            callR_GSEA(top_promoterTablePath, top_distalTablePath,
                       outputFolder, analysisName + '_enhancer_vs_promoter',
                       top)
            callR_GSEA(top_signalTablePath, top_backgroundTablePath,
                       outputFolder, analysisName + '_total_contribution', top)

        print('DETECTING GSEA OUTPUT FOR ALL GENES')
        #for top
        all_promoterTablePath, all_distalTablePath = detectGSEAOutput(
            analysisName, outputFolder, 'all')

        print('MAKING NES PLOTS FOR ALL GENES')
        callR_GSEA(all_promoterTablePath, all_distalTablePath, outputFolder,
                   analysisName, 'all')

        #these files can be parsed to make the NES plot

        #[x for x in fileList if x.count('report_for') == 1and x.count('xls') ==1]
        print('ALL DONE WITH ANALYSIS FOR %s' % (analysisName))
Пример #45
0
def main():
    '''
    Command line entry point for bamPlot.

    Parses options, loads the plotting regions (a .gff file or a single
    chr:strand:start-end coordinate string), maps each bam across the
    regions, and renders pdf plots through an R script. Requires the
    -b, -i, -g and -o flags; everything else is optional.
    '''

    from optparse import OptionParser

    usage = "usage: %prog [options] -g [GENOME] -b [SORTED BAMFILE(S)] -i [INPUTFILE] -o [OUTPUTFOLDER]"
    parser = OptionParser(usage=usage)

    # required flags
    parser.add_option("-b", "--bam", dest="bam", nargs=1, default=None,
                      help="Enter a comma separated list of .bam files to be processed.")
    parser.add_option("-i", "--input", dest="input", nargs=1, default=None,
                      help="Enter .gff or genomic region e.g. chr1:+:1-1000.")
    parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None,
                      help="specify a genome, HG18,HG19,MM9,RN5 are currently supported")

    # output flag
    parser.add_option("-o", "--output", dest="output", nargs=1, default=None,
                      help="Enter the output folder.")
    # additional options
    parser.add_option("-c", "--color", dest="color", nargs=1, default=None,
                      help="Enter a colon separated list of colors e.g. 255,0,0:255,125,0, default samples the rainbow")
    parser.add_option("-s", "--sense", dest="sense", nargs=1, default='both',
                      help="Map to '+','-' or 'both' strands. Default maps to both.")
    parser.add_option("-e", "--extension", dest="extension", nargs=1, default=200,
                      help="Extends reads by n bp. Default value is 200bp")
    parser.add_option("-r", "--rpm", dest="rpm", action='store_true', default=False,
                      help="Normalizes density to reads per million (rpm) Default is False")
    parser.add_option("-y", "--yScale", dest="yScale", nargs=1, default="relative",
                      help="Choose either relative or uniform y axis scaling. options = 'relative,uniform' Default is relative scaling")
    parser.add_option("-n", "--names", dest="names", nargs=1, default=None,
                      help="Enter a comma separated list of names for your bams")
    parser.add_option("-p", "--plot", dest="plot", nargs=1, default="MULTIPLE",
                      help="Choose either all lines on a single plot or multiple plots. options = 'SINGLE,MULTIPLE'")
    parser.add_option("-t", "--title", dest="title", nargs=1, default='',
                      help="Specify a title for the output plot(s), default will be the coordinate region")

    # DEBUG OPTION TO SAVE TEMP FILES
    parser.add_option("--save-temp", dest="save", action='store_true', default=False,
                      help="If flagged will save temporary files made by bamPlot")
    parser.add_option("--bed", dest="bed", nargs=1, default=None,
                      help="Add a comma separated list of bed files to plot")

    (options, args) = parser.parse_args()

    print(options)
    print(args)

    # all four required flags must be present; otherwise fall through to help
    if options.bam and options.input and options.genome and options.output:

        # bring in the bams
        bamFileList = options.bam.split(',')

        # bringing in any beds; fall back to an empty collection when none given
        if options.bed:

            bedFileList = options.bed.split(',')
            bedCollection = makeBedCollection(bedFileList)
        else:
            bedCollection = utils.LocusCollection([], 50)

        # bring in the regions: try the input as a gff file first
        try:
            gff = utils.parseTable(options.input, '\t')
            gffName = options.input.split('/')[-1].split('.')[0]
        except IOError:
            # means a coordinate line has been given e.g. chr1:+:1-100

            chromLine = options.input.split(':')

            chrom = chromLine[0]
            sense = chromLine[1]
            [start, end] = chromLine[2].split('-')
            if chrom[0:3] != 'chr':
                print('ERROR: UNRECOGNIZED GFF OR CHROMOSOME LINE INPUT')
                exit()
            # build a single-line gff entry for the given coordinates
            gffLine = [chrom, '', options.input, start, end, '', sense, '', '']
            gffName = "%s_%s_%s_%s" % (chrom, sense, start, end)
            gff = [gffLine]

        # bring in the genome and check it is supported
        genome = options.genome.upper()
        if ['HG18', 'HG19', 'MM9', 'RN5'].count(genome) == 0:
            print('ERROR: UNSUPPORTED GENOME TYPE %s. USE HG19,HG18, RN5, OR MM9' % (genome))
            parser.print_help()
            exit()

        # bring in the rest of the options

        # output folder must already exist
        rootFolder = options.output
        if rootFolder[-1] != '/':
            rootFolder += '/'
        try:
            os.listdir(rootFolder)
        except OSError:
            print('ERROR: UNABLE TO FIND OUTPUT DIRECTORY %s' % (rootFolder))
            exit()

        # Get analysis title; default to the gff name
        if len(options.title) == 0:
            title = gffName
        else:
            title = options.title

        # make a temp folder to hold intermediate tables
        tempFolder = rootFolder + title + '/'
        print("CREATING TEMP FOLDER %s" % (tempFolder))
        pipeline_dfci.formatFolder(tempFolder, create=True)

        # colors: user supplied (recycled if too few) or rainbow-sampled
        if options.color:
            colorList = options.color.split(':')
            colorList = [x.split(',') for x in colorList]
            if len(colorList) < len(bamFileList):
                print('WARNING: FEWER COLORS THAN BAMS SPECIFIED. COLORS WILL BE RECYCLED')
                # recycling the color list; // keeps this integer math correct
                # under python 3 as well (list * float is a TypeError there)
                colorList += colorList * (len(bamFileList) // len(colorList))
                colorList = colorList[0:len(bamFileList)]

        else:
            # cycles through the colors of the rainbow
            colorList = tasteTheRainbow(len(bamFileList))

        # sense
        sense = options.sense

        extension = int(options.extension)

        rpm = options.rpm

        yScale = options.yScale.upper()

        # names: user supplied (must match bam count) or derived from filenames
        if options.names:
            names = options.names.split(',')

            if len(names) != len(bamFileList):
                print('ERROR: NUMBER OF NAMES AND NUMBER OF BAMS DO NOT CORRESPOND')
                parser.print_help()
                exit()
        else:
            names = [x.split('/')[-1] for x in bamFileList]

        # plot style
        plotStyle = options.plot.upper()
        if ['SINGLE', 'MULTIPLE'].count(plotStyle) == 0:
            print('ERROR: PLOT STYLE %s NOT AN OPTION' % (plotStyle))
            parser.print_help()
            exit()

        # now run!
        # NOTE(review): nBins is assumed to be a module-level constant — confirm
        summaryTableFileName = makeBamPlotTables(gff, genome, bamFileList, colorList, nBins, sense, extension, rpm, tempFolder, names, title, bedCollection)
        print ("%s is the summary table" % (summaryTableFileName))

        outFile = "%s%s_plots.pdf" % (rootFolder, title)
        rCmd = callRPlot(summaryTableFileName, outFile, yScale, plotStyle)

        # write the R command to a bash script and execute it
        bashFileName = "%s%s_Rcmd.sh" % (tempFolder, title)
        bashFile = open(bashFileName, 'w')
        bashFile.write('#!/usr/bin/bash\n')
        bashFile.write(rCmd)
        bashFile.close()
        print("Wrote R command to %s" % (bashFileName))
        os.system("bash %s" % (bashFileName))

        # delete temp files only once the pdf output is confirmed to exist
        if not options.save:
            if utils.checkOutput(outFile, 1, 10):
                removeCommand = "rm -rf %s" % (tempFolder)
                print(removeCommand)
                os.system(removeCommand)
            else:
                print("ERROR: NO OUTPUT FILE %s DETECTED" % (outFile))

    else:
        parser.print_help()
        sys.exit()
Пример #46
0
def main():
    '''
    Command line entry point for the GeCKO screen fold-change analysis.

    Takes a test bam and a control bam, computes million-mapped-read (MMR)
    normalization factors, runs samtools idxstats on each bam, builds a
    guide-level fold table and a RIGER input table, then launches RIGER.
    Requires -t, -c and -g; name/scoring/output are optional.
    '''

    from optparse import OptionParser

    usage = "usage: %prog [options] -t [TEST_BAM] -c [CONTROL_BAM] -g [GENOME]"
    parser = OptionParser(usage = usage)
    #required flags
    parser.add_option("-t","--test", dest="test",nargs = 1, default=None,
                      help = "Enter the full path of the test bam")
    parser.add_option("-c","--control", dest="control",nargs = 1, default=None,
                      help = "Enter the full path of the control bam")
    parser.add_option("-g","--genome", dest="genome",nargs = 1, default=None,
                      help = "Enter the build for the GeCKO library (currently only supports geckov2)")


    #optional arguments
    parser.add_option("-n","--name",dest="name",nargs =1, default = 0,
                      help = "Comma separated test,control name")
    parser.add_option("-s","--scoring",dest="scoring",nargs =1, default = 'WtSum',
                      help = "Scoring method (KSbyScore,WtSum,SecondBestRank) default: WtSum")
    parser.add_option("-o","--output", dest="output",nargs = 1, default=None,
                      help = "Enter the full path of the output folder. Default is the current working directory")


    (options,args) = parser.parse_args()

    #three required parameters to get started
    if options.test and options.control and options.genome:

        #get the names of the datasets
        if options.name:
            if len(options.name.split(',')) == 2:
                [testName,controlName] = options.name.split(',')
            else:
                print("ERROR: Must provide a comma separated test,control name if using -n flag")
                parser.print_help()
                sys.exit()
        else:
            #try to extract names from file
            #strip extension from filename
            testName = options.test.split('/')[-1].split('.')[0]
            controlName = options.control.split('/')[-1].split('.')[0]

        #names
        print("using %s as name for test dataset" % (testName))
        print("using %s as name for control dataset" % (controlName))

        #get the analysis name
        analysisName = '%s_%s' % (testName,controlName)
        print("using %s as analysis name" % (analysisName))

        #get the scoring method and validate it against the known options
        scoringMethod = options.scoring
        if ['KSbyScore','WtSum','SecondBestRank'].count(scoringMethod)==0:
            print("ERROR: please specify one of the following scoring methods:('KSbyScore','WtSum','SecondBestRank') or leave blank (default WtSum)")
            parser.print_help()
            sys.exit()


        #set up output folder
        if options.output:
            outputFolder = utils.formatFolder(options.output,True)
        else:
            outputFolder = utils.formatFolder('./%s/' % (analysisName),True)

        print("using %s as an output folder" % (outputFolder))

        #get the right annotation
        genomeDict = {'geckov2':'/grail/genomes/gecko/GeCKOv2/Annotation/Human_GeCKOv2_Library.txt',
                      }

        #load the annotation dictionary, failing cleanly on an unknown build
        #instead of raising a raw KeyError
        genomeKey = options.genome.lower()
        if genomeKey not in genomeDict:
            print("ERROR: unsupported genome build %s (currently only supports geckov2)" % (options.genome))
            parser.print_help()
            sys.exit()
        annotFile = genomeDict[genomeKey]
        print("using %s as the annotation file" % (annotFile))

        #guideDict,geneDict = makeAnnotDict(annotFile)

        #now set up each bam
        testBam = utils.Bam(options.test)
        controlBam = utils.Bam(options.control)

        #get the MMR (million mapped reads) for each
        testMMR = round(float(testBam.getTotalReads())/1000000,4)
        controlMMR = round(float(controlBam.getTotalReads())/1000000,4)

        print("Test dataset: %s has an MMR of %s" % (testName,testMMR))
        print("Control dataset: %s has an MMR of %s" % (controlName,controlMMR))

        #now get the idxstats output
        #NOTE(review): samtoolsString is assumed to be a module-level
        #path to the samtools binary — confirm
        testIdxFile = '%s%s_idxstats.txt' % (outputFolder,testName)
        testIdxCmd = '%s idxstats %s > %s' % (samtoolsString,options.test,testIdxFile)
        print("Test idxstats command:")
        print(testIdxCmd)
        os.system(testIdxCmd)

        controlIdxFile = '%s%s_idxstats.txt' % (outputFolder,controlName)
        controlIdxCmd = '%s idxstats %s > %s' % (samtoolsString,options.control,controlIdxFile)
        print("Control idxstats command:")
        print(controlIdxCmd)
        os.system(controlIdxCmd)

        #the fold table below needs both idxstats files, so bail out
        #instead of continuing when either one failed to appear
        print("Checking for output")
        if not utils.checkOutput(testIdxFile,0.1,5):
            print("ERROR: UNABLE TO GENERATE IDX OUTPUT FOR %s" % (options.test))
            sys.exit()
        print("Found test IdxStats file")
        if not utils.checkOutput(controlIdxFile,0.1,5):
            print("ERROR: UNABLE TO GENERATE IDX OUTPUT FOR %s" % (options.control))
            sys.exit()
        print("Found control IdxStats file")

        #now make the fold table

        foldTableFile = makeFoldTable(annotFile,analysisName,testName,controlName,testMMR,controlMMR,testIdxFile,controlIdxFile,outputFolder,epsilon = 1)

        print('writing output to %s' % (foldTableFile))

        print("MAKING FRIGER TABLE")
        rigerTableFile = makeRigerTable(foldTableFile,output='')
        print('writing FRIGER table to %s' % (rigerTableFile))

        rigerBashFileName = callRiger(rigerTableFile,scoring=scoringMethod,output='',callRiger=True)



    else:
        parser.print_help()
        sys.exit()
Пример #47
0
def main():

    '''
    using argparse

    '''

    parser = argparse.ArgumentParser(usage='%(prog)s -i DATAFILE -1 GROUP1_NAMES -2 GROUP2_NAMES')

    # required flags
    parser.add_argument("-d", "--data_table", dest="data_table", type=str,
                      help="input a data table with all datasets to be analyzed", required=True)
    parser.add_argument("-1", "--group1", dest="group1", type=str,
                      help="input a comma separated list of all datasets in group1", required=True)
    parser.add_argument("-2", "--group2", dest="group2", type=str,
                      help="input a comma separated list of all datasets in group2", required=True)


    #optional input override
    parser.add_argument("-i", "--input", dest="input", type=str,
                        help="input a gff of regions to analyze", required=False)


    #optional arguments
    parser.add_argument("-n", "--name", dest="name", type=str,
                      help="specify a name for the analysis. Default is drawn from the data table name", required=False)

    parser.add_argument("--group1-name", dest="group1_name", default='GROUP1',type=str,
                      help="Enter a name for group1.  Default is 'GROUP1'", required=False)
    parser.add_argument("--group2-name", dest="group2_name", default='GROUP2',type=str,
                      help="Enter a name for group2.  Default is 'GROUP2'", required=False)

    parser.add_argument("-a", "--activity", dest="activity", type=str,default='',
                      help="a table with active gene names in the first column", required=False)
    parser.add_argument("-t", "--tss", dest="tss", type=int,default=2500,
                      help="Specify a TSS exclusion distance. Default is 2500", required=False)
    parser.add_argument("-s", "--stitch", dest="stitch", type=int,default=None,
                      help="Specify a stitching distance. Default is auto stitching", required=False)



    parser.add_argument("-o", "--output", dest="output", default='./',type=str,
                      help="Enter the output folder. Default is the current working directory", required=False)

    parser.add_argument("--log", dest="log", default='',type=str,
                      help="Enter a path to log output", required=False)



#     # DEBUG OPTION TO SAVE TEMP FILES
#     parser.add_argument("--scale", dest="scale", default='',
#                       help="Enter a comma separated list of scaling factors for your bams. Default is none")
#     parser.add_argument("--save-temp", dest="save", action='store_true', default=False,
#                       help="If flagged will save temporary files made by bamPlot")
#     parser.add_argument("--bed", dest="bed",
#                       help="Add a space-delimited list of bed files to plot")
#     parser.add_argument("--multi-page", dest="multi", action='store_true', default=False,
#                       help="If flagged will create a new pdf for each region")

    args = parser.parse_args()



    #now we can begin to parse the arguments
    
    #=====================================================================================
    #===============================I. PARSING ARGUMENTS==================================
    #=====================================================================================
    #pulling in the data table
    data_file = os.path.abspath(args.data_table)
    dataDict = pipeline_dfci.loadDataTable(data_file)

    #setting naming conventions
    if not args.name:
        analysis_name = data_file.split('/')[-1].split('.')[0]
    else:
        analysis_name = args.name

    #getting the optional input gff
    if args.input:
        inputGFF = args.input
    else:
        inputGFF = ''

    #getting group names
    group1_name = args.group1_name
    group2_name = args.group2_name

    #getting group1 
    group1_string = args.group1
    group1_list = [name for name in string.split(group1_string,',') if len(name) > 0]

    #getting group2
    group2_string = args.group2
    group2_list = [name for name in string.split(group2_string,',') if len(name) > 0]

    #checking that all datasets are in the data table
    for name in group1_list + group2_list:
        if name not in dataDict:
            print('ERROR: DATASET %s NOT FOUND IN DATA TABLE %s. EXITING NOW' % (name,data_file))
            sys.exit()

    #loading in the genome object from the data table
    genome_list = utils.uniquify([dataDict[name]['genome'] for name in group1_list + group2_list])
    if len(genome_list) > 1:
        print('ERROR: ATTEMPTING TO ANALYZE DATASETS FROM MULTIPLE GENOME BUILDS. EXITING NOW.')
        sys.exit()

    
    #the load genome function has an assertion test to make sure the genome is supported
    genome = loadGenome(genome_list[0])

    
    parent_folder = utils.formatFolder(args.output,True)
    output_folder = utils.formatFolder(parent_folder + analysis_name,True)


    #these are the user defined optional arguments
    tss = int(args.tss)

    stitch = args.stitch
    print('stitch')
    print(stitch)

    
    #list of active genes to constrain analysis 
    if len(args.activity) == 0:
        #assumes all genes are active unless told otherwise
        #activity_path,activity_table = getActivity() # fix this function
        print('using all active genes')
    else:
        activity_path = args.activity
        activity_table = utils.parseTable(activity_path,'\t')




    print('\n\n#======================================\n#===========I. DATA SUMMARY============\n#======================================\n')

    print('Analyzing datasets described in %s\n' % (data_file))

    print('Name for the analysis: %s\n' % (analysis_name))
    print('Using genome: %s\n' % (genome.name()))


    
    print('%s datasets: %s\n' % (group1_name,group1_string))
    print('%s datasets: %s\n' % (group2_name,group2_string))

    if len(activity_path) > 0:
        print('Identified %s active genes in the analysis using %s as a list of active genes' % (len(activity_table),activity_path))
    else:
        print('Identified %s active genes in the analysis using aggregate data from %s and %s' % (len(activity_table),group1_name,group2_name))
    print('Writing output to: %s\n' % (output_folder))


    #=====================================================================================
    #======================II. DEFINING CIS-REGULATORY ELEMENTS===========================
    #=====================================================================================


    print('\n\n#======================================\n#=II. MAPPING CIS-REGULATORY ELEMENTS==\n#======================================\n')



    #crc_wrapper will act at the group level and not consider individual datasets
    #since a data table is used as the input, the code will rely heavily on pipeline_dfci
    #embedded tools

    #1. first we need to run meta rose using default parameters and check the output
    #exists for each group

    meta_rose_folder = utils.formatFolder(output_folder + 'meta_rose/',True)

    group1_output = '%s%s/%s_AllEnhancers.table.txt' % (meta_rose_folder,group1_name,group1_name)

    group2_output = '%s%s/%s_AllEnhancers.table.txt' % (meta_rose_folder,group2_name,group2_name)
    #print(group1_output)
    #print(group2_output)

    #for each output check to see if they exist
    #if not launch

    try:
        foo = open(group1_output,'r')
    except IOError:
        print('No META_ROSE output found for %s. Running META_ROSE now' % (group1_name))
        launchMetaRose(group1_name,group1_list,meta_rose_folder,genome,data_file,stitch,tss)
        
    try:
        foo = open(group2_output,'r')
    except IOError:
        print('No META_ROSE output found for %s. Running META_ROSE now' % (group2_name))
        launchMetaRose(group2_name,group2_list,meta_rose_folder,genome,data_file,stitch,tss)



    #now check for completion
    if utils.checkOutput(group1_output,1,10):
        print('META_ROSE finished for %s' % (group1_name))
    else:
        print('META_ROSE timed out for %s. EXITING NOW.' % (group1_name))
        sys.exit()

    if utils.checkOutput(group2_output,1,10):
        print('META_ROSE finished for %s' % (group2_name))
    else:
        print('META_ROSE timed out for %s. EXITING NOW.' % (group2_name))
        sys.exit()


    #Meta rose does not give all regions that are SE in at least one sample
    #and can be blown out by amplicons etc...
    #sooo we need to run clustering to generate a good input gff
    #ideally we just rewrite dynamic meta to run off of clustering output
    #until we do that let's just overwrite w/ an input gff
    

    print('Comparing cis-regulatory landscapes of %s and %s' % (group1_name,group2_name))
    dynamic_rose_folder = utils.formatFolder(output_folder + 'dynamic_meta_rose/',True)

    #here we will use the rank table as the primary output
    dynamic_rose_output = '%soutput/%s_%s_%s_merged_MERGED_SUPERS_RANK_TABLE.txt' % (dynamic_rose_folder,genome.name(),group1_name,group2_name)
    
    try:
        foo = open(dynamic_rose_output,'r')
    except IOError:
        print('No DYNAMIC_ROSE output found for %s. Running DYNAMIC_ROSE now' % (analysis_name))
        launchDynamicRose(analysis_name,group1_name,group2_name,group1_list,group2_list,meta_rose_folder,dynamic_rose_folder,genome,data_file,activity_path,inputGFF)

    if utils.checkOutput(dynamic_rose_output,1,10):
        print('DYNAMIC_ROSE finsihed for %s' % (analysis_name))
    else:
        print('DYNAMIC_ROSE analysis timed out for %s. EXITING NOW.' % (analysis_name))
        sys.exit()




    #=====================================================================================
    #======================III. IDENTIFYING TF NODES IN NETWORK===========================
    #=====================================================================================


    print('\n\n#======================================\n#===III. RUNNING CIRCUITRY ANALYSIS====\n#======================================\n')




    #now we want to call circuitry on each group... ok to have different subpeaks and motif calls
    #if as a first approximation we weight by the overall enhancer




    crc_folder = utils.formatFolder('%scrc/' % (output_folder),True)



    #for all
    all_crc_folder = utils.formatFolder('%s%s' % (crc_folder,analysis_name),True)
    launchCRC(data_file,genome,dynamic_rose_output,analysis_name,group1_list+group2_list,all_crc_folder,activity_path)



    #for group1
    group1_crc_folder = utils.formatFolder('%s%s' % (crc_folder,group1_name),True)
    launchCRC(data_file,genome,dynamic_rose_output,group1_name,group1_list,group1_crc_folder,activity_path)

    #for group2
    group2_crc_folder = utils.formatFolder('%s%s' % (crc_folder,group2_name),True)
    launchCRC(data_file,genome,dynamic_rose_output,group2_name,group2_list,group2_crc_folder,activity_path)