Пример #1
0
def callRiger(rigerTableFile, scoring='KSbyScore', output='', callRiger=True):
    '''
    writes a bash script that runs RigerJ on a riger table and optionally runs it

    rigerTableFile: path to the riger input table. its absolute path must
        contain '_friger.txt'; the script and default output names are derived
        by replacing that suffix
    scoring: value passed to the -scoringMethod flag (default 'KSbyScore')
    output: path handed to -outputFile. when empty it defaults to the table
        path with '_friger.txt' replaced by '_friger_<scoring>_out.txt'
    callRiger: when true the script is executed with bash after it is written

    returns the path of the bash script
    raises ValueError if the table path does not contain '_friger.txt'
    '''
    rigerDirectory = '/raider/temp/riger/'

    rigerTableAbsFile = os.path.abspath(rigerTableFile)

    # without the expected suffix the replace() calls below are no-ops and the
    # bash script would be written on top of the input table itself
    if '_friger.txt' not in rigerTableAbsFile:
        raise ValueError(
            "riger table name must contain '_friger.txt': %s" %
            (rigerTableAbsFile))

    if not output:
        output = rigerTableAbsFile.replace(
            '_friger.txt', '_friger_%s_out.txt' % (scoring))
    rigerBashFileName = rigerTableAbsFile.replace(
        '_friger.txt', '_callRiger.sh')

    rigerCmd = 'java -cp commons-cli-1.2.jar:rigerj-1.6.2.jar org.broadinstitute.rnai.rigerj.RigerJMain -scoringMethod %s -inputFile %s -outputFile %s' % (
        scoring, rigerTableAbsFile, output)

    # the context manager closes the script even if a write fails
    with open(rigerBashFileName, 'w') as rigerBashFile:
        rigerBashFile.write('#!/usr/bin/bash\n')
        # the jars on the classpath are given relative to the riger directory
        rigerBashFile.write('cd %s\n\n' % (rigerDirectory))
        rigerBashFile.write(rigerCmd)
        rigerBashFile.write('\n')

    print("WROTE RIGER CMD TO %s" % (rigerBashFileName))
    if callRiger:
        print("Calling RIGER with %s scoring method" % (scoring))
        print("RIGER CMD: %s" % (rigerCmd))
        # the script was just created without the execute bit, so it has to be
        # handed to bash explicitly rather than executed directly
        os.system('bash %s' % (rigerBashFileName))
    return rigerBashFileName
Пример #2
0
def callRiger(rigerTableFile,scoring='KSbyScore',output='',callRiger = True):

    '''
    writes a bash script that calls RigerJ on a riger table (KS scoring metric
    by default) and, unless callRiger is false, runs that script with bash

    rigerTableFile: riger input table; its absolute path must contain '_friger.txt'
    scoring: value given to -scoringMethod
    output: value given to -outputFile; when empty, the table path with
            '_friger.txt' replaced by '_friger_<scoring>_out.txt' is used
    callRiger: run the script after writing it

    returns the path of the bash script (table path with '_friger.txt'
    replaced by '_callRiger.sh')
    raises ValueError when the table path does not contain '_friger.txt'
    '''
    rigerDirectory = '/raider/temp/riger/'
    tableSuffix = '_friger.txt'

    rigerTableAbsFile = os.path.abspath(rigerTableFile)

    #if the suffix is missing, the script name would equal the table name and
    #opening it for writing would destroy the input table
    if rigerTableAbsFile.count(tableSuffix) == 0:
        raise ValueError("riger table name must contain '_friger.txt': %s" % (rigerTableAbsFile))

    if not output:
        output = rigerTableAbsFile.replace(tableSuffix,'_friger_%s_out.txt' % (scoring))
    rigerBashFileName = rigerTableAbsFile.replace(tableSuffix,'_callRiger.sh')

    rigerCmd = 'java -cp commons-cli-1.2.jar:rigerj-1.6.2.jar org.broadinstitute.rnai.rigerj.RigerJMain -scoringMethod %s -inputFile %s -outputFile %s' % (scoring,rigerTableAbsFile,output)

    #the with block guarantees the script is flushed and closed before it is run
    with open(rigerBashFileName,'w') as rigerBashFile:
        rigerBashFile.write('#!/usr/bin/bash\n')
        rigerBashFile.write('cd %s\n\n' % (rigerDirectory))
        rigerBashFile.write(rigerCmd)
        rigerBashFile.write('\n')

    print("WROTE RIGER CMD TO %s" % (rigerBashFileName))
    if callRiger:
        print("Calling RIGER with %s scoring method" % (scoring))
        print("RIGER CMD: %s" % (rigerCmd))
        #a newly written script has no execute permission, so run it through bash
        os.system('bash %s' % (rigerBashFileName))
    return rigerBashFileName
Пример #3
0
def makeNameDict(dataFile,roseFolder,namesList=[]):

    '''
    for each name, check for the presence of an enriched file or allEnhancer table
    these are the files required for enhancer clustering

    dataFile: data table loaded with pipeline_dfci.loadDataTable
    roseFolder: folder expected to hold one <name>_ROSE folder per dataset
    namesList: dataset names to check. when empty, every dataset in the data
               table whose 'background' entry is not 'NONE' is used

    returns a dict keyed by name; each value has an 'enrichedFile' and an
    'enhancerFile' entry, holding a path or '' when the file was not found
    exits via sys.exit() when a dataset has neither file
    '''

    dataDict = pipeline_dfci.loadDataTable(dataFile)

    #draw the parent folder from the dataFile
    parentFolder = utils.getParentFolder(dataFile)

    #check to see if a rose folder exists already
    #formatFolder(...,False) gives back a false value when the folder is missing
    existingRoseFolder = utils.formatFolder(roseFolder,False)
    if existingRoseFolder:
        roseExists = True
        roseFolder = existingRoseFolder
    else:
        roseExists = False
        roseFolder = utils.formatFolder(roseFolder,True)

    #if no names are given, use every dataset that has a background
    #this filters out control WCE datasets
    #(namesList is only rebound, never mutated, so the shared default list is safe)
    if len(namesList) == 0:
        namesList = [name for name in dataDict.keys() if dataDict[name]['background'] != 'NONE']

    #now check that all of the datasets at a minimum have a rose output OR enriched region file

    nameDict = {}
    for name in namesList:

        #both entries start empty so the sufficiency check below can always read them
        #(previously 'enhancerFile' was missing whenever the rose folder did not exist)
        nameDict[name] = {'enrichedFile': '', 'enhancerFile': ''}

        #assumes standard folder structure for enriched file
        enrichedFile = "%smacsEnriched/%s" % (parentFolder,dataDict[name]['enrichedMacs'])
        print(enrichedFile)
        try:
            foo = open(enrichedFile,'r')
            foo.close()
            nameDict[name]['enrichedFile'] = enrichedFile
        except IOError:
            pass

        #roseOutput looks for standard format rose output
        #need an allEnhancers table and a region table to proceed
        #if the rose folder doesn't exist, don't bother
        if roseExists:
            try:
                roseOutputFiles = os.listdir("%s%s_ROSE" % (roseFolder,name))
            except OSError:
                #this dataset has no rose output folder; it may still have an enriched file
                roseOutputFiles = []

            allEnhancerFileList = [x for x in roseOutputFiles if x.count("AllEnhancers.table.txt") == 1 and x[0] != '.' ] #no weird hidden or temp files
            if len(allEnhancerFileList) > 0:
                nameDict[name]['enhancerFile'] = "%s%s_ROSE/%s" % (roseFolder,name,allEnhancerFileList[0])

        if nameDict[name]['enhancerFile'] == '' and nameDict[name]['enrichedFile'] =='':
            print("INSUFFICIENT DATA TO RUN ENAHNCER ANALYSIS ON %s. PLEASE MAKE SURE ROSE OUTPUT OR MACS ENRICHED REGION PEAKS FILE EXISTS" % (name))
            sys.exit()
    return nameDict
Пример #4
0
def launchDynamicRose(analysis_name,group1_name,group2_name,group1_list,group2_list,meta_rose_folder,dynamic_rose_folder,genome,data_file,activity_path,inputGFF):

    '''
    builds the dynamicEnhancer_meta.py command that compares two groups,
    saves it as <analysis_name>_dynamic_meta.sh inside the dynamic rose folder,
    and runs that script with bash

    the two meta rose folders are taken to be meta_rose_folder + group name
    inputGFF is appended as --input only when it is non-empty
    activity_path is accepted but not used here
    nothing is returned
    '''

    #neither of these two results is used below; the calls are kept as they were
    project_folder = utils.formatFolder(os.path.abspath(utils.getParentFolder(data_file)),False)
    dataDict = pipeline_dfci.loadDataTable(data_file)

    genome_build = genome.name()

    #one meta rose folder per group, created if missing
    meta_rose_outputs = []
    for group_name in (group1_name,group2_name):
        meta_rose_outputs.append(utils.formatFolder(meta_rose_folder + group_name,True))
    meta_rose_string = ','.join(meta_rose_outputs)

    #setting the output
    dynamic_rose_folder = utils.formatFolder(dynamic_rose_folder,True)

    cmd_args = (
        pipeline_dir,
        genome_build,
        data_file,
        meta_rose_string,
        dynamic_rose_folder,
        ','.join(group1_list),
        ','.join(group2_list),
        group1_name,
        group2_name,
    )
    dynamic_cmd = 'python %sdynamicEnhancer_meta.py -g %s -d %s -r %s -o %s --group1 %s --group2 %s --name1 %s --name2 %s -p -m' % cmd_args

    if len(inputGFF) != 0:
        dynamic_cmd = dynamic_cmd + ' --input %s' % (inputGFF)

    #assemble the whole script first, then write it in one go
    script_parts = [
        '#!/usr/bin/bash\n\n',
        'cd %s\n\n' % (pipeline_dir),
        dynamic_cmd,
        '\n\n',
    ]
    bash_path ='%s%s_dynamic_meta.sh' % (dynamic_rose_folder,analysis_name)
    with open(bash_path,'w') as bash_file:
        bash_file.write(''.join(script_parts))

    print('Wrote DYNAMIC_META command for %s to %s' % (analysis_name,bash_path))
    print('Launching DYNAMIC_META_ROSE')
    os.system('bash %s' % (bash_path))
Пример #5
0
def launchMetaRose(group_name,group_list,meta_rose_folder,genome,data_file,stitch,tss):

    '''
    builds the ROSE2_META.py command for one group of datasets, saves it as
    <group_name>_meta_rose.sh inside the group's meta rose folder
    (meta_rose_folder + group_name), and runs that script with bash

    the enriched region file and bam of every name in group_list are read from the data table
    -s is added only when stitch is not None
    --mask is added only when the genome reports a 'mask' feature
    nothing is returned
    '''

    parent_folder = os.path.abspath(utils.getParentFolder(data_file))
    project_folder = utils.formatFolder(parent_folder,False)
    #quick hack to make sure input beds are in the right place
    macs_folder = '%smacsEnriched/' % (project_folder)
    data_table = pipeline_dfci.loadDataTable(data_file)

    meta_rose_output = utils.formatFolder(meta_rose_folder + group_name,True)

    genome_build = genome.name()

    #comma separated enriched region files, then comma separated bams
    enriched_paths = ['%s%s' % (macs_folder,data_table[name]['enrichedMacs']) for name in group_list]
    bam_paths = [data_table[name]['bam'] for name in group_list]
    input_string = ','.join(enriched_paths)
    bam_string = ','.join(bam_paths)

    cmd_pieces = ['python %sROSE2_META.py -g %s -i %s -r %s -o %s -n %s -t %s' % (pipeline_dir,genome_build,input_string,bam_string,meta_rose_output,group_name,tss)]
    if stitch is not None:
        cmd_pieces.append(' -s %s' % (stitch))

    #adding a mask if necessary
    if genome.hasFeature('mask'):
        cmd_pieces.append(' --mask %s' % (genome.returnFeature('mask')))
    meta_cmd = ''.join(cmd_pieces)

    bash_path ='%s%s_meta_rose.sh' % (meta_rose_output,group_name)
    script_text = '#!/usr/bin/bash\n\n' + ('cd %s\n\n' % (pipeline_dir)) + meta_cmd + '\n\n'
    with open(bash_path,'w') as bash_file:
        bash_file.write(script_text)

    print('Wrote META_ROSE command for %s to %s' % (group_name,bash_path))
    print('Launching META_ROSE')
    os.system('bash %s' % (bash_path))
Пример #6
0
def main():
    """
    main run function

    Parses the command line and splits a sam file for CHIP_RX into a
    reference-genome part and a spike-in part via split_sam.

    Flags:
      -i/--input   sam file; the name must end in .sam or .SAM
      -g/--genome  main reference genome
      -s/--spike   spike in genome
      -d/--dest    optional existing folder the resulting bams are moved to

    Prints the help text and exits when -i, -g or -s is missing. Exits after
    printing an error for a non-sam input, an unsupported genome combination,
    or a destination folder that does not exist.
    """

    #usage = "usage: %prog [options] -g [GENOME] -b [SORTED BAMFILE(S)] -i [INPUTFILE] -o [OUTPUTFOLDER]"
    parser = argparse.ArgumentParser(
        usage='%(prog)s -i SAMFILE -g REF_GENOME -s SPIKE_GENOME')

    # required flags (checked by hand below so that the help text can be shown)
    parser.add_argument("-i",
                        "--input",
                        dest="inputSam",
                        type=str,
                        help="Enter a sam file",
                        required=False)
    parser.add_argument("-g",
                        "--genome",
                        dest="genome",
                        type=str,
                        help="specify the main reference genome",
                        required=False)
    parser.add_argument("-s",
                        "--spike",
                        dest="spike",
                        type=str,
                        help="specify the spike in  genome",
                        required=False)

    parser.add_argument(
        "-d",
        "--dest",
        dest="dest",
        type=str,
        help="specify an optional destination for the final bams to move to",
        required=False)

    args = parser.parse_args()

    print(args)

    if not (args.inputSam and args.genome and args.spike):
        parser.print_help()
        sys.exit()

    print(
        'FORMATTING %s FOR CHIP_RX USING REFERENCE GENOME %s and SPIKE_IN GENOME %s'
        % (args.inputSam, args.genome, args.spike))
    samPath = args.inputSam

    # the extension has to be the actual end of the name: the output names
    # below are built by inserting the genome in front of the last 4 characters
    if not samPath.upper().endswith('.SAM'):
        print('ERROR, file must end in .sam or .SAM')
        sys.exit()

    #get the headers
    genome_string = ('%s_%s' % (args.genome, args.spike)).upper()

    # header sam files for each supported REFERENCE_SPIKE combination
    genomeDict = {
        'RN6_DM6': [
            '/grail/genomes/Rattus_norvegicus/UCSC/rn6/Sequence/Bowtie2Index_dm6/rn6_header.SAM',
            '/grail/genomes/Rattus_norvegicus/UCSC/rn6/Sequence/Bowtie2Index_dm6/dm6_header.SAM'
        ],
    }

    if genome_string not in genomeDict:
        print('ERROR: UNSUPPORTED GENOME COMBINATION %s' % (genome_string))
        sys.exit()
    else:
        print('IDENTIFIED HEADERS FOR GENOME COMBINATION %s' %
              (genome_string))

    header1 = genomeDict[genome_string][0]
    header2 = genomeDict[genome_string][1]
    suffix = ('_%s' % (args.spike)).lower()

    # insert the genome name in front of the extension. slicing (rather than
    # a global replace of the extension text) keeps directories that happen
    # to contain '.sam' in their name untouched
    samRoot = samPath[:-4]
    samExtension = samPath[-4:]
    outfile1 = '%s.%s%s' % (samRoot, args.genome, samExtension)
    outfile2 = '%s.%s%s' % (samRoot, args.spike, samExtension)
    split_sam(suffix, samPath, header1, header2, outfile1, outfile2)

    #move stuff to destination folder
    if args.dest:
        bamFolder = utils.formatFolder(args.dest, False)
        # formatFolder(..., False) gives back a false value for a folder that
        # does not exist; never interpolate that into the mv command
        if not bamFolder:
            print('ERROR: DESTINATION FOLDER %s DOES NOT EXIST' % (args.dest))
            sys.exit()

        samFolder = utils.getParentFolder(samPath)

        mv_cmd = 'mv %s*bam* %s' % (samFolder, bamFolder)
        print('MOVING BAMS FROM %s TO %s' % (samFolder, bamFolder))
        os.system(mv_cmd)
Пример #7
0
def makeNameDict(dataFile, roseFolder, namesList=[], enhancerType='super'):
    '''
    for each name, check for the presence of an enriched file or allEnhancer table
    these are the files required for enhancer clustering

    dataFile: data table loaded with pipeline_dfci.loadDataTable
    roseFolder: folder expected to hold one <name>_ROSE folder per dataset
    namesList: dataset names to check. when empty, every dataset whose name
        contains neither 'WCE' nor 'INPUT' (case insensitive) is used
    enhancerType: 'super', 'stretch' or 'superstretch'; selects which rose
        table is looked for

    returns a dict keyed by name with the entries 'background' (True when the
    dataset's background is itself in the data table), 'enrichedFile' and
    'enhancerFile' (a path, or '' when not found)
    exits via sys.exit() when a dataset has neither file
    raises ValueError for an unsupported enhancerType once a rose output
    folder is actually searched
    '''

    #name fragment of the rose table that goes with each enhancerType
    enhancerStringDict = {
        'super': 'AllEnhancers.table.txt',
        'stretch': 'AllEnhancers_Length.table.txt',
        'superstretch': 'AllEnhancers_SuperStretch.table.txt',
    }

    dataDict = pipeline_dfci.loadDataTable(dataFile)

    #draw the parent folder from the dataFile
    parentFolder = utils.getParentFolder(dataFile)
    print("Using %s as the parent folder" % (parentFolder))

    #check to see if a rose folder exists already
    #formatFolder(..., False) gives back a false value when the folder is missing
    existingRoseFolder = utils.formatFolder(roseFolder, False)
    if existingRoseFolder:
        roseExists = True
        roseFolder = existingRoseFolder
    else:
        roseExists = False
        roseFolder = utils.formatFolder(roseFolder, True)

    #if no namesList is given, this filters out WCE and INPUT datasets
    #(namesList is only rebound, never mutated, so the shared default is safe)
    if len(namesList) == 0:
        namesList = [
            name for name in dataDict.keys()
            if name.upper().count('WCE') == 0
            and name.upper().count('INPUT') == 0
        ]

    #now check that all of the datasets at a minimum have a rose output OR enriched region file

    nameDict = {}
    for name in namesList:

        nameDict[name] = {}

        #check if each dataset has a background
        backgroundName = dataDict[name]['background']
        nameDict[name]['background'] = backgroundName in dataDict

        #assumes standard folder structure for enriched file
        enrichedFile = "%smacsEnriched/%s" % (parentFolder,
                                              dataDict[name]['enrichedMacs'])

        print("Looking for macs output at %s" % (enrichedFile))

        try:
            foo = open(enrichedFile, 'r')
            foo.close()
            nameDict[name]['enrichedFile'] = enrichedFile
        except IOError:
            nameDict[name]['enrichedFile'] = ''

        #roseOutput looks for standard format rose output
        #need an allEnhancers table and a region table to proceed
        #if the rose folder doesn't exist, don't bother
        nameDict[name]['enhancerFile'] = ''
        if roseExists:
            try:
                roseOutputFiles = os.listdir("%s%s_ROSE" % (roseFolder, name))
            except OSError:
                #no rose output folder for this dataset
                roseOutputFiles = None

            if roseOutputFiles is not None:
                #an unknown type used to surface here as an unbound local;
                #report it with a clear message instead
                if enhancerType not in enhancerStringDict:
                    raise ValueError(
                        "unsupported enhancerType %s: use 'super', 'stretch' or 'superstretch'"
                        % (enhancerType))
                enhancerString = enhancerStringDict[enhancerType]

                allEnhancerFileList = [
                    x for x in roseOutputFiles
                    if x.count(enhancerString) == 1 and x[0] != '.'
                ]  #no weird hidden or temp files
                if len(allEnhancerFileList) > 0:
                    nameDict[name]['enhancerFile'] = "%s%s_ROSE/%s" % (
                        roseFolder, name, allEnhancerFileList[0])

        if nameDict[name]['enhancerFile'] == '' and nameDict[name][
                'enrichedFile'] == '':
            print("INSUFFICIENT DATA TO RUN ENAHNCER ANALYSIS ON %s. PLEASE MAKE SURE ROSE OUTPUT OR MACS ENRICHED REGION PEAKS FILE EXISTS" % (
                name))
            print(nameDict[name])
            sys.exit()
    return nameDict
Пример #8
0
def extractGuideFastq(fastqFile,outputFolder='',gzip=False):

    '''
    takes a fastq and extracts candidate guide RNAs

    reads the fastq four lines at a time. for every read whose sequence
    contains cutSeq, the sequence and quality lines are trimmed to the
    guideLength characters that start cutOffset after the position of cutSeq,
    and the record is written to <outputFolder><fastq root>.gecko.fastq
    (cutSeq, cutOffset and guideLength are module level settings)

    fastqFile: path to the input fastq
    outputFolder: where the guide fastq goes; defaults to the folder of fastqFile
    gzip: when true, gzip is launched in the background on the guide fastq

    returns the path of the guide fastq (with .gz appended when gzip is set)
    '''

    #get the full absolute path for the fastq File
    fastqFile = os.path.abspath(fastqFile)

    #get the fastq name and root
    if len(outputFolder) == 0:
        outputFolder = utils.getParentFolder(fastqFile)

    #makes sure the output folder exists. keep the formatted folder that comes
    #back, as the other callers of formatFolder in this code base do, so that
    #the output path below is joined correctly
    #NOTE(review): assumes formatFolder(...,True) gives back the folder with a trailing '/' - confirm
    outputFolder = utils.formatFolder(outputFolder,True)

    #grab the name info from the fastq
    fastqName = fastqFile.split('/')[-1]
    fastqRoot = fastqName.replace('.fastq','').replace('.gz','')

    #guideFastqFile output
    guideFastqFile = '%s%s.gecko.fastq' % (outputFolder,fastqRoot)

    print('processing %s' % (fastqName))
    print('million reads processed:')
    ticker = 0
    found = 0

    fastq = utils.open(fastqFile,'r')
    try:
        guideFastq = utils.open(guideFastqFile,'w')
        try:
            while True:

                if ticker%1000000 == 0:
                    print(ticker//1000000)

                fastqLines = []

                #now load the fastq lines; a trailing incomplete record is dropped
                try:
                    for i in range(4):
                        fastqLines.append(next(fastq))
                except StopIteration:
                    break

                ticker+=1

                #see if you can find a cut site
                seq = fastqLines[1].rstrip()
                cutPosition = seq.find(cutSeq)
                if cutPosition == -1:
                    continue
                found+=1

                guideStart = cutPosition + cutOffset
                guideStop = guideStart + guideLength

                #pulling out the guide seq in the fastqLines
                fastqLines[1] = fastqLines[1][guideStart:guideStop] + '\n'
                fastqLines[3] = fastqLines[3][guideStart:guideStop] + '\n'

                for line in fastqLines:
                    guideFastq.write(line)
        finally:
            #close the guide fastq even if reading or writing failed
            guideFastq.close()
    finally:
        fastq.close()

    print('SUMMARY STATISTICS')
    print(ticker)
    print(found)
    #an empty fastq has no reads, so report 0.0 instead of dividing by zero
    if ticker > 0:
        print(float(found)/float(ticker))
    else:
        print(0.0)

    #gzip the fastq
    if gzip:
        #NOTE(review): gzip runs in the background, so the returned .gz path
        #may not exist yet when this function returns
        os.system('gzip %s &' % (guideFastqFile))
        guideFastqFile += '.gz'

    return guideFastqFile
Пример #9
0
def makeNameDict(dataFile,roseFolder,namesList=[],enhancerType='super'):

    '''
    for each name, check for the presence of an enriched file or allEnhancer table
    these are the files required for enhancer clustering

    dataFile: data table loaded with pipeline_dfci.loadDataTable
    roseFolder: folder expected to hold one <name>_ROSE folder per dataset
    namesList: dataset names to check. when empty, every dataset whose name
               contains neither 'WCE' nor 'INPUT' (case insensitive) is used
    enhancerType: 'super', 'stretch' or 'superstretch'; picks the rose table to look for

    returns a dict keyed by name with the entries 'background' (True when the
    dataset's background is itself in the data table), 'enrichedFile' and
    'enhancerFile' (a path, or '' when not found)
    exits via sys.exit() when a dataset has neither file
    raises ValueError for an unsupported enhancerType once a rose output folder is searched
    '''

    #name fragment of the rose table for each supported enhancerType
    enhancerStringDict = {'super':'AllEnhancers.table.txt',
                          'stretch':'AllEnhancers_Length.table.txt',
                          'superstretch':'AllEnhancers_SuperStretch.table.txt',
                          }

    dataDict = pipeline_dfci.loadDataTable(dataFile)

    #draw the parent folder from the dataFile
    parentFolder = utils.getParentFolder(dataFile)
    print("Using %s as the parent folder" % (parentFolder))

    #check to see if a rose folder exists already
    #formatFolder(...,False) gives back a false value when the folder is missing
    existingRoseFolder = utils.formatFolder(roseFolder,False)
    if existingRoseFolder:
        roseExists = True
        roseFolder = existingRoseFolder
    else:
        roseExists = False
        roseFolder = utils.formatFolder(roseFolder,True)

    #if no namesList is given, this filters out WCE and INPUT datasets
    #(namesList is only rebound, never mutated, so the shared default is safe)
    if len(namesList) == 0:
        namesList = [name for name in dataDict.keys() if name.upper().count('WCE') == 0 and name.upper().count('INPUT') == 0]

    #now check that all of the datasets at a minimum have a rose output OR enriched region file

    nameDict = {}
    for name in namesList:

        #start from empty file entries; they are filled in below when found
        nameDict[name] = {'enrichedFile':'','enhancerFile':''}

        #check if each dataset has a background
        backgroundName = dataDict[name]['background']
        nameDict[name]['background'] = backgroundName in dataDict

        #assumes standard folder structure for enriched file
        enrichedFile = "%smacsEnriched/%s" % (parentFolder,dataDict[name]['enrichedMacs'])

        print("Looking for macs output at %s" % (enrichedFile))

        try:
            foo = open(enrichedFile,'r')
            foo.close()
            nameDict[name]['enrichedFile'] = enrichedFile
        except IOError:
            pass

        #roseOutput looks for standard format rose output
        #need an allEnhancers table and a region table to proceed
        #if the rose folder doesn't exist, don't bother
        if roseExists:
            try:
                roseOutputFiles = os.listdir("%s%s_ROSE" % (roseFolder,name))
            except OSError:
                #no rose output folder for this dataset
                roseOutputFiles = None

            if roseOutputFiles is not None:
                #an unknown type used to surface here as an unbound local
                if enhancerType not in enhancerStringDict:
                    raise ValueError("unsupported enhancerType %s: use 'super', 'stretch' or 'superstretch'" % (enhancerType))
                enhancerString = enhancerStringDict[enhancerType]

                allEnhancerFileList = [x for x in roseOutputFiles if x.count(enhancerString) == 1 and x[0] != '.' ] #no weird hidden or temp files
                if len(allEnhancerFileList) > 0:
                    nameDict[name]['enhancerFile'] = "%s%s_ROSE/%s" % (roseFolder,name,allEnhancerFileList[0])

        if nameDict[name]['enhancerFile'] == '' and nameDict[name]['enrichedFile'] =='':
            print("INSUFFICIENT DATA TO RUN ENAHNCER ANALYSIS ON %s. PLEASE MAKE SURE ROSE OUTPUT OR MACS ENRICHED REGION PEAKS FILE EXISTS" % (name))
            print(nameDict[name])
            sys.exit()
    return nameDict
Пример #10
0
def mapEnhancerToGeneTop(rankByBamFile, controlBamFile, genome, annotFile, enhancerFile, transcribedFile='', uniqueGenes=True, searchWindow=50000, noFormatTable=False):
    '''
    maps genes to enhancers. if uniqueGenes, reduces to gene name only. Otherwise, gives for each refseq

    For every region in the enhancer table this collects the genes whose
    transcript overlaps the region, the genes whose TSS lies within
    searchWindow of it, and the closest gene. It then runs bamliquidator on a
    +/- 5kb TSS gff of all associated genes to get a TSS signal per gene and
    picks, for every enhancer, the associated gene with the highest signal.

    rankByBamFile: bam that is mapped to the gene TSS regions
    controlBamFile: control bam, mapped as well when the string is non-empty
    genome: only used to build the name of the gff that is written
    annotFile: annotation file handed to utils.makeStartDict and
        utils.makeTranscriptCollection
    enhancerFile: tab delimited enhancer table. columns 1-3 are used as the
        locus coordinates, column 0 as the ID and the last two columns are
        cast to int as enhancer rank and super status
    transcribedFile: optional tab delimited table; column 1 is used as the
        list of transcribed gene IDs. when empty all genes of annotFile are used
    uniqueGenes: when true, only the first entry per gene name is kept in the
        gene table
    searchWindow: distance used for the proximal gene search
    noFormatTable: when true, the full input rows are kept and the output is
        not re-sorted; otherwise rows are cut to the first 9 columns plus the
        last two and the output is sorted by int(line[-2])

    returns (enhancerToGeneTable, enhancerToTopGeneTable, geneToEnhancerTable)

    writes a gff next to enhancerFile, runs bamliquidator through the shell,
    and calls sys.exit() when mapping produces no output
    raises ValueError when bamliquidator_batch.py cannot be found
    '''
    startDict = utils.makeStartDict(annotFile)
    # enhancer table name without folder and without anything after the first '.'
    enhancerName = enhancerFile.split('/')[-1].split('.')[0]
    enhancerTable = utils.parseTable(enhancerFile, '\t')

    # internal parameter for debugging
    # NOTE(review): not read anywhere in this function
    byRefseq = False

    if len(transcribedFile) > 0:
        transcribedTable = utils.parseTable(transcribedFile, '\t')
        transcribedGenes = [line[1] for line in transcribedTable]
    else:
        transcribedGenes = startDict.keys()

    print('MAKING TRANSCRIPT COLLECTION')
    transcribedCollection = utils.makeTranscriptCollection(
        annotFile, 0, 0, 500, transcribedGenes)

    print('MAKING TSS COLLECTION')
    tssLoci = []
    for geneID in transcribedGenes:
        tssLoci.append(utils.makeTSSLocus(geneID, startDict, 0, 0))

    # this turns the tssLoci list into a LocusCollection
    # 50 is the internal parameter for LocusCollection and doesn't really
    # matter
    tssCollection = utils.LocusCollection(tssLoci, 50)

    # enhancer strings per gene, split by how the gene was associated
    geneDict = {'overlapping': defaultdict(
        list), 'proximal': defaultdict(list)}

    # dictionaries to hold ranks and superstatus of gene nearby enhancers
    rankDict = defaultdict(list)
    superDict = defaultdict(list)

    # list of all genes that appear in this analysis
    overallGeneList = []

    # the header is the first row that does not start with '#'
    # NOTE(review): header stays undefined if every row starts with '#'
    for line in enhancerTable:
        if line[0][0] == '#':
            continue
        else:
            header = line
            break

    if noFormatTable:
        # set up the output tables
        # first by enhancer
        enhancerToGeneTable = [
            header + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE']]

    else:
        # set up the output tables
        # first by enhancer
        enhancerToGeneTable = [
            header[0:9] + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE'] + header[-2:]]

        # next by gene
        # NOTE(review): this assignment is overwritten unconditionally just below
        geneToEnhancerTable = [
            ['GENE_NAME', 'REFSEQ_ID', 'PROXIMAL_ENHANCERS']]

    # next make the gene to enhancer table
    geneToEnhancerTable = [
        ['GENE_NAME', 'REFSEQ_ID', 'PROXIMAL_ENHANCERS', 'ENHANCER_RANKS', 'IS_SUPER', 'ENHANCER_SIGNAL']]

    for line in enhancerTable:
        # skip comment rows and rows whose first field starts with 'R'
        # (presumably the header row - confirm against the table format)
        if line[0][0] == '#' or line[0][0] == 'R':
            continue

        enhancerString = '%s:%s-%s' % (line[1], line[2], line[3])

        enhancerLocus = utils.Locus(line[1], line[2], line[3], '.', line[0])

        # overlapping genes are transcribed genes whose transcript is directly
        # in the stitchedLocus
        overlappingLoci = transcribedCollection.getOverlap(
            enhancerLocus, 'both')
        overlappingGenes = []
        for overlapLocus in overlappingLoci:
            overlappingGenes.append(overlapLocus.ID())

        # proximalGenes are transcribed genes where the tss is within
        # searchWindow (50kb by default) of the boundary of the stitched loci
        proximalLoci = tssCollection.getOverlap(
            utils.makeSearchLocus(enhancerLocus, searchWindow, searchWindow), 'both')
        proximalGenes = []
        for proxLocus in proximalLoci:
            proximalGenes.append(proxLocus.ID())

        # distalGenes use the same TSS search with a fixed 1,000,000 window
        distalLoci = tssCollection.getOverlap(
            utils.makeSearchLocus(enhancerLocus, 1000000, 1000000), 'both')
        distalGenes = []
        for proxLocus in distalLoci:
            distalGenes.append(proxLocus.ID())

        overlappingGenes = utils.uniquify(overlappingGenes)
        proximalGenes = utils.uniquify(proximalGenes)
        distalGenes = utils.uniquify(distalGenes)
        # NOTE(review): built before the cross-list de-duplication below, so a
        # gene can appear here more than once; it is only used for the closest
        # gene lookup
        allEnhancerGenes = overlappingGenes + proximalGenes + distalGenes
        # these checks make sure each gene list is unique.
        # technically it is possible for a gene to be overlapping, but not proximal since the
        # gene could be longer than the 50kb window, but we'll let that slide
        # here
        for refID in overlappingGenes:
            if proximalGenes.count(refID) == 1:
                proximalGenes.remove(refID)

        for refID in proximalGenes:
            if distalGenes.count(refID) == 1:
                distalGenes.remove(refID)

        # Now find the closest gene
        if len(allEnhancerGenes) == 0:
            closestGene = ''
        else:
            # get enhancerCenter
            # NOTE(review): '/' is integer division only under Python 2
            enhancerCenter = (int(line[2]) + int(line[3])) / 2

            # get absolute distance to enhancer center
            distList = [abs(enhancerCenter - startDict[geneID]['start'][0])
                        for geneID in allEnhancerGenes]
            # get the ID and convert to name
            closestGene = startDict[
                allEnhancerGenes[distList.index(min(distList))]]['name']

        # NOW WRITE THE ROW FOR THE ENHANCER TABLE
        # (join is not defined in this function; presumably string.join
        # imported at module level - confirm)
        if noFormatTable:

            newEnhancerLine = list(line)
            newEnhancerLine.append(
                join(utils.uniquify([startDict[x]['name'] for x in overlappingGenes]), ','))
            newEnhancerLine.append(
                join(utils.uniquify([startDict[x]['name'] for x in proximalGenes]), ','))
            newEnhancerLine.append(closestGene)

        else:
            newEnhancerLine = line[0:9]
            newEnhancerLine.append(
                join(utils.uniquify([startDict[x]['name'] for x in overlappingGenes]), ','))
            newEnhancerLine.append(
                join(utils.uniquify([startDict[x]['name'] for x in proximalGenes]), ','))
            newEnhancerLine.append(closestGene)
            newEnhancerLine += line[-2:]

        enhancerToGeneTable.append(newEnhancerLine)
        # Now grab all overlapping and proximal genes for the gene ordered
        # table

        overallGeneList += overlappingGenes
        for refID in overlappingGenes:
            geneDict['overlapping'][refID].append(enhancerString)
            rankDict[refID].append(int(line[-2]))
            superDict[refID].append(int(line[-1]))

        overallGeneList += proximalGenes
        for refID in proximalGenes:
            geneDict['proximal'][refID].append(enhancerString)
            rankDict[refID].append(int(line[-2]))
            superDict[refID].append(int(line[-1]))

    # End loop through
    # Make table by gene
    print('MAKING ENHANCER ASSOCIATED GENE TSS COLLECTION')
    overallGeneList = utils.uniquify(overallGeneList)

    enhancerGeneCollection = utils.makeTranscriptCollection(
        annotFile, 5000, 5000, 500, overallGeneList)

    enhancerGeneGFF = utils.locusCollectionToGFF(enhancerGeneCollection)

    # dump the gff to file
    enhancerFolder = utils.getParentFolder(enhancerFile)
    gffRootName = "%s_TSS_ENHANCER_GENES_-5000_+5000" % (genome)
    enhancerGeneGFFFile = "%s%s_%s.gff" % (enhancerFolder, enhancerName,gffRootName)
    utils.unParseTable(enhancerGeneGFF, enhancerGeneGFFFile, '\t')

    # now we need to run bamToGFF

    # Try to use the bamliquidatior_path.py script on cluster, otherwise, failover to local (in path), otherwise fail.
    # NOTE(review): os.path.isfile on the bare script name checks the current
    # working directory, not the executable search path
    bamliquidator_path = '/ark/home/jdm/pipeline/bamliquidator_batch.py'
    if not os.path.isfile(bamliquidator_path):
        bamliquidator_path = 'bamliquidator_batch.py'
        if not os.path.isfile(bamliquidator_path):
            raise ValueError('bamliquidator_batch.py not found in path')

    print('MAPPING SIGNAL AT ENHANCER ASSOCIATED GENE TSS')
    # map density at genes in the +/- 5kb tss region
    # first on the rankBy bam
    bamName = rankByBamFile.split('/')[-1]
    mappedRankByFolder = "%s%s_%s_%s/" % (enhancerFolder, enhancerName,gffRootName, bamName)
    mappedRankByFile = "%s%s_%s_%s/matrix.gff" % (enhancerFolder,enhancerName, gffRootName, bamName)
    cmd = 'python ' + bamliquidator_path + ' --sense . -e 200 --match_bamToGFF -r %s -o %s %s' % (enhancerGeneGFFFile, mappedRankByFolder,rankByBamFile)
    print("Mapping rankby bam %s" % (rankByBamFile))
    print(cmd)

    # the command string is run through the shell; any stdout output is taken
    # as the sign that mapping worked
    outputRank = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
    outputRank = outputRank.communicate()
    if len(outputRank[0]) > 0:  # test if mapping worked correctly
        print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (enhancerGeneGFFFile, rankByBamFile))
    else:
        print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (enhancerGeneGFFFile, rankByBamFile))
        sys.exit()

    # next on the control bam if it exists
    if len(controlBamFile) > 0:
        controlName = controlBamFile.split('/')[-1]
        mappedControlFolder = "%s%s_%s_%s/" % (
            enhancerFolder, enhancerName,gffRootName, controlName)
        mappedControlFile = "%s%s_%s_%s/matrix.gff" % (
            enhancerFolder, enhancerName,gffRootName, controlName)
        cmd = 'python ' + bamliquidator_path + ' --sense . -e 200 --match_bamToGFF -r %s -o %s %s' % (enhancerGeneGFFFile, mappedControlFolder,controlBamFile)
        print("Mapping control bam %s" % (controlBamFile))
        print(cmd)
        outputControl = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
        outputControl = outputControl.communicate()
        if len(outputControl[0]) > 0:  # test if mapping worked correctly
            print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (enhancerGeneGFFFile, controlBamFile))
        else:
            print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (enhancerGeneGFFFile, controlBamFile))
            sys.exit()

    # now get the appropriate output files
    # (makeSignalDict is defined outside this function)
    if len(controlBamFile) > 0:
        print("CHECKING FOR MAPPED OUTPUT AT %s AND %s" %
              (mappedRankByFile, mappedControlFile))
        if utils.checkOutput(mappedRankByFile, 1, 1) and utils.checkOutput(mappedControlFile, 1, 1):
            print('MAKING ENHANCER ASSOCIATED GENE TSS SIGNAL DICTIONARIES')
            signalDict = makeSignalDict(mappedRankByFile, mappedControlFile)
        else:
            print("NO MAPPING OUTPUT DETECTED")
            sys.exit()
    else:
        print("CHECKING FOR MAPPED OUTPUT AT %s" % (mappedRankByFile))
        if utils.checkOutput(mappedRankByFile, 1, 30):
            print('MAKING ENHANCER ASSOCIATED GENE TSS SIGNAL DICTIONARIES')
            signalDict = makeSignalDict(mappedRankByFile)
        else:
            print("NO MAPPING OUTPUT DETECTED")
            sys.exit()

    # use enhancer rank to order
    # each gene is ordered by the smallest rank among its enhancers

    rankOrder = utils.order([min(rankDict[x]) for x in overallGeneList])

    usedNames = []

    # make a new dict to hold TSS signal by max per geneName
    geneNameSigDict = defaultdict(list)
    print('MAKING GENE TABLE')
    for i in rankOrder:
        refID = overallGeneList[i]
        geneName = startDict[refID]['name']
        # with uniqueGenes only the first refID seen for a gene name is kept
        if usedNames.count(geneName) > 0 and uniqueGenes == True:
            continue
        else:
            usedNames.append(geneName)

        proxEnhancers = geneDict['overlapping'][
            refID] + geneDict['proximal'][refID]

        superStatus = max(superDict[refID])
        enhancerRanks = join([str(x) for x in rankDict[refID]], ',')

        enhancerSignal = signalDict[refID]
        geneNameSigDict[geneName].append(enhancerSignal)

        newLine = [geneName, refID, join(
            proxEnhancers, ','), enhancerRanks, superStatus, enhancerSignal]
        geneToEnhancerTable.append(newLine)
    #utils.unParseTable(geneToEnhancerTable,'/grail/projects/newRose/geneMapper/foo.txt','\t')
    print('MAKING ENHANCER TO TOP GENE TABLE')

    if noFormatTable:
        enhancerToTopGeneTable = [
            enhancerToGeneTable[0] + ['TOP_GENE', 'TSS_SIGNAL']]
    else:
        enhancerToTopGeneTable = [enhancerToGeneTable[0][0:12] + [
            'TOP_GENE', 'TSS_SIGNAL'] + enhancerToGeneTable[0][-2:]]

    for line in enhancerToGeneTable[1:]:

        # candidate genes for this enhancer, read back from the row built above
        geneList = []
        if noFormatTable:
            # last three fields are overlap, proximal, closest: this reads
            # overlap and proximal
            geneList += line[-3].split(',')
            geneList += line[-2].split(',')

        else:
            # NOTE(review): for rows built from 9 leading columns, index 9 is
            # OVERLAP_GENES, 10 PROXIMAL_GENES and 11 CLOSEST_GENE, so this
            # reads proximal and closest while the branch above reads overlap
            # and proximal - confirm which is intended
            geneList += line[10].split(',')
            geneList += line[11].split(',')

        geneList = utils.uniquify([x for x in geneList if len(x) > 0])
        if len(geneList) > 0:
            try:
                sigVector = [max(geneNameSigDict[x]) for x in geneList]
                maxIndex = sigVector.index(max(sigVector))
                maxGene = geneList[maxIndex]
                maxSig = sigVector[maxIndex]
                if maxSig == 0.0:
                    maxGene = 'NONE'
                    maxSig = 'NONE'
            except ValueError:
                # max() of an empty list: a gene without any recorded signal
                if len(geneList) == 1:
                    maxGene = geneList[0]
                    maxSig = 'NONE'    
                else:
                    maxGene = 'NONE'
                    maxSig = 'NONE'    
        else:
            maxGene = 'NONE'
            maxSig = 'NONE'
        if noFormatTable:
            newLine = line + [maxGene, maxSig]
        else:
            newLine = line[0:12] + [maxGene, maxSig] + line[-2:]
        enhancerToTopGeneTable.append(newLine)

    # resort enhancerToGeneTable
    if noFormatTable:
        return enhancerToGeneTable, enhancerToTopGeneTable, geneToEnhancerTable
    else:
        # order the data rows by the int value of their second to last field
        # and apply the same order to both enhancer tables; headers stay first
        enhancerOrder = utils.order([int(line[-2])
                                    for line in enhancerToGeneTable[1:]])
        sortedTable = [enhancerToGeneTable[0]]
        sortedTopGeneTable = [enhancerToTopGeneTable[0]]
        for i in enhancerOrder:
            sortedTable.append(enhancerToGeneTable[(i + 1)])
            sortedTopGeneTable.append(enhancerToTopGeneTable[(i + 1)])

        return sortedTable, sortedTopGeneTable, geneToEnhancerTable
Пример #11
0
def main():
    """
    Main run function: split a ChIP-Rx SAM file into reference and spike-in parts.

    Command line flags:
        -i/--input   SAM file to split; the path must end in .sam or .SAM
        -g/--genome  name of the main reference genome
        -s/--spike   name of the spike-in genome
        -d/--dest    optional folder the resulting bams are moved to

    The genome and spike-in names are joined with '_' and upper-cased to look
    up a pair of SAM header files; the split itself is delegated to
    split_sam(). Output files are named by inserting the genome (or spike-in)
    name in front of the input's extension.

    Exits with status 1 if the input does not end in .sam/.SAM or if the
    genome combination is not supported. If any of -i/-g/-s is missing the
    help text is printed and the program exits.
    """

    parser = argparse.ArgumentParser(usage='%(prog)s -i SAMFILE -g REF_GENOME -s SPIKE_GENOME')

    # The three main flags are declared with required=False so that a missing
    # flag falls through to the help text below instead of an argparse error.
    parser.add_argument("-i", "--input", dest="inputSam", type=str,
                        help="Enter a sam file", required=False)
    parser.add_argument("-g", "--genome", dest="genome", type=str,
                        help="specify the main reference genome", required=False)
    parser.add_argument("-s", "--spike", dest="spike", type=str,
                        help="specify the spike in  genome", required=False)

    parser.add_argument("-d", "--dest", dest="dest", type=str,
                        help="specify an optional destination for the final bams to move to", required=False)

    args = parser.parse_args()

    print(args)

    if not (args.inputSam and args.genome and args.spike):
        parser.print_help()
        sys.exit()

    print('FORMATTING %s FOR CHIP_RX USING REFERENCE GENOME %s and SPIKE_IN GENOME %s' % (args.inputSam,args.genome,args.spike))
    samPath = args.inputSam

    # The extension has to be the actual suffix: the output names below are
    # built by splitting off the last four characters of the path.
    if not samPath.upper().endswith('.SAM'):
        print('ERROR, file must end in .sam or .SAM')
        sys.exit(1)

    # get the headers
    genome_string = ('%s_%s' % (args.genome,args.spike)).upper()

    genomeDict = {'RN6_DM6':['/grail/genomes/Rattus_norvegicus/UCSC/rn6/Sequence/Bowtie2Index_dm6/rn6_header.SAM','/grail/genomes/Rattus_norvegicus/UCSC/rn6/Sequence/Bowtie2Index_dm6/dm6_header.SAM'],
                  }

    if genome_string not in genomeDict:
        print('ERROR: UNSUPPORTED GENOME COMBINATION %s' % (genome_string))
        sys.exit(1)
    print('IDENTIFIED HEADERS FOR GENOME COMBINATION %s' %(genome_string))

    header1, header2 = genomeDict[genome_string]
    suffix = ('_%s' % (args.spike)).lower()

    # Insert the genome name in front of the extension. Slicing (rather than a
    # global string replace) guarantees that only the trailing extension is
    # touched even if '.sam' also occurs earlier in the path.
    samRoot = samPath[:-4]
    samExt = samPath[-4:]
    outfile1 = '%s.%s%s' % (samRoot,args.genome,samExt)
    outfile2 = '%s.%s%s' % (samRoot,args.spike,samExt)
    split_sam(suffix,samPath,header1,header2,outfile1,outfile2)

    # move the bams to the destination folder
    if args.dest:
        bamFolder = utils.formatFolder(args.dest,False)

        samFolder = utils.getParentFolder(samPath)

        # the glob is expanded by the shell; presumably samFolder ends in '/'
        # so this matches every *bam* file next to the input sam
        mv_cmd = 'mv %s*bam* %s' % (samFolder,bamFolder)
        print('MOVING BAMS FROM %s TO %s' % (samFolder,bamFolder))
        os.system(mv_cmd)
Пример #12
0
def mapEnhancerToGeneTop(rankByBamFile, controlBamFile, genome, annotFile, enhancerFile, transcribedFile='', uniqueGenes=True, searchWindow=50000, noFormatTable=False):
    '''
    maps genes to enhancers. if uniqueGenes, reduces to gene name only. Otherwise, gives for each refseq

    In addition to the overlap/proximity assignment, signal is mapped at the
    TSS region of every enhancer associated gene (via bamliquidator_batch) and
    each enhancer is assigned the associated gene with the highest TSS signal.

    rankByBamFile: bam passed to `samtools idxstats` and to bamliquidator_batch
    controlBamFile: optional control bam; an empty string disables all control steps
    genome: only used to build the name of the gff that is written
    annotFile: annotation file handed to utils.makeStartDict and
        utils.makeTranscriptCollection
    enhancerFile: tab delimited enhancer table. Lines whose first field starts
        with '#' are skipped and the first remaining line is used as the header.
        For data rows, fields 0-3 are read as ID, chrom, start, stop and the
        last two fields are converted to int as enhancer rank and super status
    transcribedFile: optional tab delimited table; field 1 of each line is used
        as a gene ID. If empty, every key of the start dict is used
    uniqueGenes: if True only the first (best ranked) refseq ID of each gene
        name is written to the gene table
    searchWindow: window passed to utils.makeSearchLocus when looking for
        proximal TSSs
    noFormatTable: if True the input rows are kept whole with the new columns
        appended, and the tables are returned without re-sorting

    returns (enhancerToGeneTable, enhancerToTopGeneTable, geneToEnhancerTable).
    Unless noFormatTable is set, the first two tables are re-ordered by the
    rank column (second to last field).

    Side effects: writes a gff next to enhancerFile, runs samtools and
    bamliquidator_batch through the shell, and calls sys.exit() if the mapped
    output is not detected.
    '''
    startDict = utils.makeStartDict(annotFile)
    # enhancer name = file name up to the first '.'
    enhancerName = enhancerFile.split('/')[-1].split('.')[0]
    enhancerTable = utils.parseTable(enhancerFile, '\t')

    # internal parameter for debugging (not referenced again in this function)
    byRefseq = False

    if len(transcribedFile) > 0:
        transcribedTable = utils.parseTable(transcribedFile, '\t')
        transcribedGenes = [line[1] for line in transcribedTable]
    else:
        transcribedGenes = startDict.keys()

    print('MAKING TRANSCRIPT COLLECTION')
    transcribedCollection = utils.makeTranscriptCollection(
        annotFile, 0, 0, 500, transcribedGenes)

    print('MAKING TSS COLLECTION')
    tssLoci = []
    for geneID in transcribedGenes:
        tssLoci.append(utils.makeTSSLocus(geneID, startDict, 0, 0))

    # this turns the tssLoci list into a LocusCollection
    # 50 is the internal parameter for LocusCollection and doesn't really
    # matter
    tssCollection = utils.LocusCollection(tssLoci, 50)

    # enhancer strings per refseq ID, split by how the gene was associated
    geneDict = {'overlapping': defaultdict(
        list), 'proximal': defaultdict(list)}

    # dictionaries to hold ranks and superstatus of gene nearby enhancers
    rankDict = defaultdict(list)
    superDict = defaultdict(list)

    # list of all genes that appear in this analysis
    overallGeneList = []

    # find the header: the first line whose first field does not start with '#'
    # NOTE(review): header stays undefined if every line starts with '#'
    for line in enhancerTable:
        if line[0][0] == '#':
            continue
        else:
            header = line
            break

    if noFormatTable:
        # set up the output tables
        # first by enhancer: keep every input column and append the gene columns
        enhancerToGeneTable = [
            header + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE']]

    else:
        # set up the output tables
        # first by enhancer: first 9 input columns, gene columns, last 2 input columns
        enhancerToGeneTable = [
            header[0:9] + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE'] + header[-2:]]

        # next by gene
        # NOTE(review): this assignment is dead, it is unconditionally
        # overwritten by the 6 column header right below
        geneToEnhancerTable = [
            ['GENE_NAME', 'REFSEQ_ID', 'PROXIMAL_ENHANCERS']]

    # next make the gene to enhancer table
    geneToEnhancerTable = [
        ['GENE_NAME', 'REFSEQ_ID', 'PROXIMAL_ENHANCERS', 'ENHANCER_RANKS', 'IS_SUPER', 'ENHANCER_SIGNAL']]

    for line in enhancerTable:
        # skip comment lines and any line whose first field starts with 'R'
        # (presumably the header row, e.g. REGION_ID -- confirm against input)
        if line[0][0] == '#' or line[0][0] == 'R':
            continue

        enhancerString = '%s:%s-%s' % (line[1], line[2], line[3])

        enhancerLocus = utils.Locus(line[1], line[2], line[3], '.', line[0])

        # overlapping genes are transcribed genes whose transcript is directly
        # in the stitchedLocus
        overlappingLoci = transcribedCollection.getOverlap(
            enhancerLocus, 'both')
        overlappingGenes = []
        for overlapLocus in overlappingLoci:
            overlappingGenes.append(overlapLocus.ID())

        # proximalGenes are transcribed genes where the tss is within
        # searchWindow (50kb by default) of the boundary of the stitched loci
        proximalLoci = tssCollection.getOverlap(
            utils.makeSearchLocus(enhancerLocus, searchWindow, searchWindow), 'both')
        proximalGenes = []
        for proxLocus in proximalLoci:
            proximalGenes.append(proxLocus.ID())

        # distalGenes use the same TSS lookup with a fixed 1,000,000 window
        distalLoci = tssCollection.getOverlap(
            utils.makeSearchLocus(enhancerLocus, 1000000, 1000000), 'both')
        distalGenes = []
        for proxLocus in distalLoci:
            distalGenes.append(proxLocus.ID())

        overlappingGenes = utils.uniquify(overlappingGenes)
        proximalGenes = utils.uniquify(proximalGenes)
        distalGenes = utils.uniquify(distalGenes)
        # built before the cross-list clean up below, so an ID can appear more
        # than once here; the list is only used to pick the closest gene
        allEnhancerGenes = overlappingGenes + proximalGenes + distalGenes
        # these checks make sure each gene list is unique.
        # technically it is possible for a gene to be overlapping, but not proximal since the
        # gene could be longer than the 50kb window, but we'll let that slide
        # here
        for refID in overlappingGenes:
            if proximalGenes.count(refID) == 1:
                proximalGenes.remove(refID)

        for refID in proximalGenes:
            if distalGenes.count(refID) == 1:
                distalGenes.remove(refID)

        # Now find the closest gene
        if len(allEnhancerGenes) == 0:
            closestGene = ''
        else:
            # get enhancerCenter
            enhancerCenter = (int(line[2]) + int(line[3])) / 2

            # get absolute distance to enhancer center
            distList = [abs(enhancerCenter - startDict[geneID]['start'][0])
                        for geneID in allEnhancerGenes]
            # get the ID and convert to name
            closestGene = startDict[
                allEnhancerGenes[distList.index(min(distList))]]['name']

        # NOW WRITE THE ROW FOR THE ENHANCER TABLE
        # gene columns hold comma joined, uniquified gene names (not refseq IDs)
        if noFormatTable:

            newEnhancerLine = list(line)
            newEnhancerLine.append(
                join(utils.uniquify([startDict[x]['name'] for x in overlappingGenes]), ','))
            newEnhancerLine.append(
                join(utils.uniquify([startDict[x]['name'] for x in proximalGenes]), ','))
            newEnhancerLine.append(closestGene)

        else:
            newEnhancerLine = line[0:9]
            newEnhancerLine.append(
                join(utils.uniquify([startDict[x]['name'] for x in overlappingGenes]), ','))
            newEnhancerLine.append(
                join(utils.uniquify([startDict[x]['name'] for x in proximalGenes]), ','))
            newEnhancerLine.append(closestGene)
            newEnhancerLine += line[-2:]

        enhancerToGeneTable.append(newEnhancerLine)
        # Now grab all overlapping and proximal genes for the gene ordered
        # table (distal genes are not recorded)

        overallGeneList += overlappingGenes
        for refID in overlappingGenes:
            geneDict['overlapping'][refID].append(enhancerString)
            rankDict[refID].append(int(line[-2]))
            superDict[refID].append(int(line[-1]))

        overallGeneList += proximalGenes
        for refID in proximalGenes:
            geneDict['proximal'][refID].append(enhancerString)
            rankDict[refID].append(int(line[-2]))
            superDict[refID].append(int(line[-1]))

    # End loop through
    # Make table by gene
    print('MAKING ENHANCER ASSOCIATED GENE TSS COLLECTION')
    overallGeneList = utils.uniquify(overallGeneList)

    # get the chromLists from the various bams here
    # chrom name = first tab field of each idxstats line; the last two entries
    # of the split output are dropped (presumably the '*' line and the empty
    # string after the final newline -- confirm)
    cmd = 'samtools idxstats %s' % (rankByBamFile)
    idxStats = subprocess.Popen(cmd,stdout=subprocess.PIPE,shell=True)
    idxStats= idxStats.communicate()
    bamChromList = [line.split('\t')[0] for line in idxStats[0].split('\n')[0:-2]]

    if len(controlBamFile) > 0:
        cmd = 'samtools idxstats %s' % (controlBamFile)
        idxStats = subprocess.Popen(cmd,stdout=subprocess.PIPE,shell=True)
        idxStats= idxStats.communicate()
        bamChromListControl = [line.split('\t')[0] for line in idxStats[0].split('\n')[0:-2]]
        # keep only chroms present in both bams
        bamChromList = [chrom for chrom in bamChromList if bamChromListControl.count(chrom) != 0]



    # now make sure no genes have a bad chrom (one that is missing from the bams)
    overallGeneList = [gene for gene in overallGeneList if bamChromList.count(startDict[gene]['chr']) != 0]


    # now make an enhancer collection of all transcripts
    enhancerGeneCollection = utils.makeTranscriptCollection(
        annotFile, 5000, 5000, 500, overallGeneList)

    enhancerGeneGFF = utils.locusCollectionToGFF(enhancerGeneCollection)

    # dump the gff to file
    # enhancerFolder is concatenated directly with file names below, so
    # getParentFolder presumably returns a path ending in '/'
    enhancerFolder = utils.getParentFolder(enhancerFile)
    gffRootName = "%s_TSS_ENHANCER_GENES_-5000_+5000" % (genome)
    enhancerGeneGFFFile = "%s%s_%s.gff" % (enhancerFolder, enhancerName,gffRootName)
    utils.unParseTable(enhancerGeneGFF, enhancerGeneGFFFile, '\t')

    # now we need to run bamToGFF

    # bamliquidator_batch is invoked by bare name through the shell, so it has
    # to be available on the PATH
    bamliquidator_path = 'bamliquidator_batch'


    print('MAPPING SIGNAL AT ENHANCER ASSOCIATED GENE TSS')
    # map density at genes in the +/- 5kb tss region
    # first on the rankBy bam
    bamName = rankByBamFile.split('/')[-1]
    mappedRankByFolder = "%s%s_%s_%s/" % (enhancerFolder, enhancerName,gffRootName, bamName)
    mappedRankByFile = "%s%s_%s_%s/matrix.txt" % (enhancerFolder,enhancerName, gffRootName, bamName)
    cmd = bamliquidator_path + ' --sense . -e 200 --match_bamToGFF -r %s -o %s %s' % (enhancerGeneGFFFile, mappedRankByFolder,rankByBamFile)
    print("Mapping rankby bam %s" % (rankByBamFile))
    print(cmd)
    os.system(cmd)

    # check for completion
    # NOTE(review): the numeric arguments to utils.checkOutput are presumably
    # a wait interval and a timeout -- confirm against utils
    if utils.checkOutput(mappedRankByFile,0.2,5):
        print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (enhancerGeneGFFFile, rankByBamFile))
    else:
        print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (enhancerGeneGFFFile, rankByBamFile))
        sys.exit()

    # next on the control bam if it exists
    if len(controlBamFile) > 0:
        controlName = controlBamFile.split('/')[-1]
        mappedControlFolder = "%s%s_%s_%s/" % (
            enhancerFolder, enhancerName,gffRootName, controlName)
        mappedControlFile = "%s%s_%s_%s/matrix.txt" % (
            enhancerFolder, enhancerName,gffRootName, controlName)
        cmd = bamliquidator_path + ' --sense . -e 200 --match_bamToGFF -r %s -o %s %s' % (enhancerGeneGFFFile, mappedControlFolder,controlBamFile)
        print("Mapping control bam %s" % (controlBamFile))
        print(cmd)
        os.system(cmd)

        # check for completion
        if utils.checkOutput(mappedControlFile,0.2,5):
            print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (enhancerGeneGFFFile, controlBamFile))
        else:
            print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (enhancerGeneGFFFile, controlBamFile))
            sys.exit()

    # now get the appropriate output files
    # signalDict is indexed by refseq ID further down
    if len(controlBamFile) > 0:
        print("CHECKING FOR MAPPED OUTPUT AT %s AND %s" %
              (mappedRankByFile, mappedControlFile))
        if utils.checkOutput(mappedRankByFile, 1, 1) and utils.checkOutput(mappedControlFile, 1, 1):
            print('MAKING ENHANCER ASSOCIATED GENE TSS SIGNAL DICTIONARIES')
            signalDict = makeSignalDict(mappedRankByFile, mappedControlFile)
        else:
            print("NO MAPPING OUTPUT DETECTED")
            sys.exit()
    else:
        print("CHECKING FOR MAPPED OUTPUT AT %s" % (mappedRankByFile))
        if utils.checkOutput(mappedRankByFile, 1, 30):
            print('MAKING ENHANCER ASSOCIATED GENE TSS SIGNAL DICTIONARIES')
            signalDict = makeSignalDict(mappedRankByFile)
        else:
            print("NO MAPPING OUTPUT DETECTED")
            sys.exit()

    # use enhancer rank to order: each gene is keyed by the smallest rank
    # among its associated enhancers

    rankOrder = utils.order([min(rankDict[x]) for x in overallGeneList])

    usedNames = []

    # make a new dict to hold TSS signal by max per geneName
    geneNameSigDict = defaultdict(list)
    print('MAKING GENE TABLE')
    for i in rankOrder:
        refID = overallGeneList[i]
        geneName = startDict[refID]['name']
        # with uniqueGenes only the first refseq ID seen for a gene name is kept
        if usedNames.count(geneName) > 0 and uniqueGenes == True:
            continue
        else:
            usedNames.append(geneName)

        proxEnhancers = geneDict['overlapping'][
            refID] + geneDict['proximal'][refID]

        superStatus = max(superDict[refID])
        enhancerRanks = join([str(x) for x in rankDict[refID]], ',')

        enhancerSignal = signalDict[refID]
        geneNameSigDict[geneName].append(enhancerSignal)

        newLine = [geneName, refID, join(
            proxEnhancers, ','), enhancerRanks, superStatus, enhancerSignal]
        geneToEnhancerTable.append(newLine)
    print('MAKING ENHANCER TO TOP GENE TABLE')

    if noFormatTable:
        enhancerToTopGeneTable = [
            enhancerToGeneTable[0] + ['TOP_GENE', 'TSS_SIGNAL']]
    else:
        enhancerToTopGeneTable = [enhancerToGeneTable[0][0:12] + [
            'TOP_GENE', 'TSS_SIGNAL'] + enhancerToGeneTable[0][-2:]]

    for line in enhancerToGeneTable[1:]:

        # collect the candidate gene names for this enhancer
        geneList = []
        if noFormatTable:
            # row ends with OVERLAP_GENES, PROXIMAL_GENES, CLOSEST_GENE, so
            # these are the overlapping and proximal genes
            geneList += line[-3].split(',')
            geneList += line[-2].split(',')

        else:
            # NOTE(review): for input rows with at least 9 columns the gene
            # columns sit at indices 9, 10 and 11, so this reads
            # PROXIMAL_GENES and CLOSEST_GENE rather than OVERLAP_GENES and
            # PROXIMAL_GENES as in the noFormatTable branch -- confirm which
            # pair is intended
            geneList += line[10].split(',')
            geneList += line[11].split(',')

        geneList = utils.uniquify([x for x in geneList if len(x) > 0])
        if len(geneList) > 0:
            try:
                # geneNameSigDict is a defaultdict(list): a gene name without
                # any recorded signal yields [] and max([]) raises ValueError
                sigVector = [max(geneNameSigDict[x]) for x in geneList]
                maxIndex = sigVector.index(max(sigVector))
                maxGene = geneList[maxIndex]
                maxSig = sigVector[maxIndex]
                # a top signal of zero is reported as no top gene
                if maxSig == 0.0:
                    maxGene = 'NONE'
                    maxSig = 'NONE'
            except ValueError:
                # at least one candidate has no signal: fall back to the only
                # candidate if there is exactly one, otherwise report nothing
                if len(geneList) == 1:
                    maxGene = geneList[0]
                    maxSig = 'NONE'
                else:
                    maxGene = 'NONE'
                    maxSig = 'NONE'
        else:
            maxGene = 'NONE'
            maxSig = 'NONE'
        if noFormatTable:
            newLine = line + [maxGene, maxSig]
        else:
            newLine = line[0:12] + [maxGene, maxSig] + line[-2:]
        enhancerToTopGeneTable.append(newLine)

    # resort enhancerToGeneTable
    if noFormatTable:
        return enhancerToGeneTable, enhancerToTopGeneTable, geneToEnhancerTable
    else:
        # order both enhancer tables by the rank column (second to last field);
        # utils.order presumably returns the indices that sort its input
        enhancerOrder = utils.order([int(line[-2])
                                    for line in enhancerToGeneTable[1:]])
        sortedTable = [enhancerToGeneTable[0]]
        sortedTopGeneTable = [enhancerToTopGeneTable[0]]
        for i in enhancerOrder:
            # + 1 skips the header row
            sortedTable.append(enhancerToGeneTable[(i + 1)])
            sortedTopGeneTable.append(enhancerToTopGeneTable[(i + 1)])

        return sortedTable, sortedTopGeneTable, geneToEnhancerTable
Пример #13
0
def extractGuideFastq(fastqFile, outputFolder='', gzip=False):
    '''
    takes a fastq and extracts candidate guide RNAs

    The fastq is read four lines (one record) at a time. Records whose
    sequence line contains the module level cutSeq are written to
    <outputFolder><fastqRoot>.gecko.fastq with the sequence and quality lines
    trimmed to the guideLength bases starting cutOffset after the position
    where cutSeq was found. Records without cutSeq are dropped. A trailing
    incomplete record (fewer than four lines) is ignored.

    fastqFile: path to the input fastq, opened through utils.open
    outputFolder: folder for the output; defaults to the parent folder of the
        fastq. A missing trailing '/' is added
    gzip: if True, gzip is launched on the output in the background and the
        returned path carries the '.gz' extension

    returns the path of the guide fastq
    '''

    #get the full absolute path for the fastq File
    fastqFile = os.path.abspath(fastqFile)
    fastq = utils.open(fastqFile, 'r')

    ticker = 0
    found = 0
    try:
        #get the fastq name and root
        if len(outputFolder) == 0:
            outputFolder = utils.getParentFolder(fastqFile)
        elif not outputFolder.endswith('/'):
            # the output path is built by plain concatenation below, so the
            # folder needs its trailing separator
            outputFolder += '/'

        #makes sure the output folder exists
        utils.formatFolder(outputFolder, True)

        #grab the name info from the fastq
        fastqName = fastqFile.split('/')[-1]
        fastqRoot = fastqName.replace('.fastq', '').replace('.gz', '')

        #guideFastqFile output
        guideFastqFile = '%s%s.gecko.fastq' % (outputFolder, fastqRoot)
        guideFastq = utils.open(guideFastqFile, 'w')

        try:
            print('processing %s' % (fastqName))
            print('million reads processed:')
            while True:

                if ticker % 1000000 == 0:
                    print(ticker // 1000000)

                #now load the four lines of the next record
                fastqLines = []
                try:
                    for i in range(4):
                        fastqLines.append(next(fastq))
                except StopIteration:
                    break

                ticker += 1

                #see if you can find a cut site
                cutPosition = fastqLines[1].rstrip().find(cutSeq)
                if cutPosition < 0:
                    continue
                found += 1

                guideStart = cutPosition + cutOffset
                guideStop = guideStart + guideLength

                #pulling out the guide seq (and matching qualities) in the fastqLines
                fastqLines[1] = fastqLines[1][guideStart:guideStop] + '\n'
                fastqLines[3] = fastqLines[3][guideStart:guideStop] + '\n'

                for line in fastqLines:
                    guideFastq.write(line)
        finally:
            #close the output even if reading or writing failed
            guideFastq.close()
    finally:
        #the input handle was previously never closed
        fastq.close()

    print('SUMMARY STATISTICS')
    print(ticker)
    print(found)
    #an empty fastq has no reads, so report 0.0 instead of dividing by zero
    if ticker > 0:
        print(float(found) / float(ticker))
    else:
        print(0.0)

    #gzip the fastq
    if gzip:
        #gzip runs in the background ('&'), so the .gz file may not be
        #complete yet when this function returns
        os.system('gzip %s &' % (guideFastqFile))
        guideFastqFile += '.gz'

    return guideFastqFile