# Python 2 pipeline snippets; utils and pipeline_dfci are the pipeline's own helper modules,
# and pipeline_dir (used below) is expected to be a module-level path to the pipeline scripts folder.
import os
import sys
import string
import subprocess
import argparse
from collections import defaultdict
from string import join

import utils
import pipeline_dfci


def callRiger(rigerTableFile, scoring='KSbyScore', output='', callRiger=True):
    '''
    calls RIGER using the KS scoring metric (default)
    '''
    rigerDirectory = '/raider/temp/riger/'

    rigerTableAbsFile = os.path.abspath(rigerTableFile)
    outputFolder = utils.getParentFolder(rigerTableAbsFile)
    if len(output) == 0:
        output = string.replace(rigerTableAbsFile, '_friger.txt', '_friger_%s_out.txt' % (scoring))

    rigerBashFileName = string.replace(rigerTableAbsFile, '_friger.txt', '_callRiger.sh')
    rigerBashFile = open(rigerBashFileName, 'w')
    rigerBashFile.write('#!/usr/bin/bash\n')
    rigerBashFile.write('cd %s\n\n' % (rigerDirectory))

    rigerCmd = 'java -cp commons-cli-1.2.jar:rigerj-1.6.2.jar org.broadinstitute.rnai.rigerj.RigerJMain -scoringMethod %s -inputFile %s -outputFile %s' % (scoring, rigerTableAbsFile, output)

    rigerBashFile.write(rigerCmd)
    rigerBashFile.write('\n')
    rigerBashFile.close()

    print("WROTE RIGER CMD TO %s" % (rigerBashFileName))

    if callRiger == True:
        print("Calling RIGER with %s scoring method" % (scoring))
        print("RIGER CMD: %s" % (rigerCmd))
        #run through bash since the script is written without execute permission
        os.system('bash %s' % (rigerBashFileName))

    return rigerBashFileName
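# Example usage (a minimal sketch; the _friger.txt path below is hypothetical --
# callRiger expects a RIGER-formatted table whose name ends in '_friger.txt'):
#
#   rigerBash = callRiger('/path/to/screen1_friger.txt', scoring='KSbyScore', callRiger=False)
#   # writes /path/to/screen1_callRiger.sh; set callRiger=True to also execute it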
def makeNameDict(dataFile, roseFolder, namesList=[]):
    '''
    for each name, check for the presence of an enriched file or allEnhancer table
    these are the files required for enhancer clustering
    '''
    dataDict = pipeline_dfci.loadDataTable(dataFile)

    #draw the parent folder from the dataFile
    parentFolder = utils.getParentFolder(dataFile)

    #check to see if a rose folder exists already
    if utils.formatFolder(roseFolder, False):
        roseExists = True
        roseFolder = utils.formatFolder(roseFolder, False)
    else:
        roseExists = False
        roseFolder = utils.formatFolder(roseFolder, True)

    #check namesList to see if datasets exist
    if len(namesList) == 0:
        namesList = [name for name in dataDict.keys() if dataDict[name]['background'] != 'NONE']
        #this filters out control WCE datasets

    #now check that all of the datasets at a minimum have a rose output OR enriched region file
    nameDict = {}
    for name in namesList:
        nameDict[name] = {}

        #assumes standard folder structure for enriched file
        enrichedFile = "%smacsEnriched/%s" % (parentFolder, dataDict[name]['enrichedMacs'])
        print enrichedFile
        try:
            foo = open(enrichedFile, 'r')
            foo.close()
            nameDict[name]['enrichedFile'] = enrichedFile
        except IOError:
            nameDict[name]['enrichedFile'] = ''

        #roseOutput looks for standard format rose output
        #need an allEnhancers table and a region table to proceed
        #if the rose folder doesn't exist, don't bother
        if roseExists:
            roseOutputFiles = os.listdir("%s%s_ROSE" % (roseFolder, name))
            allEnhancerFileList = [x for x in roseOutputFiles if x.count("AllEnhancers.table.txt") == 1 and x[0] != '.']  #no weird hidden or temp files
            if len(allEnhancerFileList) > 0:
                nameDict[name]['enhancerFile'] = "%s%s_ROSE/%s" % (roseFolder, name, allEnhancerFileList[0])
            else:
                nameDict[name]['enhancerFile'] = ''
        else:
            #mark the enhancer file as missing so the check below doesn't raise a KeyError
            nameDict[name]['enhancerFile'] = ''

        if nameDict[name]['enhancerFile'] == '' and nameDict[name]['enrichedFile'] == '':
            print "INSUFFICIENT DATA TO RUN ENHANCER ANALYSIS ON %s. PLEASE MAKE SURE ROSE OUTPUT OR MACS ENRICHED REGION PEAKS FILE EXISTS" % (name)
            sys.exit()

    return nameDict
def launchDynamicRose(analysis_name, group1_name, group2_name, group1_list, group2_list, meta_rose_folder, dynamic_rose_folder, genome, data_file, activity_path, inputGFF):
    '''
    launches dynamic meta rose on the two meta rose outputs
    '''
    project_folder = utils.formatFolder(os.path.abspath(utils.getParentFolder(data_file)), False)
    dataDict = pipeline_dfci.loadDataTable(data_file)

    genome_build = genome.name()

    meta_rose_output_1 = utils.formatFolder(meta_rose_folder + group1_name, True)
    meta_rose_output_2 = utils.formatFolder(meta_rose_folder + group2_name, True)
    meta_rose_string = ','.join([meta_rose_output_1, meta_rose_output_2])

    #setting the output
    dynamic_rose_folder = utils.formatFolder(dynamic_rose_folder, True)

    group1_string = ','.join(group1_list)
    group2_string = ','.join(group2_list)

    dynamic_cmd = 'python %sdynamicEnhancer_meta.py -g %s -d %s -r %s -o %s --group1 %s --group2 %s --name1 %s --name2 %s -p -m' % (pipeline_dir, genome_build, data_file, meta_rose_string, dynamic_rose_folder, group1_string, group2_string, group1_name, group2_name)

    if len(inputGFF) > 0:
        dynamic_cmd += ' --input %s' % (inputGFF)

    bash_path = '%s%s_dynamic_meta.sh' % (dynamic_rose_folder, analysis_name)
    bash_file = open(bash_path, 'w')
    bash_file.write('#!/usr/bin/bash\n\n')
    bash_file.write('cd %s\n\n' % (pipeline_dir))
    bash_file.write(dynamic_cmd)
    bash_file.write('\n\n')
    bash_file.close()

    print('Wrote DYNAMIC_META command for %s to %s' % (analysis_name, bash_path))
    print('Launching DYNAMIC_META_ROSE')
    os.system('bash %s' % (bash_path))
def launchMetaRose(group_name, group_list, meta_rose_folder, genome, data_file, stitch, tss):
    '''
    launches meta rose
    '''
    project_folder = utils.formatFolder(os.path.abspath(utils.getParentFolder(data_file)), False)
    macs_folder = '%smacsEnriched/' % (project_folder)  #quick hack to make sure input beds are in the right place
    dataDict = pipeline_dfci.loadDataTable(data_file)

    meta_rose_output = utils.formatFolder(meta_rose_folder + group_name, True)

    genome_build = genome.name()

    input_string = ','.join(['%s%s' % (macs_folder, dataDict[name]['enrichedMacs']) for name in group_list])
    bam_string = ','.join([dataDict[name]['bam'] for name in group_list])

    meta_cmd = 'python %sROSE2_META.py -g %s -i %s -r %s -o %s -n %s -t %s' % (pipeline_dir, genome_build, input_string, bam_string, meta_rose_output, group_name, tss)

    if stitch != None:
        meta_cmd += ' -s %s' % (stitch)

    #adding a mask if necessary
    if genome.hasFeature('mask'):
        meta_cmd += ' --mask %s' % (genome.returnFeature('mask'))

    bash_path = '%s%s_meta_rose.sh' % (meta_rose_output, group_name)
    bash_file = open(bash_path, 'w')
    bash_file.write('#!/usr/bin/bash\n\n')
    bash_file.write('cd %s\n\n' % (pipeline_dir))
    bash_file.write(meta_cmd)
    bash_file.write('\n\n')
    bash_file.close()

    print('Wrote META_ROSE command for %s to %s' % (group_name, bash_path))
    print('Launching META_ROSE')
    os.system('bash %s' % (bash_path))
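# Example usage of the two launch functions together (a sketch; the group names, dataset
# names, and folders are hypothetical, `data_file` is assumed to be a pipeline data table
# path, and `genome` is assumed to be the pipeline's genome object exposing name()/hasFeature()):
#
#   group1_list = ['MM1S_H3K27AC_1', 'MM1S_H3K27AC_2']
#   group2_list = ['U87_H3K27AC_1', 'U87_H3K27AC_2']
#   launchMetaRose('MM1S_H3K27AC', group1_list, './metaRose/', genome, data_file, stitch=None, tss=2500)
#   launchMetaRose('U87_H3K27AC', group2_list, './metaRose/', genome, data_file, stitch=None, tss=2500)
#   launchDynamicRose('MM1S_vs_U87', 'MM1S_H3K27AC', 'U87_H3K27AC', group1_list, group2_list,
#                     './metaRose/', './dynamicRose/', genome, data_file, activity_path='', inputGFF='')
#
# launchDynamicRose reads the per-group META_ROSE outputs written under meta_rose_folder,
# so the same meta_rose_folder and group names must be used in both calls.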
def main():
    """
    main run function
    """
    #usage = "usage: %prog [options] -g [GENOME] -b [SORTED BAMFILE(S)] -i [INPUTFILE] -o [OUTPUTFOLDER]"
    parser = argparse.ArgumentParser(
        usage='%(prog)s -i SAMFILE -g REF_GENOME -s SPIKE_GENOME')

    # required flags
    parser.add_argument("-i", "--input", dest="inputSam", type=str,
                        help="Enter a sam file", required=False)
    parser.add_argument("-g", "--genome", dest="genome", type=str,
                        help="specify the main reference genome", required=False)
    parser.add_argument("-s", "--spike", dest="spike", type=str,
                        help="specify the spike in genome", required=False)
    parser.add_argument("-d", "--dest", dest="dest", type=str,
                        help="specify an optional destination for the final bams to move to",
                        required=False)

    args = parser.parse_args()
    print(args)

    if args.inputSam and args.genome and args.spike:
        print('FORMATTING %s FOR CHIP_RX USING REFERENCE GENOME %s and SPIKE_IN GENOME %s' % (args.inputSam, args.genome, args.spike))
        samPath = args.inputSam
        if string.upper(samPath).count('.SAM') == 0:
            print('ERROR, file must end in .sam or .SAM')
            sys.exit()

        #get the headers
        genome_string = string.upper('%s_%s' % (args.genome, args.spike))

        genomeDict = {
            'RN6_DM6': ['/grail/genomes/Rattus_norvegicus/UCSC/rn6/Sequence/Bowtie2Index_dm6/rn6_header.SAM',
                        '/grail/genomes/Rattus_norvegicus/UCSC/rn6/Sequence/Bowtie2Index_dm6/dm6_header.SAM'],
            }

        if genomeDict.has_key(genome_string) == False:
            print('ERROR: UNSUPPORTED GENOME COMBINATION %s' % (genome_string))
            sys.exit()
        else:
            print('IDENTIFIED HEADERS FOR GENOME COMBINATION %s' % (genome_string))
            header1 = genomeDict[genome_string][0]
            header2 = genomeDict[genome_string][1]

        suffix = string.lower('_%s' % (args.spike))
        outfile1 = string.replace(samPath, samPath[-4:], '.%s%s' % (args.genome, samPath[-4:]))
        outfile2 = string.replace(samPath, samPath[-4:], '.%s%s' % (args.spike, samPath[-4:]))

        split_sam(suffix, samPath, header1, header2, outfile1, outfile2)

        #move stuff to destination folder
        if args.dest:
            bamFolder = utils.formatFolder(args.dest, False)
            samFolder = utils.getParentFolder(samPath)
            mv_cmd = 'mv %s*bam* %s' % (samFolder, bamFolder)
            print('MOVING BAMS FROM %s TO %s' % (samFolder, bamFolder))
            os.system(mv_cmd)

    else:
        parser.print_help()
        sys.exit()
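# Example invocation (a sketch; the script name and .sam path are placeholders, and
# rn6/dm6 is the only genome/spike combination wired into genomeDict above):
#
#   python thisScript.py -i /path/to/SAMPLE.sam -g rn6 -s dm6 -d /path/to/finalBams/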
def makeNameDict(dataFile, roseFolder, namesList=[], enhancerType='super'):
    '''
    for each name, check for the presence of an enriched file or allEnhancer table
    these are the files required for enhancer clustering
    '''
    dataDict = pipeline_dfci.loadDataTable(dataFile)

    #draw the parent folder from the dataFile
    parentFolder = utils.getParentFolder(dataFile)
    print "Using %s as the parent folder" % (parentFolder)

    #check to see if a rose folder exists already
    if utils.formatFolder(roseFolder, False):
        roseExists = True
        roseFolder = utils.formatFolder(roseFolder, False)
    else:
        roseExists = False
        roseFolder = utils.formatFolder(roseFolder, True)

    #check namesList to see if datasets exist
    if len(namesList) == 0:
        namesList = [name for name in dataDict.keys() if string.upper(name).count('WCE') == 0 and string.upper(name).count('INPUT') == 0]
        #if no namesList is given, this filters out WCE

    #now check that all of the datasets at a minimum have a rose output OR enriched region file
    nameDict = {}
    for name in namesList:
        nameDict[name] = {}

        #check if each dataset has a background
        backgroundName = dataDict[name]['background']
        if dataDict.has_key(backgroundName):
            nameDict[name]['background'] = True
        else:
            nameDict[name]['background'] = False

        #assumes standard folder structure for enriched file
        enrichedFile = "%smacsEnriched/%s" % (parentFolder, dataDict[name]['enrichedMacs'])
        print "Looking for macs output at %s" % (enrichedFile)
        try:
            foo = open(enrichedFile, 'r')
            foo.close()
            nameDict[name]['enrichedFile'] = enrichedFile
        except IOError:
            nameDict[name]['enrichedFile'] = ''

        #roseOutput looks for standard format rose output
        #need an allEnhancers table and a region table to proceed
        #if the rose folder doesn't exist, don't bother
        if roseExists:
            try:
                roseOutputFiles = os.listdir("%s%s_ROSE" % (roseFolder, name))
                if enhancerType == 'super':
                    enhancerString = 'AllEnhancers.table.txt'
                if enhancerType == 'stretch':
                    enhancerString = 'AllEnhancers_Length.table.txt'
                if enhancerType == 'superstretch':
                    enhancerString = 'AllEnhancers_SuperStretch.table.txt'

                allEnhancerFileList = [x for x in roseOutputFiles if x.count(enhancerString) == 1 and x[0] != '.']  #no weird hidden or temp files
                if len(allEnhancerFileList) > 0:
                    nameDict[name]['enhancerFile'] = "%s%s_ROSE/%s" % (roseFolder, name, allEnhancerFileList[0])
                else:
                    nameDict[name]['enhancerFile'] = ''
            except OSError:
                nameDict[name]['enhancerFile'] = ''
        else:
            nameDict[name]['enhancerFile'] = ''

        if nameDict[name]['enhancerFile'] == '' and nameDict[name]['enrichedFile'] == '':
            print "INSUFFICIENT DATA TO RUN ENHANCER ANALYSIS ON %s. PLEASE MAKE SURE ROSE OUTPUT OR MACS ENRICHED REGION PEAKS FILE EXISTS" % (name)
            print nameDict[name]
            sys.exit()

    return nameDict
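# Example usage (a sketch; the data table and ROSE folder paths are hypothetical):
#
#   nameDict = makeNameDict('/path/to/project_data_table.txt', '/path/to/rose/',
#                           namesList=[], enhancerType='super')
#   # nameDict[name] then holds 'background', 'enrichedFile', and 'enhancerFile' entries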
def extractGuideFastq(fastqFile, outputFolder='', gzip=False):
    '''
    takes a fastq and extracts candidate guide RNAs
    '''
    #cutSeq, cutOffset, and guideLength are module-level constants defined elsewhere in this script

    #get the full absolute path for the fastq File
    fastqFile = os.path.abspath(fastqFile)
    fastq = utils.open(fastqFile, 'r')

    #get the fastq name and root
    if len(outputFolder) == 0:
        outputFolder = utils.getParentFolder(fastqFile)

    #makes sure the output folder exists
    utils.formatFolder(outputFolder, True)

    #grab the name info from the fastq
    fastqName = fastqFile.split('/')[-1]
    fastqRoot = string.replace(fastqName, '.fastq', '')
    fastqRoot = string.replace(fastqRoot, '.gz', '')

    #guideFastqFile output
    guideFastqFile = '%s%s.gecko.fastq' % (outputFolder, fastqRoot)
    guideFastq = utils.open(guideFastqFile, 'w')

    print('processing %s' % (fastqName))
    print('million reads processed:')
    ticker = 0
    found = 0
    while True:
        if ticker % 1000000 == 0:
            print(ticker / 1000000)

        fastqLines = []
        #now load the fastq lines
        try:
            for i in range(4):
                fastqLines.append(fastq.next())
        except StopIteration:
            break

        #see if you can find a cut site
        seq = fastqLines[1].rstrip()
        try:
            cutPosition = seq.index(cutSeq)
            found += 1
        except ValueError:
            ticker += 1
            continue

        guideStart = cutPosition + cutOffset
        guideStop = guideStart + guideLength

        #pulling out the guide seq in the fastqLines
        fastqLines[1] = fastqLines[1][guideStart:guideStop] + '\n'
        fastqLines[3] = fastqLines[3][guideStart:guideStop] + '\n'

        for line in fastqLines:
            guideFastq.write(line)

        ticker += 1
        # if ticker == 100000:
        #     print(ticker)
        #     print(found)
        #     print(float(found)/float(ticker))
        #     break

    print('SUMMARY STATISTICS')
    print(ticker)
    print(found)
    print(float(found) / float(ticker))

    #close the fastq
    guideFastq.close()

    #gzip the fastq
    if gzip:
        os.system('gzip %s &' % (guideFastqFile))
        guideFastqFile += '.gz'

    return guideFastqFile
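# Example usage (a sketch; the fastq path is hypothetical, and cutSeq/cutOffset/guideLength
# must already be defined at module level for the extraction to work):
#
#   guideFastq = extractGuideFastq('/path/to/screen1.fastq.gz', outputFolder='', gzip=True)
#   # writes /path/to/screen1.gecko.fastq.gz containing only the trimmed guide sequences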
def mapEnhancerToGeneTop(rankByBamFile, controlBamFile, genome, annotFile, enhancerFile, transcribedFile='', uniqueGenes=True, searchWindow=50000, noFormatTable=False):
    '''
    maps genes to enhancers. if uniqueGenes, reduces to gene name only. Otherwise, gives for each refseq
    '''
    startDict = utils.makeStartDict(annotFile)
    enhancerName = enhancerFile.split('/')[-1].split('.')[0]
    enhancerTable = utils.parseTable(enhancerFile, '\t')

    # internal parameter for debugging
    byRefseq = False

    if len(transcribedFile) > 0:
        transcribedTable = utils.parseTable(transcribedFile, '\t')
        transcribedGenes = [line[1] for line in transcribedTable]
    else:
        transcribedGenes = startDict.keys()

    print('MAKING TRANSCRIPT COLLECTION')
    transcribedCollection = utils.makeTranscriptCollection(
        annotFile, 0, 0, 500, transcribedGenes)

    print('MAKING TSS COLLECTION')
    tssLoci = []
    for geneID in transcribedGenes:
        tssLoci.append(utils.makeTSSLocus(geneID, startDict, 0, 0))

    # this turns the tssLoci list into a LocusCollection
    # 50 is the internal parameter for LocusCollection and doesn't really
    # matter
    tssCollection = utils.LocusCollection(tssLoci, 50)

    geneDict = {'overlapping': defaultdict(list), 'proximal': defaultdict(list)}

    # dictionaries to hold ranks and superstatus of gene nearby enhancers
    rankDict = defaultdict(list)
    superDict = defaultdict(list)

    # list of all genes that appear in this analysis
    overallGeneList = []

    # find the damn header
    for line in enhancerTable:
        if line[0][0] == '#':
            continue
        else:
            header = line
            break

    if noFormatTable:
        # set up the output tables
        # first by enhancer
        enhancerToGeneTable = [
            header + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE']]

    else:
        # set up the output tables
        # first by enhancer
        enhancerToGeneTable = [
            header[0:9] + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE'] + header[-2:]]

        # next by gene
        geneToEnhancerTable = [
            ['GENE_NAME', 'REFSEQ_ID', 'PROXIMAL_ENHANCERS']]

    # next make the gene to enhancer table
    geneToEnhancerTable = [
        ['GENE_NAME', 'REFSEQ_ID', 'PROXIMAL_ENHANCERS', 'ENHANCER_RANKS', 'IS_SUPER', 'ENHANCER_SIGNAL']]

    for line in enhancerTable:
        if line[0][0] == '#' or line[0][0] == 'R':
            continue

        enhancerString = '%s:%s-%s' % (line[1], line[2], line[3])

        enhancerLocus = utils.Locus(line[1], line[2], line[3], '.', line[0])

        # overlapping genes are transcribed genes whose transcript is directly
        # in the stitchedLocus
        overlappingLoci = transcribedCollection.getOverlap(
            enhancerLocus, 'both')
        overlappingGenes = []
        for overlapLocus in overlappingLoci:
            overlappingGenes.append(overlapLocus.ID())

        # proximalGenes are transcribed genes where the tss is within 50kb of
        # the boundary of the stitched loci
        proximalLoci = tssCollection.getOverlap(
            utils.makeSearchLocus(enhancerLocus, searchWindow, searchWindow), 'both')
        proximalGenes = []
        for proxLocus in proximalLoci:
            proximalGenes.append(proxLocus.ID())

        distalLoci = tssCollection.getOverlap(
            utils.makeSearchLocus(enhancerLocus, 1000000, 1000000), 'both')
        distalGenes = []
        for proxLocus in distalLoci:
            distalGenes.append(proxLocus.ID())

        overlappingGenes = utils.uniquify(overlappingGenes)
        proximalGenes = utils.uniquify(proximalGenes)
        distalGenes = utils.uniquify(distalGenes)
        allEnhancerGenes = overlappingGenes + proximalGenes + distalGenes

        # these checks make sure each gene list is unique.
        # technically it is possible for a gene to be overlapping, but not proximal since the
        # gene could be longer than the 50kb window, but we'll let that slide here
        for refID in overlappingGenes:
            if proximalGenes.count(refID) == 1:
                proximalGenes.remove(refID)

        for refID in proximalGenes:
            if distalGenes.count(refID) == 1:
                distalGenes.remove(refID)

        # Now find the closest gene
        if len(allEnhancerGenes) == 0:
            closestGene = ''
        else:
            # get enhancerCenter
            enhancerCenter = (int(line[2]) + int(line[3])) / 2

            # get absolute distance to enhancer center
            distList = [abs(enhancerCenter - startDict[geneID]['start'][0])
                        for geneID in allEnhancerGenes]
            # get the ID and convert to name
            closestGene = startDict[
                allEnhancerGenes[distList.index(min(distList))]]['name']

        # NOW WRITE THE ROW FOR THE ENHANCER TABLE
        if noFormatTable:
            newEnhancerLine = list(line)
            newEnhancerLine.append(
                join(utils.uniquify([startDict[x]['name'] for x in overlappingGenes]), ','))
            newEnhancerLine.append(
                join(utils.uniquify([startDict[x]['name'] for x in proximalGenes]), ','))
            newEnhancerLine.append(closestGene)

        else:
            newEnhancerLine = line[0:9]
            newEnhancerLine.append(
                join(utils.uniquify([startDict[x]['name'] for x in overlappingGenes]), ','))
            newEnhancerLine.append(
                join(utils.uniquify([startDict[x]['name'] for x in proximalGenes]), ','))
            newEnhancerLine.append(closestGene)
            newEnhancerLine += line[-2:]

        enhancerToGeneTable.append(newEnhancerLine)

        # Now grab all overlapping and proximal genes for the gene ordered
        # table
        overallGeneList += overlappingGenes
        for refID in overlappingGenes:
            geneDict['overlapping'][refID].append(enhancerString)
            rankDict[refID].append(int(line[-2]))
            superDict[refID].append(int(line[-1]))

        overallGeneList += proximalGenes
        for refID in proximalGenes:
            geneDict['proximal'][refID].append(enhancerString)
            rankDict[refID].append(int(line[-2]))
            superDict[refID].append(int(line[-1]))

    # End loop through
    # Make table by gene
    print('MAKING ENHANCER ASSOCIATED GENE TSS COLLECTION')
    overallGeneList = utils.uniquify(overallGeneList)

    enhancerGeneCollection = utils.makeTranscriptCollection(
        annotFile, 5000, 5000, 500, overallGeneList)

    enhancerGeneGFF = utils.locusCollectionToGFF(enhancerGeneCollection)

    # dump the gff to file
    enhancerFolder = utils.getParentFolder(enhancerFile)
    gffRootName = "%s_TSS_ENHANCER_GENES_-5000_+5000" % (genome)
    enhancerGeneGFFFile = "%s%s_%s.gff" % (enhancerFolder, enhancerName, gffRootName)
    utils.unParseTable(enhancerGeneGFF, enhancerGeneGFFFile, '\t')

    # now we need to run bamToGFF

    # Try to use the bamliquidatior_path.py script on cluster, otherwise, failover to local (in path), otherwise fail.
    bamliquidator_path = '/ark/home/jdm/pipeline/bamliquidator_batch.py'
    if not os.path.isfile(bamliquidator_path):
        bamliquidator_path = 'bamliquidator_batch.py'
        if not os.path.isfile(bamliquidator_path):
            raise ValueError('bamliquidator_batch.py not found in path')

    print('MAPPING SIGNAL AT ENHANCER ASSOCIATED GENE TSS')
    # map density at genes in the +/- 5kb tss region
    # first on the rankBy bam
    bamName = rankByBamFile.split('/')[-1]
    mappedRankByFolder = "%s%s_%s_%s/" % (enhancerFolder, enhancerName, gffRootName, bamName)
    mappedRankByFile = "%s%s_%s_%s/matrix.gff" % (enhancerFolder, enhancerName, gffRootName, bamName)
    cmd = 'python ' + bamliquidator_path + ' --sense . -e 200 --match_bamToGFF -r %s -o %s %s' % (enhancerGeneGFFFile, mappedRankByFolder, rankByBamFile)
    print("Mapping rankby bam %s" % (rankByBamFile))
    print(cmd)
    outputRank = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
    outputRank = outputRank.communicate()
    if len(outputRank[0]) > 0:  # test if mapping worked correctly
        print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (enhancerGeneGFFFile, rankByBamFile))
    else:
        print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (enhancerGeneGFFFile, rankByBamFile))
        sys.exit()

    # next on the control bam if it exists
    if len(controlBamFile) > 0:
        controlName = controlBamFile.split('/')[-1]
        mappedControlFolder = "%s%s_%s_%s/" % (
            enhancerFolder, enhancerName, gffRootName, controlName)
        mappedControlFile = "%s%s_%s_%s/matrix.gff" % (
            enhancerFolder, enhancerName, gffRootName, controlName)
        cmd = 'python ' + bamliquidator_path + ' --sense . -e 200 --match_bamToGFF -r %s -o %s %s' % (enhancerGeneGFFFile, mappedControlFolder, controlBamFile)
        print("Mapping control bam %s" % (controlBamFile))
        print(cmd)
        outputControl = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
        outputControl = outputControl.communicate()
        if len(outputControl[0]) > 0:  # test if mapping worked correctly
            print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (enhancerGeneGFFFile, controlBamFile))
        else:
            print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (enhancerGeneGFFFile, controlBamFile))
            sys.exit()

    # now get the appropriate output files
    if len(controlBamFile) > 0:
        print("CHECKING FOR MAPPED OUTPUT AT %s AND %s" % (mappedRankByFile, mappedControlFile))
        if utils.checkOutput(mappedRankByFile, 1, 1) and utils.checkOutput(mappedControlFile, 1, 1):
            print('MAKING ENHANCER ASSOCIATED GENE TSS SIGNAL DICTIONARIES')
            signalDict = makeSignalDict(mappedRankByFile, mappedControlFile)
        else:
            print("NO MAPPING OUTPUT DETECTED")
            sys.exit()
    else:
        print("CHECKING FOR MAPPED OUTPUT AT %s" % (mappedRankByFile))
        if utils.checkOutput(mappedRankByFile, 1, 30):
            print('MAKING ENHANCER ASSOCIATED GENE TSS SIGNAL DICTIONARIES')
            signalDict = makeSignalDict(mappedRankByFile)
        else:
            print("NO MAPPING OUTPUT DETECTED")
            sys.exit()

    # use enhancer rank to order
    rankOrder = utils.order([min(rankDict[x]) for x in overallGeneList])

    usedNames = []

    # make a new dict to hold TSS signal by max per geneName
    geneNameSigDict = defaultdict(list)
    print('MAKING GENE TABLE')
    for i in rankOrder:
        refID = overallGeneList[i]
        geneName = startDict[refID]['name']
        if usedNames.count(geneName) > 0 and uniqueGenes == True:
            continue
        else:
            usedNames.append(geneName)

        proxEnhancers = geneDict['overlapping'][refID] + geneDict['proximal'][refID]

        superStatus = max(superDict[refID])
        enhancerRanks = join([str(x) for x in rankDict[refID]], ',')

        enhancerSignal = signalDict[refID]
        geneNameSigDict[geneName].append(enhancerSignal)

        newLine = [geneName, refID, join(proxEnhancers, ','), enhancerRanks, superStatus, enhancerSignal]
        geneToEnhancerTable.append(newLine)

    #utils.unParseTable(geneToEnhancerTable,'/grail/projects/newRose/geneMapper/foo.txt','\t')
    print('MAKING ENHANCER TO TOP GENE TABLE')

    if noFormatTable:
        enhancerToTopGeneTable = [
            enhancerToGeneTable[0] + ['TOP_GENE', 'TSS_SIGNAL']]
    else:
        enhancerToTopGeneTable = [enhancerToGeneTable[0][0:12] + [
            'TOP_GENE', 'TSS_SIGNAL'] + enhancerToGeneTable[0][-2:]]

    for line in enhancerToGeneTable[1:]:
        geneList = []
        if noFormatTable:
            geneList += line[-3].split(',')
            geneList += line[-2].split(',')
        else:
            geneList += line[10].split(',')
            geneList += line[11].split(',')

        geneList = utils.uniquify([x for x in geneList if len(x) > 0])

        if len(geneList) > 0:
            try:
                sigVector = [max(geneNameSigDict[x]) for x in geneList]
                maxIndex = sigVector.index(max(sigVector))
                maxGene = geneList[maxIndex]
                maxSig = sigVector[maxIndex]
                if maxSig == 0.0:
                    maxGene = 'NONE'
                    maxSig = 'NONE'
            except ValueError:
                if len(geneList) == 1:
                    maxGene = geneList[0]
                    maxSig = 'NONE'
                else:
                    maxGene = 'NONE'
                    maxSig = 'NONE'
        else:
            maxGene = 'NONE'
            maxSig = 'NONE'

        if noFormatTable:
            newLine = line + [maxGene, maxSig]
        else:
            newLine = line[0:12] + [maxGene, maxSig] + line[-2:]
        enhancerToTopGeneTable.append(newLine)

    # resort enhancerToGeneTable
    if noFormatTable:
        return enhancerToGeneTable, enhancerToTopGeneTable, geneToEnhancerTable
    else:
        enhancerOrder = utils.order([int(line[-2]) for line in enhancerToGeneTable[1:]])
        sortedTable = [enhancerToGeneTable[0]]
        sortedTopGeneTable = [enhancerToTopGeneTable[0]]
        for i in enhancerOrder:
            sortedTable.append(enhancerToGeneTable[(i + 1)])
            sortedTopGeneTable.append(enhancerToTopGeneTable[(i + 1)])

        return sortedTable, sortedTopGeneTable, geneToEnhancerTable
def mapEnhancerToGeneTop(rankByBamFile, controlBamFile, genome, annotFile, enhancerFile, transcribedFile='', uniqueGenes=True, searchWindow=50000, noFormatTable=False):
    '''
    maps genes to enhancers. if uniqueGenes, reduces to gene name only. Otherwise, gives for each refseq
    '''
    startDict = utils.makeStartDict(annotFile)
    enhancerName = enhancerFile.split('/')[-1].split('.')[0]
    enhancerTable = utils.parseTable(enhancerFile, '\t')

    # internal parameter for debugging
    byRefseq = False

    if len(transcribedFile) > 0:
        transcribedTable = utils.parseTable(transcribedFile, '\t')
        transcribedGenes = [line[1] for line in transcribedTable]
    else:
        transcribedGenes = startDict.keys()

    print('MAKING TRANSCRIPT COLLECTION')
    transcribedCollection = utils.makeTranscriptCollection(
        annotFile, 0, 0, 500, transcribedGenes)

    print('MAKING TSS COLLECTION')
    tssLoci = []
    for geneID in transcribedGenes:
        tssLoci.append(utils.makeTSSLocus(geneID, startDict, 0, 0))

    # this turns the tssLoci list into a LocusCollection
    # 50 is the internal parameter for LocusCollection and doesn't really
    # matter
    tssCollection = utils.LocusCollection(tssLoci, 50)

    geneDict = {'overlapping': defaultdict(list), 'proximal': defaultdict(list)}

    # dictionaries to hold ranks and superstatus of gene nearby enhancers
    rankDict = defaultdict(list)
    superDict = defaultdict(list)

    # list of all genes that appear in this analysis
    overallGeneList = []

    # find the damn header
    for line in enhancerTable:
        if line[0][0] == '#':
            continue
        else:
            header = line
            break

    if noFormatTable:
        # set up the output tables
        # first by enhancer
        enhancerToGeneTable = [
            header + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE']]

    else:
        # set up the output tables
        # first by enhancer
        enhancerToGeneTable = [
            header[0:9] + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE'] + header[-2:]]

        # next by gene
        geneToEnhancerTable = [
            ['GENE_NAME', 'REFSEQ_ID', 'PROXIMAL_ENHANCERS']]

    # next make the gene to enhancer table
    geneToEnhancerTable = [
        ['GENE_NAME', 'REFSEQ_ID', 'PROXIMAL_ENHANCERS', 'ENHANCER_RANKS', 'IS_SUPER', 'ENHANCER_SIGNAL']]

    for line in enhancerTable:
        if line[0][0] == '#' or line[0][0] == 'R':
            continue

        enhancerString = '%s:%s-%s' % (line[1], line[2], line[3])

        enhancerLocus = utils.Locus(line[1], line[2], line[3], '.', line[0])

        # overlapping genes are transcribed genes whose transcript is directly
        # in the stitchedLocus
        overlappingLoci = transcribedCollection.getOverlap(
            enhancerLocus, 'both')
        overlappingGenes = []
        for overlapLocus in overlappingLoci:
            overlappingGenes.append(overlapLocus.ID())

        # proximalGenes are transcribed genes where the tss is within 50kb of
        # the boundary of the stitched loci
        proximalLoci = tssCollection.getOverlap(
            utils.makeSearchLocus(enhancerLocus, searchWindow, searchWindow), 'both')
        proximalGenes = []
        for proxLocus in proximalLoci:
            proximalGenes.append(proxLocus.ID())

        distalLoci = tssCollection.getOverlap(
            utils.makeSearchLocus(enhancerLocus, 1000000, 1000000), 'both')
        distalGenes = []
        for proxLocus in distalLoci:
            distalGenes.append(proxLocus.ID())

        overlappingGenes = utils.uniquify(overlappingGenes)
        proximalGenes = utils.uniquify(proximalGenes)
        distalGenes = utils.uniquify(distalGenes)
        allEnhancerGenes = overlappingGenes + proximalGenes + distalGenes

        # these checks make sure each gene list is unique.
        # technically it is possible for a gene to be overlapping, but not proximal since the
        # gene could be longer than the 50kb window, but we'll let that slide here
        for refID in overlappingGenes:
            if proximalGenes.count(refID) == 1:
                proximalGenes.remove(refID)

        for refID in proximalGenes:
            if distalGenes.count(refID) == 1:
                distalGenes.remove(refID)

        # Now find the closest gene
        if len(allEnhancerGenes) == 0:
            closestGene = ''
        else:
            # get enhancerCenter
            enhancerCenter = (int(line[2]) + int(line[3])) / 2

            # get absolute distance to enhancer center
            distList = [abs(enhancerCenter - startDict[geneID]['start'][0])
                        for geneID in allEnhancerGenes]
            # get the ID and convert to name
            closestGene = startDict[
                allEnhancerGenes[distList.index(min(distList))]]['name']

        # NOW WRITE THE ROW FOR THE ENHANCER TABLE
        if noFormatTable:
            newEnhancerLine = list(line)
            newEnhancerLine.append(
                join(utils.uniquify([startDict[x]['name'] for x in overlappingGenes]), ','))
            newEnhancerLine.append(
                join(utils.uniquify([startDict[x]['name'] for x in proximalGenes]), ','))
            newEnhancerLine.append(closestGene)

        else:
            newEnhancerLine = line[0:9]
            newEnhancerLine.append(
                join(utils.uniquify([startDict[x]['name'] for x in overlappingGenes]), ','))
            newEnhancerLine.append(
                join(utils.uniquify([startDict[x]['name'] for x in proximalGenes]), ','))
            newEnhancerLine.append(closestGene)
            newEnhancerLine += line[-2:]

        enhancerToGeneTable.append(newEnhancerLine)

        # Now grab all overlapping and proximal genes for the gene ordered
        # table
        overallGeneList += overlappingGenes
        for refID in overlappingGenes:
            geneDict['overlapping'][refID].append(enhancerString)
            rankDict[refID].append(int(line[-2]))
            superDict[refID].append(int(line[-1]))

        overallGeneList += proximalGenes
        for refID in proximalGenes:
            geneDict['proximal'][refID].append(enhancerString)
            rankDict[refID].append(int(line[-2]))
            superDict[refID].append(int(line[-1]))

    # End loop through
    # Make table by gene
    print('MAKING ENHANCER ASSOCIATED GENE TSS COLLECTION')
    overallGeneList = utils.uniquify(overallGeneList)

    #get the chromLists from the various bams here
    cmd = 'samtools idxstats %s' % (rankByBamFile)
    idxStats = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
    idxStats = idxStats.communicate()
    bamChromList = [line.split('\t')[0] for line in idxStats[0].split('\n')[0:-2]]

    if len(controlBamFile) > 0:
        cmd = 'samtools idxstats %s' % (controlBamFile)
        idxStats = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
        idxStats = idxStats.communicate()
        bamChromListControl = [line.split('\t')[0] for line in idxStats[0].split('\n')[0:-2]]
        bamChromList = [chrom for chrom in bamChromList if bamChromListControl.count(chrom) != 0]

    #now make sure no genes have a bad chrom
    overallGeneList = [gene for gene in overallGeneList if bamChromList.count(startDict[gene]['chr']) != 0]

    #now make an enhancer collection of all transcripts
    enhancerGeneCollection = utils.makeTranscriptCollection(
        annotFile, 5000, 5000, 500, overallGeneList)

    enhancerGeneGFF = utils.locusCollectionToGFF(enhancerGeneCollection)

    # dump the gff to file
    enhancerFolder = utils.getParentFolder(enhancerFile)
    gffRootName = "%s_TSS_ENHANCER_GENES_-5000_+5000" % (genome)
    enhancerGeneGFFFile = "%s%s_%s.gff" % (enhancerFolder, enhancerName, gffRootName)
    utils.unParseTable(enhancerGeneGFF, enhancerGeneGFFFile, '\t')

    # now we need to run bamToGFF

    # Try to use the bamliquidatior_path.py script on cluster, otherwise, failover to local (in path), otherwise fail.
    bamliquidator_path = 'bamliquidator_batch'

    print('MAPPING SIGNAL AT ENHANCER ASSOCIATED GENE TSS')
    # map density at genes in the +/- 5kb tss region
    # first on the rankBy bam
    bamName = rankByBamFile.split('/')[-1]
    mappedRankByFolder = "%s%s_%s_%s/" % (enhancerFolder, enhancerName, gffRootName, bamName)
    mappedRankByFile = "%s%s_%s_%s/matrix.txt" % (enhancerFolder, enhancerName, gffRootName, bamName)
    cmd = bamliquidator_path + ' --sense . -e 200 --match_bamToGFF -r %s -o %s %s' % (enhancerGeneGFFFile, mappedRankByFolder, rankByBamFile)
    print("Mapping rankby bam %s" % (rankByBamFile))
    print(cmd)
    os.system(cmd)

    #check for completion
    if utils.checkOutput(mappedRankByFile, 0.2, 5):
        print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (enhancerGeneGFFFile, rankByBamFile))
    else:
        print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (enhancerGeneGFFFile, rankByBamFile))
        sys.exit()

    # next on the control bam if it exists
    if len(controlBamFile) > 0:
        controlName = controlBamFile.split('/')[-1]
        mappedControlFolder = "%s%s_%s_%s/" % (
            enhancerFolder, enhancerName, gffRootName, controlName)
        mappedControlFile = "%s%s_%s_%s/matrix.txt" % (
            enhancerFolder, enhancerName, gffRootName, controlName)
        cmd = bamliquidator_path + ' --sense . -e 200 --match_bamToGFF -r %s -o %s %s' % (enhancerGeneGFFFile, mappedControlFolder, controlBamFile)
        print("Mapping control bam %s" % (controlBamFile))
        print(cmd)
        os.system(cmd)

        #check for completion
        if utils.checkOutput(mappedControlFile, 0.2, 5):
            print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (enhancerGeneGFFFile, controlBamFile))
        else:
            print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (enhancerGeneGFFFile, controlBamFile))
            sys.exit()

    # now get the appropriate output files
    if len(controlBamFile) > 0:
        print("CHECKING FOR MAPPED OUTPUT AT %s AND %s" % (mappedRankByFile, mappedControlFile))
        if utils.checkOutput(mappedRankByFile, 1, 1) and utils.checkOutput(mappedControlFile, 1, 1):
            print('MAKING ENHANCER ASSOCIATED GENE TSS SIGNAL DICTIONARIES')
            signalDict = makeSignalDict(mappedRankByFile, mappedControlFile)
        else:
            print("NO MAPPING OUTPUT DETECTED")
            sys.exit()
    else:
        print("CHECKING FOR MAPPED OUTPUT AT %s" % (mappedRankByFile))
        if utils.checkOutput(mappedRankByFile, 1, 30):
            print('MAKING ENHANCER ASSOCIATED GENE TSS SIGNAL DICTIONARIES')
            signalDict = makeSignalDict(mappedRankByFile)
        else:
            print("NO MAPPING OUTPUT DETECTED")
            sys.exit()

    # use enhancer rank to order
    rankOrder = utils.order([min(rankDict[x]) for x in overallGeneList])

    usedNames = []

    # make a new dict to hold TSS signal by max per geneName
    geneNameSigDict = defaultdict(list)
    print('MAKING GENE TABLE')
    for i in rankOrder:
        refID = overallGeneList[i]
        geneName = startDict[refID]['name']
        if usedNames.count(geneName) > 0 and uniqueGenes == True:
            continue
        else:
            usedNames.append(geneName)

        proxEnhancers = geneDict['overlapping'][refID] + geneDict['proximal'][refID]

        superStatus = max(superDict[refID])
        enhancerRanks = join([str(x) for x in rankDict[refID]], ',')

        enhancerSignal = signalDict[refID]
        geneNameSigDict[geneName].append(enhancerSignal)

        newLine = [geneName, refID, join(proxEnhancers, ','), enhancerRanks, superStatus, enhancerSignal]
        geneToEnhancerTable.append(newLine)

    #utils.unParseTable(geneToEnhancerTable,'/grail/projects/newRose/geneMapper/foo.txt','\t')
    print('MAKING ENHANCER TO TOP GENE TABLE')

    if noFormatTable:
        enhancerToTopGeneTable = [
            enhancerToGeneTable[0] + ['TOP_GENE', 'TSS_SIGNAL']]
    else:
        enhancerToTopGeneTable = [enhancerToGeneTable[0][0:12] + [
            'TOP_GENE', 'TSS_SIGNAL'] + enhancerToGeneTable[0][-2:]]

    for line in enhancerToGeneTable[1:]:
        geneList = []
        if noFormatTable:
            geneList += line[-3].split(',')
            geneList += line[-2].split(',')
        else:
            geneList += line[10].split(',')
            geneList += line[11].split(',')

        geneList = utils.uniquify([x for x in geneList if len(x) > 0])

        if len(geneList) > 0:
            try:
                sigVector = [max(geneNameSigDict[x]) for x in geneList]
                maxIndex = sigVector.index(max(sigVector))
                maxGene = geneList[maxIndex]
                maxSig = sigVector[maxIndex]
                if maxSig == 0.0:
                    maxGene = 'NONE'
                    maxSig = 'NONE'
            except ValueError:
                if len(geneList) == 1:
                    maxGene = geneList[0]
                    maxSig = 'NONE'
                else:
                    maxGene = 'NONE'
                    maxSig = 'NONE'
        else:
            maxGene = 'NONE'
            maxSig = 'NONE'

        if noFormatTable:
            newLine = line + [maxGene, maxSig]
        else:
            newLine = line[0:12] + [maxGene, maxSig] + line[-2:]
        enhancerToTopGeneTable.append(newLine)

    # resort enhancerToGeneTable
    if noFormatTable:
        return enhancerToGeneTable, enhancerToTopGeneTable, geneToEnhancerTable
    else:
        enhancerOrder = utils.order([int(line[-2]) for line in enhancerToGeneTable[1:]])
        sortedTable = [enhancerToGeneTable[0]]
        sortedTopGeneTable = [enhancerToTopGeneTable[0]]
        for i in enhancerOrder:
            sortedTable.append(enhancerToGeneTable[(i + 1)])
            sortedTopGeneTable.append(enhancerToTopGeneTable[(i + 1)])

        return sortedTable, sortedTopGeneTable, geneToEnhancerTable
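# Example usage (a sketch; all paths are hypothetical -- annotFile is a refseq annotation
# table and enhancerFile is a ROSE AllEnhancers table):
#
#   enhancerToGene, enhancerToTopGene, geneToEnhancer = mapEnhancerToGeneTop(
#       '/path/to/H3K27AC.bam', '/path/to/WCE.bam', 'HG19',
#       '/path/to/hg19_refseq.ucsc', '/path/to/SAMPLE_AllEnhancers.table.txt',
#       transcribedFile='', uniqueGenes=True, searchWindow=50000, noFormatTable=False)
#   # each return value is a table (list of lists) ready for utils.unParseTable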