def mapper(options, line):
    """Call peaks for the BED record contained in ``line`` and print the result.

    ``line`` is parsed with ``pybedtools.BedTool(..., from_string=True)``; if
    it holds several records, only the last one is used.

    The length handed to ``call_peaks`` is ``stop - start`` when
    ``options.premRNA`` is truthy; otherwise it is the sum of the
    comma-separated sizes stored in field 10 of the record (the exon lengths,
    with no distinction between CDS and non-CDS).

    :param options: object providing ``premRNA``, ``bam``, ``margin``,
        ``FDR_alpha``, ``threshold``, ``minreads``, ``poisson_cutoff``,
        ``plotit`` and ``SloP`` attributes.
    :param line: string holding the BED record(s).
    :raises ValueError: if ``line`` contains no BED record, or if a block size
        is not an integer.
    :rtype: None
    """
    bedtool = pybedtools.BedTool(line, from_string=True)

    # BedTool offers no .next(), so walk the records and keep the last one.
    bedline = None
    for bedline in bedtool:
        pass
    if bedline is None:
        raise ValueError("no BED record found in line: %r" % (line,))

    if options.premRNA:
        length = bedline.stop - bedline.start
    else:
        # Strip an optional trailing comma instead of blindly dropping the
        # last character, which corrupted lists written without one
        # ("100,200" used to be read as "100,20").
        block_sizes = bedline[10].strip().rstrip(",")
        length = sum(int(size) for size in block_sizes.split(","))

    # print(...) with a single argument behaves the same on Python 2 and 3.
    # The literal None, 10, 1000 and False are positional arguments whose
    # meaning is defined by call_peaks (not visible in this block).
    print(call_peaks(
        [bedline.chrom, bedline.name, bedline.start, bedline.stop,
         bedline.strand],
        length,
        None,
        options.bam,
        int(options.margin),
        options.FDR_alpha,
        options.threshold,
        int(options.minreads),
        options.poisson_cutoff,
        options.plotit,
        10,
        1000,
        options.SloP,
        False,
    ))
def mapper(options, line):
    """Call peaks for the BED record contained in ``line`` and print the result.

    ``line`` is parsed with ``pybedtools.BedTool(..., from_string=True)``; if
    it holds several records, only the last one is used.

    The length handed to ``call_peaks`` is ``stop - start`` when
    ``options.premRNA`` is truthy; otherwise it is the sum of the
    comma-separated sizes stored in field 10 of the record (the exon lengths,
    with no distinction between CDS and non-CDS).

    :param options: object providing ``premRNA``, ``bam``, ``max_gap``,
        ``FDR_alpha``, ``threshold``, ``minreads``, ``poisson_cutoff``,
        ``plotit`` and ``SloP`` attributes.
    :param line: string holding the BED record(s).
    :raises ValueError: if ``line`` contains no BED record, or if a block size
        is not an integer.
    :rtype: None
    """
    bedtool = pybedtools.BedTool(line, from_string=True)

    # BedTool offers no .next(), so walk the records and keep the last one.
    bedline = None
    for bedline in bedtool:
        pass
    if bedline is None:
        raise ValueError("no BED record found in line: %r" % (line,))

    if options.premRNA:
        length = bedline.stop - bedline.start
    else:
        # Strip an optional trailing comma instead of blindly dropping the
        # last character, which corrupted lists written without one
        # ("100,200" used to be read as "100,20").
        block_sizes = bedline[10].strip().rstrip(",")
        length = sum(int(size) for size in block_sizes.split(","))

    # print(...) with a single argument behaves the same on Python 2 and 3.
    # The literal 10, 1000 and False are positional arguments whose meaning
    # is defined by call_peaks (not visible in this block).
    print(call_peaks(
        [bedline.chrom, bedline.name, bedline.start, bedline.stop,
         bedline.strand],
        length,
        options.bam,
        int(options.max_gap),
        options.FDR_alpha,
        options.threshold,
        int(options.minreads),
        options.poisson_cutoff,
        options.plotit,
        10,
        1000,
        options.SloP,
        False,
    ))
def main(options):
    """Call peaks on every selected gene and write the filtered peaks to disk.

    Steps, in order:

    1. ``check_for_index`` is run on ``options.bam`` and the path is made
       absolute.
    2. The gene BedTool is built (from ``options.gtfFile`` if set, otherwise
       from ``options.species``), optionally restricted to ``options.gene``
       and randomly subsampled down to ``options.maxgenes``.
    3. ``call_peaks`` runs once per gene, serially when ``options.debug`` is
       truthy, otherwise through a ``multiprocessing.Pool``. A gene whose job
       exceeds ``options.timeout`` is logged and skipped.
    4. The per-gene results are passed to ``filter_peaks_dicts`` and the
       returned text is written to ``<outfileF>.tsv`` and, sorted, to
       ``outfileF``. The unfiltered per-gene results are pickled to
       ``<outfileF>.pickle`` when ``options.save_pickle is True``.

    :param options: parsed command-line options object.
    :raises IOError: if ``options.bam`` does not exist.
    :rtype: None
    """
    check_for_index(options.bam)

    bamfile = options.bam
    if not os.path.exists(bamfile):
        logging.error("Bam file: %s is not defined" % (bamfile))
        raise IOError("Bam file: %s is not defined" % (bamfile))
    # re-set to include the full path to bamfile
    bamfile = os.path.abspath(bamfile)
    logging.info("bam file is set to %s\n" % (bamfile))

    if options.gtfFile:
        # TODO always False - no longer an option
        bedtool = build_transcript_data_gtf(
            pybedtools.BedTool(options.gtfFile), options.premRNA)
    else:
        bedtool = build_transcript_data_gtf_as_structure(
            options.species, options.premRNA)
    # Keep the saved copy; the result of saveas() used to be thrown away.
    bedtool = bedtool.saveas()

    # gets a bedtool of all genes to call peaks on
    if options.gene:
        # filter() yields a streaming BedTool; save it so that the len() call
        # below does not exhaust it before the intervals are used.
        bedtool = bedtool.filter(
            lambda x: x.attrs['gene_id'] in options.gene).saveas()

    # truncates for max bedtool
    if options.maxgenes:
        gene_count = len(bedtool)
        logging.info(" number of genes before maxing : {}".format(gene_count))
        logging.info(" max genes from user input: {}".format(options.maxgenes))
        # Convert before comparing: a string value cannot be ordered against
        # an int on Python 3.
        max_genes = int(options.maxgenes)
        if max_genes < gene_count:
            bedtool = bedtool.random_subset(max_genes)
        else:
            logging.info(
                " number of genes <= max genes from user , not truncating genes"
            )

    exons = get_exon_bed(options.species)
    bedtool = bedtool.saveas()

    tasks = [
        (
            bedtool_interval,
            bedtool_interval.attrs['effective_length'],
            bamfile,
            options.max_gap,
            options.FDR_alpha,
            options.threshold,
            options.binom,
            options.method,
            options.minreads,
            options.poisson_cutoff,
            options.plotit,
            10,
            1000,
            options.SloP,
            options.max_width,
            options.min_width,
            options.algorithm,  # TODO options.algorithm now always "spline" !
            options.reverse_strand,
            exons,
        )
        for bedtool_interval in bedtool
    ]

    # The pool is created only after the inputs are validated, so that a
    # missing BAM file no longer leaves idle worker processes behind.
    if options.np == 'autodetect':
        options.np = multiprocessing.cpu_count()
    pool = multiprocessing.Pool(int(options.np))

    # generate list of all peaks_dict's, (one peaks_dict per gene)
    peaks_dicts = []
    try:
        if options.debug:
            peaks_dicts = [call_peaks(*task) for task in tasks]
        else:
            jobs = [pool.apply_async(call_peaks, task) for task in tasks]
            for job, task in zip(jobs, tasks):
                try:
                    peaks_dicts.append(job.get(timeout=options.timeout))
                except multiprocessing.TimeoutError:
                    print()
                    logging.error(
                        "gene %s timed out after %s minutes on bedinterval: %s"
                        % (task[0].attrs['gene_id'], options.timeout / 60,
                           task[0]))
    finally:
        # Close the pool even when peak calling raises.
        pool.close()

    logging.info("finished call_peaks on all genes")

    logging.info(" starting adding up transcriptome-wise reads")
    transcriptome_reads = count_transcriptome_reads(peaks_dicts)
    transcriptome_size = count_transcriptome_length(peaks_dicts)
    logging.info(" transcriptome size in bases: {}".format(transcriptome_size))
    logging.info(
        " transcriptome total number of reads: {}".format(transcriptome_reads))

    filtered_peak_bedtool_tsv = filter_peaks_dicts(
        peaks_dicts,
        options.poisson_cutoff,
        transcriptome_size,
        transcriptome_reads,
        options.use_global_cutoff,
        options.bonferroni_correct,
        options.algorithm,
        options.SloP,
        options.min_width,
        bypassfiltering=False)

    ###############
    # writing files
    ###############
    outbedF = options.outfileF

    # writing tsv files
    with open(outbedF + ".tsv", 'w') as tsvfile:
        tsvfile.write(filtered_peak_bedtool_tsv)

    # writing bed files
    pybedtools.BedTool(
        filtered_peak_bedtool_tsv,
        from_string=True).sort(stream=True).saveas(outbedF)

    # writing pickle files
    if options.save_pickle is True:
        # pickle writes bytes, so the file must be opened in binary mode.
        # TODO Can't pickle save after filtering ? as we have a tsv now, not a peaks_dicts list !?
        with open(outbedF + ".pickle", 'wb') as picklefile:
            pickle.dump(peaks_dicts, picklefile)
def func_star(varables):
    """Convert f([1,2]) to f(1,2).

    Unpacks ``varables`` and forwards its items to ``call_peaks`` as
    positional arguments, returning whatever ``call_peaks`` returns.
    """
    positional_args = varables
    return call_peaks(*positional_args)
def main(options):
    """Run the whole pipeline.

    Steps, in order:

    1. ``check_for_index`` is run on ``options.bam`` and the path is made
       absolute.
    2. The gene BedTool is built from ``options.species``, optionally
       restricted to ``options.gene`` (gene ids are compared without their
       ``.suffix``) and randomly subsampled down to ``options.maxgenes``.
    3. ``call_peaks`` runs once per gene, serially when ``options.debug`` is
       truthy, otherwise through a ``multiprocessing.Pool``. A gene whose job
       exceeds ``options.timeout`` is logged and skipped.
    4. The per-gene results are passed to ``filter_peaks_dicts``; when it
       returns a string, that text is sorted and saved to
       ``options.outfileF``. The unfiltered per-gene results are pickled to
       ``<outfileF>.pickle`` when ``options.save_pickle is True``.

    :param options: parsed command-line options object.
    :raises IOError: if ``options.bam`` does not exist.
    :raises Warning: if no gene is left to call peaks on.
    :rtype: None
    """
    ############ CHECKING FILE STATUS ############
    check_for_index(options.bam)

    bamfile = options.bam
    if not os.path.exists(bamfile):
        logging.error("Bam file: %s is not defined" % (bamfile))
        raise IOError("Bam file: %s is not defined" % (bamfile))
    # re-set to include the full path to bamfile
    bamfile = os.path.abspath(bamfile)
    logging.info("bam file is set to %s\n" % (bamfile))

    ########### PREPARE GENE LENGTH ################
    # NOTE: options.gtfFile is no longer consulted; the transcript data is
    # always built from options.species.
    bedtool = build_transcript_data_gtf_as_structure(
        options.species, options.premRNA).saveas()

    # gets a bedtool of all genes to call peaks on
    if options.gene:
        # saveas() turns the streaming result of filter() into a reusable
        # BedTool, so the len() calls below do not exhaust it.
        bedtool = bedtool.filter(
            lambda x: x.attrs['gene_id'].split('.')[0] in options.gene
        ).saveas()

    # truncates for max bedtool
    if options.maxgenes:
        gene_count = len(bedtool)
        logging.info(" number of genes before maxing : {}".format(gene_count))
        logging.info(" max genes from user input: {}".format(options.maxgenes))
        # Convert before comparing: a string value cannot be ordered against
        # an int on Python 3.
        max_genes = int(options.maxgenes)
        if max_genes < gene_count:
            bedtool = bedtool.random_subset(max_genes).saveas()
        else:
            logging.info(" number of genes <= max genes from user , not truncating genes")

    if len(bedtool) == 0:
        raise Warning('Bedtool length is 0; check gene id')

    exons = get_exon_bed(options.species)

    ############### PREPARE MULTIPROCESSING ##############################
    if options.np == 'autodetect':
        options.np = multiprocessing.cpu_count()
    pool = multiprocessing.Pool(int(options.np))

    # generate list of all peaks_dict's, (one peaks_dict per gene)
    peaks_dicts = []
    try:
        tasks = [
            (
                bedtool_interval,
                bedtool_interval.attrs['effective_length'],
                bamfile,
                options.max_gap,
                options.FDR_alpha,
                options.threshold,
                options.binom,
                options.method,
                options.minreads,
                options.poisson_cutoff,
                options.plotit,
                10,
                1000,
                options.SloP,
                options.max_width,
                options.min_width,
                options.algorithm,  # TODO options.algorithm now always "spline" !
                options.reverse_strand,
                exons,
            )
            for bedtool_interval in bedtool
        ]
        logging.info('Total tasks: {}'.format(len(tasks)))

        ############## CALL PEAKS BY HEIGHT AND CURVE##########################
        if options.debug:
            peaks_dicts = [call_peaks(*task) for task in tasks]
        else:
            jobs = [pool.apply_async(call_peaks, task) for task in tasks]
            for job, task in zip(jobs, tasks):
                try:
                    peaks_dicts.append(job.get(timeout=options.timeout))
                except multiprocessing.TimeoutError:
                    print()
                    logging.error(
                        "gene %s timed out after %s minutes on bedinterval: %s"
                        % (task[0].attrs['gene_id'], options.timeout / 60,
                           task[0]))
    finally:
        # Close the pool even when building tasks or peak calling raises.
        pool.close()

    logging.info("finished call_peaks on all genes")

    ################### FILTER PEAK BY READ #################################
    logging.info(" starting adding up transcriptome-wise reads")
    transcriptome_reads = count_transcriptome_reads(peaks_dicts)
    transcriptome_size = count_transcriptome_length(peaks_dicts)
    logging.info(" transcriptome size in bases: {}".format(transcriptome_size))
    logging.info(" transcriptome total number of reads: {}".format(transcriptome_reads))

    filtered_peak_bedtool_tsv = filter_peaks_dicts(
        peaks_dicts,
        options.poisson_cutoff,
        transcriptome_size,
        transcriptome_reads,
        options.use_global_cutoff,
        options.bonferroni_correct,
        options.algorithm,
        options.SloP,
        options.min_width,
        bypassfiltering=False)

    ############### WRITE TO FILE #####################################
    # The BED file is only written when the filter step returned text.
    if isinstance(filtered_peak_bedtool_tsv, str):
        pybedtools.BedTool(
            filtered_peak_bedtool_tsv,
            from_string=True).sort(stream=True).saveas(options.outfileF)

    if options.save_pickle is True:
        # pickle writes bytes, so the file must be opened in binary mode.
        # TODO Can't pickle save after filtering ? as we have a tsv now, not a peaks_dicts list !?
        with open(options.outfileF + ".pickle", 'wb') as f:
            pickle.dump(peaks_dicts, f)