def sRNAbench(reads, outpath, file_name, num_threads, species, Debug):
    """
    Build and run the sRNAbench command for a single fastq file.

    :param reads: List of fastq files; the process exits if it holds more than one.
    :param outpath: Output folder given to sRNAbench; ``sRNAbench.log`` is written here.
    :param file_name: Sample name (not used by this function).
    :param num_threads: Number of threads (not used by this function).
    :param species: Value passed to the ``microRNA=`` option.
    :param Debug: True/False for debugging messages.

    :returns: Whatever ``functions.system_call_functions.system_call`` returns.
    """
    jar_file = set_config.get_exe("sRNAbench", Debug=Debug)

    ## sRNAbench.jar is linked in bin/ and sRNAtoolboxDB is expected in that same folder
    jar_folder = os.path.dirname(jar_file)
    db_folder = os.path.abspath(os.path.join(jar_folder, 'sRNAtoolboxDB'))
    log_path = os.path.join(outpath, 'sRNAbench.log')

    ## only one input file is supported
    if len(reads) > 1:
        error_text = "** ERROR: Only 1 fastq file is allowed please joined reads before..."
        print(colored(error_text, 'red'))
        exit()

    ## assemble the command from its fragments
    java_exe = set_config.get_exe('java', Debug=Debug)
    fragments = [
        '%s -jar %s dbPath=%s input=%s output=%s' %
        (java_exe, jar_file, db_folder, reads[0], outpath),
        ' microRNA=%s isoMiR=true plotLibs=true graphics=true' % species,
        ' plotMiR=true bedGraphMode=true writeGenomeDist=true',
        ' chromosomeLevel=true chrMappingByLength=true > ',
        log_path,
    ]
    cmd = ''.join(fragments)

    return functions.system_call_functions.system_call(cmd)
def miraligner(reads, outpath, file_name, database, species, Debug):
    """
    Build and run the miraligner command for a single fastq file.

    The reads are first converted to a tabular file
    (``<file_name>-tab.freq.txt`` inside ``outpath``) which is then used
    as the miraligner input.

    :param reads: List of fastq files; the process exits if it holds more than one.
    :param outpath: Output folder; ``miraligner.log`` is written here.
    :param file_name: Sample name used to name the output files.
    :param database: Value passed to the ``-db`` option.
    :param species: Value passed to the ``-s`` option.
    :param Debug: True/False for debugging messages.

    :returns: Whatever ``functions.system_call_functions.system_call`` returns.
    """
    jar_file = set_config.get_exe("miraligner", Debug=Debug)
    log_path = os.path.join(outpath, 'miraligner.log')
    output_prefix = os.path.join(outpath, file_name)

    ## only one input file is supported
    if len(reads) > 1:
        error_text = "** ERROR: Only 1 fastq file is allowed please joined reads before..."
        print(colored(error_text, 'red'))
        exit()

    ## create tabular information of reads
    freq_table = os.path.join(outpath, file_name + '-tab.freq.txt')
    fasta_functions.reads2tabular(reads[0], freq_table)

    ## create command
    java_exe = set_config.get_exe('java', Debug=Debug)
    template = '%s -jar %s -db %s -sub 1 -add 3 -trim 3 -s %s -i %s -o %s 2> %s'
    cmd = template % (java_exe, jar_file, database, species, freq_table,
                      output_prefix, log_path)

    return functions.system_call_functions.system_call(cmd)
def multiQC_call(pathFile, name, folder, option):
    """
    Build and run the multiQC_ report generation command.

    :param pathFile: File containing list of files to include in report.
    :param name: Name to include in the html report.
    :param folder: Absolute path for the output folder.
    :param option: Options to provide to multiQC call.

    :type pathFile: string
    :type name: string
    :type folder: string
    :type option: string

    :returns: :func:`XICRA.scripts.functions.system_call_functions.system_call` output (OK/FALSE)

    .. seealso:: This function depends on other XICRA functions called:

        - :func:`XICRA.scripts.functions.system_call_functions.system_call`
    """
    multiqc_bin = set_config.get_exe("multiqc")

    ## --force: if a report was previously generated in the folder,
    ## delete it and generate a new one
    template = ("%s --force -o %s -n %s -l %s -p -i 'MultiQC report' "
                "-b 'HTML report generated for multiple samples and steps' %s")
    cmd = template % (multiqc_bin, folder, name, pathFile, option)

    return functions.system_call_functions.system_call(cmd)
def print_dependencies():
    """
    Print a table with the executable path and version of each dependency.

    Dependencies are taken from ``read_dependencies()``; for each one the
    executable is looked up with ``set_config.get_exe`` and its version
    with ``get_version``. The resulting table is printed to stdout.

    Note: this changes the global pandas display options
    ``display.max_colwidth`` and ``display.max_columns``.
    """
    progs = {}
    for prog in read_dependencies():
        prog_exe = set_config.get_exe(prog)
        progs[prog] = [prog_exe, get_version(prog, prog_exe)]

    df_programs = pd.DataFrame.from_dict(progs,
                                         orient='index',
                                         columns=('Executable path',
                                                  'Version'))

    ## strip leading whitespace from every cell
    df_programs = df_programs.stack().str.lstrip().unstack()

    ## None disables truncation. The former value -1 has been deprecated
    ## since pandas 1.0 and is rejected by current pandas releases.
    pd.set_option('display.max_colwidth', None)
    pd.set_option('display.max_columns', None)
    print(df_programs)
def fastqjoin_caller(list_reads, sample_folder, name, threads, perc_diff,
                     Debug):
    """
    Call fastqjoin for a sample unless a previous run already succeeded.

    A ``.success`` stamp file inside ``sample_folder`` marks a previous
    successful run; when present only an informative message is printed.
    Otherwise ``fastqjoin`` is called and the stamp is written when it
    returns a true value.

    :param list_reads: Reads passed on to ``fastqjoin``.
    :param sample_folder: Sample output folder (holds the stamp file).
    :param name: Sample name.
    :param threads: Number of threads passed on to ``fastqjoin``.
    :param perc_diff: Passed on to ``fastqjoin``.
    :param Debug: True/False for debugging messages.
    """
    stamp_file = sample_folder + '/.success'

    ## previously joined and succeeded: just report it
    if os.path.isfile(stamp_file):
        previous = functions.time_functions.read_time_stamp(stamp_file)
        message = "\tA previous command generated results on: %s [%s -- %s]" % (
            previous, name, 'fastqjoin')
        print(colored(message, 'yellow'))
        return

    ## Call fastqjoin
    exe = set_config.get_exe('fastqjoin')
    succeeded = fastqjoin(exe, list_reads, sample_folder, name, threads,
                          perc_diff, Debug)
    if not succeeded:
        print('** Sample %s failed...' % name)
        return

    functions.time_functions.print_time_stamp(stamp_file)
def cutadapt_caller(list_reads, sample_folder, name, threads, Debug, adapters,
                    extra):
    """
    Call cutadapt for a sample unless a previous run already succeeded.

    A ``.success`` stamp file inside ``sample_folder`` marks a previous
    successful run; when present only an informative message is printed.
    Otherwise ``cutadapt`` is called and the stamp is written when it
    returns a true value.

    :param list_reads: Reads passed on to ``cutadapt``.
    :param sample_folder: Sample output folder (holds the stamp file).
    :param name: Sample name.
    :param threads: Number of threads passed on to ``cutadapt``.
    :param Debug: True/False for debugging messages.
    :param adapters: Passed on to ``cutadapt``.
    :param extra: Passed on to ``cutadapt``.
    """
    stamp_file = sample_folder + '/.success'
    already_done = os.path.isfile(stamp_file)

    if not already_done:
        ## Call cutadapt
        exe = set_config.get_exe('cutadapt')
        trimmed_ok = cutadapt(exe, list_reads, sample_folder, name, threads,
                              Debug, adapters, extra)
        if trimmed_ok:
            functions.time_functions.print_time_stamp(stamp_file)
        else:
            print('** Sample %s failed...' % name)
        return

    ## previously trimmed and succeeded: just report it
    previous = functions.time_functions.read_time_stamp(stamp_file)
    message = "\tA previous command generated results on: %s [%s -- %s]" % (
        previous, name, 'cutadapt')
    print(colored(message, 'yellow'))
def RNAbiotype_module_call(samples_dict, output_dict, gtf_file, Debug,
                           max_workers_int, threads_job):
    """
    Create RNAbiotype analysis for each sample and create summary plots

    One ``biotype_all`` job is submitted per sample to a thread pool.
    Exceptions raised by a job are printed and do not stop the others.
    Afterwards a pie plot is created for every sample whose
    ``<name>_RNAbiotype.tsv`` file exists and is non-empty.

    :param samples_dict: Dictionary containing sample IDs as keys and bam files as values
    :param output_dict: Dictionary containing sample IDs as keys and output folder as values
    :param gtf_file: Gene annotation file for the reference genome used.
    :param Debug: True/False for debugging messages
    :param max_workers_int: Maximum number of worker threads in the pool.
    :param threads_job: Number of threads handed to each ``biotype_all`` job.

    :returns: Empty tuple.
    """
    ## get bin
    featureCount_exe = set_config.get_exe('featureCounts')

    ## send one job per sample
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers_int) as pool:
        jobs = {}
        for sample, bam_files in samples_dict.items():
            job = pool.submit(biotype_all, featureCount_exe,
                              output_dict[sample], gtf_file, bam_files, sample,
                              threads_job, Debug)
            jobs[job] = sample

        ## collect results; report failures and keep going
        for finished in concurrent.futures.as_completed(jobs):
            sample_id = jobs[finished]
            try:
                finished.result()
            except Exception as exc:
                print('***ERROR:')
                print(finished)
                print('%r generated an exception: %s' % (sample_id, exc))

    ## plot results
    for name, folder in output_dict.items():
        stats_file = os.path.join(folder, name + '_RNAbiotype.tsv')
        if not files_functions.is_non_zero_file(stats_file):
            continue
        pie_plot_results(stats_file, name, folder, Debug)

    return ()
def run_module_fastqc(path, files, sample, threads):
    """
    Run fastqc for a sample unless a previous run already succeeded.

    A ``.success`` stamp file inside ``path`` marks a previous successful
    run; when present only an informative message is printed. Otherwise
    ``call_fastqc`` is called and the stamp is written when it returns a
    true value.

    :param path: Output folder for the sample (holds the stamp file).
    :param files: Files passed on to ``call_fastqc``.
    :param sample: Sample name.
    :param threads: Number of threads passed on to ``call_fastqc``.

    :returns: Empty tuple.
    """
    ## check if previously done and succeeded
    filename_stamp = path + '/.success'
    if os.path.isfile(filename_stamp):
        stamp = functions.time_functions.read_time_stamp(filename_stamp)
        ## the sample identifier is 'sample'; the former reference to an
        ## undefined 'name' raised NameError on this path
        print(
            colored(
                "\tA previous command generated results on: %s [%s -- %s]" %
                (stamp, sample, 'fastqc'), 'yellow'))
    else:
        ## call fastqc
        fastqc_bin = set_config.get_exe('fastqc')
        codeReturn = call_fastqc(path, files, sample, fastqc_bin, threads)
        if codeReturn:
            functions.time_functions.print_time_stamp(filename_stamp)

    return ()
def optimir(reads, outpath, file_name, num_threads, matureFasta, hairpinFasta,
            miRNA_gff, Debug):
    """
    Build and run the ``optimir process`` command for a single fastq file.

    Standard output goes to ``optimir.log`` and standard error to
    ``optimir.err``, both inside ``outpath``.

    :param reads: List of fastq files; the process exits if it holds more than one.
    :param outpath: Output folder given to optimir.
    :param file_name: Sample name (not used by this function).
    :param num_threads: Number of threads (not used by this function).
    :param matureFasta: Value passed to ``--maturesFasta``.
    :param hairpinFasta: Value passed to ``--hairpinsFasta``.
    :param miRNA_gff: Value passed to ``--gff3``.
    :param Debug: True/False for debugging messages.

    :returns: Whatever ``functions.system_call_functions.system_call`` returns.
    """
    optimir_exe = set_config.get_exe("optimir", Debug=Debug)

    logfile = os.path.join(outpath, 'optimir.log')
    errfile = os.path.join(outpath, 'optimir.err')

    ## only one input file is supported
    if len(reads) > 1:
        print(
            colored(
                "** ERROR: Only 1 fastq file is allowed please joined reads before...",
                'red'))
        exit()

    ## create command
    cmd = "%s process --fq %s --gff_out -o %s --maturesFasta %s --hairpinsFasta %s --gff3 %s > %s 2> %s" % (
        optimir_exe, reads[0], outpath, matureFasta, hairpinFasta, miRNA_gff,
        logfile, errfile)

    return functions.system_call_functions.system_call(cmd)
def miRTop(results_folder, sample_folder, name, threads, format, miRNA_gff,
           hairpinFasta, species, Debug):
    """
    Run the miRTop gff, counts and export steps for one sample.

    Each step writes a ``.success`` stamp in its own subfolder of
    ``sample_folder`` and is skipped when that stamp already exists.
    The miRTop stats step is not run.

    :param results_folder: Folder with the results of the isomiR caller.
    :param sample_folder: Output folder for this sample; ``<name>.log`` is written here.
    :param name: Sample name.
    :param threads: Number of threads (not used by this function).
    :param format: Caller that produced the results: ``sRNAbench``, ``optimir`` or ``seqbuster``.
    :param miRNA_gff: Value passed to ``--gtf``.
    :param hairpinFasta: Value passed to ``--hairpin``.
    :param species: Value passed to ``--sps``.
    :param Debug: True/False for debugging messages.

    :returns: Path to ``counts/mirtop.tsv`` on success; False when the input
        is empty or any step fails.
    """
    miRTop_exe = set_config.get_exe('miRTop', Debug=Debug)
    logfile = os.path.join(sample_folder, name + '.log')

    ## folders
    mirtop_folder_gff = os.path.join(sample_folder, 'gff')
    mirtop_folder_counts = os.path.join(sample_folder, 'counts')
    mirtop_folder_export = os.path.join(sample_folder, 'export')

    ## get info according to software
    if format == "sRNAbench":
        ## sRNAbench: the results folder itself is the miRTop input
        input_check = os.path.join(results_folder, "reads.annotation")
        software = 'sRNAbench'
    elif format == "optimir":
        ## optimir: the gff3 file is the miRTop input
        input_check = functions.main_functions.retrieve_matching_files(
            os.path.join(results_folder, "OptimiR_Results"), "gff3", Debug)[0]
        results_folder = input_check
        software = 'optimir'
    elif format == "seqbuster":
        ## miraligner: the .mirna file is the miRTop input
        input_check = functions.main_functions.retrieve_matching_files(
            results_folder, ".mirna", Debug)[0]
        results_folder = input_check
        software = 'miraligner'
    else:
        ## any other format: no input check, results_folder used as given
        input_check = None
        software = None

    ## check non zero
    if input_check is not None:
        if not functions.files_functions.is_non_zero_file(input_check):
            print(
                colored(
                    "\tNo isomiRs detected for sample [%s -- %s]" %
                    (name, software), 'yellow'))
            return False

    ## miRTop analysis gff: first step, starts a fresh log file
    cmd_gff = miRTop_exe + ' gff --sps %s --hairpin %s --gtf %s --format %s -o %s %s 2> %s' % (
        species, hairpinFasta, miRNA_gff, format, mirtop_folder_gff,
        results_folder, logfile)
    if not _mirtop_stamped_step(mirtop_folder_gff + '/.success', cmd_gff,
                                name, 'gff',
                                'Creating isomiRs gtf file for sample %s' %
                                name):
        return False

    mirtop_folder_gff_file = os.path.join(mirtop_folder_gff, 'mirtop.gff')

    ## miRTop counts: append to the log
    cmd_counts = miRTop_exe + ' counts -o %s --gff %s --hairpin %s --gtf %s --sps %s 2>> %s' % (
        mirtop_folder_counts, mirtop_folder_gff_file, hairpinFasta, miRNA_gff,
        species, logfile)
    if not _mirtop_stamped_step(mirtop_folder_counts + '/.success',
                                cmd_counts, name, 'counts',
                                'Creating isomiRs counts for sample %s' %
                                name):
        return False

    ## miRTop export: append (2>>) so the messages of the previous steps
    ## are kept; a plain '2>' would truncate the shared log file
    cmd_export = miRTop_exe + ' export -o %s --hairpin %s --gtf %s --sps %s --format isomir %s 2>> %s' % (
        mirtop_folder_export, hairpinFasta, miRNA_gff, species,
        mirtop_folder_gff_file, logfile)
    if not _mirtop_stamped_step(
            mirtop_folder_export + '/.success', cmd_export, name, 'export',
            'Creating isomiRs export information for sample %s' % name):
        return False

    ## return all success
    return os.path.join(mirtop_folder_counts, 'mirtop.tsv')


def _mirtop_stamped_step(stamp_file, cmd, name, step, start_message):
    """Run one miRTop step unless its stamp exists; return False only when the command fails."""
    if os.path.isfile(stamp_file):
        stamp = functions.time_functions.read_time_stamp(stamp_file)
        print(
            colored(
                "\tA previous command generated results on: %s [%s -- %s - %s]"
                % (stamp, name, 'miRTop', step), 'yellow'))
        return True

    print(start_message)
    if functions.system_call_functions.system_call(cmd):
        functions.time_functions.print_time_stamp(stamp_file)
        return True
    return False
def run_biotype(options):
    """
    Run the RNA biotype analysis module.

    Steps: optional help messages, retrieval of input files, read mapping
    (``mapReads_module``), RNA biotype analysis
    (``RNAbiotype.RNAbiotype_module_call``) and, unless
    ``options.skip_report`` is set, MultiQC and summary reports.

    :param options: Parsed command line options. Attributes read here
        include ``help_format``, ``help_project``, ``help_RNAbiotype``,
        ``debug``, ``single_end``, ``input``, ``detached``,
        ``output_folder``, ``noTrim``, ``threads``, ``annotation`` and
        ``skip_report``. ``options.pair`` and ``options.project`` are set
        by this function.

    :returns: Empty tuple.
    """
    ## init time
    start_time_total = time.time()

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_format):
        ## help_format option
        ## NOTE(review): unlike the other help branches there is no exit()
        ## here, so processing continues afterwards -- confirm this is intended
        help_XICRA.help_fastq_format()
    elif (options.help_project):
        ## information for project
        help_XICRA.project_help()
        exit()
    elif (options.help_RNAbiotype):
        ## information for RNA biotype analysis
        RNAbiotype.help_info()
        exit()

    ## debugging messages
    global Debug
    Debug = bool(options.debug)

    ### set as default paired_end mode
    options.pair = not options.single_end

    aesthetics_functions.pipeline_header('XICRA')
    aesthetics_functions.boxymcboxface("RNA biotype analysis")
    print("--------- Starting Process ---------")
    time_functions.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)

    ## set mode: project/detached
    if (options.detached):
        outdir = os.path.abspath(options.output_folder)
        options.project = False
    else:
        options.project = True
        outdir = input_dir

    ## get files
    print('+ Getting files from input folder... ')
    if options.noTrim:
        print('+ Mode: fastq.\n+ Extension: ')
        print("[ fastq, fq, fastq.gz, fq.gz ]\n")
        pd_samples_retrieved = sampleParser.files.get_files(
            options, input_dir, "fastq", ("fastq", "fq", "fastq.gz", "fq.gz"),
            options.debug)
    else:
        print('+ Mode: trim.\n+ Extension: ')
        print("[ _trim_ ]\n")
        pd_samples_retrieved = sampleParser.files.get_files(
            options, input_dir, "trim", ['_trim'], options.debug)

        ## Discard if joined reads: use trimmed single-end or paired-end
        pd_samples_retrieved = pd_samples_retrieved[
            pd_samples_retrieved['ext'] != '_joined']

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieve **", 'yellow'))
        print(pd_samples_retrieved)

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")
    if not options.project:
        files_functions.create_folder(outdir)

    ## for samples
    mapping_outdir_dict = files_functions.outdir_project(
        outdir, options.project, pd_samples_retrieved, "map", options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: mapping_outdir_dict **", 'yellow'))
        print(mapping_outdir_dict)

    # time stamp
    start_time_partial = time_functions.timestamp(start_time_total)

    ## optimize threads
    name_list = set(pd_samples_retrieved["new_name"].tolist())
    threads_job = main_functions.optimize_threads(
        options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (Debug):
        print(
            colored("**DEBUG: options.threads " + str(options.threads) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: max_workers " + str(max_workers_int) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: cpu_here " + str(threads_job) + " **", 'yellow'))

    ##############################################
    ## map Reads
    ##############################################
    start_time_partial = mapReads_module(options, pd_samples_retrieved,
                                         mapping_outdir_dict, options.debug,
                                         max_workers_int, threads_job,
                                         start_time_partial, outdir)

    ## NOTE(review): mapping_results is never assigned in this function;
    ## presumably it is a module-level dictionary (sample -> bam file)
    ## filled while mapping -- confirm, otherwise this raises NameError
    ## debug message
    if (Debug):
        print(colored("**DEBUG: mapping_results **", 'yellow'))
        print(mapping_results)

    # time stamp
    start_time_partial = time_functions.timestamp(start_time_partial)

    ## for samples
    biotype_outdir_dict = files_functions.outdir_project(
        outdir, options.project, pd_samples_retrieved, "biotype",
        options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: biotype_outdir_dict **", 'yellow'))
        print(biotype_outdir_dict)

    ## get RNAbiotype information
    RNAbiotype.RNAbiotype_module_call(mapping_results, biotype_outdir_dict,
                                      options.annotation, options.debug,
                                      max_workers_int, threads_job)

    # time stamp
    start_time_partial = time_functions.timestamp(start_time_partial)

    if (options.skip_report):
        print("+ No report generation...")
    else:
        print(
            "\n+ Generating a report using MultiQC module for featureCount analysis."
        )
        outdir_report = files_functions.create_subfolder("report", outdir)

        ## get subdirs generated and call multiQC report module
        print(
            "+ Detail information for each sample could be identified in separate folders:"
        )
        my_outdir_list = set(biotype_outdir_dict.values())

        ## debug message
        if (Debug):
            print(
                colored("\n**DEBUG: my_outdir_list for multiqc report **",
                        'yellow'))
            print(my_outdir_list)
            print("\n")

        featureCount_report = files_functions.create_subfolder(
            "featureCount", outdir_report)
        multiQC_report.multiQC_module_call(my_outdir_list, "featureCount",
                                           featureCount_report, "-dd 2")
        print(
            '\n+ A summary HTML report of each sample is generated in folder: %s'
            % featureCount_report)

        ### Summarizing RNA biotype information
        biotype_report = files_functions.create_subfolder(
            "biotype", outdir_report)
        single_files_biotype = files_functions.create_subfolder(
            "samples", biotype_report)

        ## results
        dict_files = {}
        for samples in biotype_outdir_dict:
            featurecount_file = os.path.join(biotype_outdir_dict[samples],
                                             'featureCount.out.tsv')
            if files_functions.is_non_zero_file(featurecount_file):
                dict_files[samples] = featurecount_file

            ## copy pdf; a sample without any pdf is skipped instead of
            ## aborting the whole report with an IndexError
            pdf_plot = main_functions.retrieve_matching_files(
                biotype_outdir_dict[samples], '.pdf', options.debug)
            if len(pdf_plot) > 0 and files_functions.is_non_zero_file(
                    pdf_plot[0]):
                shutil.copy(pdf_plot[0], single_files_biotype)

        ## collapse all information
        all_data = RNAbiotype.generate_matrix(dict_files)

        ## print into excel/csv
        print('+ Table contains: ', len(all_data), ' entries\n')

        ## debugging messages
        if Debug:
            print("** DEBUG: all_data")
            print(all_data)

        ## set abs_csv_outfile to be in report folder
        abs_csv_outfile = os.path.join(biotype_report, "summary.csv")
        all_data.to_csv(abs_csv_outfile)

        ## create plot: call R [TODO: implement in python]
        outfile_pdf = os.path.join(biotype_report, "RNAbiotypes_summary.pdf")

        ## R scripts
        biotype_R_script = tools.R_scripts('plot_RNAbiotype_sum',
                                           options.debug)
        rscript = set_config.get_exe("Rscript", options.debug)
        cmd_R_plot = "%s %s -f %s -o %s" % (rscript, biotype_R_script,
                                            abs_csv_outfile, outfile_pdf)

        print("+ Create summary plot for all samples")
        system_call_functions.system_call(cmd_R_plot)

    print("\n*************** Finish *******************")
    time_functions.timestamp(start_time_total)

    ## this is the biotype module; the former text named the join module
    print("\n+ Exiting biotype module.")
    return ()
def mapReads_module(options, pd_samples_retrieved, outdir_dict, Debug,
                    max_workers_int, threads_job, start_time_partial, outdir):
    """
    Map the reads of every sample with STAR and optionally build a MultiQC report.

    The genome is loaded once before the per-sample jobs are sent to a
    thread pool (one ``mapReads_caller`` job per sample) and removed again
    afterwards. Exceptions raised by a job are printed and do not stop the
    others.

    :param options: Parsed command line options. ``fasta``, ``genomeDir``,
        ``threads``, ``limitRAM`` and ``skip_report`` are read;
        ``fasta`` and ``genomeDir`` may be rewritten as absolute paths.
    :param pd_samples_retrieved: Dataframe of samples; columns ``new_name``
        and ``sample`` are used.
    :param outdir_dict: Dictionary with sample names as keys and output folders as values.
    :param Debug: True/False for debugging messages.
    :param max_workers_int: Maximum number of worker threads in the pool.
    :param threads_job: Number of threads handed to each mapping job.
    :param start_time_partial: Time reference passed to ``time_functions.timestamp``.
    :param outdir: Folder where the ``report`` subfolder is created.

    :returns: Updated ``start_time_partial``.
    """
    ## Group dataframe by sample name.
    ## A scalar key is used on purpose: grouping by the list ["new_name"]
    ## yields 1-tuples as group names in pandas >= 2.0, which would not
    ## match the keys of outdir_dict.
    sample_frame = pd_samples_retrieved.groupby("new_name")

    ## options
    STAR_exe = set_config.get_exe("STAR", Debug=Debug)
    cwd_folder = os.path.abspath("./")
    folder = files_functions.create_subfolder('STAR_files', cwd_folder)

    ## For many samples it will have to load genome index in memory every time.
    ## For a unique sample it will not matter. Take care genome might stay in memory.
    ## Use before loop option LoadAndExit and then:
    ## in loop
    ## Use option LoadAndKeep, set shared memory > 30 Gb
    ## when finished loop Remove memory

    ## check reference
    if (options.fasta):
        print("+ Genome fasta file provided")
        print("+ Create genomeDir for later usage...")
        options.fasta = os.path.abspath(options.fasta)

        ## create genomeDir
        options.genomeDir = mapReads.create_genomeDir(folder, STAR_exe,
                                                      options.threads,
                                                      options.fasta,
                                                      options.limitRAM)
    elif (options.genomeDir):
        print("+ genomeDir provided.")
        options.genomeDir = os.path.abspath(options.genomeDir)

    ## remove previous reference genome from memory
    print("+ Remove genome in memory from previous call... (if any)")
    mapReads.remove_Genome(STAR_exe, options.genomeDir, folder,
                           options.threads)

    ## load reference genome
    mapReads.load_Genome(folder, STAR_exe, options.genomeDir, options.threads)

    ## functions.time_functions.timestamp
    start_time_partial = time_functions.timestamp(start_time_partial)

    print("+ Mapping sequencing reads for each sample retrieved...")

    ## send for each sample
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers_int) as executor:
        commandsSent = {
            executor.submit(mapReads_caller,
                            sorted(cluster["sample"].tolist()),
                            outdir_dict[name], name, threads_job, STAR_exe,
                            options.genomeDir, options.limitRAM, Debug): name
            for name, cluster in sample_frame
        }

        for cmd2 in concurrent.futures.as_completed(commandsSent):
            details = commandsSent[cmd2]
            try:
                cmd2.result()
            except Exception as exc:
                print('***ERROR:')
                print(cmd2)
                print('%r generated an exception: %s' % (details, exc))

    print("\n\n+ Mapping reads has finished...")

    ## functions.time_functions.timestamp
    start_time_partial = time_functions.timestamp(start_time_partial)

    ## remove reference genome from memory
    mapReads.remove_Genome(STAR_exe, options.genomeDir, folder,
                           options.threads)

    ## functions.time_functions.timestamp
    start_time_partial = time_functions.timestamp(start_time_partial)

    if (options.skip_report):
        print("+ No report generation...")
    else:
        print("\n+ Generating a report using MultiQC module.")
        outdir_report = files_functions.create_subfolder("report", outdir)

        ## get subdirs generated and call multiQC report module
        print(
            "+ Detail information for each sample could be identified in separate folders:"
        )
        my_outdir_list = set(outdir_dict.values())

        ## debug message
        if (Debug):
            print(
                colored("\n**DEBUG: my_outdir_list for multiqc report **",
                        'yellow'))
            print(my_outdir_list)
            print("\n")

        map_report = files_functions.create_subfolder("STAR", outdir_report)
        multiQC_report.multiQC_module_call(my_outdir_list, "STAR", map_report,
                                           "-dd 2")
        print(
            '\n+ A summary HTML report of each sample is generated in folder: %s'
            % map_report)

    return (start_time_partial)