def _resolve_miRBase_file(provided, database, url, file_name, header_msg,
                          missing_msg, provided_msg, Debug):
    """Return the path of a miRBase reference file.

    When ``provided`` is set it is converted to an absolute path; otherwise
    the file is requested through ``functions.main_functions.urllib_request``
    using ``url`` and ``file_name`` and whatever that call returns is passed
    back to the caller.
    """
    if provided:
        print(provided_msg)
        return os.path.abspath(provided)

    print(header_msg)
    if Debug:
        print(colored(missing_msg, 'yellow'))
        print(colored("\t** Download it form miRBase", 'green'))

    return functions.main_functions.urllib_request(database, url, file_name,
                                                   Debug)


def run_miRNA(options):
    """
    Main function of the miRNA module.

    Steps performed:

    1. Show the requested help message (and exit), if any.
    2. Retrieve input files: raw fastq (``options.noTrim``), joined reads
       (paired-end data) or trimmed reads (single-end data).
    3. Resolve the miRBase reference files (gff3, hairpin, mature, str),
       requesting the ones the user did not provide.
    4. Submit one ``miRNA_analysis`` job per sample to a thread pool.
    5. Summarize all results with ``generate_DE.generate_DE``.

    Side effects: sets the module globals ``Debug`` and ``results_df`` and
    overwrites several attributes of ``options`` (``pair``, ``project``,
    ``database``, ``miRNA_gff``, ``hairpinFasta``, ``matureFasta``,
    ``miRBase_str``).

    :param options: Parsed command-line options for the module.
    :returns: An empty tuple.
    """
    ## init time
    start_time_total = time.time()

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_format):
        ## help_format option
        help_XICRA.help_fastq_format()
        ## stop here as the other help options (and run_prep) do; otherwise
        ## the whole analysis would start right after printing the help
        exit()
    elif (options.help_project):
        ## information for project
        help_XICRA.project_help()
        exit()
    elif (options.help_miRNA):
        ## information for miRNA analysis
        help_XICRA.help_miRNA()
        exit()

    ## debugging messages
    global Debug
    Debug = bool(options.debug)

    ### set as default paired_end mode
    options.pair = not options.single_end

    functions.aesthetics_functions.pipeline_header('XICRA')
    functions.aesthetics_functions.boxymcboxface("miRNA analysis")
    print("--------- Starting Process ---------")
    functions.time_functions.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)

    ## set mode: project/detached
    if (options.detached):
        outdir = os.path.abspath(options.output_folder)
        options.project = False
    else:
        options.project = True
        outdir = input_dir

    ## user software selection
    print("+ Software for miRNA analysis selected:")
    print(options.soft_name)

    ## get files
    print('+ Getting files from input folder... ')
    paired_end = options.pair
    if paired_end:
        ## set paired-end to false for further prepocessing
        options.pair = False

    if options.noTrim:
        print('+ Mode: fastq.\n+ Extension: ')
        print("[ fastq, fq, fastq.gz, fq.gz ]\n")
        pd_samples_retrieved = sampleParser.files.get_files(
            options, input_dir, "fastq",
            ("fastq", "fq", "fastq.gz", "fq.gz"), options.debug)
    elif paired_end:
        print('+ Mode: join.\n+ Extension: ')
        print("[_joined.fastq]\n")
        pd_samples_retrieved = sampleParser.files.get_files(
            options, input_dir, "join", ['_joined.fastq'], options.debug)
    else:
        ## single-end data: trimmed reads are retrieved, so report the mode
        ## actually used (the message used to say "join")
        print('+ Mode: trim.\n+ Extension: ')
        print("[ _trim_ ]\n")
        pd_samples_retrieved = sampleParser.files.get_files(
            options, input_dir, "trim", ['_trim'], options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieve **", 'yellow'))
        print(pd_samples_retrieved)

    ## Additional sRNAbench or miRTop options

    ## species
    print("+ Species provided:", options.species)

    ############################################################
    ## miRNA information: hairpin, mature, str, gff3
    ############################################################
    if not (options.database):
        ## default: folder db_files next to this source file
        install_path = os.path.dirname(os.path.realpath(__file__))
        options.database = os.path.join(install_path, "db_files")
    else:
        options.database = os.path.abspath(options.database)

    print("+ Create folder to store results: ", options.database)
    functions.files_functions.create_folder(options.database)

    mirbase_ftp = "ftp://mirbase.org/pub/mirbase/CURRENT/"

    ## miRNA_gff: can be set as automatic to download from miRBase
    gff_name = options.species + ".gff3" if not options.miRNA_gff else None
    options.miRNA_gff = _resolve_miRBase_file(
        options.miRNA_gff, options.database,
        mirbase_ftp + "genomes/" + str(gff_name), gff_name,
        "+ File miRNA gff3 annotation",
        "\t** ATTENTION: No miRNA gff file provided",
        "+ miRNA gff file provided", Debug)

    ## hairpin: can be set as automatic to download from miRBase
    options.hairpinFasta = _resolve_miRBase_file(
        options.hairpinFasta, options.database,
        mirbase_ftp + "hairpin.fa.gz", "hairpin.fa.gz",
        "+ File hairpin fasta",
        "\t** ATTENTION: No hairpin fasta file provided",
        "+ hairpin fasta file provided", Debug)

    ## mature: can be set as automatic to download from miRBase
    options.matureFasta = _resolve_miRBase_file(
        options.matureFasta, options.database,
        mirbase_ftp + "mature.fa.gz", "mature.fa.gz",
        "+ File mature fasta",
        "\t** ATTENTION: No mature miRNA fasta file provided",
        "+ mature fasta file provided", Debug)

    ## miRBase str: can be set as automatic to download from miRBase
    options.miRBase_str = _resolve_miRBase_file(
        options.miRBase_str, options.database,
        mirbase_ftp + "miRNA.str.gz", "miRNA.str.gz",
        "+ File miRBase str annotation",
        "\t** ATTENTION: No miRBase_str file provided",
        "+ miRBase_str file provided", Debug)

    ############################################################

    ## generate output folder, if necessary
    if not options.project:
        print("\n+ Create output folder(s):")
        functions.files_functions.create_folder(outdir)

    ## for samples
    outdir_dict = functions.files_functions.outdir_project(
        outdir, options.project, pd_samples_retrieved, "miRNA",
        options.debug)

    ## optimize threads
    name_list = set(pd_samples_retrieved["new_name"].tolist())
    threads_job = functions.main_functions.optimize_threads(
        options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: options.threads " + str(options.threads) +
                      " **", 'yellow'))
        print(colored("**DEBUG: max_workers " + str(max_workers_int) +
                      " **", 'yellow'))
        print(colored("**DEBUG: cpu_here " + str(threads_job) + " **",
                      'yellow'))

    print("+ Create a miRNA analysis for each sample retrieved...")

    ## call miRNA_analysis:
    ## Get user software selection: sRNAbench, optimir, ...
    ## Standarize using miRTop

    ## dictionary results
    global results_df
    results_df = pd.DataFrame(columns=("name", "soft", "filename"))

    # Group dataframe by sample name.
    # A scalar key is used on purpose: grouping by a one-element list makes
    # recent pandas versions yield 1-tuples as group names, which would not
    # match the keys of outdir_dict.
    sample_frame = pd_samples_retrieved.groupby("new_name")

    ## send for each sample
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers_int) as executor:
        commandsSent = {
            executor.submit(miRNA_analysis,
                            sorted(cluster["sample"].tolist()),
                            outdir_dict[name], name, threads_job,
                            options.miRNA_gff, options.soft_name,
                            options.matureFasta, options.hairpinFasta,
                            options.miRBase_str, options.species,
                            options.database, Debug): name
            for name, cluster in sample_frame
        }

        for cmd2 in concurrent.futures.as_completed(commandsSent):
            details = commandsSent[cmd2]
            try:
                cmd2.result()
            except Exception as exc:
                ## best effort: report the failing sample and keep going
                print('***ERROR:')
                print(cmd2)
                print('%r generated an exception: %s' % (details, exc))

    print("\n\n+ miRNA analysis is finished...")
    print("+ Let's summarize all results...")

    ## outdir
    outdir_report = functions.files_functions.create_subfolder(
        "report", outdir)
    expression_folder = functions.files_functions.create_subfolder(
        "miRNA", outdir_report)

    ## debugging messages
    if options.debug:
        print(results_df)

    ## merge all parse gtf files created
    print("+ Summarize miRNA analysis for all samples...")
    generate_DE.generate_DE(results_df, options.debug, expression_folder)

    print("\n*************** Finish *******************")
    functions.time_functions.timestamp(start_time_total)
    print("\n+ Exiting miRNA module.")
    return ()
def run_join(options):
    """
    Main function of the join module.

    Retrieves the fastq (``options.noTrim``) or trimmed files from the input
    folder and submits one ``fastqjoin_caller`` job per sample to a thread
    pool in order to join paired-end reads.

    Side effects: sets the module global ``Debug`` and overwrites
    ``options.pair``, ``options.project`` and, when unset,
    ``options.perc_diff`` (defaults to 0).

    :param options: Parsed command-line options for the module.
    :returns: An empty tuple.
    """
    ## init time
    start_time_total = time.time()

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_format):
        ## help_format option
        help_XICRA.help_fastq_format()
        ## stop here as the other help options (and run_prep) do; otherwise
        ## the joining process would start right after printing the help
        exit()
    elif (options.help_project):
        ## information for project
        help_XICRA.project_help()
        exit()
    elif (options.help_join_reads):
        ## information for join reads
        help_XICRA.help_join_reads()
        exit()

    ## debugging messages
    global Debug
    Debug = bool(options.debug)

    ### set as default paired_end mode
    options.pair = not options.single_end

    functions.aesthetics_functions.pipeline_header('XICRA')
    functions.aesthetics_functions.boxymcboxface("Join paired-end reads")
    print("--------- Starting Process ---------")
    functions.time_functions.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)

    ## set mode: project/detached
    if (options.detached):
        outdir = os.path.abspath(options.output_folder)
        options.project = False
    else:
        options.project = True
        outdir = input_dir

    ## Percentage difference for joining sequences
    if not options.perc_diff:
        options.perc_diff = 0

    print('+ Getting files from input folder... ')

    ## get files
    if options.noTrim:
        print('+ Mode: fastq.\n+ Extension: ')
        print("[ fastq, fq, fastq.gz, fq.gz ]\n")
        pd_samples_retrieved = sampleParser.files.get_files(
            options, input_dir, "fastq",
            ("fastq", "fq", "fastq.gz", "fq.gz"), options.debug)
    else:
        print('+ Mode: trim.\n+ Extension: ')
        print("[ _trim_ ]\n")
        pd_samples_retrieved = sampleParser.files.get_files(
            options, input_dir, "trim", ['_trim_'], options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieve **", 'yellow'))
        print(pd_samples_retrieved)

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")
    if not options.project:
        functions.files_functions.create_folder(outdir)

    ## for samples
    outdir_dict = functions.files_functions.outdir_project(
        outdir, options.project, pd_samples_retrieved, "join", options.debug)

    ## optimize threads
    name_list = set(pd_samples_retrieved["new_name"].tolist())
    threads_job = functions.main_functions.optimize_threads(
        options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: options.threads " + str(options.threads) +
                      " **", 'yellow'))
        print(colored("**DEBUG: max_workers " + str(max_workers_int) +
                      " **", 'yellow'))
        print(colored("**DEBUG: cpu_here " + str(threads_job) + " **",
                      'yellow'))

    print("+ Joining paired-end sequencing reads for each sample retrieved...")

    # Group dataframe by sample name.
    # A scalar key is used on purpose: grouping by a one-element list makes
    # recent pandas versions yield 1-tuples as group names, which would not
    # match the keys of outdir_dict.
    sample_frame = pd_samples_retrieved.groupby("new_name")

    ## send for each sample
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers_int) as executor:
        commandsSent = {
            executor.submit(fastqjoin_caller,
                            sorted(cluster["sample"].tolist()),
                            outdir_dict[name], name, threads_job,
                            options.perc_diff, Debug): name
            for name, cluster in sample_frame
        }

        for cmd2 in concurrent.futures.as_completed(commandsSent):
            details = commandsSent[cmd2]
            try:
                cmd2.result()
            except Exception as exc:
                ## best effort: report the failing sample and keep going
                print('***ERROR:')
                print(cmd2)
                print('%r generated an exception: %s' % (details, exc))

    print("\n\n+ Joining reads has finished...")

    ## TODO: create statistics on joined reads
    ##

    print("\n*************** Finish *******************")
    functions.time_functions.timestamp(start_time_total)
    print("\n+ Exiting join module.")
    return ()
def run_biotype(options):
    """
    Main function of the RNA biotype module.

    Steps performed:

    1. Show the requested help message (and exit), if any.
    2. Retrieve fastq (``options.noTrim``) or trimmed files; in trimmed mode
       entries whose ``ext`` is ``_joined`` are discarded.
    3. Map reads through ``mapReads_module``.
    4. Call ``RNAbiotype.RNAbiotype_module_call`` on the mapping results.
    5. Unless ``options.skip_report`` is set: build a MultiQC report for
       featureCount, collect the per-sample ``featureCount.out.tsv`` files
       into ``summary.csv``, copy per-sample PDF plots and call an R script
       to create ``RNAbiotypes_summary.pdf``.

    Side effects: sets the module global ``Debug`` and overwrites
    ``options.pair`` and ``options.project``.

    :param options: Parsed command-line options for the module.
    :returns: An empty tuple.
    """
    ## init time
    start_time_total = time.time()

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_format):
        ## help_format option
        help_XICRA.help_fastq_format()
        ## stop here as the other help options (and run_prep) do; otherwise
        ## the whole analysis would start right after printing the help
        exit()
    elif (options.help_project):
        ## information for project
        help_XICRA.project_help()
        exit()
    elif (options.help_RNAbiotype):
        ## information for RNA biotype analysis
        RNAbiotype.help_info()
        exit()

    ## debugging messages
    global Debug
    Debug = bool(options.debug)

    ### set as default paired_end mode
    options.pair = not options.single_end

    aesthetics_functions.pipeline_header('XICRA')
    aesthetics_functions.boxymcboxface("RNA biotype analysis")
    print("--------- Starting Process ---------")
    time_functions.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)

    ## set mode: project/detached
    if (options.detached):
        outdir = os.path.abspath(options.output_folder)
        options.project = False
    else:
        options.project = True
        outdir = input_dir

    ## get files
    print('+ Getting files from input folder... ')

    ## get files
    if options.noTrim:
        print('+ Mode: fastq.\n+ Extension: ')
        print("[ fastq, fq, fastq.gz, fq.gz ]\n")
        pd_samples_retrieved = sampleParser.files.get_files(
            options, input_dir, "fastq",
            ("fastq", "fq", "fastq.gz", "fq.gz"), options.debug)
    else:
        print('+ Mode: trim.\n+ Extension: ')
        print("[ _trim_ ]\n")
        pd_samples_retrieved = sampleParser.files.get_files(
            options, input_dir, "trim", ['_trim'], options.debug)

        ## Discard if joined reads: use trimmed single-end or paired-end
        pd_samples_retrieved = pd_samples_retrieved[
            pd_samples_retrieved['ext'] != '_joined']

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieve **", 'yellow'))
        print(pd_samples_retrieved)

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")
    if not options.project:
        files_functions.create_folder(outdir)

    ## for samples
    mapping_outdir_dict = files_functions.outdir_project(
        outdir, options.project, pd_samples_retrieved, "map", options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: mapping_outdir_dict **", 'yellow'))
        print(mapping_outdir_dict)

    # time stamp
    start_time_partial = time_functions.timestamp(start_time_total)

    ## optimize threads
    name_list = set(pd_samples_retrieved["new_name"].tolist())
    threads_job = main_functions.optimize_threads(
        options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: options.threads " + str(options.threads) +
                      " **", 'yellow'))
        print(colored("**DEBUG: max_workers " + str(max_workers_int) +
                      " **", 'yellow'))
        print(colored("**DEBUG: cpu_here " + str(threads_job) + " **",
                      'yellow'))

    ##############################################
    ## map Reads
    ##############################################
    start_time_partial = mapReads_module(options, pd_samples_retrieved,
                                         mapping_outdir_dict, options.debug,
                                         max_workers_int, threads_job,
                                         start_time_partial, outdir)

    ## NOTE(review): mapping_results is never assigned in this function;
    ## presumably it is a module-level global populated by mapReads_module.
    ## Confirm, otherwise the lines below raise NameError.

    ## debug message
    if (Debug):
        print(colored("**DEBUG: mapping_results **", 'yellow'))
        print(mapping_results)

    # time stamp
    start_time_partial = time_functions.timestamp(start_time_partial)

    ## for samples
    biotype_outdir_dict = files_functions.outdir_project(
        outdir, options.project, pd_samples_retrieved, "biotype",
        options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: biotype_outdir_dict **", 'yellow'))
        print(biotype_outdir_dict)

    ## get RNAbiotype information
    RNAbiotype.RNAbiotype_module_call(mapping_results, biotype_outdir_dict,
                                      options.annotation, options.debug,
                                      max_workers_int, threads_job)

    # time stamp
    start_time_partial = time_functions.timestamp(start_time_partial)

    if (options.skip_report):
        print("+ No report generation...")
    else:
        print(
            "\n+ Generating a report using MultiQC module for featureCount analysis."
        )
        outdir_report = files_functions.create_subfolder("report", outdir)

        ## get subdirs generated and call multiQC report module
        print(
            "+ Detail information for each sample could be identified in separate folders:"
        )

        ## call multiQC report module
        my_outdir_list = set(biotype_outdir_dict.values())

        ## debug message
        if (Debug):
            print(colored("\n**DEBUG: my_outdir_list for multiqc report **",
                          'yellow'))
            print(my_outdir_list)
            print("\n")

        featureCount_report = files_functions.create_subfolder(
            "featureCount", outdir_report)
        multiQC_report.multiQC_module_call(my_outdir_list, "featureCount",
                                           featureCount_report, "-dd 2")
        print(
            '\n+ A summary HTML report of each sample is generated in folder: %s'
            % featureCount_report)

        ### Summarizing RNA biotype information
        biotype_report = files_functions.create_subfolder(
            "biotype", outdir_report)
        single_files_biotype = files_functions.create_subfolder(
            "samples", biotype_report)

        ## results
        dict_files = {}
        for samples in biotype_outdir_dict:
            featurecount_file = os.path.join(biotype_outdir_dict[samples],
                                             'featureCount.out.tsv')
            if files_functions.is_non_zero_file(featurecount_file):
                dict_files[samples] = featurecount_file

            ## copy pdf
            pdf_plot = main_functions.retrieve_matching_files(
                biotype_outdir_dict[samples], '.pdf', options.debug)
            ## a sample may have produced no plot: skip it instead of
            ## failing on pdf_plot[0]
            if pdf_plot and files_functions.is_non_zero_file(pdf_plot[0]):
                shutil.copy(pdf_plot[0], single_files_biotype)

        ## collapse all information
        all_data = RNAbiotype.generate_matrix(dict_files)

        ## print into excel/csv
        print('+ Table contains: ', len(all_data), ' entries\n')

        ## debugging messages
        if Debug:
            print("** DEBUG: all_data")
            print(all_data)

        ## set abs_csv_outfile to be in report folder
        ## copy or link files for each sample analyzed
        abs_csv_outfile = os.path.join(biotype_report, "summary.csv")
        all_data.to_csv(abs_csv_outfile)

        ## create plot: call R [TODO: implement in python]
        outfile_pdf = os.path.join(biotype_report, "RNAbiotypes_summary.pdf")

        ## R scripts
        biotype_R_script = tools.R_scripts('plot_RNAbiotype_sum',
                                           options.debug)
        rscript = set_config.get_exe("Rscript", options.debug)
        cmd_R_plot = "%s %s -f %s -o %s" % (rscript, biotype_R_script,
                                            abs_csv_outfile, outfile_pdf)

        ##
        print("+ Create summary plot for all samples")
        system_call_functions.system_call(cmd_R_plot)

    print("\n*************** Finish *******************")
    start_time_partial = time_functions.timestamp(start_time_total)
    ## message used to say "join module" (copy-paste from run_join)
    print("\n+ Exiting biotype module.")
    return ()
def run_prep(options):
    """
    Main function of the prep module.

    This module prepares fastq files for later usage. It initially checks
    the length of the sample names and asks the user to rename samples if
    the 25 character limit is exceeded. Files can be renamed
    (``options.rename``: comma separated file, original name in the first
    column and new name in the second one), merged
    (``options.merge_Reads``), copied (``options.copy_reads``) or only
    linked using symbolic links to avoid duplicating raw data.

    Side effects: overwrites ``options.pair``, ``options.project``,
    ``options.include_lane``, ``options.include_all`` and ``options.rename``
    (made absolute); writes ``*_prep_renameDetails.txt`` and
    ``*_prep_copyDetails.txt`` detail files when renaming / copying. The
    process exits after merging or whenever a fatal problem is reported.

    :param options: Parsed command-line options for the module.
    :returns: An empty tuple.
    """
    ## help_format option
    if (options.help_format):
        help_XICRA.help_fastq_format()
        exit()

    functions.aesthetics_functions.pipeline_header('XICRA')
    functions.aesthetics_functions.boxymcboxface("Preparing samples")
    print("--------- Starting Process ---------")
    functions.time_functions.print_time()

    ## init time
    start_time_total = time.time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)
    outdir = os.path.abspath(options.output_folder)

    ### set as default paired_end mode
    options.pair = not options.single_end

    ## output folder
    print("\n+ Create output folder(s):")
    functions.files_functions.create_folder(outdir)

    ## project
    options.project = not options.detached

    ## default options
    if not options.include_lane:
        options.include_lane = False
    if not options.include_all:
        options.include_all = False

    ### info
    if (options.project):
        print(
            "+ Generate a directory containing information within the project folder provided"
        )
        final_dir = functions.files_functions.create_subfolder("info", outdir)
    else:
        final_dir = outdir

    ## get files
    print()
    functions.aesthetics_functions.print_sepLine("-", 50, False)
    print('+ Getting files from input folder... ')
    print('+ Mode: fastq.\n+ Extension: ')
    print("[ fastq, fq, fastq.gz, fq.gz ]\n")
    pd_samples_retrieved = sampleParser.files.get_files(
        options, input_dir, "fastq", ("fastq", "fq", "fastq.gz", "fq.gz"),
        options.debug)

    ## Information returned in pd_samples_retrieved
    ### sample, dirname, name, name_len, lane, read_pair, lane_file, ext, gz

    if options.debug:
        print(colored("** DEBUG: pd_samples_retrieved", 'yellow'))
        print(pd_samples_retrieved)
        #functions.print_all_pandaDF(pd_samples_retrieved)

    ## time stamp
    functions.time_functions.timestamp(start_time_total)

    ## check character limitation
    list_lengths = pd_samples_retrieved.loc[:, 'name_len'].to_list()
    if any(i > 25 for i in list_lengths):
        print(
            colored(
                "\t ** Name lengths exceeds the 25 character limitation...",
                'yellow'))
        if not (options.rename):
            print(
                colored("** ERROR: Rename files or provide --rename option...",
                        'red'))
            exit()

    ### rename files
    if (options.rename):
        options.rename = os.path.abspath(options.rename)
        if not functions.files_functions.is_non_zero_file(options.rename):
            print(
                colored(
                    "** ERROR: File provided with rename information is not readable.",
                    'red'))
            print(options.rename)
            exit()

        ## read csv to dictionary {original name: new name}
        ## (read_csv no longer accepts squeeze=True in recent pandas
        ## versions; squeezing the single data column afterwards gives the
        ## same Series)
        names_retrieved = pd.read_csv(
            options.rename, sep=',', index_col=0,
            header=None).squeeze("columns").to_dict()

        if (options.debug):
            print(colored('** DEBUG: names_retrieved', 'yellow'))
            print(names_retrieved)

        ## TODO: check integrity of new names and special characters

        ## print to a file
        timestamp = functions.time_functions.create_human_timestamp()
        rename_details = os.path.join(
            final_dir, timestamp + '_prep_renameDetails.txt')

        ## rename files; the context manager closes the details file even if
        ## a sample name is missing from the rename table
        with open(rename_details, 'w') as rename_details_hd:
            for index, row in pd_samples_retrieved.iterrows():
                new_name = names_retrieved[row['name']]

                if (row['gz']):
                    extension_string = row['ext'] + row['gz']
                else:
                    extension_string = row['ext']

                if options.single_end:
                    renamed = new_name + '.' + extension_string
                else:
                    renamed = (new_name + '_' + row['read_pair'] + '.' +
                               extension_string)

                ## modify frame
                pd_samples_retrieved.loc[index, 'new_file'] = renamed
                pd_samples_retrieved.loc[index, 'new_name'] = new_name

                ## save in file
                rename_details_hd.write(row['sample'] + '\t' + renamed + '\n')

                if (options.debug):
                    print(colored('** DEBUG: rename', 'yellow'))
                    print("Original: ", row['name'])
                    print("Renamed: ", new_name)
                    print("File:", renamed)

        ##elif (options.single_end): It should work for both
        print("+ Sample files have been renamed...")
    else:
        pd_samples_retrieved['new_file'] = pd_samples_retrieved['file']

    ## create outdir for each sample
    outdir_dict = functions.files_functions.outdir_project(
        outdir, options.project, pd_samples_retrieved, "raw", options.debug)

    ## merge option
    if (options.merge_Reads):
        print("+ Sample files will be merged...")
        ## TODO: check when rename option provided
        sampleParser.merge.one_file_per_sample(
            pd_samples_retrieved, outdir_dict, options.threads, final_dir,
            options.debug)

        if (options.rename):
            print("+ Merge files have been renamed...")
        else:
            print("+ Sample files have been merged...")

        ## process is finished here
        print("\n*************** Finish *******************")
        functions.time_functions.timestamp(start_time_total)

        print("+ Exiting prep module.")
        exit()

    ## debugging messages
    if (options.debug):
        print(colored("** DEBUG: pd_samples_retrieved", 'yellow'))
        #functions.print_all_pandaDF(pd_samples_retrieved)
        print(pd_samples_retrieved)
        print(colored("** DEBUG: outdir_dict", 'yellow'))
        print(outdir_dict)

    ## copy or create symbolic link for files
    if (options.copy_reads):
        print("+ Sample files will be copied...")
        ## print to a file
        timestamp = functions.time_functions.create_human_timestamp()
        copy_details = os.path.join(final_dir,
                                    timestamp + '_prep_copyDetails.txt')

        ## the context manager closes the details file even if a copy fails
        with open(copy_details, 'w') as copy_details_hd:
            for index, row in pd_samples_retrieved.iterrows():
                ## TODO: debug & set threads to copy faster
                destination = os.path.join(outdir_dict[row['new_name']],
                                           row['new_file'])
                shutil.copy(row['sample'], destination)
                copy_details_hd.write(row['sample'] + '\t' + destination +
                                      '\n')

        print("+ Sample files have been copied...")
    else:
        print("+ Sample files will be linked...")
        list_reads = []
        for index, row in pd_samples_retrieved.iterrows():
            list_reads.append(row['new_file'])
            if options.project:
                functions.files_functions.get_symbolic_link_file(
                    row['sample'],
                    os.path.join(outdir_dict[row['new_name']],
                                 row['new_file']))

        if not options.project:
            ## NOTE(review): list_reads holds the 'new_file' values, not the
            ## 'sample' paths - confirm this is what get_symbolic_link expects
            functions.files_functions.get_symbolic_link(list_reads, outdir)

    print("\n*************** Finish *******************")
    functions.time_functions.timestamp(start_time_total)

    print("+ Exiting prep module.")
    return ()
def run_trimm(options):
    """
    Main function of the trimm module.

    Retrieves fastq files from the input folder, submits one
    ``cutadapt_caller`` job per sample to a thread pool using the adapters
    provided (``options.adapters_a``, ``options.adapters_A``) and/or the
    extra options (``options.extra``), creates symbolic links for the
    trimmed files in detached mode and, unless ``options.skip_report`` is
    set, generates a MultiQC report.

    Side effects: sets the module global ``Debug`` and overwrites
    ``options.pair`` and ``options.project``. The function always finishes
    by calling ``exit()``; it never returns to the caller.

    :param options: Parsed command-line options for the module.
    """
    ## init time
    start_time_total = time.time()

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_format):
        ## help_format option
        help_XICRA.help_fastq_format()
        ## stop here as the other help options (and run_prep) do; otherwise
        ## the trimming process would start right after printing the help
        exit()
    elif (options.help_trimm_adapters):
        ## help on trimm adapters
        help_XICRA.print_help_adapters()
        exit()
    elif (options.help_project):
        ## information for project
        help_XICRA.project_help()
        exit()
    elif (options.help_multiqc):
        ## information for Multiqc
        help_XICRA.multiqc_help()
        exit()

    ## debugging messages
    global Debug
    Debug = bool(options.debug)

    ### set as default paired_end mode
    options.pair = not options.single_end

    functions.aesthetics_functions.pipeline_header('XICRA')
    functions.aesthetics_functions.boxymcboxface("Trimming samples")
    print("--------- Starting Process ---------")
    functions.time_functions.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)

    ## set mode: project/detached
    if (options.detached):
        outdir = os.path.abspath(options.output_folder)
        options.project = False
    else:
        options.project = True
        outdir = input_dir

    # Trimming adapters

    ## check adapters provided
    ## options.adapters_a
    ## options.adapters_A
    ## options.extra

    ## no adapters provided
    if not (options.adapters_a or options.adapters_A or options.extra):
        print(
            colored("** ERROR: No adapter trimming options provided...",
                    'red'))
        print("Please provide any option")
        exit()

    ## create dictionary with adapters; each entry is guarded by its own
    ## option (adapter_A used to be guarded by adapters_a, so it was dropped
    ## when only -A was given and stored as an empty value when only -a was)
    adapters_dict = {}
    if (options.adapters_a):
        adapters_dict['adapter_a'] = options.adapters_a
    if (options.adapters_A):
        adapters_dict['adapter_A'] = options.adapters_A

    ## get files
    print('+ Getting files from input folder... ')
    print('+ Mode: fastq.\n+ Extension: ')
    print("[ fastq, fq, fastq.gz, fq.gz ]\n")
    pd_samples_retrieved = sampleParser.files.get_files(
        options, input_dir, "fastq", ("fastq", "fq", "fastq.gz", "fq.gz"),
        options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieve **", 'yellow'))
        print(pd_samples_retrieved)

        print(colored("**DEBUG: adapters_dict **", 'yellow'))
        print(adapters_dict)

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")
    if not options.project:
        functions.files_functions.create_folder(outdir)

    ## for samples
    outdir_dict = functions.files_functions.outdir_project(
        outdir, options.project, pd_samples_retrieved, "trimm", options.debug)

    ## optimize threads
    name_list = set(pd_samples_retrieved["new_name"].tolist())
    threads_job = functions.main_functions.optimize_threads(
        options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: options.threads " + str(options.threads) +
                      " **", 'yellow'))
        print(colored("**DEBUG: max_workers " + str(max_workers_int) +
                      " **", 'yellow'))
        print(colored("**DEBUG: cpu_here " + str(threads_job) + " **",
                      'yellow'))

    print("+ Trimming adapters for each sample retrieved...")

    # Group dataframe by sample name.
    # A scalar key is used on purpose: grouping by a one-element list makes
    # recent pandas versions yield 1-tuples as group names, which would not
    # match the keys of outdir_dict.
    sample_frame = pd_samples_retrieved.groupby("new_name")

    ## send for each sample
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers_int) as executor:
        commandsSent = {
            executor.submit(cutadapt_caller,
                            sorted(cluster["sample"].tolist()),
                            outdir_dict[name], name, threads_job, Debug,
                            adapters_dict, options.extra): name
            for name, cluster in sample_frame
        }

        for cmd2 in concurrent.futures.as_completed(commandsSent):
            details = commandsSent[cmd2]
            try:
                cmd2.result()
            except Exception as exc:
                ## best effort: report the failing sample and keep going
                print('***ERROR:')
                print(cmd2)
                print('%r generated an exception: %s' % (details, exc))

    print("\n\n+ Trimming samples has finished...")

    ## functions.time_functions.timestamp
    functions.time_functions.timestamp(start_time_total)

    ## get files generated and generate symbolic link
    if not options.project:
        dir_symlinks = functions.files_functions.create_subfolder(
            'link_files', outdir)
        files2symbolic = []
        folders = os.listdir(outdir)

        ## debug message
        if (Debug):
            print(
                colored(
                    "**DEBUG: generate symbolic links for each file in " +
                    dir_symlinks + "**", 'yellow'))

        ## only paired-end. Todo: single end
        trimmed_pattern = re.compile(r".*trim_R\d{1}.*")

        for fold in folders:
            if fold.endswith(".log"):
                continue

            this_folder = outdir + '/' + fold

            ## only sample folders are scanned: plain files would make
            ## os.listdir fail and the folder holding the links itself
            ## must not be linked again on a re-run
            if fold == 'link_files' or not os.path.isdir(this_folder):
                continue

            for files in os.listdir(this_folder):
                if trimmed_pattern.search(files):
                    files2symbolic.append(this_folder + '/' + files)

        functions.files_functions.get_symbolic_link(files2symbolic,
                                                    dir_symlinks)

    if (options.skip_report):
        print("+ No report generation...")
    else:
        print("\n+ Generating a report using MultiQC module.")
        outdir_report = functions.files_functions.create_subfolder(
            "report", outdir)

        ## call multiQC report module
        my_outdir_list = set(outdir_dict.values())

        ## debug message
        if (Debug):
            print(colored("\n**DEBUG: my_outdir_list for multiqc report **",
                          'yellow'))
            print(my_outdir_list)
            print("\n")

        trimm_report = functions.files_functions.create_subfolder(
            "trimm", outdir_report)
        multiQC_report.multiQC_module_call(my_outdir_list, "Cutadapt",
                                           trimm_report, "")
        print(
            '\n+ A summary HTML report of each sample is generated in folder: %s'
            % trimm_report)

    print("\n*************** Finish *******************")
    functions.time_functions.timestamp(start_time_total)
    print("\n+ Exiting trimm module.")
    exit()