def check_db_indexed(folder, option):
    """Check if an ARIBA_ database folder is indexed.

    The database is reported as indexed only when ``folder`` contains both
    the '00.info.txt' file and the '.success' time stamp file.

    :param folder: Absolute path to database folder (with or without a
        trailing '/').
    :param option: Whether to print an error message when the database is
        not indexed. Only the exact value 'YES' enables the message.

    :type folder: string
    :type option: string

    :returns: True if the database is indexed, False otherwise.
    :rtype: bool

    .. seealso:: This function depends on other BacterialTyper functions called:

        - :func:`HCGB.functions.time_functions.read_time_stamp`

    .. include:: ../../links.inc
    """
    ## database name = last component of the path; normpath drops a
    ## trailing '/' so both 'db/' and 'db' give the same name
    db_name = os.path.basename(os.path.normpath(folder))
    info_file = os.path.join(folder, '00.info.txt')
    stamp_file = os.path.join(folder, '.success')

    if os.path.isfile(info_file) and os.path.isfile(stamp_file):
        stamp = HCGB_time.read_time_stamp(stamp_file)
        print(colored("\tA previous command generated results on: %s [%s]" % (stamp, db_name), 'yellow'))
        return True

    ## not indexed: either the info file or the success stamp is missing
    if option == 'YES':
        print(colored("\t- ARIBA database: " + db_name + " [ ERROR ]", 'red'))
    return False
def annot_caller(seq_file, sample_folder, options, name, threads):
    """Annotate a sample unless a previous annotation already succeeded.

    The time stamp file ``sample_folder + '/.success'`` marks a finished
    run: when it exists its date is reported and nothing else happens.
    Otherwise ``annotation.module_call`` is invoked. Developer messages are
    printed when the ``Debug`` name visible to this module is true.

    :param seq_file: Sequence file handed to ``annotation.module_call``.
    :param sample_folder: Folder holding the results and the time stamp.
    :param options: Object providing ``kingdom`` and ``genera`` attributes.
    :param name: Sample name.
    :param threads: Number of CPUs to use.

    :returns: None.
    """
    success_file = sample_folder + '/.success'

    ## previous successful run: report it and leave
    if os.path.isfile(success_file):
        when = HCGB_time.read_time_stamp(success_file)
        message = "\tA previous command generated results on: %s [%s]" % (when, name)
        print(colored(message, 'yellow'))
        return

    ## debug message
    if Debug:
        print(colored("**DEBUG: annotation.module_call call**", 'yellow'))
        print(" annotation.module_call (seq_file, options.kingdom, options.genera, sample_folder, name, threads)")
        call_values = [seq_file, options.kingdom, options.genera,
                       sample_folder, name, str(threads)]
        print(" annotation.module_call " + "\t".join(call_values))

    ## call annotation
    annotation.module_call(seq_file, options.kingdom, options.genera,
                           sample_folder, name, threads)
def get_database(db_frame, Debug):
    """Gather the 'info.txt' table of every database entry in one dataframe.

    For each row of ``db_frame`` the file ``<path>/info.txt`` is read (if it
    exists) with ``HCGB_main.get_data`` and all tables are concatenated.
    When ``<path>/.success`` exists its time stamp is reported.

    :param db_frame: Dataframe with, at least, columns 'path' and 'db'.
    :param Debug: Not used by this function.

    :type db_frame: pandas.DataFrame

    :returns: Concatenated information indexed by column 'ID', or an empty
        dataframe when no 'info.txt' file was found.
    :rtype: pandas.DataFrame
    """
    ## tables are collected in a list and concatenated once:
    ## DataFrame.append was removed in pandas 2.0
    collected = []
    for _, row in db_frame.iterrows():
        entry_path = row['path']

        ## information
        this_file = entry_path + '/info.txt'
        if not os.path.isfile(this_file):
            continue

        print('+ Reading information for sample: ', row['db'])
        print(colored("\t+ Obtaining information from file: %s" % this_file, 'yellow'))
        collected.append(HCGB_main.get_data(this_file, ',', 'index_col=0'))

        timestamp = entry_path + '/.success'
        if os.path.isfile(timestamp):
            stamp = HCGB_time.read_time_stamp(timestamp)
            print(colored("\t+ Data generated on: %s" % stamp, 'yellow'))

        HCGB_aes.print_sepLine("*", 25, False)

    data4db = pd.concat(collected) if collected else pd.DataFrame()

    ## index by ID
    if not data4db.empty:
        data4db = data4db.set_index('ID')

    return data4db
def ariba_run_caller(db2use, db_name, list_files, folder_out, threads, cutoff):
    """Run ARIBA for one sample and database unless it already succeeded.

    The time stamp file ``folder_out/.success_<db_name>`` marks a finished
    run. When it is missing, any existing ``folder_out`` is removed (it is
    left over from a failed run) and ``ariba_caller.ariba_run`` is called.
    The stamp is written only when the call does not return 'FAIL', so a
    failed sample is analysed again on the next execution.

    :param db2use: Database handed to ``ariba_caller.ariba_run``.
    :param db_name: Database name, used as suffix of the time stamp file.
    :param list_files: Input files of the sample.
    :param folder_out: Output folder for the ARIBA results.
    :param threads: Number of CPUs to use.
    :param cutoff: Cutoff handed to ``ariba_caller.ariba_run``.

    :returns: None.
    """
    ## check if already is done
    filename_stamp = os.path.join(folder_out, '.success_' + db_name)
    if os.path.isfile(filename_stamp):
        stamp = HCGB_time.read_time_stamp(filename_stamp)
        files_names = [os.path.basename(s) for s in list_files]
        print(colored("\tA previous command generated results on: %s [Files: %s]" % (stamp, files_names), 'yellow'))
        return

    ## delete folder if exists but failed before
    if os.path.exists(folder_out):
        shutil.rmtree(folder_out)

    ## call
    code = ariba_caller.ariba_run(db2use, list_files, folder_out, threads, cutoff)
    if code == 'FAIL':
        print("*** ERROR: System call failed for ", folder_out)
        ## no success stamp for a failed run
        return

    ## print success timestamp
    HCGB_time.print_time_stamp(filename_stamp)
def module_call(sequence_fasta, kingdom, genus, path, name, threads):
    """Check for a previous annotation and, if none, generate it.

    - It uses Prokka_ via :func:`BacterialTyper.scripts.annotation.prokka_call`.
    - It skips the call if ``path + '/.success'`` already exists.
    - Once finished, it prints a time stamp into that file.

    :param sequence_fasta: Assembled sequences in fasta file format.
    :param kingdom: Available kingdoms mode for Prokka software: Archaea|Bacteria|Mitochondria|Viruses
    :param genus: Available genus options for Prokka software.
    :param path: Absolute path to the output folder to include results.
    :param name: Sample name and tag to include in the annotation report and files.
    :param threads: Number of CPUs to use.

    :type sequence_fasta: string
    :type kingdom: string
    :type genus: string
    :type path: string
    :type name: string
    :type threads: integer

    :returns: An empty tuple when a previous run is found; otherwise the
        value returned by ``prokka_call``.

    .. seealso:: This function depends on other BacterialTyper functions called:

        - :func:`BacterialTyper.scripts.set_config.get_exe`

        - :func:`HCGB.functions.time_functions.read_time_stamp`

        - :func:`HCGB.functions.time_functions.print_time_stamp`

        - :func:`BacterialTyper.scripts.annotation.prokka_call`

    .. include:: ../../links.inc
    """
    success_file = path + '/.success'

    ## check if previously annotated and succeeded
    already_done = os.path.isdir(path) and os.path.isfile(success_file)
    if already_done:
        when = HCGB_time.read_time_stamp(success_file)
        message = "\tA previous command generated results on: %s [%s]" % (when, name)
        print(colored(message, 'yellow'))
        return ()

    ## call prokka
    prokka_exe = set_config.get_exe('prokka')
    annotation_dir = prokka_call(prokka_exe, sequence_fasta, kingdom, genus,
                                 path, name, threads)

    ## success stamp
    HCGB_time.print_time_stamp(success_file)
    return annotation_dir
def SPADES_systemCall(sample_folder, file1, file2, name, SPADES_bin, options, threads, debug=False):
    """Generate SPADES system call.

    It calls system for SPADES and generates time stamp file in the folder
    provided (sample_folder + '/.success_assembly') for later analysis.

    Steps:

    - It generates system call for SPADES assembly.

    - It generates timestamp file.

    :param sample_folder: Absolute path to store results. It must exists.
    :param file1: Absolute path to fastq reads (R1).
    :param file2: Absolute path to fastq reads (R2).
    :param name: Sample name or tag to identify sample.
    :param SPADES_bin: Binary executable for SPADES assembly software.
    :param options: Extra command line options inserted before ``-t``
        (e.g. plasmid assembly via --plasmid). May be an empty string.
    :param threads: Number of CPUs to use.
    :param debug: Not used by this function.

    :type name: string
    :type sample_folder: string
    :type file1: string
    :type file2: string
    :type SPADES_bin: string
    :type options: string
    :type threads: integer

    :return: Returns **OK** if assembly process succeeded (or a previous
        success stamp exists).
    :rtype: string.
    :warnings: Returns **FAIL** if assembly process stopped.

    .. seealso:: This function depends on other BacterialTyper functions called:

        - :func:`HCGB.functions.main_functions.system_call`

        - :func:`HCGB.functions.time_functions.print_time_stamp`
    """
    ## check if previously assembled and succeeded
    filename_stamp = sample_folder + '/.success_assembly'
    if os.path.isfile(filename_stamp):
        stamp = HCGB_time.read_time_stamp(filename_stamp)
        print(colored("\tA previous command generated results on: %s [%s]" % (stamp, name), 'yellow'))
        return 'OK'

    ## call system for SPADES sample given
    logFile = sample_folder + '/' + name + '.log'

    ## command
    ## - a space always separates options from -t (harmless if options
    ##   already ends with one, required if it does not)
    ## - stderr is sent to the same descriptor as stdout (2>&1): opening the
    ##   log file twice ('> log 2> log') lets both streams overwrite each other
    cmd_SPADES = '%s %s -t %s -o %s -1 %s -2 %s > %s 2>&1' % (
        SPADES_bin, options, threads, sample_folder, file1, file2, logFile)
    code = HCGB_sys.system_call(cmd_SPADES)

    if code == 'OK':
        ## success stamp
        HCGB_time.print_time_stamp(filename_stamp)
        return 'OK'

    return "FAIL"
def check_results(db2use, outdir_sample, assembly_cutoff, card_trick_info):
    """Collect, per sample, the parsed ARIBA result files for one database.

    ``outdir_sample`` is a dataframe with index levels 'sample' and 'db' and
    columns 'output' and 'dirname' describing the ARIBA output folders. For
    every sample analysed with ``db2use`` the results are parsed with
    ``results_parser`` unless the stamp ``<dirname>/.success_<database>``
    exists, in which case the expected file names are rebuilt.

    :param db2use: Database name ('card', 'vfdb_full' or any other).
    :param outdir_sample: Multi-index dataframe (levels 'sample', 'db').
    :param assembly_cutoff: Cutoff handed to ``results_parser``.
    :param card_trick_info: Information handed to ``results_parser``.

    :returns: Dataframe indexed by sample with columns 'csv', 'excel' and
        'database'.
    :rtype: pandas.DataFrame

    .. seealso:: Additional information to ARIBA results generated.

        - :ref:`ARIBA-explained`
    """
    ## (stamp suffix, report label) for each supported database; any other
    ## database shares the 'other' tag, which might generate conflicts if
    ## several other databases are provided
    ## TODO: check
    known_dbs = {
        'card': ('card', 'CARD'),
        'vfdb_full': ('vfdb_full', 'VFDB'),
    }

    summary = pd.DataFrame(columns=("csv", "excel", "database"))

    ## iterate multi-index dataframe
    for sample, per_sample in outdir_sample.groupby(level='sample'):
        for this_db, per_db in per_sample.groupby(level='db'):
            if this_db == db2use:
                entry = per_db.loc[sample, db2use]
                folderResults = entry['output']
                outfolder = entry['dirname']

                database, name_db = known_dbs.get(db2use, ('other', 'other'))

                success_file = outfolder + '/.success_' + database
                if not os.path.isfile(success_file):
                    (name_excel, name_csv) = results_parser(
                        database, folderResults, sample, outfolder,
                        assembly_cutoff, card_trick_info)
                else:
                    when = HCGB_time.read_time_stamp(success_file)
                    print(colored("\tA previous command generated results on: %s [%s]" % (when, sample), 'yellow'))
                    prefix = outfolder + '/' + sample + '_' + name_db
                    name_excel = prefix + '_results.xlsx'
                    name_csv = prefix + '_summary.csv'

                summary.loc[sample] = (name_csv, name_excel, name_db)

    ## to return
    return summary
def init_db_object(debug):
    """Instantiate the ete taxonomy object.

    Created by Joe R. J. Healey; Nick Youngblut
    Original code.

    The folder holding ``ncbi.dbfile`` is checked for 'timestamp_db.txt'.
    If the stamp exists and is older than 30 days ``update_db`` is called;
    if it does not exist a first stamp is created.

    :param debug: True/False for printing developer messages.

    :returns: The ``NCBITaxa`` instance.
    """
    # Instantiate the ete NCBI taxa object
    print("+ ------------------------------------- +")
    print("+ Looking for NCBI taxonomy database:")
    ncbi = NCBITaxa()

    ## dbfile location
    if debug:
        debug_message("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
        debug_message('NCBI Taxonomy database is stored at {}\n'.format(ncbi.dbfile), "yellow")

    ## folder would be download here: ~/.etetoolkit/taxa.sqlite
    db_folder = os.path.dirname(format(ncbi.dbfile))
    stamp_file = db_folder + '/timestamp_db.txt'

    ## no stamp yet: create the first one and finish
    if not os.path.isfile(stamp_file):
        time_functions.print_time_stamp(stamp_file)
        return ncbi

    ## check timestamp, update if necessary
    when = time_functions.read_time_stamp(stamp_file)
    age_days = time_functions.get_diff_time(stamp_file)

    if debug:
        for text in ('Database previously initiated',
                     'on date: {}'.format(when),
                     'Days passed: {}'.format(age_days)):
            debug_message(text, "yellow")

    if age_days > 30:
        update_db(ncbi, db_folder, debug)
        return ncbi

    if debug:
        debug_message('No need to update db', "yellow")
        debug_message("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")

    print(colored("\tA previous command generated results on: %s [%s]" % (when, 'init database'), 'yellow'))
    return ncbi
def trimmo_caller(list_reads, sample_folder, name, threads, Debug, adapters):
    """Trim the reads of a sample unless a previous run already succeeded.

    When ``sample_folder + '/.success'`` exists its time stamp is reported;
    otherwise ``trimmomatic_call.trimmo_module`` is called with all the
    arguments received.

    :param list_reads: Read files of the sample.
    :param sample_folder: Folder holding the results and the time stamp.
    :param name: Sample name.
    :param threads: Number of CPUs to use.
    :param Debug: True/False for printing developer messages.
    :param adapters: Adapters handed to ``trimmomatic_call.trimmo_module``.

    :returns: None.
    """
    success_file = sample_folder + '/.success'

    if not os.path.isfile(success_file):
        # Call trimmomatic
        trimmomatic_call.trimmo_module(list_reads, sample_folder, name,
                                       threads, Debug, adapters)
        return

    ## previously processed and succeeded
    when = HCGB_time.read_time_stamp(success_file)
    message = "\tA previous command generated results on: %s [%s]" % (when, name)
    print(colored(message, 'yellow'))
def create_blast_results(sample, fasta_file, outdir, debug):
    """Create BLAST results for a fasta file vs. itself.

    A protein BLAST database is built from ``fasta_file`` (unless the
    '.db_success' stamp in ``outdir`` is non-empty) and ``fasta_file`` is
    then searched against it with blastp (unless the '.blast_success' stamp
    is non-empty).

    :param sample: Sample name, used to name the BLAST database.
    :param fasta_file: Protein fasta file.
    :param outdir: Output folder for the database, results and stamps.
    :param debug: True/False for printing developer messages.

    :returns: Absolute path to 'BLAST_raw_results.tsv' inside ``outdir``.
    :rtype: string
    """
    # phr is the header file, pin is the index file, psq is the sequence file

    ## debug messages
    if debug:
        debug_message('create_blast_results function call:', 'yellow')
        debug_message('sample: ' + sample, 'yellow')
        debug_message('fasta_file: ' + fasta_file, 'yellow')
        debug_message('outdir: ' + outdir, 'yellow')

    ## output file
    raw_blast = os.path.abspath(os.path.join(outdir, "BLAST_raw_results.tsv"))

    ## timestamps
    db_timestamp = os.path.join(outdir, '.db_success')
    search_timestamp = os.path.join(outdir, '.blast_success')

    ## search already done
    if HCGB.functions.files_functions.is_non_zero_file(search_timestamp):
        read_time = HCGB_time.read_time_stamp(search_timestamp)
        print(colored("\t+ Duplicate search already available for sample %s [%s]" % (sample, read_time), 'green'))
        return raw_blast

    ## get binaries: use the configured executables and fall back to the
    ## previously hard-coded locations only when the configuration gives none
    (makeblastdb_exe, blastp_exe) = BacDup.modules.config.get_exe('BLAST', debug)
    makeblastdb_exe = makeblastdb_exe or "/usr/bin/makeblastdb"
    blastp_exe = blastp_exe or "/usr/bin/blastp"

    ## check if db is indexed already
    db_path_name = os.path.join(os.path.abspath(outdir), sample + '_db')
    if not HCGB.functions.files_functions.is_non_zero_file(db_timestamp):
        ## generate blastdb for genome
        HCGB.functions.blast_functions.makeblastdb(db_path_name, fasta_file, makeblastdb_exe, 'prot')

        ## print time stamp
        HCGB_time.print_time_stamp(db_timestamp)
    else:
        ## the date reported is the one of the database stamp
        read_time = HCGB_time.read_time_stamp(db_timestamp)
        print(colored("\t+ BLAST database already available for sample %s [%s]" % (sample, read_time), 'green'))

    ## create blastp outfile
    HCGB.functions.blast_functions.blastp(blastp_exe, raw_blast, db_path_name, fasta_file, 1)

    ## print time stamp
    HCGB_time.print_time_stamp(search_timestamp)

    return raw_blast
def download_VFDB_files(folder):
    """Download and decompress the VFDB files into ``folder``.

    Given a folder, check if it contains VFDB information or download it
    from website: http://www.mgc.ac.cn

    The file 'download_timestamp.txt' inside ``folder`` records the last
    download; data younger than or equal to 30 days is not downloaded
    again. The folder is created when it does not exist.

    :param folder: Folder to store the VFDB files.
    :type folder: string

    :returns: An empty tuple.
    """
    links = ("http://www.mgc.ac.cn/VFs/Down/VFs.xls.gz",
             "http://www.mgc.ac.cn/VFs/Down/Comparative_tables_from_VFDB.tar.gz")

    ## time stamp
    stamp_file = folder + '/download_timestamp.txt'

    ## check if data is downloaded, how old is the data and if it is
    ## necessary to download again (>30 days is long enough to update)
    if not os.path.exists(folder):
        HCGB_files.create_folder(folder)
    elif os.path.isfile(stamp_file):
        when = HCGB_time.read_time_stamp(stamp_file)
        print("+ A previous download generated results on: ", when)
        age_days = HCGB_time.get_diff_time(stamp_file)
        print("\t\t** %s days ago" % age_days)

        if not (age_days > 30):
            print("\t\t** No need to download data again.")
            return ()

        ## download again
        print("\t\t** Downloading information again just to be sure...")

    ## download every link that is not commented out
    print('+ Downloading files:\n')
    for url in links:
        if url.startswith('#'):
            continue
        HCGB_sys.wget_download(url, folder)

    ## decompress files
    print('+ Decompressing gzip files\n')
    compressed = [item for item in os.listdir(folder) if item.endswith('.gz')]
    for item in compressed:
        HCGB_files.extract(folder + '/' + item, folder)

    ## make stamp time
    HCGB_time.print_time_stamp(stamp_file)

    return ()
def prepare_card_data(database_folder):
    """Download (when needed) and parse the CARD ontology data.

    A 'CARD' subfolder is created inside ``database_folder``. The ontology
    is updated with ``card_trick.ontology_functions.update_ontology`` and
    parsed with ``parse_ontology`` when no '.success' stamp exists in that
    subfolder or when the stamp is older than 30 days.

    :param database_folder: Path to the main database folder.
    :type database_folder: string

    :returns: Path of the CARD folder, or the string 'FAIL' when the parsed
        ontology is empty.
    :rtype: string
    """
    ## create CARD folder
    abs_folder = os.path.abspath(database_folder)
    CARD_folder = HCGB_files.create_subfolder('CARD', abs_folder)

    ## make stamp time
    filename_stamp = CARD_folder + '/.success'

    download = True
    if os.path.isfile(filename_stamp):
        stamp = HCGB_time.read_time_stamp(filename_stamp)
        print(colored("\tA previous command generated results on: %s [CARD Ontology Data]" % stamp, 'yellow'))

        ## check time passed
        days_passed = HCGB_time.get_diff_time(filename_stamp)
        print("\t** %s days ago" % days_passed)
        if days_passed > 30:
            ## download again
            print("\t ** Downloading information again just to be sure...")
        else:
            print("\t ** No need to download data again.")
            download = False

    if download:
        ## update database in a path
        aro_obo_file = card_trick.ontology_functions.update_ontology(CARD_folder, False)

        ## get ontology and save it in csv
        return_frame = card_trick.ontology_functions.parse_ontology(aro_obo_file, False)

        ## nothing parsed: report failure with the same string used by the
        ## other functions (the bare name FAIL is not defined)
        if return_frame.empty:
            return 'FAIL'

        ## success stamp
        HCGB_time.print_time_stamp(filename_stamp)

    ## return folder name
    return CARD_folder
def snippy_variant_caller(reference, files, threads, outdir, name, contig_option, other_options, sample_name, Debug):
    """Call variants for a sample against a reference unless already done.

    A subfolder ``<sample_name>_vs_<name>`` is created inside ``outdir``.
    When it contains the '.success' stamp the previous run is reported;
    otherwise ``variant_calling.snippy_call`` is executed and the stamp is
    written if it returns 'OK'.

    :param reference: Reference handed to ``variant_calling.snippy_call``.
    :param files: Read files of the sample.
    :param threads: Number of CPUs to use.
    :param outdir: Folder where the mapping subfolder is created.
    :param name: Reference name, used to build the subfolder tag.
    :param contig_option: Handed to ``variant_calling.snippy_call``.
    :param other_options: Handed to ``variant_calling.snippy_call``.
    :param sample_name: Sample name.
    :param Debug: True/False for printing developer messages.

    :returns: 'OK' when a previous successful run exists; otherwise the
        code returned by ``variant_calling.snippy_call``.
    """
    ## create subfolder within phylo for this mapping
    tag = sample_name + '_vs_' + name
    subdir = HCGB_files.create_subfolder(tag, outdir)

    ## check if previously process and succeeded
    filename_stamp = subdir + '/.success'
    if os.path.isfile(filename_stamp):
        stamp = HCGB_time.read_time_stamp(filename_stamp)
        print(colored("\tA previous command generated results on: %s [%s]" % (stamp, tag), 'yellow'))
        ## the stamp is only written after an 'OK' run, so report 'OK'
        ## (previously `code` was never assigned on this path)
        return 'OK'

    # Call variant calling
    code = variant_calling.snippy_call(reference, files, threads, subdir,
                                       sample_name, contig_option, other_options, Debug)
    if code == 'OK':
        HCGB_time.print_time_stamp(filename_stamp)

    return code
def agrvate_caller(dict_assemblies, dict_folders, debug=False):
    """Create agrvate call and control for parameters.

    For every sample a folder is created and, unless it contains the
    '.success' stamp, ``agrvate_call`` is run from inside that folder. The
    dataframes returned are concatenated; the stamp is written when the
    dataframe of the sample is not empty. Samples with a previous stamp are
    only reported and add nothing to the returned dataframe.

    :param dict_assemblies: Dictionary of sample name -> assembly file.
    :param dict_folders: Dictionary of sample name -> output folder.
    :param debug: True/False for printing developer messages.

    :type dict_assemblies: dict
    :type dict_folders: dict
    :type debug: bool

    :returns: Concatenated results of the samples analysed in this call.
    :rtype: pandas.DataFrame
    """
    ## ATTENTION: agrvate needs to chdir to output folder
    path_here = os.getcwd()

    print("+ Checking agr genes for each sample retrieved...")

    agrvate_results = pd.DataFrame()

    ## No need to optimize. There is a problem with the working dir of
    ## agrvate and we need to change every time.
    try:
        for name, assembly_file in dict_assemblies.items():
            sample_folder = HCGB_files.create_folder(dict_folders[name])

            ## check if previously done and succeeded
            filename_stamp = sample_folder + '/.success'
            if os.path.isfile(filename_stamp):
                stamp = HCGB_time.read_time_stamp(filename_stamp)
                print(colored("\tA previous command generated results on: %s [%s]" % (stamp, name), 'yellow'))
                continue

            os.chdir(sample_folder)
            info_sample = agrvate_call(name, assembly_file, sample_folder, debug)
            agrvate_results = pd.concat([agrvate_results, info_sample], join='outer')

            if info_sample.shape[0] == 0:
                print("+ Some error occurred with sample %s. Please re-run analysis or check log files." % name)
            else:
                ## success
                HCGB_time.print_time_stamp(filename_stamp)

        print("+ Jobs finished\n+ Collecting information for all samples...")
    finally:
        ## always go back to the initial folder, even if a sample failed
        os.chdir(path_here)

    ## debug messages
    if debug:
        HCGB_aes.debug_message('agrvate_results', 'yellow')
        HCGB_main.print_all_pandaDF(agrvate_results)

    return agrvate_results
def BUSCO_run(sample_name, fasta, threads, output_name, dataset_name, mode, busco_db):
    """Run BUSCO for a sample and dataset unless a previous run succeeded.

    Results are expected in ``output_name/<dataset_name>/run_<dataset_name>``.
    When that folder holds the '.success' stamp the date is reported.
    Otherwise the working directory is changed to ``output_name``, BUSCO is
    called through ``HCGB_sys.system_call`` and the stamp is written if
    'short_summary.txt' was generated.

    :param sample_name: Sample name (used in messages).
    :param fasta: Input fasta file.
    :param threads: Number of CPUs to use.
    :param output_name: Output folder; becomes the working directory.
    :param dataset_name: BUSCO dataset, or 'auto-lineage'.
    :param mode: BUSCO mode.
    :param busco_db: Download path for BUSCO datasets.

    :returns: 'FAIL' when no summary file was generated, an empty tuple
        otherwise.
    """
    run_folder = os.path.join(output_name, dataset_name + '/run_' + dataset_name)

    ## timestamp
    success_file = run_folder + '/.success'

    print(colored("\tBUSCO Dataset [%s]; Sample [%s]" % (dataset_name, sample_name), 'yellow'))

    ## check previous run
    if os.path.isfile(success_file):
        when = HCGB_time.read_time_stamp(success_file)
        print(colored("\tSuccessfully run on date: %s" % when, 'green'))
        return ()

    busco_bin = set_config.get_exe('busco')
    os.chdir(output_name)

    ## init cmd configuration
    cmd_head = '%s -f -i %s -c %s --mode %s --download_path %s ' % (busco_bin, fasta, threads, mode, busco_db)

    ## options if autolineage or given dataset
    if dataset_name != "auto-lineage":
        logFile = dataset_name + '.log'
        cmd_tail = '-l %s -o %s > %s' % (dataset_name, dataset_name, logFile)
    else:
        logFile = 'auto_lineage.log'
        cmd_tail = '--auto-lineage -o %s > %s' % (dataset_name, logFile)

    ## system call
    HCGB_sys.system_call(cmd_head + cmd_tail)

    if not os.path.isfile(run_folder + '/short_summary.txt'):
        print(colored("BUSCO failed: Dataset [%s]; Sample [%s]" % (dataset_name, fasta), 'red'))
        return ('FAIL')

    ## timestamp
    HCGB_time.print_time_stamp(success_file)
    return ()
def mapReads_caller(files, folder, name, threads, STAR_exe, genomeDir, limitRAM_option, Debug):
    """Map the reads of a sample with STAR unless already done.

    When ``folder + '/.success'`` exists the previous run is reported.
    Otherwise ``mapReads.mapReads`` is called in "LoadAndKeep" mode and the
    stamp is written if it returns a true value. In every case the expected
    BAM file path is stored in the ``mapping_results`` dictionary under
    ``name``.

    :param files: Read files of the sample.
    :param folder: Output folder of the mapping.
    :param name: Sample name.
    :param threads: Number of CPUs to use.
    :param STAR_exe: STAR executable.
    :param genomeDir: STAR genome directory.
    :param limitRAM_option: RAM limit handed to ``mapReads.mapReads``.
    :param Debug: True/False for printing developer messages.

    :returns: An empty tuple.
    """
    success_file = folder + '/.success'

    if not os.path.isfile(success_file):
        if Debug:
            print("\n** DEBUG: mapReads_caller options **\n")
            labelled = (
                ("folder: ", folder),
                ("name: ", name),
                ("threads: ", str(threads)),
                ("STAR_exe: ", STAR_exe),
                ("genomeDir: ", genomeDir),
                ("limitRAM_option: ", str(limitRAM_option)),
            )
            for label, value in labelled:
                print(label + value)
            print("files: ")
            print(files)

        # Call STAR
        mapped_ok = mapReads.mapReads("LoadAndKeep", files, folder, name, STAR_exe,
                                      genomeDir, limitRAM_option, threads, Debug)
        if mapped_ok:
            time_functions.print_time_stamp(success_file)
        else:
            print("+ Mapping sample %s failed..." % name)
    else:
        ## previously mapped and succeeded
        when = time_functions.read_time_stamp(success_file)
        print(colored("\tA previous command generated results on: %s [%s -- %s]" % (when, name, 'STAR'), 'yellow'))

    ## return results
    mapping_results[name] = os.path.join(folder, 'Aligned.sortedByCoord.out.bam')
    return ()
def pie_plot_results(RNAbiotypes_stats_file, name, folder, Debug):
    """Plot the RNA biotype counts of a sample as pie chart plus table.

    The statistics file is read as a headerless tab separated table whose
    first column is the biotype and whose second column is the count.
    Biotypes below 1% of the total are merged into an 'Other' slice. The
    figure is saved as ``<name>_RNAbiotypes.pdf`` inside ``folder`` and the
    stamps '.success_plot' and '.success_all' are written. Nothing is done
    when '.success_plot' already exists.

    :param RNAbiotypes_stats_file: Tab separated file (biotype, count).
    :param name: Sample name.
    :param folder: Output folder for the figure and the time stamps.
    :param Debug: Not used by this function.

    :returns: None.
    """
    filename_stamp_plot = folder + '/.success_plot'
    if os.path.isfile(filename_stamp_plot):
        stamp = time_functions.read_time_stamp(filename_stamp_plot)
        print(colored("\tA previous command generated results on: %s [%s -- %s]" % (stamp, name, 'plot results'), 'yellow'))
        return

    # PLOT and SHOW results
    RNAbiotypes_stats = main_functions.get_data(RNAbiotypes_stats_file, '\t', 'header=None')

    # create plot (keep the handle: it is needed to close the figure)
    fig = plt.figure(figsize=(16, 8))
    df_genetype_2 = pd.DataFrame({'Type': RNAbiotypes_stats[0],
                                  'Count': RNAbiotypes_stats[1]}).sort_values(by=['Count'])

    ## get total count
    df_genetype_ReadCount_sum = df_genetype_2['Count'].sum()

    ## filter 1% values
    minimun = df_genetype_ReadCount_sum * 0.01
    df_genetype_filter_greater = df_genetype_2[df_genetype_2['Count'] >= minimun]
    df_genetype_filter_smaller = df_genetype_2[df_genetype_2['Count'] < minimun]

    ## create %values (added after filtering: only the table shows them)
    df_genetype_2['Percentage'] = (df_genetype_2['Count'] / df_genetype_ReadCount_sum * 100).round(3)

    ## merge and generate Other class
    ## (pd.concat: DataFrame.append was removed in pandas 2.0)
    df_genetype_filter_smaller_sum = df_genetype_filter_smaller['Count'].sum()
    other_row = pd.DataFrame([{'Type': 'Other', 'Count': df_genetype_filter_smaller_sum}])
    df_genetype_filter_greater2 = pd.concat([df_genetype_filter_greater, other_row],
                                            ignore_index=True)

    ## Create Pie Plot
    ax1 = plt.subplot(121, aspect='equal')
    df_genetype_filter_greater2.plot.pie(
        y='Count', ax=ax1, autopct='%1.2f%%',
        shadow=False, labels=df_genetype_filter_greater2['Type'], legend=False)

    # plot table
    ax2 = plt.subplot(122)
    plt.axis('off')
    tbl = ax2.table(
        cellText=df_genetype_2.values,
        colLabels=df_genetype_2.columns,
        loc='center', rowLoc='left', cellLoc='center',
    )
    tbl.auto_set_font_size(True)
    tbl.scale(1.1, 1.1)

    ## set PDF name
    name_figure = os.path.join(folder, name + '_RNAbiotypes.pdf')

    ## generate image
    fig.savefig(name_figure)
    ## close this figure; plt.close(<file name>) looks for a figure with
    ## that label, finds none and leaves the figure open
    plt.close(fig)

    ## print time stamps
    time_functions.print_time_stamp(filename_stamp_plot)
    filename_stamp_all = folder + '/.success_all'
    time_functions.print_time_stamp(filename_stamp_all)
def parse_featureCount(out_file, path, name, bam_file, Debug):
    """Parse featureCount results for RNAbiotype analysis.

    Two files are generated unless ``path + '/.success_parse'`` exists:

    - ``out_file + '.tsv'``: every feature count, the non-zero entries of
      the featureCount summary (except 'Assigned') and a final 'unmapped'
      entry taken from the mapping statistics (STAR 'Log.final.out' or
      tophat 'align_summary.txt' next to ``bam_file``).

    - ``<path>/<name>_RNAbiotype.tsv``: features with count > 0, with all
      features whose ID contains 'tRNA' summed into a single 'tRNA' entry.

    :param out_file: Name provided to featureCount for output results.
    :param path: Folder for the RNAbiotype file and the time stamp.
    :param name: Sample name.
    :param bam_file: BAM file; its folder is searched for mapping statistics.
    :param Debug: True/False for printing developer messages.

    :returns: Tuple (tsv file name, RNAbiotype file name).
    """
    ## file names
    out_tsv_file_name = out_file + '.tsv'
    RNA_biotypes_file_name = os.path.join(path, name + '_RNAbiotype.tsv')

    filename_stamp_parse = path + '/.success_parse'
    if os.path.isfile(filename_stamp_parse):
        stamp = time_functions.read_time_stamp(filename_stamp_parse)
        print(colored("\tA previous command generated results on: %s [%s -- %s]" % (stamp, name, 'parse results'), 'yellow'))
        return (out_tsv_file_name, RNA_biotypes_file_name)

    ## debugging messages
    if Debug:
        print("** DEBUG:")
        print("Parse results for sample: " + name)

    ## every file is handled with `with` so it is closed even on errors
    with open(out_tsv_file_name, 'w') as out_tsv_file:

        ##########################################
        ### read count file
        ##########################################
        tRNA_count = 0
        with open(RNA_biotypes_file_name, 'w') as RNA_biotypes_file:
            with open(out_file) as count_file:
                count_file_lines = count_file.read().splitlines()

            for line in count_file_lines:
                ## skip comments and header
                if line.startswith(('#', 'Geneid')):
                    continue

                fields = line.split('\t')
                ID = fields[0]
                count = int(fields[-1])
                string2write_raw = "%s\t%s\n" % (ID, count)
                out_tsv_file.write(string2write_raw)

                ## tRNA entries are summed up, the rest written if not empty
                if re.search(r".*tRNA", ID):
                    tRNA_count += count
                elif count > 0:
                    RNA_biotypes_file.write(string2write_raw)

            ## count and summary tRNA
            RNA_biotypes_file.write("tRNA\t%s\n" % tRNA_count)

        ##########################################
        ### read summary count file
        ##########################################
        with open(out_file + '.summary') as summary_count_file:
            summary_count_file_lines = summary_count_file.read().splitlines()

        for line in summary_count_file_lines:
            if line.startswith(('Status', 'Assigned')):
                continue

            ## adds Unassigned_Ambiguity
            ## adds Unassigned_NoFeatures
            fields = line.split('\t')
            ID = fields[0]
            count = int(fields[-1])

            ## skip empty entries
            if count == 0:
                continue
            out_tsv_file.write("%s\t%s\n" % (ID, count))

        ##########################################
        ## get mapping statistics according to mapping software
        ##########################################
        count_multi = 0  ## accumulated but currently not reported
        count_unmap = 0
        mapping_folder = os.path.dirname(bam_file)
        mapping_stats = mapping_folder + '/Log.final.out'

        ## -------------------------------- ##
        ### STAR mapping
        ## -------------------------------- ##
        if files_functions.is_non_zero_file(mapping_stats):
            ## debugging messages
            if Debug:
                print("** DEBUG:")
                print("STAR mapping available for sample: " + name)
                print("mapping_folder: " + mapping_folder)

            with open(mapping_stats) as mapping_stats_file:
                mapping_stats_file_lines = mapping_stats_file.read().splitlines()

            for line in mapping_stats_file_lines:
                multi_search = re.search(r".*Number of reads mapped to", line)
                unmap_search = re.search(r".*unmapped.*", line)
                input_search = re.search(r".*input reads.*", line)

                if input_search:
                    total_input_reads = int(line.split('\t')[-1])

                if multi_search:
                    count_multi = count_multi + int(line.split('\t')[-1])
                elif unmap_search:
                    ## value is converted with math_functions.percentage
                    ## using the number of input reads read before
                    perc_tmp = line.split('\t')[-1]
                    count_reads = math_functions.percentage(perc_tmp, total_input_reads)
                    count_unmap = count_unmap + count_reads
        else:
            ## -------------------------------- ##
            ## tophat
            ## -------------------------------- ##
            mapping_stats = mapping_folder + '/align_summary.txt'
            count_map = 0
            total_input_reads = 0

            if files_functions.is_non_zero_file(mapping_stats):
                ## debugging messages
                if Debug:
                    print("** DEBUG:")
                    print("tophat mapping available for sample: " + name)
                    print("mapping_folder: " + mapping_folder)

                with open(mapping_stats) as mapping_stats_file:
                    mapping_stats_file_lines = mapping_stats_file.read().splitlines()

                for line in mapping_stats_file_lines:
                    map_search2 = re.search(r"Aligned.*\:\s+(\d+).*", line)
                    input_search2 = re.search(r".*Input.*\:\s+(\d+).*", line)
                    if input_search2:
                        total_input_reads = input_search2.group(1)
                    if map_search2:
                        count_map = map_search2.group(1)

                count_unmap = int(total_input_reads) - int(count_map)
            else:
                ## other
                print("Neither tophat or STAR..., no mapping statistics")

        ### print mapping stats
        out_tsv_file.write("unmapped\t%s\n" % count_unmap)

    ## print timestamp
    time_functions.print_time_stamp(filename_stamp_parse)

    return (out_tsv_file_name, RNA_biotypes_file_name)
def check_sample_assembly(name, sample_folder, files, threads):
    """Check if a sample is assembled and assemble it otherwise.

    The file *sample_folder/.success_all* marks a finished assembly. When
    present, the contig and scaffold statistics are read from the csv files
    in ``sample_folder``. When absent,
    :func:`BacterialTyper.scripts.spades_assembler.run_module_assembly` is
    called and the stamp is written unless it returns 'FAIL'. Developer
    messages depend on the ``Debug`` name visible to this module.

    :param name: Sample name or tag to identify sample.
    :param sample_folder: Directory to generate assembly output. It must exist.
    :param files: List containing files (fastq R1 & R2) for the sample to be assembled.
    :param threads: Number of CPUs to use

    :type name: string
    :type sample_folder: string
    :type files: list
    :type threads: integer

    :return: None. The dictionary ``assembly_stats`` is populated under
        ``name`` with the assembly statistics information.

    .. seealso:: This function depends on other BacterialTyper and HCGB functions called:

        - :func:`BacterialTyper.scripts.spades_assembler.run_module_assembly`
    """
    success_file = sample_folder + '/.success_all'

    if not os.path.isfile(success_file):
        ## debug message
        if Debug:
            HCGB_aes.debug_message("spades_assembler.run_module_assembly call:", "yellow")
            call_values = [name, sample_folder, files[0], files[1], str(threads)]
            print("spades_assembler.run_module_assembly " + "\t".join(call_values) + "\n")

        # Call spades_assembler
        outcome = spades_assembler.run_module_assembly(name, sample_folder,
                                                       files[0], files[1], threads)
        if outcome == 'FAIL':
            print("Some error occurred for sample %s while generating the assembly. " % name)
        else:
            ## success stamp
            HCGB_time.print_time_stamp(success_file)
            # list containing dictionary of data and excel
            assembly_stats[name] = outcome
        return

    ## previously assembled and succeeded
    when = HCGB_time.read_time_stamp(success_file)
    print(colored("\tA previous command generated results on: %s [%s]" % (when, name), 'yellow'))

    ## Get information
    prefix = sample_folder + '/' + name
    contig_stats = HCGB_main.file2dictionary(prefix + '_assembly-contigs.csv', ',')
    scaffold_stats = HCGB_main.file2dictionary(prefix + '_assembly-scaffolds.csv', ',')
    stat_output = {'Contig Stats': contig_stats, 'Scaffold Stats': scaffold_stats}

    ## populate main dictionary
    assembly_stats[name] = [stat_output, prefix + '_assembly_stats.xlsx']
def ariba_getref(database, outdir, Debug, threads):
    """Download an ARIBA reference database and prepare it.

    Usage of the underlying command: ``ariba getref [options] DB OUTPREFIX``
    where DB must be one of: argannot, card, megares, plasmidfinder,
    resfinder, srst2_argannot, vfdb_core, vfdb_full, virulencefinder.

    Steps:

    - ``ariba getref`` is run with prefix ``<outdir>/<database>`` unless
      ``<outdir>/.success`` exists. The stamp is written after a successful
      download only; an existing stamp is left untouched.

    - The '.tsv' and '.fa' files found in ``outdir`` are handed to
      ``ariba_prepareref`` unless ``<outdir>_prepareref/.success`` exists.

    :param database: Name of the database to download.
    :param outdir: Folder to store the downloaded files.
    :param Debug: True/False for printing developer messages.
    :param threads: Number of CPUs to use.

    :type database: string
    :type outdir: string
    :type Debug: Boolean
    :type threads: integer

    :returns: 'OK' when the database is downloaded and prepared, 'FAIL'
        otherwise.
    :rtype: string
    """
    ## folders
    outdir_name = outdir + '/' + database
    outdir_prepare_ref = outdir + '_prepareref'

    ## download information in database folder provided by config
    print("\t+ Retrieve information from database: " + database)

    ## check if previously downloaded and succeeded
    filename_stamp = outdir + '/.success'
    if os.path.isfile(filename_stamp):
        stamp = HCGB_time.read_time_stamp(filename_stamp)
        print(colored("\tA previous command generated results on: %s [%s]" % (stamp, database), 'yellow'))
    else:
        cmd_getref = 'ariba getref %s %s' % (database, outdir_name)
        if HCGB_sys.system_call(cmd_getref) != 'OK':
            ## report error & exit
            print(colored("***ERROR: ariba getref %s failed " % database + " **", 'red'))
            return 'FAIL'

        HCGB_time.print_time_stamp(filename_stamp)

        ## debug message
        if Debug:
            print(colored("**DEBUG: ariba getref %s succeed " % database + "**", 'yellow'))

    ## debug message
    if Debug:
        print(colored("**DEBUG: Run ariba prepareref %s " % database + "**", 'yellow'))

    ## check if previously prepareref and succeeded
    filename_stamp_prepare = outdir_prepare_ref + '/.success'
    if os.path.isfile(filename_stamp_prepare):
        stamp = HCGB_time.read_time_stamp(filename_stamp_prepare)
        print(colored("\tA previous command generated results on: %s [%s]" % (stamp, database), 'yellow'))
        return 'OK'

    ## get information: metadata (tsv) and sequences (fa) downloaded
    fasta = ""
    metadata = ""
    for f in os.listdir(outdir):
        if f.endswith('tsv'):
            metadata = outdir + '/' + f
        elif f.endswith('fa'):
            fasta = outdir + '/' + f

    code = ariba_prepareref(fasta, metadata, outdir_prepare_ref, threads)
    if code != 'OK':
        return 'FAIL'

    HCGB_time.print_time_stamp(filename_stamp_prepare)

    ## callers compare the returned value with 'OK'
    return 'OK'
def download_ariba_databases(list_dbs, main_folder, Debug, threads):
    """Download ARIBA_ databases.

    Using ARIBA software this function retrieves desired databases and prepare them for later analysis.

    A database that was prepared less than 30 days ago (according to the
    ``.success`` stamp in its ``_prepareref`` folder) is reused as is.

    :param list_dbs: List of databases to download.
    :param main_folder: Absolute path to database folder.
    :param Debug: True/false for printing developer messages
    :param threads: Number of CPUs to use.

    :type list_dbs: string
    :type main_folder: string
    :type Debug: Boolean
    :type threads: integer

    .. seealso:: This function depends on other BacterialTyper functions called:

        - :func:`HCGB.functions.file_functions.create_subfolder`

        - :func:`HCGB.functions.time_functions.read_time_stamp`

        - :func:`BacterialTyper.scripts.ariba_caller.get_ARIBA_dbs`

        - :func:`BacterialTyper.scripts.ariba_caller.ariba_getref`

    .. include:: ../../links.inc
    """
    print("\n\n+ Download databases for Antimicrobial Resistance Identification By Assembly (ARIBA).")
    ariba_folder = HCGB_files.create_subfolder("ARIBA", main_folder)

    ## print ARIBA databases:
    print("+ Available databases:")
    dbs = get_ARIBA_dbs(list_dbs)

    for db_set in dbs:
        HCGB_aes.print_sepLine("-", 30, False)
        print(colored("+ " + db_set, 'yellow'))

        ## prepare folders
        folder_set = HCGB_files.create_subfolder(db_set, ariba_folder)
        outdir_prepare_ref = folder_set + '_prepareref'

        ## stamp time file
        filename_stamp_prepare = outdir_prepare_ref + '/.success'

        ## check if previously done
        if os.path.isfile(filename_stamp_prepare):
            stamp = HCGB_time.read_time_stamp(filename_stamp_prepare)
            print("\t+ Database is downloaded in folder: ", folder_set)
            print("\t+ Data is available and indexed in folder: ", outdir_prepare_ref)
            print(colored("\tDatabase was previously downloaded and prepared on: %s" % stamp, 'yellow'))

            ## Check if necessary to download again after several months/days
            days_passed = HCGB_time.get_diff_time(filename_stamp_prepare)
            print("\t\t** %s days ago" % days_passed)
            if (days_passed > 30):
                ## download again
                print("\t\t** Downloading information again just to be sure...")
                return_ariba_getref = ariba_getref(db_set, folder_set, Debug, threads)
            else:
                return_ariba_getref = 'OK'
        else:
            return_ariba_getref = ariba_getref(db_set, folder_set, Debug, threads)

        ## ariba_getref reports failure as 'FAIL'. Any other value is taken as
        ## success, so a finished download is not flagged as an error just
        ## because the success token differs from 'OK'.
        if (return_ariba_getref == 'FAIL'):
            print(colored("** ARIBA getref failed or generated a warning for " + db_set, 'red'))
        else:
            print()
def edirect_ident(dataFrame, outdir_dict, Debug):
    """Connect to NCBI for information retrieval

    This function uses the software edirect_ to connect to NCBI and retrieve some information regarding samples, assemblies, publications, etc.

    :param dataFrame: pandas dataframe for samples to process. Result from :func:`BacterialTyper.modules.ident.KMA_ident`.
    :param outdir_dict: dictionary containing information for each sample of the output folder for this process.
    :param Debug: True/False for printing developer messages.

    :type dataFrame: pandas.DataFrame()
    :type outdir_dict: Dictionary
    :type Debug: Boolean

    :return: Information of the identification. One row per sample with columns
        sample, genus, species, strain, BioSample, genome and Plasmids. Samples
        without a chromosome match get empty strings in the taxonomy columns.
    :rtype: pandas.DataFrame()

    See example of returned dataframe in file :file:`/devel/results/edirect_download_results.csv` here:

    .. include:: ../../devel/results/edirect_download_results.csv
        :literal:

    .. seealso:: This function depends on other ``BacterialTyper`` functions called:

        - :func:`BacterialTyper.scripts.functions.get_info_file`

        - :func:`BacterialTyper.scripts.functions.read_time_stamp`

        - :func:`BacterialTyper.scripts.functions.print_time_stamp`

        - :func:`BacterialTyper.scripts.functions.optimize_threads`

        - :func:`BacterialTyper.scripts.functions.create_subfolder`

        - :func:`BacterialTyper.scripts.functions.boxymcboxface`

        - :func:`BacterialTyper.scripts.functions.is_non_zero_file`

        - :func:`BacterialTyper.scripts.edirect_caller.generate_docsum_call`

        - :func:`BacterialTyper.scripts.edirect_caller.generate_xtract_call`

    .. include:: ../../links.inc
    """
    ################################################
    ## TODO: What to do if multi-isolate sample?
    ################################################

    ## edirect
    HCGB_aes.boxymcboxface("EDirect information")
    print("+ Connect to NCBI to get information from samples identified...")

    ## create dataframe to return results
    edirect_frame = pd.DataFrame(columns=("sample", "genus", "species", "strain",
                                          "BioSample", "genome", "Plasmids"))

    ## debugging messages
    if Debug:
        print("*******************************************************")
        print("Dataframe sample_results: ")

    # Group dataframe by sample name. A scalar key (not a one-element list)
    # keeps ``name`` a plain string on every pandas version, so it can be
    # used to index outdir_dict.
    sample_results = dataFrame.groupby("Sample")

    for name, grouped in sample_results:
        ## debugging messages
        if Debug:
            print("Name: ", name)
            print(grouped)

        ## use edirect to get Species_name and entry for later identification
        edirect_folder = HCGB_files.create_subfolder('edirect', outdir_dict[name])
        filename_stamp = edirect_folder + '/.success_species'

        ## per-sample defaults: nothing may leak from the previous sample
        genus = ''
        species = ''
        strain = ''
        BioSample_name = ''
        genbank_acc = ''

        ## chromosome match
        chromosome_hits = grouped.loc[grouped['Database'] == 'bacteria.ATG']['#Template']
        has_chromosome = len(chromosome_hits) > 0

        if not has_chromosome:
            if Debug:
                print("Name: ", name)
                print("No chromosome match identified by kmer")
        else:
            nucc_entry = chromosome_hits.values[0].split()
            ## e.g. NZ_CP029680.1 Staphylococcus aureus strain AR_0215 chromosome, complete genome

            out_docsum_file = edirect_folder + '/nuccore_docsum.txt'
            tmp_species_outfile = edirect_folder + '/info.csv'

            if os.path.isfile(filename_stamp):
                stamp = HCGB_time.read_time_stamp(filename_stamp)
                print(colored("\tA previous command generated results on: %s [%s]" % (stamp, name), 'yellow'))
                status = True
            else:
                edirect_caller.generate_docsum_call('nuccore', nucc_entry[0], out_docsum_file)
                status = edirect_caller.generate_xtract_call(
                    out_docsum_file, 'DocumentSummary',
                    'Organism,BioSample,AssemblyAcc,Strain', tmp_species_outfile)

            ########################################
            ## get information from edirect call
            ########################################
            if not status:
                print("NO INFORMATION")
                continue

            taxa_name_tmp = HCGB_main.get_info_file(tmp_species_outfile)
            fields = taxa_name_tmp[0].split(',')
            Organism = fields[0].split()
            genus = Organism[0]  ## genus
            species = Organism[1]  ## species
            BioSample_name = fields[1]  ## BioSample
            AssemblyAcc = fields[2]  ## AssemblyAcc

            ## sometimes strain is missing
            if len(fields) > 3:
                strain = fields[3]  ## strain
            else:
                strain = 'NaN'

            ## get GenBank accession ID
            out_docsum_file_assembly = edirect_folder + '/assembly_docsum.txt'
            AssemblyAcc_outfile = edirect_folder + '/AssemblyAcc.csv'

            edirect_caller.generate_docsum_call('assembly', AssemblyAcc, out_docsum_file_assembly)
            edirect_caller.generate_xtract_call(out_docsum_file_assembly, 'DocumentSummary',
                                                'Genbank', AssemblyAcc_outfile)

            ## some error occurred
            if not HCGB_main.is_non_zero_file(out_docsum_file_assembly):
                continue

            ## Is it better to download Refseq or Genbank?
            ## https://www.quora.com/What-is-the-difference-between-Refseq-and-Genbank
            GenbankAcc = HCGB_main.get_info_file(AssemblyAcc_outfile)
            genbank_acc = GenbankAcc[0]
            if Debug:
                print("Sample: ", name)
                print("Genbank Acc: ", genbank_acc)

        ## plasmid match
        group_plasmid = grouped.loc[grouped['Database'] == 'plasmids.T']
        plasmid_entries = group_plasmid['#Template'].tolist()
        ## e.g. NZ_CP029083.1 Staphylococcus aureus strain AR464 plasmid unnamed1, complete sequence
        plasmid_entries_str = ",".join([i.split()[0] for i in plasmid_entries])

        ## save edirect_frame
        edirect_frame.loc[len(edirect_frame)] = (name, genus, species, strain,
                                                 BioSample_name, genbank_acc,
                                                 plasmid_entries_str)

        ## the stamp certifies the species files; only write it when they
        ## were actually retrieved for this sample
        if has_chromosome:
            HCGB_time.print_time_stamp(filename_stamp)

    ## debugging messages
    if Debug:
        print("*******************************************************")

    return (edirect_frame)
def biotype_all(featureCount_exe, path, gtf_file, bam_file, name, threads, Debug, allow_multimap, stranded):
    """Count reads per transcript biotype with featureCounts and parse the output.

    Nothing is done when ``path`` holds a ``.success_all`` stamp. The
    featureCounts call itself is skipped when a ``.success_featureCounts``
    stamp exists; parsing of ``featureCount.out`` is then still performed.

    :param featureCount_exe: featureCounts executable.
    :param path: Folder for results (created if missing).
    :param gtf_file: Annotation file given to featureCounts (``-a``).
    :param bam_file: Alignment file to count.
    :param name: Sample name, used in messages and in the log file name.
    :param threads: Number of CPUs (``-T``).
    :param Debug: True/False for printing developer messages.
    :param allow_multimap: If true use ``-M -O``, otherwise ``--largestOverlap``.
    :param stranded: Value given to the ``-s`` option.

    :returns: Empty tuple. The process exits if featureCounts fails.
    """
    ## make sure the results folder is there
    if not os.path.isdir(path):
        files_functions.create_folder(path)

    out_file = os.path.join(path, 'featureCount.out')
    logfile = os.path.join(path, name + '_RNAbiotype.log')
    previous_run_msg = "\tA previous command generated results on: %s [%s -- %s]"

    ## whole analysis already done for this sample
    stamp_all = path + '/.success_all'
    if os.path.isfile(stamp_all):
        done_on = time_functions.read_time_stamp(stamp_all)
        print(colored(previous_run_msg % (done_on, name, 'RNAbiotype'), 'yellow'))
        return ()

    stamp_counts = path + '/.success_featureCounts'
    if os.path.isfile(stamp_counts):
        done_on = time_functions.read_time_stamp(stamp_counts)
        print(colored(previous_run_msg % (done_on, name, 'featureCounts'), 'yellow'))
    else:
        ## debugging messages
        if Debug:
            print("** DEBUG:")
            print("featureCounts system call for sample: " + name)
            print("out_file: " + out_file)
            print("logfile: " + logfile)

        ## the two modes only differ in how overlapping/multimapping reads are handled
        overlap_flags = '-M -O' if allow_multimap else '--largestOverlap'
        cmd_featureCount = (
            '%s -s %s %s -T %s -p -t exon -g transcript_biotype -a %s -o %s %s 2> %s'
            % (featureCount_exe, stranded, overlap_flags, threads, gtf_file,
               out_file, bam_file, logfile))

        ## system call
        call_succeeded = system_call_functions.system_call(cmd_featureCount, False, True)
        if not call_succeeded:
            print("** ERROR: featureCount failed for sample " + name)
            exit()

        ## print time stamp
        time_functions.print_time_stamp(stamp_counts)

    ## parse results
    parsed_files = parse_featureCount(out_file, path, name, bam_file, Debug)
    (extended_Stats_file, RNAbiotypes_stats_file) = parsed_files

    ## debugging messages
    if Debug:
        print("** DEBUG:")
        print("extended_Stats: " + extended_Stats_file)
        print(main_functions.get_data(extended_Stats_file, '\t', 'header=None'))
        print("RNAbiotypes_stats: " + RNAbiotypes_stats_file)
        print(main_functions.get_data(RNAbiotypes_stats_file, '\t', 'header=None'))

    return ()
def run_annotation(options):
    """Main function of the annotation module.

    Retrieves the assemblies available in the input folder, annotates each of
    them through ``annot_caller`` using a thread pool, optionally builds a
    MultiQC report of the Prokka results and finally checks the annotations
    with BUSCO.

    Any of the ``help_*`` options prints the requested help and exits.

    :param options: Parsed command-line options. Attributes read here include
        debug, help_format, help_BUSCO, help_project, help_multiqc,
        help_Prokka, input, detached, output_folder, kingdom, genera, threads
        and skip_report. ``options.batch`` and ``options.project`` are set by
        this function.

    :returns: Empty tuple.
    """
    ## init time
    start_time_total = time.time()

    ## debugging messages
    global Debug
    Debug = bool(options.debug)

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_format):
        ## help_format option
        sampleParser.help_format()
        exit()
    elif (options.help_BUSCO):
        ## information for BUSCO
        BUSCO_caller.print_help_BUSCO()
        exit()
    elif (options.help_project):
        ## information for project
        help_info.project_help()
        exit()
    elif (options.help_multiqc):
        ## information for Multiqc
        multiQC_report.multiqc_help()
        ## stop here like every other help option instead of running the pipeline
        exit()
    elif (options.help_Prokka):
        ## information for Prokka
        annotation.print_list_prokka()
        exit()

    ## set default
    options.batch = False

    ###
    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Assembly annotation")

    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)

    ## Project mode as default
    if (options.detached):
        options.project = False
        outdir = os.path.abspath(options.output_folder)
    else:
        options.project = True
        outdir = input_dir

    ### symbolic links
    print("+ Retrieve all genomes assembled...")

    ## get files
    pd_samples_retrieved = sampleParser.files.get_files(
        options, input_dir, "assembly", ["fna"], options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieve **", 'yellow'))
        print(pd_samples_retrieved)

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")
    if not options.project:
        HCGB_files.create_folder(outdir)

    ## for samples
    outdir_dict = HCGB_files.outdir_project(outdir, options.project,
                                            pd_samples_retrieved, "annot",
                                            options.debug)

    ## annotate
    print("+ Annotate assemblies using prokka:")
    print("\t-Option: kingdom = ", options.kingdom, "; Annotation mode")
    if options.genera == 'Other':
        print("\t-Option: genera = Off; No genus-specific BLAST databases option provided")
    else:
        print("\t-Option: genera = ", options.genera,
              "; Genus-specific BLAST databases option provided")

    print("\t-Option: addgenes; Add 'gene' features for each 'CDS' feature")
    print("\t-Option: addmrna;  Add 'mRNA' features for each 'CDS' feature")
    print("\t-Option: cdsrnaolap;  Allow [tr]RNA to overlap CDS")

    ## optimize threads
    name_list = set(pd_samples_retrieved["name"].tolist())
    threads_job = HCGB_main.optimize_threads(options.threads, len(name_list))  ## threads optimization
    ## ThreadPoolExecutor rejects max_workers <= 0, so always keep one worker
    max_workers_int = max(1, int(options.threads / threads_job))

    ## debug message
    if (Debug):
        print(colored("**DEBUG: options.threads " + str(options.threads) + " **", 'yellow'))
        print(colored("**DEBUG: max_workers " + str(max_workers_int) + " **", 'yellow'))
        print(colored("**DEBUG: cpu_here " + str(threads_job) + " **", 'yellow'))

    ## send for each sample
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers_int) as executor:
        commandsSent = {
            executor.submit(annot_caller, row['sample'],
                            outdir_dict[row['name']], options, row['name'],
                            threads_job): index
            for index, row in pd_samples_retrieved.iterrows()
        }
        for cmd2 in concurrent.futures.as_completed(commandsSent):
            details = commandsSent[cmd2]
            try:
                ## result() re-raises whatever the worker raised
                cmd2.result()
            except Exception as exc:
                ## report and keep going with the remaining samples
                print('***ERROR:')
                print(cmd2)
                print('%r generated an exception: %s' % (details, exc))

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    ## get folders
    givenList = list(outdir_dict.values())
    protein_files = []
    print("+ Detail information for each sample could be identified in separate folders:")
    for folder in givenList:
        print('\t + ', folder)
        protein_files.extend(HCGB_main.retrieve_matching_files(folder, '.faa', Debug))

    ### report generation
    if (options.skip_report):
        print("+ No annotation report generation...")
    else:
        ### report generation
        HCGB_aes.boxymcboxface("Annotation report")
        outdir_report = HCGB_files.create_subfolder("report", outdir)

        PROKKA_report = HCGB_files.create_subfolder("annotation", outdir_report)
        print('\n+ A summary HTML report of each sample is generated in folder: %s' % PROKKA_report)

        ## check if previously report generated
        filename_stamp = PROKKA_report + '/.success'
        report_done = False
        if os.path.isdir(PROKKA_report) and os.path.isfile(filename_stamp):
            stamp = HCGB_time.read_time_stamp(filename_stamp)
            print(colored("\tA previous report generated results on: %s" % stamp, 'yellow'))
            report_done = True

        ## generate report
        if not report_done:
            ## get subdirs generated and call multiQC report module
            multiQC_report.multiQC_module_call(givenList, "Prokka", PROKKA_report, "-dd 2")
            print('\n+ A summary HTML report of each sample is generated in folder: %s' % PROKKA_report)

            ## success stamps
            HCGB_time.print_time_stamp(filename_stamp)

    ## time stamp
    start_time_partial_BUSCO = HCGB_time.timestamp(start_time_total)

    ## Check each annotation using BUSCO
    qc.BUSCO_check(input_dir, outdir, options, start_time_partial_BUSCO, "proteins")

    ## print to file: results

    print("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("+ Exiting Annotation module.")
    return ()
def MLST_ident(options, dataFrame, outdir_dict, dataFrame_edirect, retrieve_databases):
    """Generate MLST profile identification

    This function uses the `MLSTar software`_ to retrieve Multi locus sequence typing (MLST) profiles from PubMLST_ for the given species previously identified by KMA. It generates MLST profiling for each sample.

    .. note:: The function currently prints the MLSTar installation status and
        calls ``exit()`` straight away (see the TODO below); none of the typing
        code after that point is executed.

    :param options: options passed to the :func:`BacterialTyper.modules.ident.run_ident` main function (threads, KMA_cutoff, etc).
    :param dataFrame: pandas dataframe for samples to process. Result from :func:`BacterialTyper.modules.ident.KMA_ident`.
    :param outdir_dict: dictionary containing information for each sample of the output folder for this process.
    :param dataFrame_edirect: pandas dataframe resulted from :func:`BacterialTyper.modules.ident.edirect_ident`.
    :param retrieve_databases: dataframe of databases; rows whose ``db`` is ``PubMLST`` provide, in ``path``, a comma separated "species,folder" entry.

    :type dataFrame: pandas.DataFrame()
    :type outdir_dict: Dictionary
    :type dataFrame_edirect: pandas.DataFrame()
    :type retrieve_databases: pandas.DataFrame()

    :return: Information of the MLST identification. Dictionary keys are samples and values are the absolute path to file generate by :func:`BacterialTyper.scripts.MLSTar.run_doMLST` containing MLST information.
    :rtype: Dictionary

    See example of returned dataframe in file :file:`/devel/results/doMLST_result_example.csv` here:

    .. include:: ../../devel/results/doMLST_result_example.csv
        :literal:

    .. seealso:: Additional information to PubMLST available datasets.

        - :doc:`PubMLST datasets<../../../data/PubMLST_datasets>`

    .. seealso:: This function depends on other ``BacterialTyper`` functions called:

        - :func:`BacterialTyper.scripts.functions.read_time_stamp`

        - :func:`BacterialTyper.scripts.functions.create_subfolder`

        - :func:`BacterialTyper.scripts.functions.boxymcboxface`

        - :func:`BacterialTyper.scripts.MLSTar.run_MLSTar`

        - :func:`HCGB.sampleParser.files.get_files`

        - :func:`BacterialTyper.scripts.MLSTar.get_MLSTar_species`

    .. include:: ../../links.inc
    """
    ## Rscript executable from configuration
    rscript = set_config.get_exe("Rscript")

    ## TODO: Samples might not be assembled...to take into account and return 0

    ## TODO: Fix and install MLSTar during installation
    installed_info = MLSTar.get_MLSTar_package_installed()
    print(installed_info)
    exit()

    ########################################################################################
    ## Everything below is not reached until the exit() above is removed.
    ## TODO: What to do if multi-isolate sample?
    ## TODO: Control if a different profile is provided via --MLST_profile
    ## TODO: Check time passed and download again if >?? days passed]

    ## debug message
    if (Debug):
        print(colored("**DEBUG: dataFrame_edirect identified**", 'yellow'))
        print(dataFrame_edirect)

    ## MLST call
    HCGB_aes.boxymcboxface("MLST typing")
    print("+ Create classical MLST typification of each sample according to species retrieved by kmer...")

    ## assembly files of every sample
    input_dir = os.path.abspath(options.input)
    assembly_samples_retrieved = sampleParser.files.get_files(
        options, input_dir, "assembly", ["fna"], options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: assembly_samples_retrieved**", 'yellow'))
        print(assembly_samples_retrieved)

    # init
    MLST_results = {}

    ## get MLST_profile: default or provided
    is_pubmlst = retrieve_databases['db'] == 'PubMLST'
    mlst_profile_list = retrieve_databases.loc[is_pubmlst]['path'].tolist()

    if (Debug):
        print("** Debug **")
        print("mlst_profile_list")
        print(mlst_profile_list)
        print("dataFrame_edirect")
        print(dataFrame_edirect)

    ## Generate MLST call according to species identified for each sample
    for index, row in dataFrame_edirect.iterrows():
        MLSTar_taxa_name = MLSTar.get_MLSTar_species(row['genus'], row['species'])

        if (MLSTar_taxa_name == 'NaN'):
            print(colored("\t- Not available PubMLST profile for sample [%s] identified as %s %s" % (row['sample'], row['genus'], row['species']), 'yellow'))
            continue

        for mlst_profile in mlst_profile_list:
            ## each profile entry is "species,folder"
            profile_fields = mlst_profile.split(',')
            species_mlst = profile_fields[0]
            species_mlst_folder = profile_fields[1]

            ## output file
            output_file = species_mlst_folder + '/PubMLST_available_scheme.csv'
            filename_stamp = species_mlst_folder + '/.success_scheme'

            ## only the profile of the species identified is of interest
            if MLSTar_taxa_name != species_mlst:
                continue

            if os.path.isfile(filename_stamp):
                stamp = HCGB_time.read_time_stamp(filename_stamp)
                print(colored("\tA previous command generated results on: %s" % stamp, 'yellow'))
            else:
                ### get scheme available
                MLSTar.getPUBMLST(MLSTar_taxa_name, rscript, output_file)
                stamp = HCGB_time.print_time_stamp(filename_stamp)

            ## parse and get scheme for classical MLST
            schemes_MLST = pd.read_csv(output_file, sep=',', header=0)

            ## keep the scheme of the last entry with fewer than 10 in 'len'
            for item, cluster in schemes_MLST.iterrows():
                if cluster['len'] < 10:
                    scheme2use = int(cluster['scheme'])
                    continue

            ###
            sample = row['sample']
            MLSTar_folder = HCGB_files.create_subfolder('MLST', outdir_dict[sample])
            sample_rows = assembly_samples_retrieved.loc[assembly_samples_retrieved['name'] == sample]
            genome_file = sample_rows['sample'].values[0]

            ## call MLST
            (results, profile_folder) = MLSTar.run_MLSTar(
                species_mlst_folder, rscript, MLSTar_taxa_name, scheme2use,
                sample, MLSTar_folder, genome_file, options.threads)
            MLST_results[sample] = results

    ##
    print("+ Finish this step...")
    return (MLST_results)
def check_db_indexed(index_name, folder):
    """
    Check the status of a database

    The database is reported as not indexed when any of the index files
    (other than the optional ``.index.b``) is missing, or when the
    ``.success`` stamp in ``folder`` is more than 60 days old. Otherwise the
    entries listed in the ``.name`` file are summarised on screen.

    :param index_name: Index name for the database
    :param folder: Absolute path of the folder containing the database.

    :type index_name: string
    :type folder: string

    :returns: True/False for the index status.

    .. seealso:: This function depends on other ``BacterialTyper`` functions called:

        - :func:`BacterialTyper.scripts.functions.readList_fromFile`

        - :func:`BacterialTyper.scripts.functions.get_number_lines`

        - :func:`BacterialTyper.scripts.functions.read_time_stamp`

        - :func:`BacterialTyper.scripts.functions.print_time_stamp`
    """
    print("\t+ Checking if database has been previously indexed...")

    ## files expected next to the index name; '.index.b' is allowed to be absent
    for extension in (".comp.b", ".index.b", ".length.b", ".name", ".seq.b"):
        candidate = index_name + extension
        if os.path.isfile(candidate):
            print("\t" + candidate + ' exists...')
        elif extension != '.index.b':
            return False

    ## age of the database according to its success stamp, if any
    stamp_file = folder + '/.success'
    if os.path.isfile(stamp_file):
        generated_on = HCGB_time.read_time_stamp(stamp_file)
        print(colored("\tDatabase was generated on: %s" % generated_on, 'yellow'))

        age_in_days = HCGB_time.get_diff_time(stamp_file)
        print("\t\t** %s days ago" % age_in_days)

        ## too old: ask the caller to download again
        if age_in_days > 60:
            print("\t\t** Downloading information again just to be sure...")
            return False

    ## dump in screen
    names_file = index_name + '.name'
    n_entries = HCGB_main.get_number_lines(names_file)
    print("\n\t+ Database seems OK and contains several entries (%s):\n" % n_entries)

    if n_entries > 50:
        print("\tToo many entries in the database.\n\tCheck file %s for further details." % names_file)
    else:
        listed = HCGB_main.readList_fromFile(names_file)
        print(*listed, sep='\n')

    return True
def parse_information(arg_dict, df_accID, outdir):
    """Parse the annotation file of every sample in ``df_accID``.

    For each sample an ``input`` and a ``parse`` folder are obtained from
    ``HCGB_files.outdir_project``. A sample is parsed unless both folders
    already hold a ``.success`` stamp. On success the annotation file is
    linked into the input folder, the sample row is saved as ``info.csv`` and
    both ``.success`` stamps are written; on failure ``.fail`` stamps are
    written instead.

    :param arg_dict: Parsed options; ``project`` and ``debug`` are read here.
    :param df_accID: Dataframe indexed by sample with, at least, the columns
        ``annot_file`` and ``genome``.
    :param outdir: Output folder.

    :type df_accID: pandas.DataFrame()
    :type outdir: string
    """
    ### Parse df_accID
    dict_input_folders = HCGB_files.outdir_project(outdir, arg_dict.project,
                                                   df_accID, "input",
                                                   arg_dict.debug)
    dict_parse_folders = HCGB_files.outdir_project(outdir, arg_dict.project,
                                                   df_accID, "parse",
                                                   arg_dict.debug)

    ## debug messages
    if (arg_dict.debug):
        debug_message('+++++++++++++++++++++++++++++++')
        print("dict_input_folders")
        print(dict_input_folders)
        print("dict_parse_folders")
        print(dict_parse_folders)

    ## parse each sample retrieved
    for sample, folder_input in dict_input_folders.items():
        folder_parse = dict_parse_folders[sample]
        annot_file = df_accID.loc[sample, 'annot_file']
        genome = df_accID.loc[sample, 'genome']

        if (arg_dict.debug):
            debug_message('sample: ' + sample, 'yellow')
            debug_message('folder_input: ' + folder_input, 'yellow')
            debug_message('folder_parse: ' + folder_parse, 'yellow')
            debug_message('annot_file: ' + annot_file, 'yellow')
            debug_message('genome' + genome, 'yellow')

        ## timestamps
        input_timestamp = os.path.join(folder_input, '.success')
        parse_timestamp = os.path.join(folder_parse, '.success')

        print()
        print("\t+ Parsing sample: " + sample)

        ## Both stamps are written together on success, so a sample counts as
        ## done only when both are present. A half-finished sample (one stamp
        ## missing) is parsed again instead of being reported as available
        ## from a parse stamp that may not exist.
        already_done = (HCGB_files.is_non_zero_file(parse_timestamp)
                        and HCGB_files.is_non_zero_file(input_timestamp))

        if already_done:
            read_time = HCGB_time.read_time_stamp(parse_timestamp)
            print(colored("\t+ Input parsing already available for sample %s [%s]" % (sample, read_time), 'green'))
            print()
            continue

        ## TODO: Set threads to use in parallel
        process_OK = parse_annot_file(sample, folder_input, annot_file,
                                      folder_parse, arg_dict.debug, genome)

        if (process_OK):
            ## link or copy annotation file into folder_input
            HCGB_files.get_symbolic_link_file(annot_file, folder_input)

            ## add df_accID.loc[sample,] information as csv into input folder
            df_accID.loc[sample, ].to_csv(os.path.join(folder_input, 'info.csv'),
                                          index=True, header=True)

            ## print time stamps
            HCGB_time.print_time_stamp(input_timestamp)
            HCGB_time.print_time_stamp(parse_timestamp)
        else:
            print(colored("\t+ Some error occurred for sample %s while parsing input options" % sample, 'red'))

            ## print fail stamps
            HCGB_time.print_time_stamp(os.path.join(folder_input, '.fail'))
            HCGB_time.print_time_stamp(os.path.join(folder_parse, '.fail'))
def run_search(arg_dict):
    """Main function of the search module in BacDup package.

    This module searches and creates the gene duplication analysis.

    It allows the user to provide either a previous parsed data project (NCBI Genbank IDs, taxonomy or user annotation data)
    or a single or multiple samples.

    For every sample the BLAST results are filtered and annotated (or the
    stored duplicate annotation is reloaded when an ``.annot_success`` stamp
    exists), and a summary table for all samples is written to
    ``report/dups/info_annot.csv`` in the output folder.

    :param arg_dict: Parsed command-line options. Any of the ``*_help``
        options prints the requested help and exits.

    :returns: Empty tuple.
    """
    ## help messages
    if (arg_dict.input_help):
        help_input()
        exit()

    if (arg_dict.blast_help):
        info.blast_help()
        exit()

    if (arg_dict.project_help):
        info.project_help()
        exit()

    if (arg_dict.detached_mode_help):
        info.detached_mode()
        exit()

    ### Start the analysis
    BacDup_functions.pipeline_header('BacDup')
    HCGB_aes.boxymcboxface("Search module")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## init time
    start_time_total = time.time()

    ## absolute path for in & out
    outdir = os.path.abspath(arg_dict.input_folder)

    ## project or detached?
    if arg_dict.detached:
        arg_dict.project = False
        ## output folder
        print("\n+ Create output folder(s):")
        HCGB.functions.files_functions.create_folder(outdir)
    else:
        arg_dict.project = True

    ## debug messages
    if (arg_dict.debug):
        debug_message('+++++++++++++++++++++++++++++++')
        debug_message('Project/Detached option:', 'yellow')
        debug_message('arg_dict.detached: ' + str(arg_dict.detached), 'yellow')
        debug_message('arg_dict.project: ' + str(arg_dict.project), 'yellow')
        debug_message('outdir:' + outdir, 'yellow')
        debug_message('+++++++++++++++++++++++++++++++')

    ## get files
    print()
    HCGB_aes.print_sepLine("-", 50, False)
    print('+ Getting information provided... ')
    print('+ Several options available:')
    print('\t* BacDup project folder with initiated data')
    print('\t* Single/Multiple Annotation file:')
    print('\t  |-- GenBank format files')
    print('\t  |-- GFF files +  Reference fasta files required')
    print('\t* Single/Multiple raw BLAST results files')
    print('\t* Single/Multiple fasta proteins + annotation table')

    print("\n\n**** NOTE: **** \n"
          "For additional options (e.g. Single/Multiple NCBI GenBank or taxonomy IDs) "
          "use the input module to accommodate accordingly ")
    time.sleep(1)

    print()

    ## parse options
    pd_samples_retrieved = parse_search_options(arg_dict)

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    ## for each sample
    dict_search_folders = HCGB.functions.files_functions.outdir_project(
        outdir, arg_dict.project, pd_samples_retrieved, "search", arg_dict.debug)

    dict_dup_folders = HCGB.functions.files_functions.outdir_project(
        outdir, arg_dict.project, pd_samples_retrieved, "dups", arg_dict.debug)

    dict_parse_folders = HCGB.functions.files_functions.outdir_project(
        outdir, arg_dict.project, pd_samples_retrieved, "parse", arg_dict.debug)

    ## create results: empty table that fixes the column order of the summary
    data2add = pd.DataFrame(columns=BacDup_functions.columns_dup_table())
    ## per-sample tables, merged once after the loop
    sample_entries = []

    for sample, folder in dict_search_folders.items():
        dup_folder = dict_dup_folders[sample]
        annot_timestamp = os.path.join(dup_folder, '.annot_success')
        dup_annot_file = os.path.join(dup_folder, 'dup_annot.csv')

        ## annotation
        annot_table_file = pd_samples_retrieved.loc[sample, 'annot_table']

        if (not HCGB.functions.files_functions.is_non_zero_file(annot_timestamp)):

            ## get results
            file_data = pd_samples_retrieved.loc[sample, 'file_data']
            data_format = pd_samples_retrieved.loc[sample, 'format']
            filtered_data = dup_searcher.filter_data(
                sample, file_data, data_format, arg_dict.pident,
                arg_dict.evalue, arg_dict.percentage, arg_dict.bitscore,
                folder, arg_dict.debug)

            ## timestamps
            filter_timestamp = os.path.join(dup_folder, '.filter_success')
            if (not HCGB.functions.files_functions.is_non_zero_file(filter_timestamp)):
                # save results as a .csv file
                sort_csv = os.path.abspath(os.path.join(dup_folder, 'filtered_results.csv'))
                filtered_data.to_csv(sort_csv, header=True, index=False)

                ## print time stamp
                HCGB_time.print_time_stamp(filter_timestamp)
            else:
                read_time = HCGB_time.read_time_stamp(filter_timestamp)
                print(colored("\t+ Filter results already available for sample %s [%s]" % (sample, read_time), 'green'))

            ## get annotation
            (dup_annot_df, data2add_entry) = dup_searcher.get_dupannot(
                sample, filtered_data, annot_table_file, arg_dict.debug)

            ##
            info_dup_file = os.path.join(dup_folder, 'info_dup.csv')
            data2add_entry.to_csv(info_dup_file, header=True, index=False)

            ## save into file
            dup_annot_df.to_csv(dup_annot_file, header=True)

            ## print time stamp
            HCGB_time.print_time_stamp(annot_timestamp)

        else:
            read_time = HCGB_time.read_time_stamp(annot_timestamp)
            print(colored("\t+ Duplicate annotation already available for sample %s [%s]" % (sample, read_time), 'green'))

            ## add info for each
            dup_annot_df = HCGB_main.get_data(dup_annot_file, ',', "index_col=0")
            annot_table = HCGB_main.get_data(annot_table_file, ',', "index_col=0")
            data2add_entry = dup_searcher.get_dup_stats(
                sample, dup_annot_df, annot_table, arg_dict.debug)

        ## add genome length data
        data2add_entry['genome_len'] = ''
        len_df_file = os.path.join(dict_parse_folders[sample], 'length_df.csv')
        if os.path.isfile(len_df_file):
            len_data = HCGB_main.get_data(len_df_file, ',', "header=None")
            data2add_entry['genome_len'] = len_data[1].sum()

        ## keep for merging
        sample_entries.append(data2add_entry)

    ## merge data. DataFrame.append was removed in pandas 2.0; pd.concat keeps
    ## the original index of each entry (ignore_index=False), as append did.
    if sample_entries:
        data2add = pd.concat([data2add] + sample_entries, ignore_index=False)

    ### report generation
    HCGB_aes.boxymcboxface("Summarizing duplicated search")
    outdir_report = HCGB.functions.files_functions.create_subfolder("report", outdir)
    dups_report = HCGB.functions.files_functions.create_subfolder("dups", outdir_report)

    ## add data2add
    data2add.to_csv(os.path.join(dups_report, 'info_annot.csv'), index=True, header=True)

    ## maybe add a summary of the files?

    print("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("+ Exiting search module.")
    return ()
def send_kma_job(outdir_file, list_files, name, database, threads, Debug):
    """
    Executes KMA identification jobs

    This function automates the process of checking if any previous run
    succeeded or runs the appropriate identification process for the sample
    and database provided.

    A ``kma`` subfolder is created within ``outdir_file``. If a time stamp file
    named ``.success_<outfile basename>`` already exists in it, the KMA call
    is skipped and the stored time stamp is reported. Otherwise KMA is called
    and the time stamp file is written afterwards.

    :param outdir_file: Folder in which the ``kma`` subfolder is created.
    :param list_files: Input files, handed over unchanged to the KMA call.
    :param name: Sample name; used in messages and passed to ``get_outfile``.
    :param database: KMA database; passed to ``get_outfile`` and the KMA call.
    :param threads: Threads value handed over to the KMA call
        (presumably the number of CPUs to use -- confirm against caller).
    :param Debug: If true, print debugging messages.

    :type outdir_file: string
    :type list_files: list (presumably -- confirm against caller)
    :type name: string
    :type database: string
    :type threads: integer (presumably -- confirm against caller)
    :type Debug: boolean

    :returns: None

    .. seealso:: This function depends on other functions called:

        - ``HCGB_files.create_subfolder``

        - :func:`BacterialTyper.config.set_config.get_exe`

        - :func:`BacterialTyper.scripts.species_identification_KMA.kma_ident_call`

        - :func:`BacterialTyper.module.ident.get_outfile`

        - ``HCGB_time.read_time_stamp`` and ``HCGB_time.print_time_stamp``
    """
    if (Debug):
        print(colored("**DEBUG: ident.send_kma_job call**", 'yellow'))
        print("outdir_file")
        print(outdir_file)
        print("list_files")
        print(list_files)
        print("name: " + name)
        print("database: " + database)

    ## outdir_KMA
    outdir_dict_kma = HCGB_files.create_subfolder("kma", outdir_file)

    ## set defaults
    kma_bin = set_config.get_exe("kma")

    ## get outfile
    outfile = get_outfile(outdir_dict_kma, name, database)

    ## check if previously run and succeeded: one stamp file per outfile
    basename_tag = os.path.basename(outfile)
    filename_stamp = os.path.join(outdir_dict_kma, '.success_' + basename_tag)

    if (Debug):
        print("Outdir: ", outdir_dict_kma)
        print("outfile: ", outfile)
        print("Filename_stamp: ", filename_stamp)

    ## previous run available: report it and do nothing else
    if os.path.isfile(filename_stamp):
        stamp = HCGB_time.read_time_stamp(filename_stamp)
        print(
            colored(
                "\tA previous command generated results on: %s [%s]" %
                (stamp, name), 'yellow'))
        return

    ## Option to retrieve database from shared memory.
    ## NOTE: a '-Sparse' option depending on the database type
    ## (userData_KMA / genbank_KMA) was considered previously and is
    ## currently disabled.
    option = '-shm 1'

    ## debug message: mirrors the actual call performed below
    if (Debug):
        print(
            colored(
                "**DEBUG: species_identification_KMA.kma_ident_call call**",
                'yellow'))
        print("outfile = get_outfile(outdir_dict_kma, name, database)")
        print("outfile: ", outfile)
        print(
            "species_identification_KMA.kma_ident_call(outfile, list_files, name, database, kma_bin, option, threads) "
        )
        print("\t".join([
            "species_identification_KMA.kma_ident_call",
            outfile,
            str(list_files),
            name,
            database,
            str(kma_bin),
            option,
            str(threads),
        ]) + "\n")

    # Call KMA
    # NOTE(review): the return value of kma_ident_call is ignored, so the
    # success stamp is written whatever the outcome -- confirm whether a
    # failed run should skip the stamp.
    species_identification_KMA.kma_ident_call(outfile, list_files, name,
                                              database, kma_bin, option,
                                              threads)
    HCGB_time.print_time_stamp(filename_stamp)