def download(path, ncbi_id, url_list):
    """Fetch homologous sequences in parallel and merge them into one file.

    Each ``(accession, url)`` pair is handed to ``run_process`` in a worker
    pool, which is expected to write its result into
    ``<path>/homologous_sequences/``.  Once every worker has finished, all
    files in that directory are concatenated (in sorted name order) into
    ``<path>/All_homologous_sequences.fna.gz``.

    Args:
        path: Working directory for this contig.
        ncbi_id: Iterable of accession identifiers, paired with ``url_list``.
        url_list: Iterable of download URLs, paired with ``ncbi_id``.

    Returns:
        Path of the merged sequence file.
    """
    # Local imports: only this function needs them.
    import shutil
    import sys

    db_dir = FileManager.handle_output_directory(
        path + '/homologous_sequences/')

    # API rate limit exceeded, can't go higher
    max_pool_size = 3
    pool = multiprocessing.Pool(
        min(multiprocessing.cpu_count(), max_pool_size))
    try:
        pending = [
            (accession,
             pool.apply_async(run_process, args=(accession, url, db_dir)))
            for accession, url in zip(ncbi_id, url_list)
        ]
        pool.close()
        pool.join()
    finally:
        # Harmless after a clean join; reaps the workers if submission failed.
        pool.terminate()

    # apply_async stores worker exceptions in the result object; surface them
    # as warnings so a failed download is visible while staying best-effort.
    for accession, outcome in pending:
        try:
            outcome.get()
        except Exception as err:
            sys.stderr.write('WARNING: download failed for {}: {}\n'.format(
                accession, err))

    db_path = path + '/All_homologous_sequences.fna.gz'
    # Concatenate in Python instead of `cat dir/* > file` so that paths with
    # spaces or shell metacharacters are handled safely.
    with open(db_path, 'wb') as merged:
        for name in sorted(os.listdir(db_dir)):
            if name.startswith('.'):
                continue  # the shell glob `*` skipped hidden files too
            part = os.path.join(db_dir, name)
            if not os.path.isfile(part):
                continue
            with open(part, 'rb') as chunk:
                shutil.copyfileobj(chunk, merged)
    return db_path
def _concatenate_files(sources, destination):
    """Write the bytes of each file in ``sources``, in order, to ``destination``."""
    with open(destination, 'wb') as merged:
        for source in sources:
            if not os.path.isfile(source):
                # `cat` would report the missing file and carry on; do the same.
                sys.stderr.write(
                    'WARNING: missing contig file skipped: {}\n'.format(source))
                continue
            with open(source, 'rb') as chunk:
                shutil.copyfileobj(chunk, merged)


def polish_genome(assembly, model_path, sketch_path, genus, threads,
                  output_dir, minimap_args, mash_threshold,
                  download_contig_nums, debug):
    """Polish every contig of an assembly and write the merged result.

    For each record in ``assembly`` the contig is written to its own working
    directory under ``<output_dir>/debug/<contig id>/`` and then run through
    the stages: (optional) mash screen, download, pile-up, to-dataframe,
    predict, polish.  The per-contig outputs are finally concatenated into
    ``<output_dir>/<assembly_name>_homopolished.fasta``.

    Contigs with fewer than 5 screened genomes, or for which the alignment
    stage reports failure, are kept unpolished in the final output.

    Args:
        assembly: Path of the FASTA file to polish.
        model_path: Model location, forwarded to ``predict.predict``.
        sketch_path: Mash sketch forwarded to ``mash.screen``; when falsy the
            screening stage is skipped.
        genus: When truthy, the genomes to download come from
            ``download.parser_genus(genus)`` (overriding the screen result).
        threads: Thread count forwarded to the screen/align/predict stages.
        output_dir: Directory that receives the final FASTA.
        minimap_args: Forwarded to ``alignment.align``.
        mash_threshold: Forwarded to ``mash.screen``.
        download_contig_nums: Forwarded to ``mash.screen``.
        debug: When truthy, the per-contig working tree is removed at the end.
            NOTE(review): the polarity looks inverted relative to the name --
            confirm against the caller.

    Returns:
        ``True`` when ``debug`` is truthy and the working tree was removed
        successfully; otherwise ``None`` (after printing the total time).

    Raises:
        ValueError: If neither ``sketch_path`` nor ``genus`` is provided.
    """
    if not sketch_path and not genus:
        # Without either there is nothing to download; previously this
        # surfaced later as an unbound-variable error.
        raise ValueError(
            'polish_genome requires either sketch_path or genus')

    out = []
    output_dir = FileManager.handle_output_directory(output_dir)
    contig_output_dir_debug = FileManager.handle_output_directory(
        output_dir + '/debug')
    assembly_name = assembly.rsplit('/', 1)[-1].split('.')[0]

    total_start_time = time.time()
    for contig in SeqIO.parse(assembly, 'fasta'):
        timestr = time.strftime("[%Y/%m/%d %H:%M]")
        sys.stderr.write(TextColor.GREEN + str(timestr) + " INFO: RUN-ID: " +
                         contig.id + "\n" + TextColor.END)

        contig_output_dir = FileManager.handle_output_directory(
            contig_output_dir_debug + '/' + contig.id)
        contig_name = contig_output_dir + '/' + contig.id + '.fasta'
        SeqIO.write(contig, contig_name, "fasta")

        if sketch_path:
            screen_start_time = time.time()
            print_system_log('MASH SCREEN')
            mash_file = mash.screen(contig_name, sketch_path, threads,
                                    contig_output_dir, mash_threshold,
                                    download_contig_nums, contig.id)
            screen_end_time = time.time()

            ncbi_id = mash.get_ncbi_id(mash_file)
            # Won't polish when fewer than 5 closely-related genomes are
            # found; the contig is kept as-is.
            if len(ncbi_id) < 5:
                out.append(contig_name)
                continue
            url_list = download.parser_url(ncbi_id)

        if genus:
            ncbi_id, url_list = download.parser_genus(genus)

        download_start_time = time.time()
        print_system_log('DOWNLOAD CONTIGS')
        db = download.download(contig_output_dir, ncbi_id, url_list)
        download_end_time = time.time()

        pileup_start_time = time.time()
        print("\n")
        print_system_log('PILE UP')
        db_npz = alignment.align(contig_name, minimap_args, threads, db,
                                 contig_output_dir)
        # Equality (not identity/truthiness) is kept on purpose so the set of
        # values treated as "failed" is unchanged.
        if db_npz == False:  # noqa: E712
            # Keep the unpolished contig instead of silently dropping it from
            # the final assembly.
            out.append(contig_name)
            continue
        pileup_end_time = time.time()

        align2df_start_time = time.time()
        print_system_log('TO DATAFRAME')
        # Called for its side effect; the prediction stage reads the
        # '<contig id>.feather' file from the contig directory.
        align2df.todf(contig_name, db_npz, contig_output_dir)
        align2df_end_time = time.time()

        predict_start_time = time.time()
        print_system_log('PREDICT')
        feather_path = contig_output_dir + '/' + contig.id + '.feather'
        result = predict.predict(feather_path, model_path, threads,
                                 contig_output_dir)
        predict_end_time = time.time()

        polish_start_time = time.time()
        print_system_log('POLISH')
        finish = polish.stitch(contig_name, result, contig_output_dir)
        polish_end_time = time.time()

        if sketch_path:
            screen_time = get_elapsed_time_string(screen_start_time,
                                                  screen_end_time)
            print_stage_time('SCREEN', screen_time)

        # calculating time
        download_time = get_elapsed_time_string(download_start_time,
                                                download_end_time)
        pileup_time = get_elapsed_time_string(pileup_start_time,
                                              pileup_end_time)
        align2df_time = get_elapsed_time_string(align2df_start_time,
                                                align2df_end_time)
        predict_time = get_elapsed_time_string(predict_start_time,
                                               predict_end_time)
        polish_time = get_elapsed_time_string(polish_start_time,
                                              polish_end_time)

        # print stage time
        print_stage_time('DOWNLOAD', download_time)
        print_stage_time('PILEUP', pileup_time)
        print_stage_time('TO DATAFRAME', align2df_time)
        print_stage_time('PREDICT', predict_time)
        print_stage_time('POLISH', polish_time)
        out.append(finish)

    # Merge in Python rather than through `cat` in a shell: contig ids come
    # from the input FASTA and must not be interpreted by the shell, and an
    # empty list would otherwise leave `cat` waiting on stdin.
    _concatenate_files(
        out, '{}/{}_homopolished.fasta'.format(output_dir, assembly_name))

    if debug:
        try:
            shutil.rmtree(contig_output_dir_debug)
        except OSError as e:
            print(e)
        else:
            # NOTE(review): returning here skips the total-time report below;
            # confirm whether that is intended.
            return True

    total_end_time = time.time()
    total_time = get_elapsed_time_string(total_start_time, total_end_time)
    print_stage_time('Total', total_time)