def select_closely_related(sketch_path, genus, mash_screen, contig_name, threads, output_dir, mash_threshold, download_contig_nums, contig_id):
    """Select closely-related genomes for one contig and download them.

    Uses a mash sketch (screen or dist mode) to rank candidate genomes, or an
    explicit genus to override that ranking, then downloads the selected
    genomes into ``output_dir``.

    :return: path of the downloaded genome collection, or ``False`` when fewer
             than 5 closely-related genomes were found (too few to polish).
    """
    if sketch_path:
        # Two ranking strategies share the same call signature, so pick the
        # callable first and invoke it once.
        rank = mash.screen if mash_screen else mash.dist
        mash_file = rank(contig_name, sketch_path, threads, output_dir,
                         mash_threshold, download_contig_nums, contig_id)
        ncbi_id = mash.get_ncbi_id(mash_file)
        # Wouldn't polish if there are fewer than 5 closely-related genomes.
        if len(ncbi_id) < 5:
            return False
        url_list = download.parser_url(ncbi_id)
    # NOTE(review): when sketch_path is falsy, genus must be provided or the
    # names below are unbound — confirm callers guarantee one of the two.
    if genus:
        # An explicit genus overrides the mash-derived candidate list.
        ncbi_id, url_list = download.parser_genus(genus)
    return download.download(output_dir, ncbi_id, url_list)
def mainloop():
    """Run one pass over all configured download items.

    For each item in ``dllist`` whose ``download`` config flag is '1':
    import its link-provider module, resolve the current download link,
    download the file if it is a new version, optionally verify a checksum,
    and persist the new ``current_version`` back to the config file.

    Exits the process with code 2 when a link provider is missing or raises.
    """
    utils.debug(2, '-----mainloop(start)-----', utils.whoami())
    for i in dllist:
        curversion = str(utils.config.get(i, 'current_version'))
        downloadcfg = str(utils.config.get(i, 'download'))
        file_regex = str(utils.config.get(i, 'file_regex'))
        linkprovider = str(utils.config.get(i, 'link_provider'))
        chkresult = ''
        # downloadcfg is already a str; the original wrapped it in str() again.
        if downloadcfg == '1':
            try:
                # Link providers are pluggable modules named in the config.
                create = importlib.import_module(linkprovider)
                dllink = create.link(i, file_regex, curversion)
            except ModuleNotFoundError:
                print('No link provider %s found.' % linkprovider)
                print('Please check your config again.')
                sys.exit(2)
            except Exception as e:
                print('There was an exception in the %s module.' % linkprovider)
                print(e)
                sys.exit(2)
            if dllink:
                utils.debug(2, 'Download link is %s' % dllink, utils.whoami())
                file_name = utils.file_name(dllink)
                utils.debug(2, 'Filename is %s' % file_name, utils.whoami())
                if file_name == curversion:
                    utils.debug(
                        1, 'We already have the current version for %s.' % i,
                        utils.whoami())
                else:
                    dlinfo = download.download(dllink, dlpath)
                    if hasattr(create, 'chkurl'):
                        utils.debug(
                            3, 'Starting checksum check for %s' % linkprovider,
                            utils.whoami())
                        chkresult = create.chksum(file_name, dlpath)
                    # utils.linkinfo may return an Exception with HTTP Error
                    # code. Don't update current_version then.
                    # BUG FIX: was `chkresult is not 'Fail'` — an identity
                    # comparison against a string literal (SyntaxWarning on
                    # modern CPython; value equality is not guaranteed).
                    # Use `!=` for a value comparison.
                    if not utils.is_number(dlinfo) and chkresult != 'Fail':
                        utils.config.set(i, 'current_version', file_name)
                        utils.debug(
                            2, 'Current_version is: ' +
                            utils.config.get(i, 'current_version'),
                            utils.whoami())
                        with open(utils.configfile, 'w') as configfile:
                            utils.config.write(configfile)
                    elif utils.is_number(dlinfo):
                        print('Download failed with HTTP error code: %s' % dlinfo)
                    else:
                        print('Download failed due to checksum mismatch.')
            else:
                utils.debug(4, 'No download link found for %s' % i,
                            utils.whoami())
        else:
            utils.debug(2, 'Download for %s is set to off.' % i, utils.whoami())
    utils.debug(2, '-----mainloop(stop)-----', utils.whoami())
def _load_data(self, filename, offset):
    """
    Load the data in the given file.
    Automatically downloads the file if it does not already exist in the
    data_dir.

    :param filename: Name of the data-file.
    :param offset: Start offset in bytes when reading the data-file.
    :return: The data as a numpy array.
    """
    # Fetch from the internet first if the file is not already local.
    download(base_url=base_url, filename=filename, download_dir=self.data_dir)

    # Decompress the gzip file and view its raw bytes as uint8 values,
    # skipping the header bytes given by `offset`.
    path = os.path.join(self.data_dir, filename)
    with gzip.open(path, 'rb') as f:
        raw = f.read()
    return np.frombuffer(raw, np.uint8, offset=offset)
def download_action(ncbi_id, homologous_output_dir, contig_name=None):
    """Download the closely-related (homologous) genomes for the given IDs.

    Resolves download URLs for each NCBI id, downloads the sequences into
    ``homologous_output_dir`` and logs the elapsed time.

    :param ncbi_id: NCBI identifiers of the genomes to fetch.
    :param homologous_output_dir: destination directory for the downloads.
    :param contig_name: optional contig fasta path forwarded to the downloader.
    :return: path of the downloaded genome collection.
    """
    started = time.time()
    print_system_log('Download closely-related genomes')

    urls = download.parser_url(ncbi_id)
    sys.stderr.write(TextColor.GREEN + " INFO: " + str(len(urls)) +
                     " homologous sequence need to download: \n" +
                     TextColor.END)

    db_path = download.download(homologous_output_dir, ncbi_id, urls,
                                contig_name)

    elapsed = get_elapsed_time_string(started, time.time())
    print_stage_time('Download closely-related genomes time', elapsed)
    return db_path
def start_bot(bot, update):
    """Handle an incoming Telegram message.

    Supported message shapes:
      * ``<url>``         — download the file and upload it to Google Drive.
      * ``video|<url>``   — download via download_video, upload to GDrive.
      * ``audio|<url>``   — download audio (YouTube links only), send on Telegram.
      * ``<name>|<url>``  — download ``<url>`` saving it under ``<name>``.
    Non-URL text (other than known bot commands) gets a canned reply.
    """
    msg = str(update.message.text)
    user_id = str(update.message.from_user.id)  # kept for parity; unused below
    if "|" in msg:
        user_cmd, url = msg.split("|")
        user_cmd = user_cmd.strip().lower()
        url = url.strip()
    else:
        user_cmd = None
        url = msg
    sent_message = bot.send_message(chat_id=update.message.chat_id,
                                    text=Text.VERIFYING_URL)
    time.sleep(1)
    if validators.url(url):
        sent_message.edit_text(Text.PROCESSING)
        if user_cmd:
            if user_cmd == "video":
                filename = download_video.download(url)
                if "ERROR" in filename:
                    sent_message.edit_text(Text.FAILED + filename,
                                           parse_mode=telegram.ParseMode.HTML)
                else:
                    bot.send_chat_action(chat_id=update.message.chat_id,
                                         action=telegram.ChatAction.TYPING)
                    sent_message.edit_text(Text.UPLOADING_GD)
                    dwnld_url = upload.upload(filename)
                    size = os.path.getsize(filename) / 1048576  # bytes -> MiB
                    sent_message.edit_text(
                        Text.DONE.format(filename, size, dwnld_url),
                        parse_mode=telegram.ParseMode.HTML)
                    os.remove(filename)
            elif user_cmd == "audio":
                if ("youtube" in url or "youtu" in url):
                    filename = download_audio.download(url)
                    if "ERROR" in filename:
                        sent_message.edit_text(
                            Text.FAILED + filename,
                            parse_mode=telegram.ParseMode.HTML)
                    else:
                        bot.send_chat_action(
                            chat_id=update.message.chat_id,
                            action=telegram.ChatAction.TYPING)
                        sent_message.edit_text(Text.UPLOADING_TG)
                        # Use a context manager so the handle is closed even
                        # if send_audio raises (original leaked on error).
                        with open(filename, 'rb') as audio:
                            bot.send_audio(
                                chat_id=update.message.chat_id,
                                audio=audio,
                                caption=filename.replace(".mp3", ""))
                        os.remove(filename)
                        sent_message.edit_text(Text.DONE)
                else:
                    sent_message.edit_text(Text.NOT_SUPPORTED,
                                           parse_mode=telegram.ParseMode.HTML)
            else:
                # Any other prefix is treated as a custom target filename.
                if download.is_downloadable(url):
                    # BUG FIX: this branch compared the raw byte count against
                    # the 10000 limit, while the no-command branch below
                    # divides by 1048576 (MiB) first — i.e. 10 KB vs 10 GB.
                    # Normalize to MiB so the limit means the same everywhere.
                    size = download.check_filesize(url) / 1048576
                    if size <= 10000:
                        filename = user_cmd
                        raw_file = download.download(url, filename)
                        if "ERROR" in raw_file:
                            sent_message.edit_text(
                                Text.FAILED + raw_file,
                                parse_mode=telegram.ParseMode.HTML)
                        else:
                            bot.send_chat_action(
                                chat_id=update.message.chat_id,
                                action=telegram.ChatAction.TYPING)
                            sent_message.edit_text(Text.UPLOADING_GD)
                            dwnld_url = upload.upload(raw_file)
                            sent_message.edit_text(
                                Text.DONE.format(raw_file, size, dwnld_url),
                                parse_mode=telegram.ParseMode.HTML)
                    else:
                        sent_message.edit_text(Text.MAXLIMITEXCEEDED)
                else:
                    sent_message.edit_text(Text.ISNOT_DOWNLOADABLE,
                                           parse_mode=telegram.ParseMode.HTML)
        else:
            # Plain URL with no command prefix.
            if download.is_downloadable(url):
                size = download.check_filesize(url) / 1048576  # MiB
                if size <= 10000:
                    raw_file = download.download(url, None)
                    bot.send_chat_action(chat_id=update.message.chat_id,
                                         action=telegram.ChatAction.TYPING)
                    sent_message.edit_text(Text.UPLOADING_GD)
                    dwnld_url = upload.upload(raw_file)
                    sent_message.edit_text(
                        Text.DONE.format(raw_file, size, dwnld_url),
                        parse_mode=telegram.ParseMode.HTML)
                else:
                    sent_message.edit_text(Text.MAXLIMITEXCEEDED)
            else:
                sent_message.edit_text(Text.ISNOT_DOWNLOADABLE,
                                       parse_mode=telegram.ParseMode.HTML)
    elif ("help" not in url and "start" not in url and "broadcast" not in url
          and "donate" not in url and "add_user" not in url
          and "revoke_user" not in url):
        # Not a URL and not a recognized bot command.
        bot.send_chat_action(chat_id=update.message.chat_id,
                             action=telegram.ChatAction.TYPING)
        time.sleep(1)
        sent_message.edit_text(Text.RETARD)
def polish_genome(assembly, model_path, sketch_path, genus, threads, output_dir, minimap_args, mash_threshold, download_contig_nums, debug):
    """Polish each contig of an assembly against closely-related genomes.

    Per-contig pipeline: mash screen -> download homologs -> pileup alignment
    -> dataframe conversion -> model prediction -> stitch polished sequence.
    All polished contigs are concatenated into
    ``<output_dir>/<assembly_name>_homopolished.fasta``.

    :param assembly: path to the input assembly fasta.
    :param model_path: path to the trained prediction model.
    :param sketch_path: mash sketch used to find closely-related genomes.
    :param genus: optional genus overriding the mash-derived candidates.
    :param debug: when truthy, the per-contig debug directory is removed
                  afterwards (NOTE(review): polarity looks inverted for a
                  flag named ``debug`` — confirm intent).
    """
    out = []
    output_dir = FileManager.handle_output_directory(output_dir)
    # Working area for per-contig intermediates, under <output_dir>/debug.
    contig_output_dir_debug = output_dir + '/debug'
    contig_output_dir_debug = FileManager.handle_output_directory(
        contig_output_dir_debug)
    # Assembly name without directory and without file extension.
    assembly_name = assembly.rsplit('/', 1)[-1]
    assembly_name = assembly_name.split('.')[0]
    total_start_time = time.time()
    for contig in SeqIO.parse(assembly, 'fasta'):
        timestr = time.strftime("[%Y/%m/%d %H:%M]")
        sys.stderr.write(TextColor.GREEN + str(timestr) + " INFO: RUN-ID: " +
                         contig.id + "\n" + TextColor.END)
        # Each contig gets its own subdirectory and single-record fasta.
        contig_output_dir = contig_output_dir_debug + '/' + contig.id
        contig_output_dir = FileManager.handle_output_directory(
            contig_output_dir)
        contig_name = contig_output_dir + '/' + contig.id + '.fasta'
        SeqIO.write(contig, contig_name, "fasta")
        if sketch_path:
            screen_start_time = time.time()
            print_system_log('MASH SCREEN')
            mash_file = mash.screen(contig_name, sketch_path, threads,
                                    contig_output_dir, mash_threshold,
                                    download_contig_nums, contig.id)
            screen_end_time = time.time()
            ncbi_id = mash.get_ncbi_id(mash_file)
            # Wouldn't polish if closely-related genomes are fewer than 5;
            # the unpolished contig is passed through to the output as-is.
            if len(ncbi_id) < 5:
                out.append(contig_name)
                continue
            url_list = download.parser_url(ncbi_id)
        # NOTE(review): without sketch_path, ncbi_id/url_list are only bound
        # when genus is given — confirm callers always supply one of the two.
        if genus:
            ncbi_id, url_list = download.parser_genus(genus)
        download_start_time = time.time()
        print_system_log('DOWNLOAD CONTIGS')
        db = download.download(contig_output_dir, ncbi_id, url_list)
        download_end_time = time.time()
        pileup_start_time = time.time()
        print("\n")
        print_system_log('PILE UP')
        db_npz = alignment.align(contig_name, minimap_args, threads, db,
                                 contig_output_dir)
        # alignment.align returns False on failure; skip this contig then.
        if db_npz == False:
            continue
        pileup_end_time = time.time()
        align2df_start_time = time.time()
        print_system_log('TO DATAFRAME')
        df = align2df.todf(contig_name, db_npz, contig_output_dir)
        align2df_end_time = time.time()
        predict_start_time = time.time()
        print_system_log('PREDICT')
        # df is rebound to the feather path written by the previous stage;
        # the todf() return value above is discarded.
        df = contig_output_dir + '/' + contig.id + '.feather'
        result = predict.predict(df, model_path, threads, contig_output_dir)
        predict_end_time = time.time()
        polish_start_time = time.time()
        print_system_log('POLISH')
        finish = polish.stitch(contig_name, result, contig_output_dir)
        polish_end_time = time.time()
        # Per-stage timing report (screen timing only exists with a sketch).
        if sketch_path:
            screen_time = get_elapsed_time_string(screen_start_time,
                                                  screen_end_time)
            print_stage_time('SCREEN', screen_time)
        #calculating time
        download_time = get_elapsed_time_string(download_start_time,
                                                download_end_time)
        pileup_time = get_elapsed_time_string(pileup_start_time,
                                              pileup_end_time)
        align2df_time = get_elapsed_time_string(align2df_start_time,
                                                align2df_end_time)
        predict_time = get_elapsed_time_string(predict_start_time,
                                               predict_end_time)
        polish_time = get_elapsed_time_string(polish_start_time,
                                              polish_end_time)
        #print stage time
        print_stage_time('DOWNLOAD', download_time)
        print_stage_time('PILEUP', pileup_time)
        print_stage_time('TO DATAFRAME', align2df_time)
        print_stage_time('PREDICT', predict_time)
        print_stage_time('POLISH', polish_time)
        out.append(finish)
    # Concatenate all (polished or passed-through) contigs into one fasta.
    # NOTE(review): shell `cat` via os.system with unquoted paths — breaks on
    # paths containing spaces/shell metacharacters.
    os.system('cat {} > {}/{}_homopolished.fasta'.format(
        ' '.join(out), output_dir, assembly_name))
    if debug:
        try:
            shutil.rmtree(contig_output_dir_debug)
        except OSError as e:
            print(e)
        else:
            # Successful cleanup returns early, skipping the total-time report.
            return True
    total_end_time = time.time()
    total_time = get_elapsed_time_string(total_start_time, total_end_time)
    print_stage_time('Total', total_time)
def make_train_data(mash_screen, assembly, reference, sketch_path, genus_species, threads, output_dir, minimap_args, mash_threshold, download_contig_nums, debug):
    """Build training data for the polishing model from an assembly.

    Per-contig: select and download closely-related genomes, align the contig
    against them and against the reference, then extract a feature dataframe
    via ``homologous_retrieval``. The final dataframe is moved to
    ``<output_dir>/<assembly_name>.feather``.

    :param reference: reference sequence used for the truth-label alignment.
    :param genus_species: currently unused in this body — confirm intent.
    :param debug: currently unused in this body — confirm intent.
    """
    output_dir = FileManager.handle_output_directory(output_dir)
    contig_output_dir_debug = make_output_dir("debug", output_dir)
    # Assembly name without directory and without file extension.
    assembly_name = assembly.rsplit('/', 1)[-1]
    assembly_name = assembly_name.split('.')[0]
    total_start_time = time.time()
    for contig in SeqIO.parse(assembly, 'fasta'):
        timestr = time.strftime("[%Y/%m/%d %H:%M]")
        sys.stderr.write(TextColor.GREEN + str(timestr) + " INFO: RUN-ID: " +
                         contig.id + "\n" + TextColor.END)
        # Each contig gets its own subdirectory and single-record fasta.
        contig_output_dir = make_output_dir("contig", contig_output_dir_debug,
                                            contig.id)
        contig_name = contig_output_dir + '/' + contig.id + '.fasta'
        SeqIO.write(contig, contig_name, "fasta")
        print_system_log('Select closely-related genomes and download')
        collect_start_time = time.time()
        #db_path = mash_select_closely_related(sketch_path, mash_screen, threads, contig_output_dir, mash_threshold, download_contig_nums, contig_name, contig.id)
        ncbi_id = mash_select_closely_related(sketch_path, mash_screen,
                                              threads, contig_output_dir,
                                              mash_threshold,
                                              download_contig_nums,
                                              contig_name, contig.id)
        # Disabled minimum-genome-count check kept for reference (no-op string).
        '''
        if len(ncbi_id) < 5:
            sys.stderr.write(TextColor.PURPLE + "This contig " + contig.id + " closely-related genome is less than 5, not to polish...\n" + TextColor.END)
            out.append(contig_name)
            continue
        '''
        collect_end_time = time.time()
        collect_time = get_elapsed_time_string(collect_start_time,
                                               collect_end_time)
        #print_stage_time('Select closely-related genomes and download', collect_time)
        print_system_log('Download closely-related genomes')
        url_list = download.parser_url(ncbi_id)
        sys.stderr.write(TextColor.GREEN + " INFO: " + str(len(url_list)) +
                         " homologous sequence need to download: \n" +
                         TextColor.END)
        db_path = download.download(contig_output_dir, ncbi_id, url_list)
        # Align the contig against the downloaded homologs, and (with the
        # reference supplied) against the reference for truth labels.
        seq_paf = alignment.align(contig_name, minimap_args, threads, db_path,
                                  contig_output_dir)
        ref_paf = alignment.align(contig_name, minimap_args, threads, db_path,
                                  contig_output_dir, reference)
        if os.stat(seq_paf).st_size != 0 and os.stat(ref_paf).st_size != 0:
            record = SeqIO.read(contig_name, "fasta")
            genome_size = len(record)
            dataframe_path = homologous_retrieval(seq_paf, genome_size,
                                                  contig_output_dir,
                                                  contig.id, contig_name,
                                                  ref_paf)
        else:
            sys.stderr.write(TextColor.PURPLE + contig.id +
                             " minimap2 can't align......\n" + TextColor.END)
        # NOTE(review): if the alignment above failed (else branch) on the
        # first contig, dataframe_path is unbound here -> NameError; later
        # contigs would silently move a stale path. Confirm intended handling.
        shutil.move(dataframe_path,
                    output_dir + '/' + assembly_name + '.feather')