def create_successful_check(fpath, contigs_fpath, ref_fpath):
    """Write a check file recording the inputs of a finished run.

    The file at ``fpath`` receives three lines: the md5 checksum of
    ``contigs_fpath``, the md5 checksum of ``ref_fpath`` and a timestamp
    of the moment the file was written.

    :param fpath: path of the check file to create (overwritten if present)
    :param contigs_fpath: path of the assembly whose checksum is stored
    :param ref_fpath: path of the reference whose checksum is stored
    """
    # The context manager closes the file even if md5() or a write fails.
    with open(fpath, 'w') as successful_check_file:
        successful_check_file.write("Assembly md5 checksum: %s\n" % md5(contigs_fpath))
        successful_check_file.write("Reference md5 checksum: %s\n" % md5(ref_fpath))
        successful_check_file.write("Successfully finished on " +
                                    datetime.datetime.now().strftime('%Y/%m/%d %H:%M:%S') + '\n')
def create_successful_check(fpath, contigs_fpath, ref_fpath):
    """Write a check file recording the inputs of a finished run.

    The file at ``fpath`` receives three lines: the md5 checksum of
    ``contigs_fpath``, the md5 checksum of ``ref_fpath`` and a timestamp
    of the moment the file was written.

    :param fpath: path of the check file to create (overwritten if present)
    :param contigs_fpath: path of the assembly whose checksum is stored
    :param ref_fpath: path of the reference whose checksum is stored
    """
    # The context manager closes the file even if md5() or a write fails.
    with open(fpath, 'w') as successful_check_file:
        successful_check_file.write("Assembly md5 checksum: %s\n" % md5(contigs_fpath))
        successful_check_file.write("Reference md5 checksum: %s\n" % md5(ref_fpath))
        successful_check_file.write("Successfully finished on " +
                                    datetime.datetime.now().strftime('%Y/%m/%d %H:%M:%S') + '\n')
def check_successful_check(fpath, contigs_fpath, ref_fpath):
    """Tell whether the check file at ``fpath`` matches the given inputs.

    The first line of the file must end with the md5 checksum of
    ``contigs_fpath`` and the second line with the md5 checksum of
    ``ref_fpath``.

    :param fpath: path of an existing check file
    :param contigs_fpath: path of the assembly to compare against
    :param ref_fpath: path of the reference to compare against
    :return: True if both stored checksums match, False otherwise
             (including when the file is too short or a checksum line is blank)
    """
    with open(fpath) as check_file:
        lines = check_file.read().split('\n')
    if len(lines) < 2:
        return False
    for line, checked_fpath in zip(lines, (contigs_fpath, ref_fpath)):
        fields = line.split()
        # A blank line holds no checksum; treat it as a mismatch instead of
        # failing with IndexError on fields[-1].
        if not fields or fields[-1] != str(md5(checked_fpath)):
            return False
    return True
def check_successful_check(fpath, contigs_fpath, ref_fpath):
    """Tell whether the check file at ``fpath`` matches the given inputs.

    The first line of the file must end with the md5 checksum of
    ``contigs_fpath`` and the second line with the md5 checksum of
    ``ref_fpath``.

    :param fpath: path of an existing check file
    :param contigs_fpath: path of the assembly to compare against
    :param ref_fpath: path of the reference to compare against
    :return: True if both stored checksums match, False otherwise
             (including when the file is too short or a checksum line is blank)
    """
    with open(fpath) as check_file:
        lines = check_file.read().split('\n')
    if len(lines) < 2:
        return False
    for line, checked_fpath in zip(lines, (contigs_fpath, ref_fpath)):
        fields = line.split()
        # A blank line holds no checksum; treat it as a mismatch instead of
        # failing with IndexError on fields[-1].
        if not fields or fields[-1] != str(md5(checked_fpath)):
            return False
    return True
def check_kmc_successful_check(output_dir, contigs_fpath, contigs_fpaths, ref_fpath):
    """Tell whether a stored KMC check file matches the given inputs.

    The check file is ``<label>.sf`` inside ``output_dir``, where the label
    is derived from ``contigs_fpath`` by ``qutils.label_from_fpath_for_fname``.
    Its first line must end with the md5 checksum of ``contigs_fpath`` and
    its second line with the md5 checksum of ``ref_fpath``.

    :param output_dir: directory holding the ``.sf`` check files
    :param contigs_fpath: path of the assembly to compare against
    :param contigs_fpaths: not used by this function
    :param ref_fpath: path of the reference to compare against
    :return: True if the file exists and both checksums match, else False
    """
    label = qutils.label_from_fpath_for_fname(contigs_fpath)
    kmc_check_fpath = join(output_dir, label + '.sf')
    if not exists(kmc_check_fpath):
        return False
    with open(kmc_check_fpath) as check_file:
        lines = check_file.read().split('\n')
    if len(lines) < 2:
        return False
    for line, checked_fpath in zip(lines, (contigs_fpath, ref_fpath)):
        fields = line.split()
        # A blank line holds no checksum; treat it as a mismatch instead of
        # failing with IndexError on fields[-1].
        if not fields or fields[-1] != str(md5(checked_fpath)):
            return False
    return True
def parallel_blast(contigs_fpath, label, corrected_dirpath, err_fpath, blast_res_fpath, blast_check_fpath,
                   blast_threads):
    """Run blastn for one assembly and record a check file for it.

    Compressed inputs (by file extension) are first unpacked into
    ``corrected_dirpath``. blastn output goes to the per-label result file,
    its stderr is appended to ``err_fpath``, and a check file with the md5
    checksum of ``contigs_fpath`` is written afterwards. The BLAST database
    path is read from the module-level name ``db_fpath``.

    :param contigs_fpath: path of the assembly (possibly compressed)
    :param label: assembly label used in log messages and output file names
    :param corrected_dirpath: directory that receives the unpacked copy
    :param err_fpath: file that collects blastn stderr (opened in append mode)
    :param blast_res_fpath: base path passed to get_blast_output_fpath for results
    :param blast_check_fpath: base path passed to get_blast_output_fpath for the check file
    :param blast_threads: value given to blastn's -num_threads option
    """
    logger.info(' ' + 'processing ' + label)
    blast_query_fpath = contigs_fpath
    compress_ext = ('.gz', '.gzip', '.bz2', '.bzip2', '.zip')
    if contigs_fpath.endswith(compress_ext):
        logger.info(' ' + 'unpacking ' + label)
        unpacked_fpath = os.path.join(corrected_dirpath, os.path.basename(contigs_fpath) + '.unpacked')
        with _get_fasta_file_handler(contigs_fpath) as f_in:
            with open(unpacked_fpath, 'w') as f_out:
                f_out.writelines(f_in)
        blast_query_fpath = unpacked_fpath
    res_fpath = get_blast_output_fpath(blast_res_fpath, label)
    check_fpath = get_blast_output_fpath(blast_check_fpath, label)
    # NOTE(review): the command is built as a string and re-split with shlex,
    # so paths containing whitespace would be split into several arguments.
    cmd = get_blast_fpath('blastn') + (' -query %s -db %s -outfmt 7 -num_threads %s' %
                                       (blast_query_fpath, db_fpath, blast_threads))
    # Open the redirect targets in context managers so the handles are
    # closed once the call returns instead of being left to the GC.
    with open(res_fpath, 'w') as res_file:
        with open(err_fpath, 'a') as err_file:
            qutils.call_subprocess(shlex.split(cmd), stdout=res_file, stderr=err_file, logger=logger)
    logger.info(' ' + 'BLAST results for %s are saved to %s...' % (label, res_fpath))
    with open(check_fpath, 'w') as check_file:
        check_file.write('Assembly: %s md5 checksum: %s\n' % (contigs_fpath, md5(contigs_fpath)))
def do(assemblies, labels, downloaded_dirpath, corrected_dirpath, ref_txt_fpath=None):
    """Find reference genomes for the assemblies and return their paths.

    The list of species comes either from ``ref_txt_fpath`` (when given) or
    from BLAST results produced by ``process_blast``. The species are then
    handed to ``search_references``; when BLAST was used, a list of
    replacement candidates per species is passed along as well.

    :param assemblies: objects with an ``fpath`` attribute
    :param labels: labels corresponding to ``assemblies``
    :param downloaded_dirpath: directory with BLAST files and downloaded references
    :param corrected_dirpath: directory passed on to ``process_blast``
    :param ref_txt_fpath: optional file with a list of references to use
    :return: sorted list of reference file paths (may be empty)
    """
    logger.print_timestamp()
    err_fpath = os.path.join(downloaded_dirpath, 'blast.err')
    blast_check_fpath = os.path.join(downloaded_dirpath, 'blast.check')
    blast_res_fpath = os.path.join(downloaded_dirpath, 'blast.res')
    files_md5 = dict((assembly.fpath, md5(assembly.fpath)) for assembly in assemblies)
    assemblies_fpaths = dict((assembly.fpath, assembly) for assembly in assemblies)
    blast_assemblies, downloaded_organisms, not_founded_organisms = \
        check_blast(blast_check_fpath, blast_res_fpath, files_md5, assemblies_fpaths, assemblies, labels)

    species_list = []
    replacement_list = None
    if ref_txt_fpath:
        species_list = parse_refs_list(ref_txt_fpath)
        species_by_assembly = None
    else:
        species_scores, species_by_assembly, replacement_dict = process_blast(
            blast_assemblies, downloaded_dirpath, corrected_dirpath, labels, blast_check_fpath, err_fpath)
        if species_scores:
            species_scores = sorted(species_scores, reverse=True)
            species_list = [species for (species, query_id, score) in species_scores]
            replacement_list = [replacement_dict[query_id] for (species, query_id, score) in species_scores]

    # Join each file name with the directory os.walk found it in; joining
    # with the root directory would give wrong paths for nested files.
    downloaded_ref_fpaths = [os.path.join(dirpath, fname)
                             for (dirpath, dirnames, fnames) in os.walk(downloaded_dirpath)
                             for fname in fnames if qutils.check_is_fasta_file(fname)]

    ref_fpaths = search_references(species_list, assemblies, labels, downloaded_dirpath, not_founded_organisms,
                                   downloaded_ref_fpaths, blast_check_fpath, err_fpath, species_by_assembly,
                                   replacement_list)

    if not ref_fpaths:
        logger.main_info('Reference genomes are not found.')
    # The BLAST error log is only kept in debug mode.
    if not qconfig.debug and os.path.exists(err_fpath):
        os.remove(err_fpath)
    ref_fpaths.sort()
    return ref_fpaths
def check_kmc_successful_check(output_dir, contigs_fpath, contigs_fpaths, ref_fpath):
    """Tell whether a stored KMC check file matches the given inputs.

    The check file is ``<label>.sf`` inside ``output_dir``, where the label
    is derived from ``contigs_fpath`` by ``qutils.label_from_fpath_for_fname``.
    Its first line must end with the md5 checksum of ``contigs_fpath``, its
    second line with the md5 checksum of ``ref_fpath``, and its third line
    (when it names any assemblies) must list the same comma-separated paths
    as ``contigs_fpaths``, in any order.

    :param output_dir: directory holding the ``.sf`` check files
    :param contigs_fpath: path of the assembly to compare against
    :param contigs_fpaths: paths of all assemblies expected in the third line
    :param ref_fpath: path of the reference to compare against
    :return: True if the file exists and all stored values match, else False
    """
    label = qutils.label_from_fpath_for_fname(contigs_fpath)
    kmc_check_fpath = join(output_dir, label + '.sf')
    if not exists(kmc_check_fpath):
        return False
    with open(kmc_check_fpath) as check_file:
        lines = check_file.read().split('\n')
    if len(lines) < 3:
        return False
    for line, checked_fpath in zip(lines, (contigs_fpath, ref_fpath)):
        fields = line.split()
        # A blank line holds no checksum; treat it as a mismatch instead of
        # failing with IndexError on fields[-1].
        if not fields or fields[-1] != str(md5(checked_fpath)):
            return False
    used_assemblies = lines[2].strip().split(': ')[-1]
    if used_assemblies and sorted(used_assemblies.split(',')) != sorted(contigs_fpaths):
        return False
    return True
def create_kmc_stats_file(output_dir, contigs_fpath, contigs_fpaths, ref_fpath, completeness,
                          len_map_to_one_chrom, len_map_to_multi_chrom, len_map_to_none_chrom, total_len):
    """Store the KMC check file and statistics file for one assembly.

    Two files named after the assembly label are written to ``output_dir``:
    ``<label>.sf`` with the md5 checksums of the assembly and the reference
    plus the comma-joined ``contigs_fpaths``, and ``<label>.stat`` with the
    completeness and, when either the one-chromosome or multi-chromosome
    length is non-zero, the four length values.
    """
    base_name = qutils.label_from_fpath_for_fname(contigs_fpath)
    sf_fpath = join(output_dir, base_name + '.sf')
    stat_fpath = join(output_dir, base_name + '.stat')

    with open(sf_fpath, 'w') as sf_out:
        sf_out.write("Assembly md5 checksum: %s\n" % md5(contigs_fpath))
        sf_out.write("Reference md5 checksum: %s\n" % md5(ref_fpath))
        sf_out.write("Used assemblies: %s\n" % ','.join(contigs_fpaths))

    with open(stat_fpath, 'w') as stat_out:
        stat_out.write("Completeness: %s\n" % completeness)
        if not (len_map_to_one_chrom or len_map_to_multi_chrom):
            return
        length_rows = (
            ("Length assigned to one chromosome: %d\n", len_map_to_one_chrom),
            ("Length assigned to multi chromosomes: %d\n", len_map_to_multi_chrom),
            ("Length assigned to none chromosome: %d\n", len_map_to_none_chrom),
            ("Total length: %d\n", total_len),
        )
        for template, value in length_rows:
            stat_out.write(template % value)
def do(assemblies, labels, downloaded_dirpath, corrected_dirpath, ref_txt_fpath=None):
    """Find reference genomes for the assemblies and return their paths.

    The list of organisms comes either from ``ref_txt_fpath`` (when given)
    or from BLAST results produced by ``process_blast``. The organisms are
    then handed to ``process_refs``.

    :param assemblies: objects with an ``fpath`` attribute
    :param labels: labels corresponding to ``assemblies``
    :param downloaded_dirpath: directory with BLAST files and downloaded references
    :param corrected_dirpath: directory passed on to ``process_blast``
    :param ref_txt_fpath: optional file with a list of references to use
    :return: sorted list of reference file paths (may be empty)
    """
    logger.print_timestamp()
    err_fpath = os.path.join(downloaded_dirpath, 'blast.err')
    blast_check_fpath = os.path.join(downloaded_dirpath, 'blast.check')
    blast_res_fpath = os.path.join(downloaded_dirpath, 'blast.res')
    files_md5 = dict((assembly.fpath, md5(assembly.fpath)) for assembly in assemblies)
    assemblies_fpaths = dict((assembly.fpath, assembly) for assembly in assemblies)
    blast_assemblies, downloaded_organisms, not_founded_organisms = \
        check_blast(blast_check_fpath, blast_res_fpath, files_md5, assemblies_fpaths, assemblies, labels)

    organisms = []
    if ref_txt_fpath:
        organisms = parse_refs_list(ref_txt_fpath)
        organisms_assemblies = None
    else:
        scores_organisms, organisms_assemblies = process_blast(
            blast_assemblies, downloaded_dirpath, corrected_dirpath, labels, blast_check_fpath, err_fpath)
        if scores_organisms:
            scores_organisms = sorted(scores_organisms, reverse=True)
            organisms = [organism for (score, organism) in scores_organisms]

    # Join each file name with the directory os.walk found it in; joining
    # with the root directory would give wrong paths for nested files.
    downloaded_ref_fpaths = [os.path.join(dirpath, fname)
                             for (dirpath, dirnames, fnames) in os.walk(downloaded_dirpath)
                             for fname in fnames if qutils.check_is_fasta_file(fname)]

    ref_fpaths = process_refs(organisms, assemblies, labels, downloaded_dirpath, not_founded_organisms,
                              downloaded_ref_fpaths, blast_check_fpath, err_fpath, organisms_assemblies)

    if not ref_fpaths:
        logger.main_info('Reference genomes are not found.')
    # The BLAST error log is only kept in debug mode.
    if not qconfig.debug and os.path.exists(err_fpath):
        os.remove(err_fpath)
    ref_fpaths.sort()
    return ref_fpaths
def create_kmc_stats_file(output_dir, contigs_fpath, ref_fpath, completeness,
                          corr_len, mis_len, undef_len, total_len, translocations, relocations):
    """Store the KMC check file and statistics file for one assembly.

    Two files named after the assembly label are written to ``output_dir``:
    ``<label>.sf`` with the md5 checksums of the assembly and the reference,
    and ``<label>.stat`` with the completeness and, when either the correct
    or the misjoined length is non-zero, the k-mer-based lengths and the
    translocation/relocation counts.
    """
    base_name = qutils.label_from_fpath_for_fname(contigs_fpath)
    sf_fpath = join(output_dir, base_name + '.sf')
    stat_fpath = join(output_dir, base_name + '.stat')

    with open(sf_fpath, 'w') as sf_out:
        sf_out.write("Assembly md5 checksum: %s\n" % md5(contigs_fpath))
        sf_out.write("Reference md5 checksum: %s\n" % md5(ref_fpath))

    with open(stat_fpath, 'w') as stat_out:
        stat_out.write("Completeness: %s\n" % completeness)
        if not (corr_len or mis_len):
            return
        detail_rows = (
            ("K-mer-based correct length: %d\n", corr_len),
            ("K-mer-based misjoined length: %d\n", mis_len),
            ("K-mer-based undefined length: %d\n", undef_len),
            ("Total length: %d\n", total_len),
            ("# translocations: %d\n", translocations),
            ("# 100 kbp relocations: %d\n", relocations),
        )
        for template, value in detail_rows:
            stat_out.write(template % value)
def search_references(organisms, assemblies, labels, downloaded_dirpath, not_founded_organisms,
                      downloaded_ref_fpaths, blast_check_fpath, err_fpath, organisms_assemblies=None,
                      replacement_list=None):
    """Obtain a reference for every organism and update the BLAST check files.

    Each organism is passed to ``process_ref``. When that yields no
    reference and ``replacement_list`` is given, the candidates in
    ``replacement_list[idx]`` that are not already in ``organisms`` are
    tried in order until one yields a reference. Afterwards the check file
    of every assembly is rewritten with its first line preserved, followed
    by the downloaded and not-found organisms.

    :param organisms: organism names to look up
    :param assemblies: objects with an ``fpath`` attribute
    :param labels: labels corresponding to ``assemblies``
    :param downloaded_dirpath: directory passed on to ``process_ref``
    :param not_founded_organisms: collection of organisms known to be missing;
        passed on to ``process_ref`` and written to the check files
    :param downloaded_ref_fpaths: previously downloaded reference paths
        (only checked for being non-empty here)
    :param blast_check_fpath: base path passed to get_blast_output_fpath
    :param err_fpath: BLAST error log, removed on early exit unless in debug mode
    :param organisms_assemblies: optional mapping from label to the organisms
        relevant to that assembly; used to filter what is written per check file
    :param replacement_list: optional list, parallel to ``organisms``, of
        fallback organism names
    :return: list of reference paths (filled by ``process_ref``)
    """
    ref_fpaths = []
    downloaded_organisms = []
    total_downloaded = 0
    total_scored_left = len(organisms)
    if total_scored_left == 0:
        if not qconfig.debug and os.path.exists(err_fpath):
            os.remove(err_fpath)
        return ref_fpaths

    # organisms is non-empty here, so max() has at least one value.
    max_organism_name_len = max(len(organism) for organism in organisms)

    logger.print_timestamp()
    logger.main_info('Trying to download found references from NCBI. Totally ' + str(total_scored_left) +
                     ' organisms to try.')
    if len(downloaded_ref_fpaths) > 0:
        logger.main_info('MetaQUAST will attempt to use previously downloaded references...')

    for idx, organism in enumerate(organisms):
        ref_fpath, total_downloaded, total_scored_left = process_ref(
            ref_fpaths, organism, downloaded_dirpath, max_organism_name_len, downloaded_organisms,
            not_founded_organisms, total_downloaded, total_scored_left)
        if not ref_fpath and replacement_list:
            for next_match in replacement_list[idx]:
                if next_match not in organisms:
                    logger.main_info(' ' + organism.replace('+', ' ') +
                                     ' was not found in NCBI database, trying to download the next best match')
                    # total_scored_left + 1 is passed and the returned counter
                    # is discarded, so a replacement attempt leaves the
                    # remaining count unchanged.
                    ref_fpath, total_downloaded, _ = process_ref(
                        ref_fpaths, next_match, downloaded_dirpath, max_organism_name_len, downloaded_organisms,
                        not_founded_organisms, total_downloaded, total_scored_left + 1)
                    organism = next_match
                    if ref_fpath:
                        break

    for assembly, label in zip(assemblies, labels):
        check_fpath = get_blast_output_fpath(blast_check_fpath, label)
        if os.path.exists(check_fpath):
            with open(check_fpath) as check_file:
                text = check_file.read()
            # Keep only the first line. split() is used instead of slicing at
            # find('\n'), which returns -1 and would cut off the last
            # character when the file has no newline.
            text = text.split('\n', 1)[0]
        else:
            text = 'Assembly: %s md5 checksum: %s\n' % (assembly.fpath, md5(assembly.fpath))
        with open(check_fpath, 'w') as check_file:
            check_file.write(text)
            check_file.write('\n---\n')
            if organisms_assemblies:
                label_organisms = organisms_assemblies[label]
                cur_downloaded_organisms = [organism for organism in downloaded_organisms
                                            if organism in label_organisms]
                cur_not_founded_organisms = [organism for organism in not_founded_organisms
                                             if organism in label_organisms]
            else:
                cur_downloaded_organisms = list(downloaded_organisms)
                cur_not_founded_organisms = list(not_founded_organisms)
            check_file.write('Downloaded: %s\n' % ','.join(cur_downloaded_organisms))
            check_file.write('Not_founded: %s\n' % ','.join(cur_not_founded_organisms))
    return ref_fpaths
def process_refs(organisms, assemblies, labels, downloaded_dirpath, not_founded_organisms, downloaded_ref_fpaths,
                 blast_check_fpath, err_fpath, organisms_assemblies=None):
    """Obtain a reference for every organism and update the BLAST check files.

    For each organism the expected file ``<corrected name>.fasta`` in
    ``downloaded_dirpath`` is reused when it exists; otherwise, unless the
    organism is already in ``not_founded_organisms``, ``download_refs`` is
    called. Organisms without a reference are added to
    ``not_founded_organisms``. Afterwards the check file of every assembly
    is rewritten with its first line preserved, followed by the downloaded
    and not-found organisms.

    :param organisms: organism names to look up
    :param assemblies: objects with an ``fpath`` attribute
    :param labels: labels corresponding to ``assemblies``
    :param downloaded_dirpath: directory holding the reference files
    :param not_founded_organisms: set of organisms known to be missing; updated in place
    :param downloaded_ref_fpaths: previously downloaded reference paths
        (only checked for being non-empty here)
    :param blast_check_fpath: base path passed to get_blast_output_fpath
    :param err_fpath: BLAST error log, removed on early exit unless in debug mode
    :param organisms_assemblies: optional mapping from label to the organisms
        relevant to that assembly; used to filter what is written per check file
    :return: list of reference file paths
    """
    ref_fpaths = []
    downloaded_organisms = []
    total_downloaded = 0
    total_scored_left = len(organisms)
    if total_scored_left == 0:
        if not qconfig.debug and os.path.exists(err_fpath):
            os.remove(err_fpath)
        return ref_fpaths

    # organisms is non-empty here, so max() has at least one value.
    max_organism_name_len = max(len(organism) for organism in organisms)

    logger.print_timestamp()
    logger.main_info('Trying to download found references from NCBI. '
                     'Totally ' + str(total_scored_left) + ' organisms to try.')
    if len(downloaded_ref_fpaths) > 0:
        logger.main_info('MetaQUAST will attempt to use previously downloaded references...')

    for organism in organisms:
        ref_fpath = os.path.join(downloaded_dirpath, correct_name(organism) + '.fasta')
        spaces = (max_organism_name_len - len(organism)) * ' '
        new_ref_fpath = None
        was_downloaded = False
        if os.path.exists(ref_fpath):
            was_downloaded = True
            new_ref_fpath = ref_fpath
        elif organism not in not_founded_organisms:
            new_ref_fpath = download_refs(organism, ref_fpath)

        # Every organism counts as handled, whether or not it was found.
        total_scored_left -= 1
        if new_ref_fpath:
            total_downloaded += 1
            if was_downloaded:
                logger.main_info(" %s%s | was downloaded previously (total %d, %d more to go)" %
                                 (organism.replace('+', ' '), spaces, total_downloaded, total_scored_left))
                if new_ref_fpath not in ref_fpaths:
                    ref_fpaths.append(new_ref_fpath)
            else:
                logger.main_info(" %s%s | successfully downloaded (total %d, %d more to go)" %
                                 (organism.replace('+', ' '), spaces, total_downloaded, total_scored_left))
                ref_fpaths.append(new_ref_fpath)
            downloaded_organisms.append(organism)
        else:
            logger.main_info(" %s%s | not found in the NCBI database" % (organism.replace('+', ' '), spaces))
            not_founded_organisms.add(organism)

    for assembly, label in zip(assemblies, labels):
        check_fpath = get_blast_output_fpath(blast_check_fpath, label)
        if os.path.exists(check_fpath):
            with open(check_fpath) as check_file:
                text = check_file.read()
            # Keep only the first line. split() is used instead of slicing at
            # find('\n'), which returns -1 and would cut off the last
            # character when the file has no newline.
            text = text.split('\n', 1)[0]
        else:
            text = 'Assembly: %s md5 checksum: %s\n' % (assembly.fpath, md5(assembly.fpath))
        with open(check_fpath, 'w') as check_file:
            check_file.write(text)
            check_file.write('\n---\n')
            if organisms_assemblies:
                label_organisms = organisms_assemblies[label]
                cur_downloaded_organisms = [organism for organism in downloaded_organisms
                                            if organism in label_organisms]
                cur_not_founded_organisms = [organism for organism in not_founded_organisms
                                             if organism in label_organisms]
            else:
                cur_downloaded_organisms = list(downloaded_organisms)
                cur_not_founded_organisms = list(not_founded_organisms)
            check_file.write('Downloaded: %s\n' % ','.join(cur_downloaded_organisms))
            check_file.write('Not_founded: %s\n' % ','.join(cur_not_founded_organisms))
    return ref_fpaths