def download_db(logger, is_prokaryote, is_fungus=False, only_clean=False):
    """Ensure the BUSCO lineage database for the relevant clade is on disk.

    The clade is chosen from the flags (bacteria / fungi / eukaryota); its
    archive is downloaded and unpacked if not already present.

    Returns the database directory path on success, None on failure, and
    True after a successful cleanup (only_clean=True).
    """
    # Pick the clade and its download URL from the organism flags.
    if is_prokaryote:
        clade, url = 'bacteria', bacteria_db_url
    elif is_fungus:
        clade, url = 'fungi', fungi_db_url
    else:
        clade, url = 'eukaryota', eukaryota_db_url

    dirpath = get_dir_for_download('busco', 'Busco databases', [clade], logger, only_clean=only_clean)
    if not dirpath:
        return None

    db_dirpath = join(dirpath, clade)
    if only_clean:
        # Cleanup mode: remove the unpacked database and report success.
        if os.path.isdir(db_dirpath):
            shutil.rmtree(db_dirpath, ignore_errors=True)
        return True

    if os.path.exists(db_dirpath):
        # Already downloaded and unpacked on a previous run.
        return db_dirpath

    archive_fpath = join(dirpath, clade + '.tar.gz')
    logger.main_info(' Downloading BUSCO database...')
    download_unpack_compressed_tar(clade + ' database', url, archive_fpath, db_dirpath, logger)
    if not os.path.exists(db_dirpath):
        logger.warning('Failed to download ' + clade + ' database from ' + url + ' and unpack it into ' + dirpath)
        return None
    return db_dirpath
def download_tool(tool, tool_version, required_files, logger, url, only_clean=False):
    """Download and unpack a third-party tool archive if any required file is missing.

    Args:
        tool: tool name (used for dir naming and messages).
        tool_version: version string appended to the download dir name.
        required_files: relative paths that must exist inside the tool dir.
        logger: QUAST logger.
        url: archive URL.
        only_clean: if True, remove the tool dir instead of downloading.

    Returns the tool directory path on success (or after cleanup), None on failure.
    """
    tool_dirpath = get_dir_for_download(tool + tool_version, tool, required_files, logger, only_clean=only_clean)
    if not tool_dirpath:
        return None
    if only_clean:
        if os.path.isdir(tool_dirpath):
            shutil.rmtree(tool_dirpath, ignore_errors=True)
        return tool_dirpath
    failed_compilation_flag = join(tool_dirpath, 'make.failed')
    # Download only if some required file is missing and a previous attempt
    # has not already been flagged as failed.
    if not all(os.path.exists(join(tool_dirpath, fpath)) for fpath in required_files) and not \
            check_prev_compilation_failed(tool, failed_compilation_flag):
        downloaded_fpath = join(tool_dirpath, tool + '.tar.gz')
        logger.main_info(' Downloading third-party tools...')
        download_unpack_compressed_tar(tool, url, downloaded_fpath, tool_dirpath, logger)
        if not all(os.path.exists(join(tool_dirpath, fpath)) for fpath in required_files):
            # Bug fix: a space was missing before 'and unpack it into', which
            # glued the URL to the rest of the message.
            logger.warning('Failed to download ' + tool + ' from ' + url + ' and unpack it into ' + tool_dirpath)
            return None
    return tool_dirpath
def download_db(logger, is_prokaryote, is_fungus=False, only_clean=False):
    """Ensure the BUSCO lineage database for the relevant clade is on disk.

    Returns the database directory path on success, None on failure, and
    True after a successful cleanup (only_clean=True).
    """
    # Pick the clade and its download URL from the organism flags.
    if is_prokaryote:
        url = bacteria_db_url
        clade = 'bacteria'
    elif is_fungus:
        url = fungi_db_url
        clade = 'fungi'
    else:
        url = eukaryota_db_url
        clade = 'eukaryota'
    dirpath = get_dir_for_download('busco', 'Busco databases', [clade], logger, only_clean=only_clean)
    if not dirpath:
        return None
    db_dirpath = join(dirpath, clade)
    if only_clean:
        # Cleanup mode: remove the unpacked database and report success.
        if os.path.isdir(db_dirpath):
            shutil.rmtree(db_dirpath, ignore_errors=True)
        return True
    if not os.path.exists(db_dirpath):
        downloaded_fpath = join(dirpath, clade + '.tar.gz')
        logger.main_info(' Downloading ' + clade + ' database...')
        download_unpack_compressed_tar(clade + ' database', url, downloaded_fpath, db_dirpath, logger)
        if not os.path.exists(db_dirpath):
            # Bug fix: a space was missing before 'and unpack it into', which
            # glued the URL to the rest of the message.
            logger.warning('Failed to download ' + clade + ' database from ' + url + ' and unpack it into ' + dirpath)
            return None
    return db_dirpath
def download_manta(logger, bed_fpath=None, only_clean=False):
    """Ensure a Manta binary distribution is installed under the download dir.

    Skips the download when SV search is disabled (qconfig.no_sv), when the
    user supplied a BED file, or when the Manta config is already present.
    With only_clean=True the build directory is removed instead.

    Returns True on success/cleanup/no-op, False on any failure.
    """
    global manta_dirpath
    manta_dirpath = get_dir_for_download('manta' + manta_version, 'Manta', [config_manta_relpath], logger, only_clean=only_clean)
    if not manta_dirpath:
        return False
    manta_build_dirpath = join(manta_dirpath, 'build')
    config_manta_fpath = get_manta_fpath()
    if only_clean:
        # Cleanup mode: drop the unpacked build tree only.
        if os.path.isdir(manta_build_dirpath):
            shutil.rmtree(manta_build_dirpath, ignore_errors=True)
        return True
    if not qconfig.no_sv and bed_fpath is None and not isfile(config_manta_fpath):
        # Choose the platform-specific archive; Manta ships binaries for
        # linux_64 and macOS only.
        if qconfig.platform_name == 'linux_64':
            url = manta_linux_url
            fpath = manta_ext_linux_fpath
        elif qconfig.platform_name == 'macosx':
            url = manta_osx_url
            fpath = manta_ext_osx_fpath
        else:
            logger.warning('Manta is not available for your platform.')
            return False
        if not exists(manta_build_dirpath):
            os.makedirs(manta_build_dirpath)
        manta_downloaded_fpath = join(manta_build_dirpath, 'manta.tar.bz2')
        if isfile(fpath):
            # A bundled local copy exists (shipped with QUAST) — prefer it
            # over a network download.
            logger.info('Copying manta from ' + fpath)
            shutil.copy(fpath, manta_downloaded_fpath)
            logger.info('Unpacking ' + manta_downloaded_fpath + ' into ' + manta_build_dirpath)
            unpack_tar(manta_downloaded_fpath, manta_build_dirpath)
        else:
            failed_compilation_flag = join(manta_dirpath, 'make.failed')
            # Don't retry if a previous installation attempt already failed.
            if check_prev_compilation_failed('Manta', failed_compilation_flag):
                print_manta_warning(logger)
                return False
            logger.main_info(' Downloading binary distribution of Manta...')
            download_unpack_tar_bz('Manta', url, manta_downloaded_fpath, manta_build_dirpath, logger)
        # The bundled demo data is not needed — reclaim the disk space.
        manta_demo_dirpath = join(manta_build_dirpath, 'share', 'demo')
        if os.path.isdir(manta_demo_dirpath):
            shutil.rmtree(manta_demo_dirpath, ignore_errors=True)
        if not isfile(config_manta_fpath):
            # The config script is the sentinel for a complete installation.
            logger.warning('Failed to download binary distribution from https://github.com/ablab/quast/external_tools/manta '
                           'and unpack it into ' + join(manta_dirpath, 'build/'))
            print_manta_warning(logger)
            return False
    return True
def download_gridss(logger, bed_fpath=None, only_clean=False):
    """Make sure the GRIDSS jar is available locally, downloading it if needed.

    Returns True when the jar is present or not required, False on failure.
    """
    global gridss_dirpath
    gridss_dirpath = get_dir_for_download('gridss', 'GRIDSS', [gridss_fname], logger, only_clean=only_clean)
    if not gridss_dirpath:
        return False
    gridss_fpath = get_gridss_fpath()
    # Fetch only when SV search is enabled, no user BED file was given,
    # and the jar is not already on disk.
    fetch_needed = not qconfig.no_sv and bed_fpath is None and not isfile(gridss_fpath)
    if fetch_needed and not download_external_tool(gridss_fname, gridss_dirpath, 'gridss'):
        logger.warning('Failed to download binary distribution from https://github.com/ablab/quast/external_tools/gridss. '
                       'QUAST SV module will be able to search trivial deletions only.')
        return False
    return True
def download_gridss(logger, bed_fpath=None, only_clean=False):
    """Fetch the GRIDSS jar unless it is already present or not required.

    Returns True when the jar is present or not required, False on failure.
    """
    global gridss_dirpath
    gridss_dirpath = get_dir_for_download('gridss', 'GRIDSS', [gridss_fname], logger, only_clean=only_clean)
    if not gridss_dirpath:
        return False
    gridss_fpath = get_gridss_fpath()
    # Nothing to do when SV search is off, a BED file was supplied,
    # or the jar already exists (De Morgan of the original condition).
    if qconfig.no_sv or bed_fpath is not None or isfile(gridss_fpath):
        return True
    if not download_external_tool(gridss_fname, gridss_dirpath, 'gridss'):
        logger.warning('Failed to download binary distribution from https://github.com/ablab/quast/external_tools/gridss. '
                       'QUAST SV module will be able to search trivial deletions only.')
        return False
    return True
def download_tool(tool, tool_version, required_files, logger, url, only_clean=False):
    """Download and unpack a third-party tool archive if any required file is missing.

    Args:
        tool: tool name (used for dir naming and messages).
        tool_version: version string appended to the download dir name.
        required_files: relative paths that must exist inside the tool dir.
        logger: QUAST logger.
        url: archive URL.
        only_clean: if True, remove the tool dir instead of downloading.

    Returns the tool directory path on success (or after cleanup), None on failure.
    """
    tool_dirpath = get_dir_for_download(tool + tool_version, tool, required_files, logger, only_clean=only_clean)
    if not tool_dirpath:
        return None
    if only_clean:
        if os.path.isdir(tool_dirpath):
            shutil.rmtree(tool_dirpath, ignore_errors=True)
        return tool_dirpath
    failed_compilation_flag = join(tool_dirpath, 'make.failed')
    # Download only if some required file is missing and a previous attempt
    # has not already been flagged as failed.
    if not all(os.path.exists(join(tool_dirpath, fpath)) for fpath in required_files) and not \
            check_prev_compilation_failed(tool, failed_compilation_flag):
        downloaded_fpath = join(tool_dirpath, tool + '.tar.gz')
        logger.main_info(' Downloading ' + tool + '...')
        download_unpack_compressed_tar(tool, url, downloaded_fpath, tool_dirpath, logger)
        if not all(os.path.exists(join(tool_dirpath, fpath)) for fpath in required_files):
            # Bug fix: a space was missing before 'and unpack it into', which
            # glued the URL to the rest of the message.
            logger.warning('Failed to download ' + tool + ' from ' + url + ' and unpack it into ' + tool_dirpath)
            return None
    return tool_dirpath
def download_gridss(logger, bed_fpath=None, only_clean=False):
    """Ensure the GRIDSS jar is available locally (or clean it up).

    With only_clean=True the download directory is removed instead.
    Returns True when the jar is present, not required, or after cleanup;
    False on failure.
    """
    global gridss_dirpath
    gridss_dirpath = get_dir_for_download('gridss', 'GRIDSS', [gridss_fname], logger, only_clean=only_clean)
    if not gridss_dirpath:
        return False
    if only_clean:
        # Cleanup mode: remove the whole GRIDSS dir, ignoring missing files.
        if os.path.isdir(gridss_dirpath):
            shutil.rmtree(gridss_dirpath, ignore_errors=True)
        return True
    gridss_fpath = get_gridss_fpath()
    # Nothing to fetch when SV search is off, a BED file was supplied,
    # or the jar already exists.
    if qconfig.no_sv or bed_fpath is not None or isfile(gridss_fpath):
        return True
    if download_external_tool(gridss_fname, gridss_dirpath, 'gridss'):
        return True
    logger.warning('Failed to download binary distribution from https://github.com/ablab/quast/tree/master/external_tools/gridss. '
                   'QUAST SV module will be able to search trivial deletions only. '
                   'You can try to download it manually, save the jar archive under %s, and restart QUAST.' % gridss_dirpath)
    return False
def download_all_blast_binaries(logger=logger, only_clean=False):
    """Download every missing BLAST binary and make each one executable.

    With only_clean=True the BLAST download directory is removed instead.
    Returns True on success / nothing-to-do / cleanup, False on any failure.
    """
    global blast_dirpath
    # Only fetch binaries that are not already resolvable on this system.
    required_files = [cmd for cmd in blast_filenames if not get_blast_fpath(cmd)]
    if not required_files and not only_clean:
        return True
    blast_dirpath = get_dir_for_download('blast', 'BLAST', blast_filenames, logger, only_clean=only_clean)
    if not blast_dirpath:
        return False
    if only_clean:
        if os.path.isdir(blast_dirpath):
            shutil.rmtree(blast_dirpath, ignore_errors=True)
        return True
    # Idiom fix: the enumerate() index was unused — iterate directly.
    for cmd in required_files:
        return_code = download_blast_binary(cmd, logger=logger)
        logger.info()
        if return_code != 0:
            return False
        # The downloaded file loses its executable bit — restore it.
        blast_file = get_blast_fpath(cmd)
        os.chmod(blast_file, os.stat(blast_file).st_mode | stat.S_IEXEC)
    return True
def do(ref_fpath, original_ref_fpath, output_dirpath):
    """Build the theoretical Upper Bound Assembly for a reference genome.

    Pipeline: compile read-processing tools -> install Red (repeat finder) ->
    map reads to the reference -> extract uniquely-covered regions -> optionally
    scaffold them with mate-pair or long reads -> write the result FASTA.

    Returns the path to the produced (or previously prepared) assembly FASTA,
    or None if any prerequisite fails.
    """
    logger.print_timestamp()
    logger.main_info("Generating Upper Bound Assembly...")
    if not reads_analyzer.compile_reads_analyzer_tools(logger):
        logger.warning(' Sorry, can\'t create Upper Bound Assembly '
                       '(failed to compile necessary third-party read processing tools [bwa, bedtools, minimap2]), skipping...')
        return None
    if qconfig.platform_name == 'linux_32':
        logger.warning(' Sorry, can\'t create Upper Bound Assembly on this platform '
                       '(only linux64 and macOS are supported), skipping...')
        return None
    # Red is used later to mask repeats in the reference.
    red_dirpath = get_dir_for_download('red', 'Red', ['Red'], logger)
    binary_fpath = download_external_tool('Red', red_dirpath, 'red', platform_specific=True, is_executable=True)
    if not binary_fpath or not os.path.isfile(binary_fpath):
        logger.warning(' Sorry, can\'t create Upper Bound Assembly '
                       '(failed to install/download third-party repeat finding tool [Red]), skipping...')
        return None
    insert_size = qconfig.optimal_assembly_insert_size
    if insert_size == 'auto' or not insert_size:
        insert_size = qconfig.optimal_assembly_default_IS
    # Result file name encodes the insert size and read-type suffix.
    ref_basename, fasta_ext = splitext_for_fasta_file(os.path.basename(ref_fpath))
    result_basename = '%s.%s.is%d.fasta' % (ref_basename, qconfig.optimal_assembly_basename, insert_size)
    long_reads = qconfig.pacbio_reads or qconfig.nanopore_reads
    if long_reads:
        result_basename = add_suffix(result_basename, long_reads_polished_suffix)
    elif qconfig.mate_pairs:
        result_basename = add_suffix(result_basename, mp_polished_suffix)
    result_fpath = os.path.join(output_dirpath, result_basename)
    # Also derive the name a pre-built assembly would have next to the
    # original (uncorrected) reference, so it can be reused across runs.
    original_ref_basename, fasta_ext = splitext_for_fasta_file(os.path.basename(original_ref_fpath))
    prepared_optimal_assembly_basename = '%s.%s.is%d.fasta' % (original_ref_basename, qconfig.optimal_assembly_basename, insert_size)
    if long_reads:
        prepared_optimal_assembly_basename = add_suffix(prepared_optimal_assembly_basename, long_reads_polished_suffix)
    elif qconfig.mate_pairs:
        prepared_optimal_assembly_basename = add_suffix(prepared_optimal_assembly_basename, mp_polished_suffix)
    ref_prepared_optimal_assembly = os.path.join(os.path.dirname(original_ref_fpath), prepared_optimal_assembly_basename)
    already_done_fpath = check_prepared_optimal_assembly(insert_size, result_fpath, ref_prepared_optimal_assembly)
    if already_done_fpath:
        return already_done_fpath
    uncovered_fpath = None
    reads_analyzer_dir = join(dirname(output_dirpath), qconfig.reads_stats_dirname)
    if qconfig.reads_fpaths or qconfig.reference_sam or qconfig.reference_bam:
        # Map all reads to the reference and collect uncovered regions.
        sam_fpath, bam_fpath, uncovered_fpath = reads_analyzer.align_reference(ref_fpath, reads_analyzer_dir,
                                                                               using_reads='all', calculate_coverage=True)
    # align_reference may refine the insert size estimate; if it differs from
    # our initial guess, rename targets and re-check for a reusable result.
    if qconfig.optimal_assembly_insert_size != 'auto' and qconfig.optimal_assembly_insert_size != insert_size:
        calculated_insert_size = qconfig.optimal_assembly_insert_size
        result_fpath = result_fpath.replace('is' + str(insert_size), 'is' + str(calculated_insert_size))
        prepared_optimal_assembly_basename = prepared_optimal_assembly_basename.replace('is' + str(insert_size),
                                                                                        'is' + str(calculated_insert_size))
        insert_size = calculated_insert_size
        ref_prepared_optimal_assembly = os.path.join(os.path.dirname(original_ref_fpath), prepared_optimal_assembly_basename)
        already_done_fpath = check_prepared_optimal_assembly(insert_size, result_fpath, ref_prepared_optimal_assembly)
        if already_done_fpath:
            return already_done_fpath
    log_fpath = os.path.join(output_dirpath, 'upper_bound_assembly.log')
    tmp_dir = os.path.join(output_dirpath, 'tmp')
    if os.path.isdir(tmp_dir):
        shutil.rmtree(tmp_dir)
    os.makedirs(tmp_dir)
    unique_covered_regions, repeats_regions = get_unique_covered_regions(ref_fpath, tmp_dir, log_fpath, binary_fpath,
                                                                         insert_size, uncovered_fpath, use_long_reads=long_reads)
    if unique_covered_regions is None:
        logger.error(' Failed to create Upper Bound Assembly, see log for details: ' + log_fpath)
        return None
    reference = list(fastaparser.read_fasta(ref_fpath))
    result_fasta = []
    if long_reads or qconfig.mate_pairs:
        # Scaffold the unique regions using joining reads (long reads or MP).
        if long_reads:
            join_reads = 'pacbio' if qconfig.pacbio_reads else 'nanopore'
        else:
            join_reads = 'mp'
        sam_fpath, bam_fpath, _ = reads_analyzer.align_reference(ref_fpath, reads_analyzer_dir, using_reads=join_reads)
        joiners = get_joiners(qutils.name_from_fpath(ref_fpath), sam_fpath, bam_fpath, tmp_dir, log_fpath, join_reads)
        uncovered_regions = parse_bed(uncovered_fpath) if join_reads == 'mp' else defaultdict(list)
        mp_len = calculate_read_len(sam_fpath) if join_reads == 'mp' else None
        for chrom, seq in reference:
            region_pairing = get_regions_pairing(unique_covered_regions[chrom], joiners[chrom], mp_len)
            ref_coords_to_output = scaffolding(unique_covered_regions[chrom], region_pairing)
            get_fasta_entries_from_coords(result_fasta, (chrom, seq), ref_coords_to_output,
                                          repeats_regions[chrom], uncovered_regions[chrom])
    else:
        # No joining reads: output each sufficiently long unique region as a contig.
        for chrom, seq in reference:
            for idx, region in enumerate(unique_covered_regions[chrom]):
                if region[1] - region[0] >= MIN_CONTIG_LEN:
                    result_fasta.append((chrom + '_' + str(idx), seq[region[0]:region[1]]))
    fastaparser.write_fasta(result_fpath, result_fasta)
    logger.info(' ' + 'Theoretical Upper Bound Assembly is saved to ' + result_fpath)
    logger.notice('(on reusing *this* Upper Bound Assembly in the *future* evaluations on *the same* dataset)\n'
                  '\tThe next time, you can simply provide this file as an additional assembly (you could also rename it to UpperBound.fasta for the clarity). '
                  'In this case, you do not need to specify --upper-bound-assembly and provide files with reads (--pe1/pe2, etc).\n'
                  '\t\tOR\n'
                  '\tYou can copy ' + result_fpath + ' to ' + ref_prepared_optimal_assembly + '. '
                  'The next time you evaluate assemblies with --upper-bound-assembly option and against the same reference (' + original_ref_fpath + ') and '
                  'the same reads (or if you specify the insert size of the paired-end reads explicitly with --est-insert-size ' + str(insert_size) + '), '
                  'QUAST will reuse this Upper Bound Assembly.\n')
    if not qconfig.debug:
        shutil.rmtree(tmp_dir)
    logger.main_info('Done.')
    return result_fpath
def download_blastdb(logger=logger, only_clean=False):
    """Download the SILVA 16S rRNA database and build a BLAST database from it.

    Steps: download the gzipped FASTA (if absent) -> gunzip and replace spaces
    with underscores in headers -> run makeblastdb -> verify the resulting
    .nsq file size. With only_clean=True the whole dir is removed instead.

    Returns True on success / cleanup / already-built, False on any failure.
    """
    global blastdb_dirpath
    blastdb_dirpath = get_dir_for_download('silva', 'Silva', [silva_downloaded_fname + '.nsq'], logger, only_clean=only_clean)
    if not blastdb_dirpath:
        return False
    if only_clean:
        if os.path.isdir(blastdb_dirpath):
            logger.info('Removing ' + blastdb_dirpath)
            shutil.rmtree(blastdb_dirpath)
        return True
    global db_fpath
    db_fpath = join(blastdb_dirpath, silva_downloaded_fname)
    # A .nsq file of at least the expected size means the DB is already built.
    if os.path.isfile(db_fpath + '.nsq') and os.path.getsize(db_fpath + '.nsq') >= db_nsq_fsize:
        return True
    log_fpath = os.path.join(blastdb_dirpath, 'blastdb.log')
    db_gz_fpath = os.path.join(blastdb_dirpath, silva_fname + '.gz')
    silva_fpath = os.path.join(blastdb_dirpath, silva_fname)
    logger.info()
    if os.path.isfile(db_gz_fpath):
        logger.info('SILVA 16S ribosomal RNA gene database has already been downloaded.')
    else:
        logger.info('Downloading SILVA 16S ribosomal RNA gene database...')
        if not os.path.isdir(blastdb_dirpath):
            os.makedirs(blastdb_dirpath)
        # NOTE(review): FancyURLopener is a Py2-style API; presumably `urllib`
        # here is a compatibility alias set up at module level — verify.
        silva_download = urllib.FancyURLopener()
        silva_remote_fpath = silva_db_url + silva_fname + '.gz'
        try:
            silva_download.retrieve(silva_remote_fpath, db_gz_fpath + '.download', show_progress)
        except Exception:
            logger.error('Failed downloading SILVA 16S rRNA gene database (%s)! The search for reference genomes cannot be performed. '
                         'Try to download it manually in %s and restart your command.' % (silva_remote_fpath, blastdb_dirpath))
            return False
        # Download to a temp name first so a partial file never looks complete.
        shutil.move(db_gz_fpath + '.download', db_gz_fpath)
    logger.info('Processing downloaded file. Logging to %s...' % log_fpath)
    if not os.path.isfile(silva_fpath):
        # Unpack, then sanitize FASTA headers: BLAST truncates names at spaces.
        logger.info('Unpacking and replacing " " with "_"...')
        unpacked_fpath = silva_fpath + ".unpacked"
        cmd = "gunzip -c %s" % db_gz_fpath
        qutils.call_subprocess(shlex.split(cmd), stdout=open(unpacked_fpath, 'w'), stderr=open(log_fpath, 'a'), logger=logger)
        substituted_fpath = silva_fpath + ".substituted"
        with open(unpacked_fpath) as in_file:
            with open(substituted_fpath, 'w') as out_file:
                for line in in_file:
                    out_file.write(line.replace(' ', '_'))
        os.remove(unpacked_fpath)
        shutil.move(substituted_fpath, silva_fpath)
    logger.info('Making BLAST database...')
    cmd = get_blast_fpath('makeblastdb') + (' -in %s -dbtype nucl -out %s' % (silva_fpath, db_fpath))
    qutils.call_subprocess(shlex.split(cmd), stdout=open(log_fpath, 'a'), stderr=open(log_fpath, 'a'), logger=logger)
    # Validate the build by checking the .nsq index reached the expected size.
    if not os.path.exists(db_fpath + '.nsq') or os.path.getsize(db_fpath + '.nsq') < db_nsq_fsize:
        logger.error('Failed to make BLAST database ("' + blastdb_dirpath +
                     '"). See details in log. Try to make it manually: %s' % cmd)
        return False
    elif not qconfig.debug:
        # Success: the intermediate files are no longer needed.
        os.remove(db_gz_fpath)
        os.remove(silva_fpath)
    return True
def do(output_dir, ref_fpath, contigs_fpaths, logger):
    """Run the unique k-mer based completeness/correctness analysis via KMC.

    For each assembly: compute k-mer completeness against the reference, then
    (if the reference is not too fragmented) classify contigs as correct or
    misassembled by tracking reference positions of downsampled marker k-mers
    and counting translocations/relocations. Results are cached in per-label
    .stat files and reused on subsequent runs.
    """
    logger.print_timestamp()
    kmer_len = qconfig.unique_kmer_len
    logger.main_info('Running analysis based on unique ' + str(kmer_len) + '-mers...')
    checked_assemblies = []
    # Reuse cached per-assembly stats when a previous run completed.
    for contigs_fpath in contigs_fpaths:
        label = qutils.label_from_fpath_for_fname(contigs_fpath)
        if check_kmc_successful_check(output_dir, contigs_fpath, contigs_fpaths, ref_fpath):
            kmc_stats_fpath = join(output_dir, label + '.stat')
            stats_content = open(kmc_stats_fpath).read().split('\n')
            if len(stats_content) < 1:
                continue
            logger.info(' Using existing results for ' + label + '... ')
            report = reporting.get(contigs_fpath)
            report.add_field(reporting.Fields.KMER_COMPLETENESS,
                             '%.2f' % float(stats_content[0].strip().split(': ')[-1]))
            # The full 7-line cache also carries correctness metrics.
            if len(stats_content) >= 7:
                corr_len = int(stats_content[1].strip().split(': ')[-1])
                mis_len = int(stats_content[2].strip().split(': ')[-1])
                undef_len = int(stats_content[3].strip().split(': ')[-1])
                total_len = int(stats_content[4].strip().split(': ')[-1])
                translocations = int(stats_content[5].strip().split(': ')[-1])
                relocations = int(stats_content[6].strip().split(': ')[-1])
                report.add_field(reporting.Fields.KMER_CORR_LENGTH, '%.2f' % (corr_len * 100.0 / total_len))
                report.add_field(reporting.Fields.KMER_MIS_LENGTH, '%.2f' % (mis_len * 100.0 / total_len))
                report.add_field(reporting.Fields.KMER_UNDEF_LENGTH, '%.2f' % (undef_len * 100.0 / total_len))
                report.add_field(reporting.Fields.KMER_TRANSLOCATIONS, translocations)
                report.add_field(reporting.Fields.KMER_RELOCATIONS, relocations)
                report.add_field(reporting.Fields.KMER_MISASSEMBLIES, translocations + relocations)
            checked_assemblies.append(contigs_fpath)
    contigs_fpaths = [fpath for fpath in contigs_fpaths if fpath not in checked_assemblies]
    if len(contigs_fpaths) == 0:
        save_kmers(output_dir)
        logger.info('Done.')
        return
    if qconfig.platform_name == 'linux_32':
        logger.warning(' Sorry, can\'t run KMC on this platform, skipping...')
        return None
    # Install the KMC binaries (kmc + kmc_tools) and make sure minimap builds.
    kmc_dirpath = get_dir_for_download(kmc_dirname, 'KMC', ['kmc', 'kmc_tools'], logger)
    global kmc_bin_fpath
    global kmc_tools_fpath
    kmc_bin_fpath = download_external_tool('kmc', kmc_dirpath, 'KMC', platform_specific=True, is_executable=True)
    kmc_tools_fpath = download_external_tool('kmc_tools', kmc_dirpath, 'KMC', platform_specific=True, is_executable=True)
    if not exists(kmc_bin_fpath) or not exists(kmc_tools_fpath) or not compile_minimap(logger):
        logger.warning(' Sorry, can\'t run KMC, skipping...')
        return None
    logger.info(' Running KMC on reference...')
    if not isdir(output_dir):
        os.makedirs(output_dir)
    log_fpath = join(output_dir, 'kmc.log')
    err_fpath = join(output_dir, 'kmc.err')
    open(log_fpath, 'w').close()
    open(err_fpath, 'w').close()
    tmp_dirpath = join(output_dir, 'tmp')
    if not isdir(tmp_dirpath):
        os.makedirs(tmp_dirpath)
    ref_kmc_out_fpath = count_kmers(tmp_dirpath, ref_fpath, kmer_len, log_fpath, err_fpath)
    unique_kmers = get_kmers_cnt(tmp_dirpath, ref_kmc_out_fpath, log_fpath, err_fpath)
    if not unique_kmers:
        logger.warning('KMC failed, check ' + log_fpath + ' and ' + err_fpath + '. Skipping...')
        return
    logger.info(' Analyzing assemblies completeness...')
    kmc_out_fpaths = []
    for id, contigs_fpath in enumerate(contigs_fpaths):
        assembly_label = qutils.label_from_fpath(contigs_fpath)
        logger.info(' ' + qutils.index_to_str(id) + assembly_label)
        report = reporting.get(contigs_fpath)
        kmc_out_fpath = count_kmers(tmp_dirpath, contigs_fpath, kmer_len, log_fpath, err_fpath)
        intersect_out_fpath = intersect_kmers(tmp_dirpath, [ref_kmc_out_fpath, kmc_out_fpath], log_fpath, err_fpath)
        matched_kmers = get_kmers_cnt(tmp_dirpath, intersect_out_fpath, log_fpath, err_fpath)
        # Completeness = % of reference unique k-mers found in the assembly.
        completeness = matched_kmers * 100.0 / unique_kmers
        report.add_field(reporting.Fields.KMER_COMPLETENESS, '%.2f' % completeness)
        kmc_out_fpaths.append(intersect_out_fpath)
    logger.info(' Analyzing assemblies correctness...')
    ref_contigs = [name for name, _ in read_fasta(ref_fpath)]
    logger.info(' Downsampling k-mers...')
    # Keep a subset of reference k-mers as position markers.
    ref_kmers, downsampled_kmers_fpath = downsample_kmers(tmp_dirpath, ref_fpath, ref_kmc_out_fpath,
                                                          kmer_len, log_fpath, err_fpath)
    for id, (contigs_fpath, kmc_db_fpath) in enumerate(zip(contigs_fpaths, kmc_out_fpaths)):
        assembly_label = qutils.label_from_fpath(contigs_fpath)
        logger.info(' ' + qutils.index_to_str(id) + assembly_label)
        report = reporting.get(contigs_fpath)
        corr_len = None
        mis_len = None
        undef_len = None
        translocations, relocations = None, None
        total_len = 0
        contig_lens = dict()
        for name, seq in read_fasta(contigs_fpath):
            total_len += len(seq)
            contig_lens[name] = len(seq)
        if len(ref_contigs) > MAX_REF_CONTIGS_NUM:
            logger.warning('Reference is too fragmented. Scaffolding accuracy will not be assessed.')
        else:
            corr_len = 0
            mis_len = 0
            # Locate marker k-mers in the assembly contigs.
            kmers_by_contig, kmers_pos_by_contig = align_kmers(tmp_dirpath, contigs_fpath, downsampled_kmers_fpath,
                                                               err_fpath, qconfig.max_threads)
            is_cyclic = qconfig.prokaryote and not qconfig.check_for_fragmented_ref
            cyclic_ref_lens = report.get_field(reporting.Fields.REFLEN) if is_cyclic else None
            translocations = 0
            relocations = 0
            with open(join(tmp_dirpath, qutils.label_from_fpath_for_fname(contigs_fpath) + '.misjoins.txt'), 'w') as out:
                for contig in kmers_by_contig.keys():
                    # Pass 1: collapse consecutive colinear k-mer hits (distance
                    # ratio within 5%) into representative markers.
                    contig_markers = []
                    prev_pos, prev_ref_pos, prev_chrom, marker = None, None, None, None
                    for pos, kmer in sorted(zip(kmers_pos_by_contig[contig], kmers_by_contig[contig]), key=lambda x: x[0]):
                        ref_chrom, ref_pos = ref_kmers[kmer]
                        if prev_pos and prev_chrom:
                            if prev_chrom == ref_chrom and abs(abs(pos - prev_pos) / abs(ref_pos - prev_ref_pos) - 1) <= 0.05:
                                marker = (pos, ref_pos, ref_chrom)
                            elif marker:
                                contig_markers.append(marker)
                                pos, ref_pos, ref_chrom, marker = None, None, None, None
                        prev_pos, prev_ref_pos, prev_chrom = pos, ref_pos, ref_chrom
                    if marker:
                        contig_markers.append(marker)
                    # Pass 2: compare successive markers to detect misjoins.
                    prev_pos, prev_ref_pos, prev_chrom = None, None, None
                    is_misassembled = False
                    for marker in contig_markers:
                        pos, ref_pos, ref_chrom = marker
                        if prev_pos and prev_chrom:
                            if ref_chrom != prev_chrom:
                                # Markers map to different reference chromosomes.
                                translocations += 1
                                out.write('Translocation in %s: %s %d | %s %d\n' %
                                          (contig, prev_chrom, prev_pos, ref_chrom, pos))
                                is_misassembled = True
                            elif _get_dist_inconstistency(pos, prev_pos, ref_pos, prev_ref_pos,
                                                         cyclic_ref_lens) > EXT_RELOCATION_SIZE:
                                # Same chromosome but inconsistent distances.
                                relocations += 1
                                out.write('Relocation in %s: %d (%d) | %d (%d)\n' %
                                          (contig, prev_pos, prev_ref_pos, pos, ref_pos))
                                is_misassembled = True
                        prev_pos, prev_ref_pos, prev_chrom = pos, ref_pos, ref_chrom
                    if is_misassembled:
                        mis_len += contig_lens[contig]
                    elif len(contig_markers) > 0:
                        corr_len += contig_lens[contig]
            # Contigs with no markers at all remain unclassified.
            undef_len = total_len - corr_len - mis_len
            report.add_field(reporting.Fields.KMER_CORR_LENGTH, '%.2f' % (corr_len * 100.0 / total_len))
            report.add_field(reporting.Fields.KMER_MIS_LENGTH, '%.2f' % (mis_len * 100.0 / total_len))
            report.add_field(reporting.Fields.KMER_UNDEF_LENGTH, '%.2f' % (undef_len * 100.0 / total_len))
            report.add_field(reporting.Fields.KMER_TRANSLOCATIONS, translocations)
            report.add_field(reporting.Fields.KMER_RELOCATIONS, relocations)
            report.add_field(reporting.Fields.KMER_MISASSEMBLIES, translocations + relocations)
        # Persist stats so the next run can reuse them (values may be None
        # when scaffolding accuracy was skipped).
        create_kmc_stats_file(output_dir, contigs_fpath, ref_fpath,
                              report.get_field(reporting.Fields.KMER_COMPLETENESS),
                              corr_len, mis_len, undef_len, total_len, translocations, relocations)
    save_kmers(output_dir)
    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)
    logger.info('Done.')
def do(output_dir, ref_fpath, contigs_fpaths, logger):
    """Run the unique k-mer based analysis via KMC (scaffold-assignment variant).

    For each assembly: compute k-mer completeness against the reference, then
    (when neither assembly nor reference is too fragmented) assign each long
    contig to reference chromosomes by marker k-mers, reporting the fraction
    of length mapping to one / multiple / no chromosome. Per-assembly stats
    are cached in .stat files and reused on subsequent runs.
    """
    logger.print_timestamp()
    logger.main_info('Running analysis based on unique ' + str(KMERS_LEN) + '-mers...')
    checked_assemblies = []
    # Reuse cached per-assembly stats when a previous run completed.
    for contigs_fpath in contigs_fpaths:
        label = qutils.label_from_fpath_for_fname(contigs_fpath)
        if check_kmc_successful_check(output_dir, contigs_fpath, contigs_fpaths, ref_fpath):
            kmc_stats_fpath = join(output_dir, label + '.stat')
            stats_content = open(kmc_stats_fpath).read().split('\n')
            if len(stats_content) < 1:
                continue
            logger.info(' Using existing results for ' + label + '... ')
            report = reporting.get(contigs_fpath)
            report.add_field(reporting.Fields.KMER_COMPLETENESS,
                             '%.2f' % float(stats_content[0].strip().split(': ')[-1]))
            # The full 5-line cache also carries scaffold-assignment metrics.
            if len(stats_content) >= 5:
                len_map_to_one_chrom = int(stats_content[1].strip().split(': ')[-1])
                len_map_to_multi_chrom = int(stats_content[2].strip().split(': ')[-1])
                len_map_to_none_chrom = int(stats_content[3].strip().split(': ')[-1])
                total_len = int(stats_content[4].strip().split(': ')[-1])
                report.add_field(reporting.Fields.KMER_SCAFFOLDS_ONE_CHROM,
                                 '%.2f' % (len_map_to_one_chrom * 100.0 / total_len))
                report.add_field(reporting.Fields.KMER_SCAFFOLDS_MULTI_CHROM,
                                 '%.2f' % (len_map_to_multi_chrom * 100.0 / total_len))
                report.add_field(reporting.Fields.KMER_SCAFFOLDS_NONE_CHROM,
                                 '%.2f' % (len_map_to_none_chrom * 100.0 / total_len))
            checked_assemblies.append(contigs_fpath)
    contigs_fpaths = [fpath for fpath in contigs_fpaths if fpath not in checked_assemblies]
    if len(contigs_fpaths) == 0:
        logger.info('Done.')
        return
    if qconfig.platform_name == 'linux_32':
        logger.warning(' Sorry, can\'t run KMC on this platform, skipping...')
        return None
    # Install the KMC binaries (kmc + kmc_tools) and make sure minimap builds.
    kmc_dirpath = get_dir_for_download(kmc_dirname, 'KMC', ['kmc', 'kmc_tools'], logger)
    global kmc_bin_fpath
    global kmc_tools_fpath
    kmc_bin_fpath = download_external_tool('kmc', kmc_dirpath, 'KMC', platform_specific=True, is_executable=True)
    kmc_tools_fpath = download_external_tool('kmc_tools', kmc_dirpath, 'KMC', platform_specific=True, is_executable=True)
    if not exists(kmc_bin_fpath) or not exists(kmc_tools_fpath) or not compile_minimap(logger):
        logger.warning(' Sorry, can\'t run KMC, skipping...')
        return None
    logger.info('Running KMC on reference...')
    log_fpath = join(output_dir, 'kmc.log')
    err_fpath = join(output_dir, 'kmc.err')
    open(log_fpath, 'w').close()
    open(err_fpath, 'w').close()
    tmp_dirpath = join(output_dir, 'tmp')
    if not isdir(tmp_dirpath):
        os.makedirs(tmp_dirpath)
    ref_kmc_out_fpath = count_kmers(tmp_dirpath, ref_fpath, log_fpath, err_fpath)
    unique_kmers = get_kmers_cnt(tmp_dirpath, ref_kmc_out_fpath, log_fpath, err_fpath)
    if not unique_kmers:
        return
    logger.info('Analyzing assemblies completeness...')
    kmc_out_fpaths = []
    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        kmc_out_fpath = count_kmers(tmp_dirpath, contigs_fpath, log_fpath, err_fpath)
        intersect_out_fpath = intersect_kmers(tmp_dirpath, [ref_kmc_out_fpath, kmc_out_fpath], log_fpath, err_fpath)
        matched_kmers = get_kmers_cnt(tmp_dirpath, intersect_out_fpath, log_fpath, err_fpath)
        # Completeness = % of reference unique k-mers found in the assembly.
        completeness = matched_kmers * 100.0 / unique_kmers
        report.add_field(reporting.Fields.KMER_COMPLETENESS, '%.2f' % completeness)
        kmc_out_fpaths.append(intersect_out_fpath)
    logger.info('Analyzing assemblies accuracy...')
    # Use only k-mers shared by the reference and every assembly as markers.
    if len(kmc_out_fpaths) > 1:
        shared_kmc_db = intersect_kmers(tmp_dirpath, kmc_out_fpaths, log_fpath, err_fpath)
    else:
        shared_kmc_db = kmc_out_fpaths[0]
    kmer_fraction = 0.001
    ref_contigs = [name for name, _ in read_fasta(ref_fpath)]
    ref_kmc_dbs = []
    if len(ref_contigs) <= MAX_REF_CONTIGS_NUM:
        # Build one downsampled marker DB per reference chromosome.
        shared_downsampled_kmc_db = downsample_kmers(tmp_dirpath, ref_fpath, shared_kmc_db,
                                                     log_fpath, err_fpath, kmer_fraction=kmer_fraction)
        for name, seq in read_fasta(ref_fpath):
            seq_kmc_db = seq_to_kmc_db(tmp_dirpath, log_fpath, err_fpath, seq=seq, name=name,
                                       is_ref=True, intersect_with=shared_downsampled_kmc_db)
            ref_kmc_dbs.append((name, seq_kmc_db))
    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        len_map_to_one_chrom = None
        len_map_to_multi_chrom = None
        len_map_to_none_chrom = None
        total_len = 0
        long_contigs = []
        contig_lens = dict()
        contig_markers = defaultdict(list)
        label = qutils.label_from_fpath_for_fname(contigs_fpath)
        list_files_fpath = join(tmp_dirpath, label + '_files.txt')
        # Write each long contig to its own FASTA and list them in one file
        # (the list is later fed to filter_contigs).
        with open(list_files_fpath, 'w') as list_files:
            for name, seq in read_fasta(contigs_fpath):
                total_len += len(seq)
                contig_lens[name] = len(seq)
                if len(seq) >= MIN_CONTIGS_LEN:
                    long_contigs.append(len(seq))
                    tmp_contig_fpath = join(tmp_dirpath, name + '.fasta')
                    with open(tmp_contig_fpath, 'w') as out_f:
                        out_f.write('>%s\n' % name)
                        out_f.write('%s\n' % seq)
                    list_files.write(tmp_contig_fpath + '\n')
        if len(long_contigs) > MAX_CONTIGS_NUM or sum(long_contigs) < total_len * 0.5:
            logger.warning('Assembly is too fragmented. Scaffolding accuracy will not be assessed.')
        elif len(ref_contigs) > MAX_REF_CONTIGS_NUM:
            logger.warning('Reference is too fragmented. Scaffolding accuracy will not be assessed.')
        else:
            len_map_to_one_chrom = 0
            len_map_to_multi_chrom = 0
            # Keep only contigs carrying enough marker k-mers overall...
            filtered_fpath = join(tmp_dirpath, label + '.filtered.fasta')
            filter_contigs(list_files_fpath, filtered_fpath, shared_kmc_db, log_fpath, err_fpath, min_kmers=MIN_MARKERS)
            filtered_list_files_fpath = join(tmp_dirpath, label + '_files.filtered.txt')
            with open(filtered_list_files_fpath, 'w') as list_files:
                for name, _ in read_fasta(filtered_fpath):
                    tmp_contig_fpath = join(tmp_dirpath, name + '.fasta')
                    list_files.write(tmp_contig_fpath + '\n')
            # ...then record which reference chromosomes each contig matches.
            for ref_name, ref_kmc_db in ref_kmc_dbs:
                tmp_filtered_fpath = join(tmp_dirpath, ref_name + '.filtered.fasta')
                filter_contigs(filtered_list_files_fpath, tmp_filtered_fpath, ref_kmc_db,
                               log_fpath, err_fpath, min_kmers=MIN_MISJOIN_MARKERS)
                if exists(tmp_filtered_fpath):
                    for name, _ in read_fasta(tmp_filtered_fpath):
                        contig_markers[name].append(ref_name)
            for name, chr_markers in contig_markers.items():
                if len(chr_markers) == 1:
                    len_map_to_one_chrom += contig_lens[name]
                else:
                    len_map_to_multi_chrom += contig_lens[name]
            len_map_to_none_chrom = total_len - len_map_to_one_chrom - len_map_to_multi_chrom
            report.add_field(reporting.Fields.KMER_SCAFFOLDS_ONE_CHROM,
                             '%.2f' % (len_map_to_one_chrom * 100.0 / total_len))
            report.add_field(reporting.Fields.KMER_SCAFFOLDS_MULTI_CHROM,
                             '%.2f' % (len_map_to_multi_chrom * 100.0 / total_len))
            report.add_field(reporting.Fields.KMER_SCAFFOLDS_NONE_CHROM,
                             '%.2f' % (len_map_to_none_chrom * 100.0 / total_len))
        # Persist stats so the next run can reuse them (values may be None
        # when scaffolding accuracy was skipped).
        create_kmc_stats_file(output_dir, contigs_fpath, contigs_fpaths, ref_fpath,
                              report.get_field(reporting.Fields.KMER_COMPLETENESS),
                              len_map_to_one_chrom, len_map_to_multi_chrom, len_map_to_none_chrom, total_len)
    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)
    logger.info('Done.')
def do(ref_fpath, original_ref_fpath, output_dirpath):
    """Simulate a "theoretically optimal" assembly of the reference genome.

    Splits the reference into the contigs an ideal assembler could produce
    given the available reads (coverage gaps and repeats break contigs;
    mate pairs / long reads may scaffold across repeats), and writes the
    result as a FASTA file.

    :param ref_fpath: path to the (possibly preprocessed) reference FASTA.
    :param original_ref_fpath: path to the user-supplied reference FASTA
        (used to look up / advertise a reusable precomputed result).
    :param output_dirpath: directory where the result FASTA and temporary
        files are written.
    :return: path to the generated (or reused) optimal-assembly FASTA,
        or None if generation is impossible or failed.
    """
    logger.print_timestamp()
    logger.main_info("Simulating Optimal Assembly...")

    uncovered_fpath = None
    reads_analyzer_dir = join(dirname(output_dirpath), qconfig.reads_stats_dirname)
    # If reads (or a precomputed alignment) are available, align them to the
    # reference to find regions with no read coverage.
    if qconfig.reads_fpaths or qconfig.reference_sam or qconfig.reference_bam:
        sam_fpath, bam_fpath, uncovered_fpath = reads_analyzer.align_reference(
            ref_fpath, reads_analyzer_dir, using_reads='all', calculate_coverage=True)

    # Insert size drives the repeat-length threshold; fall back to the
    # project default when not set or set to 'auto'.
    insert_size = qconfig.optimal_assembly_insert_size
    if insert_size == 'auto' or not insert_size:
        insert_size = qconfig.optimal_assembly_default_IS

    # Result file name encodes the insert size and, when applicable, the
    # read type used for scaffolding ("polishing") across repeats.
    ref_basename, fasta_ext = splitext_for_fasta_file(os.path.basename(ref_fpath))
    result_basename = '%s.%s.is%d.fasta' % (
        ref_basename, qconfig.optimal_assembly_basename, insert_size)
    long_reads = qconfig.pacbio_reads or qconfig.nanopore_reads
    if long_reads:
        result_basename = add_suffix(result_basename, long_reads_polished_suffix)
    elif qconfig.mate_pairs:
        result_basename = add_suffix(result_basename, mp_polished_suffix)
    result_fpath = os.path.join(output_dirpath, result_basename)

    # A precomputed optimal assembly may have been placed next to the
    # original reference; prefer reusing it (or a previous run's result).
    # NOTE(review): fasta_ext is rebound here and never used — looks like
    # only the basename is needed from splitext_for_fasta_file.
    original_ref_basename, fasta_ext = splitext_for_fasta_file(
        os.path.basename(original_ref_fpath))
    prepared_optimal_assembly_basename = '%s.%s.is%d.fasta' % (
        original_ref_basename, qconfig.optimal_assembly_basename, insert_size)
    ref_prepared_optimal_assembly = os.path.join(
        os.path.dirname(original_ref_fpath), prepared_optimal_assembly_basename)
    if os.path.isfile(result_fpath) or os.path.isfile(ref_prepared_optimal_assembly):
        already_done_fpath = result_fpath if os.path.isfile(
            result_fpath) else ref_prepared_optimal_assembly
        logger.notice(
            ' Will reuse already generated Optimal Assembly with insert size %d (%s)'
            % (insert_size, already_done_fpath))
        return already_done_fpath

    # The Red repeat finder binary is unavailable for 32-bit Linux.
    if qconfig.platform_name == 'linux_32':
        logger.warning(
            ' Sorry, can\'t create Optimal Assembly on this platform, skipping...'
        )
        return None

    # Fetch the Red repeat-detection tool (downloaded on demand).
    red_dirpath = get_dir_for_download('red', 'Red', ['Red'], logger)
    binary_fpath = download_external_tool('Red', red_dirpath, 'red',
                                          platform_specific=True, is_executable=True)
    if not binary_fpath or not os.path.isfile(binary_fpath):
        logger.warning(' Sorry, can\'t create Optimal Assembly, skipping...')
        return None

    log_fpath = os.path.join(output_dirpath, 'optimal_assembly.log')
    tmp_dir = os.path.join(output_dirpath, 'tmp')
    # Start from a clean temporary directory.
    if os.path.isdir(tmp_dir):
        shutil.rmtree(tmp_dir)
    os.makedirs(tmp_dir)

    # Compute reference regions that are uniquely assemblable (not repeats
    # longer than the insert size, not uncovered by reads).
    unique_covered_regions, repeats_regions = get_unique_covered_regions(
        ref_fpath, tmp_dir, log_fpath, binary_fpath, insert_size, uncovered_fpath)
    if unique_covered_regions is None:
        logger.error(
            ' Failed to create Optimal Assembly, see log for details: ' + log_fpath)
        return None

    reference = list(fastaparser.read_fasta(ref_fpath))
    result_fasta = []

    if long_reads or qconfig.mate_pairs:
        # Scaffolding mode: use long reads or mate pairs ("joiners") to
        # connect unique regions across repeats.
        if long_reads:
            join_reads = 'pacbio' if qconfig.pacbio_reads else 'nanopore'
        else:
            join_reads = 'mp'
        sam_fpath, bam_fpath, _ = reads_analyzer.align_reference(
            ref_fpath, reads_analyzer_dir, using_reads=join_reads)
        joiners = get_joiners(qutils.name_from_fpath(ref_fpath), sam_fpath,
                              bam_fpath, tmp_dir, log_fpath, join_reads)
        # Uncovered regions matter only for mate pairs; long reads span gaps.
        uncovered_regions = parse_uncovered_fpath(
            uncovered_fpath, ref_fpath, return_covered_regions=False
        ) if join_reads == 'mp' else defaultdict(list)
        mp_len = calculate_read_len(sam_fpath) if join_reads == 'mp' else None
        for chrom, seq in reference:
            region_pairing = get_regions_pairing(unique_covered_regions[chrom],
                                                 joiners[chrom], mp_len)
            ref_coords_to_output = scaffolding(unique_covered_regions[chrom],
                                               region_pairing)
            get_fasta_entries_from_coords(result_fasta, (chrom, seq),
                                          ref_coords_to_output,
                                          repeats_regions[chrom],
                                          uncovered_regions[chrom])
    else:
        # No joining reads: each sufficiently long unique region becomes
        # its own contig, named <chromosome>_<region index>.
        for chrom, seq in reference:
            for idx, region in enumerate(unique_covered_regions[chrom]):
                if region[1] - region[0] >= MIN_CONTIG_LEN:
                    result_fasta.append(
                        (chrom + '_' + str(idx), seq[region[0]:region[1]]))

    fastaparser.write_fasta(result_fpath, result_fasta)
    logger.info('  ' + 'Theoretically optimal Assembly saved to ' + result_fpath)
    logger.notice(
        'You can copy it to ' + ref_prepared_optimal_assembly +
        ' and QUAST will reuse it in further runs against the same reference (' +
        original_ref_fpath + ')')
    # Keep temporary files only in debug mode.
    if not qconfig.debug:
        shutil.rmtree(tmp_dir)
    logger.main_info('Done.')
    return result_fpath
def download_blastdb(logger=logger, only_clean=False):
    """Download the SILVA 16S rRNA gene database and build a BLAST db from it.

    Sets the module-level ``blastdb_dirpath`` and ``db_fpath`` globals.
    Skips any step whose result already exists on disk (downloaded archive,
    unpacked FASTA, built BLAST database), so the function is resumable.

    :param logger: logger to report progress/errors to (defaults to the
        module logger).
    :param only_clean: if True, remove the local database directory and exit.
    :return: True on success (or successful cleanup), False on failure.
    """
    global blastdb_dirpath
    blastdb_dirpath = get_dir_for_download('silva', 'Silva',
                                           [silva_downloaded_fname + '.nsq'],
                                           logger, only_clean=only_clean)
    if not blastdb_dirpath:
        return False

    if only_clean:
        # Cleanup mode: drop the whole local database directory.
        if os.path.isdir(blastdb_dirpath):
            logger.info('Removing ' + blastdb_dirpath)
            shutil.rmtree(blastdb_dirpath)
        return True

    global db_fpath
    db_fpath = join(blastdb_dirpath, silva_downloaded_fname)
    # Reuse a previously built BLAST database if its .nsq file looks complete
    # (size is checked against the expected minimum).
    if os.path.isfile(db_fpath + '.nsq') and os.path.getsize(db_fpath + '.nsq') >= db_nsq_fsize:
        return True

    log_fpath = os.path.join(blastdb_dirpath, 'blastdb.log')
    db_gz_fpath = os.path.join(blastdb_dirpath, silva_fname + '.gz')
    silva_fpath = os.path.join(blastdb_dirpath, silva_fname)

    logger.info()
    if os.path.isfile(db_gz_fpath):
        logger.info('SILVA 16S ribosomal RNA gene database has already been downloaded.')
    else:
        logger.info('Downloading SILVA 16S ribosomal RNA gene database...')
        if not os.path.isdir(blastdb_dirpath):
            os.makedirs(blastdb_dirpath)
        # NOTE(review): FancyURLopener is a legacy (Python 2 style) API —
        # presumably `urllib` here is a compat alias imported elsewhere in
        # this file; confirm before porting.
        silva_download = urllib.FancyURLopener()
        silva_remote_fpath = silva_db_url + silva_fname + '.gz'
        try:
            # Download to a temp name first so an interrupted download is not
            # mistaken for a complete archive on the next run.
            silva_download.retrieve(silva_remote_fpath, db_gz_fpath + '.download', show_progress)
        except Exception:
            logger.error(
                'Failed downloading SILVA 16S rRNA gene database (%s)! The search for reference genomes cannot be performed. '
                'Try to download it manually in %s and restart your command.' % (silva_remote_fpath, blastdb_dirpath))
            return False
        shutil.move(db_gz_fpath + '.download', db_gz_fpath)

    logger.info('Processing downloaded file. Logging to %s...' % log_fpath)
    if not os.path.isfile(silva_fpath):
        # BLAST headers must not contain spaces, so replace them while unpacking.
        logger.info('Unpacking and replacing " " with "_"...')
        unpacked_fpath = silva_fpath + ".unpacked"
        cmd = "gunzip -c %s" % db_gz_fpath
        # Fix: open output/log files via context managers so the handles are
        # flushed and closed before the unpacked file is read back below
        # (the originals were opened inline and leaked).
        with open(unpacked_fpath, 'w') as unpacked_file, open(log_fpath, 'a') as err_file:
            qutils.call_subprocess(shlex.split(cmd), stdout=unpacked_file,
                                   stderr=err_file, logger=logger)
        substituted_fpath = silva_fpath + ".substituted"
        with open(unpacked_fpath) as in_file:
            with open(substituted_fpath, 'w') as out_file:
                for line in in_file:
                    out_file.write(line.replace(' ', '_'))
        os.remove(unpacked_fpath)
        shutil.move(substituted_fpath, silva_fpath)

    logger.info('Making BLAST database...')
    cmd = get_blast_fpath('makeblastdb') + (' -in %s -dbtype nucl -out %s' % (silva_fpath, db_fpath))
    # One shared handle for stdout and stderr: avoids interleaving two
    # independent append streams into the same log file.
    with open(log_fpath, 'a') as log_file:
        qutils.call_subprocess(shlex.split(cmd), stdout=log_file,
                               stderr=log_file, logger=logger)
    if not os.path.exists(db_fpath + '.nsq') or os.path.getsize(db_fpath + '.nsq') < db_nsq_fsize:
        logger.error('Failed to make BLAST database ("' + blastdb_dirpath +
                     '"). See details in log. Try to make it manually: %s' % cmd)
        return False
    elif not qconfig.debug:
        # Success: remove intermediates unless debugging.
        os.remove(db_gz_fpath)
        os.remove(silva_fpath)
    return True