def do(ref_fpath, original_ref_fpath, output_dirpath):
    """Generate the Upper Bound Assembly for a reference and return its path.

    Steps, in order:
      1. Bail out (returning None) if the read processing tools cannot be
         compiled, the platform is 'linux_32', or the Red binary is missing.
      2. Build the output file name from the reference name, the insert size
         and the kind of reads used for polishing, and reuse an already
         generated assembly if check_prepared_optimal_assembly() finds one.
      3. Align reads to the reference (when reads or alignments are given)
         and re-check for a reusable assembly if the configured insert size
         differs from the one used so far.
      4. Extract unique covered regions, optionally scaffold them with long
         reads or mate-pairs, and write the resulting FASTA.

    Args:
        ref_fpath: path to the reference FASTA that is processed.
        original_ref_fpath: path whose directory is searched for a prepared
            assembly (presumably the reference as supplied by the user --
            confirm against the caller).
        output_dirpath: directory receiving the result FASTA, the log file
            'upper_bound_assembly.log' and a temporary 'tmp' directory.

    Returns:
        Path to the Upper Bound Assembly (new or reused), or None on failure.
    """
    logger.print_timestamp()
    logger.main_info("Generating Upper Bound Assembly...")

    if not reads_analyzer.compile_reads_analyzer_tools(logger):
        logger.warning(
            ' Sorry, can\'t create Upper Bound Assembly '
            '(failed to compile necessary third-party read processing tools [bwa, bedtools, minimap2]), skipping...')
        return None

    if qconfig.platform_name == 'linux_32':
        logger.warning(
            ' Sorry, can\'t create Upper Bound Assembly on this platform '
            '(only linux64 and macOS are supported), skipping...')
        return None

    red_dirpath = get_dir_for_download('red', 'Red', ['Red'], logger)
    binary_fpath = download_external_tool('Red', red_dirpath, 'red', platform_specific=True, is_executable=True)
    if not binary_fpath or not os.path.isfile(binary_fpath):
        logger.warning(
            ' Sorry, can\'t create Upper Bound Assembly '
            '(failed to install/download third-party repeat finding tool [Red]), skipping...')
        return None

    # A falsy or 'auto' configured value means "not specified": use the default.
    insert_size = qconfig.optimal_assembly_insert_size
    if insert_size == 'auto' or not insert_size:
        insert_size = qconfig.optimal_assembly_default_IS

    long_reads = qconfig.pacbio_reads or qconfig.nanopore_reads

    def _assembly_basename(ref_stem, size):
        # '<ref>.<basename>.is<size>.fasta', tagged with the polishing read type
        # so assemblies built from different read sets never share a name.
        basename = '%s.%s.is%d.fasta' % (ref_stem, qconfig.optimal_assembly_basename, size)
        if long_reads:
            return add_suffix(basename, long_reads_polished_suffix)
        if qconfig.mate_pairs:
            return add_suffix(basename, mp_polished_suffix)
        return basename

    ref_basename, _ = splitext_for_fasta_file(os.path.basename(ref_fpath))
    result_basename = _assembly_basename(ref_basename, insert_size)
    result_fpath = os.path.join(output_dirpath, result_basename)

    original_ref_basename, _ = splitext_for_fasta_file(os.path.basename(original_ref_fpath))
    prepared_optimal_assembly_basename = _assembly_basename(original_ref_basename, insert_size)
    ref_prepared_optimal_assembly = os.path.join(os.path.dirname(original_ref_fpath),
                                                 prepared_optimal_assembly_basename)

    already_done_fpath = check_prepared_optimal_assembly(insert_size, result_fpath, ref_prepared_optimal_assembly)
    if already_done_fpath:
        return already_done_fpath

    uncovered_fpath = None
    reads_analyzer_dir = join(dirname(output_dirpath), qconfig.reads_stats_dirname)
    if qconfig.reads_fpaths or qconfig.reference_sam or qconfig.reference_bam:
        _, _, uncovered_fpath = reads_analyzer.align_reference(
            ref_fpath, reads_analyzer_dir, using_reads='all', calculate_coverage=True)

    # NOTE(review): presumably align_reference() may store an insert size
    # estimated from the reads in qconfig.optimal_assembly_insert_size -- confirm.
    # A falsy value is treated as "not specified", exactly as in the check above;
    # otherwise names like '...isNone...' would be produced.
    configured_insert_size = qconfig.optimal_assembly_insert_size
    if configured_insert_size and configured_insert_size != 'auto' and configured_insert_size != insert_size:
        old_tag = 'is' + str(insert_size)
        new_tag = 'is' + str(configured_insert_size)
        # Rewrite the tag in the file names only. Replacing it in the full path
        # would also alter a matching directory component
        # (e.g. '.../analysis255/...' contains 'is255').
        result_basename = result_basename.replace(old_tag, new_tag)
        result_fpath = os.path.join(output_dirpath, result_basename)
        prepared_optimal_assembly_basename = prepared_optimal_assembly_basename.replace(old_tag, new_tag)
        insert_size = configured_insert_size
        ref_prepared_optimal_assembly = os.path.join(os.path.dirname(original_ref_fpath),
                                                     prepared_optimal_assembly_basename)
        already_done_fpath = check_prepared_optimal_assembly(insert_size, result_fpath,
                                                             ref_prepared_optimal_assembly)
        if already_done_fpath:
            return already_done_fpath

    log_fpath = os.path.join(output_dirpath, 'upper_bound_assembly.log')
    tmp_dir = os.path.join(output_dirpath, 'tmp')
    # Always start from an empty temporary directory.
    if os.path.isdir(tmp_dir):
        shutil.rmtree(tmp_dir)
    os.makedirs(tmp_dir)

    unique_covered_regions, repeats_regions = get_unique_covered_regions(
        ref_fpath, tmp_dir, log_fpath, binary_fpath, insert_size, uncovered_fpath, use_long_reads=long_reads)
    if unique_covered_regions is None:
        logger.error(' Failed to create Upper Bound Assembly, see log for details: ' + log_fpath)
        return None

    reference = list(fastaparser.read_fasta(ref_fpath))
    result_fasta = []

    if long_reads or qconfig.mate_pairs:
        # Scaffold the unique regions using the joining reads; PacBio takes
        # precedence over Nanopore, and both over mate-pairs.
        if long_reads:
            join_reads = 'pacbio' if qconfig.pacbio_reads else 'nanopore'
        else:
            join_reads = 'mp'
        sam_fpath, bam_fpath, _ = reads_analyzer.align_reference(ref_fpath, reads_analyzer_dir, using_reads=join_reads)
        joiners = get_joiners(qutils.name_from_fpath(ref_fpath), sam_fpath, bam_fpath, tmp_dir, log_fpath, join_reads)
        # Uncovered regions and the read length are only used for mate-pairs.
        uncovered_regions = parse_bed(uncovered_fpath) if join_reads == 'mp' else defaultdict(list)
        mp_len = calculate_read_len(sam_fpath) if join_reads == 'mp' else None
        for chrom, seq in reference:
            region_pairing = get_regions_pairing(unique_covered_regions[chrom], joiners[chrom], mp_len)
            ref_coords_to_output = scaffolding(unique_covered_regions[chrom], region_pairing)
            get_fasta_entries_from_coords(result_fasta, (chrom, seq), ref_coords_to_output,
                                          repeats_regions[chrom], uncovered_regions[chrom])
    else:
        # No joining reads: every sufficiently long unique region is a contig.
        for chrom, seq in reference:
            for idx, region in enumerate(unique_covered_regions[chrom]):
                if region[1] - region[0] >= MIN_CONTIG_LEN:
                    result_fasta.append((chrom + '_' + str(idx), seq[region[0]:region[1]]))

    fastaparser.write_fasta(result_fpath, result_fasta)
    logger.info(' ' + 'Theoretical Upper Bound Assembly is saved to ' + result_fpath)
    logger.notice(
        '(on reusing *this* Upper Bound Assembly in the *future* evaluations on *the same* dataset)\n'
        '\tThe next time, you can simply provide this file as an additional assembly (you could also rename it to UpperBound.fasta for the clarity). '
        'In this case, you do not need to specify --upper-bound-assembly and provide files with reads (--pe1/pe2, etc).\n'
        '\t\tOR\n'
        '\tYou can copy ' + result_fpath + ' to ' + ref_prepared_optimal_assembly + '. '
        'The next time you evaluate assemblies with --upper-bound-assembly option and against the same reference (' + original_ref_fpath + ') and '
        'the same reads (or if you specify the insert size of the paired-end reads explicitly with --est-insert-size ' + str(insert_size) + '), '
        'QUAST will reuse this Upper Bound Assembly.\n')

    # Keep the intermediate files only in debug mode.
    if not qconfig.debug:
        shutil.rmtree(tmp_dir)

    logger.main_info('Done.')
    return result_fpath
def do(ref_fpath, original_ref_fpath, output_dirpath):
    """Simulate the Optimal Assembly for a reference and return its path.

    Steps, in order:
      1. Align reads to the reference when reads or alignments are given.
      2. Build the output file name from the reference name, the insert size
         and the kind of reads used for polishing; reuse an existing file
         with that name from the output directory or from the directory of
         the original reference.
      3. Bail out (returning None) on 'linux_32' or if the Red binary is
         missing.
      4. Extract unique covered regions, optionally scaffold them with long
         reads or mate-pairs, and write the resulting FASTA.

    Args:
        ref_fpath: path to the reference FASTA that is processed.
        original_ref_fpath: path whose directory is searched for a prepared
            assembly (presumably the reference as supplied by the user --
            confirm against the caller).
        output_dirpath: directory receiving the result FASTA, the log file
            'optimal_assembly.log' and a temporary 'tmp' directory.

    Returns:
        Path to the Optimal Assembly (new or reused), or None on failure.
    """
    logger.print_timestamp()
    logger.main_info("Simulating Optimal Assembly...")

    uncovered_fpath = None
    reads_analyzer_dir = join(dirname(output_dirpath), qconfig.reads_stats_dirname)
    if qconfig.reads_fpaths or qconfig.reference_sam or qconfig.reference_bam:
        _, _, uncovered_fpath = reads_analyzer.align_reference(
            ref_fpath, reads_analyzer_dir, using_reads='all', calculate_coverage=True)

    # Read after the alignment on purpose (original statement order is kept).
    # A falsy or 'auto' configured value means "not specified": use the default.
    insert_size = qconfig.optimal_assembly_insert_size
    if insert_size == 'auto' or not insert_size:
        insert_size = qconfig.optimal_assembly_default_IS

    long_reads = qconfig.pacbio_reads or qconfig.nanopore_reads

    def _assembly_basename(ref_stem):
        # '<ref>.<basename>.is<size>.fasta', tagged with the polishing read type
        # so assemblies built from different read sets never share a name.
        basename = '%s.%s.is%d.fasta' % (ref_stem, qconfig.optimal_assembly_basename, insert_size)
        if long_reads:
            return add_suffix(basename, long_reads_polished_suffix)
        if qconfig.mate_pairs:
            return add_suffix(basename, mp_polished_suffix)
        return basename

    ref_basename, _ = splitext_for_fasta_file(os.path.basename(ref_fpath))
    result_fpath = os.path.join(output_dirpath, _assembly_basename(ref_basename))

    # The prepared (reusable) file must carry the same polishing suffix as the
    # result. Without it, a polished assembly copied to the location suggested
    # in the final notice would later be reused by runs with other read types.
    original_ref_basename, _ = splitext_for_fasta_file(os.path.basename(original_ref_fpath))
    prepared_optimal_assembly_basename = _assembly_basename(original_ref_basename)
    ref_prepared_optimal_assembly = os.path.join(os.path.dirname(original_ref_fpath),
                                                 prepared_optimal_assembly_basename)

    # Prefer a result in the output directory over the one next to the reference.
    already_done_fpath = None
    if os.path.isfile(result_fpath):
        already_done_fpath = result_fpath
    elif os.path.isfile(ref_prepared_optimal_assembly):
        already_done_fpath = ref_prepared_optimal_assembly
    if already_done_fpath is not None:
        logger.notice(' Will reuse already generated Optimal Assembly with insert size %d (%s)' %
                      (insert_size, already_done_fpath))
        return already_done_fpath

    if qconfig.platform_name == 'linux_32':
        logger.warning(' Sorry, can\'t create Optimal Assembly on this platform, skipping...')
        return None

    red_dirpath = get_dir_for_download('red', 'Red', ['Red'], logger)
    binary_fpath = download_external_tool('Red', red_dirpath, 'red', platform_specific=True, is_executable=True)
    if not binary_fpath or not os.path.isfile(binary_fpath):
        logger.warning(' Sorry, can\'t create Optimal Assembly, skipping...')
        return None

    log_fpath = os.path.join(output_dirpath, 'optimal_assembly.log')
    tmp_dir = os.path.join(output_dirpath, 'tmp')
    # Always start from an empty temporary directory.
    if os.path.isdir(tmp_dir):
        shutil.rmtree(tmp_dir)
    os.makedirs(tmp_dir)

    unique_covered_regions, repeats_regions = get_unique_covered_regions(
        ref_fpath, tmp_dir, log_fpath, binary_fpath, insert_size, uncovered_fpath)
    if unique_covered_regions is None:
        logger.error(' Failed to create Optimal Assembly, see log for details: ' + log_fpath)
        return None

    reference = list(fastaparser.read_fasta(ref_fpath))
    result_fasta = []

    if long_reads or qconfig.mate_pairs:
        # Scaffold the unique regions using the joining reads; PacBio takes
        # precedence over Nanopore, and both over mate-pairs.
        if long_reads:
            join_reads = 'pacbio' if qconfig.pacbio_reads else 'nanopore'
        else:
            join_reads = 'mp'
        sam_fpath, bam_fpath, _ = reads_analyzer.align_reference(ref_fpath, reads_analyzer_dir, using_reads=join_reads)
        joiners = get_joiners(qutils.name_from_fpath(ref_fpath), sam_fpath, bam_fpath, tmp_dir, log_fpath, join_reads)
        # Uncovered regions and the read length are only used for mate-pairs.
        uncovered_regions = parse_uncovered_fpath(
            uncovered_fpath, ref_fpath, return_covered_regions=False
        ) if join_reads == 'mp' else defaultdict(list)
        mp_len = calculate_read_len(sam_fpath) if join_reads == 'mp' else None
        for chrom, seq in reference:
            region_pairing = get_regions_pairing(unique_covered_regions[chrom], joiners[chrom], mp_len)
            ref_coords_to_output = scaffolding(unique_covered_regions[chrom], region_pairing)
            get_fasta_entries_from_coords(result_fasta, (chrom, seq), ref_coords_to_output,
                                          repeats_regions[chrom], uncovered_regions[chrom])
    else:
        # No joining reads: every sufficiently long unique region is a contig.
        for chrom, seq in reference:
            for idx, region in enumerate(unique_covered_regions[chrom]):
                if region[1] - region[0] >= MIN_CONTIG_LEN:
                    result_fasta.append((chrom + '_' + str(idx), seq[region[0]:region[1]]))

    fastaparser.write_fasta(result_fpath, result_fasta)
    logger.info(' ' + 'Theoretically optimal Assembly saved to ' + result_fpath)
    logger.notice(
        'You can copy it to ' + ref_prepared_optimal_assembly +
        ' and QUAST will reuse it in further runs against the same reference (' + original_ref_fpath + ')')

    # Keep the intermediate files only in debug mode.
    if not qconfig.debug:
        shutil.rmtree(tmp_dir)

    logger.main_info('Done.')
    return result_fpath