Example #1
def draw_coverage_histograms(coverage_dict, contigs_fpaths, output_dirpath):
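    # Draws one combined coverage histogram over all assemblies that have
    # coverage data, then a separate per-assembly histogram for each of them.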
    total_len = dict()
    contigs_dict = dict()

    contigs_with_coverage = [contigs_fpath for contigs_fpath in contigs_fpaths if coverage_dict[contigs_fpath]]
    for contigs_fpath in contigs_fpaths:
        total_len[contigs_fpath] = reporting.get(contigs_fpath).get_field(reporting.Fields.TOTALLEN)
        contigs_dict[contigs_fpath] = reporting.get(contigs_fpath).get_field(reporting.Fields.CONTIGS)
    cov_values = [coverage_dict[contigs_fpath] for contigs_fpath in contigs_with_coverage]
    num_contigs = [contigs_dict[contigs_fpath] for contigs_fpath in contigs_with_coverage]

    common_coverage_values, bin_size, low_threshold, high_threshold, max_cov = binning_coverage(cov_values, num_contigs)
    histogram_title = 'Coverage histogram (bin size: ' + str(bin_size) + 'x)'
    plotter.coverage_histogram(contigs_with_coverage, common_coverage_values, output_dirpath + '/coverage_histogram',
                               histogram_title, bin_size=bin_size, max_cov=max_cov, low_threshold=low_threshold, high_threshold=high_threshold)
    for contigs_fpath in contigs_with_coverage:
        coverage_values, bin_size, low_threshold, high_threshold, max_cov = binning_coverage([coverage_dict[contigs_fpath]],
                                                                                             [contigs_dict[contigs_fpath]])
        label = qutils.label_from_fpath(contigs_fpath)
        corr_label = qutils.label_from_fpath_for_fname(contigs_fpath)
        histogram_title = label + ' coverage histogram (bin size: ' + str(bin_size) + 'x)'
        histogram_fpath = os.path.join(output_dirpath, corr_label + '_coverage_histogram')
        plotter.coverage_histogram([contigs_fpath], coverage_values, histogram_fpath,
                                   histogram_title, draw_bars=True, bin_size=bin_size, max_cov=max_cov,
                                   low_threshold=low_threshold, high_threshold=high_threshold)
Example #3
def get_assemblies_data(contigs_fpaths, icarus_dirpath, stdout_pattern, nx_marks):
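    # Collects total length, contig count, N50 and the requested Nx values for
    # each assembly and renders them as JavaScript variable assignments used by
    # the Icarus viewers.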
    assemblies_n50 = defaultdict(dict)
    assemblies_data = ''
    assemblies_data += 'var assemblies_links = {};\n'
    assemblies_data += 'var assemblies_len = {};\n'
    assemblies_data += 'var assemblies_contigs = {};\n'
    assemblies_data += 'var assemblies_misassemblies = {};\n'
    assemblies_data += 'var assemblies_n50 = {};\n'
    assemblies_contig_size_data = ''
    for contigs_fpath in contigs_fpaths:
        assembly_label = qutils.label_from_fpath(contigs_fpath)
        report = reporting.get(contigs_fpath)
        l = report.get_field(reporting.Fields.TOTALLEN)
        contigs = report.get_field(reporting.Fields.CONTIGS)
        n50 = report.get_field(reporting.Fields.N50)
        if stdout_pattern:
            contig_stdout_fpath = stdout_pattern % qutils.label_from_fpath_for_fname(contigs_fpath) + '.stdout'
            contig_stdout_fpath = qutils.relpath(contig_stdout_fpath, icarus_dirpath)
            assemblies_data += 'assemblies_links["' + assembly_label + '"] = "' + contig_stdout_fpath + '";\n'
        assemblies_contig_size_data += 'assemblies_len["' + assembly_label + '"] = ' + str(l) + ';\n'
        assemblies_contig_size_data += 'assemblies_contigs["' + assembly_label + '"] = ' + str(contigs) + ';\n'
        assemblies_contig_size_data += 'assemblies_n50["' + assembly_label + '"] = "' + str(n50) + '";\n'
        for nx in nx_marks:
            assemblies_n50[assembly_label][nx] = report.get_field(nx)
    return assemblies_data, assemblies_contig_size_data, assemblies_n50
Example #4
def calculate_ave_read_support(combined_output_dirpath, assemblies):
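    # For every assembly, reads its per-reference "unique contigs" file and adds
    # the length-weighted mean contig coverage to the per-reference report as
    # the average read support value.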
    unique_contigs_fpath = os.path.join(combined_output_dirpath,
                                        'contigs_reports',
                                        qconfig.unique_contigs_fname_pattern)
    for assembly in assemblies:
        aligned_contigs_by_ref = dict()
        assembly_label = qutils.label_from_fpath(assembly.fpath)
        corr_assembly_label = qutils.label_from_fpath_for_fname(assembly.fpath)
        with open(unique_contigs_fpath % corr_assembly_label) as in_f:
            for line in in_f:
                ref_name, contig_len, contig_cov = line.strip().split('\t')
                aligned_contigs_by_ref.setdefault(ref_name, []).append(
                    (float(contig_len), float(contig_cov)))
        for ref_name, contigs in aligned_contigs_by_ref.items():
            ref_cov = sum(contig_cov * aligned_len
                          for (aligned_len, contig_cov) in contigs)
            ref_cov /= sum(aligned_len
                           for (aligned_len, contig_cov) in contigs)
            corr_assembly_label = qutils.label_from_fpath_for_fname(
                assembly.fpath)
            ref_contigs_fpath = os.path.join(
                os.path.dirname(assembly.fpath),
                corr_assembly_label + '_to_' + ref_name + '.fasta')
            qconfig.assembly_labels_by_fpath[
                ref_contigs_fpath] = assembly_label
            report = reporting.get(ref_contigs_fpath, ref_name=ref_name)
            report.add_field(reporting.Fields.AVE_READ_SUPPORT,
                             '%.2f' % ref_cov)
Example #5
def do(contigs_fpaths, gene_lengths, out_dirpath):
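    # Runs GlimmerHMM gene prediction on every assembly (via joblib workers when
    # qconfig.memory_efficient is set, sequentially otherwise) and stores the
    # predicted-gene counts in the per-assembly reports.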
    logger.print_timestamp()
    logger.main_info('Running GlimmerHMM...')

    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, 'glimmer')
    tmp_dirpath = os.path.join(out_dirpath, 'tmp')
    tool_exec_fpath = compile_glimmer(logger)
    if not tool_exec_fpath:
        return

    if not os.path.isdir(out_dirpath):
        os.makedirs(out_dirpath)
    if not os.path.isdir(tmp_dirpath):
        os.makedirs(tmp_dirpath)

    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    if is_python2():
        from joblib2 import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed
    if qconfig.memory_efficient:
        results = Parallel(n_jobs=n_jobs)(
            delayed(predict_genes)(index, contigs_fpath, gene_lengths,
                                   out_dirpath, tool_dirpath, tmp_dirpath)
            for index, contigs_fpath in enumerate(contigs_fpaths))
    else:
        results = [
            predict_genes(index, contigs_fpath, gene_lengths, out_dirpath,
                          tool_dirpath, tmp_dirpath)
            for index, contigs_fpath in enumerate(contigs_fpaths)
        ]

    genes_by_labels = dict()
    # saving results
    for i, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        label = qutils.label_from_fpath(contigs_fpath)
        genes_by_labels[label], unique, full_genes, partial_genes = results[i]
        if unique is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES_UNIQUE, unique)
        if full_genes is not None:
            genes = [
                '%s + %s part' % (full_cnt, partial_cnt)
                for full_cnt, partial_cnt in zip(full_genes, partial_genes)
            ]
            report.add_field(reporting.Fields.PREDICTED_GENES, genes)
        if unique is None and full_genes is None:
            logger.error(
                'Failed running Glimmer for %s. ' % label +
                ('Run with the --debug option'
                 ' to see the command line.' if not qconfig.debug else ''))

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    logger.main_info('Done.')
    return genes_by_labels
Example #6
def do(contigs_fpaths, output_dir, logger):
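    # Runs Barrnap on each assembly to predict ribosomal RNA genes, parses the
    # resulting GFF files and reports the counts as "complete + partial part".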
    logger.print_timestamp()
    logger.info('Running Barrnap...')

    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    threads = max(1, qconfig.max_threads // n_jobs)
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    log_fpath = join(output_dir, 'barrnap.log')
    logger.info('Logging to ' + log_fpath + '...')
    kingdom = 'bac' if qconfig.prokaryote else 'euk'
    gff_fpaths = [
        join(output_dir,
             qutils.label_from_fpath_for_fname(contigs_fpath) + '.rna.gff')
        for contigs_fpath in contigs_fpaths
    ]

    barrnap_args = [
        (contigs_fpath, gff_fpath, log_fpath, threads, kingdom)
        for contigs_fpath, gff_fpath in zip(contigs_fpaths, gff_fpaths)
    ]
    run_parallel(run, barrnap_args, qconfig.max_threads)

    if not any(fpath for fpath in gff_fpaths):
        logger.info('Failed predicting the location of ribosomal RNA genes.')
        return

    # saving results
    for index, (contigs_fpath,
                gff_fpath) in enumerate(zip(contigs_fpaths, gff_fpaths)):
        # check that Barrnap produced the GFF before trying to parse it
        if not os.path.isfile(gff_fpath):
            logger.error('Failed running Barrnap for ' + contigs_fpath +
                         '. See ' + log_fpath + ' for information.')
            continue

        genes = parse_gff(open(gff_fpath), 'rrna')
        report = reporting.get(contigs_fpath)

        part_count = len([
            gene for gene in genes if 'product' in gene.attributes
            and 'partial' in gene.attributes['product']
        ])
        total_count = len(genes)
        report.add_field(
            reporting.Fields.RNA_GENES,
            '%s + %s part' % (total_count - part_count, part_count))

        logger.info('  ' + qutils.index_to_str(index) +
                    '  Ribosomal RNA genes = ' + str(total_count))
        logger.info('  ' + qutils.index_to_str(index) +
                    '  Predicted genes (GFF): ' + gff_fpath)

    logger.info('Done.')
Example #7
def add_statistics_to_report(output_dir, contigs_fpaths, ref_fpath):
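    # Reads the samtools-style .stat files produced by the BWA read-alignment
    # step for the reference and for each assembly, and copies mapped/paired/
    # singleton/misjoint read counts, depth and coverage thresholds into the reports.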
    from quast_libs import reporting

    ref_reads_stats = None
    if ref_fpath:
        ref_name = qutils.name_from_fpath(ref_fpath)
        stats_fpath = join(output_dir, ref_name + '.stat')
        if isfile(stats_fpath):
            ref_reads_stats = parse_reads_stats(stats_fpath)
            if int(ref_reads_stats['mapped']) == 0:
                logger.info('  BWA: nothing aligned for reference.')

    # process all contigs files
    for index, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        assembly_name = qutils.name_from_fpath(contigs_fpath)
        assembly_label = qutils.label_from_fpath(contigs_fpath)
        stats_fpath = join(output_dir, assembly_name + '.stat')
        if ref_reads_stats:
            report.add_field(reporting.Fields.REF_MAPPED_READS, ref_reads_stats['mapped'])
            report.add_field(reporting.Fields.REF_MAPPED_READS_PCNT, ref_reads_stats['mapped_pcnt'])
            report.add_field(reporting.Fields.REF_PROPERLY_PAIRED_READS, ref_reads_stats['paired'])
            report.add_field(reporting.Fields.REF_PROPERLY_PAIRED_READS_PCNT, ref_reads_stats['paired_pcnt'])
            report.add_field(reporting.Fields.REF_SINGLETONS, ref_reads_stats['singletons'])
            report.add_field(reporting.Fields.REF_SINGLETONS_PCNT, ref_reads_stats['singletons_pcnt'])
            report.add_field(reporting.Fields.REF_MISJOINT_READS, ref_reads_stats['misjoint'])
            report.add_field(reporting.Fields.REF_MISJOINT_READS_PCNT, ref_reads_stats['misjoint_pcnt'])
            report.add_field(reporting.Fields.REF_DEPTH, ref_reads_stats['depth'])
            if ref_reads_stats['coverage_thresholds'] and len(ref_reads_stats['coverage_thresholds']) == len(qconfig.coverage_thresholds):
                report.add_field(reporting.Fields.REF_COVERAGE__FOR_THRESHOLDS,
                                [ref_reads_stats['coverage_thresholds'][i] for i, threshold in enumerate(qconfig.coverage_thresholds)])
                report.add_field(reporting.Fields.REF_COVERAGE_1X_THRESHOLD, ref_reads_stats['coverage_thresholds'][0])
        if not isfile(stats_fpath):
            continue
        reads_stats = parse_reads_stats(stats_fpath)
        report.add_field(reporting.Fields.TOTAL_READS, reads_stats['total'])
        report.add_field(reporting.Fields.LEFT_READS, reads_stats['left'])
        report.add_field(reporting.Fields.RIGHT_READS, reads_stats['right'])
        report.add_field(reporting.Fields.MAPPED_READS, reads_stats['mapped'])
        report.add_field(reporting.Fields.MAPPED_READS_PCNT, reads_stats['mapped_pcnt'])
        report.add_field(reporting.Fields.PROPERLY_PAIRED_READS, reads_stats['paired'])
        report.add_field(reporting.Fields.PROPERLY_PAIRED_READS_PCNT, reads_stats['paired_pcnt'])
        if int(reads_stats['mapped']) == 0:
            logger.info('  ' + qutils.index_to_str(index) + 'BWA: nothing aligned for ' + '\'' + assembly_label + '\'.')
        report.add_field(reporting.Fields.SINGLETONS, reads_stats['singletons'])
        report.add_field(reporting.Fields.SINGLETONS_PCNT, reads_stats['singletons_pcnt'])
        report.add_field(reporting.Fields.MISJOINT_READS, reads_stats['misjoint'])
        report.add_field(reporting.Fields.MISJOINT_READS_PCNT, reads_stats['misjoint_pcnt'])
        report.add_field(reporting.Fields.DEPTH, reads_stats['depth'])
        if reads_stats['coverage_thresholds'] and len(reads_stats['coverage_thresholds']) == len(qconfig.coverage_thresholds):
            report.add_field(reporting.Fields.COVERAGE__FOR_THRESHOLDS,
                            [reads_stats['coverage_thresholds'][i] for i, threshold in enumerate(qconfig.coverage_thresholds)])
            report.add_field(reporting.Fields.COVERAGE_1X_THRESHOLD, reads_stats['coverage_thresholds'][0])
Example #8
def do(contigs_fpaths, gene_lengths, out_dirpath):
    logger.print_timestamp()
    logger.main_info('Running GlimmerHMM...')

    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, 'glimmer')
    tmp_dirpath = os.path.join(out_dirpath, 'tmp')
    tool_exec_fpath = compile_glimmer(logger)
    if not tool_exec_fpath:
        return

    if not os.path.isdir(out_dirpath):
        os.makedirs(out_dirpath)
    if not os.path.isdir(tmp_dirpath):
        os.makedirs(tmp_dirpath)

    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    if is_python2():
        from joblib import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed
    if qconfig.memory_efficient:
        results = Parallel(n_jobs=n_jobs)(delayed(predict_genes)(
            index, contigs_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath)
            for index, contigs_fpath in enumerate(contigs_fpaths))
    else:
        results = [predict_genes(index, contigs_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath)
                   for index, contigs_fpath in enumerate(contigs_fpaths)]

    genes_by_labels = dict()
    # saving results
    for i, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        label = qutils.label_from_fpath(contigs_fpath)
        genes_by_labels[label], unique, full_genes, partial_genes = results[i]
        if unique is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES_UNIQUE, unique)
        if full_genes is not None:
            genes = ['%s + %s part' % (full_cnt, partial_cnt) for full_cnt, partial_cnt in zip(full_genes, partial_genes)]
            report.add_field(reporting.Fields.PREDICTED_GENES, genes)
        if unique is None and full_genes is None:
            logger.error(
                'Failed running Glimmer for %s. ' % label + ('Run with the --debug option'
                ' to see the command line.' if not qconfig.debug else ''))

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    logger.main_info('Done.')
    return genes_by_labels
Example #9
def do(contigs_fpaths, gene_lengths, out_dirpath):
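    # GlimmerHMM runner variant that parallelises via run_parallel() and unpacks
    # per-assembly gene lists, unique counts and full/partial gene counts.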
    logger.print_timestamp()
    logger.main_info('Running GlimmerHMM...')

    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, 'glimmer')
    tmp_dirpath = os.path.join(out_dirpath, 'tmp')
    tool_exec_fpath = compile_glimmer(logger)
    if not tool_exec_fpath:
        return

    if not os.path.isdir(out_dirpath):
        os.makedirs(out_dirpath)
    if not os.path.isdir(tmp_dirpath):
        os.makedirs(tmp_dirpath)

    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    parallel_args = [(index, contigs_fpath, gene_lengths, out_dirpath,
                      tool_dirpath, tool_exec_fpath, tmp_dirpath)
                     for index, contigs_fpath in enumerate(contigs_fpaths)]
    genes_list, unique, full_genes, partial_genes = run_parallel(
        predict_genes, parallel_args, n_jobs)

    genes_by_labels = dict()
    # saving results
    for i, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        label = qutils.label_from_fpath(contigs_fpath)
        genes_by_labels[label] = genes_list[i]
        if unique[i] is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES_UNIQUE,
                             unique[i])
        if full_genes[i] is not None:
            genes = [
                '%s + %s part' % (full_cnt, partial_cnt) for full_cnt,
                partial_cnt in zip(full_genes[i], partial_genes[i])
            ]
            report.add_field(reporting.Fields.PREDICTED_GENES, genes)
        if unique[i] is None and full_genes[i] is None:
            logger.error('Failed running Glimmer for %s. ' % label + (
                'Run with the --debug option'
                ' to see the command line.' if not qconfig.debug else ''))

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    logger.main_info('Done.')
    return genes_by_labels
Example #10
def calculate_ave_read_support(combined_output_dirpath, assemblies):
    unique_contigs_fpath = os.path.join(combined_output_dirpath, 'contigs_reports', qconfig.unique_contigs_fname_pattern)
    for assembly in assemblies:
        aligned_contigs_by_ref = dict()
        assembly_label = qutils.label_from_fpath(assembly.fpath)
        corr_assembly_label = qutils.label_from_fpath_for_fname(assembly.fpath)
        with open(unique_contigs_fpath % corr_assembly_label) as in_f:
            for line in in_f:
                ref_name, contig_len, contig_cov = line.strip().split('\t')
                aligned_contigs_by_ref.setdefault(ref_name, []).append((float(contig_len), float(contig_cov)))
        for ref_name, contigs in aligned_contigs_by_ref.items():
            ref_cov = sum(contig_cov * aligned_len for (aligned_len, contig_cov) in contigs)
            ref_cov /= sum(aligned_len for (aligned_len, contig_cov) in contigs)
            corr_assembly_label = qutils.label_from_fpath_for_fname(assembly.fpath)
            ref_contigs_fpath = os.path.join(
                        os.path.dirname(assembly.fpath), corr_assembly_label + '_to_' + ref_name + '.fasta')
            qconfig.assembly_labels_by_fpath[ref_contigs_fpath] = assembly_label
            report = reporting.get(ref_contigs_fpath, ref_name=ref_name)
            report.add_field(reporting.Fields.AVE_READ_SUPPORT, '%.2f' % ref_cov)
Example #11
def do(contigs_fpaths, gene_lengths, out_dirpath):
    logger.print_timestamp()
    logger.main_info('Running GlimmerHMM...')

    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, 'glimmer')
    tmp_dirpath = os.path.join(out_dirpath, 'tmp')
    tool_exec_fpath = compile_glimmer(logger)
    if not tool_exec_fpath:
        return

    if not os.path.isdir(out_dirpath):
        os.makedirs(out_dirpath)
    if not os.path.isdir(tmp_dirpath):
        os.makedirs(tmp_dirpath)

    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    parallel_args = [(index, contigs_fpath, gene_lengths, out_dirpath, tool_dirpath, tool_exec_fpath, tmp_dirpath)
                     for index, contigs_fpath in enumerate(contigs_fpaths)]
    genes_list, unique, full_genes, partial_genes = run_parallel(predict_genes, parallel_args, n_jobs)

    genes_by_labels = dict()
    # saving results
    for i, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        label = qutils.label_from_fpath(contigs_fpath)
        genes_by_labels[label] = genes_list[i]
        if unique[i] is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES_UNIQUE, unique[i])
        if full_genes[i] is not None:
            genes = ['%s + %s part' % (full_cnt, partial_cnt) for full_cnt, partial_cnt in zip(full_genes[i], partial_genes[i])]
            report.add_field(reporting.Fields.PREDICTED_GENES, genes)
        if unique[i] is None and full_genes[i] is None:
            logger.error(
                'Failed running Glimmer for %s. ' % label + ('Run with the --debug option'
                ' to see the command line.' if not qconfig.debug else ''))

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    logger.main_info('Done.')
    return genes_by_labels
Example #12
def js_data_gen(assemblies,
                contigs_fpaths,
                chromosomes_length,
                output_dirpath,
                structures_by_labels,
                contigs_by_assemblies,
                ambiguity_alignments_by_labels=None,
                contig_names_by_refs=None,
                ref_fpath=None,
                stdout_pattern=None,
                features_data=None,
                gc_fpath=None,
                cov_fpath=None,
                physical_cov_fpath=None,
                json_output_dir=None):
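    # Builds all Icarus (contig browser) data: per-assembly similar/misassembled
    # block counts, per-chromosome alignment viewers, the contig size viewer and
    # the main menu HTML; returns the path to the main menu page.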
    chr_names = []
    if chromosomes_length and assemblies:
        chr_to_aligned_blocks = OrderedDict()
        chr_names = list(chromosomes_length.keys())
        for assembly in assemblies.assemblies:
            chr_to_aligned_blocks[assembly.label] = defaultdict(list)
            similar_correct = 0
            similar_misassembled = 0

            for align in assembly.alignments:
                chr_to_aligned_blocks[assembly.label][align.ref_name].append(
                    align)
                if align.similar:
                    if align.misassembled:
                        similar_misassembled += 1
                    else:
                        similar_correct += 1
            report = reporting.get(assembly.fpath)
            report.add_field(reporting.Fields.SIMILAR_CONTIGS, similar_correct)
            report.add_field(reporting.Fields.SIMILAR_MIS_BLOCKS,
                             similar_misassembled)

    main_menu_fpath = os.path.join(output_dirpath, qconfig.icarus_html_fname)
    output_all_files_dir_path = os.path.join(output_dirpath,
                                             qconfig.icarus_dirname)
    if not os.path.exists(output_all_files_dir_path):
        os.mkdir(output_all_files_dir_path)

    chr_full_names, contig_names_by_refs = group_references(
        chr_names, contig_names_by_refs, chromosomes_length, ref_fpath)

    cov_data, max_depth = parse_cov_fpath(cov_fpath, chr_names, chr_full_names,
                                          contig_names_by_refs)
    physical_cov_data, physical_max_depth = parse_cov_fpath(
        physical_cov_fpath, chr_names, chr_full_names, contig_names_by_refs)
    gc_data, max_gc = parse_cov_fpath(gc_fpath, chr_names, chr_full_names,
                                      contig_names_by_refs)

    chr_sizes = {}
    num_contigs = {}
    aligned_bases = genome_analyzer.get_ref_aligned_lengths()
    nx_marks = [
        reporting.Fields.N50, reporting.Fields.N75, reporting.Fields.NG50,
        reporting.Fields.NG75
    ]

    assemblies_data, assemblies_contig_size_data, assemblies_n50 = get_assemblies_data(
        contigs_fpaths, output_all_files_dir_path, stdout_pattern, nx_marks)

    ref_contigs_dict = {}
    chr_lengths_dict = {}

    ref_data = 'var references_by_id = {};\n'
    chr_names_by_id = dict(
        (chrom, str(i)) for i, chrom in enumerate(chr_names))
    for chrom, i in chr_names_by_id.items():
        ref_data += 'references_by_id["' + str(i) + '"] = "' + chrom + '";\n'
    for i, chr in enumerate(chr_full_names):
        if contig_names_by_refs:
            ref_contigs = [
                contig for contig in chr_names
                if contig_names_by_refs[contig] == chr
            ]
        elif len(chr_full_names) == 1:
            ref_contigs = chr_names
        else:
            ref_contigs = [chr]
        ref_contigs_dict[chr] = ref_contigs
        chr_lengths_dict[chr] = [0] + [
            chromosomes_length[contig] for contig in ref_contigs
        ]

    num_misassemblies = defaultdict(int)
    aligned_bases_by_chr = defaultdict(list)
    aligned_assemblies = defaultdict(set)
    for i, chr in enumerate(chr_full_names):
        ref_contigs = ref_contigs_dict[chr]
        chr_lengths = chr_lengths_dict[chr]
        chr_size = sum([chromosomes_length[contig] for contig in ref_contigs])
        chr_sizes[chr] = chr_size
        num_contigs[chr] = len(ref_contigs)
        data_str = []
        data_str.append('var chromosomes_len = {};')
        for ref_contig in ref_contigs:
            l = chromosomes_length[ref_contig]
            data_str.append('chromosomes_len["' + ref_contig + '"] = ' +
                            str(l) + ';')
            aligned_bases_by_chr[chr].extend(aligned_bases[ref_contig])

        cov_data_str = format_cov_data(chr, cov_data, 'coverage_data',
                                       max_depth,
                                       'reads_max_depth') if cov_data else None
        physical_cov_data_str = format_cov_data(chr, physical_cov_data, 'physical_coverage_data', physical_max_depth, 'physical_max_depth') \
            if physical_cov_data else None
        gc_data_str = format_cov_data(chr, gc_data, 'gc_data', 100,
                                      'max_gc') if gc_data else None

        alignment_viewer_fpath, ref_data_str, contigs_structure_str, additional_assemblies_data, ms_selectors, num_misassemblies[chr], aligned_assemblies[chr] = \
            prepare_alignment_data_for_one_ref(chr, chr_full_names, chr_names_by_id, ref_contigs, data_str, chr_to_aligned_blocks, structures_by_labels,
                                               contigs_by_assemblies, ambiguity_alignments_by_labels=ambiguity_alignments_by_labels,
                                               cov_data_str=cov_data_str, physical_cov_data_str=physical_cov_data_str, gc_data_str=gc_data_str,
                                               contig_names_by_refs=contig_names_by_refs, output_dir_path=output_all_files_dir_path)
        ref_name = qutils.name_from_fpath(ref_fpath)
        save_alignment_data_for_one_ref(
            chr,
            ref_contigs,
            ref_name,
            json_output_dir,
            alignment_viewer_fpath,
            ref_data_str,
            ms_selectors,
            ref_data=ref_data,
            features_data=features_data,
            assemblies_data=assemblies_data,
            contigs_structure_str=contigs_structure_str,
            additional_assemblies_data=additional_assemblies_data)

    contigs_sizes_str, too_many_contigs = get_contigs_data(
        contigs_by_assemblies, nx_marks, assemblies_n50, structures_by_labels,
        contig_names_by_refs, chr_names, chr_full_names)
    all_data = assemblies_data + assemblies_contig_size_data + contigs_sizes_str
    save_contig_size_html(output_all_files_dir_path, json_output_dir,
                          too_many_contigs, all_data)

    icarus_links = defaultdict(list)
    if len(chr_full_names) > 1:
        chr_link = qconfig.icarus_html_fname
        icarus_links["links"].append(chr_link)
        icarus_links["links_names"].append(qconfig.icarus_link)

    main_menu_template_fpath = html_saver.get_real_path(
        qconfig.icarus_menu_template_fname)
    main_data_dict = dict()

    labels = [
        qconfig.assembly_labels_by_fpath[contigs_fpath]
        for contigs_fpath in contigs_fpaths
    ]
    main_data_dict['assemblies'] = labels
    html_saver.save_icarus_data(json_output_dir, ', '.join(labels),
                                'assemblies')

    contig_size_browser_fpath = os.path.join(qconfig.icarus_dirname,
                                             qconfig.contig_size_viewer_fname)
    main_data_dict['contig_size_html'] = contig_size_browser_fpath
    html_saver.save_icarus_data(json_output_dir, contig_size_browser_fpath,
                                'contig_size_html')
    if not chr_names:
        icarus_links["links"].append(contig_size_browser_fpath)
        icarus_links["links_names"].append(qconfig.icarus_link)

    if chr_full_names and (len(chr_full_names) > 1 or qconfig.is_combined_ref):
        main_data_dict['table_references'] = {'references': []}
        num_aligned_assemblies = [
            len(aligned_assemblies[chr]) for chr in chr_full_names
        ]
        is_unaligned_asm_exists = len(set(num_aligned_assemblies)) > 1
        if is_unaligned_asm_exists:
            main_data_dict['table_references']['th_assemblies'] = True
        for chr in sorted(chr_full_names, key=natural_sort):
            chr_link, chr_name, chr_genome, chr_size, tooltip = get_info_by_chr(
                chr,
                aligned_bases_by_chr,
                chr_sizes,
                contigs_fpaths,
                contig_names_by_refs,
                one_chromosome=len(chr_full_names) == 1)
            reference_dict = dict()
            reference_dict['chr_link'] = chr_link
            reference_dict['tooltip'] = tooltip
            reference_dict['chr_name'] = os.path.basename(chr_name)
            reference_dict['num_contigs'] = str(num_contigs[chr])
            reference_dict['chr_size'] = format_long_numbers(chr_size)
            if is_unaligned_asm_exists:
                reference_dict['num_assemblies'] = str(
                    len(aligned_assemblies[chr]))
            reference_dict['chr_gf'] = '%.3f' % chr_genome
            reference_dict['num_misassemblies'] = str(num_misassemblies[chr])
            main_data_dict['table_references']['references'].append(
                reference_dict)
        html_saver.save_icarus_data(json_output_dir,
                                    main_data_dict['table_references'],
                                    'table_references',
                                    as_text=False)
    else:
        if chr_full_names:
            chr = chr_full_names[0]
            chr_link, chr_name, chr_genome, chr_size, tooltip = get_info_by_chr(
                chr,
                aligned_bases_by_chr,
                chr_sizes,
                contigs_fpaths,
                contig_names_by_refs,
                one_chromosome=True)
            main_data_dict['one_reference'] = dict()
            main_data_dict['one_reference']['alignment_link'] = chr_link
            main_data_dict['one_reference']['ref_fpath'] = os.path.basename(
                ref_fpath)
            main_data_dict['one_reference']['ref_fragments'] = str(
                num_contigs[chr])
            main_data_dict['one_reference']['ref_size'] = format_long_numbers(
                chr_size)
            main_data_dict['one_reference']['ref_gf'] = '%.3f' % chr_genome
            main_data_dict['one_reference']['num_misassemblies'] = str(
                num_misassemblies[chr])
            icarus_links["links"].append(chr_link)
            icarus_links["links_names"].append(qconfig.icarus_link)
            html_saver.save_icarus_data(json_output_dir,
                                        main_data_dict['one_reference'],
                                        'menu_reference',
                                        as_text=False)
    html_saver.save_icarus_html(main_menu_template_fpath, main_menu_fpath,
                                main_data_dict)
    html_saver.save_icarus_links(output_dirpath, icarus_links)

    return main_menu_fpath
Example #13
def do(reference,
       contigs_fpaths,
       cyclic,
       output_dir,
       old_contigs_fpaths,
       bed_fpath=None):
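    # Contig analyzer driver: aligns each assembly to the reference (in parallel
    # where possible), writes per-reference translocation tables for combined
    # references, stores the alignment results in the reports and returns the
    # per-assembly statuses and aligned lengths.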
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    logger.print_timestamp()
    logger.main_info('Running Contig analyzer...')
    num_nf_errors = logger._num_nf_errors

    if not compile_aligner(logger):
        logger.main_info(
            'Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.'
        )
        return dict(
            zip(contigs_fpaths,
                [NucmerStatus.FAILED] * len(contigs_fpaths))), None

    create_nucmer_output_dir(output_dir)
    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    if qconfig.memory_efficient:
        threads = 1
    else:
        threads = max(int(qconfig.max_threads / n_jobs), 1)
    from joblib import Parallel, delayed
    if not qconfig.splitted_ref:
        statuses_results_lengths_tuples = Parallel(n_jobs=n_jobs)(
            delayed(align_and_analyze)(cyclic,
                                       i,
                                       contigs_fpath,
                                       output_dir,
                                       reference,
                                       old_contigs_fpath,
                                       bed_fpath,
                                       threads=threads)
            for i, (contigs_fpath, old_contigs_fpath
                    ) in enumerate(zip(contigs_fpaths, old_contigs_fpaths)))
    else:
        if len(contigs_fpaths) >= len(
                qconfig.splitted_ref) and not qconfig.memory_efficient:
            statuses_results_lengths_tuples = Parallel(n_jobs=n_jobs)(
                delayed(align_and_analyze)(cyclic,
                                           i,
                                           contigs_fpath,
                                           output_dir,
                                           reference,
                                           old_contigs_fpath,
                                           bed_fpath,
                                           threads=threads)
                for i, (contigs_fpath, old_contigs_fpath) in enumerate(
                    zip(contigs_fpaths, old_contigs_fpaths)))
        else:
            statuses_results_lengths_tuples = []
            for i, (contigs_fpath, old_contigs_fpath) in enumerate(
                    zip(contigs_fpaths, old_contigs_fpaths)):
                statuses_results_lengths_tuples.append(
                    align_and_analyze(cyclic,
                                      i,
                                      contigs_fpath,
                                      output_dir,
                                      reference,
                                      old_contigs_fpath,
                                      bed_fpath,
                                      parallel_by_chr=True,
                                      threads=qconfig.max_threads))

    # unzipping
    statuses, results, aligned_lengths = [x[0] for x in statuses_results_lengths_tuples], \
                                         [x[1] for x in statuses_results_lengths_tuples], \
                                         [x[2] for x in statuses_results_lengths_tuples]
    reports = []

    if qconfig.is_combined_ref:
        ref_misassemblies = [
            result['istranslocations_by_refs'] if result else []
            for result in results
        ]
        if ref_misassemblies:
            for i, fpath in enumerate(contigs_fpaths):
                if ref_misassemblies[i]:
                    assembly_name = qutils.name_from_fpath(fpath)
                    all_rows = []
                    all_refs = sorted(
                        list(
                            set([
                                ref
                                for ref in ref_labels_by_chromosomes.values()
                            ])))
                    row = {
                        'metricName': 'References',
                        'values':
                        [ref_num + 1 for ref_num in range(len(all_refs))]
                    }
                    all_rows.append(row)
                    for k in all_refs:
                        row = {'metricName': k, 'values': []}
                        for ref in all_refs:
                            if ref == k or ref not in ref_misassemblies[i]:
                                row['values'].append(None)
                            else:
                                row['values'].append(
                                    ref_misassemblies[i][ref][k])
                        all_rows.append(row)
                    misassembly_by_ref_fpath = join(
                        output_dir,
                        'interspecies_translocations_by_refs_%s.info' %
                        assembly_name)
                    with open(misassembly_by_ref_fpath, 'w') as misassembly_f:
                        misassembly_f.write('Number of interspecies translocations by references: \n\n')
                    print_file(all_rows,
                               misassembly_by_ref_fpath,
                               append_to_existing_file=True)

                    with open(misassembly_by_ref_fpath, 'a') as misassembly_f:
                        misassembly_f.write('\nReferences: \n')
                        for ref_num, ref in enumerate(all_refs):
                            misassembly_f.write(str(ref_num + 1) + ' - ' + ref + '\n')
                    logger.info(
                        '  Information about interspecies translocations by references for %s is saved to %s'
                        % (assembly_name, misassembly_by_ref_fpath))

    for index, fname in enumerate(contigs_fpaths):
        report = reporting.get(fname)
        if statuses[index] == NucmerStatus.OK:
            reports.append(save_result(results[index], report, fname))
        elif statuses[index] == NucmerStatus.NOT_ALIGNED:
            save_result_for_unaligned(results[index], report)

    nucmer_statuses = dict(zip(contigs_fpaths, statuses))
    aligned_lengths_per_fpath = dict(zip(contigs_fpaths, aligned_lengths))

    if NucmerStatus.OK in nucmer_statuses.values():
        reporting.save_misassemblies(output_dir)
        reporting.save_unaligned(output_dir)
    if qconfig.draw_plots:
        import plotter
        plotter.draw_misassembl_plot(reports,
                                     join(output_dir, 'misassemblies_plot'),
                                     'Misassemblies')

    oks = list(nucmer_statuses.values()).count(NucmerStatus.OK)
    not_aligned = list(nucmer_statuses.values()).count(NucmerStatus.NOT_ALIGNED)
    failed = list(nucmer_statuses.values()).count(NucmerStatus.FAILED)
    errors = list(nucmer_statuses.values()).count(NucmerStatus.ERROR)
    problems = not_aligned + failed + errors
    all = len(nucmer_statuses)

    logger._num_nf_errors = num_nf_errors + errors

    if oks == all:
        logger.main_info('Done.')
    if oks < all and problems < all:
        logger.main_info(
            'Done for ' + str(all - problems) + ' out of ' + str(all) +
            '. For the rest, only basic stats are going to be evaluated.')
    if problems == all:
        logger.main_info(
            'Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.'
        )

    return nucmer_statuses, aligned_lengths_per_fpath
Example #14
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, features_dict, operons_fpaths,
       detailed_contigs_reports_dirpath, genome_stats_dirpath):
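    # Genome analyzer: loads genomic features/operons, evaluates every aligned
    # assembly against the reference in parallel, writes genome_info.txt plus the
    # cumulative/FRC/histogram plots, and returns the feature containers.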

    coords_dirpath = os.path.join(detailed_contigs_reports_dirpath, qconfig.minimap_output_dirname)
    from quast_libs import search_references_meta
    if search_references_meta.is_quast_first_run:
        coords_dirpath = os.path.join(coords_dirpath, 'raw')

    logger.print_timestamp()
    logger.main_info('Running Genome analyzer...')

    if not os.path.isdir(genome_stats_dirpath):
        os.mkdir(genome_stats_dirpath)

    genome_size, reference_chromosomes, ns_by_chromosomes = fastaparser.get_genome_stats(ref_fpath)

    # reading genome size
    # genome_size = fastaparser.get_lengths_from_fastafile(reference)[0]
    # reading reference name
    # >gi|48994873|gb|U00096.2| Escherichia coli str. K-12 substr. MG1655, complete genome
    # ref_file = open(reference, 'r')
    # reference_name = ref_file.readline().split()[0][1:]
    # ref_file.close()

    # RESULTS file
    result_fpath = os.path.join(genome_stats_dirpath, 'genome_info.txt')
    res_file = open(result_fpath, 'w')

    containers = []
    for feature, feature_fpath in features_dict.items():
        containers.append(FeatureContainer([feature_fpath], feature))
    if not features_dict:
        logger.notice('No file with genomic features was provided. '
                      'Use the --features option if you want to specify it.\n', indent='  ')
    if operons_fpaths:
        containers.append(FeatureContainer(operons_fpaths, 'operon'))
    else:
        logger.notice('No file with operons was provided. '
                      'Use the -O option if you want to specify it.', indent='  ')
    for container in containers:
        if not container.fpaths:
            continue

        for fpath in container.fpaths:
            container.region_list += genes_parser.get_genes_from_file(fpath, container.kind)

        if len(container.region_list) == 0:
            logger.warning('No genomic features of type "' + container.kind + '" were loaded.', indent='  ')
            res_file.write('Genomic features of type "' + container.kind + '" loaded: ' + 'None' + '\n')
        else:
            logger.info('  Loaded ' + str(len(container.region_list)) + ' genomic features of type "' + container.kind + '"')
            res_file.write('Genomic features of type "' + container.kind + '" loaded: ' + str(len(container.region_list)) + '\n')
            container.chr_names_dict = chromosomes_names_dict(container.kind, container.region_list, list(reference_chromosomes.keys()))

    ref_genes_num, ref_operons_num = None, None
    for contigs_fpath in aligned_contigs_fpaths:
        report = reporting.get(contigs_fpath)
        genomic_features = 0
        for container in containers:
            if container.kind == 'operon':
                ref_operons_num = len(container.region_list)
                report.add_field(reporting.Fields.REF_OPERONS, len(container.region_list))
            else:
                genomic_features += len(container.region_list)
        if genomic_features:
            ref_genes_num = genomic_features
            report.add_field(reporting.Fields.REF_GENES, genomic_features)

    # for cumulative plots:
    files_features_in_contigs = {}   #  "filename" : [ genes in sorted contigs (see below) ]
    files_unsorted_features_in_contigs = {}   #  "filename" : [ genes in unsorted contigs ]
    files_operons_in_contigs = {}
    files_unsorted_operons_in_contigs = {}

    # for histograms
    genome_mapped = []
    full_found_genes = []
    full_found_operons = []

    # process all contig files
    num_nf_errors = logger._num_nf_errors
    n_jobs = min(len(aligned_contigs_fpaths), qconfig.max_threads)

    parallel_run_args = [(contigs_fpath, index, coords_dirpath, genome_stats_dirpath,
                          reference_chromosomes, ns_by_chromosomes, containers)
                        for index, contigs_fpath in enumerate(aligned_contigs_fpaths)]
    ref_lengths, results_genes_operons_tuples = run_parallel(process_single_file, parallel_run_args, n_jobs, filter_results=True)
    num_nf_errors += len(aligned_contigs_fpaths) - len(ref_lengths)
    logger._num_nf_errors = num_nf_errors
    if not ref_lengths:
        logger.main_info('Genome analyzer failed for all the assemblies.')
        res_file.close()
        return

    for ref in reference_chromosomes:
        ref_lengths_by_contigs[ref] = [ref_lengths[i][ref] for i in range(len(ref_lengths))]
    res_file.write('reference chromosomes:\n')
    for chr_name, chr_len in reference_chromosomes.items():
        aligned_len = max(ref_lengths_by_contigs[chr_name])
        res_file.write('\t' + chr_name + ' (total length: ' + str(chr_len) + ' bp, ' +
                       'total length without N\'s: ' + str(chr_len - len(ns_by_chromosomes[chr_name])) +
                       ' bp, maximal covered length: ' + str(aligned_len) + ' bp)\n')
    res_file.write('\n')
    res_file.write('total genome size: ' + str(genome_size) + '\n\n')
    res_file.write('gap min size: ' + str(qconfig.min_gap_size) + '\n')
    res_file.write('partial gene/operon min size: ' + str(qconfig.min_gene_overlap) + '\n\n')
    # header
    res_file.write('\n\n')
    res_file.write('%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n'
        % ('assembly', 'genome', 'duplication', 'gaps', 'genes', 'partial', 'operons', 'partial'))
    res_file.write('%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n'
        % ('', 'fraction', 'ratio', 'number', '', 'genes', '', 'operons'))
    res_file.write('=' * 120 + '\n')

    for contigs_fpath, (results, unsorted_features_in_contigs, features_in_contigs, unsorted_operons_in_contigs, operons_in_contigs)\
            in zip(aligned_contigs_fpaths, results_genes_operons_tuples):
        assembly_name = qutils.name_from_fpath(contigs_fpath)

        files_features_in_contigs[contigs_fpath] = features_in_contigs
        files_unsorted_features_in_contigs[contigs_fpath] = unsorted_features_in_contigs
        files_operons_in_contigs[contigs_fpath] = operons_in_contigs
        files_unsorted_operons_in_contigs[contigs_fpath] = unsorted_operons_in_contigs
        full_found_genes.append(sum(features_in_contigs))
        full_found_operons.append(sum(operons_in_contigs))

        gaps_count = results["gaps_count"]
        genes_full = results[reporting.Fields.GENES + "_full"]
        genes_part = results[reporting.Fields.GENES + "_partial"]
        operons_full = results[reporting.Fields.OPERONS + "_full"]
        operons_part = results[reporting.Fields.OPERONS + "_partial"]

        report = reporting.get(contigs_fpath)

        res_file.write('%-25s| %-10s| %-12s| %-10s|'
        % (assembly_name[:24], report.get_field(reporting.Fields.MAPPEDGENOME), report.get_field(reporting.Fields.DUPLICATION_RATIO), gaps_count))

        genome_mapped.append(float(report.get_field(reporting.Fields.MAPPEDGENOME)))

        for (field, full, part) in [(reporting.Fields.GENES, genes_full, genes_part),
            (reporting.Fields.OPERONS, operons_full, operons_part)]:
            if full is None and part is None:
                res_file.write(' %-10s| %-10s|' % ('-', '-'))
            else:
                res_file.write(' %-10s| %-10s|' % (full, part))
                report.add_field(field, '%s + %s part' % (full, part))
        res_file.write('\n')
    res_file.close()

    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        if ref_genes_num:
            html_saver.save_features_in_contigs(output_dirpath, aligned_contigs_fpaths, 'features', files_features_in_contigs, ref_genes_num)
        if ref_operons_num:
            html_saver.save_features_in_contigs(output_dirpath, aligned_contigs_fpaths, 'operons', files_operons_in_contigs, ref_operons_num)

    if qconfig.draw_plots:
        # cumulative plots:
        from . import plotter
        from quast_libs.ca_utils.misc import contigs_aligned_lengths
        if ref_genes_num:
            plotter.genes_operons_plot(ref_genes_num, aligned_contigs_fpaths, files_features_in_contigs,
                genome_stats_dirpath + '/features_cumulative_plot', 'genomic features')
            plotter.frc_plot(output_dirpath, ref_fpath, aligned_contigs_fpaths, contigs_aligned_lengths, files_unsorted_features_in_contigs,
                             genome_stats_dirpath + '/features_frcurve_plot', 'genomic features')
            plotter.histogram(aligned_contigs_fpaths, full_found_genes, genome_stats_dirpath + '/complete_features_histogram',
                '# complete genomic features')
        if ref_operons_num:
            plotter.genes_operons_plot(ref_operons_num, aligned_contigs_fpaths, files_operons_in_contigs,
                genome_stats_dirpath + '/operons_cumulative_plot', 'operons')
            plotter.frc_plot(output_dirpath, ref_fpath, aligned_contigs_fpaths, contigs_aligned_lengths, files_unsorted_operons_in_contigs,
                             genome_stats_dirpath + '/operons_frcurve_plot', 'operons')
            plotter.histogram(aligned_contigs_fpaths, full_found_operons, genome_stats_dirpath + '/complete_operons_histogram',
                '# complete operons')
        plotter.histogram(aligned_contigs_fpaths, genome_mapped, genome_stats_dirpath + '/genome_fraction_histogram',
            'Genome fraction, %', top_value=100)

    logger.main_info('Done.')
    return containers
Example #15
def save_result(result, report, fname, ref_fpath):
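    # Copies the contig-analyzer result dictionary into report fields:
    # misassembly counts (total, per type and per reference for combined
    # references), unaligned/ambiguous contig statistics, and mismatch/indel
    # rates per 100 kbp of aligned bases.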
    region_misassemblies = result['region_misassemblies']
    misassemblies_by_ref = result['misassemblies_by_ref']
    region_struct_variations = result['region_struct_variations']
    misassemblies_matched_sv = result['misassemblies_matched_sv']
    misassembled_contigs = result['misassembled_contigs']
    misassembled_bases = result['misassembled_bases']
    misassembly_internal_overlap = result['misassembly_internal_overlap']
    unaligned = result['unaligned']
    partially_unaligned = result['partially_unaligned']
    partially_unaligned_bases = result['partially_unaligned_bases']
    fully_unaligned_bases = result['fully_unaligned_bases']
    ambiguous_contigs = result['ambiguous_contigs']
    ambiguous_contigs_extra_bases = result['ambiguous_contigs_extra_bases']
    SNPs = result['SNPs']
    indels_list = result['indels_list']
    total_aligned_bases = result['total_aligned_bases']
    half_unaligned_with_misassembly = result['half_unaligned_with_misassembly']

    report.add_field(reporting.Fields.MISLOCAL, region_misassemblies.count(Misassembly.LOCAL))
    report.add_field(reporting.Fields.MISASSEMBL, region_misassemblies.count(Misassembly.RELOCATION) +
                     region_misassemblies.count(Misassembly.INVERSION) + region_misassemblies.count(Misassembly.TRANSLOCATION) +
                     region_misassemblies.count(Misassembly.INTERSPECTRANSLOCATION))
    report.add_field(reporting.Fields.MISCONTIGS, len(misassembled_contigs))
    report.add_field(reporting.Fields.MISCONTIGSBASES, misassembled_bases)
    report.add_field(reporting.Fields.MISINTERNALOVERLAP, misassembly_internal_overlap)
    if qconfig.bed:
        report.add_field(reporting.Fields.STRUCT_VARIATIONS, misassemblies_matched_sv)
    report.add_field(reporting.Fields.UNALIGNED, '%d + %d part' % (unaligned, partially_unaligned))
    report.add_field(reporting.Fields.UNALIGNEDBASES, (fully_unaligned_bases + partially_unaligned_bases))
    report.add_field(reporting.Fields.AMBIGUOUS, ambiguous_contigs)
    report.add_field(reporting.Fields.AMBIGUOUSEXTRABASES, ambiguous_contigs_extra_bases)
    report.add_field(reporting.Fields.MISMATCHES, SNPs)
    # different types of indels:
    if indels_list is not None:
        report.add_field(reporting.Fields.INDELS, len(indels_list))
        report.add_field(reporting.Fields.INDELSBASES, sum(indels_list))
        report.add_field(reporting.Fields.MIS_SHORT_INDELS, len([i for i in indels_list if i <= qconfig.SHORT_INDEL_THRESHOLD]))
        report.add_field(reporting.Fields.MIS_LONG_INDELS, len([i for i in indels_list if i > qconfig.SHORT_INDEL_THRESHOLD]))

    if total_aligned_bases:
        report.add_field(reporting.Fields.SUBSERROR, "%.2f" % (float(SNPs) * 100000.0 / float(total_aligned_bases)))
        report.add_field(reporting.Fields.INDELSERROR, "%.2f" % (float(report.get_field(reporting.Fields.INDELS))
                                                                 * 100000.0 / float(total_aligned_bases)))

    # for misassemblies report:
    report.add_field(reporting.Fields.MIS_ALL_EXTENSIVE, region_misassemblies.count(Misassembly.RELOCATION) +
                     region_misassemblies.count(Misassembly.INVERSION) + region_misassemblies.count(Misassembly.TRANSLOCATION) +
                     region_misassemblies.count(Misassembly.INTERSPECTRANSLOCATION))
    report.add_field(reporting.Fields.MIS_RELOCATION, region_misassemblies.count(Misassembly.RELOCATION))
    report.add_field(reporting.Fields.MIS_TRANSLOCATION, region_misassemblies.count(Misassembly.TRANSLOCATION))
    report.add_field(reporting.Fields.MIS_INVERTION, region_misassemblies.count(Misassembly.INVERSION))
    report.add_field(reporting.Fields.MIS_EXTENSIVE_CONTIGS, len(misassembled_contigs))
    report.add_field(reporting.Fields.MIS_EXTENSIVE_BASES, misassembled_bases)
    report.add_field(reporting.Fields.MIS_LOCAL, region_misassemblies.count(Misassembly.LOCAL))
    if qconfig.is_combined_ref:
        report.add_field(reporting.Fields.MIS_ISTRANSLOCATIONS, region_misassemblies.count(Misassembly.INTERSPECTRANSLOCATION))
        report.add_field(reporting.Fields.CONTIGS_WITH_ISTRANSLOCATIONS, region_misassemblies.count(Misassembly.POTENTIALLY_MIS_CONTIGS))
        report.add_field(reporting.Fields.POSSIBLE_MISASSEMBLIES, region_misassemblies.count(Misassembly.POSSIBLE_MISASSEMBLIES))
        all_references = sorted(list(set([ref for ref in ref_labels_by_chromosomes.values()])))
        for ref_name in all_references:
            subreport = reporting.get(fname, ref_name=ref_name)
            ref_misassemblies = misassemblies_by_ref[ref_name]
            subreport.add_field(reporting.Fields.MIS_ALL_EXTENSIVE, ref_misassemblies.count(Misassembly.RELOCATION) +
                                ref_misassemblies.count(Misassembly.INVERSION) + ref_misassemblies.count(Misassembly.TRANSLOCATION) +
                                ref_misassemblies.count(Misassembly.INTERSPECTRANSLOCATION))
            subreport.add_field(reporting.Fields.MIS_RELOCATION, ref_misassemblies.count(Misassembly.RELOCATION))
            subreport.add_field(reporting.Fields.MIS_TRANSLOCATION, ref_misassemblies.count(Misassembly.TRANSLOCATION))
            subreport.add_field(reporting.Fields.MIS_INVERTION, ref_misassemblies.count(Misassembly.INVERSION))
            subreport.add_field(reporting.Fields.MIS_ISTRANSLOCATIONS, ref_misassemblies.count(Misassembly.INTERSPECTRANSLOCATION))
            subreport.add_field(reporting.Fields.MIS_LOCAL, ref_misassemblies.count(Misassembly.LOCAL))
            subreport.add_field(reporting.Fields.POSSIBLE_MISASSEMBLIES, ref_misassemblies.count(Misassembly.POSSIBLE_MISASSEMBLIES))
            subreport.add_field(reporting.Fields.CONTIGS_WITH_ISTRANSLOCATIONS, ref_misassemblies.count(Misassembly.POTENTIALLY_MIS_CONTIGS))
            if fname not in qconfig.dict_of_broken_scaffolds:
                subreport.add_field(reporting.Fields.MIS_SCAFFOLDS_GAP, ref_misassemblies.count(Misassembly.SCAFFOLD_GAP))
            if qconfig.check_for_fragmented_ref:
                subreport.add_field(reporting.Fields.MIS_FRAGMENTED, ref_misassemblies.count(Misassembly.FRAGMENTED))
    elif intergenomic_misassemblies_by_asm:
        label = qutils.label_from_fpath(fname)
        ref_name = qutils.name_from_fpath(ref_fpath)
        ref_misassemblies = intergenomic_misassemblies_by_asm[label][ref_name]
        report.add_field(reporting.Fields.MIS_ISTRANSLOCATIONS, ref_misassemblies.count(Misassembly.INTERSPECTRANSLOCATION))
        report.add_field(reporting.Fields.POSSIBLE_MISASSEMBLIES, ref_misassemblies.count(Misassembly.POSSIBLE_MISASSEMBLIES))
        report.add_field(reporting.Fields.CONTIGS_WITH_ISTRANSLOCATIONS, ref_misassemblies.count(Misassembly.POTENTIALLY_MIS_CONTIGS))
    if fname not in qconfig.dict_of_broken_scaffolds:
        report.add_field(reporting.Fields.MIS_SCAFFOLDS_GAP, region_misassemblies.count(Misassembly.SCAFFOLD_GAP))
    if qconfig.check_for_fragmented_ref:
        report.add_field(reporting.Fields.MIS_FRAGMENTED, region_misassemblies.count(Misassembly.FRAGMENTED))
    # for unaligned report:
    report.add_field(reporting.Fields.UNALIGNED_FULL_CNTGS, unaligned)
    report.add_field(reporting.Fields.UNALIGNED_FULL_LENGTH, fully_unaligned_bases)
    report.add_field(reporting.Fields.UNALIGNED_PART_CNTGS, partially_unaligned)
    report.add_field(reporting.Fields.UNALIGNED_PART_LENGTH, partially_unaligned_bases)
    report.add_field(reporting.Fields.UNALIGNED_MISASSEMBLED_CTGS, half_unaligned_with_misassembly)
    return report
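The SUBSERROR and INDELSERROR fields above are plain counts normalized to events per 100 kbp of aligned sequence, and the indel list is split at a fixed length threshold. A minimal, self-contained sketch of that arithmetic, assuming an illustrative 5 bp short-indel threshold (hypothetical helpers, not QUAST's own reporting API):

# Standalone sketch (not QUAST code): per-100-kbp error rates and indel classification.
SHORT_INDEL_THRESHOLD = 5  # illustrative value for the "short" indel cutoff, in bp

def per_100_kbp(event_count, aligned_bases):
    # Normalize an event count to events per 100,000 aligned bases.
    return float(event_count) * 100000.0 / float(aligned_bases)

def split_indels(indels_list, threshold=SHORT_INDEL_THRESHOLD):
    # Split indel lengths into short (<= threshold) and long (> threshold) counts.
    short = len([i for i in indels_list if i <= threshold])
    long_ = len([i for i in indels_list if i > threshold])
    return short, long_

if __name__ == '__main__':
    indels = [1, 2, 7, 1, 12]
    print('%.2f' % per_100_kbp(len(indels), 3500000))  # 0.14 indels per 100 kbp
    print(split_indels(indels))                        # (3, 2)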
Exemplo n.º 16
0
def do(reference, contigs_fpaths, is_cyclic, output_dir, old_contigs_fpaths, bed_fpath=None):
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    logger.print_timestamp()
    logger.main_info('Running Contig analyzer...')
    success_compilation = compile_aligner(logger)
    if not success_compilation:
        logger.main_info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.')
        return dict(zip(contigs_fpaths, [NucmerStatus.FAILED] * len(contigs_fpaths))), None

    if qconfig.draw_plots:
        compile_gnuplot(logger, only_clean=False)

    num_nf_errors = logger._num_nf_errors
    create_nucmer_output_dir(output_dir)
    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    threads = max(1, qconfig.max_threads // n_jobs)
    if is_python2():
        from joblib import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed
    if not qconfig.splitted_ref and not qconfig.memory_efficient:
        statuses_results_lengths_tuples = Parallel(n_jobs=n_jobs)(delayed(align_and_analyze)(
            is_cyclic, i, contigs_fpath, output_dir, reference, old_contigs_fpath, bed_fpath, threads=threads)
            for i, (contigs_fpath, old_contigs_fpath) in enumerate(zip(contigs_fpaths, old_contigs_fpaths)))
    else:
        if len(contigs_fpaths) >= len(qconfig.splitted_ref) and not qconfig.memory_efficient:
            statuses_results_lengths_tuples = Parallel(n_jobs=n_jobs)(delayed(align_and_analyze)(
                is_cyclic, i, contigs_fpath, output_dir, reference, old_contigs_fpath, bed_fpath, threads=threads)
                for i, (contigs_fpath, old_contigs_fpath) in enumerate(zip(contigs_fpaths, old_contigs_fpaths)))
        else:
            statuses_results_lengths_tuples = []
            for i, (contigs_fpath, old_contigs_fpath) in enumerate(zip(contigs_fpaths, old_contigs_fpaths)):
                statuses_results_lengths_tuples.append(align_and_analyze(
                    is_cyclic, i, contigs_fpath, output_dir, reference, old_contigs_fpath, bed_fpath,
                    parallel_by_chr=True, threads=qconfig.max_threads))

    # unzipping
    statuses, results, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs =\
        [[x[i] for x in statuses_results_lengths_tuples] for i in range(5)]
    reports = []

    nucmer_statuses = dict(zip(contigs_fpaths, statuses))
    aligned_lengths_per_fpath = dict(zip(contigs_fpaths, aligned_lengths))
    misc.contigs_aligned_lengths = dict(zip(contigs_fpaths, aligned_lengths_by_contigs))

    if NucmerStatus.OK in nucmer_statuses.values():
        if qconfig.is_combined_ref:
            save_combined_ref_stats(results, contigs_fpaths, ref_labels_by_chromosomes, output_dir, logger)

    for index, fname in enumerate(contigs_fpaths):
        report = reporting.get(fname)
        if statuses[index] == NucmerStatus.OK:
            reports.append(save_result(results[index], report, fname, reference))
        elif statuses[index] == NucmerStatus.NOT_ALIGNED:
            save_result_for_unaligned(results[index], report)

    if NucmerStatus.OK in nucmer_statuses.values():
        reporting.save_misassemblies(output_dir)
        reporting.save_unaligned(output_dir)
        from . import plotter
        if qconfig.draw_plots:
            plotter.draw_misassemblies_plot(reports, join(output_dir, 'misassemblies_plot'), 'Misassemblies')
        if qconfig.draw_plots or qconfig.html_report:
            misassemblies_in_contigs = dict((contigs_fpaths[i], misassemblies_in_contigs[i]) for i in range(len(contigs_fpaths)))
            plotter.frc_plot(dirname(output_dir), reference, contigs_fpaths, misc.contigs_aligned_lengths, misassemblies_in_contigs,
                             join(output_dir, 'misassemblies_frcurve_plot'), 'misassemblies')

    oks = list(nucmer_statuses.values()).count(NucmerStatus.OK)
    not_aligned = list(nucmer_statuses.values()).count(NucmerStatus.NOT_ALIGNED)
    failed = list(nucmer_statuses.values()).count(NucmerStatus.FAILED)
    errors = list(nucmer_statuses.values()).count(NucmerStatus.ERROR)
    problems = not_aligned + failed + errors
    total = len(nucmer_statuses)

    logger._num_nf_errors = num_nf_errors + errors

    if oks == total:
        logger.main_info('Done.')
    if oks < total and problems < total:
        logger.main_info('Done for ' + str(total - problems) + ' out of ' + str(total) + '. For the rest, only basic stats are going to be evaluated.')
    if problems == total:
        logger.main_info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.')

    return nucmer_statuses, aligned_lengths_per_fpath
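The "# unzipping" step above transposes a list of per-assembly 5-tuples into five parallel lists. A minimal sketch of the same transposition, assuming each worker returns (status, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs); zip(*...) is an equivalent, more common idiom:

# Standalone sketch (not QUAST code): transposing per-assembly result tuples.
tuples = [('OK', {'n': 1}, [100], 0, [100]),
          ('OK', {'n': 2}, [200, 50], 1, [200, 50])]

# Form used in the example above:
statuses, results, lengths, mis, lengths_by_ctg = [[t[i] for t in tuples] for i in range(5)]

# Equivalent, more idiomatic form:
statuses2, results2, lengths2, mis2, lengths_by_ctg2 = map(list, zip(*tuples))

assert statuses == statuses2 == ['OK', 'OK']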
Exemplo n.º 17
0
def main(args):
    check_dirpath(
        qconfig.QUAST_HOME,
        'You are trying to run it from ' + str(qconfig.QUAST_HOME) + '.\n' +
        'Please, put QUAST in a different directory, then try again.\n',
        exit_code=3)

    if not args:
        qconfig.usage(stream=sys.stderr)
        sys.exit(1)

    try:
        import imp
        imp.reload(qconfig)
        imp.reload(qutils)
    except:
        reload(qconfig)
        reload(qutils)

    try:
        locale.setlocale(locale.LC_ALL, 'en_US.utf8')
    except Exception:
        try:
            locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
        except Exception:
            logger.warning('Python locale settings can\'t be changed')
    quast_path = [os.path.realpath(__file__)]
    quast_py_args, contigs_fpaths = parse_options(logger, quast_path + args)
    output_dirpath, ref_fpath, labels = qconfig.output_dirpath, qconfig.reference, qconfig.labels
    corrected_dirpath = os.path.join(output_dirpath, qconfig.corrected_dirname)
    logger.main_info()
    logger.print_params()

    ########################################################################
    from quast_libs import reporting
    reports = reporting.reports
    try:
        import imp
        imp.reload(reporting)
    except:
        reload(reporting)
    reporting.reports = reports
    reporting.assembly_fpaths = []
    from quast_libs import plotter  # Do not remove this line! It would lead to a warning in matplotlib.

    if qconfig.is_combined_ref:
        corrected_dirpath = os.path.join(output_dirpath, '..',
                                         qconfig.corrected_dirname)
    else:
        if os.path.isdir(corrected_dirpath):
            shutil.rmtree(corrected_dirpath)
        os.mkdir(corrected_dirpath)

    qconfig.set_max_threads(logger)
    check_reads_fpaths(logger)
    # PROCESSING REFERENCE
    if ref_fpath:
        logger.main_info()
        logger.main_info('Reference:')
        original_ref_fpath = ref_fpath
        ref_fpath = qutils.correct_reference(ref_fpath, corrected_dirpath)
        if qconfig.optimal_assembly:
            if not qconfig.pacbio_reads and not qconfig.nanopore_reads and not qconfig.mate_pairs:
                logger.warning(
                    'Optimal assembly cannot be created. It requires mate-pairs or long reads (Pacbio SMRT or Oxford Nanopore).'
                )
            else:
                optimal_assembly_fpath = optimal_assembly.do(
                    ref_fpath, original_ref_fpath,
                    os.path.join(output_dirpath,
                                 qconfig.optimal_assembly_basename))
                if optimal_assembly_fpath is not None:
                    contigs_fpaths.insert(0, optimal_assembly_fpath)
                    labels.insert(0, 'Optimal')
                    labels = qutils.process_labels(contigs_fpaths, labels)
    else:
        ref_fpath = ''

    # PROCESSING CONTIGS
    logger.main_info()
    logger.main_info('Contigs:')

    contigs_fpaths, old_contigs_fpaths = qutils.correct_contigs(
        contigs_fpaths, corrected_dirpath, labels, reporting)
    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        report.add_field(reporting.Fields.NAME,
                         qutils.label_from_fpath(contigs_fpath))

    qconfig.assemblies_num = len(contigs_fpaths)

    cov_fpath = qconfig.cov_fpath
    physical_cov_fpath = qconfig.phys_cov_fpath
    if qconfig.reads_fpaths or qconfig.reference_sam or qconfig.reference_bam or qconfig.sam_fpaths or qconfig.bam_fpaths:
        bed_fpath, cov_fpath, physical_cov_fpath = reads_analyzer.do(
            ref_fpath,
            contigs_fpaths,
            os.path.join(output_dirpath, qconfig.reads_stats_dirname),
            external_logger=logger)
        qconfig.bed = bed_fpath

    if not contigs_fpaths:
        logger.error(
            "None of the assembly files contains correct contigs. "
            "Please, provide different files or decrease --min-contig threshold.",
            fake_if_nested_run=True)
        return 4

    if qconfig.used_colors and qconfig.used_ls:
        for i, label in enumerate(labels):
            plotter_data.dict_color_and_ls[label] = (qconfig.used_colors[i],
                                                     qconfig.used_ls[i])

    qconfig.assemblies_fpaths = contigs_fpaths

    # Where all pdfs will be saved
    all_pdf_fpath = None
    if qconfig.draw_plots and plotter.can_draw_plots:
        all_pdf_fpath = os.path.join(output_dirpath, qconfig.plots_fname)

    if qconfig.json_output_dirpath:
        from quast_libs.html_saver import json_saver
        if json_saver.simplejson_error:
            qconfig.json_output_dirpath = None

    ########################################################################
    ### Stats and plots
    ########################################################################
    from quast_libs import basic_stats
    icarus_gc_fpath, circos_gc_fpath = basic_stats.do(
        ref_fpath, contigs_fpaths, os.path.join(output_dirpath, 'basic_stats'),
        output_dirpath)

    if qconfig.large_genome and ref_fpath:
        unique_kmers.do(os.path.join(output_dirpath, 'basic_stats'), ref_fpath,
                        contigs_fpaths, logger)

    aligned_contigs_fpaths = []
    aligned_lengths_lists = []
    contig_alignment_plot_fpath = None
    icarus_html_fpath = None
    circos_png_fpath = None
    if ref_fpath:
        ########################################################################
        ### former PLANTAKOLYA, PLANTAGORA
        ########################################################################
        from quast_libs import contigs_analyzer
        is_cyclic = qconfig.prokaryote and not qconfig.check_for_fragmented_ref
        aligner_statuses, aligned_lengths_per_fpath = contigs_analyzer.do(
            ref_fpath, contigs_fpaths, is_cyclic,
            os.path.join(output_dirpath, 'contigs_reports'),
            old_contigs_fpaths, qconfig.bed)
        for contigs_fpath in contigs_fpaths:
            if aligner_statuses[
                    contigs_fpath] == contigs_analyzer.AlignerStatus.OK:
                aligned_contigs_fpaths.append(contigs_fpath)
                aligned_lengths_lists.append(
                    aligned_lengths_per_fpath[contigs_fpath])

    # Before continuing the evaluation, check whether the aligner skipped all of the contigs files.
    detailed_contigs_reports_dirpath = None
    features_containers = None
    if len(aligned_contigs_fpaths) and ref_fpath:
        detailed_contigs_reports_dirpath = os.path.join(
            output_dirpath, 'contigs_reports')

        ########################################################################
        ### NAx and NGAx ("aligned Nx and NGx")
        ########################################################################
        from quast_libs import aligned_stats
        aligned_stats.do(ref_fpath, aligned_contigs_fpaths, output_dirpath,
                         aligned_lengths_lists,
                         os.path.join(output_dirpath, 'aligned_stats'))

        ########################################################################
        ### GENOME_ANALYZER
        ########################################################################
        from quast_libs import genome_analyzer
        features_containers = genome_analyzer.do(
            ref_fpath, aligned_contigs_fpaths, output_dirpath,
            qconfig.features, qconfig.operons,
            detailed_contigs_reports_dirpath,
            os.path.join(output_dirpath, 'genome_stats'))

    genes_by_labels = None
    if qconfig.gene_finding:
        if qconfig.glimmer:
            ########################################################################
            ### Glimmer
            ########################################################################
            from quast_libs import glimmer
            genes_by_labels = glimmer.do(
                contigs_fpaths, qconfig.genes_lengths,
                os.path.join(output_dirpath, 'predicted_genes'))
        if not qconfig.glimmer or qconfig.test:
            ########################################################################
            ### GeneMark
            ########################################################################
            from quast_libs import genemark
            genes_by_labels = genemark.do(
                contigs_fpaths, qconfig.genes_lengths,
                os.path.join(output_dirpath, 'predicted_genes'),
                qconfig.prokaryote, qconfig.metagenemark)
    else:
        logger.main_info("")
        logger.notice(
            "Genes are not predicted by default. Use --gene-finding option to enable it."
        )

    if qconfig.rna_gene_finding:
        run_barrnap.do(contigs_fpaths,
                       os.path.join(output_dirpath, 'predicted_genes'), logger)

    if qconfig.run_busco and not qconfig.is_combined_ref:
        if qconfig.platform_name == 'macosx':
            logger.main_info("")
            logger.warning("BUSCO can be run on Linux only")
        elif sys.version_info[:2] < (2, 6):
            logger.main_info("")
            logger.warning(
                "BUSCO does not support Python versions older than 2.6.")
        else:
            from quast_libs import run_busco
            run_busco.do(contigs_fpaths,
                         os.path.join(output_dirpath, qconfig.busco_dirname),
                         logger)
    ########################################################################
    reports_fpaths, transposed_reports_fpaths = reporting.save_total(
        output_dirpath)

    ########################################################################
    ### LARGE DRAWING TASKS
    ########################################################################
    if qconfig.draw_plots or qconfig.create_icarus_html:
        logger.print_timestamp()
        logger.main_info('Creating large visual summaries...')
        logger.main_info(
            'This may take a while: press Ctrl-C to skip this step..')
        try:
            if detailed_contigs_reports_dirpath:
                report_for_icarus_fpath_pattern = os.path.join(
                    detailed_contigs_reports_dirpath,
                    qconfig.icarus_report_fname_pattern)
                stdout_pattern = os.path.join(
                    detailed_contigs_reports_dirpath,
                    qconfig.contig_report_fname_pattern)
            else:
                report_for_icarus_fpath_pattern = None
                stdout_pattern = None
            draw_alignment_plots = qconfig.draw_svg or qconfig.create_icarus_html
            draw_circos_plot = qconfig.draw_plots and ref_fpath and len(
                aligned_contigs_fpaths) and not qconfig.space_efficient
            number_of_steps = sum([
                int(bool(value)) for value in
                [draw_alignment_plots, draw_circos_plot, all_pdf_fpath]
            ])
            if draw_alignment_plots:
                ########################################################################
                ### VISUALIZE CONTIG ALIGNMENT
                ########################################################################
                logger.main_info('  1 of %d: Creating Icarus viewers...' %
                                 number_of_steps)
                from quast_libs import icarus
                icarus_html_fpath, contig_alignment_plot_fpath = icarus.do(
                    contigs_fpaths,
                    report_for_icarus_fpath_pattern,
                    output_dirpath,
                    ref_fpath,
                    stdout_pattern=stdout_pattern,
                    features=features_containers,
                    cov_fpath=cov_fpath,
                    physical_cov_fpath=physical_cov_fpath,
                    gc_fpath=icarus_gc_fpath,
                    json_output_dir=qconfig.json_output_dirpath,
                    genes_by_labels=genes_by_labels)

            if draw_circos_plot:
                logger.main_info(
                    '  %d of %d: Creating Circos plots...' %
                    (2 if draw_alignment_plots else 1, number_of_steps))
                from quast_libs import circos
                circos_png_fpath, circos_legend_fpath = circos.do(
                    ref_fpath, contigs_fpaths, report_for_icarus_fpath_pattern,
                    circos_gc_fpath, features_containers, cov_fpath,
                    os.path.join(output_dirpath, 'circos'), logger)

            if all_pdf_fpath:
                # full report in PDF format: all tables and plots
                logger.main_info(
                    '  %d of %d: Creating PDF with all tables and plots...' %
                    (number_of_steps, number_of_steps))
                plotter.fill_all_pdf_file(all_pdf_fpath)
            logger.main_info('Done')
        except KeyboardInterrupt:
            logger.main_info('..step skipped!')
            if all_pdf_fpath and os.path.isfile(all_pdf_fpath):
                os.remove(all_pdf_fpath)

    ########################################################################
    ### TOTAL REPORT
    ########################################################################
    logger.print_timestamp()
    logger.main_info('RESULTS:')
    logger.main_info('  Text versions of total report are saved to ' +
                     reports_fpaths)
    logger.main_info(
        '  Text versions of transposed total report are saved to ' +
        transposed_reports_fpaths)

    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        html_saver.save_colors(output_dirpath, contigs_fpaths,
                               plotter_data.dict_color_and_ls)
        html_saver.save_total_report(output_dirpath, qconfig.min_contig,
                                     ref_fpath)

    if all_pdf_fpath and os.path.isfile(all_pdf_fpath):
        logger.main_info('  PDF version (tables and plots) is saved to ' +
                         all_pdf_fpath)

    if circos_png_fpath:
        logger.main_info(
            '  Circos plot is saved to %s (the annotation is in %s). Circos configuration file is saved to %s'
            % (circos_png_fpath, circos_legend_fpath,
               circos_png_fpath.replace('.png', '.conf')))

    if icarus_html_fpath:
        logger.main_info('  Icarus (contig browser) is saved to %s' %
                         icarus_html_fpath)

    if qconfig.draw_svg and contig_alignment_plot_fpath:
        logger.main_info('  Contig alignment plot is saved to %s' %
                         contig_alignment_plot_fpath)

    cleanup(corrected_dirpath)
    return logger.finish_up(check_test=qconfig.test)
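The large-drawing block above wraps the slow plotting steps in a try/except so the user can skip them with Ctrl-C while the rest of the run continues, and any half-written PDF is removed on interrupt. A minimal sketch of that pattern, with a hypothetical slow_plot() and output path standing in for the real drawing calls:

# Standalone sketch (not QUAST code): making an expensive, optional step skippable with Ctrl-C.
import os
import time

def slow_plot(out_fpath):
    # Hypothetical placeholder for a long-running drawing task.
    with open(out_fpath, 'w') as f:
        for _ in range(5):
            time.sleep(1)
            f.write('chunk\n')

out_fpath = 'all_plots.pdf'  # hypothetical output path
try:
    slow_plot(out_fpath)
    print('Done')
except KeyboardInterrupt:
    print('..step skipped!')
    if os.path.isfile(out_fpath):
        os.remove(out_fpath)  # do not leave a partial file behind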
Exemplo n.º 18
0
def do(output_dir, ref_fpath, contigs_fpaths, logger):
    logger.print_timestamp()
    kmer_len = qconfig.unique_kmer_len
    logger.main_info('Running analysis based on unique ' + str(kmer_len) +
                     '-mers...')

    checked_assemblies = []
    for contigs_fpath in contigs_fpaths:
        label = qutils.label_from_fpath_for_fname(contigs_fpath)
        if check_kmc_successful_check(output_dir, contigs_fpath,
                                      contigs_fpaths, ref_fpath):
            kmc_stats_fpath = join(output_dir, label + '.stat')
            stats_content = open(kmc_stats_fpath).read().split('\n')
            if len(stats_content) < 1 or not stats_content[0].strip():
                continue
            logger.info('  Using existing results for ' + label + '... ')
            report = reporting.get(contigs_fpath)
            report.add_field(
                reporting.Fields.KMER_COMPLETENESS,
                '%.2f' % float(stats_content[0].strip().split(': ')[-1]))
            if len(stats_content) >= 7:
                corr_len = int(stats_content[1].strip().split(': ')[-1])
                mis_len = int(stats_content[2].strip().split(': ')[-1])
                undef_len = int(stats_content[3].strip().split(': ')[-1])
                total_len = int(stats_content[4].strip().split(': ')[-1])
                translocations = int(stats_content[5].strip().split(': ')[-1])
                relocations = int(stats_content[6].strip().split(': ')[-1])
                report.add_field(reporting.Fields.KMER_CORR_LENGTH,
                                 '%.2f' % (corr_len * 100.0 / total_len))
                report.add_field(reporting.Fields.KMER_MIS_LENGTH,
                                 '%.2f' % (mis_len * 100.0 / total_len))
                report.add_field(reporting.Fields.KMER_UNDEF_LENGTH,
                                 '%.2f' % (undef_len * 100.0 / total_len))
                report.add_field(reporting.Fields.KMER_TRANSLOCATIONS,
                                 translocations)
                report.add_field(reporting.Fields.KMER_RELOCATIONS,
                                 relocations)
                report.add_field(reporting.Fields.KMER_MISASSEMBLIES,
                                 translocations + relocations)
            checked_assemblies.append(contigs_fpath)

    contigs_fpaths = [
        fpath for fpath in contigs_fpaths if fpath not in checked_assemblies
    ]
    if len(contigs_fpaths) == 0:
        save_kmers(output_dir)
        logger.info('Done.')
        return

    if qconfig.platform_name == 'linux_32':
        logger.warning('  Sorry, can\'t run KMC on this platform, skipping...')
        return None

    kmc_dirpath = get_dir_for_download(kmc_dirname, 'KMC',
                                       ['kmc', 'kmc_tools'], logger)
    global kmc_bin_fpath
    global kmc_tools_fpath
    kmc_bin_fpath = download_external_tool('kmc',
                                           kmc_dirpath,
                                           'KMC',
                                           platform_specific=True,
                                           is_executable=True)
    kmc_tools_fpath = download_external_tool('kmc_tools',
                                             kmc_dirpath,
                                             'KMC',
                                             platform_specific=True,
                                             is_executable=True)
    if not exists(kmc_bin_fpath) or not exists(
            kmc_tools_fpath) or not compile_minimap(logger):
        logger.warning('  Sorry, can\'t run KMC, skipping...')
        return None

    logger.info('  Running KMC on reference...')
    if not isdir(output_dir):
        os.makedirs(output_dir)
    log_fpath = join(output_dir, 'kmc.log')
    err_fpath = join(output_dir, 'kmc.err')
    open(log_fpath, 'w').close()
    open(err_fpath, 'w').close()

    tmp_dirpath = join(output_dir, 'tmp')
    if not isdir(tmp_dirpath):
        os.makedirs(tmp_dirpath)
    ref_kmc_out_fpath = count_kmers(tmp_dirpath, ref_fpath, kmer_len,
                                    log_fpath, err_fpath)
    unique_kmers = get_kmers_cnt(tmp_dirpath, ref_kmc_out_fpath, log_fpath,
                                 err_fpath)
    if not unique_kmers:
        logger.warning('KMC failed, check ' + log_fpath + ' and ' + err_fpath +
                       '. Skipping...')
        return

    logger.info('  Analyzing assemblies completeness...')
    kmc_out_fpaths = []
    for id, contigs_fpath in enumerate(contigs_fpaths):
        assembly_label = qutils.label_from_fpath(contigs_fpath)
        logger.info('    ' + qutils.index_to_str(id) + assembly_label)

        report = reporting.get(contigs_fpath)
        kmc_out_fpath = count_kmers(tmp_dirpath, contigs_fpath, kmer_len,
                                    log_fpath, err_fpath)
        intersect_out_fpath = intersect_kmers(
            tmp_dirpath, [ref_kmc_out_fpath, kmc_out_fpath], log_fpath,
            err_fpath)
        matched_kmers = get_kmers_cnt(tmp_dirpath, intersect_out_fpath,
                                      log_fpath, err_fpath)
        completeness = matched_kmers * 100.0 / unique_kmers
        report.add_field(reporting.Fields.KMER_COMPLETENESS,
                         '%.2f' % completeness)
        kmc_out_fpaths.append(intersect_out_fpath)

    logger.info('  Analyzing assemblies correctness...')
    ref_contigs = [name for name, _ in read_fasta(ref_fpath)]
    logger.info('    Downsampling k-mers...')
    ref_kmers, downsampled_kmers_fpath = downsample_kmers(
        tmp_dirpath, ref_fpath, ref_kmc_out_fpath, kmer_len, log_fpath,
        err_fpath)
    for id, (contigs_fpath,
             kmc_db_fpath) in enumerate(zip(contigs_fpaths, kmc_out_fpaths)):
        assembly_label = qutils.label_from_fpath(contigs_fpath)
        logger.info('    ' + qutils.index_to_str(id) + assembly_label)

        report = reporting.get(contigs_fpath)
        corr_len = None
        mis_len = None
        undef_len = None
        translocations, relocations = None, None
        total_len = 0
        contig_lens = dict()
        for name, seq in read_fasta(contigs_fpath):
            total_len += len(seq)
            contig_lens[name] = len(seq)

        if len(ref_contigs) > MAX_REF_CONTIGS_NUM:
            logger.warning(
                'Reference is too fragmented. Scaffolding accuracy will not be assessed.'
            )
        else:
            corr_len = 0
            mis_len = 0
            kmers_by_contig, kmers_pos_by_contig = align_kmers(
                tmp_dirpath, contigs_fpath, downsampled_kmers_fpath, err_fpath,
                qconfig.max_threads)
            is_cyclic = qconfig.prokaryote and not qconfig.check_for_fragmented_ref
            cyclic_ref_lens = report.get_field(
                reporting.Fields.REFLEN) if is_cyclic else None
            translocations = 0
            relocations = 0
            with open(
                    join(
                        tmp_dirpath,
                        qutils.label_from_fpath_for_fname(contigs_fpath) +
                        '.misjoins.txt'), 'w') as out:
                for contig in kmers_by_contig.keys():
                    contig_markers = []
                    prev_pos, prev_ref_pos, prev_chrom, marker = None, None, None, None
                    for pos, kmer in sorted(zip(kmers_pos_by_contig[contig],
                                                kmers_by_contig[contig]),
                                            key=lambda x: x[0]):
                        ref_chrom, ref_pos = ref_kmers[kmer]
                        if prev_pos and prev_chrom:
                            if prev_chrom == ref_chrom and abs(
                                    abs(pos - prev_pos) /
                                    abs(ref_pos - prev_ref_pos) - 1) <= 0.05:
                                marker = (pos, ref_pos, ref_chrom)
                            elif marker:
                                contig_markers.append(marker)
                                pos, ref_pos, ref_chrom, marker = None, None, None, None
                        prev_pos, prev_ref_pos, prev_chrom = pos, ref_pos, ref_chrom
                    if marker:
                        contig_markers.append(marker)
                    prev_pos, prev_ref_pos, prev_chrom = None, None, None
                    is_misassembled = False
                    for marker in contig_markers:
                        pos, ref_pos, ref_chrom = marker
                        if prev_pos and prev_chrom:
                            if ref_chrom != prev_chrom:
                                translocations += 1
                                out.write(
                                    'Translocation in %s: %s %d | %s %d\n' %
                                    (contig, prev_chrom, prev_pos, ref_chrom,
                                     pos))
                                is_misassembled = True
                            elif _get_dist_inconstistency(
                                    pos, prev_pos, ref_pos, prev_ref_pos,
                                    cyclic_ref_lens) > EXT_RELOCATION_SIZE:
                                relocations += 1
                                out.write(
                                    'Relocation in %s: %d (%d) | %d (%d)\n' %
                                    (contig, prev_pos, prev_ref_pos, pos,
                                     ref_pos))
                                is_misassembled = True
                        prev_pos, prev_ref_pos, prev_chrom = pos, ref_pos, ref_chrom
                    if is_misassembled:
                        mis_len += contig_lens[contig]
                    elif len(contig_markers) > 0:
                        corr_len += contig_lens[contig]
            undef_len = total_len - corr_len - mis_len
            report.add_field(reporting.Fields.KMER_CORR_LENGTH,
                             '%.2f' % (corr_len * 100.0 / total_len))
            report.add_field(reporting.Fields.KMER_MIS_LENGTH,
                             '%.2f' % (mis_len * 100.0 / total_len))
            report.add_field(reporting.Fields.KMER_UNDEF_LENGTH,
                             '%.2f' % (undef_len * 100.0 / total_len))
            report.add_field(reporting.Fields.KMER_TRANSLOCATIONS,
                             translocations)
            report.add_field(reporting.Fields.KMER_RELOCATIONS, relocations)
            report.add_field(reporting.Fields.KMER_MISASSEMBLIES,
                             translocations + relocations)

        create_kmc_stats_file(
            output_dir, contigs_fpath, ref_fpath,
            report.get_field(reporting.Fields.KMER_COMPLETENESS), corr_len,
            mis_len, undef_len, total_len, translocations, relocations)
    save_kmers(output_dir)
    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)
    logger.info('Done.')
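The misjoin counting above first collapses consistent k-mer hits into markers and then compares consecutive markers: a change of reference chromosome is counted as a translocation, while a large inconsistency between the contig distance and the reference distance is counted as a relocation. A simplified, self-contained sketch of that classification (without the marker collapsing and the cyclic-reference handling), assuming anchors are (contig_pos, ref_chrom, ref_pos) tuples and an illustrative 100 kbp relocation threshold:

# Standalone sketch (not QUAST code): classifying putative misjoins from k-mer anchors.
EXT_RELOCATION_SIZE = 100000  # illustrative threshold, in bp

def classify_misjoins(anchors, relocation_threshold=EXT_RELOCATION_SIZE):
    # anchors: (contig_pos, ref_chrom, ref_pos) tuples for one contig.
    translocations, relocations = 0, 0
    anchors = sorted(anchors)  # by position in the contig
    for (pos, chrom, ref_pos), (prev_pos, prev_chrom, prev_ref_pos) in zip(anchors[1:], anchors):
        if chrom != prev_chrom:
            translocations += 1
        elif abs((ref_pos - prev_ref_pos) - (pos - prev_pos)) > relocation_threshold:
            relocations += 1
    return translocations, relocations

if __name__ == '__main__':
    anchors = [(100, 'chr1', 5000), (5100, 'chr1', 10000),
               (10100, 'chr1', 515000),   # jump on the same chromosome -> relocation
               (20100, 'chr2', 40000)]    # chromosome switch -> translocation
    print(classify_misjoins(anchors))     # (1, 1)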
Exemplo n.º 19
0
def save_result(result, report, fname, ref_fpath, genome_size):
    region_misassemblies = result['region_misassemblies']
    misassemblies_by_ref = result['misassemblies_by_ref']
    misassembled_contigs = result['misassembled_contigs']
    misassembled_bases = result['misassembled_bases']
    misassembly_internal_overlap = result['misassembly_internal_overlap']
    unaligned = result['unaligned']
    partially_unaligned = result['partially_unaligned']
    partially_unaligned_bases = result['partially_unaligned_bases']
    fully_unaligned_bases = result['fully_unaligned_bases']
    ambiguous_contigs = result['ambiguous_contigs']
    ambiguous_contigs_extra_bases = result['ambiguous_contigs_extra_bases']
    SNPs = result['SNPs']
    indels_list = result['indels_list']
    aligned_ref_bases = result['aligned_ref_bases']
    aligned_assembly_bases = result['aligned_assembly_bases']
    half_unaligned_with_misassembly = result['half_unaligned_with_misassembly']

    report.add_field(reporting.Fields.MISLOCAL,
                     region_misassemblies.count(Misassembly.LOCAL))
    report.add_field(
        reporting.Fields.MISASSEMBL,
        region_misassemblies.count(Misassembly.RELOCATION) +
        region_misassemblies.count(Misassembly.INVERSION) +
        region_misassemblies.count(Misassembly.TRANSLOCATION) +
        region_misassemblies.count(Misassembly.INTERSPECTRANSLOCATION))
    report.add_field(reporting.Fields.MISCONTIGS, len(misassembled_contigs))
    report.add_field(reporting.Fields.MISCONTIGSBASES, misassembled_bases)
    report.add_field(reporting.Fields.MISINTERNALOVERLAP,
                     misassembly_internal_overlap)
    if qconfig.bed:
        report.add_field(reporting.Fields.STRUCT_VARIATIONS,
                         region_misassemblies.count(Misassembly.MATCHED_SV))
    if qconfig.large_genome:
        report.add_field(reporting.Fields.POTENTIAL_MGE,
                         region_misassemblies.count(Misassembly.POTENTIAL_MGE))
    report.add_field(reporting.Fields.UNALIGNED,
                     '%d + %d part' % (unaligned, partially_unaligned))
    report.add_field(reporting.Fields.UNALIGNEDBASES,
                     (fully_unaligned_bases + partially_unaligned_bases))
    report.add_field(reporting.Fields.AMBIGUOUS, ambiguous_contigs)
    report.add_field(reporting.Fields.AMBIGUOUSEXTRABASES,
                     ambiguous_contigs_extra_bases)
    report.add_field(reporting.Fields.MISMATCHES, SNPs)
    # different types of indels:
    if indels_list is not None:
        report.add_field(reporting.Fields.INDELS, len(indels_list))
        report.add_field(reporting.Fields.INDELSBASES, sum(indels_list))
        report.add_field(
            reporting.Fields.MIS_SHORT_INDELS,
            len([i for i in indels_list
                 if i <= qconfig.SHORT_INDEL_THRESHOLD]))
        report.add_field(
            reporting.Fields.MIS_LONG_INDELS,
            len([i for i in indels_list if i > qconfig.SHORT_INDEL_THRESHOLD]))

    if aligned_ref_bases:
        genome_fraction = aligned_ref_bases * 100.0 / genome_size
        duplication_ratio = float(
            aligned_assembly_bases + misassembly_internal_overlap +
            ambiguous_contigs_extra_bases) / aligned_ref_bases
        report.add_field(reporting.Fields.MAPPEDGENOME,
                         '%.3f' % genome_fraction)
        report.add_field(reporting.Fields.DUPLICATION_RATIO,
                         '%.3f' % duplication_ratio)
        report.add_field(
            reporting.Fields.SUBSERROR,
            "%.2f" % (float(SNPs) * 100000.0 / float(aligned_assembly_bases)))
        report.add_field(
            reporting.Fields.INDELSERROR,
            "%.2f" % (float(report.get_field(reporting.Fields.INDELS)) *
                      100000.0 / float(aligned_assembly_bases)))

    # for misassemblies report:
    report.add_field(
        reporting.Fields.MIS_ALL_EXTENSIVE,
        region_misassemblies.count(Misassembly.RELOCATION) +
        region_misassemblies.count(Misassembly.INVERSION) +
        region_misassemblies.count(Misassembly.TRANSLOCATION) +
        region_misassemblies.count(Misassembly.INTERSPECTRANSLOCATION))
    report.add_field(reporting.Fields.MIS_RELOCATION,
                     region_misassemblies.count(Misassembly.RELOCATION))
    report.add_field(reporting.Fields.MIS_TRANSLOCATION,
                     region_misassemblies.count(Misassembly.TRANSLOCATION))
    report.add_field(reporting.Fields.MIS_INVERTION,
                     region_misassemblies.count(Misassembly.INVERSION))
    report.add_field(reporting.Fields.MIS_EXTENSIVE_CONTIGS,
                     len(misassembled_contigs))
    report.add_field(reporting.Fields.MIS_EXTENSIVE_BASES, misassembled_bases)
    report.add_field(reporting.Fields.MIS_LOCAL,
                     region_misassemblies.count(Misassembly.LOCAL))
    # special case for separating contig and scaffold misassemblies
    report.add_field(
        reporting.Fields.SCF_MIS_ALL_EXTENSIVE,
        region_misassemblies.count(Misassembly.SCF_RELOCATION) +
        region_misassemblies.count(Misassembly.SCF_INVERSION) +
        region_misassemblies.count(Misassembly.SCF_TRANSLOCATION) +
        region_misassemblies.count(Misassembly.SCF_INTERSPECTRANSLOCATION))
    report.add_field(reporting.Fields.SCF_MIS_RELOCATION,
                     region_misassemblies.count(Misassembly.SCF_RELOCATION))
    report.add_field(reporting.Fields.SCF_MIS_TRANSLOCATION,
                     region_misassemblies.count(Misassembly.SCF_TRANSLOCATION))
    report.add_field(reporting.Fields.SCF_MIS_INVERTION,
                     region_misassemblies.count(Misassembly.SCF_INVERSION))
    report.add_field(
        reporting.Fields.CTG_MIS_ALL_EXTENSIVE,
        report.get_field(reporting.Fields.MIS_ALL_EXTENSIVE) -
        report.get_field(reporting.Fields.SCF_MIS_ALL_EXTENSIVE))
    report.add_field(
        reporting.Fields.CTG_MIS_RELOCATION,
        region_misassemblies.count(Misassembly.RELOCATION) -
        region_misassemblies.count(Misassembly.SCF_RELOCATION))
    report.add_field(
        reporting.Fields.CTG_MIS_TRANSLOCATION,
        region_misassemblies.count(Misassembly.TRANSLOCATION) -
        region_misassemblies.count(Misassembly.SCF_TRANSLOCATION))
    report.add_field(
        reporting.Fields.CTG_MIS_INVERTION,
        region_misassemblies.count(Misassembly.INVERSION) -
        region_misassemblies.count(Misassembly.SCF_INVERSION))

    if qconfig.is_combined_ref:
        report.add_field(
            reporting.Fields.MIS_ISTRANSLOCATIONS,
            region_misassemblies.count(Misassembly.INTERSPECTRANSLOCATION))
        report.add_field(
            reporting.Fields.SCF_MIS_ISTRANSLOCATIONS,
            region_misassemblies.count(Misassembly.SCF_INTERSPECTRANSLOCATION))
        report.add_field(
            reporting.Fields.CTG_MIS_ISTRANSLOCATIONS,
            region_misassemblies.count(Misassembly.INTERSPECTRANSLOCATION) -
            region_misassemblies.count(Misassembly.SCF_INTERSPECTRANSLOCATION))
        report.add_field(
            reporting.Fields.CONTIGS_WITH_ISTRANSLOCATIONS,
            region_misassemblies.count(Misassembly.POTENTIALLY_MIS_CONTIGS))
        report.add_field(
            reporting.Fields.POSSIBLE_MISASSEMBLIES,
            region_misassemblies.count(Misassembly.POSSIBLE_MISASSEMBLIES))
        all_references = sorted(
            list(set([ref for ref in ref_labels_by_chromosomes.values()])))
        for ref_name in all_references:
            subreport = reporting.get(fname, ref_name=ref_name)
            ref_misassemblies = misassemblies_by_ref[ref_name]
            subreport.add_field(
                reporting.Fields.MIS_ALL_EXTENSIVE,
                ref_misassemblies.count(Misassembly.RELOCATION) +
                ref_misassemblies.count(Misassembly.INVERSION) +
                ref_misassemblies.count(Misassembly.TRANSLOCATION) +
                ref_misassemblies.count(Misassembly.INTERSPECTRANSLOCATION))
            subreport.add_field(
                reporting.Fields.MIS_RELOCATION,
                ref_misassemblies.count(Misassembly.RELOCATION))
            subreport.add_field(
                reporting.Fields.MIS_TRANSLOCATION,
                ref_misassemblies.count(Misassembly.TRANSLOCATION))
            subreport.add_field(reporting.Fields.MIS_INVERTION,
                                ref_misassemblies.count(Misassembly.INVERSION))
            subreport.add_field(
                reporting.Fields.MIS_ISTRANSLOCATIONS,
                ref_misassemblies.count(Misassembly.INTERSPECTRANSLOCATION))
            subreport.add_field(reporting.Fields.MIS_LOCAL,
                                ref_misassemblies.count(Misassembly.LOCAL))
            subreport.add_field(
                reporting.Fields.POSSIBLE_MISASSEMBLIES,
                ref_misassemblies.count(Misassembly.POSSIBLE_MISASSEMBLIES))
            subreport.add_field(
                reporting.Fields.CONTIGS_WITH_ISTRANSLOCATIONS,
                ref_misassemblies.count(Misassembly.POTENTIALLY_MIS_CONTIGS))
            if fname not in qconfig.dict_of_broken_scaffolds:
                subreport.add_field(
                    reporting.Fields.MIS_SCAFFOLDS_GAP,
                    ref_misassemblies.count(Misassembly.SCAFFOLD_GAP))
                subreport.add_field(
                    reporting.Fields.MIS_LOCAL_SCAFFOLDS_GAP,
                    ref_misassemblies.count(Misassembly.LOCAL_SCAFFOLD_GAP))
            if qconfig.check_for_fragmented_ref:
                subreport.add_field(
                    reporting.Fields.MIS_FRAGMENTED,
                    ref_misassemblies.count(Misassembly.FRAGMENTED))
    elif intergenomic_misassemblies_by_asm:
        label = qutils.label_from_fpath(fname)
        ref_name = qutils.name_from_fpath(ref_fpath)
        ref_misassemblies = intergenomic_misassemblies_by_asm[label][ref_name]
        report.add_field(
            reporting.Fields.MIS_ISTRANSLOCATIONS,
            ref_misassemblies.count(Misassembly.INTERSPECTRANSLOCATION))
        report.add_field(
            reporting.Fields.POSSIBLE_MISASSEMBLIES,
            ref_misassemblies.count(Misassembly.POSSIBLE_MISASSEMBLIES))
        report.add_field(
            reporting.Fields.CONTIGS_WITH_ISTRANSLOCATIONS,
            ref_misassemblies.count(Misassembly.POTENTIALLY_MIS_CONTIGS))
    if fname not in qconfig.dict_of_broken_scaffolds:
        report.add_field(reporting.Fields.MIS_SCAFFOLDS_GAP,
                         region_misassemblies.count(Misassembly.SCAFFOLD_GAP))
        report.add_field(
            reporting.Fields.MIS_LOCAL_SCAFFOLDS_GAP,
            region_misassemblies.count(Misassembly.LOCAL_SCAFFOLD_GAP))
    if qconfig.check_for_fragmented_ref:
        report.add_field(reporting.Fields.MIS_FRAGMENTED,
                         region_misassemblies.count(Misassembly.FRAGMENTED))
    # for unaligned report:
    report.add_field(reporting.Fields.UNALIGNED_FULL_CNTGS, unaligned)
    report.add_field(reporting.Fields.UNALIGNED_FULL_LENGTH,
                     fully_unaligned_bases)
    report.add_field(reporting.Fields.UNALIGNED_PART_CNTGS,
                     partially_unaligned)
    report.add_field(reporting.Fields.UNALIGNED_PART_LENGTH,
                     partially_unaligned_bases)
    report.add_field(reporting.Fields.UNALIGNED_MISASSEMBLED_CTGS,
                     half_unaligned_with_misassembly)
    return report
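The genome fraction and duplication ratio reported above (Fields.MAPPEDGENOME and Fields.DUPLICATION_RATIO) are simple ratios over aligned bases. A minimal standalone sketch of the two formulas as used in this function (hypothetical helpers, not QUAST's reporting API):

# Standalone sketch (not QUAST code): genome fraction and duplication ratio.
def genome_fraction(aligned_ref_bases, genome_size):
    # Share of the reference covered by at least one alignment, in percent.
    return aligned_ref_bases * 100.0 / genome_size

def duplication_ratio(aligned_assembly_bases, misassembly_internal_overlap,
                      ambiguous_extra_bases, aligned_ref_bases):
    # Total aligned assembly bases (including internal overlaps and extra copies
    # from ambiguous contigs) per aligned reference base; 1.0 means no redundancy.
    return float(aligned_assembly_bases + misassembly_internal_overlap +
                 ambiguous_extra_bases) / aligned_ref_bases

if __name__ == '__main__':
    print('%.3f' % genome_fraction(4500000, 4600000))                   # 97.826
    print('%.3f' % duplication_ratio(4700000, 20000, 30000, 4500000))   # 1.056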
Exemplo n.º 20
0
def do(ref_fpath, contigs_fpaths, output_dirpath, results_dir):
    logger.print_timestamp()
    logger.main_info("Running Basic statistics processor...")
    
    if not os.path.isdir(output_dirpath):
        os.mkdir(output_dirpath)

    reference_length = None
    reference_lengths = []
    reference_fragments = None
    icarus_gc_fpath = None
    circos_gc_fpath = None
    if ref_fpath:
        reference_lengths = sorted(fastaparser.get_chr_lengths_from_fastafile(ref_fpath).values(), reverse=True)
        reference_fragments = len(reference_lengths)
        reference_length = sum(reference_lengths)
        reference_GC, reference_GC_distribution, reference_GC_contigs_distribution = GC_content(ref_fpath)
        if qconfig.create_icarus_html or qconfig.draw_plots:
            icarus_gc_fpath = join(output_dirpath, 'gc.icarus.txt')
            save_icarus_GC(ref_fpath, icarus_gc_fpath)
        if qconfig.draw_plots:
            circos_gc_fpath = join(output_dirpath, 'gc.circos.txt')
            save_circos_GC(ref_fpath, reference_length, circos_gc_fpath)

        logger.info('  Reference genome:')
        logger.info('    ' + os.path.basename(ref_fpath) + ', length = ' + str(reference_length) +
                    ', num fragments = ' + str(reference_fragments) + ', GC % = ' +
                    ('%.2f' % reference_GC if reference_GC is not None else 'undefined'))
        if reference_fragments > 30 and not qconfig.check_for_fragmented_ref:
            logger.warning('  Reference genome is fragmented. You may consider rerunning QUAST using --fragmented option.'
                           ' QUAST will try to detect misassemblies caused by the fragmentation and mark them fake (will be excluded from # misassemblies).')
    elif qconfig.estimated_reference_size:
        reference_length = qconfig.estimated_reference_size
        reference_lengths = [reference_length]
        logger.info('  Estimated reference length = ' + str(reference_length))

    logger.info('  Contig files: ')
    lists_of_lengths = []
    numbers_of_Ns = []
    coverage_dict = dict()
    cov_pattern = re.compile(r'_cov_(\d+\.?\d*)')
    for id, contigs_fpath in enumerate(contigs_fpaths):
        coverage_dict[contigs_fpath] = []
        assembly_label = qutils.label_from_fpath(contigs_fpath)

        logger.info('    ' + qutils.index_to_str(id) + assembly_label)
        # lists_of_lengths.append(fastaparser.get_lengths_from_fastafile(contigs_fpath))
        list_of_length = []
        number_of_Ns = 0
        is_potential_scaffold = False
        for (name, seq) in fastaparser.read_fasta(contigs_fpath):
            list_of_length.append(len(seq))
            number_of_Ns += seq.count('N')
            if not qconfig.scaffolds and not is_potential_scaffold and qutils.is_scaffold(seq):
                is_potential_scaffold = True
                qconfig.potential_scaffolds_assemblies.append(assembly_label)
            if cov_pattern.findall(name):
                cov = int(float(cov_pattern.findall(name)[0]))
                if len(coverage_dict[contigs_fpath]) <= cov:
                    coverage_dict[contigs_fpath] += [0] * (cov - len(coverage_dict[contigs_fpath]) + 1)
                coverage_dict[contigs_fpath][cov] += len(seq)

        lists_of_lengths.append(list_of_length)
        numbers_of_Ns.append(number_of_Ns)

    lists_of_lengths = [sorted(list, reverse=True) for list in lists_of_lengths]
    num_contigs = max([len(list_of_length) for list_of_length in lists_of_lengths])
    multiplicator = 1
    if num_contigs >= (qconfig.max_points * 2):
        import math
        multiplicator = int(num_contigs / qconfig.max_points)
        max_points = num_contigs // multiplicator
        corr_lists_of_lengths = [[sum(list_of_length[((i - 1) * multiplicator):(i * multiplicator)]) for i in range(1, max_points)
                                  if (i * multiplicator) < len(list_of_length)] for list_of_length in lists_of_lengths]
        if len(reference_lengths) > 1:
            reference_lengths = [sum(reference_lengths[((i - 1) * multiplicator):(i * multiplicator)])
                                 if (i * multiplicator) < len(reference_lengths) else
                                 sum(reference_lengths[((i - 1) * multiplicator):])
                                 for i in range(1, max_points)] + [sum(reference_lengths[(max_points - 1) * multiplicator:])]
        for num_list in range(len(corr_lists_of_lengths)):
            last_index = len(corr_lists_of_lengths[num_list])
            corr_lists_of_lengths[num_list].append(sum(lists_of_lengths[num_list][last_index * multiplicator:]))
    else:
        corr_lists_of_lengths = [sorted(list, reverse=True) for list in lists_of_lengths]

    if reference_lengths:
        # Saving for an HTML report
        if qconfig.html_report:
            from quast_libs.html_saver import html_saver
            html_saver.save_reference_lengths(results_dir, reference_lengths)

    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        html_saver.save_contigs_lengths(results_dir, contigs_fpaths, corr_lists_of_lengths)
        html_saver.save_tick_x(results_dir, multiplicator)

    ########################################################################

    logger.info('  Calculating N50 and L50...')

    list_of_GC_distributions = []
    list_of_GC_contigs_distributions = []
    largest_contig = 0
    from . import N50
    for id, (contigs_fpath, lengths_list, number_of_Ns) in enumerate(zip(contigs_fpaths, lists_of_lengths, numbers_of_Ns)):
        report = reporting.get(contigs_fpath)
        n50, l50 = N50.N50_and_L50(lengths_list)
        ng50, lg50 = None, None
        if reference_length:
            ng50, lg50 = N50.NG50_and_LG50(lengths_list, reference_length)
        n75, l75 = N50.N50_and_L50(lengths_list, 75)
        ng75, lg75 = None, None
        if reference_length:
            ng75, lg75 = N50.NG50_and_LG50(lengths_list, reference_length, 75)
        total_length = sum(lengths_list)
        total_GC, GC_distribution, GC_contigs_distribution = GC_content(contigs_fpath, skip=qconfig.no_gc)
        list_of_GC_distributions.append(GC_distribution)
        list_of_GC_contigs_distributions.append(GC_contigs_distribution)
        logger.info('    ' + qutils.index_to_str(id) +
                    qutils.label_from_fpath(contigs_fpath) +
                    ', N50 = ' + str(n50) +
                    ', L50 = ' + str(l50) +
                    ', Total length = ' + str(total_length) +
                    ', GC % = ' + ('%.2f' % total_GC if total_GC is not None else 'undefined') +
                    ', # N\'s per 100 kbp = ' +
                    (' %.2f' % (float(number_of_Ns) * 100000.0 / float(total_length)) if total_length != 0 else 'undefined'))
        
        report.add_field(reporting.Fields.N50, n50)
        report.add_field(reporting.Fields.L50, l50)
        if reference_length and not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NG50, ng50)
            report.add_field(reporting.Fields.LG50, lg50)
        report.add_field(reporting.Fields.N75, n75)
        report.add_field(reporting.Fields.L75, l75)
        if reference_length and not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NG75, ng75)
            report.add_field(reporting.Fields.LG75, lg75)
        report.add_field(reporting.Fields.CONTIGS, len(lengths_list))
        if lengths_list:
            report.add_field(reporting.Fields.LARGCONTIG, max(lengths_list))
            largest_contig = max(largest_contig, max(lengths_list))
            report.add_field(reporting.Fields.TOTALLEN, total_length)
            if not qconfig.is_combined_ref:
                report.add_field(reporting.Fields.GC, ('%.2f' % total_GC if total_GC is not None else None))
            report.add_field(reporting.Fields.UNCALLED, number_of_Ns)
            report.add_field(reporting.Fields.UNCALLED_PERCENT, ('%.2f' % (float(number_of_Ns) * 100000.0 / float(total_length))))
        if ref_fpath:
            report.add_field(reporting.Fields.REFLEN, int(reference_length))
            report.add_field(reporting.Fields.REF_FRAGMENTS, reference_fragments)
            if not qconfig.is_combined_ref:
                report.add_field(reporting.Fields.REFGC, ('%.2f' % reference_GC if reference_GC is not None else None))
        elif reference_length:
            report.add_field(reporting.Fields.ESTREFLEN, int(reference_length))

    import math
    qconfig.min_difference = math.ceil((largest_contig / 1000) / 600)  # divide by the plot height

    list_of_GC_distributions_with_ref = list_of_GC_distributions
    reference_index = None
    if ref_fpath:
        reference_index = len(list_of_GC_distributions_with_ref)
        list_of_GC_distributions_with_ref.append(reference_GC_distribution)

    if qconfig.html_report and not qconfig.no_gc:
        from quast_libs.html_saver import html_saver
        html_saver.save_GC_info(results_dir, contigs_fpaths, list_of_GC_distributions_with_ref, list_of_GC_contigs_distributions, reference_index)

    ########################################################################
    # Drawing Nx and NGx plots...
    plotter.Nx_plot(results_dir, num_contigs > qconfig.max_points, contigs_fpaths, lists_of_lengths, join(output_dirpath, 'Nx_plot'), 'Nx', [])
    if reference_length and not qconfig.is_combined_ref:
        plotter.Nx_plot(results_dir, num_contigs > qconfig.max_points, contigs_fpaths, lists_of_lengths, join(output_dirpath, 'NGx_plot'), 'NGx',
                        [reference_length for i in range(len(contigs_fpaths))])

    if qconfig.draw_plots:
        ########################################################################
        # Drawing cumulative plot...
        plotter.cumulative_plot(ref_fpath, contigs_fpaths, lists_of_lengths, join(output_dirpath, 'cumulative_plot'), 'Cumulative length')
        if not qconfig.no_gc:
            ########################################################################
            # Drawing GC content plot...
            plotter.GC_content_plot(ref_fpath, contigs_fpaths, list_of_GC_distributions_with_ref, join(output_dirpath, 'GC_content_plot'))
            for contigs_fpath, GC_distribution in zip(contigs_fpaths, list_of_GC_contigs_distributions):
                plotter.contigs_GC_content_plot(contigs_fpath, GC_distribution,
                                                join(output_dirpath, qutils.label_from_fpath(contigs_fpath) + '_GC_content_plot'))

        if any(coverage_dict[contigs_fpath] for contigs_fpath in contigs_fpaths):
            draw_coverage_histograms(coverage_dict, contigs_fpaths, output_dirpath)

    logger.main_info('Done.')
    return icarus_gc_fpath, circos_gc_fpath
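The per-assembly loop above delegates N50/L50 and NG50/LG50 to QUAST's N50 module. For reference, a minimal self-contained sketch of the underlying definitions, written here for illustration only:

# Standalone sketch (not QUAST code): Nx/Lx and NGx/LGx from a list of contig lengths.
def nx_and_lx(lengths, x=50, reference_length=None):
    # Nx: length of the contig at which the cumulative sum of contigs (sorted longest
    # first) reaches x% of the total assembly length (or of the reference length for NGx).
    # Lx is the number of contigs needed to reach that point.
    target = (reference_length if reference_length else sum(lengths)) * x / 100.0
    cumulative = 0
    for count, length in enumerate(sorted(lengths, reverse=True), start=1):
        cumulative += length
        if cumulative >= target:
            return length, count
    return None, None  # assembly too short to reach x% of the reference

if __name__ == '__main__':
    lengths = [80, 70, 50, 40, 30, 20]
    print(nx_and_lx(lengths))                          # N50 = 70, L50 = 2
    print(nx_and_lx(lengths, reference_length=400))    # NG50 = 50, LG50 = 3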
Exemplo n.º 21
0
def add_statistics_to_report(output_dir, contigs_fpaths, ref_fpath):
    from quast_libs import reporting

    ref_reads_stats = None
    ref_lap_score = None
    if ref_fpath:
        ref_name = qutils.name_from_fpath(ref_fpath)
        stats_fpath = join(output_dir, ref_name + '.stat')
        if isfile(stats_fpath):
            ref_reads_stats = parse_reads_stats(stats_fpath)
            if int(ref_reads_stats['mapped']) == 0:
                logger.info('  BWA: nothing aligned for reference.')
        lap_out_fpath = get_safe_fpath(output_dir, ref_name + '.lap.out')
        if is_non_empty_file(lap_out_fpath):
            with open(lap_out_fpath) as f:
                l = f.readline()
                ref_lap_score = float(l.split()[0]) if l else None

    # process all contigs files
    for index, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        assembly_name = qutils.name_from_fpath(contigs_fpath)
        assembly_label = qutils.label_from_fpath(contigs_fpath)
        stats_fpath = join(output_dir, assembly_name + '.stat')
        if ref_reads_stats:
            report.add_field(reporting.Fields.REF_MAPPED_READS, ref_reads_stats['mapped'])
            report.add_field(reporting.Fields.REF_MAPPED_READS_PCNT, ref_reads_stats['mapped_pcnt'])
            report.add_field(reporting.Fields.REF_PROPERLY_PAIRED_READS, ref_reads_stats['paired'])
            report.add_field(reporting.Fields.REF_PROPERLY_PAIRED_READS_PCNT, ref_reads_stats['paired_pcnt'])
            report.add_field(reporting.Fields.REF_SINGLETONS, ref_reads_stats['singletons'])
            report.add_field(reporting.Fields.REF_SINGLETONS_PCNT, ref_reads_stats['singletons_pcnt'])
            report.add_field(reporting.Fields.REF_MISJOINT_READS, ref_reads_stats['misjoint'])
            report.add_field(reporting.Fields.REF_MISJOINT_READS_PCNT, ref_reads_stats['misjoint_pcnt'])
            report.add_field(reporting.Fields.REF_DEPTH, ref_reads_stats['depth'])
            if ref_reads_stats['coverage_thresholds'] and len(ref_reads_stats['coverage_thresholds']) == len(qconfig.coverage_thresholds):
                report.add_field(reporting.Fields.REF_COVERAGE__FOR_THRESHOLDS,
                                [ref_reads_stats['coverage_thresholds'][i] for i, threshold in enumerate(qconfig.coverage_thresholds)])
                report.add_field(reporting.Fields.REF_COVERAGE_1X_THRESHOLD, ref_reads_stats['coverage_thresholds'][0])
        if not isfile(stats_fpath):
            continue
        reads_stats = parse_reads_stats(stats_fpath)
        report.add_field(reporting.Fields.TOTAL_READS, reads_stats['total'])
        report.add_field(reporting.Fields.LEFT_READS, reads_stats['left'])
        report.add_field(reporting.Fields.RIGHT_READS, reads_stats['right'])
        report.add_field(reporting.Fields.MAPPED_READS, reads_stats['mapped'])
        report.add_field(reporting.Fields.MAPPED_READS_PCNT, reads_stats['mapped_pcnt'])
        report.add_field(reporting.Fields.PROPERLY_PAIRED_READS, reads_stats['paired'])
        report.add_field(reporting.Fields.PROPERLY_PAIRED_READS_PCNT, reads_stats['paired_pcnt'])
        if int(reads_stats['mapped']) == 0:
            logger.info('  ' + qutils.index_to_str(index) + 'BWA: nothing aligned for ' + '\'' + assembly_label + '\'.')
        report.add_field(reporting.Fields.SINGLETONS, reads_stats['singletons'])
        report.add_field(reporting.Fields.SINGLETONS_PCNT, reads_stats['singletons_pcnt'])
        report.add_field(reporting.Fields.MISJOINT_READS, reads_stats['misjoint'])
        report.add_field(reporting.Fields.MISJOINT_READS_PCNT, reads_stats['misjoint_pcnt'])
        report.add_field(reporting.Fields.DEPTH, reads_stats['depth'])
        if reads_stats['coverage_thresholds'] and len(reads_stats['coverage_thresholds']) == len(qconfig.coverage_thresholds):
            report.add_field(reporting.Fields.COVERAGE__FOR_THRESHOLDS,
                            [reads_stats['coverage_thresholds'][i] for i, threshold in enumerate(qconfig.coverage_thresholds)])
            report.add_field(reporting.Fields.COVERAGE_1X_THRESHOLD, reads_stats['coverage_thresholds'][0])

        lap_out_fpath = get_safe_fpath(output_dir, assembly_name + '.lap.out')
        if is_non_empty_file(lap_out_fpath):
            with open(lap_out_fpath) as f:
                l = f.readline()
                lap_score = float(l.split()[0]) if l else None
            report.add_field(reporting.Fields.LAP_SCORE, ('%.3f' % lap_score if lap_score is not None else None))
        report.add_field(reporting.Fields.REF_LAP_SCORE, ('%.3f' % ref_lap_score if ref_lap_score is not None else None))
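
Note on the coverage-threshold fields above: for each threshold in qconfig.coverage_thresholds they report the percentage of positions covered at at least that depth. A minimal illustrative sketch of that computation from a per-base depth array (assumed inputs; the real values are parsed from the read-alignment stats file, not recomputed like this):

def coverage_fractions(depths, thresholds=(1, 5, 10)):
    # 'depths' is a per-base coverage list; 'thresholds' mirrors qconfig.coverage_thresholds.
    total = len(depths)
    if total == 0:
        return [0.0 for _ in thresholds]
    return [100.0 * sum(1 for d in depths if d >= t) / total for t in thresholds]

# Example: coverage_fractions([0, 3, 12, 7, 1]) -> [80.0, 40.0, 20.0]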
Example No. 22
def do(reference, contigs_fpaths, is_cyclic, output_dir, old_contigs_fpaths, bed_fpath=None):
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    logger.print_timestamp()
    logger.main_info('Running Contig analyzer...')
    success_compilation = compile_aligner(logger)
    if not success_compilation:
        logger.main_info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.')
        return dict(zip(contigs_fpaths, [AlignerStatus.FAILED] * len(contigs_fpaths))), None

    num_nf_errors = logger._num_nf_errors
    create_minimap_output_dir(output_dir)
    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    threads = max(1, qconfig.max_threads // n_jobs)

    genome_size, reference_chromosomes, ns_by_chromosomes = get_genome_stats(reference, skip_ns=True)
    threads = qconfig.max_threads if qconfig.memory_efficient else threads
    args = [(is_cyclic, i, contigs_fpath, output_dir, reference, reference_chromosomes, ns_by_chromosomes,
            old_contigs_fpath, bed_fpath, threads)
            for i, (contigs_fpath, old_contigs_fpath) in enumerate(zip(contigs_fpaths, old_contigs_fpaths))]
    statuses, results, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs = run_parallel(align_and_analyze, args, n_jobs)
    reports = []

    aligner_statuses = dict(zip(contigs_fpaths, statuses))
    aligned_lengths_per_fpath = dict(zip(contigs_fpaths, aligned_lengths))
    misc.contigs_aligned_lengths = dict(zip(contigs_fpaths, aligned_lengths_by_contigs))

    if AlignerStatus.OK in aligner_statuses.values():
        if qconfig.is_combined_ref:
            save_combined_ref_stats(results, contigs_fpaths, ref_labels_by_chromosomes, output_dir, logger)

    for index, fname in enumerate(contigs_fpaths):
        report = reporting.get(fname)
        if statuses[index] == AlignerStatus.OK:
            reports.append(save_result(results[index], report, fname, reference, genome_size))
        elif statuses[index] == AlignerStatus.NOT_ALIGNED:
            save_result_for_unaligned(results[index], report)

    if AlignerStatus.OK in aligner_statuses.values():
        reporting.save_misassemblies(output_dir)
        reporting.save_unaligned(output_dir)
        from . import plotter
        if qconfig.draw_plots:
            plotter.draw_misassemblies_plot(reports, join(output_dir, 'misassemblies_plot'), 'Misassemblies')
        if qconfig.draw_plots or qconfig.html_report:
            misassemblies_in_contigs = dict((contigs_fpaths[i], misassemblies_in_contigs[i]) for i in range(len(contigs_fpaths)))
            plotter.frc_plot(dirname(output_dir), reference, contigs_fpaths, misc.contigs_aligned_lengths, misassemblies_in_contigs,
                             join(output_dir, 'misassemblies_frcurve_plot'), 'misassemblies')

    oks = list(aligner_statuses.values()).count(AlignerStatus.OK)
    not_aligned = list(aligner_statuses.values()).count(AlignerStatus.NOT_ALIGNED)
    failed = list(aligner_statuses.values()).count(AlignerStatus.FAILED)
    errors = list(aligner_statuses.values()).count(AlignerStatus.ERROR)
    problems = not_aligned + failed + errors
    all = len(aligner_statuses)

    logger._num_nf_errors = num_nf_errors + errors

    if oks == all:
        logger.main_info('Done.')
    if oks < all and problems < all:
        logger.main_info('Done for ' + str(all - problems) + ' out of ' + str(all) + '. For the rest, only basic stats are going to be evaluated.')
    if problems == all:
        logger.main_info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.')

    return aligner_statuses, aligned_lengths_per_fpath
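
A quick worked illustration of the n_jobs/threads split used above, which divides the available threads evenly across assemblies aligned in parallel (assumed values, not taken from a real run):

max_threads = 8                                  # e.g. qconfig.max_threads
contigs_count = 3                                # number of assemblies
n_jobs = min(contigs_count, max_threads)         # 3 alignment jobs run in parallel
threads_per_job = max(1, max_threads // n_jobs)  # 2 threads per job (2 threads stay idle)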
Example No. 23
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
       aligned_lengths_lists, aligned_stats_dirpath):

    if not os.path.isdir(aligned_stats_dirpath):
        os.mkdir(aligned_stats_dirpath)

    ########################################################################
    report_dict = {'header': []}
    for contigs_fpath in aligned_contigs_fpaths:
        report_dict[qutils.name_from_fpath(contigs_fpath)] = []

    ########################################################################
    logger.print_timestamp()
    logger.main_info('Running NA-NGA calculation...')

    ref_chr_lengths = fastaparser.get_chr_lengths_from_fastafile(ref_fpath)
    reference_length = sum(ref_chr_lengths.values())
    assembly_lengths = []
    for contigs_fpath in aligned_contigs_fpaths:
        assembly_lengths.append(sum(fastaparser.get_chr_lengths_from_fastafile(contigs_fpath).values()))

    for i, (contigs_fpath, lens, assembly_len) in enumerate(
            zip(aligned_contigs_fpaths, aligned_lengths_lists, assembly_lengths)):
        na50 = N50.NG50(lens, assembly_len)
        na75 = N50.NG50(lens, assembly_len, 75)
        la50 = N50.LG50(lens, assembly_len)
        la75 = N50.LG50(lens, assembly_len, 75)
        if not qconfig.is_combined_ref:
            nga50 = N50.NG50(lens, reference_length)
            nga75 = N50.NG50(lens, reference_length, 75)
            lga50 = N50.LG50(lens, reference_length)
            lga75 = N50.LG50(lens, reference_length, 75)

        logger.info('  ' +
                    qutils.index_to_str(i) +
                    qutils.label_from_fpath(contigs_fpath) +
                 ', Largest alignment = ' + str(max(lens)) +
                 ', NA50 = ' + str(na50) +
                 (', NGA50 = ' + str(nga50) if not qconfig.is_combined_ref and nga50 else '') +
                 ', LA50 = ' + str(la50) +
                 (', LGA50 = ' + str(lga50) if not qconfig.is_combined_ref and lga50 else ''))
        report = reporting.get(contigs_fpath)
        report.add_field(reporting.Fields.LARGALIGN, max(lens))
        report.add_field(reporting.Fields.TOTAL_ALIGNED_LEN, sum(lens))
        report.add_field(reporting.Fields.NA50, na50)
        report.add_field(reporting.Fields.NA75, na75)
        report.add_field(reporting.Fields.LA50, la50)
        report.add_field(reporting.Fields.LA75, la75)
        if not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NGA50, nga50)
            report.add_field(reporting.Fields.NGA75, nga75)
            report.add_field(reporting.Fields.LGA50, lga50)
            report.add_field(reporting.Fields.LGA75, lga75)

    ########################################################################
    num_contigs = max([len(aligned_lengths_lists[i]) for i in range(len(aligned_lengths_lists))])

    if json_output_dirpath:
        from quast_libs.html_saver import json_saver
        json_saver.save_assembly_lengths(json_output_dirpath, aligned_contigs_fpaths, assembly_lengths)

    # saving to html
    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        html_saver.save_assembly_lengths(output_dirpath, aligned_contigs_fpaths, assembly_lengths)

    if qconfig.draw_plots:
        # Drawing cumulative plot (aligned contigs)...
        plotter.cumulative_plot(ref_fpath, aligned_contigs_fpaths, aligned_lengths_lists,
                                os.path.join(aligned_stats_dirpath, 'cumulative_plot'),
                                'Cumulative length (aligned contigs)')

    # Drawing NAx and NGAx plots...
    plotter.Nx_plot(output_dirpath, num_contigs > qconfig.max_points, aligned_contigs_fpaths, aligned_lengths_lists, aligned_stats_dirpath + '/NAx_plot', 'NAx',
                    assembly_lengths, json_output_dir=json_output_dirpath)
    if not qconfig.is_combined_ref:
        plotter.Nx_plot(output_dirpath, num_contigs > qconfig.max_points, aligned_contigs_fpaths, aligned_lengths_lists,
                        aligned_stats_dirpath + '/NGAx_plot', 'NGAx', [reference_length for i in range(len(aligned_contigs_fpaths))], json_output_dir=json_output_dirpath)

    logger.main_info('Done.')
    return report_dict
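
The NAx/NGAx values above come from quast_libs' N50 module. For readers unfamiliar with the metrics, here is a minimal illustrative reimplementation of an NG(x)/LG(x)-style statistic over a list of aligned block lengths (a sketch only, not the actual N50 module):

def ng_lg(lengths, reference_length, x=50):
    # NG(x): length of the block at which x% of the reference length is reached
    # when blocks are summed in decreasing order; LG(x): how many blocks that takes.
    target = reference_length * x / 100.0
    running = 0
    for i, length in enumerate(sorted(lengths, reverse=True), start=1):
        running += length
        if running >= target:
            return length, i
    return None, None  # the blocks never reach x% of the reference

# Example: ng_lg([40, 30, 20, 10], reference_length=120) -> (30, 2)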
Example No. 24
def do(output_dir, ref_fpath, contigs_fpaths, logger):
    logger.print_timestamp()
    logger.main_info('Running analysis based on unique ' + str(KMERS_LEN) + '-mers...')

    checked_assemblies = []
    for contigs_fpath in contigs_fpaths:
        label = qutils.label_from_fpath_for_fname(contigs_fpath)
        if check_kmc_successful_check(output_dir, contigs_fpath, contigs_fpaths, ref_fpath):
            kmc_stats_fpath = join(output_dir, label + '.stat')
            stats_content = open(kmc_stats_fpath).read().split('\n')
            if not stats_content or not stats_content[0].strip():
                continue
            logger.info('  Using existing results for ' + label + '... ')
            report = reporting.get(contigs_fpath)
            report.add_field(reporting.Fields.KMER_COMPLETENESS, '%.2f' % float(stats_content[0].strip().split(': ')[-1]))
            if len(stats_content) >= 5:
                len_map_to_one_chrom = int(stats_content[1].strip().split(': ')[-1])
                len_map_to_multi_chrom = int(stats_content[2].strip().split(': ')[-1])
                len_map_to_none_chrom = int(stats_content[3].strip().split(': ')[-1])
                total_len = int(stats_content[4].strip().split(': ')[-1])
                report.add_field(reporting.Fields.KMER_SCAFFOLDS_ONE_CHROM, '%.2f' % (len_map_to_one_chrom * 100.0 / total_len))
                report.add_field(reporting.Fields.KMER_SCAFFOLDS_MULTI_CHROM, '%.2f' % (len_map_to_multi_chrom * 100.0 / total_len))
                report.add_field(reporting.Fields.KMER_SCAFFOLDS_NONE_CHROM, '%.2f' % (len_map_to_none_chrom * 100.0 / total_len))
            checked_assemblies.append(contigs_fpath)

    contigs_fpaths = [fpath for fpath in contigs_fpaths if fpath not in checked_assemblies]
    if len(contigs_fpaths) == 0:
        logger.info('Done.')
        return

    if qconfig.platform_name == 'linux_32':
        logger.warning('  Sorry, can\'t run KMC on this platform, skipping...')
        return None

    kmc_dirpath = get_dir_for_download(kmc_dirname, 'KMC', ['kmc', 'kmc_tools'], logger)
    global kmc_bin_fpath
    global kmc_tools_fpath
    kmc_bin_fpath = download_external_tool('kmc', kmc_dirpath, 'KMC', platform_specific=True, is_executable=True)
    kmc_tools_fpath = download_external_tool('kmc_tools', kmc_dirpath, 'KMC', platform_specific=True, is_executable=True)
    if not exists(kmc_bin_fpath) or not exists(kmc_tools_fpath) or not compile_minimap(logger):
        logger.warning('  Sorry, can\'t run KMC, skipping...')
        return None

    logger.info('Running KMC on reference...')
    log_fpath = join(output_dir, 'kmc.log')
    err_fpath = join(output_dir, 'kmc.err')
    open(log_fpath, 'w').close()
    open(err_fpath, 'w').close()

    tmp_dirpath = join(output_dir, 'tmp')
    if not isdir(tmp_dirpath):
        os.makedirs(tmp_dirpath)
    ref_kmc_out_fpath = count_kmers(tmp_dirpath, ref_fpath, log_fpath, err_fpath)
    unique_kmers = get_kmers_cnt(tmp_dirpath, ref_kmc_out_fpath, log_fpath, err_fpath)
    if not unique_kmers:
        return

    logger.info('Analyzing assemblies completeness...')
    kmc_out_fpaths = []
    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        kmc_out_fpath = count_kmers(tmp_dirpath, contigs_fpath, log_fpath, err_fpath)
        intersect_out_fpath = intersect_kmers(tmp_dirpath, [ref_kmc_out_fpath, kmc_out_fpath], log_fpath, err_fpath)
        matched_kmers = get_kmers_cnt(tmp_dirpath, intersect_out_fpath, log_fpath, err_fpath)
        completeness = matched_kmers * 100.0 / unique_kmers
        report.add_field(reporting.Fields.KMER_COMPLETENESS, '%.2f' % completeness)
        kmc_out_fpaths.append(intersect_out_fpath)

    logger.info('Analyzing assemblies accuracy...')
    if len(kmc_out_fpaths) > 1:
        shared_kmc_db = intersect_kmers(tmp_dirpath, kmc_out_fpaths, log_fpath, err_fpath)
    else:
        shared_kmc_db = kmc_out_fpaths[0]

    kmer_fraction = 0.001

    ref_contigs = [name for name, _ in read_fasta(ref_fpath)]
    ref_kmc_dbs = []

    if len(ref_contigs) <= MAX_REF_CONTIGS_NUM:
        shared_downsampled_kmc_db = downsample_kmers(tmp_dirpath, ref_fpath, shared_kmc_db, log_fpath, err_fpath, kmer_fraction=kmer_fraction)
        for name, seq in read_fasta(ref_fpath):
            seq_kmc_db = seq_to_kmc_db(tmp_dirpath, log_fpath, err_fpath, seq=seq, name=name, is_ref=True,
                                                     intersect_with=shared_downsampled_kmc_db)
            ref_kmc_dbs.append((name, seq_kmc_db))

    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        len_map_to_one_chrom = None
        len_map_to_multi_chrom = None
        len_map_to_none_chrom = None
        total_len = 0
        long_contigs = []
        contig_lens = dict()
        contig_markers = defaultdict(list)
        label = qutils.label_from_fpath_for_fname(contigs_fpath)
        list_files_fpath = join(tmp_dirpath, label + '_files.txt')
        with open(list_files_fpath, 'w') as list_files:
            for name, seq in read_fasta(contigs_fpath):
                total_len += len(seq)
                contig_lens[name] = len(seq)
                if len(seq) >= MIN_CONTIGS_LEN:
                    long_contigs.append(len(seq))
                    tmp_contig_fpath = join(tmp_dirpath, name + '.fasta')
                    with open(tmp_contig_fpath, 'w') as out_f:
                        out_f.write('>%s\n' % name)
                        out_f.write('%s\n' % seq)
                    list_files.write(tmp_contig_fpath + '\n')

        if len(long_contigs) > MAX_CONTIGS_NUM or sum(long_contigs) < total_len * 0.5:
            logger.warning('Assembly is too fragmented. Scaffolding accuracy will not be assessed.')
        elif len(ref_contigs) > MAX_REF_CONTIGS_NUM:
            logger.warning('Reference is too fragmented. Scaffolding accuracy will not be assessed.')
        else:
            len_map_to_one_chrom = 0
            len_map_to_multi_chrom = 0
            filtered_fpath = join(tmp_dirpath, label + '.filtered.fasta')
            filter_contigs(list_files_fpath, filtered_fpath, shared_kmc_db, log_fpath, err_fpath, min_kmers=MIN_MARKERS)
            filtered_list_files_fpath = join(tmp_dirpath, label + '_files.filtered.txt')
            with open(filtered_list_files_fpath, 'w') as list_files:
                for name, _ in read_fasta(filtered_fpath):
                    tmp_contig_fpath = join(tmp_dirpath, name + '.fasta')
                    list_files.write(tmp_contig_fpath + '\n')
            for ref_name, ref_kmc_db in ref_kmc_dbs:
                tmp_filtered_fpath = join(tmp_dirpath, ref_name + '.filtered.fasta')
                filter_contigs(filtered_list_files_fpath, tmp_filtered_fpath, ref_kmc_db, log_fpath, err_fpath, min_kmers=MIN_MISJOIN_MARKERS)
                if exists(tmp_filtered_fpath):
                    for name, _ in read_fasta(tmp_filtered_fpath):
                        contig_markers[name].append(ref_name)
            for name, chr_markers in contig_markers.items():
                if len(chr_markers) == 1:
                    len_map_to_one_chrom += contig_lens[name]
                else:
                    len_map_to_multi_chrom += contig_lens[name]
            len_map_to_none_chrom = total_len - len_map_to_one_chrom - len_map_to_multi_chrom
            report.add_field(reporting.Fields.KMER_SCAFFOLDS_ONE_CHROM, '%.2f' % (len_map_to_one_chrom * 100.0 / total_len))
            report.add_field(reporting.Fields.KMER_SCAFFOLDS_MULTI_CHROM, '%.2f' % (len_map_to_multi_chrom * 100.0 / total_len))
            report.add_field(reporting.Fields.KMER_SCAFFOLDS_NONE_CHROM, '%.2f' % (len_map_to_none_chrom * 100.0 / total_len))

        create_kmc_stats_file(output_dir, contigs_fpath, contigs_fpaths, ref_fpath,
                             report.get_field(reporting.Fields.KMER_COMPLETENESS),
                             len_map_to_one_chrom, len_map_to_multi_chrom, len_map_to_none_chrom, total_len)

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)
    logger.info('Done.')
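
K-mer completeness as reported above is the percentage of distinct reference k-mers that also occur in the assembly (matched_kmers * 100.0 / unique_kmers). A tiny set-based sketch of the same idea without KMC (illustrative only; the real pipeline streams k-mers through KMC databases):

def kmers(seq, k):
    # All distinct k-mers of a sequence (toy helper, no canonicalization).
    return {seq[i:i + k] for i in range(len(seq) - k + 1)}

def kmer_completeness(ref_seq, asm_seq, k=101):
    ref_kmers = kmers(ref_seq, k)
    if not ref_kmers:
        return 0.0
    return 100.0 * len(ref_kmers & kmers(asm_seq, k)) / len(ref_kmers)

# Toy example with k=4: kmer_completeness('ACGTACGT', 'ACGTAC', k=4) -> 75.0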
Example No. 25
def do(contigs_fpaths, output_dir, logger):
    logger.print_timestamp()
    logger.info('Running BUSCO...')

    compilation_success = True

    augustus_dirpath = download_augustus(logger)
    if not augustus_dirpath:
        compilation_success = False
    elif not compile_tool('Augustus',
                          augustus_dirpath, [join('bin', 'augustus')],
                          logger=logger):
        compilation_success = False

    if compilation_success and not download_blast_binaries(
            logger=logger, filenames=blast_filenames):
        compilation_success = False

    if not compilation_success:
        logger.info('Failed finding conservative genes.')
        return

    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    tmp_dir = join(output_dir, 'tmp')
    if not os.path.isdir(tmp_dir):
        os.makedirs(tmp_dir)

    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    busco_threads = max(1, qconfig.max_threads // n_jobs)

    clade_dirpath = download_db(logger,
                                is_prokaryote=qconfig.prokaryote,
                                is_fungus=qconfig.is_fungus)
    if not clade_dirpath:
        logger.info('Failed finding conservative genes.')
        return

    config_fpath = make_config(output_dir, tmp_dir, busco_threads,
                               clade_dirpath, augustus_dirpath)
    logger.info('Logs and results will be saved under ' + output_dir + '...')

    os.environ['BUSCO_CONFIG_FILE'] = config_fpath
    os.environ['AUGUSTUS_CONFIG_PATH'] = copy_augustus_contigs(
        augustus_dirpath, tmp_dir)
    if not os.environ['AUGUSTUS_CONFIG_PATH']:
        logger.error(
            'Augustus configs not found, failed to run BUSCO without them.')
    busco_args = [[
        contigs_fpath,
        qutils.label_from_fpath_for_fname(contigs_fpath)
    ] for contigs_fpath in contigs_fpaths]
    summary_fpaths = run_parallel(busco_main_handler, busco_args,
                                  qconfig.max_threads)
    if not any(fpath for fpath in summary_fpaths):
        logger.error(
            'Failed running BUSCO for all the assemblies. See log files in ' +
            output_dir + ' for information.')
        return

    # saving results
    for i, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)

        if summary_fpaths[i] and os.path.isfile(summary_fpaths[i]):
            total_buscos, part_buscos, complete_buscos = 0, 0, 0
            with open(summary_fpaths[i]) as f:
                for line in f:
                    if 'Complete BUSCOs' in line:
                        complete_buscos = int(line.split()[0])
                    elif 'Fragmented' in line:
                        part_buscos = int(line.split()[0])
                    elif 'Total' in line:
                        total_buscos = int(line.split()[0])
            if total_buscos != 0:
                report.add_field(
                    reporting.Fields.BUSCO_COMPLETE,
                    ('%.2f' % (float(complete_buscos) * 100.0 / total_buscos)))
                report.add_field(reporting.Fields.BUSCO_PART,
                                 ('%.2f' %
                                  (float(part_buscos) * 100.0 / total_buscos)))
            shutil.copy(summary_fpaths[i], output_dir)
        else:
            logger.error('Failed running BUSCO for ' + contigs_fpath +
                         '. See the log for detailed information.')
    if not qconfig.debug:
        cleanup(output_dir)
    logger.info('Done.')
Example No. 26
def do(reference, contigs_fpaths, is_cyclic, output_dir, old_contigs_fpaths, bed_fpath=None):
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    logger.print_timestamp()
    logger.main_info('Running Contig analyzer...')
    num_nf_errors = logger._num_nf_errors
    success_compilation = compile_aligner(logger)
    if qconfig.test and is_emem_aligner():
        success_compilation = check_emem_functionality(logger)
    if not success_compilation:
        logger.main_info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.')
        return dict(zip(contigs_fpaths, [NucmerStatus.FAILED] * len(contigs_fpaths))), None

    create_nucmer_output_dir(output_dir)
    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    if qconfig.memory_efficient:
        threads = 1
    else:
        threads = max(1, qconfig.max_threads // n_jobs)
    if is_python2():
        from joblib import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed
    if not qconfig.splitted_ref:
        statuses_results_lengths_tuples = Parallel(n_jobs=n_jobs)(delayed(align_and_analyze)(
        is_cyclic, i, contigs_fpath, output_dir, reference, old_contigs_fpath, bed_fpath, threads=threads)
             for i, (contigs_fpath, old_contigs_fpath) in enumerate(zip(contigs_fpaths, old_contigs_fpaths)))
    else:
        if len(contigs_fpaths) >= len(qconfig.splitted_ref) and not qconfig.memory_efficient:
            statuses_results_lengths_tuples = Parallel(n_jobs=n_jobs)(delayed(align_and_analyze)(
            is_cyclic, i, contigs_fpath, output_dir, reference, old_contigs_fpath, bed_fpath, threads=threads)
                for i, (contigs_fpath, old_contigs_fpath) in enumerate(zip(contigs_fpaths, old_contigs_fpaths)))
        else:
            statuses_results_lengths_tuples = []
            for i, (contigs_fpath, old_contigs_fpath) in enumerate(zip(contigs_fpaths, old_contigs_fpaths)):
                statuses_results_lengths_tuples.append(align_and_analyze(
                is_cyclic, i, contigs_fpath, output_dir, reference, old_contigs_fpath, bed_fpath,
                parallel_by_chr=True, threads=qconfig.max_threads))

    # unzipping
    statuses, results, aligned_lengths = [x[0] for x in statuses_results_lengths_tuples], \
                                         [x[1] for x in statuses_results_lengths_tuples], \
                                         [x[2] for x in statuses_results_lengths_tuples]
    reports = []

    for index, fname in enumerate(contigs_fpaths):
        report = reporting.get(fname)
        if statuses[index] == NucmerStatus.OK:
            reports.append(save_result(results[index], report, fname))
        elif statuses[index] == NucmerStatus.NOT_ALIGNED:
            save_result_for_unaligned(results[index], report)

    nucmer_statuses = dict(zip(contigs_fpaths, statuses))
    aligned_lengths_per_fpath = dict(zip(contigs_fpaths, aligned_lengths))

    if NucmerStatus.OK in nucmer_statuses.values():
        reporting.save_misassemblies(output_dir)
        reporting.save_unaligned(output_dir)
        if qconfig.draw_plots:
            from . import plotter
            plotter.draw_misassembl_plot(reports, join(output_dir, 'misassemblies_plot'), 'Misassemblies')
        if qconfig.is_combined_ref:
            save_combined_ref_stats(results, contigs_fpaths, ref_labels_by_chromosomes, output_dir, logger)

    oks = list(nucmer_statuses.values()).count(NucmerStatus.OK)
    not_aligned = list(nucmer_statuses.values()).count(NucmerStatus.NOT_ALIGNED)
    failed = list(nucmer_statuses.values()).count(NucmerStatus.FAILED)
    errors = list(nucmer_statuses.values()).count(NucmerStatus.ERROR)
    problems = not_aligned + failed + errors
    all = len(nucmer_statuses)

    logger._num_nf_errors = num_nf_errors + errors

    if oks == all:
        logger.main_info('Done.')
    if oks < all and problems < all:
        logger.main_info('Done for ' + str(all - problems) + ' out of ' + str(all) + '. For the rest, only basic stats are going to be evaluated.')
    if problems == all:
        logger.main_info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.')
        if not qconfig.test and is_emem_aligner():
            logger.warning('Please rerun QUAST using --test option to ensure that E-MEM aligner works properly.')

    return nucmer_statuses, aligned_lengths_per_fpath
Example No. 27
File: quast.py Project: ablab/quast
def main(args):
    check_dirpath(
        qconfig.QUAST_HOME,
        "You are trying to run it from "
        + str(qconfig.QUAST_HOME)
        + "\n."
        + "Please, put QUAST in a different directory, then try again.\n",
        exit_code=3,
    )

    if not args:
        qconfig.usage()
        sys.exit(0)

    try:
        import imp

        imp.reload(qconfig)
    except:
        reload(qconfig)

    try:
        locale.setlocale(locale.LC_ALL, "en_US.utf8")
    except Exception:
        try:
            locale.setlocale(locale.LC_ALL, "en_US.UTF-8")
        except Exception:
            logger.warning("Python locale settings can't be changed")
    quast_path = [os.path.realpath(__file__)]
    quast_py_args, contigs_fpaths = parse_options(logger, quast_path + args)
    output_dirpath, ref_fpath, labels = qconfig.output_dirpath, qconfig.reference, qconfig.labels
    corrected_dirpath = os.path.join(output_dirpath, qconfig.corrected_dirname)
    logger.main_info()
    logger.print_params()

    ########################################################################
    from quast_libs import reporting

    reports = reporting.reports
    try:
        import imp

        imp.reload(reporting)
    except:
        reload(reporting)
    reporting.reports = reports
    reporting.assembly_fpaths = []
    from quast_libs import plotter  # Do not remove this line! It would lead to a warning in matplotlib.

    if qconfig.is_combined_ref:
        corrected_dirpath = os.path.join(output_dirpath, "..", qconfig.corrected_dirname)
    else:
        if os.path.isdir(corrected_dirpath):
            shutil.rmtree(corrected_dirpath)
        os.mkdir(corrected_dirpath)

    qconfig.set_max_threads(logger)
    # PROCESSING REFERENCE
    if ref_fpath:
        logger.main_info()
        logger.main_info("Reference:")
        ref_fpath = qutils.correct_reference(ref_fpath, corrected_dirpath)
    else:
        ref_fpath = ""

    # PROCESSING CONTIGS
    logger.main_info()
    logger.main_info("Contigs:")

    contigs_fpaths, old_contigs_fpaths = qutils.correct_contigs(contigs_fpaths, corrected_dirpath, labels, reporting)
    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        report.add_field(reporting.Fields.NAME, qutils.label_from_fpath(contigs_fpath))

    qconfig.assemblies_num = len(contigs_fpaths)

    reads_fpaths = []
    cov_fpath = qconfig.cov_fpath
    physical_cov_fpath = qconfig.phys_cov_fpath
    if qconfig.forward_reads:
        reads_fpaths.append(qconfig.forward_reads)
    if qconfig.reverse_reads:
        reads_fpaths.append(qconfig.reverse_reads)
    if (reads_fpaths or qconfig.sam or qconfig.bam) and ref_fpath:
        bed_fpath, cov_fpath, physical_cov_fpath = reads_analyzer.do(
            ref_fpath,
            contigs_fpaths,
            reads_fpaths,
            None,
            os.path.join(output_dirpath, qconfig.variation_dirname),
            external_logger=logger,
            sam_fpath=qconfig.sam,
            bam_fpath=qconfig.bam,
            bed_fpath=qconfig.bed,
        )
        qconfig.bed = bed_fpath

    if not contigs_fpaths:
        logger.error(
            "None of the assembly files contains correct contigs. "
            "Please, provide different files or decrease --min-contig threshold.",
            fake_if_nested_run=True,
        )
        return 4

    if qconfig.used_colors and qconfig.used_ls:
        for i, label in enumerate(labels):
            plotter.dict_color_and_ls[label] = (qconfig.used_colors[i], qconfig.used_ls[i])

    qconfig.assemblies_fpaths = contigs_fpaths

    # Where all pdfs will be saved
    all_pdf_fpath = os.path.join(output_dirpath, qconfig.plots_fname)
    all_pdf_file = None

    if qconfig.draw_plots and plotter.can_draw_plots:
        try:
            from matplotlib.backends.backend_pdf import PdfPages

            all_pdf_file = PdfPages(all_pdf_fpath)
        except:
            all_pdf_file = None

    if qconfig.json_output_dirpath:
        from quast_libs.html_saver import json_saver

        if json_saver.simplejson_error:
            qconfig.json_output_dirpath = None  # disable JSON output if simplejson is unavailable

    ########################################################################
    ### Stats and plots
    ########################################################################
    from quast_libs import basic_stats

    basic_stats.do(
        ref_fpath,
        contigs_fpaths,
        os.path.join(output_dirpath, "basic_stats"),
        qconfig.json_output_dirpath,
        output_dirpath,
    )

    aligned_contigs_fpaths = []
    aligned_lengths_lists = []
    contig_alignment_plot_fpath = None
    icarus_html_fpath = None
    if ref_fpath:
        ########################################################################
        ### former PLANTAKOLYA, PLANTAGORA
        ########################################################################
        from quast_libs import contigs_analyzer

        is_cyclic = qconfig.prokaryote and not qconfig.check_for_fragmented_ref
        nucmer_statuses, aligned_lengths_per_fpath = contigs_analyzer.do(
            ref_fpath,
            contigs_fpaths,
            is_cyclic,
            os.path.join(output_dirpath, "contigs_reports"),
            old_contigs_fpaths,
            qconfig.bed,
        )
        for contigs_fpath in contigs_fpaths:
            if nucmer_statuses[contigs_fpath] == contigs_analyzer.NucmerStatus.OK:
                aligned_contigs_fpaths.append(contigs_fpath)
                aligned_lengths_lists.append(aligned_lengths_per_fpath[contigs_fpath])

    # Before continue evaluating, check if nucmer didn't skip all of the contigs files.
    detailed_contigs_reports_dirpath = None
    features_containers = None
    if len(aligned_contigs_fpaths) and ref_fpath:
        detailed_contigs_reports_dirpath = os.path.join(output_dirpath, "contigs_reports")

        ########################################################################
        ### NAx and NGAx ("aligned Nx and NGx")
        ########################################################################
        from quast_libs import aligned_stats

        aligned_stats.do(
            ref_fpath,
            aligned_contigs_fpaths,
            output_dirpath,
            qconfig.json_output_dirpath,
            aligned_lengths_lists,
            os.path.join(output_dirpath, "aligned_stats"),
        )

        ########################################################################
        ### GENOME_ANALYZER
        ########################################################################
        from quast_libs import genome_analyzer

        features_containers = genome_analyzer.do(
            ref_fpath,
            aligned_contigs_fpaths,
            output_dirpath,
            qconfig.json_output_dirpath,
            qconfig.genes,
            qconfig.operons,
            detailed_contigs_reports_dirpath,
            os.path.join(output_dirpath, "genome_stats"),
        )

    if qconfig.with_gage:
        ########################################################################
        ### GAGE
        ########################################################################
        if not ref_fpath:
            logger.warning("GAGE can't be run without a reference and will be skipped.")
        else:
            from quast_libs import gage

            gage.do(ref_fpath, contigs_fpaths, output_dirpath)

    genes_by_labels = None
    if qconfig.gene_finding:
        if qconfig.glimmer:
            ########################################################################
            ### Glimmer
            ########################################################################
            from quast_libs import glimmer

            genes_by_labels = glimmer.do(
                contigs_fpaths, qconfig.genes_lengths, os.path.join(output_dirpath, "predicted_genes")
            )
        else:
            ########################################################################
            ### GeneMark
            ########################################################################
            from quast_libs import genemark

            genes_by_labels = genemark.do(
                contigs_fpaths,
                qconfig.genes_lengths,
                os.path.join(output_dirpath, "predicted_genes"),
                qconfig.prokaryote,
                qconfig.meta,
            )

    else:
        logger.main_info("")
        logger.notice("Genes are not predicted by default. Use --gene-finding option to enable it.")
    ########################################################################
    reports_fpaths, transposed_reports_fpaths = reporting.save_total(output_dirpath)

    ########################################################################
    ### LARGE DRAWING TASKS
    ########################################################################
    if qconfig.draw_plots or qconfig.create_icarus_html:
        logger.print_timestamp()
        logger.main_info("Creating large visual summaries...")
        logger.main_info("This may take a while: press Ctrl-C to skip this step..")
        try:
            if detailed_contigs_reports_dirpath:
                report_for_icarus_fpath_pattern = os.path.join(
                    detailed_contigs_reports_dirpath, qconfig.icarus_report_fname_pattern
                )
                stdout_pattern = os.path.join(detailed_contigs_reports_dirpath, qconfig.contig_report_fname_pattern)
            else:
                report_for_icarus_fpath_pattern = None
                stdout_pattern = None
            draw_alignment_plots = qconfig.draw_svg or qconfig.create_icarus_html
            number_of_steps = sum([int(bool(value)) for value in [draw_alignment_plots, all_pdf_file]])
            if draw_alignment_plots:
                ########################################################################
                ### VISUALIZE CONTIG ALIGNMENT
                ########################################################################
                logger.main_info("  1 of %d: Creating Icarus viewers..." % number_of_steps)
                from quast_libs import icarus

                icarus_html_fpath, contig_alignment_plot_fpath = icarus.do(
                    contigs_fpaths,
                    report_for_icarus_fpath_pattern,
                    output_dirpath,
                    ref_fpath,
                    stdout_pattern=stdout_pattern,
                    features=features_containers,
                    cov_fpath=cov_fpath,
                    physical_cov_fpath=physical_cov_fpath,
                    json_output_dir=qconfig.json_output_dirpath,
                    genes_by_labels=genes_by_labels,
                )

            if all_pdf_file:
                # full report in PDF format: all tables and plots
                logger.main_info(
                    "  %d of %d: Creating PDF with all tables and plots..." % (number_of_steps, number_of_steps)
                )
                plotter.fill_all_pdf_file(all_pdf_file)
            logger.main_info("Done")
        except KeyboardInterrupt:
            logger.main_info("..step skipped!")
            os.remove(all_pdf_fpath)

    ########################################################################
    ### TOTAL REPORT
    ########################################################################
    logger.print_timestamp()
    logger.main_info("RESULTS:")
    logger.main_info("  Text versions of total report are saved to " + reports_fpaths)
    logger.main_info("  Text versions of transposed total report are saved to " + transposed_reports_fpaths)

    if qconfig.json_output_dirpath:
        json_saver.save_total_report(qconfig.json_output_dirpath, qconfig.min_contig, ref_fpath)

    if qconfig.html_report:
        from quast_libs.html_saver import html_saver

        html_saver.save_colors(output_dirpath, contigs_fpaths, plotter.dict_color_and_ls)
        html_saver.save_total_report(output_dirpath, qconfig.min_contig, ref_fpath)

    if os.path.isfile(all_pdf_fpath):
        logger.main_info("  PDF version (tables and plots) is saved to " + all_pdf_fpath)

    if icarus_html_fpath:
        logger.main_info("  Icarus (contig browser) is saved to %s" % icarus_html_fpath)

    if qconfig.draw_svg and contig_alignment_plot_fpath:
        logger.main_info("  Contig alignment plot is saved to %s" % contig_alignment_plot_fpath)

    cleanup(corrected_dirpath)
    return logger.finish_up(check_test=qconfig.test)
Example No. 28
def do(output_dir, ref_fpath, contigs_fpaths, logger):
    logger.print_timestamp()
    logger.main_info('Running analysis based on unique 101-mers...')
    addsitedir(jellyfish_python_dirpath)
    try:
        compile_jellyfish(logger)
        import jellyfish
        try:
            import imp
            imp.reload(jellyfish)
        except:
            reload(jellyfish)
        jellyfish.MerDNA.k(KMERS_LEN)
    except:
        logger.warning('Failed unique 101-mers analysis.')
        return

    checked_assemblies = []
    for contigs_fpath in contigs_fpaths:
        label = qutils.label_from_fpath_for_fname(contigs_fpath)
        if check_jf_successful_check(output_dir, contigs_fpath, contigs_fpaths,
                                     ref_fpath):
            jf_stats_fpath = join(output_dir, label + '.stat')
            stats_content = open(jf_stats_fpath).read().split('\n')
            if len(stats_content) < 4:
                continue
            logger.info('  Using existing results for ' + label + '... ')
            report = reporting.get(contigs_fpath)
            report.add_field(
                reporting.Fields.KMER_COMPLETENESS,
                '%.2f' % float(stats_content[0].strip().split(': ')[-1]))
            report.add_field(
                reporting.Fields.KMER_SCAFFOLDS_ONE_CHROM,
                '%.2f' % float(stats_content[1].strip().split(': ')[-1]))
            report.add_field(
                reporting.Fields.KMER_SCAFFOLDS_MULTI_CHROM,
                '%.2f' % float(stats_content[2].strip().split(': ')[-1]))
            report.add_field(
                reporting.Fields.KMER_SCAFFOLDS_NONE_CHROM,
                '%.2f' % float(stats_content[3].strip().split(': ')[-1]))
            checked_assemblies.append(contigs_fpath)

    contigs_fpaths = [
        fpath for fpath in contigs_fpaths if fpath not in checked_assemblies
    ]
    if len(contigs_fpaths) == 0:
        logger.info('Done.')
        return

    logger.info('Running Jellyfish on reference...')
    jf_out_fpath = join(output_dir, basename(ref_fpath) + '.jf')
    qutils.call_subprocess([
        jellyfish_bin_fpath, 'count', '-m', '101', '-U', '1', '-s',
        str(getsize(ref_fpath)), '-o', jf_out_fpath, '-t',
        str(qconfig.max_threads), ref_fpath
    ])
    ref_kmers = jellyfish.ReadMerFile(jf_out_fpath)
    os.remove(jf_out_fpath)

    logger.info('Running Jellyfish on assemblies...')
    contigs_kmers = []
    for contigs_fpath in contigs_fpaths:
        jf_out_fpath = join(output_dir, basename(contigs_fpath) + '.jf')
        qutils.call_subprocess([
            jellyfish_bin_fpath, 'count', '-m', '101', '-U', '1', '-s',
            str(getsize(contigs_fpath)), '-o', jf_out_fpath, '-t',
            str(qconfig.max_threads), contigs_fpath
        ])
        contigs_kmers.append(jellyfish.QueryMerFile(jf_out_fpath))
        os.remove(jf_out_fpath)

    logger.info('Analyzing completeness and accuracy of assemblies...')
    unique_kmers = 0
    matched_kmers = defaultdict(int)
    shared_kmers = set()
    kmer_i = 0
    for kmer, count in ref_kmers:
        unique_kmers += 1
        matches = 0
        for idx in range(len(contigs_fpaths)):
            if contigs_kmers[idx][kmer]:
                matched_kmers[idx] += 1
                matches += 1
        if matches == len(contigs_fpaths):
            if kmer_i % 100 == 0:
                shared_kmers.add(str(kmer))
            kmer_i += 1

    for idx, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        completeness = matched_kmers[idx] * 100.0 / unique_kmers
        report.add_field(reporting.Fields.KMER_COMPLETENESS,
                         '%.2f' % completeness)

    shared_kmers_by_chrom = dict()
    ref_contigs = dict((name, seq) for name, seq in read_fasta(ref_fpath))
    for name, seq in ref_contigs.items():
        seq_kmers = jellyfish.string_mers(seq)
        for kmer in seq_kmers:
            if str(kmer) in shared_kmers:
                shared_kmers_by_chrom[str(kmer)] = name

    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        len_map_to_one_chrom = 0
        len_map_to_multi_chrom = 0
        total_len = 0

        for name, seq in read_fasta(contigs_fpath):
            total_len += len(seq)
            seq_kmers = jellyfish.string_mers(seq)
            chrom_markers = []
            for kmer in seq_kmers:
                kmer_str = str(kmer)
                if kmer_str in shared_kmers_by_chrom:
                    chrom = shared_kmers_by_chrom[kmer_str]
                    chrom_markers.append(chrom)
            if len(chrom_markers) < MIN_MARKERS:
                continue
            if len(set(chrom_markers)) == 1:
                len_map_to_one_chrom += len(seq)
            else:
                len_map_to_multi_chrom += len(seq)

        len_map_to_none_chrom = total_len - len_map_to_one_chrom - len_map_to_multi_chrom
        report.add_field(reporting.Fields.KMER_SCAFFOLDS_ONE_CHROM,
                         '%.2f' % (len_map_to_one_chrom * 100.0 / total_len))
        report.add_field(reporting.Fields.KMER_SCAFFOLDS_MULTI_CHROM,
                         '%.2f' % (len_map_to_multi_chrom * 100.0 / total_len))
        report.add_field(reporting.Fields.KMER_SCAFFOLDS_NONE_CHROM,
                         '%.2f' % (len_map_to_none_chrom * 100.0 / total_len))

        create_jf_stats_file(
            output_dir, contigs_fpath, contigs_fpaths, ref_fpath,
            report.get_field(reporting.Fields.KMER_COMPLETENESS),
            len_map_to_one_chrom, len_map_to_multi_chrom,
            len_map_to_none_chrom)

    logger.info('Done.')
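
The scaffold-correctness split above assigns each contig's length to 'one chromosome', 'multiple chromosomes' or 'none of them' depending on which chromosomes its marker k-mers point to. A condensed illustrative sketch of that voting step (hypothetical inputs; not the Jellyfish-backed implementation):

def classify_contig_lengths(contig_lens, contig_markers, min_markers=10):
    # contig_lens: name -> length; contig_markers: name -> list of chromosome names
    # hit by the contig's marker k-mers (both assumed to be precomputed).
    one_chrom, multi_chrom = 0, 0
    for name, length in contig_lens.items():
        markers = contig_markers.get(name, [])
        if len(markers) < min_markers:
            continue  # too few markers to make a call; counted as 'none'
        if len(set(markers)) == 1:
            one_chrom += length
        else:
            multi_chrom += length
    none_chrom = sum(contig_lens.values()) - one_chrom - multi_chrom
    return one_chrom, multi_chrom, none_chrom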
Example No. 29
def main(args):
    check_dirpath(qconfig.QUAST_HOME, 'You are trying to run it from ' + str(qconfig.QUAST_HOME) + '.\n' +
                  'Please, put QUAST in a different directory, then try again.\n', exit_code=3)

    if not args:
        qconfig.usage(stream=sys.stderr)
        sys.exit(1)

    try:
        import imp
        imp.reload(qconfig)
        imp.reload(qutils)
    except:
        reload(qconfig)
        reload(qutils)

    try:
        locale.setlocale(locale.LC_ALL, 'en_US.utf8')
    except Exception:
        try:
            locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
        except Exception:
            logger.warning('Python locale settings can\'t be changed')
    quast_path = [os.path.realpath(__file__)]
    quast_py_args, contigs_fpaths = parse_options(logger, quast_path + args)
    output_dirpath, ref_fpath, labels = qconfig.output_dirpath, qconfig.reference, qconfig.labels
    corrected_dirpath = os.path.join(output_dirpath, qconfig.corrected_dirname)
    logger.main_info()
    logger.print_params()

    ########################################################################
    from quast_libs import reporting
    reports = reporting.reports
    try:
        import imp
        imp.reload(reporting)
    except:
        reload(reporting)
    reporting.reports = reports
    reporting.assembly_fpaths = []
    from quast_libs import plotter  # Do not remove this line! It would lead to a warning in matplotlib.

    if qconfig.is_combined_ref:
        corrected_dirpath = os.path.join(output_dirpath, '..', qconfig.corrected_dirname)
    else:
        if os.path.isdir(corrected_dirpath):
            shutil.rmtree(corrected_dirpath)
        os.mkdir(corrected_dirpath)

    qconfig.set_max_threads(logger)
    check_reads_fpaths(logger)
    # PROCESSING REFERENCE
    if ref_fpath:
        logger.main_info()
        logger.main_info('Reference:')
        original_ref_fpath = ref_fpath
        ref_fpath = qutils.correct_reference(ref_fpath, corrected_dirpath)
        if qconfig.ideal_assembly:
            ideal_assembly_fpath = ideal_assembly.do(ref_fpath, original_ref_fpath,
                                                     os.path.join(output_dirpath, qconfig.ideal_assembly_basename))
            if ideal_assembly_fpath is not None:
                contigs_fpaths.insert(0, ideal_assembly_fpath)
                labels.insert(0, 'IDEAL ASSEMBLY')
                labels = qutils.process_labels(contigs_fpaths, labels)
    else:
        ref_fpath = ''

    # PROCESSING CONTIGS
    logger.main_info()
    logger.main_info('Contigs:')

    contigs_fpaths, old_contigs_fpaths = qutils.correct_contigs(contigs_fpaths, corrected_dirpath, labels, reporting)
    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        report.add_field(reporting.Fields.NAME, qutils.label_from_fpath(contigs_fpath))

    qconfig.assemblies_num = len(contigs_fpaths)

    cov_fpath = qconfig.cov_fpath
    physical_cov_fpath = qconfig.phys_cov_fpath
    if qconfig.reads_fpaths or qconfig.reference_sam or qconfig.reference_bam or qconfig.sam_fpaths or qconfig.bam_fpaths:
        bed_fpath, cov_fpath, physical_cov_fpath = reads_analyzer.do(ref_fpath, contigs_fpaths,
                                                                     os.path.join(output_dirpath, qconfig.reads_stats_dirname),
                                                                     external_logger=logger)
        qconfig.bed = bed_fpath

    if not contigs_fpaths:
        logger.error("None of the assembly files contains correct contigs. "
              "Please, provide different files or decrease --min-contig threshold.",
              fake_if_nested_run=True)
        return 4

    if qconfig.used_colors and qconfig.used_ls:
        for i, label in enumerate(labels):
            plotter_data.dict_color_and_ls[label] = (qconfig.used_colors[i], qconfig.used_ls[i])

    qconfig.assemblies_fpaths = contigs_fpaths

    # Where all pdfs will be saved
    all_pdf_fpath = None
    if qconfig.draw_plots and plotter.can_draw_plots:
        all_pdf_fpath = os.path.join(output_dirpath, qconfig.plots_fname)

    if qconfig.json_output_dirpath:
        from quast_libs.html_saver import json_saver
        if json_saver.simplejson_error:
            qconfig.json_output_dirpath = None

    ########################################################################
    ### Stats and plots
    ########################################################################
    from quast_libs import basic_stats
    icarus_gc_fpath, circos_gc_fpath = basic_stats.do(ref_fpath, contigs_fpaths, os.path.join(output_dirpath, 'basic_stats'), output_dirpath)

    if qconfig.large_genome and ref_fpath:
        unique_kmers.do(os.path.join(output_dirpath, 'basic_stats'), ref_fpath, contigs_fpaths, logger)

    aligned_contigs_fpaths = []
    aligned_lengths_lists = []
    contig_alignment_plot_fpath = None
    icarus_html_fpath = None
    circos_png_fpath = None
    if ref_fpath:
        ########################################################################
        ### former PLANTAKOLYA, PLANTAGORA
        ########################################################################
        from quast_libs import contigs_analyzer
        is_cyclic = qconfig.prokaryote and not qconfig.check_for_fragmented_ref
        nucmer_statuses, aligned_lengths_per_fpath = contigs_analyzer.do(
            ref_fpath, contigs_fpaths, is_cyclic, os.path.join(output_dirpath, 'contigs_reports'),
            old_contigs_fpaths, qconfig.bed)
        for contigs_fpath in contigs_fpaths:
            if nucmer_statuses[contigs_fpath] == contigs_analyzer.NucmerStatus.OK:
                aligned_contigs_fpaths.append(contigs_fpath)
                aligned_lengths_lists.append(aligned_lengths_per_fpath[contigs_fpath])

    # Before continue evaluating, check if nucmer didn't skip all of the contigs files.
    detailed_contigs_reports_dirpath = None
    features_containers = None
    if len(aligned_contigs_fpaths) and ref_fpath:
        detailed_contigs_reports_dirpath = os.path.join(output_dirpath, 'contigs_reports')

        ########################################################################
        ### NAx and NGAx ("aligned Nx and NGx")
        ########################################################################
        from quast_libs import aligned_stats
        aligned_stats.do(
            ref_fpath, aligned_contigs_fpaths, output_dirpath,
            aligned_lengths_lists, os.path.join(output_dirpath, 'aligned_stats'))

        ########################################################################
        ### GENOME_ANALYZER
        ########################################################################
        from quast_libs import genome_analyzer
        features_containers = genome_analyzer.do(
            ref_fpath, aligned_contigs_fpaths, output_dirpath,
            qconfig.genes, qconfig.operons, detailed_contigs_reports_dirpath,
            os.path.join(output_dirpath, 'genome_stats'))

    genes_by_labels = None
    if qconfig.gene_finding:
        if qconfig.glimmer:
            ########################################################################
            ### Glimmer
            ########################################################################
            from quast_libs import glimmer
            genes_by_labels = glimmer.do(contigs_fpaths, qconfig.genes_lengths, os.path.join(output_dirpath, 'predicted_genes'))
        if not qconfig.glimmer or qconfig.test:
            ########################################################################
            ### GeneMark
            ########################################################################
            from quast_libs import genemark
            genes_by_labels = genemark.do(contigs_fpaths, qconfig.genes_lengths, os.path.join(output_dirpath, 'predicted_genes'),
                        qconfig.prokaryote, qconfig.metagenemark)
    else:
        logger.main_info("")
        logger.notice("Genes are not predicted by default. Use --gene-finding option to enable it.")

    if qconfig.rna_gene_finding:
        run_barrnap.do(contigs_fpaths, os.path.join(output_dirpath, 'predicted_genes'), logger)

    if qconfig.run_busco and not qconfig.is_combined_ref:
        if qconfig.platform_name == 'macosx':
            logger.main_info("")
            logger.warning("BUSCO can be run on Linux only")
        elif sys.version[0:3] == '2.5':
            logger.main_info("")
            logger.warning("BUSCO does not support Python versions older than 2.6.")
        else:
            from quast_libs import run_busco
            run_busco.do(contigs_fpaths, os.path.join(output_dirpath, qconfig.busco_dirname), logger)
    ########################################################################
    reports_fpaths, transposed_reports_fpaths = reporting.save_total(output_dirpath)

    ########################################################################
    ### LARGE DRAWING TASKS
    ########################################################################
    if qconfig.draw_plots or qconfig.create_icarus_html:
        logger.print_timestamp()
        logger.main_info('Creating large visual summaries...')
        logger.main_info('This may take a while: press Ctrl-C to skip this step..')
        try:
            if detailed_contigs_reports_dirpath:
                report_for_icarus_fpath_pattern = os.path.join(detailed_contigs_reports_dirpath, qconfig.icarus_report_fname_pattern)
                stdout_pattern = os.path.join(detailed_contigs_reports_dirpath, qconfig.contig_report_fname_pattern)
            else:
                report_for_icarus_fpath_pattern = None
                stdout_pattern = None
            draw_alignment_plots = qconfig.draw_svg or qconfig.create_icarus_html
            draw_circos_plot = qconfig.draw_plots and ref_fpath and len(aligned_contigs_fpaths) and not qconfig.space_efficient
            number_of_steps = sum([int(bool(value)) for value in [draw_alignment_plots, draw_circos_plot, all_pdf_fpath]])
            if draw_alignment_plots:
                ########################################################################
                ### VISUALIZE CONTIG ALIGNMENT
                ########################################################################
                logger.main_info('  1 of %d: Creating Icarus viewers...' % number_of_steps)
                from quast_libs import icarus
                icarus_html_fpath, contig_alignment_plot_fpath = icarus.do(
                    contigs_fpaths, report_for_icarus_fpath_pattern, output_dirpath, ref_fpath,
                    stdout_pattern=stdout_pattern, features=features_containers,
                    cov_fpath=cov_fpath, physical_cov_fpath=physical_cov_fpath, gc_fpath=icarus_gc_fpath,
                    json_output_dir=qconfig.json_output_dirpath, genes_by_labels=genes_by_labels)

            if draw_circos_plot:
                logger.main_info('  %d of %d: Creating Circos plots...' % (2 if draw_alignment_plots else 1, number_of_steps))
                from quast_libs import circos
                circos_png_fpath, circos_legend_fpath = circos.do(ref_fpath, contigs_fpaths, report_for_icarus_fpath_pattern, circos_gc_fpath,
                                                                  features_containers, cov_fpath, os.path.join(output_dirpath, 'circos'), logger)

            if all_pdf_fpath:
                # full report in PDF format: all tables and plots
                logger.main_info('  %d of %d: Creating PDF with all tables and plots...' % (number_of_steps, number_of_steps))
                plotter.fill_all_pdf_file(all_pdf_fpath)
            logger.main_info('Done')
        except KeyboardInterrupt:
            logger.main_info('..step skipped!')
            if all_pdf_fpath and os.path.isfile(all_pdf_fpath):
                os.remove(all_pdf_fpath)

    ########################################################################
    ### TOTAL REPORT
    ########################################################################
    logger.print_timestamp()
    logger.main_info('RESULTS:')
    logger.main_info('  Text versions of total report are saved to ' + reports_fpaths)
    logger.main_info('  Text versions of transposed total report are saved to ' + transposed_reports_fpaths)

    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        html_saver.save_colors(output_dirpath, contigs_fpaths, plotter_data.dict_color_and_ls)
        html_saver.save_total_report(output_dirpath, qconfig.min_contig, ref_fpath)

    if all_pdf_fpath and os.path.isfile(all_pdf_fpath):
        logger.main_info('  PDF version (tables and plots) is saved to ' + all_pdf_fpath)

    if circos_png_fpath:
        logger.main_info('  Circos plot is saved to %s (the annotation is in %s). Circos configuration file is saved to %s' %
                         (circos_png_fpath, circos_legend_fpath, circos_png_fpath.replace('.png', '.conf')))

    if icarus_html_fpath:
        logger.main_info('  Icarus (contig browser) is saved to %s' % icarus_html_fpath)

    if qconfig.draw_svg and contig_alignment_plot_fpath:
        logger.main_info('  Contig alignment plot is saved to %s' % contig_alignment_plot_fpath)

    cleanup(corrected_dirpath)
    return logger.finish_up(check_test=qconfig.test)
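The "large drawing tasks" block in the example above follows a reusable pattern: count the enabled sub-steps up front, report progress as "i of N", and treat Ctrl-C as "skip this block only", cleaning up a half-written PDF. A minimal, self-contained sketch of that pattern (the function and variable names here are illustrative, not QUAST's own):

import os

def run_skippable_steps(steps, all_pdf_fpath=None):
    # steps: list of (enabled, callable) pairs; only enabled ones are counted
    number_of_steps = sum(int(bool(enabled)) for enabled, _ in steps)
    done = 0
    try:
        for enabled, task in steps:
            if not enabled:
                continue
            done += 1
            print('  %d of %d: %s...' % (done, number_of_steps, task.__name__))
            task()
        print('Done')
    except KeyboardInterrupt:
        print('..step skipped!')
        # remove a partially written PDF, mirroring the cleanup in the example above
        if all_pdf_fpath and os.path.isfile(all_pdf_fpath):
            os.remove(all_pdf_fpath)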
Example No. 30
0
def do(contigs_fpaths, output_dir, logger):
    logger.print_timestamp()
    logger.info('Running BUSCO...')

    compilation_success = True

    augustus_dirpath = download_augustus(logger)
    if not augustus_dirpath:
        compilation_success = False
    elif not compile_tool('Augustus', augustus_dirpath, [join('bin', 'augustus')], logger=logger):
        compilation_success = False

    if compilation_success and not download_blast_binaries(logger=logger, filenames=blast_filenames):
        compilation_success = False

    if not compilation_success:
        logger.info('Failed finding conservative genes.')
        return

    set_augustus_dir(augustus_dirpath)
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    tmp_dir = join(output_dir, 'tmp')
    if not os.path.isdir(tmp_dir):
        os.makedirs(tmp_dir)

    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    busco_threads = max(1, qconfig.max_threads // n_jobs)

    clade_dirpath = download_db(logger, is_prokaryote=qconfig.prokaryote, is_fungus=qconfig.is_fungus)
    if not clade_dirpath:
        logger.info('Failed finding conservative genes.')
        return

    log_fpath = join(output_dir, 'busco.log')
    logger.info('Logging to ' + log_fpath + '...')
    busco_args = [(['-i', contigs_fpath, '-o', qutils.label_from_fpath_for_fname(contigs_fpath), '-l', clade_dirpath,
                    '-m', 'genome', '-f', '-z', '-c', str(busco_threads), '-t', tmp_dir,
                    '--augustus_parameters=\'--AUGUSTUS_CONFIG_PATH=' + join(augustus_dirpath, 'config') + '\'' ], output_dir)
                    for contigs_fpath in contigs_fpaths]
    summary_fpaths = run_parallel(busco.main, busco_args, qconfig.max_threads)
    if not any(fpath for fpath in summary_fpaths):
        logger.error('Failed running BUSCO for all the assemblies. See ' + log_fpath + ' for information.')
        return

    # saving results
    for i, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)

        if summary_fpaths[i] and os.path.isfile(summary_fpaths[i]):
            total_buscos, part_buscos, complete_buscos = 0, 0, 0
            with open(summary_fpaths[i]) as f:
                for line in f:
                    if 'Complete BUSCOs' in line:
                        complete_buscos = int(line.split()[0])
                    elif 'Fragmented' in line:
                        part_buscos = int(line.split()[0])
                    elif 'Total' in line:
                        total_buscos = int(line.split()[0])
            if total_buscos != 0:
                report.add_field(reporting.Fields.BUSCO_COMPLETE, ('%.2f' % (float(complete_buscos) * 100.0 / total_buscos)))
                report.add_field(reporting.Fields.BUSCO_PART, ('%.2f' % (float(part_buscos) * 100.0 / total_buscos)))
        else:
            logger.error(
                'Failed running BUSCO for ' + contigs_fpath + '. See ' + log_fpath + ' for information.')
    logger.info('Done.')
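The per-assembly result handling above boils down to scanning BUSCO's short-summary file for three counters and converting two of them to percentages. A standalone sketch of just that parsing step (the file path and return value are illustrative assumptions, not QUAST's API):

def parse_busco_summary(summary_fpath):
    # scan the short summary for the 'Complete', 'Fragmented' and 'Total' counters
    total_buscos, part_buscos, complete_buscos = 0, 0, 0
    with open(summary_fpath) as f:
        for line in f:
            if 'Complete BUSCOs' in line:
                complete_buscos = int(line.split()[0])
            elif 'Fragmented' in line:
                part_buscos = int(line.split()[0])
            elif 'Total' in line:
                total_buscos = int(line.split()[0])
    if total_buscos == 0:
        return None  # nothing to report for this assembly
    return {'complete_pct': 100.0 * complete_buscos / total_buscos,
            'fragmented_pct': 100.0 * part_buscos / total_buscos}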
Example No. 31
0
def js_data_gen(assemblies, contigs_fpaths, chromosomes_length, output_dirpath, structures_by_labels,
                contigs_by_assemblies, ambiguity_alignments_by_labels=None, contig_names_by_refs=None, ref_fpath=None,
                stdout_pattern=None, features_data=None, cov_fpath=None, physical_cov_fpath=None, json_output_dir=None):
    chr_names = []
    if chromosomes_length and assemblies:
        chr_to_aligned_blocks = OrderedDict()
        chr_names = list(chromosomes_length.keys())
        for assembly in assemblies.assemblies:
            chr_to_aligned_blocks[assembly.label] = defaultdict(list)
            similar_correct = 0
            similar_misassembled = 0

            for align in assembly.alignments:
                chr_to_aligned_blocks[assembly.label][align.ref_name].append(align)
                if align.similar:
                    if align.misassembled:
                        similar_misassembled += 1
                    else:
                        similar_correct += 1
            report = reporting.get(assembly.fpath)
            report.add_field(reporting.Fields.SIMILAR_CONTIGS, similar_correct)
            report.add_field(reporting.Fields.SIMILAR_MIS_BLOCKS, similar_misassembled)

    main_menu_fpath = os.path.join(output_dirpath, qconfig.icarus_html_fname)
    output_all_files_dir_path = os.path.join(output_dirpath, qconfig.icarus_dirname)
    if not os.path.exists(output_all_files_dir_path):
        os.mkdir(output_all_files_dir_path)

    chr_full_names, contig_names_by_refs = group_references(chr_names, contig_names_by_refs, chromosomes_length, ref_fpath)

    cov_data, not_covered, max_depth = parse_cov_fpath(cov_fpath, chr_names, chr_full_names, contig_names_by_refs)
    physical_cov_data, not_covered, physical_max_depth = parse_cov_fpath(physical_cov_fpath, chr_names, chr_full_names, contig_names_by_refs)

    chr_sizes = {}
    num_contigs = {}
    aligned_bases = genome_analyzer.get_ref_aligned_lengths()
    nx_marks = [reporting.Fields.N50, reporting.Fields.N75, reporting.Fields.NG50, reporting.Fields.NG75]

    assemblies_data, assemblies_contig_size_data, assemblies_n50 = get_assemblies_data(contigs_fpaths, output_all_files_dir_path, stdout_pattern, nx_marks)

    ref_contigs_dict = {}
    chr_lengths_dict = {}

    ref_data = 'var references_by_id = {};\n'
    chr_names_by_id = dict((chrom, str(i)) for i, chrom in enumerate(chr_names))
    for chrom, i in chr_names_by_id.items():
        ref_data += 'references_by_id["' + str(i) + '"] = "' + chrom + '";\n'
    for i, chr in enumerate(chr_full_names):
        if contig_names_by_refs:
            ref_contigs = [contig for contig in chr_names if contig_names_by_refs[contig] == chr]
        elif len(chr_full_names) == 1:
            ref_contigs = chr_names
        else:
            ref_contigs = [chr]
        ref_contigs_dict[chr] = ref_contigs
        chr_lengths_dict[chr] = [0] + [chromosomes_length[contig] for contig in ref_contigs]

    num_misassemblies = defaultdict(int)
    aligned_bases_by_chr = defaultdict(list)
    aligned_assemblies = defaultdict(set)
    for i, chr in enumerate(chr_full_names):
        ref_contigs = ref_contigs_dict[chr]
        chr_lengths = chr_lengths_dict[chr]
        chr_size = sum([chromosomes_length[contig] for contig in ref_contigs])
        chr_sizes[chr] = chr_size
        num_contigs[chr] = len(ref_contigs)
        data_str = []
        data_str.append('var chromosomes_len = {};')
        for ref_contig in ref_contigs:
            l = chromosomes_length[ref_contig]
            data_str.append('chromosomes_len["' + ref_contig + '"] = ' + str(l) + ';')
            aligned_bases_by_chr[chr].extend(aligned_bases[ref_contig])

        cov_data_str = format_cov_data(cov_data, max_depth, chr, 'coverage_data', 'reads_max_depth') if cov_data else None
        physical_cov_data_str = format_cov_data(physical_cov_data, physical_max_depth, chr, 'physical_coverage_data', 'physical_max_depth') \
            if physical_cov_data else None

        alignment_viewer_fpath, ref_data_str, contigs_structure_str, additional_assemblies_data, ms_selectors, num_misassemblies[chr], aligned_assemblies[chr] = \
            prepare_alignment_data_for_one_ref(chr, chr_full_names, chr_names_by_id, ref_contigs, data_str, chr_to_aligned_blocks, structures_by_labels,
                                               contigs_by_assemblies, ambiguity_alignments_by_labels=ambiguity_alignments_by_labels,
                                               cov_data_str=cov_data_str, physical_cov_data_str=physical_cov_data_str,
                                               contig_names_by_refs=contig_names_by_refs, output_dir_path=output_all_files_dir_path)
        ref_name = qutils.name_from_fpath(ref_fpath)
        save_alignment_data_for_one_ref(chr, ref_contigs, ref_name, json_output_dir, alignment_viewer_fpath, ref_data_str, ms_selectors,
                                        ref_data=ref_data, features_data=features_data, assemblies_data=assemblies_data,
                                        contigs_structure_str=contigs_structure_str, additional_assemblies_data=additional_assemblies_data)

    contigs_sizes_str, too_many_contigs = get_contigs_data(contigs_by_assemblies, nx_marks, assemblies_n50, structures_by_labels,
                                                           contig_names_by_refs, chr_names, chr_full_names)
    all_data = assemblies_data + assemblies_contig_size_data + contigs_sizes_str
    save_contig_size_html(output_all_files_dir_path, json_output_dir, too_many_contigs, all_data)

    icarus_links = defaultdict(list)
    if len(chr_full_names) > 1:
        chr_link = qconfig.icarus_html_fname
        icarus_links["links"].append(chr_link)
        icarus_links["links_names"].append(qconfig.icarus_link)

    main_menu_template_fpath = html_saver.get_real_path(qconfig.icarus_menu_template_fname)
    main_data_dict = dict()

    labels = [qconfig.assembly_labels_by_fpath[contigs_fpath] for contigs_fpath in contigs_fpaths]
    main_data_dict['assemblies'] = labels
    html_saver.save_icarus_data(json_output_dir, ', '.join(labels), 'assemblies')

    contig_size_browser_fpath = os.path.join(qconfig.icarus_dirname, qconfig.contig_size_viewer_fname)
    main_data_dict['contig_size_html'] = contig_size_browser_fpath
    html_saver.save_icarus_data(json_output_dir, contig_size_browser_fpath, 'contig_size_html')
    if not chr_names:
        icarus_links["links"].append(contig_size_browser_fpath)
        icarus_links["links_names"].append(qconfig.icarus_link)

    if chr_full_names and (len(chr_full_names) > 1 or qconfig.is_combined_ref):
        main_data_dict['table_references'] = {'references': []}
        num_aligned_assemblies = [len(aligned_assemblies[chr]) for chr in chr_full_names]
        is_unaligned_asm_exists = len(set(num_aligned_assemblies)) > 1
        if is_unaligned_asm_exists:
            main_data_dict['table_references']['th_assemblies'] = True
        for chr in sorted(chr_full_names):
            chr_link, chr_name, chr_genome, chr_size, tooltip = get_info_by_chr(chr, aligned_bases_by_chr, chr_sizes, contigs_fpaths,
                                                                                contig_names_by_refs, one_chromosome=len(chr_full_names) == 1)
            reference_dict = dict()
            reference_dict['chr_link'] = chr_link
            reference_dict['tooltip'] = tooltip
            reference_dict['chr_name'] = os.path.basename(chr_name)
            reference_dict['num_contigs'] = str(num_contigs[chr])
            reference_dict['chr_size'] = format_long_numbers(chr_size)
            if is_unaligned_asm_exists:
                reference_dict['num_assemblies'] = str(len(aligned_assemblies[chr]))
            reference_dict['chr_gf'] = '%.3f' % chr_genome
            reference_dict['num_misassemblies'] = str(num_misassemblies[chr])
            main_data_dict['table_references']['references'].append(reference_dict)
        html_saver.save_icarus_data(json_output_dir, main_data_dict['table_references'], 'table_references', as_text=False)
    else:
        if chr_full_names:
            chr = chr_full_names[0]
            chr_link, chr_name, chr_genome, chr_size, tooltip = get_info_by_chr(chr, aligned_bases_by_chr, chr_sizes, contigs_fpaths,
                                                                                contig_names_by_refs, one_chromosome=True)
            main_data_dict['one_reference'] = dict()
            main_data_dict['one_reference']['alignment_link'] = chr_link
            main_data_dict['one_reference']['ref_fpath'] = os.path.basename(ref_fpath)
            main_data_dict['one_reference']['ref_fragments'] = str(num_contigs[chr])
            main_data_dict['one_reference']['ref_size'] = format_long_numbers(chr_size)
            main_data_dict['one_reference']['ref_gf'] = '%.3f' % chr_genome
            main_data_dict['one_reference']['num_misassemblies'] = str(num_misassemblies[chr])
            icarus_links["links"].append(chr_link)
            icarus_links["links_names"].append(qconfig.icarus_link)
            html_saver.save_icarus_data(json_output_dir, main_data_dict['one_reference'], 'menu_reference', as_text=False)
    html_saver.save_icarus_html(main_menu_template_fpath, main_menu_fpath, main_data_dict)

    html_saver.save_icarus_links(output_dirpath, icarus_links)
    if json_output_dir:
        json_saver.save_icarus_links(json_output_dir, icarus_links)

    return main_menu_fpath
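The first loop of js_data_gen() simply buckets each assembly's alignments by reference sequence name and tallies "similar" blocks into correct vs. misassembled. A minimal sketch of that grouping, using a stand-in Alignment type instead of QUAST's internal class:

from collections import defaultdict, namedtuple

Alignment = namedtuple('Alignment', ['ref_name', 'similar', 'misassembled'])

def group_alignments_by_ref(alignments):
    aligned_blocks = defaultdict(list)
    similar_correct, similar_misassembled = 0, 0
    for align in alignments:
        aligned_blocks[align.ref_name].append(align)
        if align.similar:
            if align.misassembled:
                similar_misassembled += 1
            else:
                similar_correct += 1
    return aligned_blocks, similar_correct, similar_misassembled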
Example No. 32
0
def do(fasta_fpaths, gene_lengths, out_dirpath, prokaryote, meta):
    logger.print_timestamp()
    if LICENSE_LIMITATIONS_MODE:
        logger.warning(
            "GeneMark tool can't be started because of license limitations!")
        return

    if meta:
        tool_name = 'MetaGeneMark'
        tool_dirname = 'genemark'
        gmhmm_p_function = gmhmm_p_metagenomic
    elif prokaryote:
        tool_name = 'GeneMarkS'
        tool_dirname = 'genemark'
        gmhmm_p_function = gmhmm_p_everyGC
    else:
        tool_name = 'GeneMark-ES'
        tool_dirname = 'genemark-es'
        gmhmm_p_function = gm_es

    logger.main_info('Running %s...' % tool_name)

    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, tool_dirname,
                                qconfig.platform_name)
    if not os.path.exists(tool_dirpath):
        logger.warning(
            '  Sorry, can\'t use %s on this platform, skipping gene prediction.'
            % tool_name)
    elif not install_genemark():
        logger.warning(
            '  Can\'t copy the license key to ~/.gm_key, skipping gene prediction.'
        )
    else:
        if not os.path.isdir(out_dirpath):
            os.mkdir(out_dirpath)
        tmp_dirpath = os.path.join(out_dirpath, 'tmp')
        if not os.path.isdir(tmp_dirpath):
            os.mkdir(tmp_dirpath)

        n_jobs = min(len(fasta_fpaths), qconfig.max_threads)
        num_threads = max(1, qconfig.max_threads // n_jobs)
        if is_python2():
            from joblib2 import Parallel, delayed
        else:
            from joblib3 import Parallel, delayed
        if not qconfig.memory_efficient:
            results = Parallel(n_jobs=n_jobs)(
                delayed(predict_genes)
                (index, fasta_fpath, gene_lengths, out_dirpath, tool_dirpath,
                 tmp_dirpath, gmhmm_p_function, prokaryote, num_threads)
                for index, fasta_fpath in enumerate(fasta_fpaths))
        else:
            results = [
                predict_genes(index, fasta_fpath, gene_lengths, out_dirpath,
                              tool_dirpath, tmp_dirpath, gmhmm_p_function,
                              prokaryote, num_threads)
                for index, fasta_fpath in enumerate(fasta_fpaths)
            ]

        if not is_license_valid(out_dirpath, fasta_fpaths):
            return

        genes_by_labels = dict()
        # saving results
        for i, fasta_path in enumerate(fasta_fpaths):
            report = reporting.get(fasta_path)
            label = qutils.label_from_fpath(fasta_path)
            genes_by_labels[
                label], unique_count, full_genes, partial_genes = results[i]
            if unique_count is not None:
                report.add_field(reporting.Fields.PREDICTED_GENES_UNIQUE,
                                 unique_count)
            if full_genes is not None:
                genes = [
                    '%s + %s part' % (full_cnt, partial_cnt)
                    for full_cnt, partial_cnt in zip(full_genes, partial_genes)
                ]
                report.add_field(reporting.Fields.PREDICTED_GENES, genes)
            if unique_count is None and full_genes is None:
                logger.error(
                    '  ' + qutils.index_to_str(i) +
                    'Failed predicting genes in ' + label + '. ' +
                    ('File may be too small for GeneMark-ES. Try to use GeneMarkS instead (remove --eukaryote option).'
                     if tool_name == 'GeneMark-ES'
                     and os.path.getsize(fasta_path) < 2000000 else ''))

        if not qconfig.debug:
            for dirpath in glob.iglob(tmp_dirpath + '*'):
                if os.path.isdir(dirpath):
                    shutil.rmtree(dirpath)

        logger.main_info('Done.')
        return genes_by_labels
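The parallelization scheme above is worth noting: one joblib job per assembly (capped by the thread budget), with the remaining threads split evenly inside each job. A self-contained sketch of that fan-out, assuming the upstream joblib package is installed and using a placeholder worker instead of predict_genes():

from joblib import Parallel, delayed

def predict_one(index, fasta_fpath, num_threads):
    # placeholder for the per-assembly work (gene prediction in the example above)
    return index, fasta_fpath, num_threads

def run_per_assembly(fasta_fpaths, max_threads=8):
    if not fasta_fpaths:
        return []
    n_jobs = min(len(fasta_fpaths), max_threads)
    num_threads = max(1, max_threads // n_jobs)
    return Parallel(n_jobs=n_jobs)(
        delayed(predict_one)(i, fpath, num_threads)
        for i, fpath in enumerate(fasta_fpaths))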
Example No. 33
0
def do(reference,
       contigs_fpaths,
       is_cyclic,
       output_dir,
       old_contigs_fpaths,
       bed_fpath=None):
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    logger.print_timestamp()
    logger.main_info('Running Contig analyzer...')
    success_compilation = compile_aligner(logger)
    if not success_compilation:
        logger.main_info(
            'Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.'
        )
        return dict(
            zip(contigs_fpaths,
                [AlignerStatus.FAILED] * len(contigs_fpaths))), None

    num_nf_errors = logger._num_nf_errors
    create_minimap_output_dir(output_dir)
    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    threads = max(1, qconfig.max_threads // n_jobs)
    if is_python2():
        from joblib2 import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed

    genome_size, reference_chromosomes, ns_by_chromosomes = get_genome_stats(
        reference, skip_ns=True)
    threads = qconfig.max_threads if qconfig.memory_efficient else threads
    args = [(is_cyclic, i, contigs_fpath, output_dir, reference,
             reference_chromosomes, ns_by_chromosomes, old_contigs_fpath,
             bed_fpath, threads)
            for i, (contigs_fpath, old_contigs_fpath
                    ) in enumerate(zip(contigs_fpaths, old_contigs_fpaths))]
    statuses, results, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs = run_parallel(
        align_and_analyze, args, n_jobs)
    reports = []

    aligner_statuses = dict(zip(contigs_fpaths, statuses))
    aligned_lengths_per_fpath = dict(zip(contigs_fpaths, aligned_lengths))
    misc.contigs_aligned_lengths = dict(
        zip(contigs_fpaths, aligned_lengths_by_contigs))

    if AlignerStatus.OK in aligner_statuses.values():
        if qconfig.is_combined_ref:
            save_combined_ref_stats(results, contigs_fpaths,
                                    ref_labels_by_chromosomes, output_dir,
                                    logger)

    for index, fname in enumerate(contigs_fpaths):
        report = reporting.get(fname)
        if statuses[index] == AlignerStatus.OK:
            reports.append(
                save_result(results[index], report, fname, reference,
                            genome_size))
        elif statuses[index] == AlignerStatus.NOT_ALIGNED:
            save_result_for_unaligned(results[index], report)

    if AlignerStatus.OK in aligner_statuses.values():
        reporting.save_misassemblies(output_dir)
        reporting.save_unaligned(output_dir)
        from . import plotter
        if qconfig.draw_plots:
            plotter.draw_misassemblies_plot(
                reports, join(output_dir, 'misassemblies_plot'),
                'Misassemblies')
        if qconfig.draw_plots or qconfig.html_report:
            misassemblies_in_contigs = dict(
                (contigs_fpaths[i], misassemblies_in_contigs[i])
                for i in range(len(contigs_fpaths)))
            plotter.frc_plot(dirname(output_dir), reference, contigs_fpaths,
                             misc.contigs_aligned_lengths,
                             misassemblies_in_contigs,
                             join(output_dir, 'misassemblies_frcurve_plot'),
                             'misassemblies')

    oks = list(aligner_statuses.values()).count(AlignerStatus.OK)
    not_aligned = list(aligner_statuses.values()).count(
        AlignerStatus.NOT_ALIGNED)
    failed = list(aligner_statuses.values()).count(AlignerStatus.FAILED)
    errors = list(aligner_statuses.values()).count(AlignerStatus.ERROR)
    problems = not_aligned + failed + errors
    all = len(aligner_statuses)

    logger._num_nf_errors = num_nf_errors + errors

    if oks == all:
        logger.main_info('Done.')
    if oks < all and problems < all:
        logger.main_info(
            'Done for ' + str(all - problems) + ' out of ' + str(all) +
            '. For the rest, only basic stats are going to be evaluated.')
    if problems == all:
        logger.main_info(
            'Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.'
        )

    return aligner_statuses, aligned_lengths_per_fpath
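The end of do() calls .count() on the status values several times to decide which summary message to print. The same tally can be expressed once with a Counter; AlignerStatus below is a stand-in enum, not the QUAST class:

from collections import Counter
from enum import Enum

class AlignerStatus(Enum):
    OK = 0
    NOT_ALIGNED = 1
    FAILED = 2
    ERROR = 3

def summarize_statuses(aligner_statuses):
    # aligner_statuses: dict mapping contigs file path -> AlignerStatus
    counts = Counter(aligner_statuses.values())
    oks = counts[AlignerStatus.OK]
    problems = sum(n for status, n in counts.items() if status != AlignerStatus.OK)
    total = len(aligner_statuses)
    return oks, problems, total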
Example No. 34
0
def do(ref_fpath, contigs_fpaths, output_dirpath):
    gage_results_dirpath = os.path.join(output_dirpath, 'gage')

    # suffixes for files with report tables in plain text and tab separated formats
    if not os.path.isdir(gage_results_dirpath):
        os.mkdir(gage_results_dirpath)

    ########################################################################
    gage_tool_path = os.path.join(gage_dirpath, 'getCorrectnessStats.sh')

    ########################################################################
    logger.print_timestamp()
    logger.main_info('Running GAGE...')

    metrics = ['Total units', 'Min', 'Max', 'N50', 'Genome Size', 'Assembly Size', 'Chaff bases',
               'Missing Reference Bases', 'Missing Assembly Bases', 'Missing Assembly Contigs',
               'Duplicated Reference Bases', 'Compressed Reference Bases', 'Bad Trim', 'Avg Idy', 'SNPs', 'Indels < 5bp',
               'Indels >= 5', 'Inversions', 'Relocation', 'Translocation',
               'Total units', 'BasesInFasta', 'Min', 'Max', 'N50']
    metrics_in_reporting = [reporting.Fields.GAGE_NUMCONTIGS, reporting.Fields.GAGE_MINCONTIG, reporting.Fields.GAGE_MAXCONTIG, 
                            reporting.Fields.GAGE_N50, reporting.Fields.GAGE_GENOMESIZE, reporting.Fields.GAGE_ASSEMBLY_SIZE,
                            reporting.Fields.GAGE_CHAFFBASES, reporting.Fields.GAGE_MISSINGREFBASES, reporting.Fields.GAGE_MISSINGASMBLYBASES, 
                            reporting.Fields.GAGE_MISSINGASMBLYCONTIGS, reporting.Fields.GAGE_DUPREFBASES, 
                            reporting.Fields.GAGE_COMPRESSEDREFBASES, reporting.Fields.GAGE_BADTRIM, reporting.Fields.GAGE_AVGIDY, 
                            reporting.Fields.GAGE_SNPS, reporting.Fields.GAGE_SHORTINDELS, reporting.Fields.GAGE_LONGINDELS, 
                            reporting.Fields.GAGE_INVERSIONS, reporting.Fields.GAGE_RELOCATION, reporting.Fields.GAGE_TRANSLOCATION, 
                            reporting.Fields.GAGE_NUMCORCONTIGS, reporting.Fields.GAGE_CORASMBLYSIZE, reporting.Fields.GAGE_MINCORCONTIG, 
                            reporting.Fields.GAGE_MAXCORCOTING, reporting.Fields.GAGE_CORN50]

    tmp_dirpath = os.path.join(gage_results_dirpath, 'tmp')
    if not os.path.exists(tmp_dirpath):
        os.makedirs(tmp_dirpath)

    if not compile_aligner(logger) or (not all_required_java_classes_exist(gage_dirpath) and not compile_gage()):
        logger.error('GAGE module was not installed properly, so it is disabled and you cannot use --gage.')
        return

    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    if is_python2():
        from joblib import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed
    return_codes = Parallel(n_jobs=n_jobs)(delayed(run_gage)(i, contigs_fpath, gage_results_dirpath, gage_tool_path, ref_fpath, tmp_dirpath)
        for i, contigs_fpath in enumerate(contigs_fpaths))

    if 0 not in return_codes:
        logger.error('Error occurred while GAGE was processing assemblies.'
                     ' See GAGE error logs for details: %s' % os.path.join(gage_results_dirpath, 'gage_*.stderr'))
        return

    ## find metrics for total report:
    for i, contigs_fpath in enumerate(contigs_fpaths):
        corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)

        report = reporting.get(contigs_fpath)

        log_out_fpath = os.path.join(
            gage_results_dirpath, 'gage_' + corr_assembly_label + '.stdout')
        logfile_out = open(log_out_fpath, 'r')
        cur_metric_id = 0
        for line in logfile_out:
            if metrics[cur_metric_id] in line:
                if (metrics[cur_metric_id].startswith('N50')):
                    report.add_field(metrics_in_reporting[cur_metric_id], line.split(metrics[cur_metric_id] + ':')[1].strip())
                else:
                    report.add_field(metrics_in_reporting[cur_metric_id], line.split(':')[1].strip())
                cur_metric_id += 1
                if cur_metric_id == len(metrics):
                    break
        logfile_out.close()

    reporting.save_gage(output_dirpath)

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    logger.main_info('Done.')
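The report-filling loop above relies on the metric names appearing in GAGE's stdout in the same order as the metrics list, taking each value from the text after the colon, with a special case for the N50 lines, which are split on the full 'N50:' prefix. A standalone sketch of that ordered scan over a list of lines:

def parse_gage_stdout(lines, metric_names):
    # values are returned in the same order as metric_names
    values = []
    cur = 0
    for line in lines:
        if cur == len(metric_names):
            break
        name = metric_names[cur]
        if name in line:
            if name.startswith('N50'):
                values.append(line.split(name + ':')[1].strip())
            else:
                values.append(line.split(':')[1].strip())
            cur += 1
    return values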
Example No. 35
0
def do(output_dir, ref_fpath, contigs_fpaths, logger):
    logger.print_timestamp()
    logger.main_info('Running analysis based on unique ' + str(KMERS_LEN) + '-mers...')

    checked_assemblies = []
    for contigs_fpath in contigs_fpaths:
        label = qutils.label_from_fpath_for_fname(contigs_fpath)
        if check_kmc_successful_check(output_dir, contigs_fpath, contigs_fpaths, ref_fpath):
            kmc_stats_fpath = join(output_dir, label + '.stat')
            stats_content = open(kmc_stats_fpath).read().split('\n')
            if len(stats_content) < 1:
                continue
            logger.info('  Using existing results for ' + label + '... ')
            report = reporting.get(contigs_fpath)
            report.add_field(reporting.Fields.KMER_COMPLETENESS, '%.2f' % float(stats_content[0].strip().split(': ')[-1]))
            if len(stats_content) >= 5:
                len_map_to_one_chrom = int(stats_content[1].strip().split(': ')[-1])
                len_map_to_multi_chrom = int(stats_content[2].strip().split(': ')[-1])
                len_map_to_none_chrom = int(stats_content[3].strip().split(': ')[-1])
                total_len = int(stats_content[4].strip().split(': ')[-1])
                report.add_field(reporting.Fields.KMER_SCAFFOLDS_ONE_CHROM, '%.2f' % (len_map_to_one_chrom * 100.0 / total_len))
                report.add_field(reporting.Fields.KMER_SCAFFOLDS_MULTI_CHROM, '%.2f' % (len_map_to_multi_chrom * 100.0 / total_len))
                report.add_field(reporting.Fields.KMER_SCAFFOLDS_NONE_CHROM, '%.2f' % (len_map_to_none_chrom * 100.0 / total_len))
            checked_assemblies.append(contigs_fpath)

    contigs_fpaths = [fpath for fpath in contigs_fpaths if fpath not in checked_assemblies]
    if len(contigs_fpaths) == 0:
        logger.info('Done.')
        return

    if not exists(kmc_bin_fpath) or not exists(kmc_tools_fpath):
        logger.warning('  Sorry, can\'t run KMC on this platform, skipping...')
        return None

    logger.info('Running KMC on reference...')
    log_fpath = join(output_dir, 'kmc.log')
    err_fpath = join(output_dir, 'kmc.err')
    open(log_fpath, 'w').close()
    open(err_fpath, 'w').close()

    tmp_dirpath = join(output_dir, 'tmp')
    if not isdir(tmp_dirpath):
        os.makedirs(tmp_dirpath)
    ref_kmc_out_fpath = count_kmers(tmp_dirpath, ref_fpath, log_fpath, err_fpath)
    unique_kmers = get_kmers_cnt(tmp_dirpath, ref_kmc_out_fpath, log_fpath, err_fpath)
    if not unique_kmers:
        return

    logger.info('Analyzing assemblies completeness...')
    kmc_out_fpaths = []
    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        kmc_out_fpath = count_kmers(tmp_dirpath, contigs_fpath, log_fpath, err_fpath)
        intersect_out_fpath = intersect_kmers(tmp_dirpath, [ref_kmc_out_fpath, kmc_out_fpath], log_fpath, err_fpath)
        matched_kmers = get_kmers_cnt(tmp_dirpath, intersect_out_fpath, log_fpath, err_fpath)
        completeness = matched_kmers * 100.0 / unique_kmers
        report.add_field(reporting.Fields.KMER_COMPLETENESS, '%.2f' % completeness)
        kmc_out_fpaths.append(intersect_out_fpath)

    logger.info('Analyzing assemblies accuracy...')
    if len(kmc_out_fpaths) > 1:
        shared_kmc_db = intersect_kmers(tmp_dirpath, kmc_out_fpaths, log_fpath, err_fpath)
    else:
        shared_kmc_db = kmc_out_fpaths[0]

    kmer_fraction = 100 if getsize(ref_fpath) < 500 * 1024 ** 2 else 1000

    shared_downsampled_kmc_db = downsample_kmers(tmp_dirpath, shared_kmc_db, log_fpath, err_fpath, kmer_fraction=kmer_fraction)

    shared_kmers_by_chrom = dict()
    shared_kmers_fpath = join(tmp_dirpath, 'shared_kmers.txt')
    ref_contigs = dict((name, seq) for name, seq in read_fasta(ref_fpath))
    with open(shared_kmers_fpath, 'w') as out_f:
        for name, seq in ref_contigs.items():
            seq_kmers = get_string_kmers(tmp_dirpath, log_fpath, err_fpath, seq=seq, intersect_with=shared_downsampled_kmc_db)
            for kmer_i, kmer in enumerate(seq_kmers):
                shared_kmers_by_chrom[str(kmer)] = name
                out_f.write('>' + str(kmer_i) + '\n')
                out_f.write(kmer + '\n')

    shared_kmc_db = count_kmers(tmp_dirpath, shared_kmers_fpath, log_fpath, err_fpath)
    ref_kmc_dbs = []
    for ref_name, ref_seq in ref_contigs.items():
        ref_contig_fpath = join(tmp_dirpath, ref_name + '.fa')
        if not is_non_empty_file(ref_contig_fpath):
            with open(ref_contig_fpath, 'w') as out_f:
                out_f.write(ref_seq)
        ref_kmc_db = count_kmers(tmp_dirpath, ref_contig_fpath, log_fpath, err_fpath)
        ref_shared_kmc_db = intersect_kmers(tmp_dirpath, [ref_kmc_db, shared_kmc_db], log_fpath, err_fpath)
        ref_kmc_dbs.append((ref_name, ref_shared_kmc_db))

    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        len_map_to_one_chrom = None
        len_map_to_multi_chrom = None
        len_map_to_none_chrom = None
        total_len = 0
        long_contigs = []
        contig_lens = dict()
        contig_markers = defaultdict(list)
        for name, seq in read_fasta(contigs_fpath):
            total_len += len(seq)
            contig_lens[name] = len(seq)
            if len(seq) >= MIN_CONTIGS_LEN:
                long_contigs.append(len(seq))

        if len(long_contigs) > MAX_CONTIGS_NUM or sum(long_contigs) < total_len * 0.5:
            logger.warning('Assembly is too fragmented. Scaffolding accuracy will not be assessed.')
        elif len(ref_kmc_dbs) > MAX_CONTIGS_NUM:
            logger.warning('Reference is too fragmented. Scaffolding accuracy will not be assessed.')
        else:
            len_map_to_one_chrom = 0
            len_map_to_multi_chrom = 0
            for name, seq in read_fasta(contigs_fpath):
                if len(seq) < MIN_CONTIGS_LEN:
                    continue

                tmp_contig_fpath = join(tmp_dirpath, name + '.fa')
                with open(tmp_contig_fpath, 'w') as out_tmp_f:
                    out_tmp_f.write(seq)
                contig_kmc_db = count_kmers(tmp_dirpath, tmp_contig_fpath, log_fpath, err_fpath)
                intersect_all_ref_kmc_db = intersect_kmers(tmp_dirpath, [contig_kmc_db, shared_kmc_db], log_fpath, err_fpath)
                kmers_cnt = get_kmers_cnt(tmp_dirpath, intersect_all_ref_kmc_db, log_fpath, err_fpath)
                if kmers_cnt < MIN_MARKERS:
                    continue
                for ref_name, ref_kmc_db in ref_kmc_dbs:
                    intersect_kmc_db = intersect_kmers(tmp_dirpath, [ref_kmc_db, intersect_all_ref_kmc_db], log_fpath, err_fpath)
                    kmers_cnt = get_kmers_cnt(tmp_dirpath, intersect_kmc_db, log_fpath, err_fpath)
                    if kmers_cnt:
                        contig_markers[name].append(ref_name)
            for name, chr_markers in contig_markers.items():
                if len(chr_markers) == 1:
                    len_map_to_one_chrom += contig_lens[name]
                else:
                    len_map_to_multi_chrom += contig_lens[name]
            len_map_to_none_chrom = total_len - len_map_to_one_chrom - len_map_to_multi_chrom
            report.add_field(reporting.Fields.KMER_SCAFFOLDS_ONE_CHROM, '%.2f' % (len_map_to_one_chrom * 100.0 / total_len))
            report.add_field(reporting.Fields.KMER_SCAFFOLDS_MULTI_CHROM, '%.2f' % (len_map_to_multi_chrom * 100.0 / total_len))
            report.add_field(reporting.Fields.KMER_SCAFFOLDS_NONE_CHROM, '%.2f' % (len_map_to_none_chrom * 100.0 / total_len))

        create_kmc_stats_file(output_dir, contigs_fpath, contigs_fpaths, ref_fpath,
                             report.get_field(reporting.Fields.KMER_COMPLETENESS),
                             len_map_to_one_chrom, len_map_to_multi_chrom, len_map_to_none_chrom, total_len)

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)
    logger.info('Done.')
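The scaffolding-accuracy figures above reduce to splitting total contig length by how many reference chromosomes share marker k-mers with each long contig: exactly one, several, or none. A sketch of that final classification step over plain dicts (the k-mer counting itself is done by KMC and is not reproduced here):

def classify_scaffold_lengths(contig_lens, contig_markers):
    # contig_lens: contig name -> length; contig_markers: contig name -> list of
    # reference chromosomes sharing marker k-mers with that contig
    total_len = sum(contig_lens.values())
    if total_len == 0:
        return None
    len_one, len_multi = 0, 0
    for name, length in contig_lens.items():
        chroms = contig_markers.get(name, [])
        if len(chroms) == 1:
            len_one += length
        elif len(chroms) > 1:
            len_multi += length
    len_none = total_len - len_one - len_multi
    return {'one_chrom_pct': 100.0 * len_one / total_len,
            'multi_chrom_pct': 100.0 * len_multi / total_len,
            'none_chrom_pct': 100.0 * len_none / total_len}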
Example No. 36
0
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
       genes_fpaths, operons_fpaths, detailed_contigs_reports_dirpath, genome_stats_dirpath):

    nucmer_path_dirpath = os.path.join(detailed_contigs_reports_dirpath, 'nucmer_output')
    from quast_libs import search_references_meta
    if search_references_meta.is_quast_first_run:
        nucmer_path_dirpath = os.path.join(nucmer_path_dirpath, 'raw')

    logger.print_timestamp()
    logger.main_info('Running Genome analyzer...')

    if not os.path.isdir(genome_stats_dirpath):
        os.mkdir(genome_stats_dirpath)

    reference_chromosomes = {}
    genome_size = 0
    for name, seq in fastaparser.read_fasta(ref_fpath):
        chr_name = name.split()[0]
        chr_len = len(seq)
        genome_size += chr_len
        reference_chromosomes[chr_name] = chr_len

    # reading genome size
    # genome_size = fastaparser.get_lengths_from_fastafile(reference)[0]
    # reading reference name
    # >gi|48994873|gb|U00096.2| Escherichia coli str. K-12 substr. MG1655, complete genome
    # ref_file = open(reference, 'r')
    # reference_name = ref_file.readline().split()[0][1:]
    # ref_file.close()

    # RESULTS file
    result_fpath = genome_stats_dirpath + '/genome_info.txt'
    res_file = open(result_fpath, 'w')

    genes_container = FeatureContainer(genes_fpaths, 'gene')
    operons_container = FeatureContainer(operons_fpaths, 'operon')
    for container in [genes_container, operons_container]:
        if not container.fpaths:
            logger.notice('No file with ' + container.kind + 's provided. '
                          'Use the -' + container.kind[0].capitalize() + ' option '
                          'if you want to specify it.', indent='  ')
            continue

        for fpath in container.fpaths:
            container.region_list += genes_parser.get_genes_from_file(fpath, container.kind)

        if len(container.region_list) == 0:
            logger.warning('No ' + container.kind + 's were loaded.', indent='  ')
            res_file.write(container.kind + 's loaded: ' + 'None' + '\n')
        else:
            logger.info('  Loaded ' + str(len(container.region_list)) + ' ' + container.kind + 's')
            res_file.write(container.kind + 's loaded: ' + str(len(container.region_list)) + '\n')
            container.chr_names_dict = chromosomes_names_dict(container.kind, container.region_list, reference_chromosomes.keys())

    for contigs_fpath in aligned_contigs_fpaths:
        report = reporting.get(contigs_fpath)
        if genes_container.fpaths:
            report.add_field(reporting.Fields.REF_GENES, len(genes_container.region_list))
        if operons_container.fpaths:
            report.add_field(reporting.Fields.REF_OPERONS, len(operons_container.region_list))

    # for cumulative plots:
    files_genes_in_contigs = {}   #  "filename" : [ genes in sorted contigs (see below) ]
    files_operons_in_contigs = {}

    # for histograms
    genome_mapped = []
    full_found_genes = []
    full_found_operons = []

    # process all contig files
    num_nf_errors = logger._num_nf_errors
    n_jobs = min(len(aligned_contigs_fpaths), qconfig.max_threads)
    from joblib import Parallel, delayed
    process_results = Parallel(n_jobs=n_jobs)(delayed(process_single_file)(
        contigs_fpath, index, nucmer_path_dirpath, genome_stats_dirpath,
        reference_chromosomes, genes_container, operons_container)
        for index, contigs_fpath in enumerate(aligned_contigs_fpaths))
    num_nf_errors += len([res for res in process_results if res is None])
    logger._num_nf_errors = num_nf_errors
    process_results = [res for res in process_results if res]
    if not process_results:
        logger.main_info('Genome analyzer failed for all the assemblies.')
        res_file.close()
        return

    ref_lengths = [process_results[i][0] for i in range(len(process_results))]
    results_genes_operons_tuples = [process_results[i][1] for i in range(len(process_results))]
    for ref in reference_chromosomes:
        ref_lengths_by_contigs[ref] = [ref_lengths[i][ref] for i in range(len(ref_lengths))]
    res_file.write('reference chromosomes:\n')
    for chr_name, chr_len in reference_chromosomes.items():  # .items() works on both Python 2 and 3
        aligned_len = max(ref_lengths_by_contigs[chr_name])
        res_file.write('\t' + chr_name + ' (total length: ' + str(chr_len) + ' bp, maximal covered length: ' + str(aligned_len) + ' bp)\n')
    res_file.write('\n')
    res_file.write('total genome size: ' + str(genome_size) + '\n\n')
    res_file.write('gap min size: ' + str(qconfig.min_gap_size) + '\n')
    res_file.write('partial gene/operon min size: ' + str(qconfig.min_gene_overlap) + '\n\n')
    # header
    res_file.write('\n\n')
    res_file.write('%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n'
        % ('assembly', 'genome', 'duplication', 'gaps', 'genes', 'partial', 'operons', 'partial'))
    res_file.write('%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n'
        % ('', 'fraction', 'ratio', 'number', '', 'genes', '', 'operons'))
    res_file.write('================================================================================================================\n')

    for contigs_fpath, (results, genes_in_contigs, operons_in_contigs) in zip(aligned_contigs_fpaths, results_genes_operons_tuples):
        assembly_name = qutils.name_from_fpath(contigs_fpath)

        files_genes_in_contigs[contigs_fpath] = genes_in_contigs
        files_operons_in_contigs[contigs_fpath] = operons_in_contigs
        full_found_genes.append(sum(genes_in_contigs))
        full_found_operons.append(sum(operons_in_contigs))

        covered_bp = results["covered_bp"]
        gaps_count = results["gaps_count"]
        genes_full = results[reporting.Fields.GENES + "_full"]
        genes_part = results[reporting.Fields.GENES + "_partial"]
        operons_full = results[reporting.Fields.OPERONS + "_full"]
        operons_part = results[reporting.Fields.OPERONS + "_partial"]

        report = reporting.get(contigs_fpath)
        genome_fraction = float(covered_bp) * 100 / float(genome_size)
        duplication_ratio = (report.get_field(reporting.Fields.TOTALLEN) +
                             report.get_field(reporting.Fields.MISINTERNALOVERLAP) +
                             report.get_field(reporting.Fields.AMBIGUOUSEXTRABASES) -
                             report.get_field(reporting.Fields.UNALIGNEDBASES)) /\
                            ((genome_fraction / 100.0) * float(genome_size))

        res_file.write('%-25s| %-10s| %-12s| %-10s|'
        % (assembly_name[:24], '%3.5f%%' % genome_fraction, '%1.5f' % duplication_ratio, gaps_count))

        report.add_field(reporting.Fields.MAPPEDGENOME, '%.3f' % genome_fraction)
        report.add_field(reporting.Fields.DUPLICATION_RATIO, '%.3f' % duplication_ratio)
        genome_mapped.append(genome_fraction)

        for (field, full, part) in [(reporting.Fields.GENES, genes_full, genes_part),
            (reporting.Fields.OPERONS, operons_full, operons_part)]:
            if full is None and part is None:
                res_file.write(' %-10s| %-10s|' % ('-', '-'))
            else:
                res_file.write(' %-10s| %-10s|' % (full, part))
                report.add_field(field, '%s + %s part' % (full, part))
        res_file.write('\n')
    res_file.close()

    if genes_container.region_list:
        ref_genes_num = len(genes_container.region_list)
    else:
        ref_genes_num = None

    if operons_container.region_list:
        ref_operons_num = len(operons_container.region_list)
    else:
        ref_operons_num = None

    # saving json
    if json_output_dirpath:
        if genes_container.region_list:
            json_saver.save_features_in_contigs(json_output_dirpath, aligned_contigs_fpaths, 'genes', files_genes_in_contigs, ref_genes_num)
        if operons_container.region_list:
            json_saver.save_features_in_contigs(json_output_dirpath, aligned_contigs_fpaths, 'operons', files_operons_in_contigs, ref_operons_num)

    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        if genes_container.region_list:
            html_saver.save_features_in_contigs(output_dirpath, aligned_contigs_fpaths, 'genes', files_genes_in_contigs, ref_genes_num)
        if operons_container.region_list:
            html_saver.save_features_in_contigs(output_dirpath, aligned_contigs_fpaths, 'operons', files_operons_in_contigs, ref_operons_num)

    if qconfig.draw_plots:
        # cumulative plots:
        import plotter
        if genes_container.region_list:
            plotter.genes_operons_plot(len(genes_container.region_list), aligned_contigs_fpaths, files_genes_in_contigs,
                genome_stats_dirpath + '/genes_cumulative_plot', 'genes')
            plotter.histogram(aligned_contigs_fpaths, full_found_genes, genome_stats_dirpath + '/complete_genes_histogram',
                '# complete genes')
        if operons_container.region_list:
            plotter.genes_operons_plot(len(operons_container.region_list), aligned_contigs_fpaths, files_operons_in_contigs,
                genome_stats_dirpath + '/operons_cumulative_plot', 'operons')
            plotter.histogram(aligned_contigs_fpaths, full_found_operons, genome_stats_dirpath + '/complete_operons_histogram',
                '# complete operons')
        plotter.histogram(aligned_contigs_fpaths, genome_mapped, genome_stats_dirpath + '/genome_fraction_histogram',
            'Genome fraction, %', top_value=100)

    logger.main_info('Done.')
    return [genes_container, operons_container]
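The two headline metrics computed above can be written as plain formulas: genome fraction is the covered share of the reference, and the duplication ratio divides the assembly bases attributed to the reference (total length plus misassembly-internal overlaps and ambiguous extra bases, minus unaligned bases) by the covered reference length, which equals (genome_fraction / 100) * genome_size in the code. A minimal sketch:

def genome_fraction_pct(covered_bp, genome_size):
    return 100.0 * covered_bp / genome_size

def duplication_ratio(total_len, mis_internal_overlap, ambiguous_extra_bases,
                      unaligned_bases, covered_bp):
    aligned_assembly_bases = (total_len + mis_internal_overlap
                              + ambiguous_extra_bases - unaligned_bases)
    return aligned_assembly_bases / float(covered_bp)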
Example No. 37
0
def do(ref_fpath, contigs_fpaths, output_dirpath):
    gage_results_dirpath = os.path.join(output_dirpath, 'gage')

    # suffixes for files with report tables in plain text and tab separated formats
    if not os.path.isdir(gage_results_dirpath):
        os.mkdir(gage_results_dirpath)

    ########################################################################
    gage_tool_path = os.path.join(gage_dirpath, 'getCorrectnessStats.sh')

    ########################################################################
    logger.print_timestamp()
    logger.main_info('Running GAGE...')

    metrics = [
        'Total units', 'Min', 'Max', 'N50', 'Genome Size', 'Assembly Size',
        'Chaff bases', 'Missing Reference Bases', 'Missing Assembly Bases',
        'Missing Assembly Contigs', 'Duplicated Reference Bases',
        'Compressed Reference Bases', 'Bad Trim', 'Avg Idy', 'SNPs',
        'Indels < 5bp', 'Indels >= 5', 'Inversions', 'Relocation',
        'Translocation', 'Total units', 'BasesInFasta', 'Min', 'Max', 'N50'
    ]
    metrics_in_reporting = [
        reporting.Fields.GAGE_NUMCONTIGS, reporting.Fields.GAGE_MINCONTIG,
        reporting.Fields.GAGE_MAXCONTIG, reporting.Fields.GAGE_N50,
        reporting.Fields.GAGE_GENOMESIZE, reporting.Fields.GAGE_ASSEMBLY_SIZE,
        reporting.Fields.GAGE_CHAFFBASES,
        reporting.Fields.GAGE_MISSINGREFBASES,
        reporting.Fields.GAGE_MISSINGASMBLYBASES,
        reporting.Fields.GAGE_MISSINGASMBLYCONTIGS,
        reporting.Fields.GAGE_DUPREFBASES,
        reporting.Fields.GAGE_COMPRESSEDREFBASES,
        reporting.Fields.GAGE_BADTRIM, reporting.Fields.GAGE_AVGIDY,
        reporting.Fields.GAGE_SNPS, reporting.Fields.GAGE_SHORTINDELS,
        reporting.Fields.GAGE_LONGINDELS, reporting.Fields.GAGE_INVERSIONS,
        reporting.Fields.GAGE_RELOCATION, reporting.Fields.GAGE_TRANSLOCATION,
        reporting.Fields.GAGE_NUMCORCONTIGS,
        reporting.Fields.GAGE_CORASMBLYSIZE,
        reporting.Fields.GAGE_MINCORCONTIG, reporting.Fields.GAGE_MAXCORCOTING,
        reporting.Fields.GAGE_CORN50
    ]

    tmp_dirpath = os.path.join(gage_results_dirpath, 'tmp')
    if not os.path.exists(tmp_dirpath):
        os.makedirs(tmp_dirpath)

    if not compile_aligner(logger) or (
            not all_required_java_classes_exist(gage_dirpath)
            and not compile_gage()):
        logger.error(
            'GAGE module was not installed properly, so it is disabled and you cannot use --gage.'
        )
        return

    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    if is_python2():
        from joblib import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed
    if not qconfig.memory_efficient:
        return_codes = Parallel(n_jobs=n_jobs)(
            delayed(run_gage)(i, contigs_fpath, gage_results_dirpath,
                              gage_tool_path, ref_fpath, tmp_dirpath)
            for i, contigs_fpath in enumerate(contigs_fpaths))
    else:
        return_codes = [
            run_gage(i, contigs_fpath, gage_results_dirpath, gage_tool_path,
                     ref_fpath, tmp_dirpath)
            for i, contigs_fpath in enumerate(contigs_fpaths)
        ]

    if 0 not in return_codes:
        logger.error('Error occurred while GAGE was processing assemblies.'
                     ' See GAGE error logs for details: %s' %
                     os.path.join(gage_results_dirpath, 'gage_*.stderr'))
        return

    ## find metrics for total report:
    for i, contigs_fpath in enumerate(contigs_fpaths):
        corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)

        report = reporting.get(contigs_fpath)

        log_out_fpath = os.path.join(gage_results_dirpath,
                                     'gage_' + corr_assembly_label + '.stdout')
        logfile_out = open(log_out_fpath, 'r')
        cur_metric_id = 0
        for line in logfile_out:
            if metrics[cur_metric_id] in line:
                if (metrics[cur_metric_id].startswith('N50')):
                    report.add_field(
                        metrics_in_reporting[cur_metric_id],
                        line.split(metrics[cur_metric_id] + ':')[1].strip())
                else:
                    report.add_field(metrics_in_reporting[cur_metric_id],
                                     line.split(':')[1].strip())
                cur_metric_id += 1
                if cur_metric_id == len(metrics):
                    break
        logfile_out.close()

    reporting.save_gage(output_dirpath)

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    logger.main_info('Done.')
Example No. 38
0
def main(args):
    if ' ' in qconfig.QUAST_HOME:
        logger.error('QUAST does not support spaces in paths. \n'
                     'You are trying to run it from ' + str(qconfig.QUAST_HOME) + '\n'
                     'Please, put QUAST in a different directory, then try again.\n',
                     to_stderr=True,
                     exit_with_code=3)

    if not args:
        qconfig.usage()
        sys.exit(0)

    reload(qconfig)

    quast_path = [os.path.realpath(__file__)]
    quast_py_args, contigs_fpaths = parse_options(logger, quast_path + args)
    output_dirpath, ref_fpath, labels = qconfig.output_dirpath, qconfig.reference, qconfig.labels
    corrected_dirpath = os.path.join(output_dirpath, qconfig.corrected_dirname)
    logger.main_info()
    logger.print_params()

    ########################################################################
    from quast_libs import reporting
    reports = reporting.reports
    reload(reporting)
    reporting.reports = reports
    reporting.assembly_fpaths = []
    from quast_libs import plotter  # Do not remove this line! It would lead to a warning in matplotlib.

    if qconfig.is_combined_ref:
        corrected_dirpath = os.path.join(output_dirpath, '..', qconfig.corrected_dirname)
    else:
        if os.path.isdir(corrected_dirpath):
            shutil.rmtree(corrected_dirpath)
        os.mkdir(corrected_dirpath)

    # PROCESSING REFERENCE
    if ref_fpath:
        logger.main_info()
        logger.main_info('Reference:')
        ref_fpath = qutils.correct_reference(ref_fpath, corrected_dirpath)
    else:
        ref_fpath = ''

    # PROCESSING CONTIGS
    logger.main_info()
    logger.main_info('Contigs:')

    contigs_fpaths, old_contigs_fpaths = qutils.correct_contigs(contigs_fpaths, corrected_dirpath, labels, reporting)
    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        report.add_field(reporting.Fields.NAME, qutils.label_from_fpath(contigs_fpath))

    qconfig.assemblies_num = len(contigs_fpaths)

    reads_fpaths = []
    cov_fpath = []
    physical_cov_fpath = []
    if qconfig.forward_reads:
        reads_fpaths.append(qconfig.forward_reads)
    if qconfig.reverse_reads:
        reads_fpaths.append(qconfig.reverse_reads)
    if (reads_fpaths or qconfig.sam or qconfig.bam) and ref_fpath:
        bed_fpath, cov_fpath, physical_cov_fpath = reads_analyzer.do(ref_fpath, contigs_fpaths, reads_fpaths, None,
                                                                     os.path.join(output_dirpath, qconfig.variation_dirname),
                                                                     external_logger=logger, sam_fpath=qconfig.sam, bam_fpath=qconfig.bam, bed_fpath=qconfig.bed)
        qconfig.bed = bed_fpath

    if not contigs_fpaths:
        logger.error("None of the assembly files contains correct contigs. "
              "Please, provide different files or decrease --min-contig threshold.",
              fake_if_nested_run=True)
        return 4

    if qconfig.used_colors and qconfig.used_ls:
        for i, label in enumerate(labels):
            plotter.dict_color_and_ls[label] = (qconfig.used_colors[i], qconfig.used_ls[i])

    qconfig.assemblies_fpaths = contigs_fpaths
    if qconfig.with_gage:
        ########################################################################
        ### GAGE
        ########################################################################
        if not ref_fpath:
            logger.warning("GAGE can't be run without a reference and will be skipped.")
        else:
            from quast_libs import gage
            gage.do(ref_fpath, contigs_fpaths, output_dirpath)

    # Where all pdfs will be saved
    all_pdf_fpath = os.path.join(output_dirpath, qconfig.plots_fname)
    all_pdf_file = None

    if qconfig.draw_plots and plotter.can_draw_plots:
        try:
            from matplotlib.backends.backend_pdf import PdfPages
            all_pdf_file = PdfPages(all_pdf_fpath)
        except:
            all_pdf_file = None

    if qconfig.json_output_dirpath:
        from quast_libs.html_saver import json_saver
        if json_saver.simplejson_error:
            json_output_dirpath = None


    ########################################################################
    ### Stats and plots
    ########################################################################
    from quast_libs import basic_stats
    basic_stats.do(ref_fpath, contigs_fpaths, os.path.join(output_dirpath, 'basic_stats'),
                   qconfig.json_output_dirpath, output_dirpath)

    aligned_contigs_fpaths = []
    aligned_lengths_lists = []
    contig_alignment_plot_fpath = None
    icarus_html_fpath = None
    if ref_fpath:
        ########################################################################
        ### former PLANTAKOLYA, PLANTAGORA
        ########################################################################
        from quast_libs import contigs_analyzer
        nucmer_statuses, aligned_lengths_per_fpath = contigs_analyzer.do(
            ref_fpath, contigs_fpaths, qconfig.prokaryote, os.path.join(output_dirpath, 'contigs_reports'),
            old_contigs_fpaths, qconfig.bed)
        for contigs_fpath in contigs_fpaths:
            if nucmer_statuses[contigs_fpath] == contigs_analyzer.NucmerStatus.OK:
                aligned_contigs_fpaths.append(contigs_fpath)
                aligned_lengths_lists.append(aligned_lengths_per_fpath[contigs_fpath])

    # Before continue evaluating, check if nucmer didn't skip all of the contigs files.
    detailed_contigs_reports_dirpath = None
    features_containers = None
    if len(aligned_contigs_fpaths) and ref_fpath:
        detailed_contigs_reports_dirpath = os.path.join(output_dirpath, 'contigs_reports')

        ########################################################################
        ### NAx and NGAx ("aligned Nx and NGx")
        ########################################################################
        from quast_libs import aligned_stats
        aligned_stats.do(
            ref_fpath, aligned_contigs_fpaths, output_dirpath, qconfig.json_output_dirpath,
            aligned_lengths_lists, os.path.join(output_dirpath, 'aligned_stats'))

        ########################################################################
        ### GENOME_ANALYZER
        ########################################################################
        from quast_libs import genome_analyzer
        features_containers = genome_analyzer.do(
            ref_fpath, aligned_contigs_fpaths, output_dirpath, qconfig.json_output_dirpath,
            qconfig.genes, qconfig.operons, detailed_contigs_reports_dirpath,
            os.path.join(output_dirpath, 'genome_stats'))

    genes_by_labels = None
    if qconfig.gene_finding:
        if qconfig.glimmer:
            ########################################################################
            ### Glimmer
            ########################################################################
            from quast_libs import glimmer
            genes_by_labels = glimmer.do(contigs_fpaths, qconfig.genes_lengths, os.path.join(output_dirpath, 'predicted_genes'))
        else:
            ########################################################################
            ### GeneMark
            ########################################################################
            from quast_libs import genemark
            genes_by_labels = genemark.do(contigs_fpaths, qconfig.genes_lengths, os.path.join(output_dirpath, 'predicted_genes'),
                        qconfig.prokaryote, qconfig.meta)

    else:
        logger.main_info("")
        logger.notice("Genes are not predicted by default. Use --gene-finding option to enable it.")
    ########################################################################
    reports_fpaths, transposed_reports_fpaths = reporting.save_total(output_dirpath)

    ########################################################################
    ### LARGE DRAWING TASKS
    ########################################################################
    if qconfig.draw_plots or qconfig.create_icarus_html:
        logger.print_timestamp()
        logger.main_info('Creating large visual summaries...')
        logger.main_info('This may take a while: press Ctrl-C to skip this step..')
        try:
            if detailed_contigs_reports_dirpath:
                report_for_icarus_fpath_pattern = os.path.join(detailed_contigs_reports_dirpath, qconfig.icarus_report_fname_pattern)
                stdout_pattern = os.path.join(detailed_contigs_reports_dirpath, qconfig.contig_report_fname_pattern)
            else:
                report_for_icarus_fpath_pattern = None
                stdout_pattern = None
            draw_alignment_plots = qconfig.draw_svg or qconfig.create_icarus_html
            number_of_steps = sum([int(bool(value)) for value in [draw_alignment_plots, all_pdf_file]])
            if draw_alignment_plots:
                ########################################################################
                ### VISUALIZE CONTIG ALIGNMENT
                ########################################################################
                logger.main_info('  1 of %d: Creating Icarus viewers...' % number_of_steps)
                from quast_libs import icarus
                icarus_html_fpath, contig_alignment_plot_fpath = icarus.do(
                    contigs_fpaths, report_for_icarus_fpath_pattern, output_dirpath, ref_fpath,
                    stdout_pattern=stdout_pattern, features=features_containers, cov_fpath=cov_fpath,
                    physical_cov_fpath=physical_cov_fpath, json_output_dir=qconfig.json_output_dirpath,
                    genes_by_labels=genes_by_labels)

            if all_pdf_file:
                # full report in PDF format: all tables and plots
                logger.main_info('  %d of %d: Creating PDF with all tables and plots...' % (number_of_steps, number_of_steps))
                plotter.fill_all_pdf_file(all_pdf_file)
            logger.main_info('Done')
        except KeyboardInterrupt:
            logger.main_info('..step skipped!')
            os.remove(all_pdf_fpath)

    ########################################################################
    ### TOTAL REPORT
    ########################################################################
    logger.print_timestamp()
    logger.main_info('RESULTS:')
    logger.main_info('  Text versions of total report are saved to ' + reports_fpaths)
    logger.main_info('  Text versions of transposed total report are saved to ' + transposed_reports_fpaths)

    if qconfig.json_output_dirpath:
        json_saver.save_total_report(qconfig.json_output_dirpath, qconfig.min_contig, ref_fpath)

    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        html_saver.save_colors(output_dirpath, contigs_fpaths, plotter.dict_color_and_ls)
        html_saver.save_total_report(output_dirpath, qconfig.min_contig, ref_fpath)

    if os.path.isfile(all_pdf_fpath):
        logger.main_info('  PDF version (tables and plots) is saved to ' + all_pdf_fpath)

    if icarus_html_fpath:
        logger.main_info('  Icarus (contig browser) is saved to %s' % icarus_html_fpath)

    if qconfig.draw_svg and contig_alignment_plot_fpath:
        logger.main_info('  Contig alignment plot is saved to %s' % contig_alignment_plot_fpath)

    cleanup(corrected_dirpath)
    return logger.finish_up(check_test=qconfig.test)
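A minimal sketch of the optional-drawing pattern used above, with hypothetical flags and plain print calls instead of the QUAST logger: the enabled steps are counted up front so the progress messages can be numbered, and a KeyboardInterrupt aborts only this phase.

def run_drawing_steps(make_icarus, make_pdf):
    # count how many optional steps are enabled, as in the snippet above
    number_of_steps = sum(int(bool(flag)) for flag in (make_icarus, make_pdf))
    step = 0
    try:
        if make_icarus:
            step += 1
            print('  %d of %d: Creating Icarus viewers...' % (step, number_of_steps))
        if make_pdf:
            step += 1
            print('  %d of %d: Creating PDF with all tables and plots...' % (step, number_of_steps))
        print('Done')
    except KeyboardInterrupt:
        # Ctrl-C skips only this optional visualization phase
        print('..step skipped!')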
Exemplo n.º 39
0
def do(fasta_fpaths, gene_lengths, out_dirpath, prokaryote, meta):
    logger.print_timestamp()
    if LICENSE_LIMITATIONS_MODE:
        logger.warning("GeneMark tool can't be started because of license limitations!")
        return

    if meta:
        tool_name = 'MetaGeneMark'
        tool_dirname = 'genemark'
        gmhmm_p_function = gmhmm_p_metagenomic
    elif prokaryote:
        tool_name = 'GeneMarkS'
        tool_dirname = 'genemark'
        gmhmm_p_function = gmhmm_p_everyGC
    else:
        tool_name = 'GeneMark-ES'
        tool_dirname = 'genemark-es'
        gmhmm_p_function = gm_es

    logger.main_info('Running %s...' % tool_name)

    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, tool_dirname, qconfig.platform_name)
    if not os.path.exists(tool_dirpath):
        logger.warning('  Sorry, can\'t use %s on this platform, skipping gene prediction.' % tool_name)
    else:
        successful = install_genemark(os.path.join(qconfig.LIBS_LOCATION, 'genemark', qconfig.platform_name))
        if not successful:
            return

        if not os.path.isdir(out_dirpath):
            os.mkdir(out_dirpath)
        tmp_dirpath = os.path.join(out_dirpath, 'tmp')
        if not os.path.isdir(tmp_dirpath):
            os.mkdir(tmp_dirpath)

        n_jobs = min(len(fasta_fpaths), qconfig.max_threads)
        num_threads = max(1, qconfig.max_threads // n_jobs)
        if is_python2():
            from joblib import Parallel, delayed
        else:
            from joblib3 import Parallel, delayed
        results = Parallel(n_jobs=n_jobs)(delayed(predict_genes)(
            index, fasta_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath, gmhmm_p_function, prokaryote, num_threads)
            for index, fasta_fpath in enumerate(fasta_fpaths))

        genes_by_labels = dict()
        # saving results
        for i, fasta_path in enumerate(fasta_fpaths):
            report = reporting.get(fasta_path)
            label = qutils.label_from_fpath(fasta_path)
            genes_by_labels[label], unique_count, count = results[i]
            if unique_count is not None:
                report.add_field(reporting.Fields.PREDICTED_GENES_UNIQUE, unique_count)
            if count is not None:
                report.add_field(reporting.Fields.PREDICTED_GENES, count)
            if unique_count is None and count is None:
                logger.error('  ' + qutils.index_to_str(i) +
                     'Failed predicting genes in ' + label + '. ' +
                     ('File may be too small for GeneMark-ES. Try to use GeneMarkS instead (remove --eukaryote option).'
                         if tool_name == 'GeneMark-ES' and os.path.getsize(fasta_path) < 2000000 else ''))

        if not qconfig.debug:
            for dirpath in glob.iglob(tmp_dirpath + '*'):
                if os.path.isdir(dirpath):
                    shutil.rmtree(dirpath)

        logger.main_info('Done.')
        return genes_by_labels
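A minimal sketch of the job/thread split used above, assuming joblib is installed; predict_one and its inputs are placeholders, not QUAST functions.

from joblib import Parallel, delayed

def predict_one(fasta_fpath, num_threads):
    # placeholder for the real per-assembly prediction call
    return fasta_fpath, num_threads

def predict_all(fasta_fpaths, max_threads=8):
    n_jobs = min(len(fasta_fpaths), max_threads)   # one worker per assembly, capped by the thread budget
    num_threads = max(1, max_threads // n_jobs)    # remaining threads split evenly among workers
    return Parallel(n_jobs=n_jobs)(
        delayed(predict_one)(fpath, num_threads) for fpath in fasta_fpaths)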
Exemplo n.º 40
0
def do(reference, contigs_fpaths, is_cyclic, output_dir, old_contigs_fpaths, bed_fpath=None):
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    logger.print_timestamp()
    logger.main_info('Running Contig analyzer...')
    success_compilation = compile_aligner(logger)
    if qconfig.test and is_emem_aligner():
        success_compilation = check_emem_functionality(logger)
    if not success_compilation:
        logger.main_info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.')
        return dict(zip(contigs_fpaths, [NucmerStatus.FAILED] * len(contigs_fpaths))), None

    if qconfig.draw_plots:
        compile_gnuplot(logger, only_clean=False)

    num_nf_errors = logger._num_nf_errors
    create_nucmer_output_dir(output_dir)
    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    if qconfig.memory_efficient:
        threads = 1
    else:
        threads = max(1, qconfig.max_threads // n_jobs)
    if is_python2():
        from joblib import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed
    if not qconfig.splitted_ref and not qconfig.memory_efficient:
        statuses_results_lengths_tuples = Parallel(n_jobs=n_jobs)(delayed(align_and_analyze)(
        is_cyclic, i, contigs_fpath, output_dir, reference, old_contigs_fpath, bed_fpath, threads=threads)
             for i, (contigs_fpath, old_contigs_fpath) in enumerate(zip(contigs_fpaths, old_contigs_fpaths)))
    else:
        if len(contigs_fpaths) >= len(qconfig.splitted_ref) and not qconfig.memory_efficient:
            statuses_results_lengths_tuples = Parallel(n_jobs=n_jobs)(delayed(align_and_analyze)(
            is_cyclic, i, contigs_fpath, output_dir, reference, old_contigs_fpath, bed_fpath, threads=threads)
                for i, (contigs_fpath, old_contigs_fpath) in enumerate(zip(contigs_fpaths, old_contigs_fpaths)))
        else:
            statuses_results_lengths_tuples = []
            for i, (contigs_fpath, old_contigs_fpath) in enumerate(zip(contigs_fpaths, old_contigs_fpaths)):
                statuses_results_lengths_tuples.append(align_and_analyze(
                is_cyclic, i, contigs_fpath, output_dir, reference, old_contigs_fpath, bed_fpath,
                parallel_by_chr=True, threads=qconfig.max_threads))

    # unzipping
    statuses, results, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs =\
        [[x[i] for x in statuses_results_lengths_tuples] for i in range(5)]
    reports = []

    nucmer_statuses = dict(zip(contigs_fpaths, statuses))
    aligned_lengths_per_fpath = dict(zip(contigs_fpaths, aligned_lengths))
    misc.contigs_aligned_lengths = dict(zip(contigs_fpaths, aligned_lengths_by_contigs))

    if NucmerStatus.OK in nucmer_statuses.values():
        if qconfig.is_combined_ref:
            save_combined_ref_stats(results, contigs_fpaths, ref_labels_by_chromosomes, output_dir, logger)

    for index, fname in enumerate(contigs_fpaths):
        report = reporting.get(fname)
        if statuses[index] == NucmerStatus.OK:
            reports.append(save_result(results[index], report, fname, reference))
        elif statuses[index] == NucmerStatus.NOT_ALIGNED:
            save_result_for_unaligned(results[index], report)

    if NucmerStatus.OK in nucmer_statuses.values():
        reporting.save_misassemblies(output_dir)
        reporting.save_unaligned(output_dir)
        from . import plotter
        if qconfig.draw_plots:
            plotter.draw_misassemblies_plot(reports, join(output_dir, 'misassemblies_plot'), 'Misassemblies')
        if qconfig.draw_plots or qconfig.html_report:
            misassemblies_in_contigs = dict((contigs_fpaths[i], misassemblies_in_contigs[i]) for i in range(len(contigs_fpaths)))
            plotter.frc_plot(dirname(output_dir), reference, contigs_fpaths, misc.contigs_aligned_lengths, misassemblies_in_contigs,
                             join(output_dir, 'misassemblies_frcurve_plot'), 'misassemblies')

    oks = list(nucmer_statuses.values()).count(NucmerStatus.OK)
    not_aligned = list(nucmer_statuses.values()).count(NucmerStatus.NOT_ALIGNED)
    failed = list(nucmer_statuses.values()).count(NucmerStatus.FAILED)
    errors = list(nucmer_statuses.values()).count(NucmerStatus.ERROR)
    problems = not_aligned + failed + errors
    all = len(nucmer_statuses)

    logger._num_nf_errors = num_nf_errors + errors

    if oks == all:
        logger.main_info('Done.')
    if oks < all and problems < all:
        logger.main_info('Done for ' + str(all - problems) + ' out of ' + str(all) + '. For the rest, only basic stats are going to be evaluated.')
    if problems == all:
        logger.main_info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.')
        if not qconfig.test and is_emem_aligner():
            logger.warning('Please rerun QUAST using --test option to ensure that E-MEM aligner works properly.')

    return nucmer_statuses, aligned_lengths_per_fpath
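A minimal sketch of the "unzipping" and status tallying above, with made-up result tuples of the form (status, per-assembly results, aligned lengths):

per_assembly = [('OK', {'misassemblies': 1}, [1000, 500]),
                ('FAILED', None, [])]
# split the list of tuples into parallel per-field lists, as done above
statuses, results, aligned_lengths = [[t[i] for t in per_assembly] for i in range(3)]
oks = statuses.count('OK')        # 1
problems = len(statuses) - oks    # 1 -> "Done for 1 out of 2"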
Exemplo n.º 41
0
def do(contigs_fpaths, output_dir, logger):
    logger.print_timestamp()
    logger.info('Running BUSCO...')

    compilation_success = True

    augustus_dirpath = download_augustus(logger)
    if not augustus_dirpath:
        compilation_success = False
    elif not compile_tool('Augustus',
                          augustus_dirpath, [join('bin', 'augustus')],
                          logger=logger):
        compilation_success = False

    if compilation_success and not download_blast_binaries(
            logger=logger, filenames=blast_filenames):
        compilation_success = False

    if not compilation_success:
        logger.info('Failed finding conservative genes.')
        return

    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    tmp_dir = join(output_dir, 'tmp')
    if not os.path.isdir(tmp_dir):
        os.makedirs(tmp_dir)

    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    busco_threads = max(1, qconfig.max_threads // n_jobs)

    clade_dirpath = download_db(logger,
                                is_prokaryote=qconfig.prokaryote,
                                is_fungus=qconfig.is_fungus)
    if not clade_dirpath:
        logger.info('Failed finding conservative genes.')
        return

    config_fpath = make_config(output_dir, tmp_dir, busco_threads,
                               clade_dirpath, augustus_dirpath)
    logger.info('Logs and results will be saved under ' + output_dir + '...')

    os.environ['BUSCO_CONFIG_FILE'] = config_fpath
    os.environ['AUGUSTUS_CONFIG_PATH'] = copy_augustus_configs(
        augustus_dirpath, tmp_dir)
    if not os.environ['AUGUSTUS_CONFIG_PATH']:
        logger.error(
            'Augustus configs not found, failed to run BUSCO without them.')
    busco_args = [[
        contigs_fpath,
        qutils.label_from_fpath_for_fname(contigs_fpath)
    ] for contigs_fpath in contigs_fpaths]
    summary_fpaths = run_parallel(busco_main_handler, busco_args,
                                  qconfig.max_threads)
    if not any(fpath for fpath in summary_fpaths):
        logger.error(
            'Failed running BUSCO for all the assemblies. See log files in ' +
            output_dir + ' for information '
            '(rerun with --debug to keep all intermediate files).')
        return

    # saving results
    zero_output_for_all = True
    for i, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)

        if summary_fpaths[i] and os.path.isfile(summary_fpaths[i]):
            total_buscos, part_buscos, complete_buscos = 0, 0, 0
            with open(summary_fpaths[i]) as f:
                for line in f:
                    if 'Complete BUSCOs' in line:
                        complete_buscos = int(line.split()[0])
                    elif 'Fragmented' in line:
                        part_buscos = int(line.split()[0])
                    elif 'Total' in line:
                        total_buscos = int(line.split()[0])
            if total_buscos != 0:
                report.add_field(
                    reporting.Fields.BUSCO_COMPLETE,
                    ('%.2f' % (float(complete_buscos) * 100.0 / total_buscos)))
                report.add_field(reporting.Fields.BUSCO_PART,
                                 ('%.2f' %
                                  (float(part_buscos) * 100.0 / total_buscos)))
            if complete_buscos + part_buscos > 0:
                zero_output_for_all = False
            shutil.copy(summary_fpaths[i], output_dir)
        else:
            logger.error(
                'Failed running BUSCO for ' + contigs_fpath +
                '. See the log for detailed information'
                ' (rerun with --debug to keep all intermediate files).')
    if zero_output_for_all:
        logger.warning(
            'BUSCO did not fail explicitly but found nothing for all assemblies! '
            'Possible reasons and workarounds:\n'
            '  1. Provided assemblies are so small that they do not contain even a single partial BUSCO gene. Not likely but may happen -- nothing to worry then.\n'
            '  2. Incorrect lineage database was used. To run with fungi DB use --fungus, to run with eukaryota DB use --eukaryote, otherwise BUSCO uses bacteria DB.\n'
            '  3. Problem with BUSCO dependencies, most likely Augustus. Check that the binaries in '
            + augustus_dirpath + '/bin/ are working properly.\n'
            '     If something is wrong with Augustus, you may try to install it yourself (https://github.com/Gaius-Augustus/Augustus) and add "augustus" binary to PATH.\n'
            '  4. Some other problem with BUSCO. Check the logs (you may need to rerun QUAST with --debug to see all intermediate files).\n'
            '     If you cannot solve the problem yourself, post an issue at https://github.com/ablab/quast/issues or write to [email protected]'
        )
    if not qconfig.debug:
        cleanup(output_dir)
    logger.info('Done.')
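A minimal sketch of the summary parsing above, assuming a BUSCO short-summary file whose lines contain the counts checked in the loop; the helper name and path are illustrative.

def parse_busco_summary(summary_fpath):
    total, fragmented, complete = 0, 0, 0
    with open(summary_fpath) as f:
        for line in f:
            if 'Complete BUSCOs' in line:
                complete = int(line.split()[0])
            elif 'Fragmented' in line:
                fragmented = int(line.split()[0])
            elif 'Total' in line:
                total = int(line.split()[0])
    if total == 0:
        return None, None  # nothing found, matching the zero-output warning above
    # percentage of complete and fragmented BUSCOs, as reported above
    return 100.0 * complete / total, 100.0 * fragmented / total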
Exemplo n.º 42
0
def do(contigs_fpaths, output_dir, logger):
    logger.print_timestamp()
    logger.info('Running BUSCO...')

    compilation_success = True

    augustus_dirpath = download_augustus(logger)
    if not augustus_dirpath:
        compilation_success = False
    elif not compile_tool('Augustus',
                          augustus_dirpath, [join('bin', 'augustus')],
                          logger=logger):
        compilation_success = False

    if compilation_success and not download_blast_binaries(
            logger=logger, filenames=blast_filenames):
        compilation_success = False

    if not compilation_success:
        logger.info('Failed finding conservative genes.')
        return

    set_augustus_dir(augustus_dirpath)
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    tmp_dir = join(output_dir, 'tmp')
    if not os.path.isdir(tmp_dir):
        os.makedirs(tmp_dir)

    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    busco_threads = max(1, qconfig.max_threads // n_jobs)

    clade_dirpath = download_db(logger, is_prokaryote=qconfig.prokaryote)
    if not clade_dirpath:
        logger.info('Failed finding conservative genes.')
        return

    log_fpath = join(output_dir, 'busco.log')
    logger.info('Logging to ' + log_fpath + '...')
    busco_args = [([
        '-i', contigs_fpath, '-o',
        qutils.label_from_fpath_for_fname(contigs_fpath), '-l', clade_dirpath,
        '-m', 'genome', '-f', '-z', '-c',
        str(busco_threads), '-t', tmp_dir,
        '--augustus_parameters=\'--AUGUSTUS_CONFIG_PATH=' +
        join(augustus_dirpath, 'config') + '\''
    ], output_dir) for contigs_fpath in contigs_fpaths]
    summary_fpaths = run_parallel(busco.main, busco_args, qconfig.max_threads)
    if not any(fpath for fpath in summary_fpaths):
        logger.error('Failed running BUSCO for all the assemblies. See ' +
                     log_fpath + ' for information.')
        return

    # saving results
    for i, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)

        if summary_fpaths[i] and os.path.isfile(summary_fpaths[i]):
            total_buscos, part_buscos, complete_buscos = 0, 0, 0
            with open(summary_fpaths[i]) as f:
                for line in f:
                    if 'Complete BUSCOs' in line:
                        complete_buscos = int(line.split()[0])
                    elif 'Fragmented' in line:
                        part_buscos = int(line.split()[0])
                    elif 'Total' in line:
                        total_buscos = int(line.split()[0])
            if total_buscos != 0:
                report.add_field(
                    reporting.Fields.BUSCO_COMPLETE,
                    ('%.2f' % (float(complete_buscos) * 100.0 / total_buscos)))
                report.add_field(reporting.Fields.BUSCO_PART,
                                 ('%.2f' %
                                  (float(part_buscos) * 100.0 / total_buscos)))
        else:
            logger.error('Failed running BUSCO for ' + contigs_fpath +
                         '. See ' + log_fpath + ' for information.')
    logger.info('Done.')
Exemplo n.º 43
0
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
       aligned_lengths_lists, aligned_stats_dirpath):

    if not os.path.isdir(aligned_stats_dirpath):
        os.mkdir(aligned_stats_dirpath)

    ########################################################################
    report_dict = {'header': []}
    for contigs_fpath in aligned_contigs_fpaths:
        report_dict[qutils.name_from_fpath(contigs_fpath)] = []

    ########################################################################
    logger.print_timestamp()
    logger.main_info('Running NA-NGA calculation...')

    ref_chr_lengths = fastaparser.get_chr_lengths_from_fastafile(ref_fpath)
    reference_length = sum(ref_chr_lengths.values())
    assembly_lengths = []
    for contigs_fpath in aligned_contigs_fpaths:
        assembly_lengths.append(sum(fastaparser.get_chr_lengths_from_fastafile(contigs_fpath).values()))

    import N50
    for i, (contigs_fpath, lens, assembly_len) in enumerate(
            itertools.izip(aligned_contigs_fpaths, aligned_lengths_lists, assembly_lengths)):
        na50 = N50.NG50(lens, assembly_len)
        na75 = N50.NG50(lens, assembly_len, 75)
        la50 = N50.LG50(lens, assembly_len)
        la75 = N50.LG50(lens, assembly_len, 75)
        if not qconfig.is_combined_ref:
            nga50 = N50.NG50(lens, reference_length)
            nga75 = N50.NG50(lens, reference_length, 75)
            lga50 = N50.LG50(lens, reference_length)
            lga75 = N50.LG50(lens, reference_length, 75)

        logger.info('  ' +
                    qutils.index_to_str(i) +
                    qutils.label_from_fpath(contigs_fpath) +
                 ', Largest alignment = ' + str(max(lens)) +
                 ', NA50 = ' + str(na50) +
                 (', NGA50 = ' + str(nga50) if not qconfig.is_combined_ref and nga50 else '') +
                 ', LA50 = ' + str(la50) +
                 (', LGA50 = ' + str(lga50) if not qconfig.is_combined_ref and lga50 else ''))
        report = reporting.get(contigs_fpath)
        report.add_field(reporting.Fields.LARGALIGN, max(lens))
        report.add_field(reporting.Fields.TOTAL_ALIGNED_LEN, sum(lens))
        report.add_field(reporting.Fields.NA50, na50)
        report.add_field(reporting.Fields.NA75, na75)
        report.add_field(reporting.Fields.LA50, la50)
        report.add_field(reporting.Fields.LA75, la75)
        if not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NGA50, nga50)
            report.add_field(reporting.Fields.NGA75, nga75)
            report.add_field(reporting.Fields.LGA50, lga50)
            report.add_field(reporting.Fields.LGA75, lga75)

    ########################################################################
    num_contigs = max([len(aligned_lengths_lists[i]) for i in range(len(aligned_lengths_lists))])

    if json_output_dirpath:
        from quast_libs.html_saver import json_saver
        json_saver.save_assembly_lengths(json_output_dirpath, aligned_contigs_fpaths, assembly_lengths)

    # saving to html
    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        html_saver.save_assembly_lengths(output_dirpath, aligned_contigs_fpaths, assembly_lengths)

    import plotter
    if qconfig.draw_plots:
        # Drawing cumulative plot (aligned contigs)...
        plotter.cumulative_plot(ref_fpath, aligned_contigs_fpaths, aligned_lengths_lists,
                                os.path.join(aligned_stats_dirpath, 'cumulative_plot'),
                                'Cumulative length (aligned contigs)')

    # Drawing NAx and NGAx plots...
    plotter.Nx_plot(output_dirpath, num_contigs > qconfig.max_points, aligned_contigs_fpaths, aligned_lengths_lists, aligned_stats_dirpath + '/NAx_plot', 'NAx',
                    assembly_lengths, json_output_dir=json_output_dirpath)
    if not qconfig.is_combined_ref:
        plotter.Nx_plot(output_dirpath, num_contigs > qconfig.max_points, aligned_contigs_fpaths, aligned_lengths_lists,
                        aligned_stats_dirpath + '/NGAx_plot', 'NGAx', [reference_length for i in range(len(aligned_contigs_fpaths))], json_output_dir=json_output_dirpath)

    logger.main_info('Done.')
    return report_dict
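A minimal sketch of what the NG50/LG50 helpers above presumably compute: walk the aligned lengths in decreasing order and stop once the cumulative sum reaches the given percentage of the reference length. ngx_lgx is an illustrative name, not the N50 module's API.

def ngx_lgx(lengths, reference_length, percent=50):
    threshold = reference_length * percent / 100.0
    cumulative = 0
    for rank, length in enumerate(sorted(lengths, reverse=True), start=1):
        cumulative += length
        if cumulative >= threshold:
            return length, rank  # (NGx, LGx)
    return None, None  # aligned lengths never cover the required fraction

# aligned blocks [400, 300, 200, 100] against a 1000 bp reference
print(ngx_lgx([400, 300, 200, 100], 1000))  # (300, 2) -> NGA50 = 300, LGA50 = 2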
Exemplo n.º 44
0
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
       genes_fpaths, operons_fpaths, detailed_contigs_reports_dirpath, genome_stats_dirpath):

    nucmer_path_dirpath = os.path.join(detailed_contigs_reports_dirpath, 'nucmer_output')
    from quast_libs import search_references_meta
    if search_references_meta.is_quast_first_run:
        nucmer_path_dirpath = os.path.join(nucmer_path_dirpath, 'raw')

    logger.print_timestamp()
    logger.main_info('Running Genome analyzer...')

    if not os.path.isdir(genome_stats_dirpath):
        os.mkdir(genome_stats_dirpath)

    reference_chromosomes = {}
    genome_size = 0
    for name, seq in fastaparser.read_fasta(ref_fpath):
        chr_name = name.split()[0]
        chr_len = len(seq)
        genome_size += chr_len
        reference_chromosomes[chr_name] = chr_len

    # reading genome size
    # genome_size = fastaparser.get_lengths_from_fastafile(reference)[0]
    # reading reference name
    # >gi|48994873|gb|U00096.2| Escherichia coli str. K-12 substr. MG1655, complete genome
    # ref_file = open(reference, 'r')
    # reference_name = ref_file.readline().split()[0][1:]
    # ref_file.close()

    # RESULTS file
    result_fpath = genome_stats_dirpath + '/genome_info.txt'
    res_file = open(result_fpath, 'w')

    genes_container = FeatureContainer(genes_fpaths, 'gene')
    operons_container = FeatureContainer(operons_fpaths, 'operon')
    for container in [genes_container, operons_container]:
        if not container.fpaths:
            logger.notice('No file with ' + container.kind + 's provided. '
                          'Use the -' + container.kind[0].capitalize() + ' option '
                          'if you want to specify it.', indent='  ')
            continue

        for fpath in container.fpaths:
            container.region_list += genes_parser.get_genes_from_file(fpath, container.kind)

        if len(container.region_list) == 0:
            logger.warning('No ' + container.kind + 's were loaded.', indent='  ')
            res_file.write(container.kind + 's loaded: ' + 'None' + '\n')
        else:
            logger.info('  Loaded ' + str(len(container.region_list)) + ' ' + container.kind + 's')
            res_file.write(container.kind + 's loaded: ' + str(len(container.region_list)) + '\n')
            container.chr_names_dict = chromosomes_names_dict(container.kind, container.region_list, list(reference_chromosomes.keys()))

    for contigs_fpath in aligned_contigs_fpaths:
        report = reporting.get(contigs_fpath)
        if genes_container.fpaths:
            report.add_field(reporting.Fields.REF_GENES, len(genes_container.region_list))
        if operons_container.fpaths:
            report.add_field(reporting.Fields.REF_OPERONS, len(operons_container.region_list))

    # for cumulative plots:
    files_genes_in_contigs = {}   #  "filename" : [ genes in sorted contigs (see below) ]
    files_operons_in_contigs = {}

    # for histograms
    genome_mapped = []
    full_found_genes = []
    full_found_operons = []

    # process all contig files
    num_nf_errors = logger._num_nf_errors
    n_jobs = min(len(aligned_contigs_fpaths), qconfig.max_threads)
    if is_python2():
        from joblib import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed
    process_results = Parallel(n_jobs=n_jobs)(delayed(process_single_file)(
        contigs_fpath, index, nucmer_path_dirpath, genome_stats_dirpath,
        reference_chromosomes, genes_container, operons_container)
        for index, contigs_fpath in enumerate(aligned_contigs_fpaths))
    num_nf_errors += len([res for res in process_results if res is None])
    logger._num_nf_errors = num_nf_errors
    process_results = [res for res in process_results if res]
    if not process_results:
        logger.main_info('Genome analyzer failed for all the assemblies.')
        res_file.close()
        return

    ref_lengths = [process_results[i][0] for i in range(len(process_results))]
    results_genes_operons_tuples = [process_results[i][1] for i in range(len(process_results))]
    for ref in reference_chromosomes:
        ref_lengths_by_contigs[ref] = [ref_lengths[i][ref] for i in range(len(ref_lengths))]
    res_file.write('reference chromosomes:\n')
    for chr_name, chr_len in reference_chromosomes.items():
        aligned_len = max(ref_lengths_by_contigs[chr_name])
        res_file.write('\t' + chr_name + ' (total length: ' + str(chr_len) + ' bp, maximal covered length: ' + str(aligned_len) + ' bp)\n')
    res_file.write('\n')
    res_file.write('total genome size: ' + str(genome_size) + '\n\n')
    res_file.write('gap min size: ' + str(qconfig.min_gap_size) + '\n')
    res_file.write('partial gene/operon min size: ' + str(qconfig.min_gene_overlap) + '\n\n')
    # header
    res_file.write('\n\n')
    res_file.write('%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n'
        % ('assembly', 'genome', 'duplication', 'gaps', 'genes', 'partial', 'operons', 'partial'))
    res_file.write('%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n'
        % ('', 'fraction', 'ratio', 'number', '', 'genes', '', 'operons'))
    res_file.write('================================================================================================================\n')

    for contigs_fpath, (results, genes_in_contigs, operons_in_contigs) in zip(aligned_contigs_fpaths, results_genes_operons_tuples):
        assembly_name = qutils.name_from_fpath(contigs_fpath)

        files_genes_in_contigs[contigs_fpath] = genes_in_contigs
        files_operons_in_contigs[contigs_fpath] = operons_in_contigs
        full_found_genes.append(sum(genes_in_contigs))
        full_found_operons.append(sum(operons_in_contigs))

        covered_bp = results["covered_bp"]
        gaps_count = results["gaps_count"]
        genes_full = results[reporting.Fields.GENES + "_full"]
        genes_part = results[reporting.Fields.GENES + "_partial"]
        operons_full = results[reporting.Fields.OPERONS + "_full"]
        operons_part = results[reporting.Fields.OPERONS + "_partial"]

        report = reporting.get(contigs_fpath)
        genome_fraction = float(covered_bp) * 100 / float(genome_size)
        duplication_ratio = (report.get_field(reporting.Fields.TOTALLEN) +
                             report.get_field(reporting.Fields.MISINTERNALOVERLAP) +
                             report.get_field(reporting.Fields.AMBIGUOUSEXTRABASES) -
                             report.get_field(reporting.Fields.UNALIGNEDBASES)) /\
                            ((genome_fraction / 100.0) * float(genome_size))

        res_file.write('%-25s| %-10s| %-12s| %-10s|'
        % (assembly_name[:24], '%3.5f%%' % genome_fraction, '%1.5f' % duplication_ratio, gaps_count))

        report.add_field(reporting.Fields.MAPPEDGENOME, '%.3f' % genome_fraction)
        report.add_field(reporting.Fields.DUPLICATION_RATIO, '%.3f' % duplication_ratio)
        genome_mapped.append(genome_fraction)

        for (field, full, part) in [(reporting.Fields.GENES, genes_full, genes_part),
            (reporting.Fields.OPERONS, operons_full, operons_part)]:
            if full is None and part is None:
                res_file.write(' %-10s| %-10s|' % ('-', '-'))
            else:
                res_file.write(' %-10s| %-10s|' % (full, part))
                report.add_field(field, '%s + %s part' % (full, part))
        res_file.write('\n')
    res_file.close()

    if genes_container.region_list:
        ref_genes_num = len(genes_container.region_list)
    else:
        ref_genes_num = None

    if operons_container.region_list:
        ref_operons_num = len(operons_container.region_list)
    else:
        ref_operons_num = None

    # saving json
    if json_output_dirpath:
        if genes_container.region_list:
            json_saver.save_features_in_contigs(json_output_dirpath, aligned_contigs_fpaths, 'genes', files_genes_in_contigs, ref_genes_num)
        if operons_container.region_list:
            json_saver.save_features_in_contigs(json_output_dirpath, aligned_contigs_fpaths, 'operons', files_operons_in_contigs, ref_operons_num)

    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        if genes_container.region_list:
            html_saver.save_features_in_contigs(output_dirpath, aligned_contigs_fpaths, 'genes', files_genes_in_contigs, ref_genes_num)
        if operons_container.region_list:
            html_saver.save_features_in_contigs(output_dirpath, aligned_contigs_fpaths, 'operons', files_operons_in_contigs, ref_operons_num)

    if qconfig.draw_plots:
        # cumulative plots:
        from . import plotter
        if genes_container.region_list:
            plotter.genes_operons_plot(len(genes_container.region_list), aligned_contigs_fpaths, files_genes_in_contigs,
                genome_stats_dirpath + '/genes_cumulative_plot', 'genes')
            plotter.histogram(aligned_contigs_fpaths, full_found_genes, genome_stats_dirpath + '/complete_genes_histogram',
                '# complete genes')
        if operons_container.region_list:
            plotter.genes_operons_plot(len(operons_container.region_list), aligned_contigs_fpaths, files_operons_in_contigs,
                genome_stats_dirpath + '/operons_cumulative_plot', 'operons')
            plotter.histogram(aligned_contigs_fpaths, full_found_operons, genome_stats_dirpath + '/complete_operons_histogram',
                '# complete operons')
        plotter.histogram(aligned_contigs_fpaths, genome_mapped, genome_stats_dirpath + '/genome_fraction_histogram',
            'Genome fraction, %', top_value=100)

    logger.main_info('Done.')
    return [genes_container, operons_container]
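A minimal sketch, with made-up numbers, of the two summary metrics above: genome fraction is the covered share of the reference, and duplication ratio divides the aligned assembly bases (total length plus internal-overlap and ambiguous-extra bases, minus unaligned bases) by the covered reference length, i.e. by (genome_fraction / 100) * genome_size.

def genome_fraction(covered_bp, genome_size):
    return 100.0 * covered_bp / genome_size

def duplication_ratio(total_len, mis_internal_overlap, ambiguous_extra_bases,
                      unaligned_bases, covered_bp):
    aligned_assembly_bases = (total_len + mis_internal_overlap +
                              ambiguous_extra_bases - unaligned_bases)
    return aligned_assembly_bases / float(covered_bp)

print(genome_fraction(covered_bp=900000, genome_size=1000000))            # 90.0
print(duplication_ratio(1000000, 5000, 2000, 50000, covered_bp=900000))   # ~1.0633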
Exemplo n.º 45
0
def do(ref_fpath, contigs_fpaths, output_dirpath, results_dir):
    logger.print_timestamp()
    logger.main_info("Running Basic statistics processor...")

    if not os.path.isdir(output_dirpath):
        os.mkdir(output_dirpath)

    reference_length = None
    reference_lengths = []
    reference_fragments = None
    if ref_fpath:
        reference_lengths = sorted(
            fastaparser.get_chr_lengths_from_fastafile(ref_fpath).values(),
            reverse=True)
        reference_fragments = len(reference_lengths)
        reference_length = sum(reference_lengths)
        reference_GC, reference_GC_distribution, reference_GC_contigs_distribution = GC_content(
            ref_fpath)

        logger.info('  Reference genome:')
        # parenthesized conditional: only the GC value falls back to 'undefined'
        # when GC content could not be computed, not the whole log message
        logger.info('    ' + os.path.basename(ref_fpath) + ', length = ' +
                    str(reference_length) + ', num fragments = ' +
                    str(reference_fragments) + ', GC % = ' +
                    ('%.2f' % reference_GC if reference_GC is not None else 'undefined'))
        if reference_fragments > 30 and not qconfig.check_for_fragmented_ref:
            logger.warning(
                '  Reference genome is fragmented. You may consider rerunning QUAST using --fragmented option.'
                ' QUAST will try to detect misassemblies caused by the fragmentation and mark them fake (will be excluded from # misassemblies).'
            )
    elif qconfig.estimated_reference_size:
        reference_length = qconfig.estimated_reference_size
        reference_lengths = [reference_length]
        logger.info('  Estimated reference length = ' + str(reference_length))

    logger.info('  Contig files: ')
    lists_of_lengths = []
    numbers_of_Ns = []
    coverage_dict = dict()
    cov_pattern = re.compile(r'_cov_(\d+\.?\d*)')
    for id, contigs_fpath in enumerate(contigs_fpaths):
        coverage_dict[contigs_fpath] = []
        assembly_label = qutils.label_from_fpath(contigs_fpath)

        logger.info('    ' + qutils.index_to_str(id) + assembly_label)
        # lists_of_lengths.append(fastaparser.get_lengths_from_fastafile(contigs_fpath))
        list_of_length = []
        number_of_Ns = 0
        is_potential_scaffold = False
        for (name, seq) in fastaparser.read_fasta(contigs_fpath):
            list_of_length.append(len(seq))
            number_of_Ns += seq.count('N')
            if not qconfig.scaffolds and not is_potential_scaffold and qutils.is_scaffold(
                    seq):
                is_potential_scaffold = True
                qconfig.potential_scaffolds_assemblies.append(assembly_label)
            if cov_pattern.findall(name):
                cov = int(float(cov_pattern.findall(name)[0]))
                if len(coverage_dict[contigs_fpath]) <= cov:
                    coverage_dict[contigs_fpath] += [0] * (
                        cov - len(coverage_dict[contigs_fpath]) + 1)
                coverage_dict[contigs_fpath][cov] += len(seq)

        lists_of_lengths.append(list_of_length)
        numbers_of_Ns.append(number_of_Ns)

    lists_of_lengths = [
        sorted(list, reverse=True) for list in lists_of_lengths
    ]
    num_contigs = max(
        [len(list_of_length) for list_of_length in lists_of_lengths])
    multiplicator = 1
    if num_contigs >= (qconfig.max_points * 2):
        import math
        multiplicator = int(num_contigs / qconfig.max_points)
        max_points = num_contigs // multiplicator
        corr_lists_of_lengths = [[
            sum(list_of_length[((i - 1) * multiplicator):(i * multiplicator)])
            for i in range(1, max_points)
            if (i * multiplicator) < len(list_of_length)
        ] for list_of_length in lists_of_lengths]
        if len(reference_lengths) > 1:
            reference_lengths = [
                sum(reference_lengths[(
                    (i - 1) * multiplicator):(i * multiplicator)]) if
                (i * multiplicator) < len(reference_lengths) else sum(
                    reference_lengths[((i - 1) * multiplicator):])
                for i in range(1, max_points)
            ] + [sum(reference_lengths[(max_points - 1) * multiplicator:])]
        for num_list in range(len(corr_lists_of_lengths)):
            last_index = len(corr_lists_of_lengths[num_list])
            corr_lists_of_lengths[num_list].append(
                sum(lists_of_lengths[num_list][last_index * multiplicator:]))
    else:
        corr_lists_of_lengths = [
            sorted(list, reverse=True) for list in lists_of_lengths
        ]

    if reference_lengths:
        # Saving for an HTML report
        if qconfig.html_report:
            from quast_libs.html_saver import html_saver
            html_saver.save_reference_lengths(results_dir, reference_lengths)

    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        html_saver.save_contigs_lengths(results_dir, contigs_fpaths,
                                        corr_lists_of_lengths)
        html_saver.save_tick_x(results_dir, multiplicator)

    ########################################################################

    logger.info('  Calculating N50 and L50...')

    list_of_GC_distributions = []
    list_of_GC_contigs_distributions = []
    largest_contig = 0
    from . import N50
    for id, (contigs_fpath, lengths_list, number_of_Ns) in enumerate(
            zip(contigs_fpaths, lists_of_lengths, numbers_of_Ns)):
        report = reporting.get(contigs_fpath)
        n50, l50 = N50.N50_and_L50(lengths_list)
        ng50, lg50 = None, None
        if reference_length:
            ng50, lg50 = N50.NG50_and_LG50(lengths_list, reference_length)
        n75, l75 = N50.N50_and_L50(lengths_list, 75)
        ng75, lg75 = None, None
        if reference_length:
            ng75, lg75 = N50.NG50_and_LG50(lengths_list, reference_length, 75)
        total_length = sum(lengths_list)
        total_GC, GC_distribution, GC_contigs_distribution = GC_content(
            contigs_fpath, skip=qconfig.no_gc)
        list_of_GC_distributions.append(GC_distribution)
        list_of_GC_contigs_distributions.append(GC_contigs_distribution)
        # parenthesized conditional: only the N's-per-100-kbp value falls back to
        # 'undefined' for a zero-length assembly, not the whole log message
        logger.info('    ' + qutils.index_to_str(id) +
                    qutils.label_from_fpath(contigs_fpath) +
                    ', N50 = ' + str(n50) +
                    ', L50 = ' + str(l50) +
                    ', Total length = ' + str(total_length) +
                    ', GC % = ' + ('%.2f' % total_GC if total_GC is not None else 'undefined') +
                    ', # N\'s per 100 kbp = ' +
                    ('%.2f' % (float(number_of_Ns) * 100000.0 / float(total_length)) if total_length != 0 else 'undefined'))

        report.add_field(reporting.Fields.N50, n50)
        report.add_field(reporting.Fields.L50, l50)
        if reference_length and not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NG50, ng50)
            report.add_field(reporting.Fields.LG50, lg50)
        report.add_field(reporting.Fields.N75, n75)
        report.add_field(reporting.Fields.L75, l75)
        if reference_length and not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NG75, ng75)
            report.add_field(reporting.Fields.LG75, lg75)
        report.add_field(reporting.Fields.CONTIGS, len(lengths_list))
        if lengths_list:
            report.add_field(reporting.Fields.LARGCONTIG, max(lengths_list))
            largest_contig = max(largest_contig, max(lengths_list))
            report.add_field(reporting.Fields.TOTALLEN, total_length)
            if not qconfig.is_combined_ref:
                report.add_field(
                    reporting.Fields.GC,
                    ('%.2f' % total_GC if total_GC is not None else None))
            report.add_field(reporting.Fields.UNCALLED, number_of_Ns)
            report.add_field(
                reporting.Fields.UNCALLED_PERCENT,
                ('%.2f' %
                 (float(number_of_Ns) * 100000.0 / float(total_length))))
        if ref_fpath:
            report.add_field(reporting.Fields.REFLEN, int(reference_length))
            report.add_field(reporting.Fields.REF_FRAGMENTS,
                             reference_fragments)
            if not qconfig.is_combined_ref:
                report.add_field(
                    reporting.Fields.REFGC,
                    ('%.2f' %
                     reference_GC if reference_GC is not None else None))
        elif reference_length:
            report.add_field(reporting.Fields.ESTREFLEN, int(reference_length))

    import math
    qconfig.min_difference = math.ceil(
        (largest_contig / 1000) / 600)  # divide on height of plot

    list_of_GC_distributions_with_ref = list_of_GC_distributions
    reference_index = None
    if ref_fpath:
        reference_index = len(list_of_GC_distributions_with_ref)
        list_of_GC_distributions_with_ref.append(reference_GC_distribution)

    if qconfig.html_report and not qconfig.no_gc:
        from quast_libs.html_saver import html_saver
        html_saver.save_GC_info(results_dir, contigs_fpaths,
                                list_of_GC_distributions_with_ref,
                                list_of_GC_contigs_distributions,
                                reference_index)

    ########################################################################
    # Drawing Nx and NGx plots...
    plotter.Nx_plot(results_dir, num_contigs > qconfig.max_points,
                    contigs_fpaths, lists_of_lengths,
                    join(output_dirpath, 'Nx_plot'), 'Nx', [])
    if reference_length and not qconfig.is_combined_ref:
        plotter.Nx_plot(results_dir, num_contigs > qconfig.max_points,
                        contigs_fpaths, lists_of_lengths,
                        join(output_dirpath, 'NGx_plot'), 'NGx',
                        [reference_length for i in range(len(contigs_fpaths))])

    if qconfig.draw_plots:
        ########################################################################
        import plotter
        # Drawing cumulative plot...
        plotter.cumulative_plot(ref_fpath, contigs_fpaths, lists_of_lengths,
                                join(output_dirpath, 'cumulative_plot'),
                                'Cumulative length')
        if not qconfig.no_gc:
            ########################################################################
            # Drawing GC content plot...
            plotter.GC_content_plot(ref_fpath, contigs_fpaths,
                                    list_of_GC_distributions_with_ref,
                                    join(output_dirpath, 'GC_content_plot'))
            for contigs_fpath, GC_distribution in zip(
                    contigs_fpaths, list_of_GC_contigs_distributions):
                plotter.contigs_GC_content_plot(
                    contigs_fpath, GC_distribution,
                    join(
                        output_dirpath,
                        qutils.label_from_fpath(contigs_fpath) +
                        '_GC_content_plot'))

        if any(coverage_dict[contigs_fpath]
               for contigs_fpath in contigs_fpaths):
            draw_coverage_histograms(coverage_dict, contigs_fpaths,
                                     output_dirpath)

    logger.main_info('Done.')
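A minimal sketch of the coverage bookkeeping above: SPAdes-style contig names carry a '_cov_<value>' suffix, and the profile is a list indexed by integer coverage whose entries are the total assembled length at that coverage. coverage_profile is an illustrative helper, not part of QUAST.

import re

cov_pattern = re.compile(r'_cov_(\d+\.?\d*)')

def coverage_profile(named_seqs):
    profile = []
    for name, seq in named_seqs:
        match = cov_pattern.findall(name)
        if not match:
            continue
        cov = int(float(match[0]))
        if len(profile) <= cov:
            profile += [0] * (cov - len(profile) + 1)  # grow the list up to this coverage
        profile[cov] += len(seq)
    return profile

contigs = [('NODE_1_length_5_cov_2.5', 'ACGTA'), ('NODE_2_length_3_cov_2.1', 'ACG')]
print(coverage_profile(contigs))  # [0, 0, 8] -> 8 bp assembled at ~2x coverage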
Exemplo n.º 46
0
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, features_dict,
       operons_fpaths, detailed_contigs_reports_dirpath, genome_stats_dirpath):

    coords_dirpath = os.path.join(detailed_contigs_reports_dirpath,
                                  qconfig.minimap_output_dirname)
    from quast_libs import search_references_meta
    if search_references_meta.is_quast_first_run:
        coords_dirpath = os.path.join(coords_dirpath, 'raw')

    logger.print_timestamp()
    logger.main_info('Running Genome analyzer...')

    if not os.path.isdir(genome_stats_dirpath):
        os.mkdir(genome_stats_dirpath)

    genome_size, reference_chromosomes, ns_by_chromosomes = fastaparser.get_genome_stats(
        ref_fpath)

    # reading genome size
    # genome_size = fastaparser.get_lengths_from_fastafile(reference)[0]
    # reading reference name
    # >gi|48994873|gb|U00096.2| Escherichia coli str. K-12 substr. MG1655, complete genome
    # ref_file = open(reference, 'r')
    # reference_name = ref_file.readline().split()[0][1:]
    # ref_file.close()

    # RESULTS file
    result_fpath = os.path.join(genome_stats_dirpath, 'genome_info.txt')
    res_file = open(result_fpath, 'w')

    containers = []
    for feature, feature_fpath in features_dict.items():
        containers.append(FeatureContainer([feature_fpath], feature))
    if not features_dict:
        logger.notice(
            'No file with genomic features were provided. '
            'Use the --features option if you want to specify it.\n',
            indent='  ')
    if operons_fpaths:
        containers.append(FeatureContainer(operons_fpaths, 'operon'))
    else:
        logger.notice(
            'No file with operons were provided. '
            'Use the -O option if you want to specify it.',
            indent='  ')
    for container in containers:
        if not container.fpaths:
            continue

        for fpath in container.fpaths:
            container.region_list += genes_parser.get_genes_from_file(
                fpath, container.kind)

        if len(container.region_list) == 0:
            logger.warning('No genomic features of type "' + container.kind +
                           '" were loaded.',
                           indent='  ')
            res_file.write('Genomic features of type "' + container.kind +
                           '" loaded: ' + 'None' + '\n')
        else:
            logger.info('  Loaded ' + str(len(container.region_list)) +
                        ' genomic features of type "' + container.kind + '"')
            res_file.write('Genomic features of type "' + container.kind +
                           '" loaded: ' + str(len(container.region_list)) +
                           '\n')
            container.chr_names_dict = chromosomes_names_dict(
                container.kind, container.region_list,
                list(reference_chromosomes.keys()))

    ref_genes_num, ref_operons_num = None, None
    for contigs_fpath in aligned_contigs_fpaths:
        report = reporting.get(contigs_fpath)
        genomic_features = 0
        for container in containers:
            if container.kind == 'operon':
                ref_operons_num = len(container.region_list)
                report.add_field(reporting.Fields.REF_OPERONS,
                                 len(container.region_list))
            else:
                genomic_features += len(container.region_list)
        if genomic_features:
            ref_genes_num = genomic_features
            report.add_field(reporting.Fields.REF_GENES, genomic_features)

    # for cumulative plots:
    files_features_in_contigs = {}  # "filename" : [ genes in sorted contigs (see below) ]
    files_unsorted_features_in_contigs = {}  # "filename" : [ genes in sorted contigs (see below) ]
    files_operons_in_contigs = {}
    files_unsorted_operons_in_contigs = {}

    # for histograms
    genome_mapped = []
    full_found_genes = []
    full_found_operons = []

    # process all contig files
    num_nf_errors = logger._num_nf_errors
    n_jobs = min(len(aligned_contigs_fpaths), qconfig.max_threads)
    if is_python2():
        from joblib2 import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed
    if not qconfig.memory_efficient:
        process_results = Parallel(n_jobs=n_jobs)(
            delayed(process_single_file)(
                contigs_fpath, index, coords_dirpath, genome_stats_dirpath,
                reference_chromosomes, ns_by_chromosomes, containers)
            for index, contigs_fpath in enumerate(aligned_contigs_fpaths))
    else:
        process_results = [
            process_single_file(contigs_fpath, index, coords_dirpath,
                                genome_stats_dirpath, reference_chromosomes,
                                ns_by_chromosomes, containers)
            for index, contigs_fpath in enumerate(aligned_contigs_fpaths)
        ]
    num_nf_errors += len([res for res in process_results if res is None])
    logger._num_nf_errors = num_nf_errors
    process_results = [res for res in process_results if res]
    if not process_results:
        logger.main_info('Genome analyzer failed for all the assemblies.')
        res_file.close()
        return

    ref_lengths = [process_results[i][0] for i in range(len(process_results))]
    results_genes_operons_tuples = [
        process_results[i][1] for i in range(len(process_results))
    ]
    for ref in reference_chromosomes:
        ref_lengths_by_contigs[ref] = [
            ref_lengths[i][ref] for i in range(len(ref_lengths))
        ]
    res_file.write('reference chromosomes:\n')
    for chr_name, chr_len in reference_chromosomes.items():
        aligned_len = max(ref_lengths_by_contigs[chr_name])
        res_file.write('\t' + chr_name + ' (total length: ' + str(chr_len) +
                       ' bp, ' + 'total length without N\'s: ' +
                       str(chr_len - len(ns_by_chromosomes[chr_name])) +
                       ' bp, maximal covered length: ' + str(aligned_len) +
                       ' bp)\n')
    res_file.write('\n')
    res_file.write('total genome size: ' + str(genome_size) + '\n\n')
    res_file.write('gap min size: ' + str(qconfig.min_gap_size) + '\n')
    res_file.write('partial gene/operon min size: ' +
                   str(qconfig.min_gene_overlap) + '\n\n')
    # header
    res_file.write('\n\n')
    res_file.write(
        '%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n' %
        ('assembly', 'genome', 'duplication', 'gaps', 'genes', 'partial',
         'operons', 'partial'))
    res_file.write(
        '%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n' %
        ('', 'fraction', 'ratio', 'number', '', 'genes', '', 'operons'))
    res_file.write('=' * 120 + '\n')

    for contigs_fpath, (results, unsorted_features_in_contigs, features_in_contigs, unsorted_operons_in_contigs, operons_in_contigs)\
            in zip(aligned_contigs_fpaths, results_genes_operons_tuples):
        assembly_name = qutils.name_from_fpath(contigs_fpath)

        files_features_in_contigs[contigs_fpath] = features_in_contigs
        files_unsorted_features_in_contigs[
            contigs_fpath] = unsorted_features_in_contigs
        files_operons_in_contigs[contigs_fpath] = operons_in_contigs
        files_unsorted_operons_in_contigs[
            contigs_fpath] = unsorted_operons_in_contigs
        full_found_genes.append(sum(features_in_contigs))
        full_found_operons.append(sum(operons_in_contigs))

        gaps_count = results["gaps_count"]
        genes_full = results[reporting.Fields.GENES + "_full"]
        genes_part = results[reporting.Fields.GENES + "_partial"]
        operons_full = results[reporting.Fields.OPERONS + "_full"]
        operons_part = results[reporting.Fields.OPERONS + "_partial"]

        report = reporting.get(contigs_fpath)

        res_file.write(
            '%-25s| %-10s| %-12s| %-10s|' %
            (assembly_name[:24], report.get_field(
                reporting.Fields.MAPPEDGENOME),
             report.get_field(reporting.Fields.DUPLICATION_RATIO), gaps_count))

        genome_mapped.append(
            float(report.get_field(reporting.Fields.MAPPEDGENOME)))

        for (field, full,
             part) in [(reporting.Fields.GENES, genes_full, genes_part),
                       (reporting.Fields.OPERONS, operons_full, operons_part)]:
            if full is None and part is None:
                res_file.write(' %-10s| %-10s|' % ('-', '-'))
            else:
                res_file.write(' %-10s| %-10s|' % (full, part))
                report.add_field(field, '%s + %s part' % (full, part))
        res_file.write('\n')
    res_file.close()

    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        if ref_genes_num:
            html_saver.save_features_in_contigs(output_dirpath,
                                                aligned_contigs_fpaths,
                                                'features',
                                                files_features_in_contigs,
                                                ref_genes_num)
        if ref_operons_num:
            html_saver.save_features_in_contigs(output_dirpath,
                                                aligned_contigs_fpaths,
                                                'operons',
                                                files_operons_in_contigs,
                                                ref_operons_num)

    if qconfig.draw_plots:
        # cumulative plots:
        from . import plotter
        from quast_libs.ca_utils.misc import contigs_aligned_lengths
        if ref_genes_num:
            plotter.genes_operons_plot(
                ref_genes_num, aligned_contigs_fpaths,
                files_features_in_contigs,
                genome_stats_dirpath + '/features_cumulative_plot',
                'genomic features')
            plotter.frc_plot(output_dirpath, ref_fpath, aligned_contigs_fpaths,
                             contigs_aligned_lengths,
                             files_unsorted_features_in_contigs,
                             genome_stats_dirpath + '/features_frcurve_plot',
                             'genomic features')
            plotter.histogram(
                aligned_contigs_fpaths, full_found_genes,
                genome_stats_dirpath + '/complete_features_histogram',
                '# complete genomic features')
        if ref_operons_num:
            plotter.genes_operons_plot(
                ref_operons_num, aligned_contigs_fpaths,
                files_operons_in_contigs,
                genome_stats_dirpath + '/operons_cumulative_plot', 'operons')
            plotter.frc_plot(output_dirpath, ref_fpath, aligned_contigs_fpaths,
                             contigs_aligned_lengths,
                             files_unsorted_operons_in_contigs,
                             genome_stats_dirpath + '/operons_frcurve_plot',
                             'operons')
            plotter.histogram(
                aligned_contigs_fpaths, full_found_operons,
                genome_stats_dirpath + '/complete_operons_histogram',
                '# complete operons')
        plotter.histogram(aligned_contigs_fpaths,
                          genome_mapped,
                          genome_stats_dirpath + '/genome_fraction_histogram',
                          'Genome fraction, %',
                          top_value=100)

    logger.main_info('Done.')
    return containers
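A minimal sketch, with made-up rows, of the fixed-width layout written to genome_info.txt above: every column is left-aligned with '%-Ns' so the pipe separators line up.

rows = [('assembly_A', '95.123%', '1.002', 3),
        ('assembly_B', '88.010%', '1.115', 7)]
header = '%-25s| %-10s| %-12s| %-10s|' % ('assembly', 'genome', 'duplication', 'gaps')
lines = [header, '=' * len(header)]
for name, fraction, dup_ratio, gaps in rows:
    lines.append('%-25s| %-10s| %-12s| %-10s|' % (name[:24], fraction, dup_ratio, gaps))
print('\n'.join(lines))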