Exemplo n.º 1
0
def check_emem_functionality(logger):
    if not is_emem_aligner():
        return True
    logger.debug('Checking correctness of E-MEM compilation...')
    nucmer_output_dirpath = create_nucmer_output_dir(qconfig.output_dirpath)
    nucmer_fpath = join(nucmer_output_dirpath, 'test')
    return_code = run_nucmer(nucmer_fpath, options_parser.test_contigs_fpaths[0], options_parser.test_contigs_fpaths[1],
                             '/dev/null', '/dev/null', 0, emem_threads=1)
    if return_code != 0:
        logger.main_info('E-MEM does not work properly. QUAST will try to recompile contig aligner software.')
        open(e_mem_failed_compilation_flag, 'w').close()
    clean_tmp_files(nucmer_fpath)
    return compile_aligner(logger)
Exemplo n.º 2
0
def check_emem_functionality(logger):
    if not is_emem_aligner():
        return True
    logger.debug('Checking correctness of E-MEM compilation...')
    nucmer_output_dirpath = create_nucmer_output_dir(qconfig.output_dirpath)
    nucmer_fpath = join(nucmer_output_dirpath, 'test')
    return_code = run_nucmer(nucmer_fpath, options_parser.test_contigs_fpaths[0], options_parser.test_contigs_fpaths[1],
                             '/dev/null', '/dev/null', 0, emem_threads=1)
    if return_code != 0:
        if get_installed_emem():
            logger.main_info('Preinstalled E-MEM does not work properly.')
        else:
            logger.main_info('E-MEM does not work properly. QUAST will try to use Nucmer.')
        reset_aligner_selection()
        qconfig.force_nucmer = True
        safe_create(e_mem_failed_compilation_flag, logger, is_required=True)
    clean_tmp_files(nucmer_fpath)
    return compile_aligner(logger)
Exemplo n.º 3
0
def do(ref_fpath, contigs_fpaths, output_dirpath):
    gage_results_dirpath = os.path.join(output_dirpath, 'gage')

    # suffixes for files with report tables in plain text and tab separated formats
    if not os.path.isdir(gage_results_dirpath):
        os.mkdir(gage_results_dirpath)

    ########################################################################
    gage_tool_path = os.path.join(gage_dirpath, 'getCorrectnessStats.sh')

    ########################################################################
    logger.print_timestamp()
    logger.main_info('Running GAGE...')

    metrics = [
        'Total units', 'Min', 'Max', 'N50', 'Genome Size', 'Assembly Size',
        'Chaff bases', 'Missing Reference Bases', 'Missing Assembly Bases',
        'Missing Assembly Contigs', 'Duplicated Reference Bases',
        'Compressed Reference Bases', 'Bad Trim', 'Avg Idy', 'SNPs',
        'Indels < 5bp', 'Indels >= 5', 'Inversions', 'Relocation',
        'Translocation', 'Total units', 'BasesInFasta', 'Min', 'Max', 'N50'
    ]
    metrics_in_reporting = [
        reporting.Fields.GAGE_NUMCONTIGS, reporting.Fields.GAGE_MINCONTIG,
        reporting.Fields.GAGE_MAXCONTIG, reporting.Fields.GAGE_N50,
        reporting.Fields.GAGE_GENOMESIZE, reporting.Fields.GAGE_ASSEMBLY_SIZE,
        reporting.Fields.GAGE_CHAFFBASES,
        reporting.Fields.GAGE_MISSINGREFBASES,
        reporting.Fields.GAGE_MISSINGASMBLYBASES,
        reporting.Fields.GAGE_MISSINGASMBLYCONTIGS,
        reporting.Fields.GAGE_DUPREFBASES,
        reporting.Fields.GAGE_COMPRESSEDREFBASES,
        reporting.Fields.GAGE_BADTRIM, reporting.Fields.GAGE_AVGIDY,
        reporting.Fields.GAGE_SNPS, reporting.Fields.GAGE_SHORTINDELS,
        reporting.Fields.GAGE_LONGINDELS, reporting.Fields.GAGE_INVERSIONS,
        reporting.Fields.GAGE_RELOCATION, reporting.Fields.GAGE_TRANSLOCATION,
        reporting.Fields.GAGE_NUMCORCONTIGS,
        reporting.Fields.GAGE_CORASMBLYSIZE,
        reporting.Fields.GAGE_MINCORCONTIG, reporting.Fields.GAGE_MAXCORCOTING,
        reporting.Fields.GAGE_CORN50
    ]

    tmp_dirpath = os.path.join(gage_results_dirpath, 'tmp')
    if not os.path.exists(tmp_dirpath):
        os.makedirs(tmp_dirpath)

    if not compile_aligner(logger) or (
            not all_required_java_classes_exist(gage_dirpath)
            and not compile_gage()):
        logger.error(
            'GAGE module was not installed properly, so it is disabled and you cannot use --gage.'
        )
        return

    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    if is_python2():
        from joblib import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed
    if not qconfig.memory_efficient:
        return_codes = Parallel(n_jobs=n_jobs)(
            delayed(run_gage)(i, contigs_fpath, gage_results_dirpath,
                              gage_tool_path, ref_fpath, tmp_dirpath)
            for i, contigs_fpath in enumerate(contigs_fpaths))
    else:
        return_codes = [
            run_gage(i, contigs_fpath, gage_results_dirpath, gage_tool_path,
                     ref_fpath, tmp_dirpath)
            for i, contigs_fpath in enumerate(contigs_fpaths)
        ]

    if 0 not in return_codes:
        logger.error('Error occurred while GAGE was processing assemblies.'
                     ' See GAGE error logs for details: %s' %
                     os.path.join(gage_results_dirpath, 'gage_*.stderr'))
        return

    ## find metrics for total report:
    for i, contigs_fpath in enumerate(contigs_fpaths):
        corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)

        report = reporting.get(contigs_fpath)

        log_out_fpath = os.path.join(gage_results_dirpath,
                                     'gage_' + corr_assembly_label + '.stdout')
        logfile_out = open(log_out_fpath, 'r')
        cur_metric_id = 0
        for line in logfile_out:
            if metrics[cur_metric_id] in line:
                if (metrics[cur_metric_id].startswith('N50')):
                    report.add_field(
                        metrics_in_reporting[cur_metric_id],
                        line.split(metrics[cur_metric_id] + ':')[1].strip())
                else:
                    report.add_field(metrics_in_reporting[cur_metric_id],
                                     line.split(':')[1].strip())
                cur_metric_id += 1
                if cur_metric_id == len(metrics):
                    break
        logfile_out.close()

    reporting.save_gage(output_dirpath)

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    logger.main_info('Done.')
Exemplo n.º 4
0
    return any(c in args for c in cmds)


if 'bdist_wheel' in sys.argv:
    raise RuntimeError(
        "This setup.py does not support wheels. setup.py install will be run automatically..."
    )

if abspath(dirname(__file__)) != abspath(os.getcwd()):
    logger.error('Please change to ' + dirname(__file__) +
                 ' before running setup.py')
    sys.exit()

if cmd_in(['clean', 'sdist']):
    logger.info('Cleaning up binary files...')
    compile_aligner(logger, only_clean=True)
    compile_gnuplot(logger, only_clean=True)
    compile_glimmer(logger, only_clean=True)
    compile_gage(only_clean=True)
    compile_bwa(logger, only_clean=True)
    compile_bedtools(logger, only_clean=True)
    for fpath in [fn for fn in glob(join(quast_package, '*.pyc'))]:
        os.remove(fpath)
    for fpath in [
            fn for fn in glob(join(quast_package, 'html_saver', '*.pyc'))
    ]:
        os.remove(fpath)
    for fpath in [
            fn
            for fn in glob(join(quast_package, 'site_packages', '*', '*.pyc'))
    ]:
Exemplo n.º 5
0
def do(reference,
       contigs_fpaths,
       is_cyclic,
       output_dir,
       old_contigs_fpaths,
       bed_fpath=None):
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    logger.print_timestamp()
    logger.main_info('Running Contig analyzer...')
    success_compilation = compile_aligner(logger)
    if not success_compilation:
        logger.main_info(
            'Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.'
        )
        return dict(
            zip(contigs_fpaths,
                [AlignerStatus.FAILED] * len(contigs_fpaths))), None

    num_nf_errors = logger._num_nf_errors
    create_minimap_output_dir(output_dir)
    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    threads = max(1, qconfig.max_threads // n_jobs)
    if is_python2():
        from joblib2 import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed

    genome_size, reference_chromosomes, ns_by_chromosomes = get_genome_stats(
        reference, skip_ns=True)
    threads = qconfig.max_threads if qconfig.memory_efficient else threads
    args = [(is_cyclic, i, contigs_fpath, output_dir, reference,
             reference_chromosomes, ns_by_chromosomes, old_contigs_fpath,
             bed_fpath, threads)
            for i, (contigs_fpath, old_contigs_fpath
                    ) in enumerate(zip(contigs_fpaths, old_contigs_fpaths))]
    statuses, results, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs = run_parallel(
        align_and_analyze, args, n_jobs)
    reports = []

    aligner_statuses = dict(zip(contigs_fpaths, statuses))
    aligned_lengths_per_fpath = dict(zip(contigs_fpaths, aligned_lengths))
    misc.contigs_aligned_lengths = dict(
        zip(contigs_fpaths, aligned_lengths_by_contigs))

    if AlignerStatus.OK in aligner_statuses.values():
        if qconfig.is_combined_ref:
            save_combined_ref_stats(results, contigs_fpaths,
                                    ref_labels_by_chromosomes, output_dir,
                                    logger)

    for index, fname in enumerate(contigs_fpaths):
        report = reporting.get(fname)
        if statuses[index] == AlignerStatus.OK:
            reports.append(
                save_result(results[index], report, fname, reference,
                            genome_size))
        elif statuses[index] == AlignerStatus.NOT_ALIGNED:
            save_result_for_unaligned(results[index], report)

    if AlignerStatus.OK in aligner_statuses.values():
        reporting.save_misassemblies(output_dir)
        reporting.save_unaligned(output_dir)
        from . import plotter
        if qconfig.draw_plots:
            plotter.draw_misassemblies_plot(
                reports, join(output_dir, 'misassemblies_plot'),
                'Misassemblies')
        if qconfig.draw_plots or qconfig.html_report:
            misassemblies_in_contigs = dict(
                (contigs_fpaths[i], misassemblies_in_contigs[i])
                for i in range(len(contigs_fpaths)))
            plotter.frc_plot(dirname(output_dir), reference, contigs_fpaths,
                             misc.contigs_aligned_lengths,
                             misassemblies_in_contigs,
                             join(output_dir, 'misassemblies_frcurve_plot'),
                             'misassemblies')

    oks = list(aligner_statuses.values()).count(AlignerStatus.OK)
    not_aligned = list(aligner_statuses.values()).count(
        AlignerStatus.NOT_ALIGNED)
    failed = list(aligner_statuses.values()).count(AlignerStatus.FAILED)
    errors = list(aligner_statuses.values()).count(AlignerStatus.ERROR)
    problems = not_aligned + failed + errors
    all = len(aligner_statuses)

    logger._num_nf_errors = num_nf_errors + errors

    if oks == all:
        logger.main_info('Done.')
    if oks < all and problems < all:
        logger.main_info(
            'Done for ' + str(all - problems) + ' out of ' + str(all) +
            '. For the rest, only basic stats are going to be evaluated.')
    if problems == all:
        logger.main_info(
            'Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.'
        )

    return aligner_statuses, aligned_lengths_per_fpath
Exemplo n.º 6
0
def do(reference, contigs_fpaths, is_cyclic, output_dir, old_contigs_fpaths, bed_fpath=None):
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    logger.print_timestamp()
    logger.main_info('Running Contig analyzer...')
    success_compilation = compile_aligner(logger)
    if not success_compilation:
        logger.main_info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.')
        return dict(zip(contigs_fpaths, [NucmerStatus.FAILED] * len(contigs_fpaths))), None

    if qconfig.draw_plots:
        compile_gnuplot(logger, only_clean=False)

    num_nf_errors = logger._num_nf_errors
    create_nucmer_output_dir(output_dir)
    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    threads = max(1, qconfig.max_threads // n_jobs)
    if is_python2():
        from joblib import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed
    if not qconfig.splitted_ref and not qconfig.memory_efficient:
        statuses_results_lengths_tuples = Parallel(n_jobs=n_jobs)(delayed(align_and_analyze)(
        is_cyclic, i, contigs_fpath, output_dir, reference, old_contigs_fpath, bed_fpath, threads=threads)
             for i, (contigs_fpath, old_contigs_fpath) in enumerate(zip(contigs_fpaths, old_contigs_fpaths)))
    else:
        if len(contigs_fpaths) >= len(qconfig.splitted_ref) and not qconfig.memory_efficient:
            statuses_results_lengths_tuples = Parallel(n_jobs=n_jobs)(delayed(align_and_analyze)(
            is_cyclic, i, contigs_fpath, output_dir, reference, old_contigs_fpath, bed_fpath, threads=threads)
                for i, (contigs_fpath, old_contigs_fpath) in enumerate(zip(contigs_fpaths, old_contigs_fpaths)))
        else:
            statuses_results_lengths_tuples = []
            for i, (contigs_fpath, old_contigs_fpath) in enumerate(zip(contigs_fpaths, old_contigs_fpaths)):
                statuses_results_lengths_tuples.append(align_and_analyze(
                is_cyclic, i, contigs_fpath, output_dir, reference, old_contigs_fpath, bed_fpath,
                parallel_by_chr=True, threads=qconfig.max_threads))

    # unzipping
    statuses, results, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs =\
        [[x[i] for x in statuses_results_lengths_tuples] for i in range(5)]
    reports = []

    nucmer_statuses = dict(zip(contigs_fpaths, statuses))
    aligned_lengths_per_fpath = dict(zip(contigs_fpaths, aligned_lengths))
    misc.contigs_aligned_lengths = dict(zip(contigs_fpaths, aligned_lengths_by_contigs))

    if NucmerStatus.OK in nucmer_statuses.values():
        if qconfig.is_combined_ref:
            save_combined_ref_stats(results, contigs_fpaths, ref_labels_by_chromosomes, output_dir, logger)

    for index, fname in enumerate(contigs_fpaths):
        report = reporting.get(fname)
        if statuses[index] == NucmerStatus.OK:
            reports.append(save_result(results[index], report, fname, reference))
        elif statuses[index] == NucmerStatus.NOT_ALIGNED:
            save_result_for_unaligned(results[index], report)

    if NucmerStatus.OK in nucmer_statuses.values():
        reporting.save_misassemblies(output_dir)
        reporting.save_unaligned(output_dir)
        from . import plotter
        if qconfig.draw_plots:
            plotter.draw_misassemblies_plot(reports, join(output_dir, 'misassemblies_plot'), 'Misassemblies')
        if qconfig.draw_plots or qconfig.html_report:
            misassemblies_in_contigs = dict((contigs_fpaths[i], misassemblies_in_contigs[i]) for i in range(len(contigs_fpaths)))
            plotter.frc_plot(dirname(output_dir), reference, contigs_fpaths, misc.contigs_aligned_lengths, misassemblies_in_contigs,
                             join(output_dir, 'misassemblies_frcurve_plot'), 'misassemblies')

    oks = list(nucmer_statuses.values()).count(NucmerStatus.OK)
    not_aligned = list(nucmer_statuses.values()).count(NucmerStatus.NOT_ALIGNED)
    failed = list(nucmer_statuses.values()).count(NucmerStatus.FAILED)
    errors = list(nucmer_statuses.values()).count(NucmerStatus.ERROR)
    problems = not_aligned + failed + errors
    all = len(nucmer_statuses)

    logger._num_nf_errors = num_nf_errors + errors

    if oks == all:
        logger.main_info('Done.')
    if oks < all and problems < all:
        logger.main_info('Done for ' + str(all - problems) + ' out of ' + str(all) + '. For the rest, only basic stats are going to be evaluated.')
    if problems == all:
        logger.main_info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.')

    return nucmer_statuses, aligned_lengths_per_fpath
Exemplo n.º 7
0
args = sys.argv[1:]


def cmd_in(cmds):
    return any(c in args for c in cmds)


if abspath(dirname(__file__)) != abspath(os.getcwd()):
    logger.error('Please change to ' + dirname(__file__) + ' before running setup.py')
    sys.exit()


if cmd_in(['clean', 'sdist']):
    logger.info('Cleaning up binary files...')
    compile_aligner(logger, only_clean=True)
    compile_glimmer(logger, only_clean=True)
    compile_gage(only_clean=True)
    compile_bwa(logger, only_clean=True)
    compile_bedtools(logger, only_clean=True)
    for fpath in [fn for fn in glob(join(quast_package, '*.pyc'))]: os.remove(fpath)
    for fpath in [fn for fn in glob(join(quast_package, 'html_saver', '*.pyc'))]: os.remove(fpath)
    for fpath in [fn for fn in glob(join(quast_package, 'site_packages', '*', '*.pyc'))]: os.remove(fpath)

    if cmd_in(['clean']):
        if isdir('build'):
            shutil.rmtree('build')
        if isdir('dist'):
            shutil.rmtree('dist')
        if isdir(name + '.egg-info'):
            shutil.rmtree(name + '.egg-info')
Exemplo n.º 8
0
def do(reference, contigs_fpaths, is_cyclic, output_dir, old_contigs_fpaths, bed_fpath=None):
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    logger.print_timestamp()
    logger.main_info('Running Contig analyzer...')
    success_compilation = compile_aligner(logger)
    if qconfig.test and is_emem_aligner():
        success_compilation = check_emem_functionality(logger)
    if not success_compilation:
        logger.main_info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.')
        return dict(zip(contigs_fpaths, [NucmerStatus.FAILED] * len(contigs_fpaths))), None

    if qconfig.draw_plots:
        compile_gnuplot(logger, only_clean=False)

    num_nf_errors = logger._num_nf_errors
    create_nucmer_output_dir(output_dir)
    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    if qconfig.memory_efficient:
        threads = 1
    else:
        threads = max(1, qconfig.max_threads // n_jobs)
    if is_python2():
        from joblib import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed
    if not qconfig.splitted_ref and not qconfig.memory_efficient:
        statuses_results_lengths_tuples = Parallel(n_jobs=n_jobs)(delayed(align_and_analyze)(
        is_cyclic, i, contigs_fpath, output_dir, reference, old_contigs_fpath, bed_fpath, threads=threads)
             for i, (contigs_fpath, old_contigs_fpath) in enumerate(zip(contigs_fpaths, old_contigs_fpaths)))
    else:
        if len(contigs_fpaths) >= len(qconfig.splitted_ref) and not qconfig.memory_efficient:
            statuses_results_lengths_tuples = Parallel(n_jobs=n_jobs)(delayed(align_and_analyze)(
            is_cyclic, i, contigs_fpath, output_dir, reference, old_contigs_fpath, bed_fpath, threads=threads)
                for i, (contigs_fpath, old_contigs_fpath) in enumerate(zip(contigs_fpaths, old_contigs_fpaths)))
        else:
            statuses_results_lengths_tuples = []
            for i, (contigs_fpath, old_contigs_fpath) in enumerate(zip(contigs_fpaths, old_contigs_fpaths)):
                statuses_results_lengths_tuples.append(align_and_analyze(
                is_cyclic, i, contigs_fpath, output_dir, reference, old_contigs_fpath, bed_fpath,
                parallel_by_chr=True, threads=qconfig.max_threads))

    # unzipping
    statuses, results, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs =\
        [[x[i] for x in statuses_results_lengths_tuples] for i in range(5)]
    reports = []

    nucmer_statuses = dict(zip(contigs_fpaths, statuses))
    aligned_lengths_per_fpath = dict(zip(contigs_fpaths, aligned_lengths))
    misc.contigs_aligned_lengths = dict(zip(contigs_fpaths, aligned_lengths_by_contigs))

    if NucmerStatus.OK in nucmer_statuses.values():
        if qconfig.is_combined_ref:
            save_combined_ref_stats(results, contigs_fpaths, ref_labels_by_chromosomes, output_dir, logger)

    for index, fname in enumerate(contigs_fpaths):
        report = reporting.get(fname)
        if statuses[index] == NucmerStatus.OK:
            reports.append(save_result(results[index], report, fname, reference))
        elif statuses[index] == NucmerStatus.NOT_ALIGNED:
            save_result_for_unaligned(results[index], report)

    if NucmerStatus.OK in nucmer_statuses.values():
        reporting.save_misassemblies(output_dir)
        reporting.save_unaligned(output_dir)
        from . import plotter
        if qconfig.draw_plots:
            plotter.draw_misassemblies_plot(reports, join(output_dir, 'misassemblies_plot'), 'Misassemblies')
        if qconfig.draw_plots or qconfig.html_report:
            misassemblies_in_contigs = dict((contigs_fpaths[i], misassemblies_in_contigs[i]) for i in range(len(contigs_fpaths)))
            plotter.frc_plot(dirname(output_dir), reference, contigs_fpaths, misc.contigs_aligned_lengths, misassemblies_in_contigs,
                             join(output_dir, 'misassemblies_frcurve_plot'), 'misassemblies')

    oks = list(nucmer_statuses.values()).count(NucmerStatus.OK)
    not_aligned = list(nucmer_statuses.values()).count(NucmerStatus.NOT_ALIGNED)
    failed = list(nucmer_statuses.values()).count(NucmerStatus.FAILED)
    errors = list(nucmer_statuses.values()).count(NucmerStatus.ERROR)
    problems = not_aligned + failed + errors
    all = len(nucmer_statuses)

    logger._num_nf_errors = num_nf_errors + errors

    if oks == all:
        logger.main_info('Done.')
    if oks < all and problems < all:
        logger.main_info('Done for ' + str(all - problems) + ' out of ' + str(all) + '. For the rest, only basic stats are going to be evaluated.')
    if problems == all:
        logger.main_info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.')
        if not qconfig.test and is_emem_aligner():
            logger.warning('Please rerun QUAST using --test option to ensure that E-MEM aligner works properly.')

    return nucmer_statuses, aligned_lengths_per_fpath
Exemplo n.º 9
0
def do(reference, contigs_fpaths, is_cyclic, output_dir, old_contigs_fpaths, bed_fpath=None):
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    logger.print_timestamp()
    logger.main_info('Running Contig analyzer...')
    num_nf_errors = logger._num_nf_errors
    success_compilation = compile_aligner(logger)
    if qconfig.test and is_emem_aligner():
        success_compilation = check_emem_functionality(logger)
    if not success_compilation:
        logger.main_info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.')
        return dict(zip(contigs_fpaths, [NucmerStatus.FAILED] * len(contigs_fpaths))), None

    create_nucmer_output_dir(output_dir)
    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    if qconfig.memory_efficient:
        threads = 1
    else:
        threads = max(1, qconfig.max_threads // n_jobs)
    if is_python2():
        from joblib import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed
    if not qconfig.splitted_ref:
        statuses_results_lengths_tuples = Parallel(n_jobs=n_jobs)(delayed(align_and_analyze)(
        is_cyclic, i, contigs_fpath, output_dir, reference, old_contigs_fpath, bed_fpath, threads=threads)
             for i, (contigs_fpath, old_contigs_fpath) in enumerate(zip(contigs_fpaths, old_contigs_fpaths)))
    else:
        if len(contigs_fpaths) >= len(qconfig.splitted_ref) and not qconfig.memory_efficient:
            statuses_results_lengths_tuples = Parallel(n_jobs=n_jobs)(delayed(align_and_analyze)(
            is_cyclic, i, contigs_fpath, output_dir, reference, old_contigs_fpath, bed_fpath, threads=threads)
                for i, (contigs_fpath, old_contigs_fpath) in enumerate(zip(contigs_fpaths, old_contigs_fpaths)))
        else:
            statuses_results_lengths_tuples = []
            for i, (contigs_fpath, old_contigs_fpath) in enumerate(zip(contigs_fpaths, old_contigs_fpaths)):
                statuses_results_lengths_tuples.append(align_and_analyze(
                is_cyclic, i, contigs_fpath, output_dir, reference, old_contigs_fpath, bed_fpath,
                parallel_by_chr=True, threads=qconfig.max_threads))

    # unzipping
    statuses, results, aligned_lengths = [x[0] for x in statuses_results_lengths_tuples], \
                                         [x[1] for x in statuses_results_lengths_tuples], \
                                         [x[2] for x in statuses_results_lengths_tuples]
    reports = []

    for index, fname in enumerate(contigs_fpaths):
        report = reporting.get(fname)
        if statuses[index] == NucmerStatus.OK:
            reports.append(save_result(results[index], report, fname))
        elif statuses[index] == NucmerStatus.NOT_ALIGNED:
            save_result_for_unaligned(results[index], report)

    nucmer_statuses = dict(zip(contigs_fpaths, statuses))
    aligned_lengths_per_fpath = dict(zip(contigs_fpaths, aligned_lengths))

    if NucmerStatus.OK in nucmer_statuses.values():
        reporting.save_misassemblies(output_dir)
        reporting.save_unaligned(output_dir)
        if qconfig.draw_plots:
            from . import plotter
            plotter.draw_misassembl_plot(reports, join(output_dir, 'misassemblies_plot'), 'Misassemblies')
        if qconfig.is_combined_ref:
            save_combined_ref_stats(results, contigs_fpaths, ref_labels_by_chromosomes, output_dir, logger)

    oks = list(nucmer_statuses.values()).count(NucmerStatus.OK)
    not_aligned = list(nucmer_statuses.values()).count(NucmerStatus.NOT_ALIGNED)
    failed = list(nucmer_statuses.values()).count(NucmerStatus.FAILED)
    errors = list(nucmer_statuses.values()).count(NucmerStatus.ERROR)
    problems = not_aligned + failed + errors
    all = len(nucmer_statuses)

    logger._num_nf_errors = num_nf_errors + errors

    if oks == all:
        logger.main_info('Done.')
    if oks < all and problems < all:
        logger.main_info('Done for ' + str(all - problems) + ' out of ' + str(all) + '. For the rest, only basic stats are going to be evaluated.')
    if problems == all:
        logger.main_info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.')
        if not qconfig.test and is_emem_aligner():
            logger.warning('Please rerun QUAST using --test option to ensure that E-MEM aligner works properly.')

    return nucmer_statuses, aligned_lengths_per_fpath
Exemplo n.º 10
0
def do(reference, contigs_fpaths, is_cyclic, output_dir, old_contigs_fpaths, bed_fpath=None):
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    logger.print_timestamp()
    logger.main_info('Running Contig analyzer...')
    success_compilation = compile_aligner(logger)
    if not success_compilation:
        logger.main_info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.')
        return dict(zip(contigs_fpaths, [AlignerStatus.FAILED] * len(contigs_fpaths))), None

    num_nf_errors = logger._num_nf_errors
    create_minimap_output_dir(output_dir)
    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    threads = max(1, qconfig.max_threads // n_jobs)

    genome_size, reference_chromosomes, ns_by_chromosomes = get_genome_stats(reference, skip_ns=True)
    threads = qconfig.max_threads if qconfig.memory_efficient else threads
    args = [(is_cyclic, i, contigs_fpath, output_dir, reference, reference_chromosomes, ns_by_chromosomes,
            old_contigs_fpath, bed_fpath, threads)
            for i, (contigs_fpath, old_contigs_fpath) in enumerate(zip(contigs_fpaths, old_contigs_fpaths))]
    statuses, results, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs = run_parallel(align_and_analyze, args, n_jobs)
    reports = []

    aligner_statuses = dict(zip(contigs_fpaths, statuses))
    aligned_lengths_per_fpath = dict(zip(contigs_fpaths, aligned_lengths))
    misc.contigs_aligned_lengths = dict(zip(contigs_fpaths, aligned_lengths_by_contigs))

    if AlignerStatus.OK in aligner_statuses.values():
        if qconfig.is_combined_ref:
            save_combined_ref_stats(results, contigs_fpaths, ref_labels_by_chromosomes, output_dir, logger)

    for index, fname in enumerate(contigs_fpaths):
        report = reporting.get(fname)
        if statuses[index] == AlignerStatus.OK:
            reports.append(save_result(results[index], report, fname, reference, genome_size))
        elif statuses[index] == AlignerStatus.NOT_ALIGNED:
            save_result_for_unaligned(results[index], report)

    if AlignerStatus.OK in aligner_statuses.values():
        reporting.save_misassemblies(output_dir)
        reporting.save_unaligned(output_dir)
        from . import plotter
        if qconfig.draw_plots:
            plotter.draw_misassemblies_plot(reports, join(output_dir, 'misassemblies_plot'), 'Misassemblies')
        if qconfig.draw_plots or qconfig.html_report:
            misassemblies_in_contigs = dict((contigs_fpaths[i], misassemblies_in_contigs[i]) for i in range(len(contigs_fpaths)))
            plotter.frc_plot(dirname(output_dir), reference, contigs_fpaths, misc.contigs_aligned_lengths, misassemblies_in_contigs,
                             join(output_dir, 'misassemblies_frcurve_plot'), 'misassemblies')

    oks = list(aligner_statuses.values()).count(AlignerStatus.OK)
    not_aligned = list(aligner_statuses.values()).count(AlignerStatus.NOT_ALIGNED)
    failed = list(aligner_statuses.values()).count(AlignerStatus.FAILED)
    errors = list(aligner_statuses.values()).count(AlignerStatus.ERROR)
    problems = not_aligned + failed + errors
    all = len(aligner_statuses)

    logger._num_nf_errors = num_nf_errors + errors

    if oks == all:
        logger.main_info('Done.')
    if oks < all and problems < all:
        logger.main_info('Done for ' + str(all - problems) + ' out of ' + str(all) + '. For the rest, only basic stats are going to be evaluated.')
    if problems == all:
        logger.main_info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.')

    return aligner_statuses, aligned_lengths_per_fpath
Exemplo n.º 11
0
def do(ref_fpath, contigs_fpaths, output_dirpath):
    gage_results_dirpath = os.path.join(output_dirpath, 'gage')

    # suffixes for files with report tables in plain text and tab separated formats
    if not os.path.isdir(gage_results_dirpath):
        os.mkdir(gage_results_dirpath)

    ########################################################################
    gage_tool_path = os.path.join(gage_dirpath, 'getCorrectnessStats.sh')

    ########################################################################
    logger.print_timestamp()
    logger.main_info('Running GAGE...')

    metrics = ['Total units', 'Min', 'Max', 'N50', 'Genome Size', 'Assembly Size', 'Chaff bases',
               'Missing Reference Bases', 'Missing Assembly Bases', 'Missing Assembly Contigs',
               'Duplicated Reference Bases', 'Compressed Reference Bases', 'Bad Trim', 'Avg Idy', 'SNPs', 'Indels < 5bp',
               'Indels >= 5', 'Inversions', 'Relocation', 'Translocation',
               'Total units', 'BasesInFasta', 'Min', 'Max', 'N50']
    metrics_in_reporting = [reporting.Fields.GAGE_NUMCONTIGS, reporting.Fields.GAGE_MINCONTIG, reporting.Fields.GAGE_MAXCONTIG, 
                            reporting.Fields.GAGE_N50, reporting.Fields.GAGE_GENOMESIZE, reporting.Fields.GAGE_ASSEMBLY_SIZE,
                            reporting.Fields.GAGE_CHAFFBASES, reporting.Fields.GAGE_MISSINGREFBASES, reporting.Fields.GAGE_MISSINGASMBLYBASES, 
                            reporting.Fields.GAGE_MISSINGASMBLYCONTIGS, reporting.Fields.GAGE_DUPREFBASES, 
                            reporting.Fields.GAGE_COMPRESSEDREFBASES, reporting.Fields.GAGE_BADTRIM, reporting.Fields.GAGE_AVGIDY, 
                            reporting.Fields.GAGE_SNPS, reporting.Fields.GAGE_SHORTINDELS, reporting.Fields.GAGE_LONGINDELS, 
                            reporting.Fields.GAGE_INVERSIONS, reporting.Fields.GAGE_RELOCATION, reporting.Fields.GAGE_TRANSLOCATION, 
                            reporting.Fields.GAGE_NUMCORCONTIGS, reporting.Fields.GAGE_CORASMBLYSIZE, reporting.Fields.GAGE_MINCORCONTIG, 
                            reporting.Fields.GAGE_MAXCORCOTING, reporting.Fields.GAGE_CORN50]

    tmp_dirpath = os.path.join(gage_results_dirpath, 'tmp')
    if not os.path.exists(tmp_dirpath):
        os.makedirs(tmp_dirpath)

    if not compile_aligner(logger) or (not all_required_java_classes_exist(gage_dirpath) and not compile_gage()):
        logger.error('GAGE module was not installed properly, so it is disabled and you cannot use --gage.')
        return

    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    if is_python2():
        from joblib import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed
    return_codes = Parallel(n_jobs=n_jobs)(delayed(run_gage)(i, contigs_fpath, gage_results_dirpath, gage_tool_path, ref_fpath, tmp_dirpath)
        for i, contigs_fpath in enumerate(contigs_fpaths))

    if 0 not in return_codes:
        logger.error('Error occurred while GAGE was processing assemblies.'
                     ' See GAGE error logs for details: %s' % os.path.join(gage_results_dirpath, 'gage_*.stderr'))
        return

    ## find metrics for total report:
    for i, contigs_fpath in enumerate(contigs_fpaths):
        corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)

        report = reporting.get(contigs_fpath)

        log_out_fpath = os.path.join(
            gage_results_dirpath, 'gage_' + corr_assembly_label + '.stdout')
        logfile_out = open(log_out_fpath, 'r')
        cur_metric_id = 0
        for line in logfile_out:
            if metrics[cur_metric_id] in line:
                if (metrics[cur_metric_id].startswith('N50')):
                    report.add_field(metrics_in_reporting[cur_metric_id], line.split(metrics[cur_metric_id] + ':')[1].strip())
                else:
                    report.add_field(metrics_in_reporting[cur_metric_id], line.split(':')[1].strip())
                cur_metric_id += 1
                if cur_metric_id == len(metrics):
                    break
        logfile_out.close()

    reporting.save_gage(output_dirpath)

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    logger.main_info('Done.')
Exemplo n.º 12
0
def do(reference,
       contigs_fpaths,
       cyclic,
       output_dir,
       old_contigs_fpaths,
       bed_fpath=None):
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    logger.print_timestamp()
    logger.main_info('Running Contig analyzer...')
    num_nf_errors = logger._num_nf_errors

    if not compile_aligner(logger):
        logger.main_info(
            'Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.'
        )
        return dict(
            zip(contigs_fpaths,
                [NucmerStatus.FAILED] * len(contigs_fpaths))), None

    create_nucmer_output_dir(output_dir)
    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    if qconfig.memory_efficient:
        threads = 1
    else:
        threads = max(int(qconfig.max_threads / n_jobs), 1)
    from joblib import Parallel, delayed
    if not qconfig.splitted_ref:
        statuses_results_lengths_tuples = Parallel(n_jobs=n_jobs)(
            delayed(align_and_analyze)(cyclic,
                                       i,
                                       contigs_fpath,
                                       output_dir,
                                       reference,
                                       old_contigs_fpath,
                                       bed_fpath,
                                       threads=threads)
            for i, (contigs_fpath, old_contigs_fpath
                    ) in enumerate(zip(contigs_fpaths, old_contigs_fpaths)))
    else:
        if len(contigs_fpaths) >= len(
                qconfig.splitted_ref) and not qconfig.memory_efficient:
            statuses_results_lengths_tuples = Parallel(n_jobs=n_jobs)(
                delayed(align_and_analyze)(cyclic,
                                           i,
                                           contigs_fpath,
                                           output_dir,
                                           reference,
                                           old_contigs_fpath,
                                           bed_fpath,
                                           threads=threads)
                for i, (contigs_fpath, old_contigs_fpath) in enumerate(
                    zip(contigs_fpaths, old_contigs_fpaths)))
        else:
            statuses_results_lengths_tuples = []
            for i, (contigs_fpath, old_contigs_fpath) in enumerate(
                    zip(contigs_fpaths, old_contigs_fpaths)):
                statuses_results_lengths_tuples.append(
                    align_and_analyze(cyclic,
                                      i,
                                      contigs_fpath,
                                      output_dir,
                                      reference,
                                      old_contigs_fpath,
                                      bed_fpath,
                                      parallel_by_chr=True,
                                      threads=qconfig.max_threads))

    # unzipping
    statuses, results, aligned_lengths = [x[0] for x in statuses_results_lengths_tuples], \
                                         [x[1] for x in statuses_results_lengths_tuples], \
                                         [x[2] for x in statuses_results_lengths_tuples]
    reports = []

    if qconfig.is_combined_ref:
        ref_misassemblies = [
            result['istranslocations_by_refs'] if result else []
            for result in results
        ]
        if ref_misassemblies:
            for i, fpath in enumerate(contigs_fpaths):
                if ref_misassemblies[i]:
                    assembly_name = qutils.name_from_fpath(fpath)
                    all_rows = []
                    all_refs = sorted(
                        list(
                            set([
                                ref
                                for ref in ref_labels_by_chromosomes.values()
                            ])))
                    row = {
                        'metricName': 'References',
                        'values':
                        [ref_num + 1 for ref_num in range(len(all_refs))]
                    }
                    all_rows.append(row)
                    for k in all_refs:
                        row = {'metricName': k, 'values': []}
                        for ref in all_refs:
                            if ref == k or ref not in ref_misassemblies[i]:
                                row['values'].append(None)
                            else:
                                row['values'].append(
                                    ref_misassemblies[i][ref][k])
                        all_rows.append(row)
                    misassembly_by_ref_fpath = join(
                        output_dir,
                        'interspecies_translocations_by_refs_%s.info' %
                        assembly_name)
                    print >> open(
                        misassembly_by_ref_fpath, 'w'
                    ), 'Number of interspecies translocations by references: \n'
                    print_file(all_rows,
                               misassembly_by_ref_fpath,
                               append_to_existing_file=True)

                    print >> open(misassembly_by_ref_fpath,
                                  'a'), '\nReferences: '
                    for ref_num, ref in enumerate(all_refs):
                        print >> open(misassembly_by_ref_fpath,
                                      'a'), str(ref_num + 1) + ' - ' + ref
                    logger.info(
                        '  Information about interspecies translocations by references for %s is saved to %s'
                        % (assembly_name, misassembly_by_ref_fpath))

    for index, fname in enumerate(contigs_fpaths):
        report = reporting.get(fname)
        if statuses[index] == NucmerStatus.OK:
            reports.append(save_result(results[index], report, fname))
        elif statuses[index] == NucmerStatus.NOT_ALIGNED:
            save_result_for_unaligned(results[index], report)

    nucmer_statuses = dict(zip(contigs_fpaths, statuses))
    aligned_lengths_per_fpath = dict(zip(contigs_fpaths, aligned_lengths))

    if NucmerStatus.OK in nucmer_statuses.values():
        reporting.save_misassemblies(output_dir)
        reporting.save_unaligned(output_dir)
    if qconfig.draw_plots:
        import plotter
        plotter.draw_misassembl_plot(reports,
                                     join(output_dir, 'misassemblies_plot'),
                                     'Misassemblies')

    oks = nucmer_statuses.values().count(NucmerStatus.OK)
    not_aligned = nucmer_statuses.values().count(NucmerStatus.NOT_ALIGNED)
    failed = nucmer_statuses.values().count(NucmerStatus.FAILED)
    errors = nucmer_statuses.values().count(NucmerStatus.ERROR)
    problems = not_aligned + failed + errors
    all = len(nucmer_statuses)

    logger._num_nf_errors = num_nf_errors + errors

    if oks == all:
        logger.main_info('Done.')
    if oks < all and problems < all:
        logger.main_info(
            'Done for ' + str(all - problems) + ' out of ' + str(all) +
            '. For the rest, only basic stats are going to be evaluated.')
    if problems == all:
        logger.main_info(
            'Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.'
        )

    return nucmer_statuses, aligned_lengths_per_fpath