Пример #1
0
def partition_contigs(assemblies, ref_fpaths, corrected_dirpath,
                      alignments_fpath_template, labels):
    """Distribute assemblies over references and order them by label.

    Every assembly is processed by ``parallel_partition_contigs`` through
    ``run_parallel``. The per-assembly results are then merged for each
    reference, keeping at most one assembly per entry of ``labels``.

    Returns a pair ``(assemblies_by_ref, not_aligned_assemblies)``:
    ``assemblies_by_ref`` is a list of ``(ref_fpath, assemblies)`` tuples in
    the order of ``ref_fpaths``; ``not_aligned_assemblies`` is whatever
    ``run_parallel`` returned as its second value.
    """
    # One empty bucket per reference name; handed to every worker.
    empty_buckets = dict((qutils.name_from_fpath(fpath), [])
                         for fpath in ref_fpaths)
    n_jobs = min(qconfig.max_threads, len(assemblies))
    worker_args = [(asm, empty_buckets, corrected_dirpath,
                    alignments_fpath_template) for asm in assemblies]
    per_asm_dicts, not_aligned_assemblies = run_parallel(
        parallel_partition_contigs, worker_args, n_jobs)

    partitioned = []
    for ref_fpath in ref_fpaths:
        ref_name = qutils.name_from_fpath(ref_fpath)
        # Union of everything the workers reported for this reference.
        candidates = set(asm
                         for asm_dict in per_asm_dicts
                         for asm in asm_dict[ref_name])
        ordered = []
        for label in labels:  # output follows the order of ``labels``
            match = next((asm for asm in candidates if asm.label == label),
                         None)
            if match is not None:
                ordered.append(match)
        partitioned.append((ref_fpath, ordered))
    return partitioned, not_aligned_assemblies
Пример #2
0
def partition_contigs(assemblies, ref_fpaths, corrected_dirpath,
                      alignments_fpath_template, labels):
    """Distribute assemblies over references and order them by label.

    Every assembly is processed by ``parallel_partition_contigs`` via joblib
    (``joblib2`` under Python 2, ``joblib3`` otherwise). Each worker result
    is indexed as ``[0]`` (per-reference dict) and ``[1]`` (not-aligned part).

    Returns a pair ``(assemblies_by_ref, not_aligned_assemblies)``:
    ``assemblies_by_ref`` is a list of ``(ref_fpath, assemblies)`` tuples in
    the order of ``ref_fpaths``; ``not_aligned_assemblies`` has one entry per
    input assembly.
    """
    # One empty bucket per reference name; handed to every worker.
    empty_buckets = dict((qutils.name_from_fpath(fpath), [])
                         for fpath in ref_fpaths)
    n_jobs = min(qconfig.max_threads, len(assemblies))
    if is_python2():
        from joblib2 import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed
    outcomes = Parallel(n_jobs=n_jobs)(
        delayed(parallel_partition_contigs)(
            asm, empty_buckets, corrected_dirpath, alignments_fpath_template)
        for asm in assemblies)
    per_asm_dicts = [outcome[0] for outcome in outcomes]

    partitioned = []
    for ref_fpath in ref_fpaths:
        ref_name = qutils.name_from_fpath(ref_fpath)
        # Union of everything the workers reported for this reference.
        candidates = set(asm
                         for asm_dict in per_asm_dicts
                         for asm in asm_dict[ref_name])
        ordered = []
        for label in labels:  # output follows the order of ``labels``
            match = next((asm for asm in candidates if asm.label == label),
                         None)
            if match is not None:
                ordered.append(match)
        partitioned.append((ref_fpath, ordered))

    not_aligned_assemblies = [outcome[1] for outcome in outcomes]
    return partitioned, not_aligned_assemblies
Пример #3
0
def add_statistics_to_report(output_dir, contigs_fpaths, ref_fpath):
    """Copy parsed ``.stat`` read statistics into each assembly's report.

    Reference statistics (``<ref_name>.stat`` in ``output_dir``), when that
    file exists, are added to the report of every assembly. Assembly
    statistics are added only for assemblies whose own ``.stat`` file exists.
    A message is logged whenever the parsed ``mapped`` count is zero.
    """
    from quast_libs import reporting

    fields = reporting.Fields
    # (report field, key in the parsed stats), in the order they are added.
    ref_columns = (
        (fields.REF_MAPPED_READS, 'mapped'),
        (fields.REF_MAPPED_READS_PCNT, 'mapped_pcnt'),
        (fields.REF_PROPERLY_PAIRED_READS, 'paired'),
        (fields.REF_PROPERLY_PAIRED_READS_PCNT, 'paired_pcnt'),
        (fields.REF_SINGLETONS, 'singletons'),
        (fields.REF_SINGLETONS_PCNT, 'singletons_pcnt'),
        (fields.REF_MISJOINT_READS, 'misjoint'),
        (fields.REF_MISJOINT_READS_PCNT, 'misjoint_pcnt'),
        (fields.REF_DEPTH, 'depth'),
    )
    asm_columns_head = (
        (fields.TOTAL_READS, 'total'),
        (fields.LEFT_READS, 'left'),
        (fields.RIGHT_READS, 'right'),
        (fields.MAPPED_READS, 'mapped'),
        (fields.MAPPED_READS_PCNT, 'mapped_pcnt'),
        (fields.PROPERLY_PAIRED_READS, 'paired'),
        (fields.PROPERLY_PAIRED_READS_PCNT, 'paired_pcnt'),
    )
    asm_columns_tail = (
        (fields.SINGLETONS, 'singletons'),
        (fields.SINGLETONS_PCNT, 'singletons_pcnt'),
        (fields.MISJOINT_READS, 'misjoint'),
        (fields.MISJOINT_READS_PCNT, 'misjoint_pcnt'),
        (fields.DEPTH, 'depth'),
    )

    ref_reads_stats = None
    if ref_fpath:
        ref_stats_fpath = join(output_dir, qutils.name_from_fpath(ref_fpath) + '.stat')
        if isfile(ref_stats_fpath):
            ref_reads_stats = parse_reads_stats(ref_stats_fpath)
            if int(ref_reads_stats['mapped']) == 0:
                logger.info('  BWA: nothing aligned for reference.')

    # process all contigs files
    for index, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        assembly_name = qutils.name_from_fpath(contigs_fpath)
        assembly_label = qutils.label_from_fpath(contigs_fpath)
        stats_fpath = join(output_dir, assembly_name + '.stat')

        if ref_reads_stats:
            for field, key in ref_columns:
                report.add_field(field, ref_reads_stats[key])
            ref_cov = ref_reads_stats['coverage_thresholds']
            # Only report coverage when there is one value per configured threshold.
            if ref_cov and len(ref_cov) == len(qconfig.coverage_thresholds):
                report.add_field(fields.REF_COVERAGE__FOR_THRESHOLDS,
                                 [ref_cov[i] for i in range(len(qconfig.coverage_thresholds))])
                report.add_field(fields.REF_COVERAGE_1X_THRESHOLD, ref_cov[0])

        if not isfile(stats_fpath):
            continue

        reads_stats = parse_reads_stats(stats_fpath)
        for field, key in asm_columns_head:
            report.add_field(field, reads_stats[key])
        if int(reads_stats['mapped']) == 0:
            logger.info('  ' + qutils.index_to_str(index) + 'BWA: nothing aligned for ' + '\'' + assembly_label + '\'.')
        for field, key in asm_columns_tail:
            report.add_field(field, reads_stats[key])
        asm_cov = reads_stats['coverage_thresholds']
        if asm_cov and len(asm_cov) == len(qconfig.coverage_thresholds):
            report.add_field(fields.COVERAGE__FOR_THRESHOLDS,
                             [asm_cov[i] for i in range(len(qconfig.coverage_thresholds))])
            report.add_field(fields.COVERAGE_1X_THRESHOLD, asm_cov[0])
Пример #4
0
def prepare_regular_quast_args(quast_py_args, combined_output_dirpath):
    """Rewrite ``quast_py_args`` in place for the regular quast.py runs.

    Drops the listed options, appends ``--no-check-meta`` and a
    ``--contig-thresholds`` value filtered by ``qconfig.min_contig``, and
    appends ``--sv-bed`` / ``--cov`` / ``--phys-cov`` for each corresponding
    non-empty file. Also overwrites ``qconfig.contig_thresholds`` with the
    comma-joined string and fills unset ``qconfig.bed``, ``qconfig.cov_fpath``
    and ``qconfig.phys_cov_fpath`` with paths under the reads-stats directory.
    """
    for opt in ('--contig-thresholds', '--sv-bed'):
        remove_from_quast_py_args(quast_py_args, opt, arg=True)
    for opt in ('-s', '--scaffolds', '--combined-ref'):
        remove_from_quast_py_args(quast_py_args, opt)

    quast_py_args += ['--no-check-meta']
    kept_thresholds = [str(threshold) for threshold in qconfig.contig_thresholds
                       if threshold >= qconfig.min_contig]
    # The literal string 'None' is passed when no threshold survives the filter.
    qconfig.contig_thresholds = ','.join(kept_thresholds) or 'None'
    quast_py_args += ['--contig-thresholds', qconfig.contig_thresholds]

    reads_stats_dirpath = os.path.join(combined_output_dirpath, qconfig.reads_stats_dirname)
    reference_name = qutils.name_from_fpath(qconfig.combined_ref_name)
    if not qconfig.bed:
        qconfig.bed = os.path.join(reads_stats_dirpath, reference_name + '.bed')
    if not qconfig.cov_fpath:
        qconfig.cov_fpath = os.path.join(reads_stats_dirpath, reference_name + '.cov')
    if not qconfig.phys_cov_fpath:
        qconfig.phys_cov_fpath = os.path.join(reads_stats_dirpath, reference_name + '.physical.cov')

    optional_inputs = (
        ('--sv-bed', qconfig.bed),
        ('--cov', qconfig.cov_fpath),
        ('--phys-cov', qconfig.phys_cov_fpath),
    )
    for flag, fpath in optional_inputs:
        if fpath and is_non_empty_file(fpath):
            quast_py_args += [flag, fpath]
Пример #5
0
def search_sv_with_gridss(main_ref_fpath, bam_fpath, meta_ref_fpaths, output_dirpath, err_fpath):
    """Run the GRIDSS structural-variation search and return the BED path.

    An already existing result file is reused. Returns None (after a
    warning) when ``java`` / the Java version check or ``Rscript`` is
    unavailable. With ``meta_ref_fpaths`` each reference is processed in
    parallel and the resulting BED files are concatenated; otherwise the main
    reference is processed directly with ``bam_fpath``.
    """
    logger.info('  Searching structural variations with GRIDSS...')
    bed_fname = qutils.name_from_fpath(main_ref_fpath) + '_' + qconfig.sv_bed_fname
    final_bed_fpath = join(output_dirpath, bed_fname)
    if isfile(final_bed_fpath):
        logger.info('    Using existing file: ' + final_bed_fpath)
        return final_bed_fpath

    java_ready = get_path_to_program('java') and check_java_version(1.8)
    if not java_ready:
        logger.warning('Java 1.8 or later is required to run GRIDSS. Please install it and rerun QUAST.')
        return None
    if not get_path_to_program('Rscript'):
        logger.warning('R is required to run GRIDSS. Please install it and rerun QUAST.')
        return None

    if not meta_ref_fpaths:
        process_one_ref(main_ref_fpath, output_dirpath, err_fpath, qconfig.max_threads,
                        bam_fpath=bam_fpath, bed_fpath=final_bed_fpath)
    else:
        # Split the thread budget evenly between the per-reference jobs.
        n_jobs = min(len(meta_ref_fpaths), qconfig.max_threads)
        threads_per_job = max(1, qconfig.max_threads // n_jobs)
        job_args = []
        for cur_ref_fpath in meta_ref_fpaths:
            job_args.append((cur_ref_fpath, output_dirpath, err_fpath, threads_per_job))
        bed_fpaths = run_parallel(process_one_ref, job_args, n_jobs, filter_results=True)
        if bed_fpaths:
            qutils.cat_files(bed_fpaths, final_bed_fpath)
    logger.info('    Saving to: ' + final_bed_fpath)
    return final_bed_fpath
Пример #6
0
def save_total_report(output_dirpath, min_contig, ref_fpath):
    """Assemble the total-report payload and pass it to ``save``.

    The payload holds the current date, the assembly labels, the reference
    name (empty string without a reference), the grouped report table and,
    for a combined reference with known chromosome labels, one sub-report
    per sorted reference name. Returns whatever ``save`` returns.
    """
    from quast_libs import reporting

    asm_names = [qutils.label_from_fpath(fpath)
                 for fpath in reporting.assembly_fpaths]
    report = reporting.table(reporting.Fields.grouped_order)

    ref_names = []
    subreports = []
    if qconfig.is_combined_ref and ref_labels_by_chromosomes:
        ref_names = sorted(set(ref_labels_by_chromosomes.values()))
        for ref_name in ref_names:
            subreports.append(
                reporting.table(reporting.Fields.grouped_order, ref_name=ref_name))

    now = datetime.datetime.now()
    if ref_fpath:
        reference_name = qutils.name_from_fpath(ref_fpath)
    else:
        reference_name = ''

    payload = {
        'date': now.strftime('%d %B %Y, %A, %H:%M:%S'),
        'assembliesNames': asm_names,
        'referenceName': reference_name,
        'order': list(range(len(asm_names))),
        'report': report,
        'subreferences': ref_names,
        'subreports': subreports,
        'minContig': min_contig
    }
    return save(join(output_dirpath, total_report_fname), payload)
Пример #7
0
def gmhmm_p_everyGC(tool_dirpath, fasta_fpath, err_fpath, index, tmp_dirpath, num_threads):
    """Run ``gmsn.pl`` and then ``gmhmmp`` on a FASTA file and collect genes.

    A fresh temporary directory is created inside ``tmp_dirpath`` and removed
    afterwards unless ``qconfig.debug`` is set. Output of both tools goes to
    ``err_fpath`` (truncated first, then appended to).

    ``num_threads`` is accepted for interface compatibility and is not used.

    Returns the list of parsed genes, or None when ``gmsn.pl`` exits with a
    non-zero code.
    """
    tmp_dirpath = tempfile.mkdtemp(dir=tmp_dirpath)
    try:
        fasta_name = qutils.name_from_fpath(fasta_fpath)

        tool_exec_fpath = os.path.join(tool_dirpath, 'gmsn.pl')
        # Close the handle as soon as the subprocess is done; it used to leak.
        with open(err_fpath, 'w') as err_file:
            return_code = qutils.call_subprocess(
                ['perl', tool_exec_fpath, '--name', fasta_name, '--clean', '--out', tmp_dirpath,
                 fasta_fpath],
                stdout=err_file,
                stderr=err_file,
                indent='    ' + qutils.index_to_str(index))
        if return_code != 0:
            return None

        genes = []
        tool_exec_fpath = os.path.join(tool_dirpath, 'gmhmmp')
        sub_fasta_fpath = os.path.join(tmp_dirpath, fasta_name)
        out_fpath = sub_fasta_fpath + '.gmhmm'
        heu_fpath = os.path.join(tmp_dirpath, fasta_name + '_hmm_heuristic.mod')
        with open(err_fpath, 'a') as err_file:
            ok = gmhmm_p(tool_exec_fpath, fasta_fpath, heu_fpath,
                         out_fpath, err_file, index)
            if ok:
                genes.extend(parse_gmhmm_out(out_fpath))
        return genes
    finally:
        # Also runs on the early failure return, which used to leave the
        # temporary directory behind.
        if not qconfig.debug:
            shutil.rmtree(tmp_dirpath)
Пример #8
0
def _run_quast_per_ref(quast_py_args,
                       output_dirpath_per_ref,
                       ref_fpath,
                       ref_assemblies,
                       total_num_notifications,
                       is_parallel_run=False):
    """Run quast.py for the assemblies aligned to a single reference.

    Returns ``(ref_name, json_text, total_num_notifications)``. When
    ``ref_assemblies`` is empty nothing is run and the first two items are
    None. ``json_text`` is taken from ``json_saver`` only when
    ``qconfig.html_report`` is set, otherwise it is None.
    """
    ref_name = qutils.name_from_fpath(ref_fpath)
    if not ref_assemblies:
        logger.main_info('\nNo contigs were aligned to the reference ' +
                         ref_name + ', skipping..')
        return None, None, total_num_notifications

    output_dirpath = os.path.join(output_dirpath_per_ref, ref_name)
    run_name = 'for the contigs aligned to ' + ref_name
    log_base_fpath = os.path.join(output_dirpath, qconfig.LOGGER_DEFAULT_NAME)
    logger.main_info(''.join([
        '\nStarting quast.py ', run_name, '... (logging to ',
        log_base_fpath, '.log)']))

    # The return code of the run is not used by this function.
    _, total_num_notifications = _start_quast_main(
        quast_py_args,
        assemblies=ref_assemblies,
        reference_fpath=ref_fpath,
        output_dirpath=output_dirpath,
        num_notifications_tuple=total_num_notifications,
        is_parallel_run=is_parallel_run)

    if not qconfig.html_report:
        return ref_name, None, total_num_notifications

    from quast_libs.html_saver import json_saver
    return ref_name, json_saver.json_text, total_num_notifications
Пример #9
0
def search_sv_with_gridss(main_ref_fpath, bam_fpath, meta_ref_fpaths, output_dirpath, err_fpath):
    """Run the GRIDSS structural-variation search and return the BED path.

    An already existing result file is reused. Returns None (after a
    warning) when ``java`` / the Java version check or ``Rscript`` is
    unavailable. With ``meta_ref_fpaths`` each reference is processed in
    parallel and the resulting BED files are concatenated; otherwise the main
    reference is processed directly with ``bam_fpath``.
    """
    logger.info('  Searching structural variations with GRIDSS...')
    bed_fname = qutils.name_from_fpath(main_ref_fpath) + '_' + qconfig.sv_bed_fname
    final_bed_fpath = join(output_dirpath, bed_fname)
    if isfile(final_bed_fpath):
        logger.info('    Using existing file: ' + final_bed_fpath)
        return final_bed_fpath

    java_ready = get_path_to_program('java') and check_java_version(1.8)
    if not java_ready:
        logger.warning('Java 1.8 or later is required to run GRIDSS. Please install it and rerun QUAST.')
        return None
    if not get_path_to_program('Rscript'):
        logger.warning('R is required to run GRIDSS. Please install it and rerun QUAST.')
        return None

    if not meta_ref_fpaths:
        process_one_ref(main_ref_fpath, output_dirpath, err_fpath, qconfig.max_threads,
                        bam_fpath=bam_fpath, bed_fpath=final_bed_fpath)
    else:
        # Split the thread budget evenly between the per-reference jobs.
        n_jobs = min(len(meta_ref_fpaths), qconfig.max_threads)
        threads_per_job = max(1, qconfig.max_threads // n_jobs)
        job_args = []
        for cur_ref_fpath in meta_ref_fpaths:
            job_args.append((cur_ref_fpath, output_dirpath, err_fpath, threads_per_job))
        bed_fpaths = run_parallel(process_one_ref, job_args, n_jobs, filter_results=True)
        if bed_fpaths:
            qutils.cat_files(bed_fpaths, final_bed_fpath)
    logger.info('    Saving to: ' + final_bed_fpath)
    return final_bed_fpath
Пример #10
0
def get(assembly_fpath, ref_name=None):
    """Return the report stored for ``assembly_fpath`` and ``ref_name``.

    A missing ``ref_name`` falls back to the name of ``qconfig.reference``
    when that is set. The assembly path is recorded in ``assembly_fpaths`` on
    first sight. Reports are keyed by ``(absolute assembly path, ref_name)``;
    a new ``Report`` is stored when the key is not present yet.
    """
    if not ref_name:
        if qconfig.reference:
            ref_name = qutils.name_from_fpath(qconfig.reference)

    if assembly_fpath not in assembly_fpaths:
        assembly_fpaths.append(assembly_fpath)

    key = (os.path.abspath(assembly_fpath), ref_name)
    # Built on every call, as before; setdefault keeps an already stored one.
    candidate = Report(qutils.label_from_fpath(assembly_fpath))
    return reports.setdefault(key, candidate)
Пример #11
0
def gmhmm_p_everyGC(tool_dirpath, fasta_fpath, err_fpath, index, tmp_dirpath,
                    num_threads):
    """Run ``gmsn.pl`` and then ``gmhmmp`` on a FASTA file and collect genes.

    A fresh temporary directory is created inside ``tmp_dirpath`` and removed
    afterwards unless ``qconfig.debug`` is set. Output of both tools goes to
    ``err_fpath`` (truncated first, then appended to).

    ``num_threads`` is accepted for interface compatibility and is not used.

    Returns the list of parsed genes, or None when ``gmsn.pl`` exits with a
    non-zero code.
    """
    tmp_dirpath = tempfile.mkdtemp(dir=tmp_dirpath)
    try:
        fasta_name = qutils.name_from_fpath(fasta_fpath)

        tool_exec_fpath = os.path.join(tool_dirpath, 'gmsn.pl')
        gmsn_cmd = [
            'perl', tool_exec_fpath, '--name', fasta_name, '--clean', '--out',
            tmp_dirpath, fasta_fpath
        ]
        # Close the handle as soon as the subprocess is done; it used to leak.
        with open(err_fpath, 'w') as err_file:
            return_code = qutils.call_subprocess(
                gmsn_cmd,
                stdout=err_file,
                stderr=err_file,
                indent='    ' + qutils.index_to_str(index))
        if return_code != 0:
            return None

        genes = []
        tool_exec_fpath = os.path.join(tool_dirpath, 'gmhmmp')
        sub_fasta_fpath = os.path.join(tmp_dirpath, fasta_name)
        out_fpath = sub_fasta_fpath + '.gmhmm'
        heu_fpath = os.path.join(tmp_dirpath,
                                 fasta_name + '_hmm_heuristic.mod')
        with open(err_fpath, 'a') as err_file:
            ok = gmhmm_p(tool_exec_fpath, fasta_fpath, heu_fpath, out_fpath,
                         err_file, index)
            if ok:
                genes.extend(parse_gmhmm_out(out_fpath))
        return genes
    finally:
        # Also runs on the early failure return, which used to leave the
        # temporary directory behind.
        if not qconfig.debug:
            shutil.rmtree(tmp_dirpath)
Пример #12
0
def prepare_regular_quast_args(quast_py_args, combined_output_dirpath):
    """Rewrite ``quast_py_args`` in place for the regular quast.py runs.

    Drops the listed options, appends ``--no-check-meta`` and a
    ``--contig-thresholds`` value filtered by ``qconfig.min_contig``, and
    appends ``--sv-bed`` / ``--cov`` / ``--phys-cov`` for each corresponding
    non-empty file. Also overwrites ``qconfig.contig_thresholds`` with the
    comma-joined string and fills unset ``qconfig.bed``, ``qconfig.cov_fpath``
    and ``qconfig.phys_cov_fpath`` with paths under the reads-stats directory.
    """
    for opt in ('--contig-thresholds', '--sv-bed'):
        remove_from_quast_py_args(quast_py_args, opt, arg=True)
    for opt in ('-s', '--scaffolds', '--combined-ref'):
        remove_from_quast_py_args(quast_py_args, opt)

    quast_py_args += ['--no-check-meta']
    kept_thresholds = [str(threshold) for threshold in qconfig.contig_thresholds
                       if threshold >= qconfig.min_contig]
    # The literal string 'None' is passed when no threshold survives the filter.
    qconfig.contig_thresholds = ','.join(kept_thresholds) or 'None'
    quast_py_args += ['--contig-thresholds', qconfig.contig_thresholds]

    reads_stats_dirpath = os.path.join(combined_output_dirpath, qconfig.reads_stats_dirname)
    reference_name = qutils.name_from_fpath(qconfig.combined_ref_name)
    if not qconfig.bed:
        qconfig.bed = os.path.join(reads_stats_dirpath, reference_name + '.bed')
    if not qconfig.cov_fpath:
        qconfig.cov_fpath = os.path.join(reads_stats_dirpath, reference_name + '.cov')
    if not qconfig.phys_cov_fpath:
        qconfig.phys_cov_fpath = os.path.join(reads_stats_dirpath, reference_name + '.physical.cov')

    optional_inputs = (
        ('--sv-bed', qconfig.bed),
        ('--cov', qconfig.cov_fpath),
        ('--phys-cov', qconfig.phys_cov_fpath),
    )
    for flag, fpath in optional_inputs:
        if fpath and is_non_empty_file(fpath):
            quast_py_args += [flag, fpath]
Пример #13
0
def gm_es(tool_dirpath, fasta_fpath, err_fpath, index, tmp_dirpath,
          num_threads):
    """Run ``gmes_petap.pl`` in ES mode and collect genes from its GTF output.

    The working directory is ``tmp_dirpath`` with the FASTA name appended
    directly (no path separator is inserted here); it is created if missing.
    Standard output and standard error of the tool are written to
    ``err_fpath``. ``--fungus`` is added when ``qconfig.is_fungus`` is set.

    Returns the list of parsed genes, or None when the tool exits with a
    non-zero code.
    """
    tool_exec_fpath = os.path.join(tool_dirpath, 'gmes_petap.pl')
    libs_dirpath = os.path.join(qconfig.LIBS_LOCATION, 'genemark-es', 'lib')
    tmp_dirpath += qutils.name_from_fpath(fasta_fpath)

    cmd = [
        'perl', '-I', libs_dirpath, tool_exec_fpath, '--ES', '--cores',
        str(num_threads), '--sequence', fasta_fpath, '--out', tmp_dirpath
    ]
    if qconfig.is_fungus:
        cmd.append('--fungus')

    # The handle is now closed once the subprocess finishes; it used to leak.
    with open(err_fpath, 'w') as err_file:
        if not os.path.isdir(tmp_dirpath):
            os.mkdir(tmp_dirpath)
        return_code = qutils.call_subprocess(
            cmd,
            stdout=err_file,
            stderr=err_file,
            indent='    ' + qutils.index_to_str(index))
    if return_code != 0:
        return None

    genes = []
    fnames = [
        fname for (path, dirs, files) in os.walk(tmp_dirpath)
        for fname in files
    ]
    for fname in fnames:
        if fname.endswith('gtf'):
            # NOTE(review): names are collected from every sub-directory but
            # joined with the top-level directory only -- confirm whether GTF
            # files in sub-directories are expected here.
            genes.extend(parse_gtf_out(os.path.join(tmp_dirpath, fname)))
    return genes
Пример #14
0
def get_unique_covered_regions(ref_fpath, tmp_dir, log_fpath, binary_fpath, insert_size, uncovered_fpath, use_long_reads=False):
    """Mask repeats in the reference and return its unique covered regions.

    Steps: run the repeat masking binary (``binary_fpath``) unless a
    non-empty ``<ref_name>.rpt`` already exists in ``tmp_dir``; keep repeats
    at least ``insert_size`` long; extract them with bedtools and align them
    back to the reference with minimap (skipped when the coordinates file is
    already non-empty); then filter repeat instances and remove them from the
    covered regions.

    Returns ``(unique_covered_regions, repeats_regions)``, or
    ``(None, None)`` when repeat masking failed or produced no output file.
    """
    red_genome_dir = os.path.join(tmp_dir, 'tmp_red')
    if isdir(red_genome_dir):
        shutil.rmtree(red_genome_dir)
    os.makedirs(red_genome_dir)

    ref_name = qutils.name_from_fpath(ref_fpath)
    ref_symlink = os.path.join(red_genome_dir, ref_name + '.fa')  ## Red recognizes only *.fa files
    if os.path.islink(ref_symlink):
        os.remove(ref_symlink)
    os.symlink(ref_fpath, ref_symlink)

    logger.info('  ' + 'Running repeat masking tool...')
    repeats_fpath = os.path.join(tmp_dir, ref_name + '.rpt')
    if is_non_empty_file(repeats_fpath):
        return_code = 0
        logger.info('  ' + 'Using existing file ' + repeats_fpath + '...')
    else:
        # One shared handle for both streams: two independent 'w' handles on
        # the same file would overwrite each other's output and were never
        # closed.
        with open(log_fpath, 'w') as log_file:
            return_code = qutils.call_subprocess(
                [binary_fpath, '-gnm', red_genome_dir, '-rpt', tmp_dir, '-frm', '2', '-min', '5'],
                stdout=log_file, stderr=log_file, indent='    ')
    if not (return_code == 0 and repeats_fpath and exists(repeats_fpath)):
        return None, None

    long_repeats_fpath = os.path.join(tmp_dir, ref_name + '.long.rpt')
    with open(long_repeats_fpath, 'w') as out:
        with open(repeats_fpath) as in_f:
            for line in in_f:
                fields = line.split('\t')
                repeat_len = int(fields[2]) - int(fields[1])
                if repeat_len >= insert_size:
                    # The first character of the line is dropped
                    # (presumably a leading marker -- confirm against the
                    # repeat masker's output format).
                    out.write(line[1:])

    repeats_fasta_fpath = os.path.join(tmp_dir, ref_name + '.fasta')
    coords_fpath = os.path.join(tmp_dir, ref_name + '.rpt.coords.txt')
    if not is_non_empty_file(coords_fpath):
        # Remove any existing FASTA index before calling bedtools.
        fasta_index_fpath = ref_fpath + '.fai'
        if exists(fasta_index_fpath):
            os.remove(fasta_index_fpath)
        # NOTE(review): 'w' truncates the log written by the masking step
        # above (the minimap call below appends) -- confirm this is intended.
        with open(log_fpath, 'w') as log_file:
            qutils.call_subprocess([bedtools_fpath('bedtools'), 'getfasta', '-fi', ref_fpath, '-bed',
                                    long_repeats_fpath, '-fo', repeats_fasta_fpath],
                                   stderr=log_file, indent='    ')
        cmdline = [minimap_fpath(), '-c', '-x', 'asm10', '-N', '50', '--mask-level', '1', '--no-long-join', '-r', '100',
                   '-t', str(qconfig.max_threads), '-z', '200', ref_fpath, repeats_fasta_fpath]
        with open(coords_fpath, 'w') as coords_file:
            with open(log_fpath, 'a') as log_file:
                qutils.call_subprocess(cmdline, stdout=coords_file, stderr=log_file)
    filtered_repeats_fpath, repeats_regions = check_repeats_instances(coords_fpath, long_repeats_fpath, use_long_reads)
    unique_covered_regions = remove_repeat_regions(ref_fpath, filtered_repeats_fpath, uncovered_fpath)
    return unique_covered_regions, repeats_regions
Пример #15
0
def align_reference(ref_fpath, output_dir, using_reads='all', calculate_coverage=False):
    """Align reads to the reference and locate its uncovered regions file.

    Reads are aligned with ``align_single_file``. When
    ``qconfig.optimal_assembly_insert_size`` is unset or ``'auto'`` it is
    filled in, either computed from the SAM file (``using_reads == 'pe'``) or
    read from ``<ref_name>.is.txt`` (``using_reads == 'all'``). With
    ``calculate_coverage`` the mapped reads are filtered and sorted and
    coverage is computed if the uncovered-regions file is still missing.

    Returns ``(sam_fpath, bam_fpath, uncovered_fpath)``; all three are None
    when files were still required but the alignment produced no SAM file.
    """
    required_files = []
    ref_name = qutils.name_from_fpath(ref_fpath)
    cov_fpath = qconfig.cov_fpath or join(output_dir, ref_name + '.cov')
    uncovered_fpath = add_suffix(cov_fpath, 'uncovered')
    if using_reads != 'all':
        cov_fpath = add_suffix(cov_fpath, using_reads)
        uncovered_fpath = add_suffix(uncovered_fpath, using_reads)
    insert_size_fpath = join(output_dir, ref_name + '.is.txt')
    if not is_non_empty_file(uncovered_fpath):
        required_files.append(uncovered_fpath)
    if not is_non_empty_file(insert_size_fpath) and (using_reads == 'all' or using_reads == 'pe'):
        required_files.append(insert_size_fpath)

    temp_output_dir = join(output_dir, 'temp_output')
    if not isdir(temp_output_dir):
        os.makedirs(temp_output_dir)

    log_path = join(output_dir, 'reads_stats.log')
    err_fpath = join(output_dir, 'reads_stats.err')
    correct_chr_names, sam_fpath, bam_fpath = align_single_file(ref_fpath, output_dir, temp_output_dir, log_path, err_fpath,
                                                                qconfig.max_threads, sam_fpath=qconfig.reference_sam,
                                                                bam_fpath=qconfig.reference_bam, required_files=required_files,
                                                                is_reference=True, alignment_only=True, using_reads=using_reads)
    if not qconfig.optimal_assembly_insert_size or qconfig.optimal_assembly_insert_size == 'auto':
        if using_reads == 'pe' and sam_fpath:
            insert_size, _std_dev = calculate_insert_size(sam_fpath, output_dir, ref_name)
            if not insert_size:
                logger.info('  Failed calculating insert size.')
            else:
                qconfig.optimal_assembly_insert_size = insert_size
        elif using_reads == 'all' and is_non_empty_file(insert_size_fpath):
            # Best effort: an unreadable or malformed file leaves the setting
            # untouched, but only the expected errors are swallowed now and
            # the file handle is closed.
            try:
                with open(insert_size_fpath) as insert_size_file:
                    insert_size = int(insert_size_file.readline())
            except (IOError, OSError, ValueError):
                insert_size = None
            if insert_size:
                qconfig.optimal_assembly_insert_size = insert_size

    if not required_files:
        return sam_fpath, bam_fpath, uncovered_fpath
    if not sam_fpath:
        logger.info('  Failed detecting uncovered regions.')
        # Same arity as every other return, so callers that unpack three
        # values do not fail on this path.
        return None, None, None

    if calculate_coverage:
        bam_mapped_fpath = get_safe_fpath(temp_output_dir, add_suffix(bam_fpath, 'mapped'))
        bam_sorted_fpath = get_safe_fpath(temp_output_dir, add_suffix(bam_mapped_fpath, 'sorted'))

        if is_non_empty_file(bam_sorted_fpath):
            logger.info('  Using existing sorted BAM-file: ' + bam_sorted_fpath)
        else:
            sambamba_view(bam_fpath, bam_mapped_fpath, qconfig.max_threads, err_fpath, logger,  filter_rule='not unmapped')
            sort_bam(bam_mapped_fpath, bam_sorted_fpath, err_fpath, logger)
        if not is_non_empty_file(uncovered_fpath):
            get_coverage(temp_output_dir, ref_fpath, ref_name, bam_fpath, bam_sorted_fpath, log_path, err_fpath,
                         correct_chr_names, cov_fpath, uncovered_fpath=uncovered_fpath, create_cov_files=False)
    return sam_fpath, bam_fpath, uncovered_fpath
Пример #16
0
def save_combined_ref_stats(results, contigs_fpaths, ref_labels_by_chromosomes, output_dir, logger):
    """Write per-assembly interspecies translocation tables and the summary plot.

    For every assembly with translocation data an ``.info`` file is written
    to ``output_dir``: a reference-by-reference matrix followed by a legend
    mapping reference numbers to names. Per-reference totals (matrix row sum
    plus potential misassemblies, floored at 0) are collected per assembly
    and, when ``qconfig.draw_plots`` is set, passed to the plotter.
    """
    ref_misassemblies = [res['istranslocations_by_refs'] if res else [] for res in results]
    potential_misassemblies_by_refs = [res['potential_misassemblies_by_refs'] if res else [] for res in results]
    all_refs = sorted(set(ref_labels_by_chromosomes.values()))

    # First row is the header; one summary row per assembly follows.
    summary_rows = [{'metricName': 'References', 'values': all_refs}]
    if ref_misassemblies:
        for i, fpath in enumerate(contigs_fpaths):
            asm_summary = {'metricName': qutils.label_from_fpath(fpath), 'values': []}
            summary_rows.append(asm_summary)
            translocations = ref_misassemblies[i]
            if not translocations:
                continue

            assembly_name = qutils.name_from_fpath(fpath)
            matrix_rows = [{'metricName': 'References',
                            'values': [ref_num + 1 for ref_num in range(len(all_refs))]}]
            for k in all_refs:
                counts = []
                for ref in all_refs:
                    # No value on the diagonal or for references without data.
                    if ref == k or ref not in translocations:
                        counts.append(None)
                    else:
                        counts.append(translocations[ref][k])
                total = sum([c for c in counts if c]) + potential_misassemblies_by_refs[i][k]
                asm_summary['values'].append(max(0, total))
                matrix_rows.append({'metricName': k, 'values': counts})

            misassembly_by_ref_fpath = os.path.join(
                output_dir, 'interspecies_translocations_by_refs_%s.info' % assembly_name)
            with open(misassembly_by_ref_fpath, 'w') as info_file:
                info_file.write('Number of interspecies translocations by references: \n')
            print_file(matrix_rows, misassembly_by_ref_fpath, append_to_existing_file=True)

            with open(misassembly_by_ref_fpath, 'a') as info_file:
                info_file.write('References:\n')
                for ref_num, ref in enumerate(all_refs):
                    info_file.write(str(ref_num + 1) + ' - ' + ref + '\n')
            logger.info('  Information about interspecies translocations by references for %s is saved to %s' %
                        (assembly_name, misassembly_by_ref_fpath))

    misassemblies = []
    if qconfig.draw_plots:
        from quast_libs import plotter

        aligned_contigs_labels = []
        # Iterate over a copy (the slice) because empty rows are removed.
        for row in summary_rows[1:]:
            if row['values']:
                aligned_contigs_labels.append(row['metricName'])
            else:
                summary_rows.remove(row)
        for ref_idx in range(len(all_refs)):
            misassemblies.append([row['values'][ref_idx]
                                  for row in summary_rows[1:] if row['values']])
        is_translocations_plot_fpath = os.path.join(output_dir, 'intergenomic_misassemblies.' + qconfig.plot_extension)
        plotter.draw_meta_summary_plot('', output_dir, aligned_contigs_labels, all_refs, summary_rows,
                                       misassemblies, is_translocations_plot_fpath,
                                       title='Intergenomic misassemblies (found and supposed)', reverse=False,
                                       yaxis_title=None, print_all_refs=True)
Пример #17
0
    def _proceed_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references, ref_fpath):
        """Append one reference sequence to its corrected FASTA file.

        With more than one reference in total the most recently registered
        corrected file is reused; otherwise a new unique corrected path is
        created and registered. The sequence is recorded in
        ``contigs_analyzer.ref_labels_by_chromosomes`` and
        ``chromosomes_by_refs``.

        Returns ``(corr_seq_name, corr_seq_fpath)``, or ``(None, None)`` when
        checking is enabled and ``correct_seq`` returns a falsy value.
        """
        seq_fname = ref_name + ref_fasta_ext

        if total_references > 1:
            corr_seq_fpath = corrected_ref_fpaths[-1]
        else:
            corr_seq_fpath = qutils.unique_corrected_fpath(os.path.join(corrected_dirpath, seq_fname))
            corrected_ref_fpaths.append(corr_seq_fpath)

        corr_ref_label = qutils.name_from_fpath(corr_seq_fpath)
        corr_seq_name = corr_ref_label + '_' + seq_name

        # NOTE(review): the result of correct_seq() is only tested for
        # truthiness; the original ``seq`` is what gets written below --
        # confirm this is intended.
        if not qconfig.no_check and not correct_seq(seq, ref_fpath):
            return None, None

        fastaparser.write_fasta(corr_seq_fpath, [(corr_seq_name, seq)], 'a')

        contigs_analyzer.ref_labels_by_chromosomes[corr_seq_name] = corr_ref_label
        chromosomes_by_refs[ref_name].append((corr_seq_name, len(seq)))

        return corr_seq_name, corr_seq_fpath
Пример #18
0
    def _proceed_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references, ref_fpath):
        """Append one reference sequence to its corrected FASTA file.

        With more than one reference in total the most recently registered
        corrected file is reused; otherwise a new unique corrected path is
        created and registered. The sequence is recorded in
        ``contigs_analyzer.ref_labels_by_chromosomes`` and
        ``chromosomes_by_refs``.

        Returns ``(corr_seq_name, corr_seq_fpath)``, or ``(None, None)`` when
        checking is enabled and ``correct_seq`` returns a falsy value.
        """
        seq_fname = ref_name + ref_fasta_ext

        if total_references > 1:
            corr_seq_fpath = corrected_ref_fpaths[-1]
        else:
            corr_seq_fpath = qutils.unique_corrected_fpath(os.path.join(corrected_dirpath, seq_fname))
            corrected_ref_fpaths.append(corr_seq_fpath)

        corr_ref_label = qutils.name_from_fpath(corr_seq_fpath)
        corr_seq_name = corr_ref_label + '_' + seq_name

        # NOTE(review): the result of correct_seq() is only tested for
        # truthiness; the original ``seq`` is what gets written below --
        # confirm this is intended.
        if not qconfig.no_check and not correct_seq(seq, ref_fpath):
            return None, None

        fastaparser.write_fasta(corr_seq_fpath, [(corr_seq_name, seq)], 'a')

        contigs_analyzer.ref_labels_by_chromosomes[corr_seq_name] = corr_ref_label
        chromosomes_by_refs[ref_name].append((corr_seq_name, len(seq)))

        return corr_seq_name, corr_seq_fpath
Пример #19
0
def align_reference(ref_fpath, output_dir, using_reads='all'):
    """Align reads to the reference and produce its file of uncovered regions.

    The alignment itself is delegated to ``align_single_file``; the SAM/BAM
    paths it returns are stored in ``qconfig.reference_sam`` and
    ``qconfig.reference_bam``. ``qconfig.ideal_assembly_insert_size`` may be
    set as well (see below).

    Returns ``(bam_fpath, uncovered_fpath)``, or ``(None, None)`` when some
    output is still required but no SAM path was returned by the alignment.
    """
    required_files = []
    ref_name = qutils.name_from_fpath(ref_fpath)
    cov_fpath = qconfig.cov_fpath or join(output_dir, ref_name + '.cov')
    uncovered_fpath = add_suffix(cov_fpath, 'uncovered')
    # Output names get an extra suffix when only a subset of reads is used.
    if using_reads != 'all':
        cov_fpath = add_suffix(cov_fpath, using_reads)
        uncovered_fpath = add_suffix(uncovered_fpath, using_reads)
    insert_size_fpath = join(output_dir, ref_name + '.is.txt')
    # Collect the outputs that are missing (or empty) and still have to be produced.
    if not is_non_empty_file(uncovered_fpath):
        required_files.append(uncovered_fpath)
    if not is_non_empty_file(insert_size_fpath) and (using_reads == 'all' or using_reads == 'paired_end'):
        required_files.append(insert_size_fpath)

    temp_output_dir = join(output_dir, 'temp_output')
    if not isdir(temp_output_dir):
        os.makedirs(temp_output_dir)

    log_path = join(output_dir, 'reads_stats.log')
    err_fpath = join(output_dir, 'reads_stats.err')
    correct_chr_names, sam_fpath, bam_fpath = align_single_file(ref_fpath, output_dir, temp_output_dir, log_path, err_fpath,
                                                                qconfig.max_threads, sam_fpath=qconfig.reference_sam,
                                                                bam_fpath=qconfig.reference_bam, required_files=required_files,
                                                                is_reference=True, alignment_only=True, using_reads=using_reads)
    # Publish the alignment paths through the global configuration.
    qconfig.reference_sam = sam_fpath
    qconfig.reference_bam = bam_fpath

    # The insert size is computed only when it is unset or 'auto', and only for
    # paired-end reads with an available SAM file.
    if not qconfig.ideal_assembly_insert_size or qconfig.ideal_assembly_insert_size == 'auto':
        if using_reads == 'paired_end' and sam_fpath:
            insert_size = calculate_insert_size(sam_fpath, output_dir, ref_name)
            if not insert_size:
                logger.info('  Failed calculating insert size.')
            else:
                qconfig.ideal_assembly_insert_size = insert_size

    # Nothing was missing before the alignment: existing outputs are reused as is.
    if not required_files:
        return bam_fpath, uncovered_fpath
    if not sam_fpath:
        logger.info('  Failed detecting uncovered regions.')
        return None, None

    bam_mapped_fpath = get_safe_fpath(temp_output_dir, add_suffix(bam_fpath, 'mapped'))
    bam_sorted_fpath = get_safe_fpath(temp_output_dir, add_suffix(bam_mapped_fpath, 'sorted'))

    if is_non_empty_file(bam_sorted_fpath):
        logger.info('  Using existing sorted BAM-file: ' + bam_sorted_fpath)
    else:
        # Keep only mapped reads, then sort them.
        sambamba_view(bam_fpath, bam_mapped_fpath, qconfig.max_threads, err_fpath, logger,  filter_rule='not unmapped')
        sort_bam(bam_mapped_fpath, bam_sorted_fpath, err_fpath, logger)
    if not is_non_empty_file(uncovered_fpath):
        get_coverage(temp_output_dir, ref_fpath, ref_name, bam_fpath, bam_sorted_fpath, log_path, err_fpath,
                     correct_chr_names, cov_fpath, uncovered_fpath=uncovered_fpath, create_cov_files=False)
    return bam_fpath, uncovered_fpath
Пример #20
0
def group_references(chr_names, contig_names_by_refs, chromosomes_length, ref_fpath):
    """Choose the display names used to group chromosomes.

    * With a non-empty ``contig_names_by_refs`` mapping, the names are the
      mapped values for ``chr_names``, de-duplicated in first-seen order.
    * Otherwise, if the total chromosome length is below
      ``qconfig.MAX_SIZE_FOR_COMB_PLOT`` and there is more than one chromosome,
      a single name derived from ``ref_fpath`` is used.
    * Otherwise every chromosome is its own group and an identity mapping is
      created.

    Returns ``(chr_full_names, contig_names_by_refs)``.
    """
    if contig_names_by_refs:
        seen_refs = set()
        chr_full_names = []
        for ref in [contig_names_by_refs[contig] for contig in chr_names]:
            if ref not in seen_refs:
                seen_refs.add(ref)
                chr_full_names.append(ref)
    elif sum(chromosomes_length.values()) < qconfig.MAX_SIZE_FOR_COMB_PLOT and len(chr_names) > 1:
        chr_full_names = [qutils.name_from_fpath(ref_fpath)]
    else:
        chr_full_names = chr_names
        contig_names_by_refs = dict(zip(chr_names, chr_full_names))
    return chr_full_names, contig_names_by_refs
Пример #21
0
def group_references(chr_names, contig_names_by_refs, chromosomes_length, ref_fpath):
    """Return ``(chr_full_names, contig_names_by_refs)`` for grouping chromosomes.

    A non-empty ``contig_names_by_refs`` wins: its values for ``chr_names`` are
    returned without duplicates, keeping first-occurrence order.  Without it,
    several chromosomes whose total length is under
    ``qconfig.MAX_SIZE_FOR_COMB_PLOT`` collapse into one name taken from
    ``ref_fpath``; in every other case each chromosome maps to itself.
    """
    if contig_names_by_refs:
        unique_refs = []
        known = set()
        for contig in chr_names:
            ref = contig_names_by_refs[contig]
            if ref in known:
                continue
            known.add(ref)
            unique_refs.append(ref)
        return unique_refs, contig_names_by_refs

    total_length = sum(chromosomes_length.values())
    if total_length < qconfig.MAX_SIZE_FOR_COMB_PLOT and len(chr_names) > 1:
        # The (falsy) mapping is passed back unchanged in this case.
        return [qutils.name_from_fpath(ref_fpath)], contig_names_by_refs

    identity = dict((name, name) for name in chr_names)
    return chr_names, identity
Пример #22
0
def save_total_report(output_dirpath, min_contig, ref_fpath):
    """Save the total report data via ``save`` and return its result.

    The payload holds the current date, assembly labels, reference name,
    assembly order, the grouped report table, ``min_contig`` and the
    assemblies listed in ``qconfig.potential_scaffolds_assemblies``.
    """
    from quast_libs import reporting

    asm_names = []
    for asm_fpath in reporting.assembly_fpaths:
        asm_names.append(qutils.label_from_fpath(asm_fpath))
    grouped_report = reporting.table(reporting.Fields.grouped_order)
    now = datetime.datetime.now()

    if ref_fpath:
        reference_name = qutils.name_from_fpath(ref_fpath)
    else:
        reference_name = ''

    payload = {
        'date': now.strftime('%d %B %Y, %A, %H:%M:%S'),
        'assembliesNames': asm_names,
        'referenceName': reference_name,
        'order': list(range(len(asm_names))),
        'report': grouped_report,
        'minContig': min_contig,
        # An empty/false setting is stored as None.
        'assembliesWithNs': qconfig.potential_scaffolds_assemblies or None
    }
    return save(join(output_dirpath, total_report_fname), payload)
Пример #23
0
def partition_contigs(assemblies, ref_fpaths, corrected_dirpath, alignments_fpath_template, labels):
    """Partition every assembly by reference, running the work in parallel.

    Returns a pair:
      * a list of ``(ref_fpath, assemblies)`` tuples, one per reference, where
        the assemblies follow the order of ``labels`` (at most one per label);
      * a list with the second element of every worker result.
    """
    # One empty bucket per reference name is handed to each worker.
    empty_buckets = dict((qutils.name_from_fpath(ref_fpath), []) for ref_fpath in ref_fpaths)
    n_jobs = min(qconfig.max_threads, len(assemblies))
    if is_python2():
        from joblib import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed
    run_partition = delayed(parallel_partition_contigs)
    results = Parallel(n_jobs=n_jobs)(
        run_partition(asm, empty_buckets, corrected_dirpath, alignments_fpath_template)
        for asm in assemblies)

    per_assembly_dicts = [result[0] for result in results]
    assemblies_by_ref = []
    for ref_fpath in ref_fpaths:
        ref_name = qutils.name_from_fpath(ref_fpath)
        pooled = []
        for asm_dict in per_assembly_dicts:
            pooled.extend(asm_dict[ref_name])
        candidates = set(pooled)

        ordered = []
        for label in labels:  # sort by label
            match = next((asm for asm in candidates if asm.label == label), None)
            if match is not None:
                ordered.append(match)
        assemblies_by_ref.append((ref_fpath, ordered))

    not_aligned_assemblies = [result[1] for result in results]
    return assemblies_by_ref, not_aligned_assemblies
Пример #24
0
def save_total_report(output_dirpath, min_contig, ref_fpath):
    """Save the total report data via ``save`` and return its result.

    The payload holds the current date, assembly labels, reference name,
    assembly order, the grouped report table, ``min_contig`` and the
    assemblies listed in ``qconfig.potential_scaffolds_assemblies`` (``None``
    when that setting is empty).
    """
    from quast_libs import reporting
    # A real list is required: on Python 3 map() returns a one-shot iterator,
    # which enumerate() below would exhaust and which would then be stored in
    # the payload instead of the labels.
    asm_names = [qutils.label_from_fpath(fpath) for fpath in reporting.assembly_fpaths]
    report = reporting.table(reporting.Fields.grouped_order)
    t = datetime.datetime.now()

    return save(join(output_dirpath, total_report_fname), {
        'date': t.strftime('%d %B %Y, %A, %H:%M:%S'),
        'assembliesNames': asm_names,
        'referenceName': qutils.name_from_fpath(ref_fpath) if ref_fpath else '',
        'order': [i for i, _ in enumerate(asm_names)],
        'report': report,
        'minContig': min_contig,
        'assembliesWithNs': qconfig.potential_scaffolds_assemblies if qconfig.potential_scaffolds_assemblies else None
    })
Пример #25
0
def _start_quast_main(args, assemblies, reference_fpath=None, output_dirpath=None, num_notifications_tuple=None,
                      labels=None, run_regular_quast=False, is_combined_ref=False, is_parallel_run=False):
    """Run ``quast.main`` in-process on the given assemblies.

    A copy of ``args`` is extended with the assembly paths, the optional
    reference (``-R``), the optional output directory (``-o``) and the
    assembly labels (``--labels``).  The ``quast`` module is reloaded before
    each run.

    When ``is_combined_ref`` is true, ``labels`` and ``assemblies`` are updated
    in place from ``qconfig`` after the run, so both must be lists then.

    Returns ``(return_code, num_notifications_tuple)``; a truthy
    ``num_notifications_tuple`` is replaced by a list of element-wise sums with
    the notification counts of this run.
    """
    args = args[:]

    args.extend([asm.fpath for asm in assemblies])

    if reference_fpath:
        args.append('-R')
        args.append(reference_fpath)

    if output_dirpath:
        args.append('-o')
        args.append(output_dirpath)

    args.append('--labels')

    def quote(line):
        if ' ' in line:
            line = '"%s"' % line
        return line

    args.append(quote(', '.join([asm.label for asm in assemblies])))

    import quast
    # Pick the reload function first and call it outside the try block, so that
    # an error raised while reloading quast is not mistaken for a missing
    # reload implementation (the former bare ``except:`` hid such errors).
    try:
        from importlib import reload as reload_module  # Python 3.4+
    except ImportError:
        try:
            from imp import reload as reload_module  # older Python 3
        except ImportError:
            reload_module = reload  # Python 2 builtin
    reload_module(quast)
    quast.logger.set_up_console_handler(indent_val=1, debug=qconfig.debug)
    if not run_regular_quast:
        reference_name = os.path.basename(qutils.name_from_fpath(reference_fpath)) if reference_fpath else None
        quast.logger.set_up_metaquast(is_parallel_run=is_parallel_run, ref_name=reference_name)
    if is_combined_ref:
        logger.info_to_file('(logging to ' +
                        os.path.join(output_dirpath, qconfig.LOGGER_DEFAULT_NAME + '.log)'))
    return_code = quast.main(args)
    if num_notifications_tuple:
        cur_num_notifications = quast.logger.get_numbers_of_notifications()
        num_notifications_tuple = list(map(sum, zip(num_notifications_tuple, cur_num_notifications)))

    if is_combined_ref:
        # In-place updates: the caller's lists receive the values from qconfig.
        labels[:] = [qconfig.assembly_labels_by_fpath[fpath] for fpath in qconfig.assemblies_fpaths]
        assemblies[:] = [Assembly(fpath, qconfig.assembly_labels_by_fpath[fpath]) for fpath in qconfig.assemblies_fpaths]

    return return_code, num_notifications_tuple
Пример #26
0
def _start_quast_main(args, assemblies, reference_fpath=None, output_dirpath=None, num_notifications_tuple=None,
                      labels=None, run_regular_quast=False, is_combined_ref=False, is_parallel_run=False):
    """Build the quast command line, reload ``quast`` and run ``quast.main``.

    Returns ``(return_code, num_notifications_tuple)``.  A truthy
    ``num_notifications_tuple`` is replaced by the element-wise sum with the
    counts reported by the quast logger.  With ``is_combined_ref`` the
    ``labels`` and ``assemblies`` lists are rewritten in place from ``qconfig``.
    """
    quast_args = args[:]
    quast_args += [asm.fpath for asm in assemblies]

    if reference_fpath:
        quast_args += ['-R', reference_fpath]
    if output_dirpath:
        quast_args += ['-o', output_dirpath]

    # Labels go in as one comma-separated value, quoted if it contains a space.
    joined_labels = ', '.join([asm.label for asm in assemblies])
    if ' ' in joined_labels:
        joined_labels = '"%s"' % joined_labels
    quast_args += ['--labels', joined_labels]

    import quast
    try:
        import importlib
        importlib.reload(quast)
    except (ImportError, AttributeError):
        reload(quast)

    quast_logger = quast.logger
    quast_logger.set_up_console_handler(indent_val=1, debug=qconfig.debug)
    if not run_regular_quast:
        if reference_fpath:
            reference_name = os.path.basename(qutils.name_from_fpath(reference_fpath))
        else:
            reference_name = None
        quast_logger.set_up_metaquast(is_parallel_run=is_parallel_run, ref_name=reference_name)
    if is_combined_ref:
        log_location = os.path.join(output_dirpath, qconfig.LOGGER_DEFAULT_NAME + '.log)')
        logger.info_to_file('(logging to ' + log_location)

    return_code = quast.main(quast_args)

    if num_notifications_tuple:
        current_counts = quast.logger.get_numbers_of_notifications()
        num_notifications_tuple = [sum(pair) for pair in zip(num_notifications_tuple, current_counts)]

    if is_combined_ref:
        labels_by_fpath = qconfig.assembly_labels_by_fpath
        labels[:] = [labels_by_fpath[fpath] for fpath in qconfig.assemblies_fpaths]
        assemblies[:] = [Assembly(fpath, labels_by_fpath[fpath]) for fpath in qconfig.assemblies_fpaths]

    return return_code, num_notifications_tuple
Пример #27
0
def process_one_ref(cur_ref_fpath, output_dirpath, err_fpath, max_threads, bam_fpath=None, bed_fpath=None):
    """Produce a BED file for one reference by running GRIDSS on its alignment.

    Steps: convert/filter the SAM file to BAM and sort it (unless the sorted
    BAM already exists), index it, create the reference ``.fai`` file, run
    ``gridss.CallVariants`` to obtain a VCF file, and convert the VCF to BEDPE,
    which is then reformatted into ``bed_fpath``.

    An existing non-empty BED file is reused.  The function returns
    ``bed_fpath`` in all cases, including when no VCF file was produced.
    Everything the external tools print to stderr is appended to ``err_fpath``.
    """
    ref_name = qutils.name_from_fpath(cur_ref_fpath)
    if not bam_fpath:
        sam_fpath = join(output_dirpath, ref_name + '.sam')
        bam_fpath = join(output_dirpath, ref_name + '.bam')
        bam_sorted_fpath = join(output_dirpath, ref_name + '.sorted.bam')
    else:
        sam_fpath = bam_fpath.replace('.bam', '.sam')
        bam_sorted_fpath = add_suffix(bam_fpath, 'sorted')
    bed_fpath = bed_fpath or join(output_dirpath, ref_name + '.bed')
    if is_non_empty_file(bed_fpath):
        logger.info('  Using existing BED-file: ' + bed_fpath)
        return bed_fpath

    if not isfile(bam_sorted_fpath):
        sambamba_view(sam_fpath, bam_fpath, qconfig.max_threads, err_fpath, logger,  filter_rule='not unmapped')
        sort_bam(bam_fpath, bam_sorted_fpath, err_fpath, logger, threads=max_threads)
    if not is_non_empty_file(bam_sorted_fpath + '.bai'):
        # The error log is opened in a with-block so the handle is closed after the call.
        with open(err_fpath, 'a') as err_file:
            qutils.call_subprocess([sambamba_fpath('sambamba'), 'index', bam_sorted_fpath],
                                   stderr=err_file, logger=logger)
    create_fai_file(cur_ref_fpath)
    vcf_output_dirpath = join(output_dirpath, ref_name + '_gridss')
    vcf_fpath = join(vcf_output_dirpath, ref_name + '.vcf')
    if not is_non_empty_file(vcf_fpath):
        # Start from a clean working directory.
        if isdir(vcf_output_dirpath):
            shutil.rmtree(vcf_output_dirpath, ignore_errors=True)
        os.makedirs(vcf_output_dirpath)
        max_mem = get_gridss_memory()
        # The subprocess gets the bwa directory appended to its PATH.
        env = os.environ.copy()
        env["PATH"] += os.pathsep + bwa_dirpath
        bwa_index(cur_ref_fpath, err_fpath, logger)
        with open(err_fpath, 'a') as err_file:
            qutils.call_subprocess(['java', '-ea', '-Xmx' + str(max_mem) + 'g', '-Dsamjdk.create_index=true', '-Dsamjdk.use_async_io_read_samtools=true',
                                    '-Dsamjdk.use_async_io_write_samtools=true', '-Dsamjdk.use_async_io_write_tribble=true',
                                    '-cp', get_gridss_fpath(), 'gridss.CallVariants', 'I=' + bam_sorted_fpath, 'O=' + vcf_fpath,
                                    'ASSEMBLY=' + join(vcf_output_dirpath, ref_name + '.gridss.bam'), 'REFERENCE_SEQUENCE=' + cur_ref_fpath,
                                    'WORKER_THREADS=' + str(max_threads), 'WORKING_DIR=' + vcf_output_dirpath],
                                   stderr=err_file, logger=logger, env=env)
    if is_non_empty_file(vcf_fpath):
        raw_bed_fpath = add_suffix(bed_fpath, 'raw')
        filtered_bed_fpath = add_suffix(bed_fpath, 'filtered')
        with open(err_fpath, 'a') as err_file:
            qutils.call_subprocess(['java', '-cp', get_gridss_fpath(), 'au.edu.wehi.idsv.VcfBreakendToBedpe',
                                    'I=' + vcf_fpath, 'O=' + raw_bed_fpath, 'OF=' + filtered_bed_fpath, 'R=' + cur_ref_fpath,
                                    'INCLUDE_HEADER=TRUE'], stderr=err_file, logger=logger)
        reformat_bedpe(raw_bed_fpath, bed_fpath)
    return bed_fpath
Пример #28
0
def process_one_ref(cur_ref_fpath, output_dirpath, err_fpath, max_threads, bam_fpath=None, bed_fpath=None):
    """Produce a BED file for one reference by running GRIDSS on its alignment.

    An existing non-empty BED file is reused.  Otherwise the alignment is
    filtered, sorted and indexed, ``gridss.CallVariants`` is run to obtain a
    VCF file, and the VCF is converted to BEDPE and reformatted into
    ``bed_fpath``.

    Returns ``bed_fpath`` in all cases, including when no VCF file was produced.
    """
    ref_name = qutils.name_from_fpath(cur_ref_fpath)
    # Without an explicit BAM path all alignment files live in output_dirpath;
    # otherwise their paths are derived from the given BAM path.
    if not bam_fpath:
        sam_fpath = join(output_dirpath, ref_name + '.sam')
        bam_fpath = join(output_dirpath, ref_name + '.bam')
        bam_sorted_fpath = join(output_dirpath, ref_name + '.sorted.bam')
    else:
        sam_fpath = bam_fpath.replace('.bam', '.sam')
        bam_sorted_fpath = add_suffix(bam_fpath, 'sorted')
    bed_fpath = bed_fpath or join(output_dirpath, ref_name + '.bed')
    if is_non_empty_file(bed_fpath):
        logger.info('  Using existing BED-file: ' + bed_fpath)
        return bed_fpath

    if not isfile(bam_sorted_fpath):
        # Only mapped reads in proper pairs are kept before sorting.
        sambamba_view(sam_fpath, bam_fpath, qconfig.max_threads, err_fpath, logger,  filter_rule='not unmapped and proper_pair')
        sort_bam(bam_fpath, bam_sorted_fpath, err_fpath, logger, threads=max_threads)
    if not is_non_empty_file(bam_sorted_fpath + '.bai'):
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'index', bam_sorted_fpath],
                               stderr=open(err_fpath, 'a'), logger=logger)
    create_fai_file(cur_ref_fpath)
    vcf_output_dirpath = join(output_dirpath, ref_name + '_gridss')
    vcf_fpath = join(vcf_output_dirpath, ref_name + '.vcf')
    if not is_non_empty_file(vcf_fpath):
        # Start from a clean working directory.
        if isdir(vcf_output_dirpath):
            shutil.rmtree(vcf_output_dirpath, ignore_errors=True)
        os.makedirs(vcf_output_dirpath)
        max_mem = get_gridss_memory()
        # The subprocess gets the bwa directory appended to its PATH.
        env = os.environ.copy()
        env["PATH"] += os.pathsep + bwa_dirpath
        bwa_index(cur_ref_fpath, err_fpath, logger)
        qutils.call_subprocess(['java', '-ea', '-Xmx' + str(max_mem) + 'g', '-Dsamjdk.create_index=true', '-Dsamjdk.use_async_io_read_samtools=true',
                                '-Dsamjdk.use_async_io_write_samtools=true', '-Dsamjdk.use_async_io_write_tribble=true',
                                '-cp', get_gridss_fpath(), 'gridss.CallVariants', 'I=' + bam_sorted_fpath, 'O=' + vcf_fpath,
                                'ASSEMBLY=' + join(vcf_output_dirpath, ref_name + '.gridss.bam'), 'REFERENCE_SEQUENCE=' + cur_ref_fpath,
                                'WORKER_THREADS=' + str(max_threads), 'WORKING_DIR=' + vcf_output_dirpath],
                                stderr=open(err_fpath, 'a'), logger=logger, env=env)
    # The conversion to BEDPE happens only if a VCF file is available.
    if is_non_empty_file(vcf_fpath):
        raw_bed_fpath = add_suffix(bed_fpath, 'raw')
        filtered_bed_fpath = add_suffix(bed_fpath, 'filtered')
        qutils.call_subprocess(['java', '-cp', get_gridss_fpath(), 'au.edu.wehi.idsv.VcfBreakendToBedpe',
                                'I=' + vcf_fpath, 'O=' + raw_bed_fpath, 'OF=' + filtered_bed_fpath, 'R=' + cur_ref_fpath,
                                'INCLUDE_HEADER=TRUE'], stderr=open(err_fpath, 'a'), logger=logger)
        reformat_bedpe(raw_bed_fpath, bed_fpath)
    return bed_fpath
Пример #29
0
def process_one_ref(cur_ref_fpath, output_dirpath, err_path, bed_fpath=None):
    """Produce a BED file for one reference by running Manta on its alignment.

    Returns the BED file path, or ``None`` when the SAM file is smaller than
    1 MiB or when Manta's configuration step did not create ``runWorkflow.py``.
    An existing non-empty BED file is reused.
    """
    ref = qutils.name_from_fpath(cur_ref_fpath)
    ref_sam_fpath = os.path.join(output_dirpath, ref + '.sam')
    ref_bam_fpath = os.path.join(output_dirpath, ref + '.bam')
    ref_bamsorted_fpath = os.path.join(output_dirpath, ref + '.sorted.bam')
    ref_bed_fpath = bed_fpath if bed_fpath else os.path.join(output_dirpath, ref + '.bed')
    if os.path.getsize(ref_sam_fpath) < 1024 * 1024:  # TODO: make it better (small files will cause Manta crush -- "not enough reads...")
        logger.info('  SAM file is too small for Manta (%d Kb), skipping..' % (os.path.getsize(ref_sam_fpath) // 1024))
        return None
    if is_non_empty_file(ref_bed_fpath):
        logger.info('  Using existing Manta BED-file: ' + ref_bed_fpath)
        return ref_bed_fpath
    # Convert SAM to BAM and sort it, unless a sorted BAM already exists.
    if not os.path.exists(ref_bamsorted_fpath):
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads), '-h', '-S', '-f', 'bam',
                                ref_sam_fpath], stdout=open(ref_bam_fpath, 'w'), stderr=open(err_path, 'a'), logger=logger)
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'sort', '-t', str(qconfig.max_threads), ref_bam_fpath,
                                '-o', ref_bamsorted_fpath], stderr=open(err_path, 'a'), logger=logger)
    if not is_non_empty_file(ref_bamsorted_fpath + '.bai'):
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'index', ref_bamsorted_fpath],
                               stderr=open(err_path, 'a'), logger=logger)
    create_fai_file(cur_ref_fpath)
    vcfoutput_dirpath = os.path.join(output_dirpath, ref + '_manta')
    found_SV_fpath = os.path.join(vcfoutput_dirpath, 'results/variants/diploidSV.vcf.gz')
    unpacked_SV_fpath = found_SV_fpath + '.unpacked'
    if not is_non_empty_file(found_SV_fpath):
        # Start from a clean run directory.
        if os.path.exists(vcfoutput_dirpath):
            shutil.rmtree(vcfoutput_dirpath, ignore_errors=True)
        os.makedirs(vcfoutput_dirpath)
        # The configuration step is expected to create runWorkflow.py in the run directory.
        qutils.call_subprocess([config_manta_fpath, '--normalBam', ref_bamsorted_fpath,
                                '--referenceFasta', cur_ref_fpath, '--runDir', vcfoutput_dirpath],
                               stdout=open(err_path, 'a'), stderr=open(err_path, 'a'), logger=logger)
        if not os.path.exists(os.path.join(vcfoutput_dirpath, 'runWorkflow.py')):
            return None
        # The workflow is run with LC_ALL=C in its environment.
        env = os.environ.copy()
        env['LC_ALL'] = 'C'
        qutils.call_subprocess([os.path.join(vcfoutput_dirpath, 'runWorkflow.py'), '-m', 'local', '-j', str(qconfig.max_threads)],
                               stderr=open(err_path, 'a'), logger=logger, env=env)
    # Unpack the gzipped VCF next to the original file.
    if not is_non_empty_file(unpacked_SV_fpath):
        cmd = 'gunzip -c %s' % found_SV_fpath
        qutils.call_subprocess(shlex.split(cmd), stdout=open(unpacked_SV_fpath, 'w'),
                               stderr=open(err_path, 'a'), logger=logger)
    from quast_libs.manta import vcfToBedpe
    # NOTE(review): both file objects are created inline and are not closed here.
    vcfToBedpe.vcfToBedpe(open(unpacked_SV_fpath), open(ref_bed_fpath, 'w'))
    return ref_bed_fpath
Пример #30
0
def process_one_ref(cur_ref_fpath, output_dirpath, err_path, bed_fpath=None):
    """Produce a BED file for one reference by running Manta on its alignment.

    Returns the BED file path, or ``None`` when the SAM file is smaller than
    1 MiB or when Manta's configuration step did not create ``runWorkflow.py``.
    An existing non-empty BED file is reused.

    All files opened here (error log, BAM/VCF outputs, BEDPE output) are closed
    before the function returns, so the returned BED file is fully written.
    """
    ref = qutils.name_from_fpath(cur_ref_fpath)
    ref_sam_fpath = os.path.join(output_dirpath, ref + '.sam')
    ref_bam_fpath = os.path.join(output_dirpath, ref + '.bam')
    ref_bamsorted_fpath = os.path.join(output_dirpath, ref + '.sorted.bam')
    ref_bed_fpath = bed_fpath if bed_fpath else os.path.join(output_dirpath, ref + '.bed')
    if os.path.getsize(ref_sam_fpath) < 1024 * 1024:  # TODO: make it better (small files will cause Manta crush -- "not enough reads...")
        logger.info('  SAM file is too small for Manta (%d Kb), skipping..' % (os.path.getsize(ref_sam_fpath) // 1024))
        return None
    if is_non_empty_file(ref_bed_fpath):
        logger.info('  Using existing Manta BED-file: ' + ref_bed_fpath)
        return ref_bed_fpath
    # Convert SAM to BAM and sort it, unless a sorted BAM already exists.
    if not os.path.exists(ref_bamsorted_fpath):
        with open(ref_bam_fpath, 'w') as bam_out, open(err_path, 'a') as err_file:
            qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads), '-h', '-S', '-f', 'bam',
                                    ref_sam_fpath], stdout=bam_out, stderr=err_file, logger=logger)
        with open(err_path, 'a') as err_file:
            qutils.call_subprocess([sambamba_fpath('sambamba'), 'sort', '-t', str(qconfig.max_threads), ref_bam_fpath,
                                    '-o', ref_bamsorted_fpath], stderr=err_file, logger=logger)
    if not is_non_empty_file(ref_bamsorted_fpath + '.bai'):
        with open(err_path, 'a') as err_file:
            qutils.call_subprocess([sambamba_fpath('sambamba'), 'index', ref_bamsorted_fpath],
                                   stderr=err_file, logger=logger)
    create_fai_file(cur_ref_fpath)
    vcfoutput_dirpath = os.path.join(output_dirpath, ref + '_manta')
    found_SV_fpath = os.path.join(vcfoutput_dirpath, 'results/variants/diploidSV.vcf.gz')
    unpacked_SV_fpath = found_SV_fpath + '.unpacked'
    if not is_non_empty_file(found_SV_fpath):
        # Start from a clean run directory.
        if os.path.exists(vcfoutput_dirpath):
            shutil.rmtree(vcfoutput_dirpath, ignore_errors=True)
        os.makedirs(vcfoutput_dirpath)
        # Both stdout and stderr of the configuration step go to the error log.
        with open(err_path, 'a') as err_file:
            qutils.call_subprocess([get_manta_fpath(), '--normalBam', ref_bamsorted_fpath,
                                    '--referenceFasta', cur_ref_fpath, '--runDir', vcfoutput_dirpath],
                                   stdout=err_file, stderr=err_file, logger=logger)
        if not os.path.exists(os.path.join(vcfoutput_dirpath, 'runWorkflow.py')):
            return None
        # The workflow is run with LC_ALL=C in its environment.
        env = os.environ.copy()
        env['LC_ALL'] = 'C'
        with open(err_path, 'a') as err_file:
            qutils.call_subprocess([os.path.join(vcfoutput_dirpath, 'runWorkflow.py'), '-m', 'local', '-j', str(qconfig.max_threads)],
                                   stderr=err_file, logger=logger, env=env)
    # Unpack the gzipped VCF next to the original file.
    if not is_non_empty_file(unpacked_SV_fpath):
        cmd = 'gunzip -c %s' % found_SV_fpath
        with open(unpacked_SV_fpath, 'w') as vcf_out, open(err_path, 'a') as err_file:
            qutils.call_subprocess(shlex.split(cmd), stdout=vcf_out, stderr=err_file, logger=logger)
    from quast_libs.ra_utils import vcfToBedpe
    # Closing is harmless if the converter already closed the files itself.
    with open(unpacked_SV_fpath) as vcf_in, open(ref_bed_fpath, 'w') as bedpe_out:
        vcfToBedpe.vcfToBedpe(vcf_in, bedpe_out)
    return ref_bed_fpath
Пример #31
0
def run_aligner(read_fpaths, ref_fpath, sam_fpath, out_sam_fpaths, output_dir, err_fpath, max_threads, reads_type):
    """Align every reads entry to the reference and record the produced SAM files.

    Each item of ``read_fpaths`` is either a single path (string) or a pair of
    paths.  ``pacbio`` / ``nanopore`` single files are aligned with minimap,
    everything else with ``bwa mem``.  For each item a SAM file and a BAM file
    are created unless non-empty ones already exist; for ``pe`` reads the BAM
    file is additionally de-duplicated and the insert size is computed.

    Side effects: the SAM paths are appended to ``out_sam_fpaths``; if any
    insert size below ``qconfig.optimal_assembly_max_IS`` was found, the maximum
    is stored in ``qconfig.optimal_assembly_insert_size`` and written to
    ``<ref_name>.is.txt`` one level above ``output_dir``.
    """
    bwa_cmd = bwa_fpath('bwa') + ' mem -t ' + str(max_threads)
    insert_sizes = []
    for idx, reads in enumerate(read_fpaths):
        if isinstance(reads, str):
            if reads_type == 'pacbio' or reads_type == 'nanopore':
                if reads_type == 'pacbio':
                    preset = ' -ax map-pb '
                else:
                    preset = ' -ax map-ont '
                cmdline = minimap_fpath() + ' -t ' + str(max_threads) + preset + ref_fpath + ' ' + reads
            else:
                cmdline = bwa_cmd + (' -p ' if reads_type == 'pe' else ' ') + ref_fpath + ' ' + reads
        else:
            read1, read2 = reads
            cmdline = bwa_cmd + ' ' + ref_fpath + ' ' + read1 + ' ' + read2
        output_fpath = add_suffix(sam_fpath, reads_type + str(idx + 1))
        bam_fpath = output_fpath.replace('.sam', '.bam')
        if not is_non_empty_file(output_fpath):
            # with-blocks make sure the SAM output and error log handles are closed.
            with open(output_fpath, 'w') as sam_out, open(err_fpath, 'a') as err_file:
                qutils.call_subprocess(shlex.split(cmdline), stdout=sam_out, stderr=err_file, logger=logger)
        if not is_non_empty_file(bam_fpath):
            sambamba_view(output_fpath, bam_fpath, max_threads, err_fpath, logger, filter_rule=None)
            if reads_type == 'pe':
                bam_dedup_fpath = add_suffix(bam_fpath, 'dedup')
                with open(err_fpath, 'a') as err_file:
                    qutils.call_subprocess([sambamba_fpath('sambamba'), 'markdup', '-r', '-t', str(max_threads), '--tmpdir',
                                            output_dir, bam_fpath, bam_dedup_fpath],
                                           stderr=err_file, logger=logger)
                # The de-duplicated file replaces the original BAM file.
                if exists(bam_dedup_fpath):
                    shutil.move(bam_dedup_fpath, bam_fpath)
        if reads_type == 'pe':
            insert_size, _ = calculate_insert_size(output_fpath, output_dir, basename(sam_fpath))
            # NOTE(review): presumably calculate_insert_size yields None on failure;
            # comparing None with a number raises TypeError on Python 3, so skip it.
            if insert_size is not None and insert_size < qconfig.optimal_assembly_max_IS:
                insert_sizes.append(insert_size)
        out_sam_fpaths.append(output_fpath)

    if insert_sizes:
        qconfig.optimal_assembly_insert_size = max(insert_sizes)
        ref_name = qutils.name_from_fpath(ref_fpath)
        insert_size_fpath = join(output_dir, '..', ref_name + '.is.txt')
        with open(insert_size_fpath, 'w') as out:
            out.write(str(qconfig.optimal_assembly_insert_size))
Пример #32
0
def save_total_report(output_dirpath, min_contig, ref_fpath):
    """Save the total report data, including per-reference subreports.

    Subreports are built only when ``qconfig.is_combined_ref`` is set and
    ``ref_labels_by_chromosomes`` is non-empty: one grouped table per distinct
    reference label, in sorted label order.  Returns the result of ``save``.
    """
    from quast_libs import reporting

    asm_names = []
    for asm_fpath in reporting.assembly_fpaths:
        asm_names.append(qutils.label_from_fpath(asm_fpath))
    grouped_report = reporting.table(reporting.Fields.grouped_order)

    ref_names = []
    subreports = []
    if qconfig.is_combined_ref and ref_labels_by_chromosomes:
        ref_names = sorted(set(ref_labels_by_chromosomes.values()))
        for ref_name in ref_names:
            subreports.append(reporting.table(reporting.Fields.grouped_order, ref_name=ref_name))

    now = datetime.datetime.now()
    if ref_fpath:
        reference_name = qutils.name_from_fpath(ref_fpath)
    else:
        reference_name = ''

    payload = {
        'date': now.strftime('%d %B %Y, %A, %H:%M:%S'),
        'assembliesNames': asm_names,
        'referenceName': reference_name,
        'order': list(range(len(asm_names))),
        'report': grouped_report,
        'subreferences': ref_names,
        'subreports': subreports,
        'minContig': min_contig
    }
    return save(join(output_dirpath, total_report_fname), payload)
Пример #33
0
def gm_es(tool_dirpath, fasta_fpath, err_fpath, index, tmp_dirpath, num_threads):
    """Run GeneMark-ES (``gmes_petap.pl --ES``) on a FASTA file and parse its genes.

    The tool writes into ``tmp_dirpath`` with the FASTA-derived name appended
    directly (no path separator is inserted).  Its stdout and stderr are both
    written to ``err_fpath``, which is truncated first.

    Returns the list of genes parsed from the produced ``*gtf`` files, or
    ``None`` when the tool exits with a non-zero return code.
    """
    tool_exec_fpath = os.path.join(tool_dirpath, 'gmes_petap.pl')
    libs_dirpath = os.path.join(qconfig.LIBS_LOCATION, 'genemark-es', 'lib')
    # The with-block closes the log file once the tool has finished.
    with open(err_fpath, 'w') as err_file:
        tmp_dirpath += qutils.name_from_fpath(fasta_fpath)
        if not os.path.isdir(tmp_dirpath):
            os.mkdir(tmp_dirpath)
        return_code = qutils.call_subprocess(
            ['perl', '-I', libs_dirpath, tool_exec_fpath, '--ES', '--cores', str(num_threads), '--sequence', fasta_fpath,
             '--out', tmp_dirpath],
            stdout=err_file,
            stderr=err_file,
            indent='    ' + qutils.index_to_str(index))
    if return_code != 0:
        return
    genes = []
    # NOTE(review): os.walk also yields files from subdirectories, yet every name
    # is joined with the top-level directory below -- confirm this is intended.
    fnames = [fname for (path, dirs, files) in os.walk(tmp_dirpath) for fname in files]
    for fname in fnames:
        if fname.endswith('gtf'):
            genes.extend(parse_gtf_out(os.path.join(tmp_dirpath, fname)))
    return genes
Пример #34
0
def _run_quast_per_ref(quast_py_args, output_dirpath_per_ref, ref_fpath, ref_assemblies, total_num_notifications, is_parallel_run=False):
    """Run quast for the assemblies aligned to a single reference.

    Returns ``(ref_name, json_text, total_num_notifications)``.  When
    ``ref_assemblies`` is empty nothing is run and the first two items are
    ``None``.  ``json_text`` is taken from ``json_saver`` only if
    ``qconfig.html_report`` is set, otherwise it is ``None``.
    """
    ref_name = qutils.name_from_fpath(ref_fpath)
    if not ref_assemblies:
        logger.main_info('\nNo contigs were aligned to the reference ' + ref_name + ', skipping..')
        return None, None, total_num_notifications

    output_dirpath = os.path.join(output_dirpath_per_ref, ref_name)
    run_name = 'for the contigs aligned to ' + ref_name
    log_location = os.path.join(output_dirpath, qconfig.LOGGER_DEFAULT_NAME)
    logger.main_info('\nStarting quast.py ' + run_name + '... (logging to ' + log_location + '.log)')

    _return_code, total_num_notifications = _start_quast_main(
        quast_py_args,
        assemblies=ref_assemblies,
        reference_fpath=ref_fpath,
        output_dirpath=output_dirpath,
        num_notifications_tuple=total_num_notifications,
        is_parallel_run=is_parallel_run)

    if not qconfig.html_report:
        return ref_name, None, total_num_notifications

    from quast_libs.html_saver import json_saver
    return ref_name, json_saver.json_text, total_num_notifications
Пример #35
0
def analyse_coverage(output_dirpath, fpath, chr_names, bam_fpath, stats_fpath, err_fpath, logger):
    """Compute depth statistics from a BAM file and append them to ``stats_fpath``.

    The BAM file is converted to BED, a ``.genomecov`` file is produced, and its
    ``genome`` rows are used to derive the average depth and, for each value in
    ``qconfig.coverage_thresholds``, the fraction of bases with at least that
    depth (written as a percentage).
    """
    filename = qutils.name_from_fpath(fpath)
    bed_fpath = bam_to_bed(output_dirpath, filename, bam_fpath, err_fpath, logger)
    chr_len_fpath = get_chr_len_fpath(fpath, chr_names)
    cov_fpath = join(output_dirpath, filename + '.genomecov')
    calculate_genome_cov(bed_fpath, cov_fpath, chr_len_fpath, err_fpath, logger, print_all_positions=False)

    thresholds = qconfig.coverage_thresholds
    mean_depth = 0
    fraction_at_or_above = [0] * len(thresholds)
    with open(cov_fpath) as cov_file:
        for record in cov_file:
            # genome; depth; number of bases; size of genome; fraction of bases with depth
            fields = record.split()
            depth = int(fields[1])
            fraction = float(fields[4])
            if fields[0] != 'genome':
                continue
            mean_depth += depth * fraction
            for pos, threshold in enumerate(thresholds):
                if depth >= threshold:
                    fraction_at_or_above[pos] += fraction

    with open(stats_fpath, 'a') as stats_file:
        stats_file.write('%s depth\n' % int(mean_depth))
        for threshold, fraction in zip(thresholds, fraction_at_or_above):
            stats_file.write('%.2f coverage >= %sx\n' % (fraction * 100, threshold))
Пример #36
0
def align_ideal_assembly(ref_fpath, assembly_fpath, output_dir, log_fpath, err_fpath):
    """Align an assembly to the reference and compute the uncovered regions.

    Intermediate SAM/BAM files are named after the assembly file and placed in
    ``output_dir``; existing non-empty BAM files are reused.

    Returns the path of the uncovered-regions file passed to ``get_coverage``.
    """
    sam_fpath = join(output_dir, basename(assembly_fpath) + '.sam')
    bam_fpath = sam_fpath.replace('.sam', '.bam')
    bam_mapped_fpath = add_suffix(bam_fpath, 'mapped')
    bam_sorted_fpath = add_suffix(bam_fpath, 'sorted')
    # Align with bwa mem and convert the SAM output to BAM.
    if not is_non_empty_file(bam_fpath):
        bwa_index(ref_fpath, err_fpath, logger)
        qutils.call_subprocess([bwa_fpath('bwa'), 'mem', '-t', str(qconfig.max_threads), ref_fpath, assembly_fpath],
                               stdout=open(sam_fpath, 'w'), stderr=open(err_fpath, 'a'), logger=logger)
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads), '-h', '-f', 'bam',
                                '-S', sam_fpath], stdout=open(bam_fpath, 'w'), stderr=open(err_fpath, 'a'), logger=logger)
    # Keep only mapped records and sort them.
    if not is_non_empty_file(bam_sorted_fpath):
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads), '-h', '-f', 'bam',
                                '-F', 'not unmapped', bam_fpath],
                               stdout=open(bam_mapped_fpath, 'w'), stderr=open(err_fpath, 'a'), logger=logger)
        sort_bam(bam_mapped_fpath, bam_sorted_fpath, err_fpath, logger)
    cov_fpath = join(output_dir, basename(assembly_fpath) + '.cov')
    uncovered_fpath = add_suffix(cov_fpath, 'uncovered')
    ref_name = qutils.name_from_fpath(ref_fpath)
    # NOTE(review): sam_fpath is used here even when the BAM file was reused and
    # no SAM file was written in this call -- confirm it is expected to exist.
    correct_chr_names = get_correct_names_for_chroms(output_dir, ref_fpath, sam_fpath, err_fpath, assembly_fpath, logger)
    get_coverage(output_dir, ref_fpath, ref_name, bam_fpath, bam_sorted_fpath, log_fpath, err_fpath,
                 correct_chr_names, cov_fpath, uncovered_fpath=uncovered_fpath, create_cov_files=False)
    return uncovered_fpath
Пример #37
0
def analyse_coverage(output_dirpath, fpath, chr_names, bam_fpath, stats_fpath, err_fpath, logger):
    """Append average depth and per-threshold coverage lines to ``stats_fpath``.

    The BAM file is converted to BED, a ``<name>.genomecov`` file is produced
    from it in ``output_dirpath``, and the rows whose first column is
    ``genome`` are summarised: the depth-weighted average and, for each value
    in ``qconfig.coverage_thresholds``, the fraction of bases whose depth is at
    least that value.
    """
    name = qutils.name_from_fpath(fpath)
    bed_fpath = bam_to_bed(output_dirpath, name, bam_fpath, err_fpath, logger)
    chr_len_fpath = get_chr_len_fpath(fpath, chr_names)
    cov_fpath = join(output_dirpath, name + '.genomecov')
    calculate_genome_cov(bed_fpath, cov_fpath, chr_len_fpath, err_fpath, logger, print_all_positions=False)

    thresholds = qconfig.coverage_thresholds
    mean_depth = 0
    fraction_at_least = [0] * len(thresholds)
    with open(cov_fpath) as cov_file:
        for row in cov_file:
            # columns: genome; depth; number of bases; size of genome; fraction of bases with depth
            columns = row.split()
            depth = int(columns[1])
            fraction = float(columns[4])
            if columns[0] != 'genome':
                continue
            mean_depth += depth * fraction
            for pos, threshold in enumerate(thresholds):
                if depth >= threshold:
                    fraction_at_least[pos] += fraction

    with open(stats_fpath, 'a') as stats_file:
        stats_file.write('%s depth\n' % int(mean_depth))
        for covered, threshold in zip(fraction_at_least, thresholds):
            stats_file.write('%.2f coverage >= %sx\n' % (covered * 100, threshold))
Пример #38
0
def run_processing_reads(contigs_fpaths, main_ref_fpath, meta_ref_fpaths, ref_labels, temp_output_dir, output_dir,
                         log_path, err_fpath):
    """Align reads to every assembly (and the reference) and derive SV / coverage files.

    Each path in ``contigs_fpaths``, plus ``main_ref_fpath`` when it is given, is
    processed by ``align_single_file`` through ``run_parallel``. For the reference
    the function then builds the sorted SAM/BAM files, the coverage files (only
    when ``qconfig.create_icarus_html`` is set) and the BED file with structural
    variations (unless ``qconfig.no_sv`` is set).

    Side effects: updates ``qconfig.sam_fpaths``, ``qconfig.bam_fpaths``,
    ``qconfig.reference_sam`` and ``qconfig.reference_bam``, may set
    ``qconfig.no_sv``, and fills the module-level ``ref_sam_fpaths`` mapping when
    ``meta_ref_fpaths`` is given.

    :return: tuple ``(bed_fpath, cov_fpath, physical_cov_fpath)``; any item may be
        None. ``(None, None, None)`` is returned when there is no reference or the
        reference alignment produced no SAM/BAM file.
    """
    required_files = []
    bed_fpath, cov_fpath, physical_cov_fpath = None, None, None
    if main_ref_fpath:
        ref_name = qutils.name_from_fpath(main_ref_fpath)

        # User-supplied files take precedence over the default locations.
        bed_fpath = qconfig.bed or join(output_dir, ref_name + '.bed')
        cov_fpath = qconfig.cov_fpath or join(output_dir, ref_name + '.cov')
        physical_cov_fpath = qconfig.phys_cov_fpath or join(output_dir, ref_name + '.physical.cov')
        required_files = [bed_fpath, cov_fpath, physical_cov_fpath]

        if qconfig.no_sv:
            logger.info('  Will not search Structural Variations (--fast or --no-sv is specified)')
            bed_fpath = None
        elif is_non_empty_file(bed_fpath):
            logger.info('  Using existing BED-file: ' + bed_fpath)
        elif not qconfig.forward_reads and not qconfig.interlaced_reads:
            if not qconfig.reference_sam and not qconfig.reference_bam:
                logger.info('  Will not search Structural Variations (needs paired-end reads)')
                bed_fpath = None
                qconfig.no_sv = True
        if qconfig.create_icarus_html:
            if is_non_empty_file(cov_fpath):
                is_correct_file = check_cov_file(cov_fpath)
                if is_correct_file:
                    logger.info('  Using existing reads coverage file: ' + cov_fpath)
            if is_non_empty_file(physical_cov_fpath):
                logger.info('  Using existing physical coverage file: ' + physical_cov_fpath)
        else:
            logger.info('  Will not calculate coverage (--fast or --no-html, or --no-icarus, or --space-efficient is specified)')
            cov_fpath = None
            physical_cov_fpath = None
        # Nothing has to be produced for the reference when every requested output already exists.
        if (is_non_empty_file(bed_fpath) or qconfig.no_sv) and \
                (not qconfig.create_icarus_html or (is_non_empty_file(cov_fpath) and is_non_empty_file(physical_cov_fpath))):
            required_files = []

    # One job per assembly plus one for the reference; threads are split evenly between jobs.
    n_jobs = min(qconfig.max_threads, len(contigs_fpaths) + 1)
    max_threads_per_job = max(1, qconfig.max_threads // n_jobs)
    sam_fpaths = qconfig.sam_fpaths or [None] * len(contigs_fpaths)
    bam_fpaths = qconfig.bam_fpaths or [None] * len(contigs_fpaths)
    parallel_align_args = [(contigs_fpath, output_dir, temp_output_dir, log_path, err_fpath, max_threads_per_job,
                            sam_fpaths[index], bam_fpaths[index], index) for index, contigs_fpath in enumerate(contigs_fpaths)]
    if main_ref_fpath:
        # The reference job is appended last, so its results are the last items of the returned lists.
        parallel_align_args.append((main_ref_fpath, output_dir, temp_output_dir, log_path, err_fpath,
                                    max_threads_per_job, qconfig.reference_sam, qconfig.reference_bam, None, required_files, True))
    correct_chr_names, sam_fpaths, bam_fpaths = run_parallel(align_single_file, parallel_align_args, n_jobs)
    qconfig.sam_fpaths = sam_fpaths[:len(contigs_fpaths)]
    qconfig.bam_fpaths = bam_fpaths[:len(contigs_fpaths)]
    add_statistics_to_report(output_dir, contigs_fpaths, main_ref_fpath)
    save_reads(output_dir)
    if not main_ref_fpath:
        return None, None, None

    correct_chr_names = correct_chr_names[-1]
    sam_fpath, bam_fpath = sam_fpaths[-1], bam_fpaths[-1]
    qconfig.reference_sam = sam_fpath
    qconfig.reference_bam = bam_fpath
    if not required_files:
        return bed_fpath, cov_fpath, physical_cov_fpath
    if not all([sam_fpath, bam_fpath]):
        logger.info('  Failed searching structural variations.')
        return None, None, None

    sam_sorted_fpath = get_safe_fpath(temp_output_dir, add_suffix(sam_fpath, 'sorted'))
    bam_mapped_fpath = get_safe_fpath(temp_output_dir, add_suffix(bam_fpath, 'mapped'))
    bam_sorted_fpath = get_safe_fpath(temp_output_dir, add_suffix(bam_mapped_fpath, 'sorted'))

    if is_non_empty_file(sam_sorted_fpath):
        logger.info('  Using existing sorted SAM-file: ' + sam_sorted_fpath)
    else:
        if not is_non_empty_file(bam_sorted_fpath):
            sambamba_view(bam_fpath, bam_mapped_fpath, qconfig.max_threads, err_fpath, logger,  filter_rule='not unmapped')
            sort_bam(bam_mapped_fpath, bam_sorted_fpath, err_fpath, logger)
        sambamba_view(bam_sorted_fpath, sam_sorted_fpath, qconfig.max_threads, err_fpath, logger)
    if qconfig.create_icarus_html and (not is_non_empty_file(cov_fpath) or not is_non_empty_file(physical_cov_fpath)):
        cov_fpath, physical_cov_fpath = get_coverage(temp_output_dir, main_ref_fpath, ref_name, bam_fpath, bam_sorted_fpath,
                                                     log_path, err_fpath, correct_chr_names, cov_fpath, physical_cov_fpath)
    if not is_non_empty_file(bed_fpath) and not qconfig.no_sv:
        if meta_ref_fpaths:
            logger.info('  Splitting SAM-file by references...')
        # Collect the SAM header lines and the length of every sequence declared in @SQ records.
        headers = []
        seq_lengths = {}
        with open(sam_fpath) as sam_file:
            for line in sam_file:
                if not line.startswith('@'):
                    break
                if line.startswith('@SQ') and 'SN:' in line and 'LN:' in line:
                    seq_name = line.split('\tSN:')[1].split('\t')[0]
                    seq_length = int(line.split('\tLN:')[1].split('\t')[0])
                    seq_lengths[seq_name] = seq_length
                headers.append(line.strip())
        need_ref_splitting = False
        ref_files = {}
        if meta_ref_fpaths:
            global ref_sam_fpaths
            for cur_ref_fpath in meta_ref_fpaths:
                cur_ref_name = qutils.name_from_fpath(cur_ref_fpath)
                ref_sam_fpath = join(temp_output_dir, cur_ref_name + '.sam')
                ref_sam_fpaths[cur_ref_fpath] = ref_sam_fpath
                if is_non_empty_file(ref_sam_fpath):
                    logger.info('    Using existing split SAM-file for %s: %s' % (cur_ref_name, ref_sam_fpath))
                    ref_files[cur_ref_name] = None
                else:
                    # The handle is left open on purpose: it is handed to search_trivial_deletions
                    # through ref_files. NOTE(review): presumably closed there - confirm.
                    ref_sam_file = open(ref_sam_fpath, 'w')
                    # Write the first header line (unless it is an @SQ record), the @SQ records
                    # of this reference only, and the last header line.
                    if not headers[0].startswith('@SQ'):
                        ref_sam_file.write(headers[0] + '\n')
                    for h in (h for h in headers if h.startswith('@SQ') and 'SN:' in h):
                        seq_name = h.split('\tSN:')[1].split('\t')[0]
                        if seq_name in ref_labels and ref_labels[seq_name] == cur_ref_name:
                            ref_sam_file.write(h + '\n')
                    ref_sam_file.write(headers[-1] + '\n')
                    ref_files[cur_ref_name] = ref_sam_file
                    need_ref_splitting = True

        trivial_deletions_fpath = \
            search_trivial_deletions(temp_output_dir, sam_sorted_fpath, ref_files, ref_labels, seq_lengths, need_ref_splitting)
        if get_gridss_fpath() and isfile(get_gridss_fpath()):
            try:
                gridss_sv_fpath = search_sv_with_gridss(main_ref_fpath, bam_mapped_fpath, meta_ref_fpaths, temp_output_dir, err_fpath)
                qutils.cat_files([gridss_sv_fpath, trivial_deletions_fpath], bed_fpath)
            except Exception:
                # GRIDSS is best-effort, so a failure must not abort the run. Only Exception is
                # caught so that KeyboardInterrupt / SystemExit still propagate.
                logger.info('  Failed searching structural variations with GRIDSS.')
        # Fall back to the trivial deletions alone when no BED file was produced above.
        if isfile(trivial_deletions_fpath) and not is_non_empty_file(bed_fpath):
            shutil.copy(trivial_deletions_fpath, bed_fpath)

    if not qconfig.no_sv:
        if is_non_empty_file(bed_fpath):
            logger.main_info('  Structural variations are in ' + bed_fpath)
        else:
            if isfile(bed_fpath):
                logger.main_info('  No structural variations were found.')
            else:
                logger.main_info('  Failed searching structural variations.')
            bed_fpath = None
    if is_non_empty_file(cov_fpath):
        logger.main_info('  Coverage distribution along the reference genome is in ' + cov_fpath)
    else:
        # NOTE(review): this reports a failure only when coverage was NOT requested; the
        # condition looks inverted, but it is kept as is - confirm the intent.
        if not qconfig.create_icarus_html:
            logger.main_info('  Failed to calculate coverage distribution')
        cov_fpath = None
    return bed_fpath, cov_fpath, physical_cov_fpath
Пример #39
0
def _run_flagstat(bam_fpath, stats_fpath, err_fpath, max_threads):
    """Write ``sambamba flagstat`` output for ``bam_fpath`` to ``stats_fpath`` (stderr is appended to ``err_fpath``)."""
    with open(stats_fpath, 'w') as stats_out, open(err_fpath, 'a') as err_out:
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'flagstat', '-t', str(max_threads), bam_fpath],
                               stdout=stats_out, stderr=err_out)


def align_single_file(fpath, main_output_dir, output_dirpath, log_path, err_fpath, max_threads, sam_fpath=None, bam_fpath=None,
                      index=None, required_files=None, is_reference=False, alignment_only=False, using_reads='all'):
    """Align reads to a single assembly or reference file and collect read statistics.

    Existing SAM/BAM/statistics files are reused when possible; otherwise BWA is
    run on ``qconfig.reads_fpaths``. Unless ``alignment_only`` is set, flag
    statistics and coverage are written to a ``.stat`` file and
    ``calc_lap_score`` is invoked.

    :param fpath: FASTA file to align the reads to.
    :param main_output_dir: directory forwarded to ``align_reads`` / ``merge_sam_files``.
    :param output_dirpath: directory for SAM/BAM files; BWA is run with it as the working directory.
    :param log_path: log file mentioned in the BWA failure message.
    :param err_fpath: file that collects stderr of the external tools.
    :param max_threads: number of threads given to the external tools.
    :param sam_fpath: existing SAM file, if any.
    :param bam_fpath: existing BAM file, if any.
    :param index: index of the assembly, used only to prefix log messages.
    :param required_files: files that must exist for the early "reuse" return;
        the SAM path is appended to this list in place when it is needed.
    :param is_reference: True when ``fpath`` is the reference.
    :param alignment_only: skip statistics and only produce SAM/BAM files.
    :param using_reads: reads subset label; any value other than ``'all'`` is embedded in the file names.
    :return: ``(correct_chr_names, sam_fpath, bam_fpath)``, or ``(None, None, None)``
        when no alignment could be produced.
    """
    filename = qutils.name_from_fpath(fpath)
    if not sam_fpath and bam_fpath:
        sam_fpath = get_safe_fpath(output_dirpath, bam_fpath[:-4] + '.sam')
    else:
        sam_fpath = sam_fpath or join(output_dirpath, filename + '.sam')
    bam_fpath = bam_fpath or get_safe_fpath(output_dirpath, sam_fpath[:-4] + '.bam')
    if using_reads != 'all':
        sam_fpath = join(output_dirpath, filename + '.' + using_reads + '.sam')
        bam_fpath = sam_fpath.replace('.sam', '.bam')
    if alignment_only or (is_reference and required_files and any(f.endswith('bed') for f in required_files)):
        # required_files defaults to None; without this guard alignment_only=True crashed on .append.
        if required_files is None:
            required_files = []
        required_files.append(sam_fpath)

    stats_fpath = get_safe_fpath(dirname(output_dirpath), filename + '.stat')
    index_str = qutils.index_to_str(index) if index is not None else ''

    reads_fpaths = qconfig.reads_fpaths
    correct_chr_names = get_correct_names_for_chroms(output_dirpath, fpath, sam_fpath, err_fpath, reads_fpaths, logger, is_reference)
    # A non-None result is treated as "the existing alignment files may be reused".
    can_reuse = correct_chr_names is not None
    if not can_reuse and not reads_fpaths:
        return None, None, None
    if correct_chr_names and (not required_files or all(isfile(f) for f in required_files)):
        if not alignment_only:
            if isfile(stats_fpath):
                logger.info('  ' + index_str + 'Using existing flag statistics file ' + stats_fpath)
            elif isfile(bam_fpath):
                _run_flagstat(bam_fpath, stats_fpath, err_fpath, max_threads)
                analyse_coverage(output_dirpath, fpath, correct_chr_names, bam_fpath, stats_fpath, err_fpath, logger)
            calc_lap_score(reads_fpaths, sam_fpath, index, index_str, output_dirpath, fpath, filename, err_fpath)
        if isfile(stats_fpath) or alignment_only:
            return correct_chr_names, sam_fpath, bam_fpath

    logger.info('  ' + index_str + 'Pre-processing reads...')
    if is_non_empty_file(sam_fpath) and can_reuse:
        logger.info('  ' + index_str + 'Using existing SAM-file: ' + sam_fpath)
        correct_chr_names = get_correct_names_for_chroms(output_dirpath, fpath, sam_fpath, err_fpath, reads_fpaths, logger, is_reference)
    elif is_non_empty_file(bam_fpath) and can_reuse:
        logger.info('  ' + index_str + 'Using existing BAM-file: ' + bam_fpath)
        sambamba_view(bam_fpath, sam_fpath, qconfig.max_threads, err_fpath, logger)
        correct_chr_names = get_correct_names_for_chroms(output_dirpath, fpath, sam_fpath, err_fpath, reads_fpaths, logger, is_reference)
    if (not correct_chr_names or not is_non_empty_file(sam_fpath)) and reads_fpaths:
        if is_reference:
            logger.info('  Running BWA for reference...')
        else:
            logger.info('  ' + index_str + 'Running BWA...')
        # use absolute paths because we will change workdir
        fpath = abspath(fpath)
        sam_fpath = abspath(sam_fpath)

        prev_dir = os.getcwd()
        os.chdir(output_dirpath)
        try:
            bwa_index(fpath, err_fpath, logger)
            sam_fpaths = align_reads(fpath, sam_fpath, using_reads, main_output_dir, err_fpath, max_threads)

            if len(sam_fpaths) > 1:
                merge_sam_files(sam_fpaths, sam_fpath, bam_fpath, main_output_dir, max_threads, err_fpath)
            elif len(sam_fpaths) == 1:
                shutil.move(sam_fpaths[0], sam_fpath)
                sambamba_view(sam_fpath, bam_fpath, max_threads, err_fpath, logger, filter_rule=None)

            logger.info('  ' + index_str + 'Done.')
        finally:
            # Always restore the working directory, even when an external tool raises.
            os.chdir(prev_dir)
        if not is_non_empty_file(sam_fpath):
            logger.error('  Failed running BWA for ' + fpath + '. See ' + log_path + ' for information.')
            return None, None, None
        correct_chr_names = get_correct_names_for_chroms(output_dirpath, fpath, sam_fpath, err_fpath, reads_fpaths, logger, is_reference)
    elif not correct_chr_names or not is_non_empty_file(sam_fpath):
        return None, None, None
    if is_reference:
        logger.info('  Sorting SAM-file for reference...')
    else:
        logger.info('  ' + index_str + 'Sorting SAM-file...')

    if can_reuse and is_non_empty_file(bam_fpath) and all_read_names_correct(sam_fpath):
        logger.info('  ' + index_str + 'Using existing BAM-file: ' + bam_fpath)
    else:
        correct_sam_fpath = join(output_dirpath, filename + '.correct.sam')  # write in output dir
        sam_fpath = clean_read_names(sam_fpath, correct_sam_fpath)
        sambamba_view(correct_sam_fpath, bam_fpath, max_threads, err_fpath, logger, filter_rule=None)

    qutils.assert_file_exists(bam_fpath, 'bam file')
    if not alignment_only:
        if isfile(stats_fpath):
            logger.info('  ' + index_str + 'Using existing flag statistics file ' + stats_fpath)
        elif isfile(bam_fpath):
            _run_flagstat(bam_fpath, stats_fpath, err_fpath, max_threads)
            analyse_coverage(output_dirpath, fpath, correct_chr_names, bam_fpath, stats_fpath, err_fpath, logger)
        calc_lap_score(reads_fpaths, sam_fpath, index, index_str, output_dirpath, fpath, filename, err_fpath)
        if is_reference:
            logger.info('  Analysis for reference is finished.')
        else:
            logger.info('  ' + index_str + 'Analysis is finished.')
    return correct_chr_names, sam_fpath, bam_fpath
Пример #40
0
def run_processing_reads(contigs_fpaths, main_ref_fpath, meta_ref_fpaths, ref_labels, temp_output_dir, output_dir,
                         log_path, err_fpath):
    """Align reads to every assembly (and the reference) and derive SV / coverage files.

    Each path in ``contigs_fpaths``, plus ``main_ref_fpath`` when it is given, is
    processed by ``align_single_file`` through ``run_parallel``. For the reference
    the function then builds the sorted SAM/BAM files, the coverage files (only
    when ``qconfig.create_icarus_html`` is set) and the BED file with structural
    variations (unless ``qconfig.no_sv`` is set).

    Side effects: updates ``qconfig.sam_fpaths``, ``qconfig.bam_fpaths``,
    ``qconfig.reference_sam`` and ``qconfig.reference_bam``, may set
    ``qconfig.no_sv``, and fills the module-level ``ref_sam_fpaths`` mapping when
    ``meta_ref_fpaths`` is given.

    :return: tuple ``(bed_fpath, cov_fpath, physical_cov_fpath)``; any item may be
        None. ``(None, None, None)`` is returned when there is no reference or the
        reference alignment produced no SAM/BAM file.
    """
    required_files = []
    bed_fpath, cov_fpath, physical_cov_fpath = None, None, None
    if main_ref_fpath:
        ref_name = qutils.name_from_fpath(main_ref_fpath)

        # User-supplied files take precedence over the default locations.
        bed_fpath = qconfig.bed or join(output_dir, ref_name + '.bed')
        cov_fpath = qconfig.cov_fpath or join(output_dir, ref_name + '.cov')
        physical_cov_fpath = qconfig.phys_cov_fpath or join(output_dir, ref_name + '.physical.cov')
        required_files = [bed_fpath, cov_fpath, physical_cov_fpath]

        if qconfig.no_sv:
            logger.info('  Will not search Structural Variations (--fast or --no-sv is specified)')
            bed_fpath = None
        elif is_non_empty_file(bed_fpath):
            logger.info('  Using existing BED-file: ' + bed_fpath)
        elif not qconfig.forward_reads and not qconfig.interlaced_reads:
            if not qconfig.reference_sam and not qconfig.reference_bam:
                logger.info('  Will not search Structural Variations (needs paired-end reads)')
                bed_fpath = None
                qconfig.no_sv = True
        if qconfig.create_icarus_html:
            if is_non_empty_file(cov_fpath):
                is_correct_file = check_cov_file(cov_fpath)
                if is_correct_file:
                    logger.info('  Using existing reads coverage file: ' + cov_fpath)
            if is_non_empty_file(physical_cov_fpath):
                logger.info('  Using existing physical coverage file: ' + physical_cov_fpath)
        else:
            logger.info('  Will not calculate coverage (--fast or --no-html, or --no-icarus, or --space-efficient is specified)')
            cov_fpath = None
            physical_cov_fpath = None
        # Nothing has to be produced for the reference when every requested output already exists.
        if (is_non_empty_file(bed_fpath) or qconfig.no_sv) and \
                (not qconfig.create_icarus_html or (is_non_empty_file(cov_fpath) and is_non_empty_file(physical_cov_fpath))):
            required_files = []

    # One job per assembly plus one for the reference; threads are split evenly between jobs.
    n_jobs = min(qconfig.max_threads, len(contigs_fpaths) + 1)
    max_threads_per_job = max(1, qconfig.max_threads // n_jobs)
    sam_fpaths = qconfig.sam_fpaths or [None] * len(contigs_fpaths)
    bam_fpaths = qconfig.bam_fpaths or [None] * len(contigs_fpaths)
    parallel_align_args = [(contigs_fpath, output_dir, temp_output_dir, log_path, err_fpath, max_threads_per_job,
                            sam_fpaths[index], bam_fpaths[index], index) for index, contigs_fpath in enumerate(contigs_fpaths)]
    if main_ref_fpath:
        # The reference job is appended last, so its results are the last items of the returned lists.
        parallel_align_args.append((main_ref_fpath, output_dir, temp_output_dir, log_path, err_fpath,
                                    max_threads_per_job, qconfig.reference_sam, qconfig.reference_bam, None, required_files, True))
    correct_chr_names, sam_fpaths, bam_fpaths = run_parallel(align_single_file, parallel_align_args, n_jobs)
    qconfig.sam_fpaths = sam_fpaths[:len(contigs_fpaths)]
    qconfig.bam_fpaths = bam_fpaths[:len(contigs_fpaths)]
    add_statistics_to_report(output_dir, contigs_fpaths, main_ref_fpath)
    save_reads(output_dir)
    if not main_ref_fpath:
        return None, None, None

    correct_chr_names = correct_chr_names[-1]
    sam_fpath, bam_fpath = sam_fpaths[-1], bam_fpaths[-1]
    qconfig.reference_sam = sam_fpath
    qconfig.reference_bam = bam_fpath
    if not required_files:
        return bed_fpath, cov_fpath, physical_cov_fpath
    if not all([sam_fpath, bam_fpath]):
        logger.info('  Failed searching structural variations.')
        return None, None, None

    sam_sorted_fpath = get_safe_fpath(temp_output_dir, add_suffix(sam_fpath, 'sorted'))
    bam_mapped_fpath = get_safe_fpath(temp_output_dir, add_suffix(bam_fpath, 'mapped'))
    bam_sorted_fpath = get_safe_fpath(temp_output_dir, add_suffix(bam_mapped_fpath, 'sorted'))

    if is_non_empty_file(sam_sorted_fpath):
        logger.info('  Using existing sorted SAM-file: ' + sam_sorted_fpath)
    else:
        if not is_non_empty_file(bam_sorted_fpath):
            sambamba_view(bam_fpath, bam_mapped_fpath, qconfig.max_threads, err_fpath, logger,  filter_rule='not unmapped')
            sort_bam(bam_mapped_fpath, bam_sorted_fpath, err_fpath, logger)
        sambamba_view(bam_sorted_fpath, sam_sorted_fpath, qconfig.max_threads, err_fpath, logger)
    if qconfig.create_icarus_html and (not is_non_empty_file(cov_fpath) or not is_non_empty_file(physical_cov_fpath)):
        cov_fpath, physical_cov_fpath = get_coverage(temp_output_dir, main_ref_fpath, ref_name, bam_fpath, bam_sorted_fpath,
                                                     log_path, err_fpath, correct_chr_names, cov_fpath, physical_cov_fpath)
    if not is_non_empty_file(bed_fpath) and not qconfig.no_sv:
        if meta_ref_fpaths:
            logger.info('  Splitting SAM-file by references...')
        # Collect the SAM header lines and the length of every sequence declared in @SQ records.
        headers = []
        seq_lengths = {}
        with open(sam_fpath) as sam_file:
            for line in sam_file:
                if not line.startswith('@'):
                    break
                if line.startswith('@SQ') and 'SN:' in line and 'LN:' in line:
                    seq_name = line.split('\tSN:')[1].split('\t')[0]
                    seq_length = int(line.split('\tLN:')[1].split('\t')[0])
                    seq_lengths[seq_name] = seq_length
                headers.append(line.strip())
        need_ref_splitting = False
        ref_files = {}
        if meta_ref_fpaths:
            global ref_sam_fpaths
            for cur_ref_fpath in meta_ref_fpaths:
                cur_ref_name = qutils.name_from_fpath(cur_ref_fpath)
                ref_sam_fpath = join(temp_output_dir, cur_ref_name + '.sam')
                ref_sam_fpaths[cur_ref_fpath] = ref_sam_fpath
                if is_non_empty_file(ref_sam_fpath):
                    logger.info('    Using existing split SAM-file for %s: %s' % (cur_ref_name, ref_sam_fpath))
                    ref_files[cur_ref_name] = None
                else:
                    # The handle is left open on purpose: it is handed to search_trivial_deletions
                    # through ref_files. NOTE(review): presumably closed there - confirm.
                    ref_sam_file = open(ref_sam_fpath, 'w')
                    # Write the first header line (unless it is an @SQ record), the @SQ records
                    # of this reference only, and the last header line.
                    if not headers[0].startswith('@SQ'):
                        ref_sam_file.write(headers[0] + '\n')
                    for h in (h for h in headers if h.startswith('@SQ') and 'SN:' in h):
                        seq_name = h.split('\tSN:')[1].split('\t')[0]
                        if seq_name in ref_labels and ref_labels[seq_name] == cur_ref_name:
                            ref_sam_file.write(h + '\n')
                    ref_sam_file.write(headers[-1] + '\n')
                    ref_files[cur_ref_name] = ref_sam_file
                    need_ref_splitting = True

        trivial_deletions_fpath = \
            search_trivial_deletions(temp_output_dir, sam_sorted_fpath, ref_files, ref_labels, seq_lengths, need_ref_splitting)
        if get_gridss_fpath() and isfile(get_gridss_fpath()):
            try:
                gridss_sv_fpath = search_sv_with_gridss(main_ref_fpath, bam_mapped_fpath, meta_ref_fpaths, temp_output_dir, err_fpath)
                qutils.cat_files([gridss_sv_fpath, trivial_deletions_fpath], bed_fpath)
            except Exception:
                # GRIDSS is best-effort, so a failure must not abort the run. Only Exception is
                # caught so that KeyboardInterrupt / SystemExit still propagate.
                logger.info('  Failed searching structural variations with GRIDSS.')
        # Fall back to the trivial deletions alone when no BED file was produced above.
        if isfile(trivial_deletions_fpath) and not is_non_empty_file(bed_fpath):
            shutil.copy(trivial_deletions_fpath, bed_fpath)

    if not qconfig.no_sv:
        if is_non_empty_file(bed_fpath):
            logger.main_info('  Structural variations are in ' + bed_fpath)
        else:
            if isfile(bed_fpath):
                logger.main_info('  No structural variations were found.')
            else:
                logger.main_info('  Failed searching structural variations.')
            bed_fpath = None
    if is_non_empty_file(cov_fpath):
        logger.main_info('  Coverage distribution along the reference genome is in ' + cov_fpath)
    else:
        # NOTE(review): this reports a failure only when coverage was NOT requested; the
        # condition looks inverted, but it is kept as is - confirm the intent.
        if not qconfig.create_icarus_html:
            logger.main_info('  Failed to calculate coverage distribution')
        cov_fpath = None
    return bed_fpath, cov_fpath, physical_cov_fpath
Пример #41
0
def save_result(result, report, fname, ref_fpath, genome_size):
    """Copy the alignment statistics from ``result`` into ``report`` and return it.

    Fills the main misassembly, unaligned, ambiguity, mismatch and indel
    fields, then the detailed misassemblies report (split into scaffold and
    contig parts) and the unaligned report. For a combined reference a
    per-reference sub-report (``reporting.get(fname, ref_name=...)``) is
    filled as well; otherwise, when ``intergenomic_misassemblies_by_asm`` is
    non-empty, the interspecies fields are taken from it.
    """
    region_misassemblies = result['region_misassemblies']
    misassemblies_by_ref = result['misassemblies_by_ref']
    misassembled_contigs = result['misassembled_contigs']
    misassembled_bases = result['misassembled_bases']
    misassembly_internal_overlap = result['misassembly_internal_overlap']
    unaligned = result['unaligned']
    partially_unaligned = result['partially_unaligned']
    partially_unaligned_bases = result['partially_unaligned_bases']
    fully_unaligned_bases = result['fully_unaligned_bases']
    ambiguous_contigs = result['ambiguous_contigs']
    ambiguous_contigs_extra_bases = result['ambiguous_contigs_extra_bases']
    SNPs = result['SNPs']
    indels_list = result['indels_list']
    aligned_ref_bases = result['aligned_ref_bases']
    aligned_assembly_bases = result['aligned_assembly_bases']
    half_unaligned_with_misassembly = result['half_unaligned_with_misassembly']

    fields = reporting.Fields
    count = region_misassemblies.count

    # Counts that are reported several times below.
    n_local = count(Misassembly.LOCAL)
    n_relocation = count(Misassembly.RELOCATION)
    n_inversion = count(Misassembly.INVERSION)
    n_translocation = count(Misassembly.TRANSLOCATION)
    n_interspecies = count(Misassembly.INTERSPECTRANSLOCATION)
    n_extensive = n_relocation + n_inversion + n_translocation + n_interspecies
    n_misassembled_contigs = len(misassembled_contigs)

    report.add_field(fields.MISLOCAL, n_local)
    report.add_field(fields.MISASSEMBL, n_extensive)
    report.add_field(fields.MISCONTIGS, n_misassembled_contigs)
    report.add_field(fields.MISCONTIGSBASES, misassembled_bases)
    report.add_field(fields.MISINTERNALOVERLAP, misassembly_internal_overlap)
    if qconfig.bed:
        report.add_field(fields.STRUCT_VARIATIONS, count(Misassembly.MATCHED_SV))
    if qconfig.large_genome:
        report.add_field(fields.POTENTIAL_MGE, count(Misassembly.POTENTIAL_MGE))
    report.add_field(fields.UNALIGNED, '%d + %d part' % (unaligned, partially_unaligned))
    report.add_field(fields.UNALIGNEDBASES, fully_unaligned_bases + partially_unaligned_bases)
    report.add_field(fields.AMBIGUOUS, ambiguous_contigs)
    report.add_field(fields.AMBIGUOUSEXTRABASES, ambiguous_contigs_extra_bases)
    report.add_field(fields.MISMATCHES, SNPs)
    # different types of indels:
    if indels_list is not None:
        threshold = qconfig.SHORT_INDEL_THRESHOLD
        report.add_field(fields.INDELS, len(indels_list))
        report.add_field(fields.INDELSBASES, sum(indels_list))
        report.add_field(fields.MIS_SHORT_INDELS, sum(1 for size in indels_list if size <= threshold))
        report.add_field(fields.MIS_LONG_INDELS, sum(1 for size in indels_list if size > threshold))

    if aligned_ref_bases:
        genome_fraction = aligned_ref_bases * 100.0 / genome_size
        duplicated_bases = aligned_assembly_bases + misassembly_internal_overlap + ambiguous_contigs_extra_bases
        duplication_ratio = float(duplicated_bases) / aligned_ref_bases
        report.add_field(fields.MAPPEDGENOME, '%.3f' % genome_fraction)
        report.add_field(fields.DUPLICATION_RATIO, '%.3f' % duplication_ratio)
        # error rates are expressed per 100 kbp of aligned assembly bases
        subs_error = float(SNPs) * 100000.0 / float(aligned_assembly_bases)
        report.add_field(fields.SUBSERROR, "%.2f" % subs_error)
        indels_error = float(report.get_field(fields.INDELS)) * 100000.0 / float(aligned_assembly_bases)
        report.add_field(fields.INDELSERROR, "%.2f" % indels_error)

    # for misassemblies report:
    report.add_field(fields.MIS_ALL_EXTENSIVE, n_extensive)
    report.add_field(fields.MIS_RELOCATION, n_relocation)
    report.add_field(fields.MIS_TRANSLOCATION, n_translocation)
    report.add_field(fields.MIS_INVERTION, n_inversion)
    report.add_field(fields.MIS_EXTENSIVE_CONTIGS, n_misassembled_contigs)
    report.add_field(fields.MIS_EXTENSIVE_BASES, misassembled_bases)
    report.add_field(fields.MIS_LOCAL, n_local)
    # special case for separating contig and scaffold misassemblies
    n_scf_relocation = count(Misassembly.SCF_RELOCATION)
    n_scf_inversion = count(Misassembly.SCF_INVERSION)
    n_scf_translocation = count(Misassembly.SCF_TRANSLOCATION)
    n_scf_interspecies = count(Misassembly.SCF_INTERSPECTRANSLOCATION)
    report.add_field(fields.SCF_MIS_ALL_EXTENSIVE,
                     n_scf_relocation + n_scf_inversion + n_scf_translocation + n_scf_interspecies)
    report.add_field(fields.SCF_MIS_RELOCATION, n_scf_relocation)
    report.add_field(fields.SCF_MIS_TRANSLOCATION, n_scf_translocation)
    report.add_field(fields.SCF_MIS_INVERTION, n_scf_inversion)
    report.add_field(fields.CTG_MIS_ALL_EXTENSIVE,
                     report.get_field(fields.MIS_ALL_EXTENSIVE) - report.get_field(fields.SCF_MIS_ALL_EXTENSIVE))
    report.add_field(fields.CTG_MIS_RELOCATION, n_relocation - n_scf_relocation)
    report.add_field(fields.CTG_MIS_TRANSLOCATION, n_translocation - n_scf_translocation)
    report.add_field(fields.CTG_MIS_INVERTION, n_inversion - n_scf_inversion)

    if qconfig.is_combined_ref:
        report.add_field(fields.MIS_ISTRANSLOCATIONS, n_interspecies)
        report.add_field(fields.SCF_MIS_ISTRANSLOCATIONS, n_scf_interspecies)
        report.add_field(fields.CTG_MIS_ISTRANSLOCATIONS, n_interspecies - n_scf_interspecies)
        report.add_field(fields.CONTIGS_WITH_ISTRANSLOCATIONS, count(Misassembly.POTENTIALLY_MIS_CONTIGS))
        report.add_field(fields.POSSIBLE_MISASSEMBLIES, count(Misassembly.POSSIBLE_MISASSEMBLIES))
        # fields of the per-reference sub-report that are plain counts of one misassembly type
        per_ref_counts = (
            (fields.MIS_RELOCATION, Misassembly.RELOCATION),
            (fields.MIS_TRANSLOCATION, Misassembly.TRANSLOCATION),
            (fields.MIS_INVERTION, Misassembly.INVERSION),
            (fields.MIS_ISTRANSLOCATIONS, Misassembly.INTERSPECTRANSLOCATION),
            (fields.MIS_LOCAL, Misassembly.LOCAL),
            (fields.POSSIBLE_MISASSEMBLIES, Misassembly.POSSIBLE_MISASSEMBLIES),
            (fields.CONTIGS_WITH_ISTRANSLOCATIONS, Misassembly.POTENTIALLY_MIS_CONTIGS),
        )
        for ref_name in sorted(set(ref_labels_by_chromosomes.values())):
            subreport = reporting.get(fname, ref_name=ref_name)
            ref_count = misassemblies_by_ref[ref_name].count
            subreport.add_field(fields.MIS_ALL_EXTENSIVE,
                                ref_count(Misassembly.RELOCATION) +
                                ref_count(Misassembly.INVERSION) +
                                ref_count(Misassembly.TRANSLOCATION) +
                                ref_count(Misassembly.INTERSPECTRANSLOCATION))
            for field, kind in per_ref_counts:
                subreport.add_field(field, ref_count(kind))
            if fname not in qconfig.dict_of_broken_scaffolds:
                subreport.add_field(fields.MIS_SCAFFOLDS_GAP, ref_count(Misassembly.SCAFFOLD_GAP))
                subreport.add_field(fields.MIS_LOCAL_SCAFFOLDS_GAP, ref_count(Misassembly.LOCAL_SCAFFOLD_GAP))
            if qconfig.check_for_fragmented_ref:
                subreport.add_field(fields.MIS_FRAGMENTED, ref_count(Misassembly.FRAGMENTED))
    elif intergenomic_misassemblies_by_asm:
        label = qutils.label_from_fpath(fname)
        ref_name = qutils.name_from_fpath(ref_fpath)
        ref_count = intergenomic_misassemblies_by_asm[label][ref_name].count
        report.add_field(fields.MIS_ISTRANSLOCATIONS, ref_count(Misassembly.INTERSPECTRANSLOCATION))
        report.add_field(fields.POSSIBLE_MISASSEMBLIES, ref_count(Misassembly.POSSIBLE_MISASSEMBLIES))
        report.add_field(fields.CONTIGS_WITH_ISTRANSLOCATIONS, ref_count(Misassembly.POTENTIALLY_MIS_CONTIGS))
    if fname not in qconfig.dict_of_broken_scaffolds:
        report.add_field(fields.MIS_SCAFFOLDS_GAP, count(Misassembly.SCAFFOLD_GAP))
        report.add_field(fields.MIS_LOCAL_SCAFFOLDS_GAP, count(Misassembly.LOCAL_SCAFFOLD_GAP))
    if qconfig.check_for_fragmented_ref:
        report.add_field(fields.MIS_FRAGMENTED, count(Misassembly.FRAGMENTED))
    # for unaligned report:
    report.add_field(fields.UNALIGNED_FULL_CNTGS, unaligned)
    report.add_field(fields.UNALIGNED_FULL_LENGTH, fully_unaligned_bases)
    report.add_field(fields.UNALIGNED_PART_CNTGS, partially_unaligned)
    report.add_field(fields.UNALIGNED_PART_LENGTH, partially_unaligned_bases)
    report.add_field(fields.UNALIGNED_MISASSEMBLED_CTGS, half_unaligned_with_misassembly)
    return report
Пример #42
0
def correct_meta_references(ref_fpaths, corrected_dirpath, downloaded_refs=False):
    """Copy every reference into ``corrected_dirpath`` and build the combined reference.

    Each sequence of each reference is renamed (prefixed with the name of the
    corrected reference file), appended to a per-reference FASTA file and,
    once the whole reference has been processed, appended to the combined
    reference FASTA file.

    Args:
        ref_fpaths: list of paths to the reference FASTA files. References
            that are skipped (only possible when ``downloaded_refs`` is true)
            are removed from this list in place.
        corrected_dirpath: directory the corrected files are written to.
        downloaded_refs: when true, a reference that is empty or fails the
            sequence check is skipped with a warning; otherwise such a
            reference is reported through ``logger.error`` with
            ``exit_with_code=1``.

    Returns:
        A 4-tuple ``(corrected_ref_fpaths, combined_ref_fpath,
        chromosomes_by_refs, ref_fpaths)`` where ``chromosomes_by_refs`` maps
        a reference name to a list of ``(corrected sequence name, length)``.
    """
    corrected_ref_fpaths = []

    combined_ref_fpath = os.path.join(corrected_dirpath, qconfig.combined_ref_name)

    chromosomes_by_refs = {}

    def _proceed_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references, ref_fpath):
        # Appends one sequence to the corrected file of its reference.
        # `total_references` is the 1-based index of the sequence inside the
        # current reference: the corrected file is created for the first
        # sequence and reused for the following ones.
        # Returns (corrected sequence name, corrected file path), or
        # (None, None) when the sequence did not pass the check.
        seq_fname = ref_name
        seq_fname += ref_fasta_ext

        if total_references > 1:
            corr_seq_fpath = corrected_ref_fpaths[-1]
        else:
            corr_seq_fpath = qutils.unique_corrected_fpath(os.path.join(corrected_dirpath, seq_fname))
            corrected_ref_fpaths.append(corr_seq_fpath)
        corr_seq_name = qutils.name_from_fpath(corr_seq_fpath) + '_' + seq_name
        if not qconfig.no_check:
            # The result of correct_seq is used only as a validity check here;
            # the original `seq` is what gets written below.
            corr_seq = correct_seq(seq, ref_fpath)
            if not corr_seq:
                return None, None

        fastaparser.write_fasta(corr_seq_fpath, [(corr_seq_name, seq)], 'a')

        contigs_analyzer.ref_labels_by_chromosomes[corr_seq_name] = qutils.name_from_fpath(corr_seq_fpath)
        chromosomes_by_refs[ref_name].append((corr_seq_name, len(seq)))

        return corr_seq_name, corr_seq_fpath

    excluded_ref_fpaths = []
    ref_names = qutils.process_labels(ref_fpaths)
    for ref_fpath, ref_name in zip(ref_fpaths, ref_names):
        total_references = 0
        ref_fname = os.path.basename(ref_fpath)
        _, ref_fasta_ext = qutils.splitext_for_fasta_file(ref_fname)

        chromosomes_by_refs[ref_name] = []
        used_seq_names = defaultdict(int)
        # Remember how many corrected files existed before this reference so
        # that a failed reference removes only its own entry (if any).
        n_corrected_before = len(corrected_ref_fpaths)

        corr_seq_fpath = None
        for seq_name, seq in fastaparser.read_fasta(ref_fpath):
            total_references += 1
            seq_name = correct_name(seq_name, qutils.MAX_CONTIG_NAME - len(ref_name) - 1)
            uniq_seq_name = get_uniq_name(seq_name, used_seq_names)
            used_seq_names[seq_name] += 1
            corr_seq_name, corr_seq_fpath = _proceed_seq(uniq_seq_name, seq, ref_name, ref_fasta_ext, total_references, ref_fpath)
            if not corr_seq_name:
                # One bad sequence invalidates the whole reference.
                break
        if corr_seq_fpath:
            logger.main_info('  ' + ref_fpath + ' ==> ' + qutils.name_from_fpath(corr_seq_fpath) + '')
            fastaparser.write_fasta(combined_ref_fpath, fastaparser.read_fasta(corr_seq_fpath), 'a')
        elif downloaded_refs:
            logger.warning('Skipping ' + ref_fpath + ' because it'
                           ' is empty or contains incorrect sequences (header-only or with non-ACGTN characters)!')
            # cleaning
            for corr_seq_name, _ in chromosomes_by_refs[ref_name]:
                del contigs_analyzer.ref_labels_by_chromosomes[corr_seq_name]
            del chromosomes_by_refs[ref_name]
            # An empty reference never registered a corrected file, so an
            # unconditional pop() would drop the previous reference's file
            # (or fail on an empty list). Drop only what this reference added.
            del corrected_ref_fpaths[n_corrected_before:]
            excluded_ref_fpaths.append(ref_fpath)
        else:
            logger.error('Reference file ' + ref_fpath +
                         ' is empty or contains incorrect sequences (header-only or with non-ACGTN characters)!',
                         exit_with_code=1)
    for excluded in excluded_ref_fpaths:
        ref_fpaths.remove(excluded)

    if len(chromosomes_by_refs) > 0:
        logger.main_info('  All references were combined in ' + qconfig.combined_ref_name)
    else:
        logger.warning('All references were skipped!')

    return corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_fpaths
Пример #43
0
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
       aligned_lengths_lists, aligned_stats_dirpath):
    """Compute NAx/NGAx/LAx/LGAx statistics for aligned assemblies and plot them.

    For every assembly the aligned-block lengths are turned into NA50/NA75 and
    LA50/LA75 (relative to the assembly length) and, unless
    ``qconfig.is_combined_ref`` is set, NGA50/NGA75 and LGA50/LGA75 (relative
    to the reference length). The values are logged and stored in the
    assembly's report. Assembly lengths are then saved for the JSON/HTML
    outputs and the cumulative and Nx plots are produced.

    Args:
        ref_fpath: path to the reference FASTA file.
        aligned_contigs_fpaths: paths to the assemblies that have alignments.
        output_dirpath: main output directory.
        json_output_dirpath: directory for JSON output, or a false value.
        aligned_lengths_lists: one list of aligned-block lengths per assembly,
            in the same order as ``aligned_contigs_fpaths``.
        aligned_stats_dirpath: directory for the plots (created if missing).

    Returns:
        A dict with the key ``'header'`` and one key per assembly name, each
        mapped to an empty list (nothing else is stored in it here).
    """
    if not os.path.isdir(aligned_stats_dirpath):
        os.mkdir(aligned_stats_dirpath)

    report_dict = {'header': []}
    for fpath in aligned_contigs_fpaths:
        report_dict[qutils.name_from_fpath(fpath)] = []

    logger.print_timestamp()
    logger.main_info('Running NA-NGA calculation...')

    reference_length = sum(fastaparser.get_chr_lengths_from_fastafile(ref_fpath).values())
    assembly_lengths = [sum(fastaparser.get_chr_lengths_from_fastafile(fpath).values())
                        for fpath in aligned_contigs_fpaths]

    per_assembly = zip(aligned_contigs_fpaths, aligned_lengths_lists, assembly_lengths)
    for idx, (fpath, aligned_lens, total_len) in enumerate(per_assembly):
        # statistics relative to the assembly length
        na50 = N50.NG50(aligned_lens, total_len)
        na75 = N50.NG50(aligned_lens, total_len, 75)
        la50 = N50.LG50(aligned_lens, total_len)
        la75 = N50.LG50(aligned_lens, total_len, 75)

        # statistics relative to the reference length (skipped for the combined reference)
        single_reference = not qconfig.is_combined_ref
        if single_reference:
            nga50 = N50.NG50(aligned_lens, reference_length)
            nga75 = N50.NG50(aligned_lens, reference_length, 75)
            lga50 = N50.LG50(aligned_lens, reference_length)
            lga75 = N50.LG50(aligned_lens, reference_length, 75)

        largest_alignment = max(aligned_lens)

        message = '  ' + qutils.index_to_str(idx) + qutils.label_from_fpath(fpath)
        message += ', Largest alignment = ' + str(largest_alignment)
        message += ', NA50 = ' + str(na50)
        if single_reference and nga50:
            message += ', NGA50 = ' + str(nga50)
        message += ', LA50 = ' + str(la50)
        if single_reference and lga50:
            message += ', LGA50 = ' + str(lga50)
        logger.info(message)

        field_values = [
            (reporting.Fields.LARGALIGN, largest_alignment),
            (reporting.Fields.TOTAL_ALIGNED_LEN, sum(aligned_lens)),
            (reporting.Fields.NA50, na50),
            (reporting.Fields.NA75, na75),
            (reporting.Fields.LA50, la50),
            (reporting.Fields.LA75, la75),
        ]
        if single_reference:
            field_values += [
                (reporting.Fields.NGA50, nga50),
                (reporting.Fields.NGA75, nga75),
                (reporting.Fields.LGA50, lga50),
                (reporting.Fields.LGA75, lga75),
            ]
        report = reporting.get(fpath)
        for field, value in field_values:
            report.add_field(field, value)

    # the largest number of aligned blocks among all assemblies
    num_contigs = max([len(aligned_lens) for aligned_lens in aligned_lengths_lists])
    too_many_points = num_contigs > qconfig.max_points

    if json_output_dirpath:
        from quast_libs.html_saver import json_saver
        json_saver.save_assembly_lengths(json_output_dirpath, aligned_contigs_fpaths, assembly_lengths)

    # saving to html
    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        html_saver.save_assembly_lengths(output_dirpath, aligned_contigs_fpaths, assembly_lengths)

    if qconfig.draw_plots:
        # Drawing cumulative plot (aligned contigs)...
        cumulative_plot_fpath = os.path.join(aligned_stats_dirpath, 'cumulative_plot')
        plotter.cumulative_plot(ref_fpath, aligned_contigs_fpaths, aligned_lengths_lists,
                                cumulative_plot_fpath,
                                'Cumulative length (aligned contigs)')

    # NAx and NGAx plots: these calls are made regardless of qconfig.draw_plots
    plotter.Nx_plot(output_dirpath, too_many_points, aligned_contigs_fpaths, aligned_lengths_lists,
                    aligned_stats_dirpath + '/NAx_plot', 'NAx',
                    assembly_lengths, json_output_dir=json_output_dirpath)
    if not qconfig.is_combined_ref:
        reference_lengths = [reference_length] * len(aligned_contigs_fpaths)
        plotter.Nx_plot(output_dirpath, too_many_points, aligned_contigs_fpaths, aligned_lengths_lists,
                        aligned_stats_dirpath + '/NGAx_plot', 'NGAx',
                        reference_lengths, json_output_dir=json_output_dirpath)

    logger.main_info('Done.')
    return report_dict
Пример #44
0
def main(args):
    """Run the whole MetaQUAST pipeline for the given command-line arguments.

    Steps, in order: parse options, prepare the references (provided by the
    user or found/downloaded by ``search_references_meta``), run quast for
    the combined reference, optionally re-run it after filtering downloaded
    references, partition contigs by reference, run quast once per reference
    and once for the contigs not aligned anywhere, and finally build the
    summary reports.

    Args:
        args: list of command-line arguments (without the program name).

    Returns:
        ``4`` when none of the assembly files contains correct contigs,
        otherwise the value returned by ``logger.finish_up``.

    The process exits with code 0 (``SystemExit``) when ``args`` is empty
    (usage is printed) and after the regular-QUAST fallback that is used when
    no references are available.
    """
    check_dirpath(qconfig.QUAST_HOME, 'You are trying to run it from ' + str(qconfig.QUAST_HOME) + '.\n' +
                  'Please, put QUAST in a different directory, then try again.\n', exit_code=3)

    if not args:
        qconfig.usage(meta=True)
        sys.exit(0)

    metaquast_path = [os.path.realpath(__file__)]
    quast_py_args, contigs_fpaths = parse_options(logger, metaquast_path + args, is_metaquast=True)
    output_dirpath, ref_fpaths, labels = qconfig.output_dirpath, qconfig.reference, qconfig.labels
    html_report = qconfig.html_report
    test_mode = qconfig.test

    # Directories
    output_dirpath, _, _ = qutils.set_up_output_dir(
        output_dirpath, None, not output_dirpath,
        save_json=False)

    corrected_dirpath = os.path.join(output_dirpath, qconfig.corrected_dirname)

    qconfig.set_max_threads(logger)
    qutils.logger = logger

    ########################################################################

    from quast_libs import reporting
    # Reload `reporting` with whichever reload function this interpreter
    # provides. `imp` was removed in Python 3.12 and the builtin `reload`
    # exists only in Python 2, so importlib is tried first. Only ImportError
    # is handled: a failure of the reload itself is not hidden.
    try:
        from importlib import reload as _reload  # Python 3.4+
    except ImportError:
        try:
            from imp import reload as _reload
        except ImportError:
            _reload = reload  # Python 2 builtin
    _reload(reporting)
    from quast_libs import plotter

    # Start from an empty directory for corrected files.
    if os.path.isdir(corrected_dirpath):
        shutil.rmtree(corrected_dirpath)
    os.mkdir(corrected_dirpath)

    # PROCESSING REFERENCES
    if ref_fpaths:
        logger.main_info()
        logger.main_info('Reference(s):')

        # NOTE: the fourth returned value is the (possibly filtered) list of
        # reference paths; it is bound to `ref_names` here and overwritten
        # before the per-reference runs below.
        corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_names =\
            correct_meta_references(ref_fpaths, corrected_dirpath)

    # PROCESSING CONTIGS
    logger.main_info()
    logger.main_info('Contigs:')
    qconfig.no_check_meta = True
    assemblies, labels = correct_assemblies(contigs_fpaths, output_dirpath, labels)
    if not assemblies:
        logger.error("None of the assembly files contains correct contigs. "
                     "Please, provide different files or decrease --min-contig threshold.")
        return 4

    # Running QUAST(s)
    quast_py_args += ['--meta']
    downloaded_refs = False

    # SEARCHING REFERENCES
    if not ref_fpaths:
        logger.main_info()
        if qconfig.max_references == 0:
            logger.notice("Maximum number of references (--max-ref-number) is set to 0, search in SILVA 16S rRNA database is disabled")
        else:
            if qconfig.references_txt:
                logger.main_info("List of references was provided, starting to download reference genomes from NCBI...")
            else:
                logger.main_info("No references are provided, starting to search for reference genomes in SILVA 16S rRNA database "
                        "and to download them from NCBI...")
            downloaded_dirpath = os.path.join(output_dirpath, qconfig.downloaded_dirname)
            if not os.path.isdir(downloaded_dirpath):
                os.mkdir(downloaded_dirpath)
            corrected_dirpath = os.path.join(output_dirpath, qconfig.corrected_dirname)
            ref_fpaths = search_references_meta.do(assemblies, labels, downloaded_dirpath, corrected_dirpath, qconfig.references_txt)
            if ref_fpaths:
                search_references_meta.is_quast_first_run = True
                if not qconfig.references_txt:
                    downloaded_refs = True
                logger.main_info()
                logger.main_info('Downloaded reference(s):')
                # NOTE(review): `downloaded_refs` is not forwarded here, so an
                # invalid downloaded reference is treated as an error rather
                # than skipped - confirm this is intended.
                corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_names =\
                    correct_meta_references(ref_fpaths, corrected_dirpath)
            elif test_mode and not ref_fpaths:
                logger.error('Failed to download or setup SILVA 16S rRNA database for working without '
                             'references on metagenome datasets!', to_stderr=True, exit_with_code=4)

    if not ref_fpaths:
        # No references, running regular quast with MetaGenemark gene finder
        logger.main_info()
        logger.notice('No references are provided, starting regular QUAST with MetaGeneMark gene finder')
        _start_quast_main(quast_py_args, assemblies=assemblies, output_dirpath=output_dirpath, run_regular_quast=True)
        # sys.exit instead of the `exit` helper, which is injected by the
        # `site` module and is not guaranteed to exist.
        sys.exit(0)

    # Running combined reference
    combined_output_dirpath = os.path.join(output_dirpath, qconfig.combined_output_name)

    reads_fpaths = []
    if qconfig.forward_reads:
        reads_fpaths.append(qconfig.forward_reads)
    if qconfig.reverse_reads:
        reads_fpaths.append(qconfig.reverse_reads)
    cov_fpath = qconfig.cov_fpath
    physical_cov_fpath = qconfig.phys_cov_fpath
    if (reads_fpaths or qconfig.sam or qconfig.bam) and ref_fpaths:
        bed_fpath, cov_fpath, physical_cov_fpath = reads_analyzer.do(combined_ref_fpath, contigs_fpaths, reads_fpaths, corrected_ref_fpaths,
                                                   os.path.join(combined_output_dirpath, qconfig.variation_dirname),
                                                   external_logger=logger, sam_fpath=qconfig.sam, bam_fpath=qconfig.bam, bed_fpath=qconfig.bed)
        qconfig.bed = bed_fpath

    # Forward the read-related files to the quast.py runs.
    if qconfig.bed:
        quast_py_args += ['--sv-bed']
        quast_py_args += [qconfig.bed]
    if cov_fpath:
        quast_py_args += ['--cov']
        quast_py_args += [cov_fpath]
    if physical_cov_fpath:
        quast_py_args += ['--phys-cov']
        quast_py_args += [physical_cov_fpath]
    if qconfig.sam:
        quast_py_args += ['--sam']
        quast_py_args += [qconfig.sam]
    if qconfig.bam:
        quast_py_args += ['--bam']
        quast_py_args += [qconfig.bam]

    quast_py_args += ['--combined-ref']
    if qconfig.draw_plots or qconfig.html_report:
        if plotter.dict_color_and_ls:
            # Pass the colors and line styles stored in the plotter for each assembly.
            colors_and_ls = [plotter.dict_color_and_ls[asm.label] for asm in assemblies]
            quast_py_args += ['--colors']
            quast_py_args += [','.join([style[0] for style in colors_and_ls])]
            quast_py_args += ['--ls']
            quast_py_args += [','.join([style[1] for style in colors_and_ls])]
    run_name = 'for the combined reference'
    logger.main_info()
    logger.main_info('Starting quast.py ' + run_name + '...')
    total_num_notices = 0
    total_num_warnings = 0
    total_num_nf_errors = 0
    total_num_notifications = (total_num_notices, total_num_warnings, total_num_nf_errors)
    # json_texts collects one JSON text per quast.py run; None means that the
    # HTML report is disabled and nothing is collected.
    if qconfig.html_report:
        from quast_libs.html_saver import json_saver
        json_texts = []
    else:
        json_texts = None
    if qconfig.unique_mapping:
        ambiguity_opts = []
    else:
        ambiguity_opts = ["--ambiguity-usage", 'all']
    return_code, total_num_notifications, assemblies, labels = \
        _start_quast_main(quast_py_args + ambiguity_opts,
        assemblies=assemblies,
        reference_fpath=combined_ref_fpath,
        output_dirpath=combined_output_dirpath,
        num_notifications_tuple=total_num_notifications, is_first_run=True)

    if json_texts is not None:
        json_texts.append(json_saver.json_text)
    search_references_meta.is_quast_first_run = False

    # A missing genome_info.txt is treated as "nothing was aligned to any reference".
    genome_info_dirpath = os.path.join(output_dirpath, qconfig.combined_output_name, 'genome_stats')
    genome_info_fpath = os.path.join(genome_info_dirpath, 'genome_info.txt')
    if not os.path.exists(genome_info_fpath):
        logger.main_info('')
        if not downloaded_refs:
            msg = 'Try to restart MetaQUAST with another references.'
        else:
            msg = 'Try to use option --max-ref-number to change maximum number of references (per each assembly) to download.'
        logger.main_info('Failed aligning the contigs for all the references. ' + msg)
        logger.main_info('')
        cleanup(corrected_dirpath)
        logger.main_info('MetaQUAST finished.')
        return logger.finish_up(numbers=tuple(total_num_notifications), check_test=test_mode)

    if downloaded_refs:
        logger.main_info()
        logger.main_info('Excluding downloaded references with low genome fraction from further analysis..')
        corr_ref_fpaths = get_downloaded_refs_with_alignments(genome_info_fpath, ref_fpaths, chromosomes_by_refs)
        if corr_ref_fpaths and corr_ref_fpaths != ref_fpaths:
            # Some references were filtered out: rebuild the combined
            # reference and repeat the combined run.
            logger.main_info()
            logger.main_info('Filtered reference(s):')
            os.remove(combined_ref_fpath)
            contigs_analyzer.ref_labels_by_chromosomes = {}
            corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_names = \
                correct_meta_references(corr_ref_fpaths, corrected_dirpath)
            run_name = 'for the corrected combined reference'
            logger.main_info()
            logger.main_info('Starting quast.py ' + run_name + '...')
            return_code, total_num_notifications, assemblies, labels = \
                _start_quast_main(quast_py_args + ambiguity_opts,
                assemblies=assemblies,
                reference_fpath=combined_ref_fpath,
                output_dirpath=combined_output_dirpath,
                num_notifications_tuple=total_num_notifications, is_first_run=True)
            if json_texts is not None:
                # Replace the JSON text of the first combined run.
                json_texts = json_texts[:-1]
                json_texts.append(json_saver.json_text)
        elif corr_ref_fpaths == ref_fpaths:
            logger.main_info('All downloaded references have genome fraction more than 10%. Nothing was excluded.')
        else:
            logger.main_info('All downloaded references have low genome fraction. Nothing was excluded for now.')

    if qconfig.calculate_read_support:
        calculate_ave_read_support(combined_output_dirpath, assemblies)

    # Adjust the arguments for the per-reference runs.
    for arg in args:
        if arg in ('-s', "--scaffolds"):
            quast_py_args.remove(arg)
    quast_py_args += ['--no-check-meta']
    qconfig.contig_thresholds = ','.join([str(threshold) for threshold in qconfig.contig_thresholds if threshold > qconfig.min_contig])
    if not qconfig.contig_thresholds:
        qconfig.contig_thresholds = 'None'
    quast_py_args = remove_from_quast_py_args(quast_py_args, '--contig-thresholds', qconfig.contig_thresholds)
    quast_py_args += ['--contig-thresholds']
    quast_py_args += [qconfig.contig_thresholds]
    quast_py_args.remove('--combined-ref')

    logger.main_info()
    logger.main_info('Partitioning contigs into bins aligned to each reference..')

    assemblies_by_reference, not_aligned_assemblies = partition_contigs(
        assemblies, corrected_ref_fpaths, corrected_dirpath,
        os.path.join(combined_output_dirpath, 'contigs_reports', 'alignments_%s.tsv'), labels)

    # Names of the references that got at least one assembly with aligned contigs.
    ref_names = []
    output_dirpath_per_ref = os.path.join(output_dirpath, qconfig.per_ref_dirname)
    for ref_fpath, ref_assemblies in assemblies_by_reference:
        ref_name = qutils.name_from_fpath(ref_fpath)
        logger.main_info('')
        if not ref_assemblies:
            logger.main_info('No contigs were aligned to the reference ' + ref_name + ', skipping..')
        else:
            ref_names.append(ref_name)
            run_name = 'for the contigs aligned to ' + ref_name
            logger.main_info('Starting quast.py ' + run_name)

            return_code, total_num_notifications = _start_quast_main(quast_py_args,
                assemblies=ref_assemblies,
                reference_fpath=ref_fpath,
                output_dirpath=os.path.join(output_dirpath_per_ref, ref_name),
                num_notifications_tuple=total_num_notifications)
            if json_texts is not None:
                json_texts.append(json_saver.json_text)

    # Finally running for the contigs that has not been aligned to any reference
    no_unaligned_contigs = True
    for assembly in not_aligned_assemblies:
        if os.path.isfile(assembly.fpath) and os.stat(assembly.fpath).st_size != 0:
            no_unaligned_contigs = False
            break

    run_name = 'for the contigs not aligned anywhere'
    logger.main_info()
    if no_unaligned_contigs:
        logger.main_info('Skipping quast.py ' + run_name + ' (everything is aligned!)')
    else:
        logger.main_info('Starting quast.py ' + run_name + '...')

        return_code, total_num_notifications = _start_quast_main(quast_py_args,
            assemblies=not_aligned_assemblies,
            output_dirpath=os.path.join(output_dirpath, qconfig.not_aligned_name),
            num_notifications_tuple=total_num_notifications)

        if return_code not in [0, 4]:
            logger.error('Error running quast.py for the contigs not aligned anywhere')
        elif return_code == 4:  # no unaligned contigs, i.e. everything aligned
            no_unaligned_contigs = True
        if not no_unaligned_contigs:
            if json_texts is not None:
                json_texts.append(json_saver.json_text)

    if ref_names:
        logger.print_timestamp()
        logger.main_info("Summarizing results...")

        summary_output_dirpath = os.path.join(output_dirpath, qconfig.meta_summary_dir)
        if not os.path.isdir(summary_output_dirpath):
            os.makedirs(summary_output_dirpath)
        # html_saver is imported only under this condition; the block that
        # uses it below is guarded by the same condition.
        if html_report and json_texts:
            from quast_libs.html_saver import html_saver
            html_summary_report_fpath = html_saver.init_meta_report(output_dirpath)
        else:
            html_summary_report_fpath = None
        from quast_libs import create_meta_summary
        metrics_for_plots = reporting.Fields.main_metrics
        misassembl_metrics = [reporting.Fields.MIS_RELOCATION, reporting.Fields.MIS_TRANSLOCATION, reporting.Fields.MIS_INVERTION,
                           reporting.Fields.MIS_ISTRANSLOCATIONS]
        if no_unaligned_contigs:
            full_ref_names = ref_names
        else:
            full_ref_names = ref_names + [qconfig.not_aligned_name]
        create_meta_summary.do(html_summary_report_fpath, summary_output_dirpath, combined_output_dirpath,
                               output_dirpath_per_ref, metrics_for_plots, misassembl_metrics, full_ref_names)
        if html_report and json_texts:
            html_saver.save_colors(output_dirpath, contigs_fpaths, plotter.dict_color_and_ls, meta=True)
            if qconfig.create_icarus_html:
                icarus_html_fpath = html_saver.create_meta_icarus(output_dirpath, ref_names)
                logger.main_info('  Icarus (contig browser) is saved to %s' % icarus_html_fpath)
            html_saver.create_meta_report(output_dirpath, json_texts)

    cleanup(corrected_dirpath)
    logger.main_info('')
    logger.main_info('MetaQUAST finished.')
    return logger.finish_up(numbers=tuple(total_num_notifications), check_test=test_mode)
Пример #45
0
def run_processing_reads(main_ref_fpath,
                         meta_ref_fpaths,
                         ref_labels,
                         reads_fpaths,
                         output_dirpath,
                         res_path,
                         log_path,
                         err_path,
                         sam_fpath=None,
                         bam_fpath=None,
                         bed_fpath=None):
    ref_name = qutils.name_from_fpath(main_ref_fpath)

    if not sam_fpath and bam_fpath:
        sam_fpath = get_safe_fpath(output_dirpath, bam_fpath[:-4] + '.sam')
    else:
        sam_fpath = sam_fpath or os.path.join(output_dirpath,
                                              ref_name + '.sam')
    bam_fpath = bam_fpath or get_safe_fpath(output_dirpath,
                                            sam_fpath[:-4] + '.bam')
    sam_sorted_fpath = get_safe_fpath(output_dirpath,
                                      add_suffix(sam_fpath, 'sorted'))
    bam_sorted_fpath = get_safe_fpath(output_dirpath,
                                      add_suffix(bam_fpath, 'sorted'))

    bed_fpath = bed_fpath or os.path.join(res_path, ref_name + '.bed')
    cov_fpath = os.path.join(res_path, ref_name + '.cov')
    physical_cov_fpath = os.path.join(res_path, ref_name + '.physical.cov')

    if qconfig.no_sv:
        logger.info(
            '  Will not search Structural Variations (--fast or --no-sv is specified)'
        )
        bed_fpath = None
    elif is_non_empty_file(bed_fpath):
        logger.info('  Using existing BED-file: ' + bed_fpath)
    if qconfig.create_icarus_html:
        if is_non_empty_file(cov_fpath):
            is_correct_file = check_cov_file(cov_fpath)
            if is_correct_file:
                logger.info('  Using existing reads coverage file: ' +
                            cov_fpath)
        if is_non_empty_file(physical_cov_fpath):
            logger.info('  Using existing physical coverage file: ' +
                        physical_cov_fpath)
    else:
        logger.info(
            '  Will not calculate coverage (--fast or --no-html, or --no-icarus, or --space-efficient is specified)'
        )
        cov_fpath = None
        physical_cov_fpath = None
    if (is_non_empty_file(bed_fpath) or qconfig.no_sv) and \
            (not qconfig.create_icarus_html or (is_non_empty_file(cov_fpath) and is_non_empty_file(physical_cov_fpath))):
        return bed_fpath, cov_fpath, physical_cov_fpath

    logger.info('  ' + 'Pre-processing reads...')
    correct_chr_names = None
    if is_non_empty_file(sam_fpath):
        logger.info('  Using existing SAM-file: ' + sam_fpath)
        correct_chr_names = get_correct_names_for_chroms(
            output_dirpath, main_ref_fpath, sam_fpath, err_path, reads_fpaths)
    elif is_non_empty_file(bam_fpath):
        logger.info('  Using existing BAM-file: ' + bam_fpath)
        qutils.call_subprocess([
            sambamba_fpath('sambamba'), 'view', '-t',
            str(qconfig.max_threads), '-h', bam_fpath
        ],
                               stdout=open(sam_fpath, 'w'),
                               stderr=open(err_path, 'a'),
                               logger=logger)
        correct_chr_names = get_correct_names_for_chroms(
            output_dirpath, main_ref_fpath, sam_fpath, err_path, reads_fpaths)
    if not correct_chr_names and reads_fpaths:
        logger.info('  Running BWA...')
        # use absolute paths because we will change workdir
        sam_fpath = os.path.abspath(sam_fpath)
        abs_reads_fpaths = []
        for reads_fpath in reads_fpaths:
            abs_reads_fpaths.append(os.path.abspath(reads_fpath))

        if len(abs_reads_fpaths) != 2:
            logger.error(
                '  You should specify files with forward and reverse reads.')
            logger.info('  Failed searching structural variations.')
            return None, None, None

        if not qconfig.no_check:
            if not paired_reads_names_are_equal(reads_fpaths, logger):
                logger.error(
                    '  Read names are discordant, skipping reads analysis!')
                logger.info('  Failed searching structural variations.')
                return None, None, None

        prev_dir = os.getcwd()
        os.chdir(output_dirpath)
        cmd = [bwa_fpath('bwa'), 'index', '-p', ref_name, main_ref_fpath]
        if os.path.getsize(
                main_ref_fpath
        ) > 2 * 1024**3:  # if reference size bigger than 2GB
            cmd += ['-a', 'bwtsw']
        qutils.call_subprocess(cmd,
                               stdout=open(log_path, 'a'),
                               stderr=open(err_path, 'a'),
                               logger=logger)

        cmd = bwa_fpath('bwa') + ' mem -t ' + str(
            qconfig.max_threads) + ' ' + ref_name + ' ' + abs_reads_fpaths[
                0] + ' ' + abs_reads_fpaths[1]

        qutils.call_subprocess(shlex.split(cmd),
                               stdout=open(sam_fpath, 'w'),
                               stderr=open(err_path, 'a'),
                               logger=logger)
        logger.info('  Done.')
        os.chdir(prev_dir)
        if not os.path.exists(sam_fpath) or os.path.getsize(sam_fpath) == 0:
            logger.error('  Failed running BWA for the reference. See ' +
                         log_path + ' for information.')
            logger.info('  Failed searching structural variations.')
            return None, None, None
    elif not correct_chr_names:
        logger.info('  Failed searching structural variations.')
        return None, None, None
    logger.info('  Sorting SAM-file...')
    if (is_non_empty_file(sam_sorted_fpath)
            and all_read_names_correct(sam_sorted_fpath)
        ) and is_non_empty_file(bam_fpath):
        logger.info('  Using existing sorted SAM-file: ' + sam_sorted_fpath)
    else:
        correct_sam_fpath = os.path.join(output_dirpath, ref_name +
                                         '.sam.correct')  # write in output dir
        clean_read_names(sam_fpath, correct_sam_fpath)
        bam_fpath = os.path.join(output_dirpath, ref_name + '.bam')
        bam_sorted_fpath = add_suffix(bam_fpath, 'sorted')
        qutils.call_subprocess([
            sambamba_fpath('sambamba'), 'view', '-t',
            str(qconfig.max_threads), '-h', '-f', 'bam', '-F', 'not unmapped',
            '-S', correct_sam_fpath
        ],
                               stdout=open(bam_fpath, 'w'),
                               stderr=open(err_path, 'a'),
                               logger=logger)
        qutils.call_subprocess([
            sambamba_fpath('sambamba'), 'sort', '-t',
            str(qconfig.max_threads), '-o', bam_sorted_fpath, bam_fpath
        ],
                               stderr=open(err_path, 'a'),
                               logger=logger)
        qutils.call_subprocess([
            sambamba_fpath('sambamba'), 'view', '-t',
            str(qconfig.max_threads), '-h', bam_sorted_fpath
        ],
                               stdout=open(sam_sorted_fpath, 'w'),
                               stderr=open(err_path, 'a'),
                               logger=logger)

    if qconfig.create_icarus_html and (
            not is_non_empty_file(cov_fpath)
            or not is_non_empty_file(physical_cov_fpath)):
        cov_fpath, physical_cov_fpath = get_coverage(
            output_dirpath, main_ref_fpath, ref_name, bam_fpath,
            bam_sorted_fpath, log_path, err_path, cov_fpath,
            physical_cov_fpath, correct_chr_names)
    if not is_non_empty_file(bed_fpath) and not qconfig.no_sv:
        if meta_ref_fpaths:
            logger.info('  Splitting SAM-file by references...')
        headers = []
        seq_name_length = {}
        with open(sam_fpath) as sam_file:
            for line in sam_file:
                if not line.startswith('@'):
                    break
                if line.startswith('@SQ') and 'SN:' in line and 'LN:' in line:
                    seq_name = line.split('\tSN:')[1].split('\t')[0]
                    seq_length = int(line.split('\tLN:')[1].split('\t')[0])
                    seq_name_length[seq_name] = seq_length
                headers.append(line.strip())
        need_ref_splitting = False
        if meta_ref_fpaths:
            ref_files = {}
            for cur_ref_fpath in meta_ref_fpaths:
                ref = qutils.name_from_fpath(cur_ref_fpath)
                new_ref_sam_fpath = os.path.join(output_dirpath, ref + '.sam')
                if is_non_empty_file(new_ref_sam_fpath):
                    logger.info(
                        '    Using existing split SAM-file for %s: %s' %
                        (ref, new_ref_sam_fpath))
                    ref_files[ref] = None
                else:
                    new_ref_sam_file = open(new_ref_sam_fpath, 'w')
                    if not headers[0].startswith('@SQ'):
                        new_ref_sam_file.write(headers[0] + '\n')
                    chrs = []
                    for h in (h for h in headers
                              if h.startswith('@SQ') and 'SN:' in h):
                        seq_name = h.split('\tSN:')[1].split('\t')[0]
                        if seq_name in ref_labels and ref_labels[
                                seq_name] == ref:
                            new_ref_sam_file.write(h + '\n')
                            chrs.append(seq_name)
                    new_ref_sam_file.write(headers[-1] + '\n')
                    ref_files[ref] = new_ref_sam_file
                    need_ref_splitting = True
        deletions = []
        trivial_deletions_fpath = os.path.join(output_dirpath,
                                               qconfig.trivial_deletions_fname)
        logger.info(
            '  Looking for trivial deletions (long zero-covered fragments)...')
        need_trivial_deletions = True
        if os.path.exists(trivial_deletions_fpath):
            need_trivial_deletions = False
            logger.info('    Using existing file: ' + trivial_deletions_fpath)

        if need_trivial_deletions or need_ref_splitting:
            with open(sam_sorted_fpath) as sam_file:
                cur_deletion = None
                for line in sam_file:
                    mapping = Mapping.parse(line)
                    if mapping:
                        if mapping.ref == '*':
                            continue
                        # common case: continue current deletion (potential) on the same reference
                        if cur_deletion and cur_deletion.ref == mapping.ref:
                            if cur_deletion.next_bad is None:  # previous mapping was in region BEFORE 0-covered fragment
                                # just passed 0-covered fragment
                                if mapping.start - cur_deletion.prev_bad > QuastDeletion.MIN_GAP:
                                    cur_deletion.set_next_bad(mapping)
                                    if mapping.mapq >= Mapping.MIN_MAP_QUALITY:
                                        cur_deletion.set_next_good(mapping)
                                        if cur_deletion.is_valid():
                                            deletions.append(cur_deletion)
                                        cur_deletion = QuastDeletion(
                                            mapping.ref).set_prev_good(mapping)
                                # continue region BEFORE 0-covered fragment
                                elif mapping.mapq >= Mapping.MIN_MAP_QUALITY:
                                    cur_deletion.set_prev_good(mapping)
                                else:
                                    cur_deletion.set_prev_bad(mapping)
                            else:  # previous mapping was in region AFTER 0-covered fragment
                                # just passed another 0-cov fragment between end of cur_deletion BAD region and this mapping
                                if mapping.start - cur_deletion.next_bad_end > QuastDeletion.MIN_GAP:
                                    if cur_deletion.is_valid(
                                    ):  # add previous fragment's deletion if needed
                                        deletions.append(cur_deletion)
                                    cur_deletion = QuastDeletion(
                                        mapping.ref).set_prev_bad(
                                            position=cur_deletion.next_bad_end)
                                # continue region AFTER 0-covered fragment (old one or new/another one -- see "if" above)
                                if mapping.mapq >= Mapping.MIN_MAP_QUALITY:
                                    cur_deletion.set_next_good(mapping)
                                    if cur_deletion.is_valid():
                                        deletions.append(cur_deletion)
                                    cur_deletion = QuastDeletion(
                                        mapping.ref).set_prev_good(mapping)
                                else:
                                    cur_deletion.set_next_bad_end(mapping)
                        # special case: just started or just switched to the next reference
                        else:
                            if cur_deletion and cur_deletion.ref in seq_name_length:  # switched to the next ref
                                cur_deletion.set_next_good(
                                    position=seq_name_length[cur_deletion.ref])
                                if cur_deletion.is_valid():
                                    deletions.append(cur_deletion)
                            cur_deletion = QuastDeletion(
                                mapping.ref).set_prev_good(mapping)

                        if need_ref_splitting:
                            cur_ref = ref_labels[mapping.ref]
                            if mapping.ref_next.strip(
                            ) == '=' or cur_ref == ref_labels[
                                    mapping.ref_next]:
                                if ref_files[cur_ref] is not None:
                                    ref_files[cur_ref].write(line)
                if cur_deletion and cur_deletion.ref in seq_name_length:  # switched to the next ref
                    cur_deletion.set_next_good(
                        position=seq_name_length[cur_deletion.ref])
                    if cur_deletion.is_valid():
                        deletions.append(cur_deletion)
            if need_ref_splitting:
                for ref_handler in ref_files.values():
                    if ref_handler is not None:
                        ref_handler.close()
            if need_trivial_deletions:
                logger.info('  Trivial deletions: %d found' % len(deletions))
                logger.info('    Saving to: ' + trivial_deletions_fpath)
                with open(trivial_deletions_fpath, 'w') as f:
                    for deletion in deletions:
                        f.write(str(deletion) + '\n')

        if get_manta_fpath() and isfile(get_manta_fpath()):
            try:
                manta_sv_fpath = search_sv_with_manta(main_ref_fpath,
                                                      meta_ref_fpaths,
                                                      output_dirpath, err_path)
                qutils.cat_files([manta_sv_fpath, trivial_deletions_fpath],
                                 bed_fpath)
            except:
                pass
        if os.path.exists(
                trivial_deletions_fpath) and not is_non_empty_file(bed_fpath):
            shutil.copy(trivial_deletions_fpath, bed_fpath)

    if not qconfig.no_sv:
        if is_non_empty_file(bed_fpath):
            logger.main_info('  Structural variations are in ' + bed_fpath)
        else:
            if isfile(bed_fpath):
                logger.main_info('  No structural variations were found.')
            else:
                logger.main_info('  Failed searching structural variations.')
            bed_fpath = None
    if is_non_empty_file(cov_fpath):
        logger.main_info(
            '  Coverage distribution along the reference genome is in ' +
            cov_fpath)
    else:
        if not qconfig.create_icarus_html:
            logger.main_info('  Failed to calculate coverage distribution')
        cov_fpath = None
    return bed_fpath, cov_fpath, physical_cov_fpath
Пример #46
0
def js_data_gen(assemblies, contigs_fpaths, chromosomes_length, output_dirpath, structures_by_labels,
                contigs_by_assemblies, ambiguity_alignments_by_labels=None, contig_names_by_refs=None, ref_fpath=None,
                stdout_pattern=None, features_data=None, cov_fpath=None, physical_cov_fpath=None, json_output_dir=None):
    """Generate the Icarus viewer data/pages and return the main menu HTML path.

    Work done, in order:
      1. If both ``chromosomes_length`` and ``assemblies`` are truthy, group every
         alignment of every assembly by ``align.ref_name`` under the assembly label
         and record the counts of "similar" correct / misassembled alignments in
         the assembly's report.
      2. Create ``<output_dirpath>/<qconfig.icarus_dirname>`` if it does not exist.
      3. Group reference names (``group_references``) and parse both coverage files.
      4. For every grouped reference: collect its contigs, sizes and aligned bases,
         then call ``prepare_alignment_data_for_one_ref`` and
         ``save_alignment_data_for_one_ref``.
      5. Save the contig size page, fill the main menu dictionary (a table of
         references, or a single-reference entry) and save the menu and links.

    :param assemblies: object exposing ``.assemblies``; each item is read for
        ``label``, ``alignments`` and ``fpath``.
    :param contigs_fpaths: contig file paths; used as keys of
        ``qconfig.assembly_labels_by_fpath``.
    :param chromosomes_length: mapping of reference sequence name -> length.
    :param output_dirpath: directory receiving the menu page and the Icarus dir.
    :param contig_names_by_refs: optional mapping of sequence name -> reference
        name; replaced by the value returned from ``group_references``.
    :param ref_fpath: reference file path (presumably always set when references
        exist -- TODO confirm against callers).
    :param json_output_dir: when truthy, links are additionally saved through
        ``json_saver``.
    :return: path of the main Icarus menu HTML file.

    The remaining parameters are forwarded unchanged to the helper functions.
    """
    chr_names = []
    # Bound unconditionally so the per-reference loop below can always read it,
    # even when the guard is false.
    chr_to_aligned_blocks = OrderedDict()
    if chromosomes_length and assemblies:
        chr_names = list(chromosomes_length.keys())
        for assembly in assemblies.assemblies:
            blocks_by_ref = defaultdict(list)
            chr_to_aligned_blocks[assembly.label] = blocks_by_ref
            similar_correct = 0
            similar_misassembled = 0

            for align in assembly.alignments:
                blocks_by_ref[align.ref_name].append(align)
                if align.similar:
                    if align.misassembled:
                        similar_misassembled += 1
                    else:
                        similar_correct += 1
            report = reporting.get(assembly.fpath)
            report.add_field(reporting.Fields.SIMILAR_CONTIGS, similar_correct)
            report.add_field(reporting.Fields.SIMILAR_MIS_BLOCKS, similar_misassembled)

    main_menu_fpath = os.path.join(output_dirpath, qconfig.icarus_html_fname)
    output_all_files_dir_path = os.path.join(output_dirpath, qconfig.icarus_dirname)
    if not os.path.exists(output_all_files_dir_path):
        os.mkdir(output_all_files_dir_path)

    chr_full_names, contig_names_by_refs = group_references(chr_names, contig_names_by_refs, chromosomes_length, ref_fpath)

    # The middle element of the parsed tuple is not used here.
    cov_data, _, max_depth = parse_cov_fpath(cov_fpath, chr_names, chr_full_names, contig_names_by_refs)
    physical_cov_data, _, physical_max_depth = parse_cov_fpath(physical_cov_fpath, chr_names, chr_full_names,
                                                               contig_names_by_refs)

    chr_sizes = {}
    num_contigs = {}
    aligned_bases = genome_analyzer.get_ref_aligned_lengths()
    nx_marks = [reporting.Fields.N50, reporting.Fields.N75, reporting.Fields.NG50, reporting.Fields.NG75]

    assemblies_data, assemblies_contig_size_data, assemblies_n50 = get_assemblies_data(
        contigs_fpaths, output_all_files_dir_path, stdout_pattern, nx_marks)

    # JS snippet mapping numeric ids back to reference sequence names.
    chr_names_by_id = dict((chrom, str(i)) for i, chrom in enumerate(chr_names))
    ref_data = 'var references_by_id = {};\n' + ''.join(
        'references_by_id["' + str(i) + '"] = "' + chrom + '";\n'
        for chrom, i in chr_names_by_id.items())

    # Resolve the contigs of every grouped reference before any per-reference work starts.
    ref_contigs_dict = {}
    for ref_chr in chr_full_names:
        if contig_names_by_refs:
            ref_contigs = [contig for contig in chr_names if contig_names_by_refs[contig] == ref_chr]
        elif len(chr_full_names) == 1:
            ref_contigs = chr_names
        else:
            ref_contigs = [ref_chr]
        ref_contigs_dict[ref_chr] = ref_contigs

    num_misassemblies = defaultdict(int)
    aligned_bases_by_chr = defaultdict(list)
    aligned_assemblies = defaultdict(set)
    for ref_chr in chr_full_names:
        ref_contigs = ref_contigs_dict[ref_chr]
        chr_sizes[ref_chr] = sum([chromosomes_length[contig] for contig in ref_contigs])
        num_contigs[ref_chr] = len(ref_contigs)
        data_str = ['var chromosomes_len = {};']
        for ref_contig in ref_contigs:
            contig_len = chromosomes_length[ref_contig]
            data_str.append('chromosomes_len["' + ref_contig + '"] = ' + str(contig_len) + ';')
            aligned_bases_by_chr[ref_chr].extend(aligned_bases[ref_contig])

        cov_data_str = format_cov_data(cov_data, max_depth, ref_chr, 'coverage_data', 'reads_max_depth') \
            if cov_data else None
        physical_cov_data_str = format_cov_data(physical_cov_data, physical_max_depth, ref_chr,
                                                'physical_coverage_data', 'physical_max_depth') \
            if physical_cov_data else None

        alignment_viewer_fpath, ref_data_str, contigs_structure_str, additional_assemblies_data, ms_selectors, \
            num_misassemblies[ref_chr], aligned_assemblies[ref_chr] = \
            prepare_alignment_data_for_one_ref(ref_chr, chr_full_names, chr_names_by_id, ref_contigs, data_str,
                                               chr_to_aligned_blocks, structures_by_labels,
                                               contigs_by_assemblies,
                                               ambiguity_alignments_by_labels=ambiguity_alignments_by_labels,
                                               cov_data_str=cov_data_str, physical_cov_data_str=physical_cov_data_str,
                                               contig_names_by_refs=contig_names_by_refs,
                                               output_dir_path=output_all_files_dir_path)
        # Kept inside the loop: with no references, ref_fpath (default None) is never
        # handed to name_from_fpath.
        ref_name = qutils.name_from_fpath(ref_fpath)
        save_alignment_data_for_one_ref(ref_chr, ref_contigs, ref_name, json_output_dir, alignment_viewer_fpath,
                                        ref_data_str, ms_selectors,
                                        ref_data=ref_data, features_data=features_data,
                                        assemblies_data=assemblies_data,
                                        contigs_structure_str=contigs_structure_str,
                                        additional_assemblies_data=additional_assemblies_data)

    contigs_sizes_str, too_many_contigs = get_contigs_data(contigs_by_assemblies, nx_marks, assemblies_n50,
                                                           structures_by_labels, contig_names_by_refs,
                                                           chr_names, chr_full_names)
    all_data = assemblies_data + assemblies_contig_size_data + contigs_sizes_str
    save_contig_size_html(output_all_files_dir_path, json_output_dir, too_many_contigs, all_data)

    icarus_links = defaultdict(list)
    if len(chr_full_names) > 1:
        icarus_links["links"].append(qconfig.icarus_html_fname)
        icarus_links["links_names"].append(qconfig.icarus_link)

    main_menu_template_fpath = html_saver.get_real_path(qconfig.icarus_menu_template_fname)
    main_data_dict = dict()

    labels = [qconfig.assembly_labels_by_fpath[contigs_fpath] for contigs_fpath in contigs_fpaths]
    main_data_dict['assemblies'] = labels
    html_saver.save_icarus_data(json_output_dir, ', '.join(labels), 'assemblies')

    contig_size_browser_fpath = os.path.join(qconfig.icarus_dirname, qconfig.contig_size_viewer_fname)
    main_data_dict['contig_size_html'] = contig_size_browser_fpath
    html_saver.save_icarus_data(json_output_dir, contig_size_browser_fpath, 'contig_size_html')
    if not chr_names:
        icarus_links["links"].append(contig_size_browser_fpath)
        icarus_links["links_names"].append(qconfig.icarus_link)

    if chr_full_names and (len(chr_full_names) > 1 or qconfig.is_combined_ref):
        # Several references (or a combined one): the menu shows a table of references.
        table_references = {'references': []}
        main_data_dict['table_references'] = table_references
        num_aligned_assemblies = [len(aligned_assemblies[name]) for name in chr_full_names]
        # The per-reference assembly count column is shown only when the counts differ.
        is_unaligned_asm_exists = len(set(num_aligned_assemblies)) > 1
        if is_unaligned_asm_exists:
            table_references['th_assemblies'] = True
        for ref_chr in sorted(chr_full_names):
            chr_link, chr_name, chr_genome, chr_size, tooltip = get_info_by_chr(
                ref_chr, aligned_bases_by_chr, chr_sizes, contigs_fpaths,
                contig_names_by_refs, one_chromosome=len(chr_full_names) == 1)
            reference_dict = dict()
            reference_dict['chr_link'] = chr_link
            reference_dict['tooltip'] = tooltip
            reference_dict['chr_name'] = os.path.basename(chr_name)
            reference_dict['num_contigs'] = str(num_contigs[ref_chr])
            reference_dict['chr_size'] = format_long_numbers(chr_size)
            if is_unaligned_asm_exists:
                reference_dict['num_assemblies'] = str(len(aligned_assemblies[ref_chr]))
            reference_dict['chr_gf'] = '%.3f' % chr_genome
            reference_dict['num_misassemblies'] = str(num_misassemblies[ref_chr])
            table_references['references'].append(reference_dict)
        html_saver.save_icarus_data(json_output_dir, table_references, 'table_references', as_text=False)
    elif chr_full_names:
        # Exactly one plain reference: the menu links straight to its alignment viewer.
        ref_chr = chr_full_names[0]
        chr_link, chr_name, chr_genome, chr_size, tooltip = get_info_by_chr(
            ref_chr, aligned_bases_by_chr, chr_sizes, contigs_fpaths,
            contig_names_by_refs, one_chromosome=True)
        one_reference = dict()
        main_data_dict['one_reference'] = one_reference
        one_reference['alignment_link'] = chr_link
        one_reference['ref_fpath'] = os.path.basename(ref_fpath)
        one_reference['ref_fragments'] = str(num_contigs[ref_chr])
        one_reference['ref_size'] = format_long_numbers(chr_size)
        one_reference['ref_gf'] = '%.3f' % chr_genome
        one_reference['num_misassemblies'] = str(num_misassemblies[ref_chr])
        icarus_links["links"].append(chr_link)
        icarus_links["links_names"].append(qconfig.icarus_link)
        html_saver.save_icarus_data(json_output_dir, one_reference, 'menu_reference', as_text=False)
    html_saver.save_icarus_html(main_menu_template_fpath, main_menu_fpath, main_data_dict)

    html_saver.save_icarus_links(output_dirpath, icarus_links)
    if json_output_dir:
        json_saver.save_icarus_links(json_output_dir, icarus_links)

    return main_menu_fpath
Пример #47
0
def js_data_gen(assemblies,
                contigs_fpaths,
                chromosomes_length,
                output_dirpath,
                structures_by_labels,
                contigs_by_assemblies,
                ambiguity_alignments_by_labels=None,
                contig_names_by_refs=None,
                ref_fpath=None,
                stdout_pattern=None,
                features_data=None,
                gc_fpath=None,
                cov_fpath=None,
                physical_cov_fpath=None,
                json_output_dir=None):
    """Generate the Icarus viewer data/pages and return the main menu HTML path.

    Work done, in order:
      1. If both ``chromosomes_length`` and ``assemblies`` are truthy, group every
         alignment of every assembly by ``align.ref_name`` under the assembly
         label and record the counts of "similar" correct / misassembled
         alignments in the assembly's report.
      2. Create ``<output_dirpath>/<qconfig.icarus_dirname>`` if it does not exist.
      3. Group reference names (``group_references``) and parse the read
         coverage, physical coverage and GC files with ``parse_cov_fpath``.
      4. For every grouped reference: collect its contigs, sizes and aligned
         bases, then call ``prepare_alignment_data_for_one_ref`` and
         ``save_alignment_data_for_one_ref``.
      5. Save the contig size page, fill the main menu dictionary (a table of
         references, or a single-reference entry) and save the menu and links.

    :param assemblies: object exposing ``.assemblies``; each item is read for
        ``label``, ``alignments`` and ``fpath``.
    :param contigs_fpaths: contig file paths; used as keys of
        ``qconfig.assembly_labels_by_fpath``.
    :param chromosomes_length: mapping of reference sequence name -> length.
    :param output_dirpath: directory receiving the menu page and the Icarus dir.
    :param contig_names_by_refs: optional mapping of sequence name -> reference
        name; replaced by the value returned from ``group_references``.
    :param ref_fpath: reference file path (presumably always set when
        references exist -- TODO confirm against callers).
    :param json_output_dir: forwarded to the ``save_*`` helpers.
    :return: path of the main Icarus menu HTML file.

    The remaining parameters are forwarded unchanged to the helper functions.
    """
    chr_names = []
    # Bound unconditionally so the per-reference loop below can always read it,
    # even when the guard is false.
    chr_to_aligned_blocks = OrderedDict()
    if chromosomes_length and assemblies:
        chr_names = list(chromosomes_length.keys())
        for assembly in assemblies.assemblies:
            blocks_by_ref = defaultdict(list)
            chr_to_aligned_blocks[assembly.label] = blocks_by_ref
            similar_correct = 0
            similar_misassembled = 0

            for align in assembly.alignments:
                blocks_by_ref[align.ref_name].append(align)
                if align.similar:
                    if align.misassembled:
                        similar_misassembled += 1
                    else:
                        similar_correct += 1
            report = reporting.get(assembly.fpath)
            report.add_field(reporting.Fields.SIMILAR_CONTIGS, similar_correct)
            report.add_field(reporting.Fields.SIMILAR_MIS_BLOCKS,
                             similar_misassembled)

    main_menu_fpath = os.path.join(output_dirpath, qconfig.icarus_html_fname)
    output_all_files_dir_path = os.path.join(output_dirpath,
                                             qconfig.icarus_dirname)
    if not os.path.exists(output_all_files_dir_path):
        os.mkdir(output_all_files_dir_path)

    chr_full_names, contig_names_by_refs = group_references(
        chr_names, contig_names_by_refs, chromosomes_length, ref_fpath)

    cov_data, max_depth = parse_cov_fpath(cov_fpath, chr_names, chr_full_names,
                                          contig_names_by_refs)
    physical_cov_data, physical_max_depth = parse_cov_fpath(
        physical_cov_fpath, chr_names, chr_full_names, contig_names_by_refs)
    # The maximum returned for the GC file is unused: the GC track is
    # formatted with a fixed upper bound of 100 below.
    gc_data, _ = parse_cov_fpath(gc_fpath, chr_names, chr_full_names,
                                 contig_names_by_refs)

    chr_sizes = {}
    num_contigs = {}
    aligned_bases = genome_analyzer.get_ref_aligned_lengths()
    nx_marks = [
        reporting.Fields.N50, reporting.Fields.N75, reporting.Fields.NG50,
        reporting.Fields.NG75
    ]

    assemblies_data, assemblies_contig_size_data, assemblies_n50 = get_assemblies_data(
        contigs_fpaths, output_all_files_dir_path, stdout_pattern, nx_marks)

    # JS snippet mapping numeric ids back to reference sequence names.
    chr_names_by_id = dict(
        (chrom, str(i)) for i, chrom in enumerate(chr_names))
    ref_data = 'var references_by_id = {};\n' + ''.join(
        'references_by_id["' + str(i) + '"] = "' + chrom + '";\n'
        for chrom, i in chr_names_by_id.items())

    # Resolve the contigs of every grouped reference before any per-reference
    # work starts.
    ref_contigs_dict = {}
    for ref_chr in chr_full_names:
        if contig_names_by_refs:
            ref_contigs = [
                contig for contig in chr_names
                if contig_names_by_refs[contig] == ref_chr
            ]
        elif len(chr_full_names) == 1:
            ref_contigs = chr_names
        else:
            ref_contigs = [ref_chr]
        ref_contigs_dict[ref_chr] = ref_contigs

    num_misassemblies = defaultdict(int)
    aligned_bases_by_chr = defaultdict(list)
    aligned_assemblies = defaultdict(set)
    for ref_chr in chr_full_names:
        ref_contigs = ref_contigs_dict[ref_chr]
        chr_sizes[ref_chr] = sum(
            [chromosomes_length[contig] for contig in ref_contigs])
        num_contigs[ref_chr] = len(ref_contigs)
        data_str = ['var chromosomes_len = {};']
        for ref_contig in ref_contigs:
            contig_len = chromosomes_length[ref_contig]
            data_str.append('chromosomes_len["' + ref_contig + '"] = ' +
                            str(contig_len) + ';')
            aligned_bases_by_chr[ref_chr].extend(aligned_bases[ref_contig])

        cov_data_str = format_cov_data(
            ref_chr, cov_data, 'coverage_data', max_depth,
            'reads_max_depth') if cov_data else None
        physical_cov_data_str = format_cov_data(
            ref_chr, physical_cov_data, 'physical_coverage_data',
            physical_max_depth,
            'physical_max_depth') if physical_cov_data else None
        gc_data_str = format_cov_data(ref_chr, gc_data, 'gc_data', 100,
                                      'max_gc') if gc_data else None

        alignment_viewer_fpath, ref_data_str, contigs_structure_str, additional_assemblies_data, ms_selectors, \
            num_misassemblies[ref_chr], aligned_assemblies[ref_chr] = \
            prepare_alignment_data_for_one_ref(
                ref_chr, chr_full_names, chr_names_by_id, ref_contigs,
                data_str, chr_to_aligned_blocks, structures_by_labels,
                contigs_by_assemblies,
                ambiguity_alignments_by_labels=ambiguity_alignments_by_labels,
                cov_data_str=cov_data_str,
                physical_cov_data_str=physical_cov_data_str,
                gc_data_str=gc_data_str,
                contig_names_by_refs=contig_names_by_refs,
                output_dir_path=output_all_files_dir_path)
        # Kept inside the loop: with no references, ref_fpath (default None)
        # is never handed to name_from_fpath.
        ref_name = qutils.name_from_fpath(ref_fpath)
        save_alignment_data_for_one_ref(
            ref_chr,
            ref_contigs,
            ref_name,
            json_output_dir,
            alignment_viewer_fpath,
            ref_data_str,
            ms_selectors,
            ref_data=ref_data,
            features_data=features_data,
            assemblies_data=assemblies_data,
            contigs_structure_str=contigs_structure_str,
            additional_assemblies_data=additional_assemblies_data)

    contigs_sizes_str, too_many_contigs = get_contigs_data(
        contigs_by_assemblies, nx_marks, assemblies_n50, structures_by_labels,
        contig_names_by_refs, chr_names, chr_full_names)
    all_data = assemblies_data + assemblies_contig_size_data + contigs_sizes_str
    save_contig_size_html(output_all_files_dir_path, json_output_dir,
                          too_many_contigs, all_data)

    icarus_links = defaultdict(list)
    if len(chr_full_names) > 1:
        icarus_links["links"].append(qconfig.icarus_html_fname)
        icarus_links["links_names"].append(qconfig.icarus_link)

    main_menu_template_fpath = html_saver.get_real_path(
        qconfig.icarus_menu_template_fname)
    main_data_dict = dict()

    labels = [
        qconfig.assembly_labels_by_fpath[contigs_fpath]
        for contigs_fpath in contigs_fpaths
    ]
    main_data_dict['assemblies'] = labels
    html_saver.save_icarus_data(json_output_dir, ', '.join(labels),
                                'assemblies')

    contig_size_browser_fpath = os.path.join(qconfig.icarus_dirname,
                                             qconfig.contig_size_viewer_fname)
    main_data_dict['contig_size_html'] = contig_size_browser_fpath
    html_saver.save_icarus_data(json_output_dir, contig_size_browser_fpath,
                                'contig_size_html')
    if not chr_names:
        icarus_links["links"].append(contig_size_browser_fpath)
        icarus_links["links_names"].append(qconfig.icarus_link)

    if chr_full_names and (len(chr_full_names) > 1 or qconfig.is_combined_ref):
        # Several references (or a combined one): the menu shows a table of
        # references.
        table_references = {'references': []}
        main_data_dict['table_references'] = table_references
        num_aligned_assemblies = [
            len(aligned_assemblies[name]) for name in chr_full_names
        ]
        # The per-reference assembly count column is shown only when the
        # counts differ.
        is_unaligned_asm_exists = len(set(num_aligned_assemblies)) > 1
        if is_unaligned_asm_exists:
            table_references['th_assemblies'] = True
        for ref_chr in sorted(chr_full_names, key=natural_sort):
            chr_link, chr_name, chr_genome, chr_size, tooltip = get_info_by_chr(
                ref_chr,
                aligned_bases_by_chr,
                chr_sizes,
                contigs_fpaths,
                contig_names_by_refs,
                one_chromosome=len(chr_full_names) == 1)
            reference_dict = dict()
            reference_dict['chr_link'] = chr_link
            reference_dict['tooltip'] = tooltip
            reference_dict['chr_name'] = os.path.basename(chr_name)
            reference_dict['num_contigs'] = str(num_contigs[ref_chr])
            reference_dict['chr_size'] = format_long_numbers(chr_size)
            if is_unaligned_asm_exists:
                reference_dict['num_assemblies'] = str(
                    len(aligned_assemblies[ref_chr]))
            reference_dict['chr_gf'] = '%.3f' % chr_genome
            reference_dict['num_misassemblies'] = str(
                num_misassemblies[ref_chr])
            table_references['references'].append(reference_dict)
        html_saver.save_icarus_data(json_output_dir,
                                    table_references,
                                    'table_references',
                                    as_text=False)
    elif chr_full_names:
        # Exactly one plain reference: the menu links straight to its
        # alignment viewer.
        ref_chr = chr_full_names[0]
        chr_link, chr_name, chr_genome, chr_size, tooltip = get_info_by_chr(
            ref_chr,
            aligned_bases_by_chr,
            chr_sizes,
            contigs_fpaths,
            contig_names_by_refs,
            one_chromosome=True)
        one_reference = dict()
        main_data_dict['one_reference'] = one_reference
        one_reference['alignment_link'] = chr_link
        one_reference['ref_fpath'] = os.path.basename(ref_fpath)
        one_reference['ref_fragments'] = str(num_contigs[ref_chr])
        one_reference['ref_size'] = format_long_numbers(chr_size)
        one_reference['ref_gf'] = '%.3f' % chr_genome
        one_reference['num_misassemblies'] = str(num_misassemblies[ref_chr])
        icarus_links["links"].append(chr_link)
        icarus_links["links_names"].append(qconfig.icarus_link)
        html_saver.save_icarus_data(json_output_dir,
                                    one_reference,
                                    'menu_reference',
                                    as_text=False)
    html_saver.save_icarus_html(main_menu_template_fpath, main_menu_fpath,
                                main_data_dict)
    html_saver.save_icarus_links(output_dirpath, icarus_links)

    return main_menu_fpath
Пример #48
0
def align_and_analyze(is_cyclic,
                      index,
                      contigs_fpath,
                      output_dirpath,
                      ref_fpath,
                      reference_chromosomes,
                      ns_by_chromosomes,
                      old_contigs_fpath,
                      bed_fpath,
                      threads=1):
    tmp_output_dirpath = create_minimap_output_dir(output_dirpath)
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    out_basename = join(tmp_output_dirpath, corr_assembly_label)

    logger.info('  ' + qutils.index_to_str(index) + assembly_label)

    if not qconfig.space_efficient:
        log_out_fpath = join(
            output_dirpath,
            qconfig.contig_report_fname_pattern % corr_assembly_label +
            '.stdout')
        log_err_fpath = join(
            output_dirpath,
            qconfig.contig_report_fname_pattern % corr_assembly_label +
            '.stderr')
        icarus_out_fpath = join(
            output_dirpath,
            qconfig.icarus_report_fname_pattern % corr_assembly_label)
        misassembly_fpath = join(
            output_dirpath,
            qconfig.contig_report_fname_pattern % corr_assembly_label +
            '.mis_contigs.info')
        unaligned_info_fpath = join(
            output_dirpath,
            qconfig.contig_report_fname_pattern % corr_assembly_label +
            '.unaligned.info')
    else:
        log_out_fpath = '/dev/null'
        log_err_fpath = '/dev/null'
        icarus_out_fpath = '/dev/null'
        misassembly_fpath = '/dev/null'
        unaligned_info_fpath = '/dev/null'

    icarus_out_f = open(icarus_out_fpath, 'w')
    icarus_header_cols = [
        'S1', 'E1', 'S2', 'E2', 'Reference', 'Contig', 'IDY', 'Ambiguous',
        'Best_group'
    ]
    icarus_out_f.write('\t'.join(icarus_header_cols) + '\n')
    misassembly_f = open(misassembly_fpath, 'w')

    if not qconfig.space_efficient:
        logger.info('  ' + qutils.index_to_str(index) + 'Logging to files ' +
                    log_out_fpath + ' and ' + os.path.basename(log_err_fpath) +
                    '...')
    else:
        logger.info('  ' + qutils.index_to_str(index) + 'Logging is disabled.')

    coords_fpath, coords_filtered_fpath, unaligned_fpath, used_snps_fpath = get_aux_out_fpaths(
        out_basename)
    status = align_contigs(coords_fpath, out_basename, ref_fpath,
                           contigs_fpath, old_contigs_fpath, index, threads,
                           log_out_fpath, log_err_fpath)
    if status != AlignerStatus.OK:
        with open(log_err_fpath, 'a') as log_err_f:
            if status == AlignerStatus.ERROR:
                logger.error(
                    '  ' + qutils.index_to_str(index) +
                    'Failed aligning contigs ' +
                    qutils.label_from_fpath(contigs_fpath) +
                    ' to the reference (non-zero exit code). ' +
                    ('Run with the --debug flag to see additional information.'
                     if not qconfig.debug else ''))
            elif status == AlignerStatus.FAILED:
                log_err_f.write(
                    qutils.index_to_str(index) + 'Alignment failed for ' +
                    contigs_fpath + ':' + coords_fpath + 'doesn\'t exist.\n')
                logger.info('  ' + qutils.index_to_str(index) +
                            'Alignment failed for ' + '\'' + assembly_label +
                            '\'.')
            elif status == AlignerStatus.NOT_ALIGNED:
                log_err_f.write(
                    qutils.index_to_str(index) + 'Nothing aligned for ' +
                    contigs_fpath + '\n')
                logger.info('  ' + qutils.index_to_str(index) +
                            'Nothing aligned for ' + '\'' + assembly_label +
                            '\'.')
        return status, {}, [], [], []

    log_out_f = open(log_out_fpath, 'a')
    # Loading the alignment files
    log_out_f.write('Parsing coords...\n')
    aligns = {}
    with open(coords_fpath) as coords_file:
        for line in coords_file:
            mapping = Mapping.from_line(line)
            aligns.setdefault(mapping.contig, []).append(mapping)

    # Loading the reference sequences
    log_out_f.write('Loading reference...\n')  # TODO: move up
    ref_features = {}

    # Loading the regions (if any)
    regions = {}
    total_reg_len = 0
    total_regions = 0
    # # TODO: gff
    # log_out_f.write('Loading regions...\n')
    # log_out_f.write('\tNo regions given, using whole reference.\n')
    for name, seq_len in reference_chromosomes.items():
        log_out_f.write('\tLoaded [%s]\n' % name)
        regions.setdefault(name, []).append([1, seq_len])
        total_regions += 1
        total_reg_len += seq_len
    log_out_f.write('\tTotal Regions: %d\n' % total_regions)
    log_out_f.write('\tTotal Region Length: %d\n' % total_reg_len)

    ca_output = CAOutput(stdout_f=log_out_f,
                         misassembly_f=misassembly_f,
                         coords_filtered_f=open(coords_filtered_fpath, 'w'),
                         icarus_out_f=icarus_out_f)

    log_out_f.write('Analyzing contigs...\n')
    result, ref_aligns, total_indels_info, aligned_lengths, misassembled_contigs, misassemblies_in_contigs, aligned_lengths_by_contigs =\
        analyze_contigs(ca_output, contigs_fpath, unaligned_fpath, unaligned_info_fpath, aligns, ref_features, reference_chromosomes, is_cyclic)

    log_out_f.write('Analyzing coverage...\n')
    if qconfig.show_snps:
        log_out_f.write('Writing SNPs into ' + used_snps_fpath + '\n')
    total_aligned_bases, indels_info = analyze_coverage(
        ref_aligns, reference_chromosomes, ns_by_chromosomes, used_snps_fpath)
    total_indels_info += indels_info
    cov_stats = {
        'SNPs': total_indels_info.mismatches,
        'indels_list': total_indels_info.indels_list,
        'total_aligned_bases': total_aligned_bases
    }
    result.update(cov_stats)
    result = print_results(contigs_fpath, log_out_f, used_snps_fpath,
                           total_indels_info, result)

    if not qconfig.space_efficient:
        ## outputting misassembled contigs to separate file
        fasta = [(name, seq)
                 for name, seq in fastaparser.read_fasta(contigs_fpath)
                 if name in misassembled_contigs.keys()]
        fastaparser.write_fasta(
            join(output_dirpath,
                 qutils.name_from_fpath(contigs_fpath) + '.mis_contigs.fa'),
            fasta)

    if qconfig.is_combined_ref:
        alignment_tsv_fpath = join(
            output_dirpath, "alignments_" + corr_assembly_label + '.tsv')
        unique_contigs_fpath = join(
            output_dirpath,
            qconfig.unique_contigs_fname_pattern % corr_assembly_label)
        logger.debug('  ' + qutils.index_to_str(index) + 'Alignments: ' +
                     qutils.relpath(alignment_tsv_fpath))
        used_contigs = set()
        with open(unique_contigs_fpath, 'w') as unique_contigs_f:
            with open(alignment_tsv_fpath, 'w') as alignment_tsv_f:
                for chr_name, aligns in ref_aligns.items():
                    alignment_tsv_f.write(chr_name)
                    contigs = set([align.contig for align in aligns])
                    for contig in contigs:
                        alignment_tsv_f.write('\t' + contig)

                    if qconfig.is_combined_ref:
                        ref_name = ref_labels_by_chromosomes[chr_name]
                        align_by_contigs = defaultdict(int)
                        for align in aligns:
                            align_by_contigs[align.contig] += align.len2
                        for contig, aligned_len in align_by_contigs.items():
                            if contig in used_contigs:
                                continue
                            used_contigs.add(contig)
                            len_cov_pattern = re.compile(
                                r'_length_([\d\.]+)_cov_([\d\.]+)')
                            if len_cov_pattern.findall(contig):
                                contig_len = len_cov_pattern.findall(
                                    contig)[0][0]
                                contig_cov = len_cov_pattern.findall(
                                    contig)[0][1]
                                if aligned_len / float(contig_len) > 0.9:
                                    unique_contigs_f.write(ref_name + '\t' +
                                                           str(aligned_len) +
                                                           '\t' + contig_cov +
                                                           '\n')
                    alignment_tsv_f.write('\n')

    close_handlers(ca_output)
    logger.info('  ' + qutils.index_to_str(index) + 'Analysis is finished.')
    logger.debug('')
    if not ref_aligns:
        return AlignerStatus.NOT_ALIGNED, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
    else:
        return AlignerStatus.OK, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
Пример #49
0
def align_single_file(fpath, main_output_dir, output_dirpath, log_path, err_fpath, max_threads, sam_fpath=None, bam_fpath=None,
                      index=None, required_files=None, is_reference=False, alignment_only=False, using_reads='all'):
    """Align reads against one FASTA file and produce SAM/BAM files plus flag statistics.

    Existing SAM/BAM/flagstat files are reused when ``get_correct_names_for_chroms``
    accepts them (returns a non-None value); otherwise BWA is run from inside
    ``output_dirpath`` using the reads listed in ``qconfig.reads_fpaths``.

    :param fpath: path to the FASTA file (assembly or reference) to align against.
    :param main_output_dir: forwarded unchanged to ``align_reads``.
    :param output_dirpath: directory in which SAM/BAM files are created; BWA runs with
        this directory as its working directory.
    :param log_path: only used in the error message emitted when BWA produced no SAM file.
    :param err_fpath: file that receives stderr of the external tools (opened in append mode).
    :param max_threads: number of threads handed to sambamba / the aligner.
    :param sam_fpath: optional pre-existing SAM file; derived from ``bam_fpath`` or
        ``fpath`` when omitted.
    :param bam_fpath: optional pre-existing BAM file; derived from ``sam_fpath`` when omitted.
    :param index: optional assembly index used as a prefix in log messages.
    :param required_files: optional list of files that must exist for results to be reused.
        When the SAM file becomes a requirement it is appended to this list in place.
    :param is_reference: True when ``fpath`` is the reference (changes log wording only,
        besides being forwarded to ``get_correct_names_for_chroms``).
    :param alignment_only: when True, flag statistics and coverage analysis are skipped.
    :param using_reads: read-set tag; any value other than ``'all'`` is embedded into the
        SAM/BAM file names.
    :return: ``(correct_chr_names, sam_fpath, bam_fpath)`` on success,
        ``(None, None, None)`` when nothing can be aligned or reused.
    """
    filename = qutils.name_from_fpath(fpath)
    if not sam_fpath and bam_fpath:
        sam_fpath = get_safe_fpath(output_dirpath, bam_fpath[:-4] + '.sam')
    else:
        sam_fpath = sam_fpath or join(output_dirpath, filename + '.sam')
    bam_fpath = bam_fpath or get_safe_fpath(output_dirpath, sam_fpath[:-4] + '.bam')
    if using_reads != 'all':
        sam_fpath = join(output_dirpath, filename + '.' + using_reads + '.sam')
        bam_fpath = sam_fpath.replace('.sam', '.bam')
    if alignment_only or (is_reference and required_files and any(f.endswith('bed') for f in required_files)):
        # required_files defaults to None; without this guard alignment_only=True
        # combined with the default would fail with AttributeError on append().
        if required_files is None:
            required_files = []
        required_files.append(sam_fpath)

    stats_fpath = get_safe_fpath(dirname(output_dirpath), filename + '.stat')
    index_str = qutils.index_to_str(index) if index is not None else ''

    reads_fpaths = qconfig.reads_fpaths
    correct_chr_names = get_correct_names_for_chroms(output_dirpath, fpath, sam_fpath, err_fpath, reads_fpaths, logger, is_reference)
    can_reuse = correct_chr_names is not None
    if not can_reuse and not reads_fpaths:
        return None, None, None

    # Fast path: everything needed is already on disk.
    if correct_chr_names and (not required_files or all(isfile(f) for f in required_files)):
        if not alignment_only:
            if isfile(stats_fpath):
                logger.info('  ' + index_str + 'Using existing flag statistics file ' + stats_fpath)
            elif isfile(bam_fpath):
                # Context managers make sure the redirect targets are closed
                # once sambamba has finished.
                with open(stats_fpath, 'w') as stats_f, open(err_fpath, 'a') as err_f:
                    qutils.call_subprocess([sambamba_fpath('sambamba'), 'flagstat', '-t', str(max_threads), bam_fpath],
                                           stdout=stats_f, stderr=err_f)
                analyse_coverage(output_dirpath, fpath, correct_chr_names, bam_fpath, stats_fpath, err_fpath, logger)
        if isfile(stats_fpath) or alignment_only:
            return correct_chr_names, sam_fpath, bam_fpath

    logger.info('  ' + index_str + 'Pre-processing reads...')
    if is_non_empty_file(sam_fpath) and can_reuse:
        logger.info('  ' + index_str + 'Using existing SAM-file: ' + sam_fpath)
        correct_chr_names = get_correct_names_for_chroms(output_dirpath, fpath, sam_fpath, err_fpath, reads_fpaths, logger, is_reference)
    elif is_non_empty_file(bam_fpath) and can_reuse:
        logger.info('  ' + index_str + 'Using existing BAM-file: ' + bam_fpath)
        sambamba_view(bam_fpath, sam_fpath, qconfig.max_threads, err_fpath, logger)
        correct_chr_names = get_correct_names_for_chroms(output_dirpath, fpath, sam_fpath, err_fpath, reads_fpaths, logger, is_reference)

    if (not correct_chr_names or not is_non_empty_file(sam_fpath)) and reads_fpaths:
        if is_reference:
            logger.info('  Running BWA for reference...')
        else:
            logger.info('  ' + index_str + 'Running BWA...')
        # use absolute paths because we will change workdir
        # NOTE(review): bam_fpath is not made absolute here; a relative bam_fpath would be
        # resolved against output_dirpath while BWA runs - confirm callers pass absolute paths.
        fpath = abspath(fpath)
        sam_fpath = abspath(sam_fpath)

        prev_dir = os.getcwd()
        os.chdir(output_dirpath)
        try:
            bwa_index(fpath, err_fpath, logger)
            sam_fpaths = align_reads(fpath, sam_fpath, using_reads, main_output_dir, err_fpath, max_threads)

            if len(sam_fpaths) > 1:
                merge_sam_files(sam_fpaths, sam_fpath, bam_fpath, max_threads, err_fpath)
            elif len(sam_fpaths) == 1:
                shutil.move(sam_fpaths[0], sam_fpath)
                tmp_bam_fpath = sam_fpaths[0].replace('.sam', '.bam')
                if is_non_empty_file(tmp_bam_fpath):
                    shutil.move(tmp_bam_fpath, bam_fpath)

            logger.info('  ' + index_str + 'Done.')
        finally:
            # Always restore the working directory, even if indexing/alignment raised.
            os.chdir(prev_dir)
        if not is_non_empty_file(sam_fpath):
            logger.error('  Failed running BWA for ' + fpath + '. See ' + log_path + ' for information.')
            return None, None, None
        correct_chr_names = get_correct_names_for_chroms(output_dirpath, fpath, sam_fpath, err_fpath, reads_fpaths, logger, is_reference)

    elif not correct_chr_names or not is_non_empty_file(sam_fpath):
        return None, None, None

    if is_reference:
        logger.info('  Sorting SAM-file for reference...')
    else:
        logger.info('  ' + index_str + 'Sorting SAM-file...')

    if can_reuse and is_non_empty_file(bam_fpath) and all_read_names_correct(sam_fpath):
        logger.info('  ' + index_str + 'Using existing BAM-file: ' + bam_fpath)
    else:
        correct_sam_fpath = join(output_dirpath, filename + '.' + using_reads + '.correct.sam')  # write in output dir
        sam_fpath = clean_read_names(sam_fpath, correct_sam_fpath)
        sambamba_view(correct_sam_fpath, bam_fpath, max_threads, err_fpath, logger, filter_rule=None)

    qutils.assert_file_exists(bam_fpath, 'bam file')
    if not alignment_only:
        if isfile(stats_fpath):
            logger.info('  ' + index_str + 'Using existing flag statistics file ' + stats_fpath)
        elif isfile(bam_fpath):
            with open(stats_fpath, 'w') as stats_f, open(err_fpath, 'a') as err_f:
                qutils.call_subprocess([sambamba_fpath('sambamba'), 'flagstat', '-t', str(max_threads), bam_fpath],
                                       stdout=stats_f, stderr=err_f)
            analyse_coverage(output_dirpath, fpath, correct_chr_names, bam_fpath, stats_fpath, err_fpath, logger)
        if is_reference:
            logger.info('  Analysis for reference is finished.')
        else:
            logger.info('  ' + index_str + 'Analysis is finished.')
    return correct_chr_names, sam_fpath, bam_fpath
Пример #50
0
def main(args):
    """Entry point of the MetaQUAST pipeline.

    Visible stages, in order:
      1. parse options and set up the output / corrected-input directories;
      2. correct the given references (or search and download them when none are given);
      3. correct the assemblies;
      4. run quast.py against the combined reference (re-running once with a filtered
         reference set when downloaded references were excluded);
      5. partition contigs per reference and run quast.py for every reference,
         in parallel when there are more references than assemblies and spare threads;
      6. run quast.py for contigs that were not aligned anywhere;
      7. build the summary (and optional HTML / Icarus) reports and clean up.

    :param args: command-line arguments without the program name.
    :return: ``4`` when no assembly contains usable contigs, otherwise the value returned
        by ``logger.finish_up``. When no references are available at all, regular QUAST
        is started and the process exits with code 0.
    """
    check_dirpath(qconfig.QUAST_HOME, 'You are trying to run it from ' + str(qconfig.QUAST_HOME) + '.\n' +
                  'Please, put QUAST in a different directory, then try again.\n', exit_code=3)

    if not args:
        qconfig.usage(stream=sys.stderr)
        sys.exit(1)

    metaquast_path = [os.path.realpath(__file__)]
    quast_py_args, contigs_fpaths = parse_options(logger, metaquast_path + args)
    output_dirpath, ref_fpaths, labels = qconfig.output_dirpath, qconfig.reference, qconfig.labels
    html_report = qconfig.html_report
    test_mode = qconfig.test

    # Directories
    output_dirpath, _, _ = qutils.set_up_output_dir(
        output_dirpath, None, not output_dirpath,
        save_json=False)

    corrected_dirpath = os.path.join(output_dirpath, qconfig.corrected_dirname)

    qconfig.set_max_threads(logger)
    qutils.logger = logger

    ########################################################################

    from quast_libs import reporting
    try:
        import importlib
        importlib.reload(reporting)
    except (ImportError, AttributeError):
        # Python 2: importlib has no reload(), fall back to the builtin.
        reload(reporting)
    from quast_libs import plotter

    # Start from an empty directory for corrected inputs.
    if os.path.isdir(corrected_dirpath):
        shutil.rmtree(corrected_dirpath)
    os.mkdir(corrected_dirpath)

    # PROCESSING REFERENCES
    if ref_fpaths:
        logger.main_info()
        logger.main_info('Reference(s):')

        corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_names =\
            correct_meta_references(ref_fpaths, corrected_dirpath)

    # PROCESSING CONTIGS
    logger.main_info()
    logger.main_info('Contigs:')
    qconfig.no_check_meta = True
    assemblies, labels = correct_assemblies(contigs_fpaths, output_dirpath, labels)
    if not assemblies:
        logger.error("None of the assembly files contains correct contigs. "
                     "Please, provide different files or decrease --min-contig threshold.")
        return 4

    # Running QUAST(s)
    if qconfig.gene_finding:
        quast_py_args += ['--mgm']
    if qconfig.min_IDY is None: # special case: user not specified min-IDY, so we need to use MetaQUAST default value
        quast_py_args += ['--min-identity', str(qconfig.META_MIN_IDY)]

    reuse_combined_alignments = bool(qconfig.reuse_combined_alignments)

    downloaded_refs = False

    # SEARCHING REFERENCES
    if not ref_fpaths:
        logger.main_info()
        if qconfig.max_references == 0:
            logger.notice("Maximum number of references (--max-ref-number) is set to 0, search in SILVA 16S rRNA database is disabled")
        else:
            if qconfig.references_txt:
                logger.main_info("List of references was provided, starting to download reference genomes from NCBI...")
            else:
                logger.main_info("No references are provided, starting to search for reference genomes in SILVA 16S rRNA database "
                        "and to download them from NCBI...")
            downloaded_dirpath = os.path.join(output_dirpath, qconfig.downloaded_dirname)
            if not os.path.isdir(downloaded_dirpath):
                os.mkdir(downloaded_dirpath)
            ref_fpaths = search_references_meta.do(assemblies, labels, downloaded_dirpath, corrected_dirpath, qconfig.references_txt)
            if ref_fpaths:
                search_references_meta.is_quast_first_run = True
                if not qconfig.references_txt:
                    downloaded_refs = True
                logger.main_info()
                logger.main_info('Downloaded reference(s):')
                corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_names =\
                    correct_meta_references(ref_fpaths, corrected_dirpath, downloaded_refs=True)
            elif test_mode:
                logger.error('Failed to download or setup SILVA 16S rRNA database for working without '
                             'references on metagenome datasets!', to_stderr=True, exit_with_code=4)

    if not ref_fpaths:
        # No references, running regular quast with MetaGenemark gene finder
        logger.main_info()
        logger.notice('No references are provided, starting regular QUAST with MetaGeneMark gene finder')
        assemblies = [Assembly(fpath, qutils.label_from_fpath(fpath)) for fpath in contigs_fpaths]
        _start_quast_main(quast_py_args, assemblies=assemblies, output_dirpath=output_dirpath, run_regular_quast=True)
        # sys.exit instead of the site-provided exit(): the latter is missing when the
        # interpreter runs without the site module (python -S, frozen builds).
        sys.exit(0)

    # Running combined reference
    combined_output_dirpath = os.path.join(output_dirpath, qconfig.combined_output_name)
    qconfig.reference = combined_ref_fpath

    if qconfig.bed:
        quast_py_args += ['--sv-bed']
        quast_py_args += [qconfig.bed]

    quast_py_args += ['--combined-ref']
    if qconfig.draw_plots or qconfig.html_report:
        if plotter_data.dict_color_and_ls:
            # Forward the already chosen colors / line styles so that every run draws alike.
            colors_and_ls = [plotter_data.dict_color_and_ls[asm.label] for asm in assemblies]
            quast_py_args += ['--colors']
            quast_py_args += [','.join([style[0] for style in colors_and_ls])]
            quast_py_args += ['--ls']
            quast_py_args += [','.join([style[1] for style in colors_and_ls])]
    run_name = 'for the combined reference'
    logger.main_info()
    logger.main_info('Starting quast.py ' + run_name + '...')
    total_num_notices = 0
    total_num_warnings = 0
    total_num_nf_errors = 0
    total_num_notifications = (total_num_notices, total_num_warnings, total_num_nf_errors)
    if qconfig.html_report:
        from quast_libs.html_saver import json_saver
        json_texts = []
    else:
        # json_texts is None <=> no HTML report; json_saver is only touched when it is not None.
        json_texts = None
    if qconfig.unique_mapping:
        ambiguity_opts = []
    else:
        ambiguity_opts = ["--ambiguity-usage", 'all']
    return_code, total_num_notifications = \
        _start_quast_main(quast_py_args + ambiguity_opts,
        labels=labels,
        assemblies=assemblies,
        reference_fpath=combined_ref_fpath,
        output_dirpath=combined_output_dirpath,
        num_notifications_tuple=total_num_notifications,
        is_combined_ref=True)

    if json_texts is not None:
        json_texts.append(json_saver.json_text)
    search_references_meta.is_quast_first_run = False

    genome_info_dirpath = os.path.join(output_dirpath, qconfig.combined_output_name, 'genome_stats')
    genome_info_fpath = os.path.join(genome_info_dirpath, 'genome_info.txt')
    if not os.path.exists(genome_info_fpath):
        # A missing genome_info.txt is treated as "nothing aligned to any reference".
        logger.main_info('')
        if not downloaded_refs:
            msg = 'Try to restart MetaQUAST with another references.'
        else:
            msg = 'Try to use option --max-ref-number to change maximum number of references (per each assembly) to download.'
        logger.main_info('Failed aligning the contigs for all the references. ' + msg)
        logger.main_info('')
        cleanup(corrected_dirpath)
        logger.main_info('MetaQUAST finished.')
        return logger.finish_up(numbers=tuple(total_num_notifications), check_test=test_mode)

    if downloaded_refs and return_code == 0:
        logger.main_info()
        logger.main_info('Excluding downloaded references with low genome fraction from further analysis..')
        corr_ref_fpaths = get_downloaded_refs_with_alignments(genome_info_fpath, ref_fpaths, chromosomes_by_refs)
        if corr_ref_fpaths and corr_ref_fpaths != ref_fpaths:
            logger.main_info()
            logger.main_info('Filtered reference(s):')
            os.remove(combined_ref_fpath)
            contigs_analyzer.ref_labels_by_chromosomes = OrderedDict()
            corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_names = \
                correct_meta_references(corr_ref_fpaths, corrected_dirpath)
            assemblies, labels = correct_assemblies(contigs_fpaths, output_dirpath, labels)
            run_name = 'for the corrected combined reference'
            logger.main_info()
            logger.main_info('Starting quast.py ' + run_name + '...')
            return_code, total_num_notifications = \
                _start_quast_main(quast_py_args + ambiguity_opts,
                labels=labels,
                assemblies=assemblies,
                reference_fpath=combined_ref_fpath,
                output_dirpath=combined_output_dirpath,
                num_notifications_tuple=total_num_notifications,
                is_combined_ref=True)
            if json_texts is not None:
                # Replace the JSON of the first combined run with the one of the re-run.
                json_texts = json_texts[:-1]
                json_texts.append(json_saver.json_text)
        elif corr_ref_fpaths == ref_fpaths:
            logger.main_info('All downloaded references have genome fraction more than 10%. Nothing was excluded.')
        else:
            logger.main_info('All downloaded references have low genome fraction. Nothing was excluded for now.')

    if return_code != 0:
        logger.main_info('MetaQUAST finished.')
        return logger.finish_up(numbers=tuple(total_num_notifications), check_test=test_mode)

    if qconfig.calculate_read_support:
        calculate_ave_read_support(combined_output_dirpath, assemblies)

    prepare_regular_quast_args(quast_py_args, combined_output_dirpath, reuse_combined_alignments)
    logger.main_info()
    logger.main_info('Partitioning contigs into bins aligned to each reference..')

    assemblies_by_reference, not_aligned_assemblies = partition_contigs(
        assemblies, corrected_ref_fpaths, corrected_dirpath,
        os.path.join(combined_output_dirpath, qconfig.detailed_contigs_reports_dirname, 'alignments_%s.tsv'), labels)

    output_dirpath_per_ref = os.path.join(output_dirpath, qconfig.per_ref_dirname)
    if not qconfig.memory_efficient and \
                    len(assemblies_by_reference) > len(assemblies) and len(assemblies) < qconfig.max_threads:
        logger.main_info()
        logger.main_info('Run QUAST on different references in parallel..')
        threads_per_ref = max(1, qconfig.max_threads // len(assemblies_by_reference))
        # These two options are only meant for the per-reference runs; they are removed again below.
        quast_py_args += ['--memory-efficient']
        quast_py_args += ['-t', str(threads_per_ref)]

        num_notifications = (0, 0, 0)
        parallel_run_args = [(quast_py_args, output_dirpath_per_ref, ref_fpath, ref_assemblies, num_notifications, True)
                             for ref_fpath, ref_assemblies in assemblies_by_reference]
        ref_names, ref_json_texts, ref_notifications = \
            run_parallel(_run_quast_per_ref, parallel_run_args, qconfig.max_threads, filter_results=True)
        # Sum the per-reference counters element-wise, then add them to the running totals.
        per_ref_num_notifications = list(map(sum, zip(*ref_notifications)))
        total_num_notifications = list(map(sum, zip(total_num_notifications, per_ref_num_notifications)))
        if json_texts is not None:
            json_texts.extend(ref_json_texts)
        quast_py_args.remove('--memory-efficient')
        quast_py_args = remove_from_quast_py_args(quast_py_args, '-t', str(threads_per_ref))
    else:
        ref_names = []
        for ref_fpath, ref_assemblies in assemblies_by_reference:
            ref_name, json_text, total_num_notifications = \
                _run_quast_per_ref(quast_py_args, output_dirpath_per_ref, ref_fpath, ref_assemblies, total_num_notifications)
            if not ref_name:
                continue
            ref_names.append(ref_name)
            if json_texts is not None:
                json_texts.append(json_text)

    # Finally running for the contigs that has not been aligned to any reference
    no_unaligned_contigs = True
    for assembly in not_aligned_assemblies:
        if os.path.isfile(assembly.fpath) and os.stat(assembly.fpath).st_size != 0:
            no_unaligned_contigs = False
            break

    run_name = 'for the contigs not aligned anywhere'
    logger.main_info()
    if no_unaligned_contigs:
        logger.main_info('Skipping quast.py ' + run_name + ' (everything is aligned!)')
    else:
        not_aligned_log_fpath = os.path.join(output_dirpath, qconfig.not_aligned_name,
                                             qconfig.LOGGER_DEFAULT_NAME + '.log')
        logger.main_info('Starting quast.py ' + run_name + '... (logging to ' + not_aligned_log_fpath + ')')

        return_code, total_num_notifications = _start_quast_main(quast_py_args + ['-t', str(qconfig.max_threads)],
            assemblies=not_aligned_assemblies,
            output_dirpath=os.path.join(output_dirpath, qconfig.not_aligned_name),
            num_notifications_tuple=total_num_notifications)

        if return_code not in [0, 4]:
            logger.error('Error running quast.py for the contigs not aligned anywhere')
        elif return_code == 4:  # no unaligned contigs, i.e. everything aligned
            no_unaligned_contigs = True
        if not no_unaligned_contigs:
            if json_texts is not None:
                json_texts.append(json_saver.json_text)

    if ref_names:
        logger.print_timestamp()
        logger.main_info("Summarizing results...")

        summary_output_dirpath = os.path.join(output_dirpath, qconfig.meta_summary_dir)
        if not os.path.isdir(summary_output_dirpath):
            os.makedirs(summary_output_dirpath)
        if html_report and json_texts:
            from quast_libs.html_saver import html_saver
            html_summary_report_fpath = html_saver.init_meta_report(output_dirpath)
        else:
            html_summary_report_fpath = None
        from quast_libs import create_meta_summary
        metrics_for_plots = reporting.Fields.main_metrics
        misassembly_metrics = [reporting.Fields.MIS_RELOCATION, reporting.Fields.MIS_TRANSLOCATION, reporting.Fields.MIS_INVERTION,
                              reporting.Fields.MIS_ISTRANSLOCATIONS]
        full_ref_names = [qutils.name_from_fpath(ref_fpath) for ref_fpath in corrected_ref_fpaths]
        if not no_unaligned_contigs:
            # The "not aligned" run is summarized like one more reference.
            full_ref_names.append(qconfig.not_aligned_name)
        create_meta_summary.do(html_summary_report_fpath, summary_output_dirpath, combined_output_dirpath,
                               output_dirpath_per_ref, metrics_for_plots, misassembly_metrics, full_ref_names)
        if html_report and json_texts:
            html_saver.save_colors(output_dirpath, contigs_fpaths, plotter_data.dict_color_and_ls, meta=True)
            if qconfig.create_icarus_html:
                icarus_html_fpath = html_saver.create_meta_icarus(output_dirpath, ref_names)
                logger.main_info('  Icarus (contig browser) is saved to %s' % icarus_html_fpath)
            html_saver.create_meta_report(output_dirpath, json_texts)

    cleanup(corrected_dirpath)
    logger.main_info('')
    logger.main_info('MetaQUAST finished.')
    return logger.finish_up(numbers=tuple(total_num_notifications), check_test=test_mode)
Пример #51
0
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
       genes_fpaths, operons_fpaths, detailed_contigs_reports_dirpath, genome_stats_dirpath):
    """Run the genome analyzer: genome fraction, duplication ratio, gaps, genes and operons.

    Writes ``genome_info.txt`` into ``genome_stats_dirpath``, adds the computed fields to
    the per-assembly reports and, depending on ``qconfig``, saves JSON/HTML data and plots.

    :param ref_fpath: reference FASTA file; chromosome names are the first
        whitespace-separated token of each FASTA header.
    :param aligned_contigs_fpaths: assembly files to analyze.
    :param output_dirpath: directory handed to ``html_saver`` for HTML report data.
    :param json_output_dirpath: directory for JSON output; nothing is saved when falsy.
    :param genes_fpaths: files with gene annotations (may be empty).
    :param operons_fpaths: files with operon annotations (may be empty).
    :param detailed_contigs_reports_dirpath: directory containing ``nucmer_output``.
    :param genome_stats_dirpath: output directory for this module (created if missing).
    :return: ``[genes_container, operons_container]``, or ``None`` when the analysis
        failed for every assembly.
    """
    nucmer_path_dirpath = os.path.join(detailed_contigs_reports_dirpath, 'nucmer_output')
    from quast_libs import search_references_meta
    if search_references_meta.is_quast_first_run:
        nucmer_path_dirpath = os.path.join(nucmer_path_dirpath, 'raw')

    logger.print_timestamp()
    logger.main_info('Running Genome analyzer...')

    if not os.path.isdir(genome_stats_dirpath):
        os.mkdir(genome_stats_dirpath)

    reference_chromosomes = {}
    genome_size = 0
    for name, seq in fastaparser.read_fasta(ref_fpath):
        chr_name = name.split()[0]
        chr_len = len(seq)
        genome_size += chr_len
        reference_chromosomes[chr_name] = chr_len

    # for cumulative plots:
    files_genes_in_contigs = {}   #  "filename" : [ genes in sorted contigs (see below) ]
    files_operons_in_contigs = {}

    # for histograms
    genome_mapped = []
    full_found_genes = []
    full_found_operons = []

    # RESULTS file
    result_fpath = genome_stats_dirpath + '/genome_info.txt'
    # The context manager closes the file on every exit path, including exceptions
    # and the early return below.
    with open(result_fpath, 'w') as res_file:
        genes_container = FeatureContainer(genes_fpaths, 'gene')
        operons_container = FeatureContainer(operons_fpaths, 'operon')
        for container in [genes_container, operons_container]:
            if not container.fpaths:
                logger.notice('No file with ' + container.kind + 's provided. '
                              'Use the -' + container.kind[0].capitalize() + ' option '
                              'if you want to specify it.', indent='  ')
                continue

            for fpath in container.fpaths:
                container.region_list += genes_parser.get_genes_from_file(fpath, container.kind)

            if len(container.region_list) == 0:
                logger.warning('No ' + container.kind + 's were loaded.', indent='  ')
                res_file.write(container.kind + 's loaded: ' + 'None' + '\n')
            else:
                logger.info('  Loaded ' + str(len(container.region_list)) + ' ' + container.kind + 's')
                res_file.write(container.kind + 's loaded: ' + str(len(container.region_list)) + '\n')
                container.chr_names_dict = chromosomes_names_dict(container.kind, container.region_list, list(reference_chromosomes.keys()))

        for contigs_fpath in aligned_contigs_fpaths:
            report = reporting.get(contigs_fpath)
            if genes_container.fpaths:
                report.add_field(reporting.Fields.REF_GENES, len(genes_container.region_list))
            if operons_container.fpaths:
                report.add_field(reporting.Fields.REF_OPERONS, len(operons_container.region_list))

        # process all contig files
        num_nf_errors = logger._num_nf_errors
        n_jobs = min(len(aligned_contigs_fpaths), qconfig.max_threads)
        if is_python2():
            from joblib import Parallel, delayed
        else:
            from joblib3 import Parallel, delayed
        raw_results = Parallel(n_jobs=n_jobs)(delayed(process_single_file)(
            contigs_fpath, index, nucmer_path_dirpath, genome_stats_dirpath,
            reference_chromosomes, genes_container, operons_container)
            for index, contigs_fpath in enumerate(aligned_contigs_fpaths))
        num_nf_errors += len([res for res in raw_results if res is None])
        logger._num_nf_errors = num_nf_errors

        # Pair every result with its assembly BEFORE dropping failed runs. Filtering first
        # and zipping with the full path list afterwards would attribute results to the
        # wrong assemblies as soon as one run fails.
        succeeded = [(contigs_fpath, res)
                     for contigs_fpath, res in zip(aligned_contigs_fpaths, raw_results) if res]
        if not succeeded:
            logger.main_info('Genome analyzer failed for all the assemblies.')
            return

        processed_fpaths = [contigs_fpath for contigs_fpath, _ in succeeded]
        ref_lengths = [res[0] for _, res in succeeded]
        results_genes_operons_tuples = [res[1] for _, res in succeeded]
        # ref_lengths_by_contigs is not defined in this function; it is expected to be a
        # module-level dict that is updated in place here.
        for ref in reference_chromosomes:
            ref_lengths_by_contigs[ref] = [lengths[ref] for lengths in ref_lengths]
        res_file.write('reference chromosomes:\n')
        for chr_name, chr_len in reference_chromosomes.items():
            aligned_len = max(ref_lengths_by_contigs[chr_name])
            res_file.write('\t' + chr_name + ' (total length: ' + str(chr_len) + ' bp, maximal covered length: ' + str(aligned_len) + ' bp)\n')
        res_file.write('\n')
        res_file.write('total genome size: ' + str(genome_size) + '\n\n')
        res_file.write('gap min size: ' + str(qconfig.min_gap_size) + '\n')
        res_file.write('partial gene/operon min size: ' + str(qconfig.min_gene_overlap) + '\n\n')
        # header
        res_file.write('\n\n')
        res_file.write('%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n'
            % ('assembly', 'genome', 'duplication', 'gaps', 'genes', 'partial', 'operons', 'partial'))
        res_file.write('%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n'
            % ('', 'fraction', 'ratio', 'number', '', 'genes', '', 'operons'))
        res_file.write('================================================================================================================\n')

        for contigs_fpath, (results, genes_in_contigs, operons_in_contigs) in zip(processed_fpaths, results_genes_operons_tuples):
            assembly_name = qutils.name_from_fpath(contigs_fpath)

            files_genes_in_contigs[contigs_fpath] = genes_in_contigs
            files_operons_in_contigs[contigs_fpath] = operons_in_contigs
            full_found_genes.append(sum(genes_in_contigs))
            full_found_operons.append(sum(operons_in_contigs))

            covered_bp = results["covered_bp"]
            gaps_count = results["gaps_count"]
            genes_full = results[reporting.Fields.GENES + "_full"]
            genes_part = results[reporting.Fields.GENES + "_partial"]
            operons_full = results[reporting.Fields.OPERONS + "_full"]
            operons_part = results[reporting.Fields.OPERONS + "_partial"]

            report = reporting.get(contigs_fpath)
            genome_fraction = float(covered_bp) * 100 / float(genome_size)
            # (total length + overlap/ambiguity extras - unaligned bases) / covered reference bases
            duplication_ratio = (report.get_field(reporting.Fields.TOTALLEN) +
                                 report.get_field(reporting.Fields.MISINTERNALOVERLAP) +
                                 report.get_field(reporting.Fields.AMBIGUOUSEXTRABASES) -
                                 report.get_field(reporting.Fields.UNALIGNEDBASES)) /\
                                ((genome_fraction / 100.0) * float(genome_size))

            res_file.write('%-25s| %-10s| %-12s| %-10s|'
            % (assembly_name[:24], '%3.5f%%' % genome_fraction, '%1.5f' % duplication_ratio, gaps_count))

            report.add_field(reporting.Fields.MAPPEDGENOME, '%.3f' % genome_fraction)
            report.add_field(reporting.Fields.DUPLICATION_RATIO, '%.3f' % duplication_ratio)
            genome_mapped.append(genome_fraction)

            for (field, full, part) in [(reporting.Fields.GENES, genes_full, genes_part),
                (reporting.Fields.OPERONS, operons_full, operons_part)]:
                if full is None and part is None:
                    res_file.write(' %-10s| %-10s|' % ('-', '-'))
                else:
                    res_file.write(' %-10s| %-10s|' % (full, part))
                    report.add_field(field, '%s + %s part' % (full, part))
            res_file.write('\n')

    if genes_container.region_list:
        ref_genes_num = len(genes_container.region_list)
    else:
        ref_genes_num = None

    if operons_container.region_list:
        ref_operons_num = len(operons_container.region_list)
    else:
        ref_operons_num = None

    # Everything below receives processed_fpaths: the value lists and dicts built above
    # only cover assemblies that were analyzed successfully (identical to
    # aligned_contigs_fpaths when nothing failed).

    # saving json
    if json_output_dirpath:
        if genes_container.region_list:
            json_saver.save_features_in_contigs(json_output_dirpath, processed_fpaths, 'genes', files_genes_in_contigs, ref_genes_num)
        if operons_container.region_list:
            json_saver.save_features_in_contigs(json_output_dirpath, processed_fpaths, 'operons', files_operons_in_contigs, ref_operons_num)

    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        if genes_container.region_list:
            html_saver.save_features_in_contigs(output_dirpath, processed_fpaths, 'genes', files_genes_in_contigs, ref_genes_num)
        if operons_container.region_list:
            html_saver.save_features_in_contigs(output_dirpath, processed_fpaths, 'operons', files_operons_in_contigs, ref_operons_num)

    if qconfig.draw_plots:
        # cumulative plots:
        from . import plotter
        if genes_container.region_list:
            plotter.genes_operons_plot(len(genes_container.region_list), processed_fpaths, files_genes_in_contigs,
                genome_stats_dirpath + '/genes_cumulative_plot', 'genes')
            plotter.histogram(processed_fpaths, full_found_genes, genome_stats_dirpath + '/complete_genes_histogram',
                '# complete genes')
        if operons_container.region_list:
            plotter.genes_operons_plot(len(operons_container.region_list), processed_fpaths, files_operons_in_contigs,
                genome_stats_dirpath + '/operons_cumulative_plot', 'operons')
            plotter.histogram(processed_fpaths, full_found_operons, genome_stats_dirpath + '/complete_operons_histogram',
                '# complete operons')
        plotter.histogram(processed_fpaths, genome_mapped, genome_stats_dirpath + '/genome_fraction_histogram',
            'Genome fraction, %', top_value=100)

    logger.main_info('Done.')
    return [genes_container, operons_container]
Пример #52
0
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
       aligned_lengths_lists, aligned_stats_dirpath):
    """Compute NA/NGA and LA/LGA statistics for every aligned assembly.

    For each assembly the largest alignment, total aligned length and the
    NA50/NA75/LA50/LA75 values are stored in its ``reporting`` entry. The
    NGA/LGA counterparts (computed against the total reference length) are
    only calculated and stored when ``qconfig.is_combined_ref`` is false.
    Assembly lengths are also handed to the JSON/HTML savers and the
    cumulative / NAx / NGAx plots are requested from ``plotter``.

    Args:
        ref_fpath: reference FASTA path; its sequence lengths are summed to
            obtain the reference length.
        aligned_contigs_fpaths: FASTA paths of the assemblies.
        output_dirpath: directory passed to the HTML saver and ``Nx_plot``.
        json_output_dirpath: directory for JSON output; falsy to skip the
            JSON saver (still forwarded to ``Nx_plot`` as ``json_output_dir``).
        aligned_lengths_lists: one list of aligned lengths per assembly, in
            the same order as ``aligned_contigs_fpaths``.
        aligned_stats_dirpath: directory for the plots; created if missing.

    Returns:
        A dict with a ``'header'`` key plus one key per assembly name, each
        mapped to an empty list (nothing in this function fills them).

    Raises:
        ValueError: if ``aligned_lengths_lists`` or any list in it is empty
            (``max`` of an empty sequence).
    """
    if not os.path.isdir(aligned_stats_dirpath):
        os.mkdir(aligned_stats_dirpath)

    ########################################################################
    report_dict = {'header': []}
    for contigs_fpath in aligned_contigs_fpaths:
        report_dict[qutils.name_from_fpath(contigs_fpath)] = []

    ########################################################################
    logger.print_timestamp()
    logger.main_info('Running NA-NGA calculation...')

    ref_chr_lengths = fastaparser.get_chr_lengths_from_fastafile(ref_fpath)
    reference_length = sum(ref_chr_lengths.values())
    assembly_lengths = [sum(fastaparser.get_chr_lengths_from_fastafile(contigs_fpath).values())
                        for contigs_fpath in aligned_contigs_fpaths]

    import N50
    # Built-in zip works on both Python 2 and 3; itertools.izip was removed
    # in Python 3 and raised AttributeError there.
    for i, (contigs_fpath, lens, assembly_len) in enumerate(
            zip(aligned_contigs_fpaths, aligned_lengths_lists, assembly_lengths)):
        # The "A" metrics use the NG50/LG50 helpers with the assembly length
        # as the target length.
        na50 = N50.NG50(lens, assembly_len)
        na75 = N50.NG50(lens, assembly_len, 75)
        la50 = N50.LG50(lens, assembly_len)
        la75 = N50.LG50(lens, assembly_len, 75)
        if not qconfig.is_combined_ref:
            nga50 = N50.NG50(lens, reference_length)
            nga75 = N50.NG50(lens, reference_length, 75)
            lga50 = N50.LG50(lens, reference_length)
            lga75 = N50.LG50(lens, reference_length, 75)

        # nga50/lga50 are unbound for a combined reference; the conditional
        # expressions below short-circuit before touching them in that case.
        logger.info('  ' +
                    qutils.index_to_str(i) +
                    qutils.label_from_fpath(contigs_fpath) +
                    ', Largest alignment = ' + str(max(lens)) +
                    ', NA50 = ' + str(na50) +
                    (', NGA50 = ' + str(nga50) if not qconfig.is_combined_ref and nga50 else '') +
                    ', LA50 = ' + str(la50) +
                    (', LGA50 = ' + str(lga50) if not qconfig.is_combined_ref and lga50 else ''))
        report = reporting.get(contigs_fpath)
        report.add_field(reporting.Fields.LARGALIGN, max(lens))
        report.add_field(reporting.Fields.TOTAL_ALIGNED_LEN, sum(lens))
        report.add_field(reporting.Fields.NA50, na50)
        report.add_field(reporting.Fields.NA75, na75)
        report.add_field(reporting.Fields.LA50, la50)
        report.add_field(reporting.Fields.LA75, la75)
        if not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NGA50, nga50)
            report.add_field(reporting.Fields.NGA75, nga75)
            report.add_field(reporting.Fields.LGA50, lga50)
            report.add_field(reporting.Fields.LGA75, lga75)

    ########################################################################
    # Largest number of aligned lengths in any assembly; compared against
    # qconfig.max_points when calling Nx_plot below.
    num_contigs = max(len(lens) for lens in aligned_lengths_lists)

    if json_output_dirpath:
        from quast_libs.html_saver import json_saver
        json_saver.save_assembly_lengths(json_output_dirpath, aligned_contigs_fpaths, assembly_lengths)

    # saving to html
    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        html_saver.save_assembly_lengths(output_dirpath, aligned_contigs_fpaths, assembly_lengths)

    import plotter
    if qconfig.draw_plots:
        # Drawing cumulative plot (aligned contigs)...
        plotter.cumulative_plot(ref_fpath, aligned_contigs_fpaths, aligned_lengths_lists,
                                os.path.join(aligned_stats_dirpath, 'cumulative_plot'),
                                'Cumulative length (aligned contigs)')

    # Drawing NAx and NGAx plots...
    # NOTE(review): Nx_plot is called regardless of qconfig.draw_plots and is
    # given json_output_dir -- presumably it also emits the JSON data itself;
    # confirm in plotter before guarding these calls.
    plotter.Nx_plot(output_dirpath, num_contigs > qconfig.max_points, aligned_contigs_fpaths, aligned_lengths_lists,
                    aligned_stats_dirpath + '/NAx_plot', 'NAx',
                    assembly_lengths, json_output_dir=json_output_dirpath)
    if not qconfig.is_combined_ref:
        plotter.Nx_plot(output_dirpath, num_contigs > qconfig.max_points, aligned_contigs_fpaths, aligned_lengths_lists,
                        aligned_stats_dirpath + '/NGAx_plot', 'NGAx',
                        [reference_length] * len(aligned_contigs_fpaths), json_output_dir=json_output_dirpath)

    logger.main_info('Done.')
    return report_dict
Пример #53
0
def align_and_analyze(is_cyclic, index, contigs_fpath, output_dirpath, ref_fpath,
                      old_contigs_fpath, bed_fpath, parallel_by_chr=False, threads=1):
    """Align one assembly to the reference and analyze the alignments.

    Runs ``align_contigs``; if it does not report ``NucmerStatus.OK`` the
    failure is logged, temporary files are cleaned and the function returns
    early. Otherwise the coords file is parsed into ``Mapping`` objects,
    reference lengths and (when ``qconfig.show_snps`` is set) SNPs are
    loaded, and ``analyze_contigs`` / ``analyze_coverage`` / ``print_results``
    produce the result. Misassembled contigs are written to a separate FASTA
    unless ``qconfig.space_efficient`` is set, and for a combined reference
    an alignments TSV and a unique-contigs file are written.

    Args:
        is_cyclic: forwarded to ``analyze_contigs``.
        index: assembly index, used for log prefixes and forwarded to
            ``align_contigs``.
        contigs_fpath: FASTA path of the assembly.
        output_dirpath: directory for logs and report files.
        ref_fpath: FASTA path of the reference.
        old_contigs_fpath: forwarded to ``align_contigs``.
        bed_fpath: accepted for interface compatibility; not used here.
        parallel_by_chr: forwarded to ``align_contigs``.
        threads: forwarded to ``align_contigs``.

    Returns:
        A 5-tuple ``(status, result, aligned_lengths,
        misassemblies_in_contigs, aligned_lengths_by_contigs)``. When the
        alignment step fails the tuple is ``(nucmer_status, {}, [], [], [])``.
        After a successful run the status is ``NucmerStatus.NOT_ALIGNED`` if
        ``analyze_contigs`` returned no reference alignments, else
        ``NucmerStatus.OK``.
    """
    nucmer_output_dirpath = create_nucmer_output_dir(output_dirpath)
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    nucmer_fpath = join(nucmer_output_dirpath, corr_assembly_label)

    logger.info('  ' + qutils.index_to_str(index) + assembly_label)

    if not qconfig.space_efficient:
        contig_report_fname = qconfig.contig_report_fname_pattern % corr_assembly_label
        log_out_fpath = join(output_dirpath, contig_report_fname + '.stdout')
        log_err_fpath = join(output_dirpath, contig_report_fname + '.stderr')
        icarus_out_fpath = join(output_dirpath, qconfig.icarus_report_fname_pattern % corr_assembly_label)
        misassembly_fpath = join(output_dirpath, contig_report_fname + '.mis_contigs.info')
        unaligned_info_fpath = join(output_dirpath, contig_report_fname + '.unaligned.info')
    else:
        # Space-efficient mode discards all auxiliary output.
        log_out_fpath = '/dev/null'
        log_err_fpath = '/dev/null'
        icarus_out_fpath = '/dev/null'
        misassembly_fpath = '/dev/null'
        unaligned_info_fpath = '/dev/null'

    icarus_out_f = open(icarus_out_fpath, 'w')
    icarus_header_cols = ['S1', 'E1', 'S2', 'E2', 'Reference', 'Contig', 'IDY', 'Ambiguous', 'Best_group']
    icarus_out_f.write('\t'.join(icarus_header_cols) + '\n')
    misassembly_f = open(misassembly_fpath, 'w')

    if not qconfig.space_efficient:
        logger.info('  ' + qutils.index_to_str(index) + 'Logging to files ' + log_out_fpath +
                    ' and ' + os.path.basename(log_err_fpath) + '...')
    else:
        logger.info('  ' + qutils.index_to_str(index) + 'Logging is disabled.')

    coords_fpath, coords_filtered_fpath, unaligned_fpath, show_snps_fpath, used_snps_fpath = \
        get_nucmer_aux_out_fpaths(nucmer_fpath)

    nucmer_status = align_contigs(nucmer_fpath, ref_fpath, contigs_fpath, old_contigs_fpath, index,
                                  parallel_by_chr, threads, log_out_fpath, log_err_fpath)
    if nucmer_status != NucmerStatus.OK:
        # These handles are only handed to CAOutput (and closed through it)
        # on the success path, so release them explicitly here.
        icarus_out_f.close()
        misassembly_f.close()
        with open(log_err_fpath, 'a') as log_err_f:
            if nucmer_status == NucmerStatus.ERROR:
                logger.error('  ' + qutils.index_to_str(index) +
                             'Failed aligning contigs ' + qutils.label_from_fpath(contigs_fpath) +
                             ' to the reference (non-zero exit code). ' +
                             ('Run with the --debug flag to see additional information.' if not qconfig.debug else ''))
            elif nucmer_status == NucmerStatus.FAILED:
                # A space was missing before "doesn't", gluing it to the path.
                log_err_f.write(qutils.index_to_str(index) + 'Alignment failed for ' + contigs_fpath + ':' +
                                coords_fpath + ' doesn\'t exist.\n')
                logger.info('  ' + qutils.index_to_str(index) + 'Alignment failed for ' + '\'' + assembly_label + '\'.')
            elif nucmer_status == NucmerStatus.NOT_ALIGNED:
                log_err_f.write(qutils.index_to_str(index) + 'Nothing aligned for ' + contigs_fpath + '\n')
                logger.info('  ' + qutils.index_to_str(index) + 'Nothing aligned for ' + '\'' + assembly_label + '\'.')
        clean_tmp_files(nucmer_fpath)
        return nucmer_status, {}, [], [], []

    log_out_f = open(log_out_fpath, 'a')
    # Loading the alignment files
    log_out_f.write('Parsing coords...\n')
    aligns = {}
    coords_filtered_file = open(coords_filtered_fpath, 'w')
    with open(coords_fpath) as coords_file:
        # The first two lines of the coords file are copied verbatim into
        # the filtered file.
        coords_filtered_file.write(coords_file.readline())
        coords_filtered_file.write(coords_file.readline())
        for line in coords_file:
            if line.strip() == '':
                break
            assert line[0] != '='
            # Group the parsed alignments by contig name.
            mapping = Mapping.from_line(line)
            aligns.setdefault(mapping.contig, []).append(mapping)

    # Loading the reference sequences
    log_out_f.write('Loading reference...\n') # TODO: move up
    ref_lens = {}
    ref_features = {}
    for name, seq in fastaparser.read_fasta(ref_fpath):
        name = name.split()[0]  # no spaces in reference header
        ref_lens[name] = len(seq)
        log_out_f.write('\tLoaded [%s]\n' % name)

    # Loading the SNP calls
    used_snps_file = None
    snps = {}
    if qconfig.show_snps:
        log_out_f.write('Loading SNPs...\n')
        prev_fields = None
        show_snps_file = open_gzipsafe(show_snps_fpath)
        try:
            for line in show_snps_file:
                fields = line.split()
                # Skip blank lines and anything not starting with a position.
                if not fields or not fields[0].isdigit():
                    continue
                # Skip a record identical to the previous one.
                if prev_fields and fields == prev_fields:
                    continue
                ref = fields[10]
                ctg = fields[11]
                pos = int(fields[0])
                loc = int(fields[3])

                # snps[ref][ctg][pos] -> list of SNP records at that position
                snps.setdefault(ref, {}).setdefault(ctg, {}).setdefault(pos, []).append(
                    SNP(ref_pos=pos, ctg_pos=loc, ref_nucl=fields[1], ctg_nucl=fields[2]))
                prev_fields = fields
        finally:
            show_snps_file.close()
        used_snps_file = open_gzipsafe(used_snps_fpath, 'w')

    # Loading the regions (if any)
    regions = {}
    total_reg_len = 0
    total_regions = 0
    # # TODO: gff
    # log_out_f.write('Loading regions...\n')
    # log_out_f.write('\tNo regions given, using whole reference.\n')
    # Every reference sequence contributes one region spanning it entirely.
    for name, seq_len in ref_lens.items():
        regions.setdefault(name, []).append([1, seq_len])
        total_regions += 1
        total_reg_len += seq_len
    log_out_f.write('\tTotal Regions: %d\n' % total_regions)
    log_out_f.write('\tTotal Region Length: %d\n' % total_reg_len)

    ca_output = CAOutput(stdout_f=log_out_f, misassembly_f=misassembly_f, coords_filtered_f=coords_filtered_file,
                         used_snps_f=used_snps_file, icarus_out_f=icarus_out_f)

    log_out_f.write('Analyzing contigs...\n')
    result, ref_aligns, total_indels_info, aligned_lengths, misassembled_contigs, misassemblies_in_contigs, aligned_lengths_by_contigs =\
        analyze_contigs(ca_output, contigs_fpath, unaligned_fpath, unaligned_info_fpath, aligns, ref_features, ref_lens, is_cyclic)

    # if qconfig.large_genome:
    #     log_out_f.write('Analyzing large blocks...\n')
    #     large_misassembly_fpath = add_suffix(misassembly_fpath, 'large_blocks') if not qconfig.space_efficient else '/dev/null'
    #     ca_large_output = CAOutput(stdout_f=log_out_f, misassembly_f=open(large_misassembly_fpath, 'w'),
    #                                coords_filtered_f=coords_filtered_file, used_snps_f=open('/dev/null', 'w'), icarus_out_f=open('/dev/null', 'w'))
    #     min_alignment, extensive_mis_threshold = qconfig.min_alignment, qconfig.extensive_misassembly_threshold
    #     qconfig.min_alignment, qconfig.extensive_misassembly_threshold = qconfig.LARGE_MIN_ALIGNMENT, qconfig.LARGE_EXTENSIVE_MIS_THRESHOLD
    #     result.update(analyze_contigs(ca_large_output, contigs_fpath, '/dev/null', '/dev/null',
    #                                   aligns, ref_features, ref_lens, is_cyclic, large_misassemblies_search=True)[0])
    #     qconfig.min_alignment, qconfig.extensive_misassembly_threshold = min_alignment, extensive_mis_threshold

    log_out_f.write('Analyzing coverage...\n')
    if qconfig.show_snps:
        log_out_f.write('Writing SNPs into ' + used_snps_fpath + '\n')
    result.update(analyze_coverage(ca_output, regions, ref_aligns, ref_features, snps, total_indels_info))
    result = print_results(contigs_fpath, log_out_f, used_snps_fpath, total_indels_info, result)

    if not qconfig.space_efficient:
        ## outputting misassembled contigs to separate file
        # Build the name set once instead of calling .keys() per record.
        misassembled_names = set(misassembled_contigs.keys())
        fasta = [(name, seq) for name, seq in fastaparser.read_fasta(contigs_fpath)
                 if name in misassembled_names]
        fastaparser.write_fasta(join(output_dirpath, qutils.name_from_fpath(contigs_fpath) + '.mis_contigs.fa'), fasta)

    if qconfig.is_combined_ref:
        alignment_tsv_fpath = join(output_dirpath, "alignments_" + corr_assembly_label + '.tsv')
        unique_contigs_fpath = join(output_dirpath, qconfig.unique_contigs_fname_pattern % corr_assembly_label)
        logger.debug('  ' + qutils.index_to_str(index) + 'Alignments: ' + qutils.relpath(alignment_tsv_fpath))
        used_contigs = set()
        # Compiled once; extracts the length and coverage embedded in contig
        # names of the form "..._length_<len>_cov_<cov>...".
        len_cov_pattern = re.compile(r'_length_([\d\.]+)_cov_([\d\.]+)')
        with open(unique_contigs_fpath, 'w') as unique_contigs_f:
            with open(alignment_tsv_fpath, 'w') as alignment_tsv_f:
                for chr_name, chr_aligns in ref_aligns.items():
                    # One TSV row per chromosome: its name, then each contig
                    # that has an alignment on it.
                    alignment_tsv_f.write(chr_name)
                    for contig in set([align.contig for align in chr_aligns]):
                        alignment_tsv_f.write('\t' + contig)

                    ref_name = ref_labels_by_chromosomes[chr_name]
                    align_by_contigs = defaultdict(int)
                    for align in chr_aligns:
                        align_by_contigs[align.contig] += align.len2
                    for contig, aligned_len in align_by_contigs.items():
                        # A contig is considered only for the first
                        # chromosome it is seen on.
                        if contig in used_contigs:
                            continue
                        used_contigs.add(contig)
                        len_cov_match = len_cov_pattern.search(contig)
                        if len_cov_match:
                            contig_len, contig_cov = len_cov_match.groups()
                            # Report contigs aligned over more than 90% of
                            # the length stated in their name.
                            if aligned_len / float(contig_len) > 0.9:
                                unique_contigs_f.write(ref_name + '\t' + str(aligned_len) + '\t' + contig_cov + '\n')
                    alignment_tsv_f.write('\n')

    close_handlers(ca_output)
    logger.info('  ' + qutils.index_to_str(index) + 'Analysis is finished.')
    logger.debug('')
    clean_tmp_files(nucmer_fpath)
    if not qconfig.no_gzip:
        compress_nucmer_output(logger, nucmer_fpath)
    status = NucmerStatus.OK if ref_aligns else NucmerStatus.NOT_ALIGNED
    return status, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
Пример #54
0
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, features_dict, operons_fpaths,
       detailed_contigs_reports_dirpath, genome_stats_dirpath):
    """Run the genome analyzer and write ``genome_info.txt``.

    Loads genomic features and operons into ``FeatureContainer`` objects,
    processes every assembly through ``process_single_file`` (via
    ``run_parallel``), writes a per-assembly summary table to
    ``<genome_stats_dirpath>/genome_info.txt``, stores gene/operon counts in
    the ``reporting`` entries and, depending on ``qconfig``, saves HTML data
    and draws plots.

    Side effects: fills ``ref_lengths_by_contigs``, which is not defined in
    this function (presumably a module-level dict -- confirm), and updates
    ``logger._num_nf_errors``.

    Args:
        ref_fpath: FASTA path of the reference.
        aligned_contigs_fpaths: FASTA paths of the assemblies.
        output_dirpath: directory passed to the HTML saver and ``frc_plot``.
        features_dict: mapping of feature kind to the file describing it.
        operons_fpaths: operon file paths; falsy if none were provided.
        detailed_contigs_reports_dirpath: parent of the aligner output
            directory that ``process_single_file`` reads from.
        genome_stats_dirpath: output directory; created if missing.

    Returns:
        The list of ``FeatureContainer`` objects, or ``None`` when
        ``process_single_file`` produced no result for any assembly.
    """
    coords_dirpath = os.path.join(detailed_contigs_reports_dirpath, qconfig.minimap_output_dirname)
    from quast_libs import search_references_meta
    if search_references_meta.is_quast_first_run:
        coords_dirpath = os.path.join(coords_dirpath, 'raw')

    logger.print_timestamp()
    logger.main_info('Running Genome analyzer...')

    if not os.path.isdir(genome_stats_dirpath):
        os.mkdir(genome_stats_dirpath)

    genome_size, reference_chromosomes, ns_by_chromosomes = fastaparser.get_genome_stats(ref_fpath)

    # reading genome size
    # genome_size = fastaparser.get_lengths_from_fastafile(reference)[0]
    # reading reference name
    # >gi|48994873|gb|U00096.2| Escherichia coli str. K-12 substr. MG1655, complete genome
    # ref_file = open(reference, 'r')
    # reference_name = ref_file.readline().split()[0][1:]
    # ref_file.close()

    # for cumulative plots: assembly path -> per-assembly values returned by
    # process_single_file
    files_features_in_contigs = {}
    files_unsorted_features_in_contigs = {}
    files_operons_in_contigs = {}
    files_unsorted_operons_in_contigs = {}

    # for histograms
    genome_mapped = []
    full_found_genes = []
    full_found_operons = []

    # RESULTS file; the context manager closes it on every exit path,
    # including the early return and any exception raised while analyzing.
    result_fpath = os.path.join(genome_stats_dirpath, 'genome_info.txt')
    with open(result_fpath, 'w') as res_file:
        containers = []
        for feature, feature_fpath in features_dict.items():
            containers.append(FeatureContainer([feature_fpath], feature))
        if not features_dict:
            logger.notice('No file with genomic features were provided. '
                          'Use the --features option if you want to specify it.\n', indent='  ')
        if operons_fpaths:
            containers.append(FeatureContainer(operons_fpaths, 'operon'))
        else:
            logger.notice('No file with operons were provided. '
                          'Use the -O option if you want to specify it.', indent='  ')
        for container in containers:
            if not container.fpaths:
                continue

            for fpath in container.fpaths:
                container.region_list += genes_parser.get_genes_from_file(fpath, container.kind)

            if len(container.region_list) == 0:
                logger.warning('No genomic features of type "' + container.kind + '" were loaded.', indent='  ')
                res_file.write('Genomic features of type "' + container.kind + '" loaded: ' + 'None' + '\n')
            else:
                logger.info('  Loaded ' + str(len(container.region_list)) + ' genomic features of type "' + container.kind + '"')
                res_file.write('Genomic features of type "' + container.kind + '" loaded: ' + str(len(container.region_list)) + '\n')
                container.chr_names_dict = chromosomes_names_dict(container.kind, container.region_list, list(reference_chromosomes.keys()))

        # Reference totals: operons are counted separately, every other
        # container kind is summed into the "genes" total.
        ref_genes_num, ref_operons_num = None, None
        for contigs_fpath in aligned_contigs_fpaths:
            report = reporting.get(contigs_fpath)
            genomic_features = 0
            for container in containers:
                if container.kind == 'operon':
                    ref_operons_num = len(container.region_list)
                    report.add_field(reporting.Fields.REF_OPERONS, len(container.region_list))
                else:
                    genomic_features += len(container.region_list)
            if genomic_features:
                ref_genes_num = genomic_features
                report.add_field(reporting.Fields.REF_GENES, genomic_features)

        # process all contig files
        num_nf_errors = logger._num_nf_errors
        n_jobs = min(len(aligned_contigs_fpaths), qconfig.max_threads)

        parallel_run_args = [(contigs_fpath, index, coords_dirpath, genome_stats_dirpath,
                              reference_chromosomes, ns_by_chromosomes, containers)
                             for index, contigs_fpath in enumerate(aligned_contigs_fpaths)]
        ref_lengths, results_genes_operons_tuples = run_parallel(process_single_file, parallel_run_args, n_jobs, filter_results=True)
        # Every assembly without a result counts as one more error.
        num_nf_errors += len(aligned_contigs_fpaths) - len(ref_lengths)
        logger._num_nf_errors = num_nf_errors
        if not ref_lengths:
            logger.main_info('Genome analyzer failed for all the assemblies.')
            return

        for ref in reference_chromosomes:
            ref_lengths_by_contigs[ref] = [lengths[ref] for lengths in ref_lengths]
        res_file.write('reference chromosomes:\n')
        for chr_name, chr_len in reference_chromosomes.items():
            aligned_len = max(ref_lengths_by_contigs[chr_name])
            res_file.write('\t' + chr_name + ' (total length: ' + str(chr_len) + ' bp, ' +
                           'total length without N\'s: ' + str(chr_len - len(ns_by_chromosomes[chr_name])) +
                           ' bp, maximal covered length: ' + str(aligned_len) + ' bp)\n')
        res_file.write('\n')
        res_file.write('total genome size: ' + str(genome_size) + '\n\n')
        res_file.write('gap min size: ' + str(qconfig.min_gap_size) + '\n')
        res_file.write('partial gene/operon min size: ' + str(qconfig.min_gene_overlap) + '\n\n')
        # header
        res_file.write('\n\n')
        res_file.write('%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n'
                       % ('assembly', 'genome', 'duplication', 'gaps', 'genes', 'partial', 'operons', 'partial'))
        res_file.write('%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n'
                       % ('', 'fraction', 'ratio', 'number', '', 'genes', '', 'operons'))
        res_file.write('=' * 120 + '\n')

        # NOTE(review): results are paired with assemblies by position; if
        # filter_results=True drops entries for failed assemblies, later
        # assemblies would be paired with the wrong result -- confirm against
        # run_parallel.
        for contigs_fpath, file_results in zip(aligned_contigs_fpaths, results_genes_operons_tuples):
            (results, unsorted_features_in_contigs, features_in_contigs,
             unsorted_operons_in_contigs, operons_in_contigs) = file_results
            assembly_name = qutils.name_from_fpath(contigs_fpath)

            files_features_in_contigs[contigs_fpath] = features_in_contigs
            files_unsorted_features_in_contigs[contigs_fpath] = unsorted_features_in_contigs
            files_operons_in_contigs[contigs_fpath] = operons_in_contigs
            files_unsorted_operons_in_contigs[contigs_fpath] = unsorted_operons_in_contigs
            full_found_genes.append(sum(features_in_contigs))
            full_found_operons.append(sum(operons_in_contigs))

            gaps_count = results["gaps_count"]
            genes_full = results[reporting.Fields.GENES + "_full"]
            genes_part = results[reporting.Fields.GENES + "_partial"]
            operons_full = results[reporting.Fields.OPERONS + "_full"]
            operons_part = results[reporting.Fields.OPERONS + "_partial"]

            report = reporting.get(contigs_fpath)

            # The assembly name is truncated so it fits the 25-char column.
            res_file.write('%-25s| %-10s| %-12s| %-10s|'
                           % (assembly_name[:24], report.get_field(reporting.Fields.MAPPEDGENOME),
                              report.get_field(reporting.Fields.DUPLICATION_RATIO), gaps_count))

            genome_mapped.append(float(report.get_field(reporting.Fields.MAPPEDGENOME)))

            for (field, full, part) in [(reporting.Fields.GENES, genes_full, genes_part),
                                        (reporting.Fields.OPERONS, operons_full, operons_part)]:
                if full is None and part is None:
                    # No data of this kind: dashes, and no report field.
                    res_file.write(' %-10s| %-10s|' % ('-', '-'))
                else:
                    res_file.write(' %-10s| %-10s|' % (full, part))
                    report.add_field(field, '%s + %s part' % (full, part))
            res_file.write('\n')

    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        if ref_genes_num:
            html_saver.save_features_in_contigs(output_dirpath, aligned_contigs_fpaths, 'features', files_features_in_contigs, ref_genes_num)
        if ref_operons_num:
            html_saver.save_features_in_contigs(output_dirpath, aligned_contigs_fpaths, 'operons', files_operons_in_contigs, ref_operons_num)

    if qconfig.draw_plots:
        # cumulative plots:
        from . import plotter
        from quast_libs.ca_utils.misc import contigs_aligned_lengths
        if ref_genes_num:
            plotter.genes_operons_plot(ref_genes_num, aligned_contigs_fpaths, files_features_in_contigs,
                                       genome_stats_dirpath + '/features_cumulative_plot', 'genomic features')
            plotter.frc_plot(output_dirpath, ref_fpath, aligned_contigs_fpaths, contigs_aligned_lengths, files_unsorted_features_in_contigs,
                             genome_stats_dirpath + '/features_frcurve_plot', 'genomic features')
            plotter.histogram(aligned_contigs_fpaths, full_found_genes, genome_stats_dirpath + '/complete_features_histogram',
                              '# complete genomic features')
        if ref_operons_num:
            plotter.genes_operons_plot(ref_operons_num, aligned_contigs_fpaths, files_operons_in_contigs,
                                       genome_stats_dirpath + '/operons_cumulative_plot', 'operons')
            plotter.frc_plot(output_dirpath, ref_fpath, aligned_contigs_fpaths, contigs_aligned_lengths, files_unsorted_operons_in_contigs,
                             genome_stats_dirpath + '/operons_frcurve_plot', 'operons')
            plotter.histogram(aligned_contigs_fpaths, full_found_operons, genome_stats_dirpath + '/complete_operons_histogram',
                              '# complete operons')
        plotter.histogram(aligned_contigs_fpaths, genome_mapped, genome_stats_dirpath + '/genome_fraction_histogram',
                          'Genome fraction, %', top_value=100)

    logger.main_info('Done.')
    return containers
Пример #55
0
def run_processing_reads(main_ref_fpath, meta_ref_fpaths, ref_labels, reads_fpaths, output_dirpath, res_path, log_path,
                         err_path, sam_fpath=None, bam_fpath=None, bed_fpath=None):
    ref_name = qutils.name_from_fpath(main_ref_fpath)

    if not sam_fpath and bam_fpath:
        sam_fpath = get_safe_fpath(output_dirpath, bam_fpath[:-4] + '.sam')
    else:
        sam_fpath = sam_fpath or os.path.join(output_dirpath, ref_name + '.sam')
    bam_fpath = bam_fpath or get_safe_fpath(output_dirpath, sam_fpath[:-4] + '.bam')
    sam_sorted_fpath = get_safe_fpath(output_dirpath, add_suffix(sam_fpath, 'sorted'))
    bam_sorted_fpath = get_safe_fpath(output_dirpath, add_suffix(bam_fpath, 'sorted'))

    bed_fpath = bed_fpath or os.path.join(res_path, ref_name + '.bed')
    cov_fpath = os.path.join(res_path, ref_name + '.cov')
    physical_cov_fpath = os.path.join(res_path, ref_name + '.physical.cov')

    if qconfig.no_sv:
        logger.info('  Will not search Structural Variations (--fast or --no-sv is specified)')
        bed_fpath = None
    elif is_non_empty_file(bed_fpath):
        logger.info('  Using existing BED-file: ' + bed_fpath)
    if qconfig.create_icarus_html:
        if is_non_empty_file(cov_fpath):
            is_correct_file = check_cov_file(cov_fpath)
            if is_correct_file:
                logger.info('  Using existing reads coverage file: ' + cov_fpath)
        if is_non_empty_file(physical_cov_fpath):
            logger.info('  Using existing physical coverage file: ' + physical_cov_fpath)
    else:
        logger.info('  Will not calculate coverage (--no-icarus or --space-efficient is specified)')
        cov_fpath = None
        physical_cov_fpath = None
    if (is_non_empty_file(bed_fpath) or qconfig.no_sv) and \
            (qconfig.space_efficient or (is_non_empty_file(cov_fpath) and is_non_empty_file(physical_cov_fpath))):
        return bed_fpath, cov_fpath, physical_cov_fpath

    logger.info('  ' + 'Pre-processing reads...')
    logger.info('  ' + 'Logging to %s...' % err_path)
    correct_chr_names = None
    if is_non_empty_file(sam_fpath):
        logger.info('  Using existing SAM-file: ' + sam_fpath)
        correct_chr_names = get_correct_names_for_chroms(output_dirpath, main_ref_fpath, sam_fpath, err_path, reads_fpaths)
    elif is_non_empty_file(bam_fpath):
        logger.info('  Using existing BAM-file: ' + bam_fpath)
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads), '-h', bam_fpath],
                               stdout=open(sam_fpath, 'w'), stderr=open(err_path, 'a'), logger=logger)
        correct_chr_names = get_correct_names_for_chroms(output_dirpath, main_ref_fpath, sam_fpath, err_path, reads_fpaths)
    if not correct_chr_names and reads_fpaths:
        logger.info('  Running BWA...')
        # use absolute paths because we will change workdir
        sam_fpath = os.path.abspath(sam_fpath)
        abs_reads_fpaths = []
        for reads_fpath in reads_fpaths:
            abs_reads_fpaths.append(os.path.abspath(reads_fpath))

        if len(abs_reads_fpaths) != 2:
            logger.error('  You should specify files with forward and reverse reads.')
            logger.info('  Failed searching structural variations.')
            return None, None, None

        if not qconfig.no_check:
            if not paired_reads_names_are_equal(reads_fpaths, logger):
                logger.error('  Read names are discordant, skipping reads analysis!')
                logger.info('  Failed searching structural variations.')
                return None, None, None

        prev_dir = os.getcwd()
        os.chdir(output_dirpath)
        cmd = [bwa_fpath('bwa'), 'index', '-p', ref_name, main_ref_fpath]
        if os.path.getsize(main_ref_fpath) > 2 * 1024 ** 3:  # if reference size bigger than 2GB
            cmd += ['-a', 'bwtsw']
        qutils.call_subprocess(cmd, stdout=open(log_path, 'a'), stderr=open(err_path, 'a'), logger=logger)

        cmd = bwa_fpath('bwa') + ' mem -t ' + str(qconfig.max_threads) + ' ' + ref_name + ' ' + abs_reads_fpaths[0] + ' ' + abs_reads_fpaths[1]

        qutils.call_subprocess(shlex.split(cmd), stdout=open(sam_fpath, 'w'), stderr=open(err_path, 'a'), logger=logger)
        logger.info('  Done.')
        os.chdir(prev_dir)
        if not os.path.exists(sam_fpath) or os.path.getsize(sam_fpath) == 0:
            logger.error('  Failed running BWA for the reference. See ' + log_path + ' for information.')
            logger.info('  Failed searching structural variations.')
            return None, None, None
    elif not correct_chr_names:
        logger.info('  Failed searching structural variations.')
        return None, None, None
    logger.info('  Sorting SAM-file...')
    if (is_non_empty_file(sam_sorted_fpath) and all_read_names_correct(sam_sorted_fpath)) and is_non_empty_file(bam_fpath):
        logger.info('  Using existing sorted SAM-file: ' + sam_sorted_fpath)
    else:
        correct_sam_fpath = os.path.join(output_dirpath, ref_name + '.sam.correct')  # write in output dir
        clean_read_names(sam_fpath, correct_sam_fpath)
        bam_fpath = os.path.join(output_dirpath, ref_name + '.bam')
        bam_sorted_fpath = add_suffix(bam_fpath, 'sorted')
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads), '-h', '-f', 'bam',
                                '-F', 'not unmapped',  '-S', correct_sam_fpath],
                                stdout=open(bam_fpath, 'w'), stderr=open(err_path, 'a'), logger=logger)
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'sort', '-t', str(qconfig.max_threads), '-o', bam_sorted_fpath,
                                bam_fpath], stderr=open(err_path, 'a'), logger=logger)
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads), '-h', bam_sorted_fpath],
                                stdout=open(sam_sorted_fpath, 'w'), stderr=open(err_path, 'a'), logger=logger)

    if qconfig.create_icarus_html and (not is_non_empty_file(cov_fpath) or not is_non_empty_file(physical_cov_fpath)):
        cov_fpath, physical_cov_fpath = get_coverage(output_dirpath, main_ref_fpath, ref_name, bam_fpath, bam_sorted_fpath,
                                                     log_path, err_path, cov_fpath, physical_cov_fpath, correct_chr_names)
    if not is_non_empty_file(bed_fpath) and not qconfig.no_sv:
        if meta_ref_fpaths:
            logger.info('  Splitting SAM-file by references...')
        headers = []
        seq_name_length = {}
        with open(sam_fpath) as sam_file:
            for line in sam_file:
                if not line.startswith('@'):
                    break
                if line.startswith('@SQ') and 'SN:' in line and 'LN:' in line:
                    seq_name = line.split('\tSN:')[1].split('\t')[0]
                    seq_length = int(line.split('\tLN:')[1].split('\t')[0])
                    seq_name_length[seq_name] = seq_length
                headers.append(line.strip())
        need_ref_splitting = False
        if meta_ref_fpaths:
            ref_files = {}
            for cur_ref_fpath in meta_ref_fpaths:
                ref = qutils.name_from_fpath(cur_ref_fpath)
                new_ref_sam_fpath = os.path.join(output_dirpath, ref + '.sam')
                if is_non_empty_file(new_ref_sam_fpath):
                    logger.info('    Using existing split SAM-file for %s: %s' % (ref, new_ref_sam_fpath))
                    ref_files[ref] = None
                else:
                    new_ref_sam_file = open(new_ref_sam_fpath, 'w')
                    if not headers[0].startswith('@SQ'):
                        new_ref_sam_file.write(headers[0] + '\n')
                    chrs = []
                    for h in (h for h in headers if h.startswith('@SQ') and 'SN:' in h):
                        seq_name = h.split('\tSN:')[1].split('\t')[0]
                        if seq_name in ref_labels and ref_labels[seq_name] == ref:
                            new_ref_sam_file.write(h + '\n')
                            chrs.append(seq_name)
                    new_ref_sam_file.write(headers[-1] + '\n')
                    ref_files[ref] = new_ref_sam_file
                    need_ref_splitting = True
        deletions = []
        trivial_deletions_fpath = os.path.join(output_dirpath, qconfig.trivial_deletions_fname)
        logger.info('  Looking for trivial deletions (long zero-covered fragments)...')
        need_trivial_deletions = True
        if os.path.exists(trivial_deletions_fpath):
            need_trivial_deletions = False
            logger.info('    Using existing file: ' + trivial_deletions_fpath)

        if need_trivial_deletions or need_ref_splitting:
            with open(sam_sorted_fpath) as sam_file:
                cur_deletion = None
                for line in sam_file:
                    mapping = Mapping.parse(line)
                    if mapping:
                        if mapping.ref == '*':
                            continue
                        # common case: continue current deletion (potential) on the same reference
                        if cur_deletion and cur_deletion.ref == mapping.ref:
                            if cur_deletion.next_bad is None:  # previous mapping was in region BEFORE 0-covered fragment
                                # just passed 0-covered fragment
                                if mapping.start - cur_deletion.prev_bad > QuastDeletion.MIN_GAP:
                                    cur_deletion.set_next_bad(mapping)
                                    if mapping.mapq >= Mapping.MIN_MAP_QUALITY:
                                        cur_deletion.set_next_good(mapping)
                                        if cur_deletion.is_valid():
                                            deletions.append(cur_deletion)
                                        cur_deletion = QuastDeletion(mapping.ref).set_prev_good(mapping)
                                # continue region BEFORE 0-covered fragment
                                elif mapping.mapq >= Mapping.MIN_MAP_QUALITY:
                                    cur_deletion.set_prev_good(mapping)
                                else:
                                    cur_deletion.set_prev_bad(mapping)
                            else:  # previous mapping was in region AFTER 0-covered fragment
                                # just passed another 0-cov fragment between end of cur_deletion BAD region and this mapping
                                if mapping.start - cur_deletion.next_bad_end > QuastDeletion.MIN_GAP:
                                    if cur_deletion.is_valid():   # add previous fragment's deletion if needed
                                        deletions.append(cur_deletion)
                                    cur_deletion = QuastDeletion(mapping.ref).set_prev_bad(position=cur_deletion.next_bad_end)
                                # continue region AFTER 0-covered fragment (old one or new/another one -- see "if" above)
                                if mapping.mapq >= Mapping.MIN_MAP_QUALITY:
                                    cur_deletion.set_next_good(mapping)
                                    if cur_deletion.is_valid():
                                        deletions.append(cur_deletion)
                                    cur_deletion = QuastDeletion(mapping.ref).set_prev_good(mapping)
                                else:
                                    cur_deletion.set_next_bad_end(mapping)
                        # special case: just started or just switched to the next reference
                        else:
                            if cur_deletion and cur_deletion.ref in seq_name_length:  # switched to the next ref
                                cur_deletion.set_next_good(position=seq_name_length[cur_deletion.ref])
                                if cur_deletion.is_valid():
                                    deletions.append(cur_deletion)
                            cur_deletion = QuastDeletion(mapping.ref).set_prev_good(mapping)

                        if need_ref_splitting:
                            cur_ref = ref_labels[mapping.ref]
                            if mapping.ref_next.strip() == '=' or cur_ref == ref_labels[mapping.ref_next]:
                                if ref_files[cur_ref] is not None:
                                    ref_files[cur_ref].write(line)
                if cur_deletion and cur_deletion.ref in seq_name_length:  # switched to the next ref
                    cur_deletion.set_next_good(position=seq_name_length[cur_deletion.ref])
                    if cur_deletion.is_valid():
                        deletions.append(cur_deletion)
            if need_ref_splitting:
                for ref_handler in ref_files.values():
                    if ref_handler is not None:
                        ref_handler.close()
            if need_trivial_deletions:
                logger.info('  Trivial deletions: %d found' % len(deletions))
                logger.info('    Saving to: ' + trivial_deletions_fpath)
                with open(trivial_deletions_fpath, 'w') as f:
                    for deletion in deletions:
                        f.write(str(deletion) + '\n')

        if isfile(config_manta_fpath):
            try:
                manta_sv_fpath = search_sv_with_manta(main_ref_fpath, meta_ref_fpaths, output_dirpath, err_path)
                qutils.cat_files([manta_sv_fpath, trivial_deletions_fpath], bed_fpath)
            except:
                pass
        if os.path.exists(trivial_deletions_fpath) and not is_non_empty_file(bed_fpath):
            shutil.copy(trivial_deletions_fpath, bed_fpath)

    if not qconfig.no_sv:
        if is_non_empty_file(bed_fpath):
            logger.main_info('  Structural variations are in ' + bed_fpath)
        else:
            if isfile(bed_fpath):
                logger.main_info('  No structural variations were found.')
            else:
                logger.main_info('  Failed searching structural variations.')
            bed_fpath = None
    if is_non_empty_file(cov_fpath):
        logger.main_info('  Coverage distribution along the reference genome is in ' + cov_fpath)
    else:
        if not qconfig.create_icarus_html:
            logger.main_info('  Failed to calculate coverage distribution')
        cov_fpath = None
    return bed_fpath, cov_fpath, physical_cov_fpath
Пример #56
0
def save_combined_ref_stats(results, contigs_fpaths, ref_labels_by_chromosomes,
                            output_dir, logger):
    """Save per-reference interspecies translocation statistics for every assembly.

    For each assembly with translocation data, a file
    'interspecies_translocations_by_refs_<assembly>.info' is written to
    `output_dir`; the module-level `intergenomic_misassemblies_by_asm` mapping
    is filled for every assembly; and, when `qconfig.draw_plots` is set, a
    summary plot is drawn through `plotter.draw_meta_summary_plot`.

    :param results: per-assembly result mappings (falsy entries are allowed);
        entries are matched to `contigs_fpaths` by position
    :param contigs_fpaths: assembly file paths
    :param ref_labels_by_chromosomes: mapping whose values are reference names
    :param output_dir: directory receiving the .info files and the plot
    :param logger: logger used for progress messages and passed to the plotter
    :return: None
    """
    translocations_per_asm = [
        res['istranslocations_by_refs'] if res else None for res in results
    ]
    misassemblies_per_asm = [
        res['misassemblies_by_ref'] if res else None for res in results
    ]

    # Distinct reference names, first-seen order unless sorting is requested.
    all_refs = []
    for ref_name in ref_labels_by_chromosomes.values():
        if ref_name not in all_refs:
            all_refs.append(ref_name)
    if not qconfig.use_input_ref_order:
        all_refs.sort()

    # First row is the header; one row per assembly follows.
    summary_rows = [{'metricName': 'References', 'values': all_refs}]
    if not translocations_per_asm:
        return

    for asm_idx, contigs_fpath in enumerate(contigs_fpaths):
        label = qutils.label_from_fpath(contigs_fpath)
        asm_row = {'metricName': label, 'values': []}
        summary_rows.append(asm_row)
        translocations = translocations_per_asm[asm_idx]
        asm_misassemblies = misassemblies_per_asm[asm_idx]

        intergenomic_misassemblies_by_asm[label] = defaultdict(list)
        per_ref_store = intergenomic_misassemblies_by_asm[label]
        for ref_name in all_refs:
            per_ref_store[ref_name] = asm_misassemblies[ref_name] if asm_misassemblies else []

        if not translocations:
            # nothing to report for this assembly; its summary row stays empty
            continue

        assembly_name = qutils.name_from_fpath(contigs_fpath)
        # Matrix of translocation counts: references are numbered in the header row.
        matrix_rows = [{
            'metricName': 'References',
            'values': list(range(1, len(all_refs) + 1))
        }]
        for ref_name in all_refs:
            counts = []
            for other_ref in all_refs:
                if ref_name != other_ref and other_ref in translocations:
                    counts.append(translocations[ref_name][other_ref])
                else:
                    counts.append(None)
            if asm_misassemblies:
                possible = asm_misassemblies[ref_name].count(
                    Misassembly.POSSIBLE_MISASSEMBLIES)
            else:
                possible = 0
            found = max(0, sum(cnt for cnt in counts if cnt))
            asm_row['values'].append(found + possible)
            matrix_rows.append({'metricName': ref_name, 'values': counts})

        info_fpath = os.path.join(
            output_dir,
            'interspecies_translocations_by_refs_%s.info' % assembly_name)
        with open(info_fpath, 'w') as info_file:
            info_file.write(
                'Number of interspecies translocations by references: \n')
        print_file(matrix_rows, info_fpath, append_to_existing_file=True)

        # Legend mapping the numbers used in the matrix header to reference names.
        with open(info_fpath, 'a') as info_file:
            info_file.write('References:\n')
            for ref_num, ref_name in enumerate(all_refs, 1):
                info_file.write(str(ref_num) + ' - ' + ref_name + '\n')
        logger.info(
            '  Information about interspecies translocations by references for %s is saved to %s'
            % (assembly_name, info_fpath))

    if not qconfig.draw_plots:
        return

    from quast_libs import plotter

    # Only assemblies that produced values take part in the plot.
    plotted_rows = [row for row in summary_rows[1:] if row['values']]
    aligned_contigs_labels = [row['metricName'] for row in plotted_rows]
    # Transpose: one list per reference, holding that reference's value for each assembly.
    misassemblies = [[row['values'][ref_idx] for row in plotted_rows]
                     for ref_idx in range(len(all_refs))]
    is_translocations_plot_fpath = os.path.join(output_dir,
                                                'intergenomic_misassemblies')
    plotter.draw_meta_summary_plot(
        '', output_dir, aligned_contigs_labels, all_refs, misassemblies,
        is_translocations_plot_fpath,
        title='Intergenomic misassemblies (found and supposed)',
        reverse=False, yaxis_title=None, print_all_refs=True, logger=logger)
Пример #57
0
def save_result(result, report, fname, ref_fpath):
    """Copy the alignment statistics from `result` into `report`.

    :param result: mapping with the analysis results of one assembly; the keys
        that are read are listed at the top of the function body
    :param report: report object of the assembly; fields are written with
        `add_field` (INDELS is read back with `get_field`)
    :param fname: assembly file path; used to fetch per-reference sub-reports,
        to derive the assembly label and to check
        `qconfig.dict_of_broken_scaffolds`
    :param ref_fpath: reference file path; only its name is used, and only
        when the reference is not a combined one
    :return: the same `report` object with all fields added
    """
    def count_extensive(misassemblies):
        # relocations + inversions + translocations + interspecies translocations
        return (misassemblies.count(Misassembly.RELOCATION) +
                misassemblies.count(Misassembly.INVERSION) +
                misassemblies.count(Misassembly.TRANSLOCATION) +
                misassemblies.count(Misassembly.INTERSPECTRANSLOCATION))

    region_misassemblies = result['region_misassemblies']
    misassemblies_by_ref = result['misassemblies_by_ref']
    misassemblies_matched_sv = result['misassemblies_matched_sv']
    misassembled_contigs = result['misassembled_contigs']
    misassembled_bases = result['misassembled_bases']
    misassembly_internal_overlap = result['misassembly_internal_overlap']
    unaligned = result['unaligned']
    partially_unaligned = result['partially_unaligned']
    partially_unaligned_bases = result['partially_unaligned_bases']
    fully_unaligned_bases = result['fully_unaligned_bases']
    ambiguous_contigs = result['ambiguous_contigs']
    ambiguous_contigs_extra_bases = result['ambiguous_contigs_extra_bases']
    SNPs = result['SNPs']
    indels_list = result['indels_list']
    total_aligned_bases = result['total_aligned_bases']
    half_unaligned_with_misassembly = result['half_unaligned_with_misassembly']

    extensive_misassemblies = count_extensive(region_misassemblies)
    local_misassemblies = region_misassemblies.count(Misassembly.LOCAL)

    report.add_field(reporting.Fields.MISLOCAL, local_misassemblies)
    report.add_field(reporting.Fields.MISASSEMBL, extensive_misassemblies)
    report.add_field(reporting.Fields.MISCONTIGS, len(misassembled_contigs))
    report.add_field(reporting.Fields.MISCONTIGSBASES, misassembled_bases)
    report.add_field(reporting.Fields.MISINTERNALOVERLAP, misassembly_internal_overlap)
    if qconfig.bed:
        report.add_field(reporting.Fields.STRUCT_VARIATIONS, misassemblies_matched_sv)
    report.add_field(reporting.Fields.UNALIGNED, '%d + %d part' % (unaligned, partially_unaligned))
    report.add_field(reporting.Fields.UNALIGNEDBASES, (fully_unaligned_bases + partially_unaligned_bases))
    report.add_field(reporting.Fields.AMBIGUOUS, ambiguous_contigs)
    report.add_field(reporting.Fields.AMBIGUOUSEXTRABASES, ambiguous_contigs_extra_bases)
    report.add_field(reporting.Fields.MISMATCHES, SNPs)
    # different types of indels:
    if indels_list is not None:
        report.add_field(reporting.Fields.INDELS, len(indels_list))
        report.add_field(reporting.Fields.INDELSBASES, sum(indels_list))
        report.add_field(reporting.Fields.MIS_SHORT_INDELS,
                         sum(1 for size in indels_list if size <= qconfig.SHORT_INDEL_THRESHOLD))
        report.add_field(reporting.Fields.MIS_LONG_INDELS,
                         sum(1 for size in indels_list if size > qconfig.SHORT_INDEL_THRESHOLD))

    # error rates per 100 kbp of aligned sequence
    if total_aligned_bases:
        report.add_field(reporting.Fields.SUBSERROR, "%.2f" % (float(SNPs) * 100000.0 / float(total_aligned_bases)))
        # INDELS is only stored when indels_list is available, so the rate that
        # reads it back must be skipped in the same situation.
        if indels_list is not None:
            report.add_field(reporting.Fields.INDELSERROR, "%.2f" % (float(report.get_field(reporting.Fields.INDELS))
                                                                     * 100000.0 / float(total_aligned_bases)))

    # for misassemblies report:
    report.add_field(reporting.Fields.MIS_ALL_EXTENSIVE, extensive_misassemblies)
    report.add_field(reporting.Fields.MIS_RELOCATION, region_misassemblies.count(Misassembly.RELOCATION))
    report.add_field(reporting.Fields.MIS_TRANSLOCATION, region_misassemblies.count(Misassembly.TRANSLOCATION))
    report.add_field(reporting.Fields.MIS_INVERTION, region_misassemblies.count(Misassembly.INVERSION))
    report.add_field(reporting.Fields.MIS_EXTENSIVE_CONTIGS, len(misassembled_contigs))
    report.add_field(reporting.Fields.MIS_EXTENSIVE_BASES, misassembled_bases)
    report.add_field(reporting.Fields.MIS_LOCAL, local_misassemblies)
    if qconfig.is_combined_ref:
        report.add_field(reporting.Fields.MIS_ISTRANSLOCATIONS, region_misassemblies.count(Misassembly.INTERSPECTRANSLOCATION))
        report.add_field(reporting.Fields.CONTIGS_WITH_ISTRANSLOCATIONS, region_misassemblies.count(Misassembly.POTENTIALLY_MIS_CONTIGS))
        report.add_field(reporting.Fields.POSSIBLE_MISASSEMBLIES, region_misassemblies.count(Misassembly.POSSIBLE_MISASSEMBLIES))
        # one sub-report per distinct reference of the combined reference
        all_references = sorted(set(ref_labels_by_chromosomes.values()))
        for ref_name in all_references:
            subreport = reporting.get(fname, ref_name=ref_name)
            ref_misassemblies = misassemblies_by_ref[ref_name]
            subreport.add_field(reporting.Fields.MIS_ALL_EXTENSIVE, count_extensive(ref_misassemblies))
            subreport.add_field(reporting.Fields.MIS_RELOCATION, ref_misassemblies.count(Misassembly.RELOCATION))
            subreport.add_field(reporting.Fields.MIS_TRANSLOCATION, ref_misassemblies.count(Misassembly.TRANSLOCATION))
            subreport.add_field(reporting.Fields.MIS_INVERTION, ref_misassemblies.count(Misassembly.INVERSION))
            subreport.add_field(reporting.Fields.MIS_ISTRANSLOCATIONS, ref_misassemblies.count(Misassembly.INTERSPECTRANSLOCATION))
            subreport.add_field(reporting.Fields.MIS_LOCAL, ref_misassemblies.count(Misassembly.LOCAL))
            subreport.add_field(reporting.Fields.POSSIBLE_MISASSEMBLIES, ref_misassemblies.count(Misassembly.POSSIBLE_MISASSEMBLIES))
            subreport.add_field(reporting.Fields.CONTIGS_WITH_ISTRANSLOCATIONS, ref_misassemblies.count(Misassembly.POTENTIALLY_MIS_CONTIGS))
            if fname not in qconfig.dict_of_broken_scaffolds:
                subreport.add_field(reporting.Fields.MIS_SCAFFOLDS_GAP, ref_misassemblies.count(Misassembly.SCAFFOLD_GAP))
            if qconfig.check_for_fragmented_ref:
                subreport.add_field(reporting.Fields.MIS_FRAGMENTED, ref_misassemblies.count(Misassembly.FRAGMENTED))
    elif intergenomic_misassemblies_by_asm:
        # single reference: take the intergenomic counts from the module-level
        # mapping, keyed by assembly label and then by reference name
        label = qutils.label_from_fpath(fname)
        ref_name = qutils.name_from_fpath(ref_fpath)
        ref_misassemblies = intergenomic_misassemblies_by_asm[label][ref_name]
        report.add_field(reporting.Fields.MIS_ISTRANSLOCATIONS, ref_misassemblies.count(Misassembly.INTERSPECTRANSLOCATION))
        report.add_field(reporting.Fields.POSSIBLE_MISASSEMBLIES, ref_misassemblies.count(Misassembly.POSSIBLE_MISASSEMBLIES))
        report.add_field(reporting.Fields.CONTIGS_WITH_ISTRANSLOCATIONS, ref_misassemblies.count(Misassembly.POTENTIALLY_MIS_CONTIGS))
    if fname not in qconfig.dict_of_broken_scaffolds:
        report.add_field(reporting.Fields.MIS_SCAFFOLDS_GAP, region_misassemblies.count(Misassembly.SCAFFOLD_GAP))
    if qconfig.check_for_fragmented_ref:
        report.add_field(reporting.Fields.MIS_FRAGMENTED, region_misassemblies.count(Misassembly.FRAGMENTED))
    # for unaligned report:
    report.add_field(reporting.Fields.UNALIGNED_FULL_CNTGS, unaligned)
    report.add_field(reporting.Fields.UNALIGNED_FULL_LENGTH, fully_unaligned_bases)
    report.add_field(reporting.Fields.UNALIGNED_PART_CNTGS, partially_unaligned)
    report.add_field(reporting.Fields.UNALIGNED_PART_LENGTH, partially_unaligned_bases)
    report.add_field(reporting.Fields.UNALIGNED_MISASSEMBLED_CTGS, half_unaligned_with_misassembly)
    return report
Пример #58
0
def do(ref_fpath, original_ref_fpath, output_dirpath):
    """Generate the Upper Bound Assembly for a reference.

    The function first looks for an already prepared assembly (via
    `check_prepared_optimal_assembly`) and returns its path when one is found.
    Otherwise it computes unique covered regions of the reference, optionally
    joins them using long reads or mate-pairs, and writes the result as FASTA
    into `output_dirpath`.

    :param ref_fpath: path of the reference FASTA that is processed
    :param original_ref_fpath: path of the original reference; its directory
        and basename define where a reusable prepared assembly is looked for
    :param output_dirpath: directory for the resulting FASTA, the log file and
        a temporary 'tmp' sub-directory
    :return: path of the assembly FASTA (newly created or reused), or None when
        required tools are unavailable, the platform is 'linux_32', or the
        unique-region computation fails
    """
    logger.print_timestamp()
    logger.main_info("Generating Upper Bound Assembly...")

    if not reads_analyzer.compile_reads_analyzer_tools(logger):
        logger.warning(
            '  Sorry, can\'t create Upper Bound Assembly '
            '(failed to compile necessary third-party read processing tools [bwa, bedtools, minimap2]), skipping...'
        )
        return None

    if qconfig.platform_name == 'linux_32':
        logger.warning(
            '  Sorry, can\'t create Upper Bound Assembly on this platform '
            '(only linux64 and macOS are supported), skipping...')
        return None

    red_dirpath = get_dir_for_download('red', 'Red', ['Red'], logger)
    binary_fpath = download_external_tool('Red',
                                          red_dirpath,
                                          'red',
                                          platform_specific=True,
                                          is_executable=True)
    if not binary_fpath or not os.path.isfile(binary_fpath):
        logger.warning(
            '  Sorry, can\'t create Upper Bound Assembly '
            '(failed to install/download third-party repeat finding tool [Red]), skipping...'
        )
        return None

    insert_size = qconfig.optimal_assembly_insert_size
    if insert_size == 'auto' or not insert_size:
        insert_size = qconfig.optimal_assembly_default_IS

    long_reads = qconfig.pacbio_reads or qconfig.nanopore_reads

    def with_polished_suffix(basename):
        # Tag the file name with the kind of reads used for joining regions, if any.
        if long_reads:
            return add_suffix(basename, long_reads_polished_suffix)
        if qconfig.mate_pairs:
            return add_suffix(basename, mp_polished_suffix)
        return basename

    ref_basename, _ = splitext_for_fasta_file(os.path.basename(ref_fpath))
    result_basename = with_polished_suffix('%s.%s.is%d.fasta' % (
        ref_basename, qconfig.optimal_assembly_basename, insert_size))
    result_fpath = os.path.join(output_dirpath, result_basename)

    original_ref_basename, _ = splitext_for_fasta_file(
        os.path.basename(original_ref_fpath))
    prepared_optimal_assembly_basename = with_polished_suffix(
        '%s.%s.is%d.fasta' % (original_ref_basename,
                              qconfig.optimal_assembly_basename, insert_size))
    ref_prepared_optimal_assembly = os.path.join(
        os.path.dirname(original_ref_fpath),
        prepared_optimal_assembly_basename)
    already_done_fpath = check_prepared_optimal_assembly(
        insert_size, result_fpath, ref_prepared_optimal_assembly)
    if already_done_fpath:
        return already_done_fpath

    uncovered_fpath = None
    reads_analyzer_dir = join(dirname(output_dirpath),
                              qconfig.reads_stats_dirname)
    if qconfig.reads_fpaths or qconfig.reference_sam or qconfig.reference_bam:
        # only the uncovered-regions file is needed from this alignment
        _, _, uncovered_fpath = reads_analyzer.align_reference(
            ref_fpath,
            reads_analyzer_dir,
            using_reads='all',
            calculate_coverage=True)

    # The configured insert size may differ from the one used for the file names
    # above (presumably it is updated by the alignment step -- not visible here).
    # Falsy values are ignored, consistent with the check at the top.
    calculated_insert_size = qconfig.optimal_assembly_insert_size
    if calculated_insert_size and calculated_insert_size != 'auto' \
            and calculated_insert_size != insert_size:
        old_tag = 'is' + str(insert_size)
        new_tag = 'is' + str(calculated_insert_size)
        # Rename the basename only: replacing in the full path could also
        # rewrite a directory component that happens to contain the tag.
        result_basename = result_basename.replace(old_tag, new_tag)
        result_fpath = os.path.join(output_dirpath, result_basename)
        prepared_optimal_assembly_basename = prepared_optimal_assembly_basename.replace(
            old_tag, new_tag)
        insert_size = calculated_insert_size
        ref_prepared_optimal_assembly = os.path.join(
            os.path.dirname(original_ref_fpath),
            prepared_optimal_assembly_basename)
        already_done_fpath = check_prepared_optimal_assembly(
            insert_size, result_fpath, ref_prepared_optimal_assembly)
        if already_done_fpath:
            return already_done_fpath

    log_fpath = os.path.join(output_dirpath, 'upper_bound_assembly.log')
    # start from a clean temporary directory
    tmp_dir = os.path.join(output_dirpath, 'tmp')
    if os.path.isdir(tmp_dir):
        shutil.rmtree(tmp_dir)
    os.makedirs(tmp_dir)

    unique_covered_regions, repeats_regions = get_unique_covered_regions(
        ref_fpath,
        tmp_dir,
        log_fpath,
        binary_fpath,
        insert_size,
        uncovered_fpath,
        use_long_reads=long_reads)
    if unique_covered_regions is None:
        logger.error(
            '  Failed to create Upper Bound Assembly, see log for details: ' +
            log_fpath)
        return None

    reference = list(fastaparser.read_fasta(ref_fpath))
    result_fasta = []

    if long_reads or qconfig.mate_pairs:
        # join unique regions using long reads or mate-pairs
        if long_reads:
            join_reads = 'pacbio' if qconfig.pacbio_reads else 'nanopore'
        else:
            join_reads = 'mp'
        sam_fpath, bam_fpath, _ = reads_analyzer.align_reference(
            ref_fpath, reads_analyzer_dir, using_reads=join_reads)
        joiners = get_joiners(qutils.name_from_fpath(ref_fpath), sam_fpath,
                              bam_fpath, tmp_dir, log_fpath, join_reads)
        uncovered_regions = parse_bed(
            uncovered_fpath) if join_reads == 'mp' else defaultdict(list)
        mp_len = calculate_read_len(sam_fpath) if join_reads == 'mp' else None
        for chrom, seq in reference:
            region_pairing = get_regions_pairing(unique_covered_regions[chrom],
                                                 joiners[chrom], mp_len)
            ref_coords_to_output = scaffolding(unique_covered_regions[chrom],
                                               region_pairing)
            get_fasta_entries_from_coords(result_fasta, (chrom, seq),
                                          ref_coords_to_output,
                                          repeats_regions[chrom],
                                          uncovered_regions[chrom])
    else:
        # no joining reads: every long enough unique region becomes a contig
        for chrom, seq in reference:
            for idx, region in enumerate(unique_covered_regions[chrom]):
                if region[1] - region[0] >= MIN_CONTIG_LEN:
                    result_fasta.append(
                        (chrom + '_' + str(idx), seq[region[0]:region[1]]))

    fastaparser.write_fasta(result_fpath, result_fasta)
    logger.info('  ' + 'Theoretical Upper Bound Assembly is saved to ' +
                result_fpath)
    logger.notice(
        '(on reusing *this* Upper Bound Assembly in the *future* evaluations on *the same* dataset)\n'
        '\tThe next time, you can simply provide this file as an additional assembly (you could also rename it to UpperBound.fasta for the clarity). '
        'In this case, you do not need to specify --upper-bound-assembly and provide files with reads (--pe1/pe2, etc).\n'
        '\t\tOR\n'
        '\tYou can copy ' + result_fpath + ' to ' +
        ref_prepared_optimal_assembly + '. '
        'The next time you evaluate assemblies with --upper-bound-assembly option and against the same reference ('
        + original_ref_fpath + ') and '
        'the same reads (or if you specify the insert size of the paired-end reads explicitly with --est-insert-size '
        + str(insert_size) + '), '
        'QUAST will reuse this Upper Bound Assembly.\n')

    # temporary files are kept only in debug mode
    if not qconfig.debug:
        shutil.rmtree(tmp_dir)

    logger.main_info('Done.')
    return result_fpath
Пример #59
0
def save_combined_ref_stats(results, contigs_fpaths, ref_labels_by_chromosomes, output_dir, logger):
    """Write interspecies translocation tables per assembly and draw the summary plot.

    Every assembly gets an entry in the module-level
    `intergenomic_misassemblies_by_asm` mapping. Assemblies that have
    translocation data additionally get a file
    'interspecies_translocations_by_refs_<assembly>.info' in `output_dir`.
    When `qconfig.draw_plots` is set, the totals are passed to
    `plotter.draw_meta_summary_plot`.

    :param results: per-assembly result mappings (falsy entries are allowed),
        positionally aligned with `contigs_fpaths`
    :param contigs_fpaths: assembly file paths
    :param ref_labels_by_chromosomes: mapping whose values are reference names
    :param output_dir: directory for the .info files and the plot
    :param logger: logger for progress messages; also handed to the plotter
    :return: None
    """
    istranslocations_by_asm = [item['istranslocations_by_refs'] if item else None for item in results]
    misassemblies_by_asm = [item['misassemblies_by_ref'] if item else None for item in results]

    # Collect each reference name once, keeping first-seen order.
    all_refs = []
    for candidate in ref_labels_by_chromosomes.values():
        if candidate not in all_refs:
            all_refs.append(candidate)
    if not qconfig.use_input_ref_order:
        all_refs.sort()

    # Header row first, then one totals row per assembly.
    totals_rows = [{'metricName': 'References', 'values': all_refs}]
    if not istranslocations_by_asm:
        return

    for idx, fpath in enumerate(contigs_fpaths):
        label = qutils.label_from_fpath(fpath)
        totals_row = {'metricName': label, 'values': []}
        totals_rows.append(totals_row)
        by_ref = istranslocations_by_asm[idx]
        asm_misassemblies = misassemblies_by_asm[idx]

        intergenomic_misassemblies_by_asm[label] = defaultdict(list)
        stored = intergenomic_misassemblies_by_asm[label]
        for ref in all_refs:
            stored[ref] = asm_misassemblies[ref] if asm_misassemblies else []

        if not by_ref:
            # no translocation data: the totals row of this assembly stays empty
            continue

        assembly_name = qutils.name_from_fpath(fpath)
        # Pairwise table; its header numbers the references starting from 1.
        table = [{'metricName': 'References', 'values': list(range(1, len(all_refs) + 1))}]
        for ref in all_refs:
            cells = []
            for second_ref in all_refs:
                if ref != second_ref and second_ref in by_ref:
                    cells.append(by_ref[ref][second_ref])
                else:
                    cells.append(None)
            possible_misassemblies = 0
            if asm_misassemblies:
                possible_misassemblies = asm_misassemblies[ref].count(Misassembly.POSSIBLE_MISASSEMBLIES)
            istranslocations = max(0, sum(cell for cell in cells if cell))
            totals_row['values'].append(istranslocations + possible_misassemblies)
            table.append({'metricName': ref, 'values': cells})

        out_fpath = os.path.join(output_dir, 'interspecies_translocations_by_refs_%s.info' % assembly_name)
        with open(out_fpath, 'w') as out_file:
            out_file.write('Number of interspecies translocations by references: \n')
        print_file(table, out_fpath, append_to_existing_file=True)

        # Legend: number used in the table header -> reference name.
        with open(out_fpath, 'a') as out_file:
            out_file.write('References:\n')
            for number, ref in enumerate(all_refs, 1):
                out_file.write(str(number) + ' - ' + ref + '\n')
        logger.info('  Information about interspecies translocations by references for %s is saved to %s' %
                    (assembly_name, out_fpath))

    if not qconfig.draw_plots:
        return

    from quast_libs import plotter

    # Assemblies without values are left out of the plot.
    rows_with_values = [row for row in totals_rows[1:] if row['values']]
    aligned_contigs_labels = [row['metricName'] for row in rows_with_values]
    # One list per reference with the corresponding value of every plotted assembly.
    misassemblies = [[row['values'][ref_idx] for row in rows_with_values]
                     for ref_idx in range(len(all_refs))]
    is_translocations_plot_fpath = os.path.join(output_dir, 'intergenomic_misassemblies')
    plotter.draw_meta_summary_plot('', output_dir, aligned_contigs_labels, all_refs,
                                   misassemblies, is_translocations_plot_fpath,
                                   title='Intergenomic misassemblies (found and supposed)', reverse=False,
                                   yaxis_title=None, print_all_refs=True, logger=logger)
Пример #60
0
def _read_lap_score(lap_out_fpath):
    """Parse the score from the first line of a LAP output file.

    Returns the first whitespace-separated token of the first line converted
    to float, or None when that line is empty or holds only whitespace.
    A non-numeric token still raises ValueError, as before.
    """
    with open(lap_out_fpath) as lap_file:
        tokens = lap_file.readline().split()
    # A whitespace-only line is truthy but splits into an empty list, so the
    # token list (not the raw line) has to be tested before indexing.
    return float(tokens[0]) if tokens else None


def _format_lap_score(lap_score):
    """Render a LAP score with three decimals; None is passed through."""
    return '%.3f' % lap_score if lap_score is not None else None


def _add_coverage_fields(report, stats, thresholds_field, first_threshold_field):
    """Add coverage-threshold fields from *stats* to *report*.

    Nothing is added unless stats['coverage_thresholds'] is non-empty and has
    exactly one entry per configured qconfig.coverage_thresholds value.
    """
    coverage = stats['coverage_thresholds']
    if coverage and len(coverage) == len(qconfig.coverage_thresholds):
        report.add_field(thresholds_field, list(coverage))
        report.add_field(first_threshold_field, coverage[0])


def add_statistics_to_report(output_dir, contigs_fpaths, ref_fpath):
    """Copy read-mapping statistics and LAP scores into the per-assembly reports.

    For the reference (when *ref_fpath* is given) and for every assembly in
    *contigs_fpaths*, the function looks in *output_dir* for
    ``<name>.stat`` (parsed with parse_reads_stats) and ``<name>.lap.out``
    (first token of the first line is the score).

    Reference statistics, when available, are added to the report of every
    assembly. Assembly statistics are added only when the assembly's own
    ``.stat`` file exists.

    :param output_dir: directory holding the ``.stat`` and ``.lap.out`` files
    :param contigs_fpaths: paths of the assemblies whose reports are filled
    :param ref_fpath: path of the reference, or a falsy value if there is none
    """
    from quast_libs import reporting

    fields = reporting.Fields
    # (report field, key in the parsed stats), in the order they are added.
    ref_stat_fields = (
        (fields.REF_MAPPED_READS, 'mapped'),
        (fields.REF_MAPPED_READS_PCNT, 'mapped_pcnt'),
        (fields.REF_PROPERLY_PAIRED_READS, 'paired'),
        (fields.REF_PROPERLY_PAIRED_READS_PCNT, 'paired_pcnt'),
        (fields.REF_SINGLETONS, 'singletons'),
        (fields.REF_SINGLETONS_PCNT, 'singletons_pcnt'),
        (fields.REF_MISJOINT_READS, 'misjoint'),
        (fields.REF_MISJOINT_READS_PCNT, 'misjoint_pcnt'),
        (fields.REF_DEPTH, 'depth'),
    )
    assembly_stat_fields = (
        (fields.TOTAL_READS, 'total'),
        (fields.LEFT_READS, 'left'),
        (fields.RIGHT_READS, 'right'),
        (fields.MAPPED_READS, 'mapped'),
        (fields.MAPPED_READS_PCNT, 'mapped_pcnt'),
        (fields.PROPERLY_PAIRED_READS, 'paired'),
        (fields.PROPERLY_PAIRED_READS_PCNT, 'paired_pcnt'),
        (fields.SINGLETONS, 'singletons'),
        (fields.SINGLETONS_PCNT, 'singletons_pcnt'),
        (fields.MISJOINT_READS, 'misjoint'),
        (fields.MISJOINT_READS_PCNT, 'misjoint_pcnt'),
        (fields.DEPTH, 'depth'),
    )

    ref_reads_stats = None
    ref_lap_score = None
    if ref_fpath:
        ref_name = qutils.name_from_fpath(ref_fpath)
        stats_fpath = join(output_dir, ref_name + '.stat')
        if isfile(stats_fpath):
            ref_reads_stats = parse_reads_stats(stats_fpath)
            if int(ref_reads_stats['mapped']) == 0:
                logger.info('  BWA: nothing aligned for reference.')
        lap_out_fpath = get_safe_fpath(output_dir, ref_name + '.lap.out')
        if is_non_empty_file(lap_out_fpath):
            ref_lap_score = _read_lap_score(lap_out_fpath)

    # process all contigs files
    for index, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        assembly_name = qutils.name_from_fpath(contigs_fpath)
        assembly_label = qutils.label_from_fpath(contigs_fpath)
        stats_fpath = join(output_dir, assembly_name + '.stat')

        # Reference statistics are repeated in every assembly's report.
        if ref_reads_stats:
            for field, key in ref_stat_fields:
                report.add_field(field, ref_reads_stats[key])
            _add_coverage_fields(report, ref_reads_stats,
                                 fields.REF_COVERAGE__FOR_THRESHOLDS,
                                 fields.REF_COVERAGE_1X_THRESHOLD)

        # NOTE(review): without a .stat file the LAP fields below are skipped
        # as well; this matches the previous behaviour - confirm it is intended.
        if not isfile(stats_fpath):
            continue

        reads_stats = parse_reads_stats(stats_fpath)
        for field, key in assembly_stat_fields:
            report.add_field(field, reads_stats[key])
        if int(reads_stats['mapped']) == 0:
            logger.info('  ' + qutils.index_to_str(index) + 'BWA: nothing aligned for ' + '\'' + assembly_label + '\'.')
        _add_coverage_fields(report, reads_stats,
                             fields.COVERAGE__FOR_THRESHOLDS,
                             fields.COVERAGE_1X_THRESHOLD)

        lap_out_fpath = get_safe_fpath(output_dir, assembly_name + '.lap.out')
        if is_non_empty_file(lap_out_fpath):
            report.add_field(fields.LAP_SCORE,
                             _format_lap_score(_read_lap_score(lap_out_fpath)))
        report.add_field(fields.REF_LAP_SCORE, _format_lap_score(ref_lap_score))