示例#1
0
def get_assemblies_data(contigs_fpaths, icarus_dirpath, stdout_pattern, nx_marks):
    """Build the JavaScript snippets and the Nx table describing each assembly.

    For every path in `contigs_fpaths` the report returned by `reporting.get`
    is queried for the TOTALLEN, CONTIGS and N50 fields and for every field
    listed in `nx_marks`.

    Returns a 3-tuple:
      * JS text declaring the five `assemblies_*` objects, followed by one
        `assemblies_links[...]` assignment per assembly when `stdout_pattern`
        is truthy (the link is made relative to `icarus_dirpath`);
      * JS text with the `assemblies_len`, `assemblies_contigs` and
        `assemblies_n50` assignments of every assembly;
      * a defaultdict mapping assembly label -> {nx mark: field value}.
    """
    nx_by_assembly = defaultdict(dict)
    declared_objects = ('assemblies_links', 'assemblies_len', 'assemblies_contigs',
                        'assemblies_misassemblies', 'assemblies_n50')
    js_chunks = [''.join(('var ', obj_name, ' = {};\n')) for obj_name in declared_objects]
    size_chunks = []

    for contigs_fpath in contigs_fpaths:
        label = qutils.label_from_fpath(contigs_fpath)
        report = reporting.get(contigs_fpath)
        total_len = report.get_field(reporting.Fields.TOTALLEN)
        contigs_count = report.get_field(reporting.Fields.CONTIGS)
        n50_value = report.get_field(reporting.Fields.N50)

        if stdout_pattern:
            stdout_fpath = stdout_pattern % qutils.label_from_fpath_for_fname(contigs_fpath) + '.stdout'
            stdout_fpath = qutils.relpath(stdout_fpath, icarus_dirpath)
            js_chunks.append(''.join(('assemblies_links["', label, '"] = "', stdout_fpath, '";\n')))

        size_chunks.append(''.join(('assemblies_len["', label, '"] = ', str(total_len), ';\n')))
        size_chunks.append(''.join(('assemblies_contigs["', label, '"] = ', str(contigs_count), ';\n')))
        # N50 is emitted as a quoted JS string, unlike the two numeric values above.
        size_chunks.append(''.join(('assemblies_n50["', label, '"] = "', str(n50_value), '";\n')))

        for nx_mark in nx_marks:
            nx_by_assembly[label][nx_mark] = report.get_field(nx_mark)

    return ''.join(js_chunks), ''.join(size_chunks), nx_by_assembly
示例#2
0
文件: circos.py 项目: ablab/quast
def create_labels(chr_lengths, assemblies, features_containers, coverage_fpath, output_dir):
    """Write the track-label data and configuration files into `output_dir`.

    Tracks are numbered from zero in this order: one per item of
    `assemblies` (labelled 'assembly1', 'assembly2', ...), one per feature
    container whose `region_list` is non-empty (labelled with its `kind`),
    and a final 'coverage' track when `coverage_fpath` is truthy.

    'labels.txt' receives a single line anchored at the first key of
    `chr_lengths`; 'label.conf' receives the text-plot settings that refer to
    that file.

    Returns `(path to label.conf, list of (label, track index) tuples)`.
    """
    labels_txt_fpath = join(output_dir, 'labels.txt')

    # One track per assembly, in input order.
    track_labels = [('assembly' + str(pos + 1), pos) for pos, _ in enumerate(assemblies)]

    # The next free track index always equals the number of labels so far.
    for container in features_containers:
        if len(container.region_list) == 0:
            continue
        track_labels.append((container.kind, len(track_labels)))

    if coverage_fpath:
        track_labels.append(('coverage', len(track_labels)))

    with open(labels_txt_fpath, 'w') as labels_f:
        first_chr = list(chr_lengths.keys())[0]
        track_vars = ['track%d=%s' % (track_idx, name) for name, track_idx in track_labels]
        labels_f.write(first_chr + '\t0\t0\tnull\t' + ','.join(track_vars))

    labels_conf_fpath = join(output_dir, 'label.conf')
    with open(labels_conf_fpath, 'w') as conf_f:
        conf_lines = [
            'z = 10',
            'type = text',
            'label_size = 30p',
            'label_font = bold',
            'label_parallel = yes',
            'file = ' + relpath(labels_txt_fpath, output_dir),
            'r0 = eval(sprintf("%fr+5p", conf(conf(., track_idx)_pos)))',
            'r1 = eval(sprintf("%fr+500p", conf(conf(., track_idx)_pos)))',
            '<rules>',
            '<rule>',
            'condition = 1',
            'value = eval(var(conf(., track_idx)))',
            '</rule>',
            '</rules>',
        ]
        conf_f.write('\n'.join(conf_lines) + '\n')
    return labels_conf_fpath, track_labels
示例#3
0
def create_labels(chr_lengths, assemblies, features_containers, coverage_fpath, output_dir):
    """Create 'labels.txt' and 'label.conf' in `output_dir` for the plot tracks.

    Each assembly gets a track named 'assembly<N>' (N counted from 1), each
    feature container with a non-empty `region_list` gets a track named after
    its `kind`, and a truthy `coverage_fpath` adds a last 'coverage' track.
    Track indices are consecutive and start at zero.

    Returns a tuple of the 'label.conf' path and the list of
    `(label, track index)` pairs.
    """
    labels_txt_fpath = join(output_dir, 'labels.txt')
    track_labels = []
    next_track = 0

    for assembly_num, _assembly in enumerate(assemblies, 1):
        track_labels.append(('assembly' + str(assembly_num), next_track))
        next_track += 1

    # Containers without regions are not drawn, so they get no label either.
    shown_kinds = [fc.kind for fc in features_containers if len(fc.region_list) > 0]
    for kind in shown_kinds:
        track_labels.append((kind, next_track))
        next_track += 1

    if coverage_fpath:
        track_labels.append(('coverage', next_track))

    with open(labels_txt_fpath, 'w') as out_f:
        anchor_chr = list(chr_lengths.keys())[0]
        assignments = ','.join(['track%d=%s' % (idx, label) for label, idx in track_labels])
        out_f.write(anchor_chr + '\t0\t0\tnull\t' + assignments)

    labels_conf_fpath = join(output_dir, 'label.conf')
    with open(labels_conf_fpath, 'w') as out_f:
        plot_settings = ''.join((
            'z = 10\n',
            'type = text\n',
            'label_size = 30p\n',
            'label_font = bold\n',
            'label_parallel = yes\n',
            'file = ', relpath(labels_txt_fpath, output_dir), '\n',
            'r0 = eval(sprintf("%fr+5p", conf(conf(., track_idx)_pos)))\n',
            'r1 = eval(sprintf("%fr+500p", conf(conf(., track_idx)_pos)))\n',
        ))
        rule_section = ''.join((
            '<rules>\n',
            '<rule>\n',
            'condition = 1\n',
            'value = eval(var(conf(., track_idx)))\n',
            '</rule>\n',
            '</rules>\n',
        ))
        out_f.write(plot_settings + rule_section)
    return labels_conf_fpath, track_labels
示例#4
0
def create_housekeeping_file(chr_lengths, max_points, root_dir, output_dir, logger):
    """Write a customised housekeeping.conf and return its include directive.

    The template is looked up next to the `circos` executable, first in
    '../libexec/etc' and then in '../etc'. Every template line mentioning
    'max_points_per_track' or 'max_ideograms' is replaced with an assignment
    of `max_points` / the number of entries in `chr_lengths`; all other lines
    are copied unchanged into `output_dir`/housekeeping.conf.

    Returns the string '<<include PATH>>\\n', where PATH is the generated file
    relative to `root_dir`. If Circos or its template cannot be found, a
    warning is logged and the default 'etc/housekeeping.conf' include is
    returned instead.
    """
    max_ideograms = len(chr_lengths)

    # Resolve the executable once and reuse the result for both the template
    # lookup and the choice of warning message.
    circos_bin_fpath = get_path_to_program('circos')
    template_fpath = None
    if circos_bin_fpath:
        circos_dirpath = dirname(realpath(circos_bin_fpath))
        template_fpath = join(circos_dirpath, '..', 'libexec', 'etc', 'housekeeping.conf')
        if not is_non_empty_file(template_fpath):
            template_fpath = join(circos_dirpath, '..', 'etc', 'housekeeping.conf')

    if not template_fpath or not is_non_empty_file(template_fpath):
        if not circos_bin_fpath:
            msg = 'Circos is not found.'
        else:
            msg = 'File etc/housekeeping.conf is not found.'
        logger.warning(msg + ' You will have to manually edit etc/housekeeping.conf: '
                       'set max_points_per_track to ' + str(max_points) + ' and max_ideograms to ' + str(max_ideograms))
        return '<<include %s>>\n' % join('etc', 'housekeeping.conf')

    housekeeping_fpath = join(output_dir, 'housekeeping.conf')
    with open(template_fpath) as f:
        with open(housekeeping_fpath, 'w') as out_f:
            for line in f:
                if 'max_points_per_track' in line:
                    out_f.write('max_points_per_track = %d\n' % max_points)
                elif 'max_ideograms' in line:
                    out_f.write('max_ideograms = %d\n' % max_ideograms)
                else:
                    out_f.write(line)
    return '<<include %s>>\n' % relpath(housekeeping_fpath, root_dir)
示例#5
0
文件: circos.py 项目: student-t/quast
def create_housekeeping_file(chr_lengths, max_points, root_dir, output_dir, logger):
    """Generate housekeeping.conf in `output_dir` and return the include line.

    A template housekeeping.conf is searched relative to the directory of the
    resolved `circos` executable ('../libexec/etc', then '../etc'). Template
    lines containing 'max_points_per_track' or 'max_ideograms' are rewritten
    with `max_points` and `len(chr_lengths)`; other lines are copied as is.

    Returns '<<include PATH>>\\n' with PATH relative to `root_dir`. When the
    executable or the template is missing, logs a warning describing the
    manual edit and returns the include line for 'etc/housekeeping.conf'.
    """
    max_ideograms = len(chr_lengths)
    template_fpath = None

    # Single lookup: the original repeated this call three times.
    circos_bin_fpath = get_path_to_program('circos')
    if circos_bin_fpath:
        circos_dirpath = dirname(realpath(circos_bin_fpath))
        template_fpath = join(circos_dirpath, '..', 'libexec', 'etc', 'housekeeping.conf')
        if not is_non_empty_file(template_fpath):
            template_fpath = join(circos_dirpath, '..', 'etc', 'housekeeping.conf')

    # `template_fpath` stays None when Circos is absent; do not hand None to
    # the file check.
    template_found = bool(template_fpath) and is_non_empty_file(template_fpath)
    if not template_found:
        if circos_bin_fpath:
            msg = 'File etc/housekeeping.conf is not found.'
        else:
            msg = 'Circos is not found.'
        logger.warning(msg + ' You will have to manually edit etc/housekeeping.conf: '
                       'set max_points_per_track to ' + str(max_points) + ' and max_ideograms to ' + str(max_ideograms))
        return '<<include %s>>\n' % join('etc', 'housekeeping.conf')

    housekeeping_fpath = join(output_dir, 'housekeeping.conf')
    with open(template_fpath) as f:
        with open(housekeeping_fpath, 'w') as out_f:
            for line in f:
                if 'max_points_per_track' in line:
                    out_f.write('max_points_per_track = %d\n' % max_points)
                elif 'max_ideograms' in line:
                    out_f.write('max_ideograms = %d\n' % max_ideograms)
                else:
                    out_f.write(line)
    return '<<include %s>>\n' % relpath(housekeeping_fpath, root_dir)
示例#6
0
from quast_libs.qutils import compile_tool, check_prev_compilation_failed

# Per-tool directories located under qconfig.LIBS_LOCATION.
bwa_dirpath = join(qconfig.LIBS_LOCATION, 'bwa')
sambamba_dirpath = join(qconfig.LIBS_LOCATION, 'sambamba')
bedtools_dirpath = join(qconfig.LIBS_LOCATION, 'bedtools')
bedtools_bin_dirpath = join(qconfig.LIBS_LOCATION, 'bedtools', 'bin')
manta_dirpath = join(qconfig.LIBS_LOCATION, 'manta')
manta_build_dirpath = join(qconfig.LIBS_LOCATION, 'manta', 'build')
manta_bin_dirpath = join(qconfig.LIBS_LOCATION, 'manta', 'build', 'bin')
# Path of configManta.py inside the manta build's bin directory.
config_manta_fpath = join(manta_bin_dirpath, 'configManta.py')

# Manta archives (one per platform) under QUAST_HOME/external_tools/manta.
manta_external_dirpath = join(qconfig.QUAST_HOME, 'external_tools/manta')
manta_ext_linux_fpath = join(manta_external_dirpath, 'manta_linux.tar.bz2')
manta_ext_osx_fpath = join(manta_external_dirpath, 'manta_osx.tar.bz2')

# URLs formed by appending each archive's QUAST_HOME-relative path to qconfig.GIT_ROOT_URL.
manta_linux_url = qconfig.GIT_ROOT_URL + qutils.relpath(manta_ext_linux_fpath, qconfig.QUAST_HOME)
manta_osx_url = qconfig.GIT_ROOT_URL + qutils.relpath(manta_ext_osx_fpath, qconfig.QUAST_HOME)


def bwa_fpath(fname):
    """Return `fname` joined onto the BWA directory (`bwa_dirpath`)."""
    fpath = join(bwa_dirpath, fname)
    return fpath


def sambamba_fpath(fname):
    """Return the path of the platform-specific sambamba file for `fname`.

    The file name gets the suffix '_osx' when `qconfig.platform_name` is
    'macosx' and '_linux' otherwise, and is joined onto `sambamba_dirpath`.
    """
    if qconfig.platform_name == 'macosx':
        suffixed_fname = fname + '_osx'
    else:
        suffixed_fname = fname + '_linux'
    return join(sambamba_dirpath, suffixed_fname)


def bedtools_fpath(fname):
    """Return `fname` joined onto the bedtools bin directory (`bedtools_bin_dirpath`)."""
    fpath = join(bedtools_bin_dirpath, fname)
    return fpath
示例#7
0
from quast_libs.qutils import compile_tool, get_dir_for_download, relpath, get_path_to_program, download_file, \
    download_external_tool, is_non_empty_file, correct_name, get_free_memory

# Per-tool directories located under qconfig.LIBS_LOCATION.
bwa_dirpath = join(qconfig.LIBS_LOCATION, 'bwa')

bedtools_dirpath = join(qconfig.LIBS_LOCATION, 'bedtools')
bedtools_bin_dirpath = join(qconfig.LIBS_LOCATION, 'bedtools', 'bin')
sambamba_dirpath = join(qconfig.LIBS_LOCATION, 'sambamba')

# GRIDSS settings: the directory starts unset (None); the jar file name is
# derived from the version string.
gridss_dirpath = None
gridss_version = '1.4.1'
gridss_fname = 'gridss-' + gridss_version + '.jar'

# Jar location under QUAST_HOME/external_tools/gridss, and the URL formed by
# appending its QUAST_HOME-relative path to qconfig.GIT_ROOT_URL.
gridss_external_fpath = join(qconfig.QUAST_HOME, 'external_tools/gridss',
                             gridss_fname)
gridss_url = qconfig.GIT_ROOT_URL + relpath(gridss_external_fpath,
                                            qconfig.QUAST_HOME)


def bwa_fpath(fname):
    """Return the result of `get_path_to_program` for `fname`, passing `bwa_dirpath` as its second argument."""
    located_fpath = get_path_to_program(fname, bwa_dirpath)
    return located_fpath


def sambamba_fpath(fname):
    """Return the path of the platform-specific sambamba file for `fname`.

    '_osx' is appended to the name when `qconfig.platform_name` equals
    'macosx', '_linux' in every other case; the result is joined onto
    `sambamba_dirpath`.
    """
    suffix_by_mac = {True: '_osx', False: '_linux'}
    platform_suffix = suffix_by_mac[qconfig.platform_name == 'macosx']
    return join(sambamba_dirpath, fname + platform_suffix)


def bedtools_fpath(fname):
    """Return the result of `get_path_to_program` for `fname`, passing `bedtools_bin_dirpath` as its second argument."""
    located_fpath = get_path_to_program(fname, bedtools_bin_dirpath)
    return located_fpath

示例#8
0
from quast_libs.qutils import compile_tool, check_prev_compilation_failed

# Tool directories, all rooted at qconfig.LIBS_LOCATION.
bwa_dirpath = join(qconfig.LIBS_LOCATION, 'bwa')
sambamba_dirpath = join(qconfig.LIBS_LOCATION, 'sambamba')
bedtools_dirpath = join(qconfig.LIBS_LOCATION, 'bedtools')
bedtools_bin_dirpath = join(qconfig.LIBS_LOCATION, 'bedtools', 'bin')
manta_dirpath = join(qconfig.LIBS_LOCATION, 'manta')
manta_build_dirpath = join(qconfig.LIBS_LOCATION, 'manta', 'build')
manta_bin_dirpath = join(qconfig.LIBS_LOCATION, 'manta', 'build', 'bin')
# configManta.py is expected inside the manta build's bin directory.
config_manta_fpath = join(manta_bin_dirpath, 'configManta.py')

# Platform-specific manta archives under QUAST_HOME/external_tools/manta.
manta_external_dirpath = join(qconfig.QUAST_HOME, 'external_tools/manta')
manta_ext_linux_fpath = join(manta_external_dirpath, 'manta_linux.tar.bz2')
manta_ext_osx_fpath = join(manta_external_dirpath, 'manta_osx.tar.bz2')

# Each URL is qconfig.GIT_ROOT_URL followed by the archive path relative to QUAST_HOME.
manta_linux_url = qconfig.GIT_ROOT_URL + qutils.relpath(manta_ext_linux_fpath, qconfig.QUAST_HOME)
manta_osx_url = qconfig.GIT_ROOT_URL + qutils.relpath(manta_ext_osx_fpath, qconfig.QUAST_HOME)


def bwa_fpath(fname):
    """Build the path of `fname` inside `bwa_dirpath`."""
    path_parts = (bwa_dirpath, fname)
    return join(*path_parts)


def sambamba_fpath(fname):
    """Build the path of the sambamba file `fname` for the current platform.

    The name is suffixed with '_osx' if `qconfig.platform_name` is 'macosx',
    with '_linux' otherwise, and placed inside `sambamba_dirpath`.
    """
    is_mac = qconfig.platform_name == 'macosx'
    if is_mac:
        return join(sambamba_dirpath, fname + '_osx')
    return join(sambamba_dirpath, fname + '_linux')


def bedtools_fpath(fname):
    """Build the path of `fname` inside `bedtools_bin_dirpath`."""
    path_parts = (bedtools_bin_dirpath, fname)
    return join(*path_parts)
示例#9
0
def find_package_files(dirpath, package=quast_package):
    """List every file found under `package`/`dirpath`, walking recursively.

    Each returned path is made relative to `package` with `qutils.relpath`.
    """
    return [qutils.relpath(join(root, fname), package)
            for root, _subdirs, fnames in os.walk(join(package, dirpath))
            for fname in fnames]
示例#10
0
    import urllib.request as urllib

import xml.etree.ElementTree as ET
import socket
socket.setdefaulttimeout(120)

# Case-insensitive header patterns. The SILVA one captures the named groups
# `taxons` and `seqname` (separated by ';'); the NCBI one captures `id` and
# `seqname`.
silva_pattern = re.compile(r'\S+\_(?P<taxons>\S+);(?P<seqname>\S+)', re.I)
ncbi_pattern = re.compile(r'(?P<id>\S+\_[0-9.]+)[_ |](?P<seqname>\S+)', re.I)

# Base URL and file name of the SILVA release 123 export.
silva_db_url = 'http://www.arb-silva.de/fileadmin/silva_databases/release_123/Exports/'
silva_fname = 'SILVA_123_SSURef_Nr99_tax_silva.fasta'

# Platform-specific BLAST directory under QUAST_HOME/external_tools, and the
# URL formed by appending its QUAST_HOME-relative path to qconfig.GIT_ROOT_URL.
external_tools_dirpath = join(qconfig.QUAST_HOME, 'external_tools')
blast_external_tools_dirpath = join(external_tools_dirpath, 'blast', qconfig.platform_name)
blast_filenames = ['makeblastdb', 'blastn']
blast_dirpath_url = qconfig.GIT_ROOT_URL + qutils.relpath(blast_external_tools_dirpath, qconfig.QUAST_HOME)

# BLAST directory and database paths under qconfig.LIBS_LOCATION.
blast_dirpath = join(qconfig.LIBS_LOCATION, 'blast')
blastdb_dirpath = join(qconfig.LIBS_LOCATION, 'blast', '16S_RNA_blastdb')
db_fpath = join(blastdb_dirpath, 'silva.db')
# NOTE(review): presumably the expected size in bytes of the database .nsq file — confirm against its use.
db_nsq_fsize = 194318557

# Module-level state; initial values only (how they are updated is not visible here).
is_quast_first_run = False
taxons_for_krona = {}
connection_errors = 0


def get_blast_fpath(fname):
    """Return the path of `fname` inside `blast_dirpath` if that path exists.

    NOTE(review): as visible here there is no fallback branch, so the function
    implicitly returns None when the file is missing. The snippet may be
    truncated — confirm against the full source.
    """
    blast_path = os.path.join(blast_dirpath, fname)
    if os.path.exists(blast_path):
        return blast_path
示例#11
0
def align_and_analyze(is_cyclic, index, contigs_fpath, output_dirpath, ref_fpath,
                      reference_chromosomes, ns_by_chromosomes, old_contigs_fpath, bed_fpath, threads=1):
    """Align one assembly against the reference and analyze the alignments.

    Runs `align_contigs`, parses the resulting coords file into `Mapping`
    objects grouped by contig, then calls `analyze_contigs` and
    `analyze_coverage` and passes the merged statistics to `print_results`.
    Unless `qconfig.space_efficient` is set, log and report files are written
    into `output_dirpath` and the misassembled contigs are saved as a FASTA
    file; otherwise the log/report outputs go to /dev/null. When
    `qconfig.is_combined_ref` is set, per-chromosome alignment and
    unique-contig tables are written as well.

    `reference_chromosomes` is iterated as a mapping of chromosome name to
    sequence length.

    Returns the 5-tuple `(status, result, aligned_lengths,
    misassemblies_in_contigs, aligned_lengths_by_contigs)`. If the aligner
    status is not OK, returns `(status, {}, [], [], [])`; if nothing ended up
    in `ref_aligns`, the status is `AlignerStatus.NOT_ALIGNED`.

    NOTE(review): `bed_fpath` is not used anywhere in this body.
    """
    tmp_output_dirpath = create_minimap_output_dir(output_dirpath)
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    out_basename = join(tmp_output_dirpath, corr_assembly_label)

    logger.info('  ' + qutils.index_to_str(index) + assembly_label)

    if not qconfig.space_efficient:
        log_out_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.stdout')
        log_err_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.stderr')
        icarus_out_fpath = join(output_dirpath, qconfig.icarus_report_fname_pattern % corr_assembly_label)
        misassembly_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.mis_contigs.info')
        unaligned_info_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.unaligned.info')
    else:
        log_out_fpath = '/dev/null'
        log_err_fpath = '/dev/null'
        icarus_out_fpath = '/dev/null'
        misassembly_fpath = '/dev/null'
        unaligned_info_fpath = '/dev/null'

    icarus_out_f = open(icarus_out_fpath, 'w')
    icarus_header_cols = ['S1', 'E1', 'S2', 'E2', 'Reference', 'Contig', 'IDY', 'Ambiguous', 'Best_group']
    icarus_out_f.write('\t'.join(icarus_header_cols) + '\n')
    misassembly_f = open(misassembly_fpath, 'w')

    if not qconfig.space_efficient:
        logger.info('  ' + qutils.index_to_str(index) + 'Logging to files ' + log_out_fpath +
                    ' and ' + os.path.basename(log_err_fpath) + '...')
    else:
        logger.info('  ' + qutils.index_to_str(index) + 'Logging is disabled.')

    coords_fpath, coords_filtered_fpath, unaligned_fpath, used_snps_fpath = get_aux_out_fpaths(out_basename)
    status = align_contigs(coords_fpath, out_basename, ref_fpath, contigs_fpath, old_contigs_fpath, index, threads,
                           log_out_fpath, log_err_fpath)
    if status != AlignerStatus.OK:
        with open(log_err_fpath, 'a') as log_err_f:
            if status == AlignerStatus.ERROR:
                logger.error('  ' + qutils.index_to_str(index) +
                             'Failed aligning contigs ' + qutils.label_from_fpath(contigs_fpath) +
                             ' to the reference (non-zero exit code). ' +
                             ('Run with the --debug flag to see additional information.' if not qconfig.debug else ''))
            elif status == AlignerStatus.FAILED:
                log_err_f.write(qutils.index_to_str(index) + 'Alignment failed for ' + contigs_fpath + ': ' +
                                coords_fpath + ' doesn\'t exist.\n')
                logger.info('  ' + qutils.index_to_str(index) + 'Alignment failed for ' + '\'' + assembly_label + '\'.')
            elif status == AlignerStatus.NOT_ALIGNED:
                log_err_f.write(qutils.index_to_str(index) + 'Nothing aligned for ' + contigs_fpath + '\n')
                logger.info('  ' + qutils.index_to_str(index) + 'Nothing aligned for ' + '\'' + assembly_label + '\'.')
        # On this path the report files are never handed to CAOutput, so
        # nobody else would close them.
        icarus_out_f.close()
        misassembly_f.close()
        return status, {}, [], [], []

    log_out_f = open(log_out_fpath, 'a')
    # Loading the alignment files
    log_out_f.write('Parsing coords...\n')
    aligns = {}
    with open(coords_fpath) as coords_file:
        for line in coords_file:
            mapping = Mapping.from_line(line)
            aligns.setdefault(mapping.contig, []).append(mapping)

    # Loading the reference sequences
    log_out_f.write('Loading reference...\n')  # TODO: move up
    ref_features = {}

    # No regions are loaded here: every chromosome counts as one whole region.
    total_reg_len = 0
    total_regions = 0
    for name, seq_len in reference_chromosomes.items():
        log_out_f.write('\tLoaded [%s]\n' % name)
        total_regions += 1
        total_reg_len += seq_len
    log_out_f.write('\tTotal Regions: %d\n' % total_regions)
    log_out_f.write('\tTotal Region Length: %d\n' % total_reg_len)

    ca_output = CAOutput(stdout_f=log_out_f, misassembly_f=misassembly_f, coords_filtered_f=open(coords_filtered_fpath, 'w'),
                         icarus_out_f=icarus_out_f)

    log_out_f.write('Analyzing contigs...\n')
    result, ref_aligns, total_indels_info, aligned_lengths, misassembled_contigs, misassemblies_in_contigs, aligned_lengths_by_contigs =\
        analyze_contigs(ca_output, contigs_fpath, unaligned_fpath, unaligned_info_fpath, aligns, ref_features, reference_chromosomes, is_cyclic)

    log_out_f.write('Analyzing coverage...\n')
    if qconfig.show_snps:
        log_out_f.write('Writing SNPs into ' + used_snps_fpath + '\n')
    total_aligned_bases, indels_info = analyze_coverage(ref_aligns, reference_chromosomes, ns_by_chromosomes, used_snps_fpath)
    total_indels_info += indels_info
    cov_stats = {'SNPs': total_indels_info.mismatches, 'indels_list': total_indels_info.indels_list, 'total_aligned_bases': total_aligned_bases}
    result.update(cov_stats)
    result = print_results(contigs_fpath, log_out_f, used_snps_fpath, total_indels_info, result)

    if not qconfig.space_efficient:
        ## outputting misassembled contigs to separate file
        fasta = [(name, seq) for name, seq in fastaparser.read_fasta(contigs_fpath)
                 if name in misassembled_contigs.keys()]
        fastaparser.write_fasta(join(output_dirpath, qutils.name_from_fpath(contigs_fpath) + '.mis_contigs.fa'), fasta)

    if qconfig.is_combined_ref:
        alignment_tsv_fpath = join(output_dirpath, "alignments_" + corr_assembly_label + '.tsv')
        unique_contigs_fpath = join(output_dirpath, qconfig.unique_contigs_fname_pattern % corr_assembly_label)
        logger.debug('  ' + qutils.index_to_str(index) + 'Alignments: ' + qutils.relpath(alignment_tsv_fpath))
        used_contigs = set()
        # Compiled once; extracts the length and coverage embedded in contig names.
        len_cov_pattern = re.compile(r'_length_([\d\.]+)_cov_([\d\.]+)')
        with open(unique_contigs_fpath, 'w') as unique_contigs_f:
            with open(alignment_tsv_fpath, 'w') as alignment_tsv_f:
                for chr_name, chr_aligns in ref_aligns.items():
                    alignment_tsv_f.write(chr_name)
                    for contig in set([align.contig for align in chr_aligns]):
                        alignment_tsv_f.write('\t' + contig)

                    ref_name = ref_labels_by_chromosomes[chr_name]
                    align_by_contigs = defaultdict(int)
                    for align in chr_aligns:
                        align_by_contigs[align.contig] += align.len2
                    for contig, aligned_len in align_by_contigs.items():
                        # Each contig is considered only for the first chromosome it appears on.
                        if contig in used_contigs:
                            continue
                        used_contigs.add(contig)
                        len_cov_match = len_cov_pattern.search(contig)
                        if len_cov_match:
                            contig_len, contig_cov = len_cov_match.groups()
                            if aligned_len / float(contig_len) > 0.9:
                                unique_contigs_f.write(ref_name + '\t' + str(aligned_len) + '\t' + contig_cov + '\n')
                    alignment_tsv_f.write('\n')

    close_handlers(ca_output)
    logger.info('  ' + qutils.index_to_str(index) + 'Analysis is finished.')
    logger.debug('')
    if not ref_aligns:
        return AlignerStatus.NOT_ALIGNED, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
    else:
        return AlignerStatus.OK, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
示例#12
0
def find_package_files(dirpath, package=quast_package):
    """Collect the files below `package`/`dirpath`, descending into subdirectories.

    Returns a list with one entry per file, each path expressed relative to
    `package` via `qutils.relpath`.
    """
    collected = []
    walk_root = join(package, dirpath)
    for current_dir, _subdirs, fnames in os.walk(walk_root):
        collected.extend(qutils.relpath(join(current_dir, fname), package) for fname in fnames)
    return collected
示例#13
0
def align_and_analyze(is_cyclic, index, contigs_fpath, output_dirpath, ref_fpath,
                      old_contigs_fpath, bed_fpath, parallel_by_chr=False, threads=1):
    """Align one assembly against the reference with nucmer and analyze the result.

    Runs `align_contigs`, parses the coords file into `Mapping` objects
    grouped by contig, loads the reference sequences from `ref_fpath` and, if
    `qconfig.show_snps` is set, the SNP calls. It then calls `analyze_contigs`
    and `analyze_coverage` and passes the statistics to `print_results`.
    Unless `qconfig.space_efficient` is set, log and report files are written
    into `output_dirpath` and the misassembled contigs are saved as a FASTA
    file; otherwise the log/report outputs go to /dev/null. When
    `qconfig.is_combined_ref` is set, per-chromosome alignment and
    unique-contig tables are written as well. Temporary nucmer files are
    cleaned up, and the output is compressed unless `qconfig.no_gzip` is set.

    Returns the 5-tuple `(status, result, aligned_lengths,
    misassemblies_in_contigs, aligned_lengths_by_contigs)`. If the nucmer
    status is not OK, returns `(status, {}, [], [], [])`; if nothing ended up
    in `ref_aligns`, the status is `NucmerStatus.NOT_ALIGNED`.

    NOTE(review): `bed_fpath` is not used anywhere in this body.
    """
    nucmer_output_dirpath = create_nucmer_output_dir(output_dirpath)
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    nucmer_fpath = join(nucmer_output_dirpath, corr_assembly_label)

    logger.info('  ' + qutils.index_to_str(index) + assembly_label)

    if not qconfig.space_efficient:
        log_out_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.stdout')
        log_err_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.stderr')
        icarus_out_fpath = join(output_dirpath, qconfig.icarus_report_fname_pattern % corr_assembly_label)
        misassembly_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.mis_contigs.info')
        unaligned_info_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.unaligned.info')
    else:
        log_out_fpath = '/dev/null'
        log_err_fpath = '/dev/null'
        icarus_out_fpath = '/dev/null'
        misassembly_fpath = '/dev/null'
        unaligned_info_fpath = '/dev/null'

    icarus_out_f = open(icarus_out_fpath, 'w')
    icarus_header_cols = ['S1', 'E1', 'S2', 'E2', 'Reference', 'Contig', 'IDY', 'Ambiguous', 'Best_group']
    icarus_out_f.write('\t'.join(icarus_header_cols) + '\n')
    misassembly_f = open(misassembly_fpath, 'w')

    if not qconfig.space_efficient:
        logger.info('  ' + qutils.index_to_str(index) + 'Logging to files ' + log_out_fpath +
                    ' and ' + os.path.basename(log_err_fpath) + '...')
    else:
        logger.info('  ' + qutils.index_to_str(index) + 'Logging is disabled.')

    coords_fpath, coords_filtered_fpath, unaligned_fpath, show_snps_fpath, used_snps_fpath = \
        get_nucmer_aux_out_fpaths(nucmer_fpath)

    nucmer_status = align_contigs(nucmer_fpath, ref_fpath, contigs_fpath, old_contigs_fpath, index,
                                  parallel_by_chr, threads, log_out_fpath, log_err_fpath)
    if nucmer_status != NucmerStatus.OK:
        with open(log_err_fpath, 'a') as log_err_f:
            if nucmer_status == NucmerStatus.ERROR:
                logger.error('  ' + qutils.index_to_str(index) +
                             'Failed aligning contigs ' + qutils.label_from_fpath(contigs_fpath) +
                             ' to the reference (non-zero exit code). ' +
                             ('Run with the --debug flag to see additional information.' if not qconfig.debug else ''))
            elif nucmer_status == NucmerStatus.FAILED:
                log_err_f.write(qutils.index_to_str(index) + 'Alignment failed for ' + contigs_fpath + ': ' +
                                coords_fpath + ' doesn\'t exist.\n')
                logger.info('  ' + qutils.index_to_str(index) + 'Alignment failed for ' + '\'' + assembly_label + '\'.')
            elif nucmer_status == NucmerStatus.NOT_ALIGNED:
                log_err_f.write(qutils.index_to_str(index) + 'Nothing aligned for ' + contigs_fpath + '\n')
                logger.info('  ' + qutils.index_to_str(index) + 'Nothing aligned for ' + '\'' + assembly_label + '\'.')
        # On this path the report files are never handed to CAOutput, so
        # nobody else would close them.
        icarus_out_f.close()
        misassembly_f.close()
        clean_tmp_files(nucmer_fpath)
        return nucmer_status, {}, [], [], []

    log_out_f = open(log_out_fpath, 'a')
    # Loading the alignment files
    log_out_f.write('Parsing coords...\n')
    aligns = {}
    with open(coords_fpath) as coords_file:
        # The filtered file stays open: it is handed to CAOutput below.
        coords_filtered_file = open(coords_filtered_fpath, 'w')
        # The first two lines of the coords file are copied through unchanged.
        coords_filtered_file.write(coords_file.readline())
        coords_filtered_file.write(coords_file.readline())
        for line in coords_file:
            # A blank line terminates the alignment records.
            if line.strip() == '':
                break
            assert line[0] != '='
            mapping = Mapping.from_line(line)
            aligns.setdefault(mapping.contig, []).append(mapping)

    # Loading the reference sequences
    log_out_f.write('Loading reference...\n')  # TODO: move up
    references = {}
    ref_features = {}
    for name, seq in fastaparser.read_fasta(ref_fpath):
        name = name.split()[0]  # no spaces in reference header
        references[name] = seq
        log_out_f.write('\tLoaded [%s]\n' % name)

    # Loading the SNP calls
    used_snps_file = None
    snps = {}
    if qconfig.show_snps:
        log_out_f.write('Loading SNPs...\n')
        prev_line = None
        for line in open_gzipsafe(show_snps_fpath):
            line = line.split()
            # Skip blank lines and anything not starting with a position number.
            if not line or not line[0].isdigit():
                continue
            # Skip records identical to the last accepted one.
            if prev_line and line == prev_line:
                continue
            ref = line[10]
            ctg = line[11]
            pos = int(line[0])
            loc = int(line[3])

            # snps[ref][ctg][pos] collects every SNP reported at that reference position.
            snps.setdefault(ref, {}).setdefault(ctg, {}).setdefault(pos, []).append(
                SNP(ref_pos=pos, ctg_pos=loc, ref_nucl=line[1], ctg_nucl=line[2]))
            prev_line = line
        used_snps_file = open_gzipsafe(used_snps_fpath, 'w')

    # No regions are loaded here: every reference sequence counts as one whole region.
    regions = {}
    ref_lens = {}
    total_reg_len = 0
    total_regions = 0
    for name, seq in references.items():
        regions.setdefault(name, []).append([1, len(seq)])
        ref_lens[name] = len(seq)
        total_regions += 1
        total_reg_len += ref_lens[name]
    log_out_f.write('\tTotal Regions: %d\n' % total_regions)
    log_out_f.write('\tTotal Region Length: %d\n' % total_reg_len)

    ca_output = CAOutput(stdout_f=log_out_f, misassembly_f=misassembly_f, coords_filtered_f=coords_filtered_file,
                         used_snps_f=used_snps_file, icarus_out_f=icarus_out_f)

    log_out_f.write('Analyzing contigs...\n')
    result, ref_aligns, total_indels_info, aligned_lengths, misassembled_contigs, misassemblies_in_contigs, aligned_lengths_by_contigs =\
        analyze_contigs(ca_output, contigs_fpath, unaligned_fpath, unaligned_info_fpath, aligns, ref_features, ref_lens, is_cyclic)

    log_out_f.write('Analyzing coverage...\n')
    if qconfig.show_snps:
        log_out_f.write('Writing SNPs into ' + used_snps_fpath + '\n')
    result.update(analyze_coverage(ca_output, regions, ref_aligns, ref_features, snps, total_indels_info))
    result = print_results(contigs_fpath, log_out_f, used_snps_fpath, total_indels_info, result)

    if not qconfig.space_efficient:
        ## outputting misassembled contigs to separate file
        fasta = [(name, seq) for name, seq in fastaparser.read_fasta(contigs_fpath)
                 if name in misassembled_contigs.keys()]
        fastaparser.write_fasta(join(output_dirpath, qutils.name_from_fpath(contigs_fpath) + '.mis_contigs.fa'), fasta)

    if qconfig.is_combined_ref:
        alignment_tsv_fpath = join(output_dirpath, "alignments_" + corr_assembly_label + '.tsv')
        unique_contigs_fpath = join(output_dirpath, qconfig.unique_contigs_fname_pattern % corr_assembly_label)
        logger.debug('  ' + qutils.index_to_str(index) + 'Alignments: ' + qutils.relpath(alignment_tsv_fpath))
        used_contigs = set()
        # Compiled once; extracts the length and coverage embedded in contig names.
        len_cov_pattern = re.compile(r'_length_([\d\.]+)_cov_([\d\.]+)')
        with open(unique_contigs_fpath, 'w') as unique_contigs_f:
            with open(alignment_tsv_fpath, 'w') as alignment_tsv_f:
                for chr_name, chr_aligns in ref_aligns.items():
                    alignment_tsv_f.write(chr_name)
                    for contig in set([align.contig for align in chr_aligns]):
                        alignment_tsv_f.write('\t' + contig)

                    ref_name = ref_labels_by_chromosomes[chr_name]
                    align_by_contigs = defaultdict(int)
                    for align in chr_aligns:
                        align_by_contigs[align.contig] += align.len2
                    for contig, aligned_len in align_by_contigs.items():
                        # Each contig is considered only for the first chromosome it appears on.
                        if contig in used_contigs:
                            continue
                        used_contigs.add(contig)
                        len_cov_match = len_cov_pattern.search(contig)
                        if len_cov_match:
                            contig_len, contig_cov = len_cov_match.groups()
                            if aligned_len / float(contig_len) > 0.9:
                                unique_contigs_f.write(ref_name + '\t' + str(aligned_len) + '\t' + contig_cov + '\n')
                    alignment_tsv_f.write('\n')

    close_handlers(ca_output)
    logger.info('  ' + qutils.index_to_str(index) + 'Analysis is finished.')
    logger.debug('')
    clean_tmp_files(nucmer_fpath)
    if not qconfig.no_gzip:
        compress_nucmer_output(logger, nucmer_fpath)
    if not ref_aligns:
        return NucmerStatus.NOT_ALIGNED, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
    else:
        return NucmerStatus.OK, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
示例#14
0
文件: misc.py 项目: student-t/quast
from quast_libs.fastaparser import get_chr_lengths_from_fastafile
from quast_libs.qutils import compile_tool, get_dir_for_download, relpath, get_path_to_program, download_file, \
    download_external_tool, is_non_empty_file, correct_name, get_total_memory

# Per-tool directories located under qconfig.LIBS_LOCATION.
bwa_dirpath = join(qconfig.LIBS_LOCATION, 'bwa')
bedtools_dirpath = join(qconfig.LIBS_LOCATION, 'bedtools')
bedtools_bin_dirpath = join(qconfig.LIBS_LOCATION, 'bedtools', 'bin')
lap_dirpath = join(qconfig.LIBS_LOCATION, 'LAP')
sambamba_dirpath = join(qconfig.LIBS_LOCATION, 'sambamba')

# GRIDSS settings: the directory starts unset (None); the jar file name is
# derived from the version string.
gridss_dirpath = None
gridss_version = '1.4.1'
gridss_fname = 'gridss-' + gridss_version + '.jar'

# Jar location under QUAST_HOME/external_tools/gridss, and the URL formed by
# appending its QUAST_HOME-relative path to qconfig.GIT_ROOT_URL.
gridss_external_fpath = join(qconfig.QUAST_HOME, 'external_tools/gridss', gridss_fname)
gridss_url = qconfig.GIT_ROOT_URL + relpath(gridss_external_fpath, qconfig.QUAST_HOME)


def bwa_fpath(fname):
    """Resolve ``fname`` through ``get_path_to_program`` using ``bwa_dirpath``."""
    program_fpath = get_path_to_program(fname, bwa_dirpath)
    return program_fpath


def sambamba_fpath(fname):
    """Return the path of ``fname`` inside ``sambamba_dirpath`` with an OS suffix.

    ``_osx`` is appended when ``qconfig.platform_name`` equals ``'macosx'``;
    every other platform name gets ``_linux``.
    """
    if qconfig.platform_name == 'macosx':
        suffixed_fname = fname + '_osx'
    else:
        suffixed_fname = fname + '_linux'
    return join(sambamba_dirpath, suffixed_fname)


def bedtools_fpath(fname):
    """Resolve ``fname`` through ``get_path_to_program`` using ``bedtools_bin_dirpath``."""
    program_fpath = get_path_to_program(fname, bedtools_bin_dirpath)
    return program_fpath

示例#15
0
文件: circos.py 项目: student-t/quast
def create_conf(ref_fpath, contigs_fpaths, contig_report_fpath_pattern, output_dir, gc_fpath, features_containers, cov_fpath, logger):
    """Write ``circos.conf`` into ``output_dir`` and build the files it references.

    The per-track data files are produced under ``output_dir/data`` by the
    helper functions called below; this function then writes the top-level
    configuration that ties them together and finally creates the legend.

    Returns ``None`` when no alignment plot was produced; otherwise the tuple
    ``(conf_fpath, circos_legend_fpath)``.
    """
    data_dir = join(output_dir, 'data')
    if not exists(data_dir):
        os.makedirs(data_dir)

    chr_lengths = get_chr_lengths_from_fastafile(ref_fpath)
    max_len, karyotype_fpath, ideogram_fpath = create_ideogram(chr_lengths, data_dir)
    # The first threshold reached by max_len selects the unit; 1000 otherwise.
    chrom_units = 1000
    for threshold, units in ((10 ** 6, 10 ** 5), (10 ** 5, 10 ** 4)):
        if max_len >= threshold:
            chrom_units = units
            break
    ticks_fpath = create_ticks_conf(chrom_units, data_dir)
    ref_len = sum(chr_lengths.values())
    window_size = set_window_size(ref_len)

    assemblies, contig_points = parse_alignments(contigs_fpaths, contig_report_fpath_pattern)
    alignments_fpaths = []
    for assembly in assemblies:
        alignments_fpaths.append(create_alignment_plots(assembly, ref_len, data_dir))
    if not alignments_fpaths:
        return None

    gc_plot_fpath, min_gc, max_gc, gc_points = create_gc_plot(gc_fpath, data_dir)
    feature_fpaths, gene_points = create_genes_plot(features_containers, window_size, ref_len, data_dir)
    mismatches_fpaths = []
    for assembly in assemblies:
        mismatches_fpaths.append(create_mismatches_plot(assembly, window_size, ref_len, output_dir, data_dir))
    cov_data_fpath, cov_points = create_coverage_plot(cov_fpath, window_size, ref_len, data_dir)
    max_points = max(MAX_POINTS, gc_points, gene_points, cov_points, contig_points)
    labels_fpath, track_labels = create_labels(chr_lengths, assemblies, features_containers, cov_data_fpath, data_dir)

    conf_fpath = join(output_dir, 'circos.conf')

    # One gap per track. The last track of each group (assemblies, features,
    # coverage) is followed by the larger gap.
    track_intervals = [TRACK_INTERVAL] * len(assemblies)
    if feature_fpaths:
        track_intervals[-1] = BIG_TRACK_INTERVAL
        track_intervals.extend([TRACK_INTERVAL] * len(feature_fpaths))
    if cov_data_fpath:
        track_intervals[-1] = BIG_TRACK_INTERVAL
        track_intervals.append(TRACK_INTERVAL)
    track_intervals[-1] = BIG_TRACK_INTERVAL

    def track_radii(track_no):
        # r0/r1 lines that place a plot on track number `track_no`.
        track_pos = 'conf(track' + str(track_no) + '_pos)'
        inner = 'r0 = eval(sprintf("%.3fr",' + track_pos + ' - conf(track_width)))\n'
        outer = 'r1 = eval(sprintf("%.3fr",' + track_pos + '))\n'
        return inner + outer

    radius = 0.95
    plot_idx = 0
    with open(conf_fpath, 'w') as out_f:
        def write_lines(*parts):
            out_f.write(''.join(parts))

        write_lines('<<include etc/colors_fonts_patterns.conf>>\n')
        for included_fpath in (ideogram_fpath, ticks_fpath):
            write_lines('<<include %s>>\n' % relpath(included_fpath, output_dir))
        write_lines(
            'karyotype = %s\n' % relpath(karyotype_fpath, output_dir),
            'chromosomes_units = %d\n' % chrom_units,
            'chromosomes_display_default = yes\n',
            'track_width = ' + str(TRACK_WIDTH) + '\n',
        )

        # Track positions: each track starts one width plus its gap further in.
        for track_no, interval in enumerate(track_intervals):
            write_lines('track%d_pos = %f\n' % (track_no, radius))
            radius -= TRACK_WIDTH
            radius -= interval
        write_lines('track%d_pos = %f\n' % (len(track_intervals), radius))

        write_lines(
            '<image>\n',
            'dir = %s\n' % output_dir,
            'file = %s\n' % circos_png_fname,
            'png = yes\n',
            'svg = no\n',
            'radius = 1500p\n',
            'angle_offset = -90\n',
            'auto_alpha_colors = yes\n',
            'auto_alpha_steps = 5\n',
            'background = white\n',
            '</image>\n',
        )

        if qconfig.is_combined_ref:
            write_lines('<highlights>\n')
            highlights_fpath = create_meta_highlights(chr_lengths, data_dir)
            write_lines(
                '<highlight>\n',
                'file = %s\n' % relpath(highlights_fpath, output_dir),
                'r0 = 1r - 50p\n',
                'r1 = 1r - 30p\n',
                '</highlight>\n',
                '</highlights>\n',
            )

        write_lines(create_housekeeping_file(chr_lengths, max_points, output_dir, data_dir, logger))
        write_lines('<plots>\n', 'layers_overflow = collapse\n')

        for _label, label_track in track_labels:
            write_lines(
                '<plot>\n',
                'track_idx = track%d\n' % label_track,
                '<<include %s>>\n' % relpath(labels_fpath, output_dir),
                '</plot>\n',
            )

        # Assembly tracks: alignment tiles, plus a mismatch histogram on the
        # same track when one exists for that assembly.
        for assembly_no, alignments_conf in enumerate(alignments_fpaths):
            write_lines(
                '<plot>\n',
                'type = tile\n',
                'thickness = 50p\n',
                'stroke_thickness = 0\n',
                'layers = 1\n',
                'file = %s\n' % relpath(alignments_conf, output_dir),
                track_radii(plot_idx),
                '</plot>\n',
            )
            if mismatches_fpaths and mismatches_fpaths[assembly_no]:
                write_lines(
                    '<plot>\n',
                    'type = histogram\n',
                    'thickness = 1\n',
                    'fill_color = vlyellow\n',
                    'file = %s\n' % relpath(mismatches_fpaths[assembly_no], output_dir),
                    track_radii(plot_idx),
                    '</plot>\n',
                )
            plot_idx += 1

        # Genes plots, one heatmap track per feature file.
        for feature_fpath in feature_fpaths:
            write_lines(
                '<plot>\n',
                'type = heatmap\n',
                'file = %s\n' % relpath(feature_fpath, output_dir),
                'color = ylorbr-9\n',
                track_radii(plot_idx),
                '</plot>\n',
            )
            plot_idx += 1

        # Coverage plot.
        if cov_data_fpath:
            write_lines(
                '<plot>\n',
                'type = histogram\n',
                'thickness = 1\n',
                'file = %s\n' % relpath(cov_data_fpath, output_dir),
                'fill_color = vlblue\n',
                track_radii(plot_idx),
                '</plot>\n',
            )
            plot_idx += 1

        # GC plot, at fixed radii rather than on a numbered track.
        write_lines(
            '<plot>\n',
            'type = heatmap\n',
            'file = %s\n' % relpath(gc_plot_fpath, output_dir),
            'color = greys-6\n',
            'scale_log_base = 1.5\n',
            'r0 = 1r - 29p\n',
            'r1 = 1r - 1p\n',
            '</plot>\n',
            '</plots>\n',
        )

    circos_legend_fpath = create_legend(assemblies, min_gc, max_gc, features_containers, cov_data_fpath, output_dir)
    return conf_fpath, circos_legend_fpath
示例#16
0
def create_conf(ref_fpath, contigs_fpaths, contig_report_fpath_pattern, output_dir, gc_fpath, features_containers, cov_fpath, logger):
    """Write ``circos.conf`` into ``output_dir`` and build the files it references.

    The per-track data files are produced under ``output_dir/data`` by the
    helper functions called below (the coverage plot is given the
    per-chromosome lengths); this function then writes the top-level
    configuration that ties them together and finally creates the legend.

    Returns ``None`` when no alignment plot was produced; otherwise the tuple
    ``(conf_fpath, circos_legend_fpath)``.
    """
    data_dir = join(output_dir, 'data')
    if not exists(data_dir):
        os.makedirs(data_dir)

    chr_lengths = get_chr_lengths_from_fastafile(ref_fpath)
    max_len, karyotype_fpath, ideogram_fpath = create_ideogram(chr_lengths, data_dir)
    # The first threshold reached by max_len selects the unit; 1000 otherwise.
    chrom_units = 1000
    for threshold, units in ((10 ** 6, 10 ** 5), (10 ** 5, 10 ** 4)):
        if max_len >= threshold:
            chrom_units = units
            break
    ticks_fpath = create_ticks_conf(chrom_units, data_dir)
    ref_len = sum(chr_lengths.values())
    window_size = set_window_size(ref_len)

    assemblies, contig_points = parse_alignments(contigs_fpaths, contig_report_fpath_pattern)
    alignments_fpaths = []
    for assembly in assemblies:
        alignments_fpaths.append(create_alignment_plots(assembly, ref_len, data_dir))
    if not alignments_fpaths:
        return None

    gc_plot_fpath, min_gc, max_gc, gc_points = create_gc_plot(gc_fpath, data_dir)
    feature_fpaths, gene_points = create_genes_plot(features_containers, window_size, ref_len, data_dir)
    mismatches_fpaths = []
    for assembly in assemblies:
        mismatches_fpaths.append(create_mismatches_plot(assembly, window_size, ref_len, output_dir, data_dir))
    cov_data_fpath, cov_points = create_coverage_plot(cov_fpath, window_size, chr_lengths, data_dir)
    max_points = max(MAX_POINTS, gc_points, gene_points, cov_points, contig_points)
    labels_fpath, track_labels = create_labels(chr_lengths, assemblies, features_containers, cov_data_fpath, data_dir)

    conf_fpath = join(output_dir, 'circos.conf')

    # One gap per track. The last track of each group (assemblies, features,
    # coverage) is followed by the larger gap.
    track_intervals = [TRACK_INTERVAL] * len(assemblies)
    if feature_fpaths:
        track_intervals[-1] = BIG_TRACK_INTERVAL
        track_intervals.extend([TRACK_INTERVAL] * len(feature_fpaths))
    if cov_data_fpath:
        track_intervals[-1] = BIG_TRACK_INTERVAL
        track_intervals.append(TRACK_INTERVAL)
    track_intervals[-1] = BIG_TRACK_INTERVAL

    def track_radii(track_no):
        # r0/r1 lines that place a plot on track number `track_no`.
        track_pos = 'conf(track' + str(track_no) + '_pos)'
        inner = 'r0 = eval(sprintf("%.3fr",' + track_pos + ' - conf(track_width)))\n'
        outer = 'r1 = eval(sprintf("%.3fr",' + track_pos + '))\n'
        return inner + outer

    radius = 0.95
    plot_idx = 0
    with open(conf_fpath, 'w') as out_f:
        def write_lines(*parts):
            out_f.write(''.join(parts))

        write_lines('<<include etc/colors_fonts_patterns.conf>>\n')
        for included_fpath in (ideogram_fpath, ticks_fpath):
            write_lines('<<include %s>>\n' % relpath(included_fpath, output_dir))
        write_lines(
            'karyotype = %s\n' % relpath(karyotype_fpath, output_dir),
            'chromosomes_units = %d\n' % chrom_units,
            'chromosomes_display_default = yes\n',
            'track_width = ' + str(TRACK_WIDTH) + '\n',
        )

        # Track positions: each track starts one width plus its gap further in.
        for track_no, interval in enumerate(track_intervals):
            write_lines('track%d_pos = %f\n' % (track_no, radius))
            radius -= TRACK_WIDTH
            radius -= interval
        write_lines('track%d_pos = %f\n' % (len(track_intervals), radius))

        write_lines(
            '<image>\n',
            'dir = %s\n' % output_dir,
            'file = %s\n' % circos_png_fname,
            'png = yes\n',
            'svg = no\n',
            'radius = 1500p\n',
            'angle_offset = -90\n',
            'auto_alpha_colors = yes\n',
            'auto_alpha_steps = 5\n',
            'background = white\n',
            '</image>\n',
        )

        if qconfig.is_combined_ref:
            write_lines('<highlights>\n')
            highlights_fpath = create_meta_highlights(chr_lengths, data_dir)
            write_lines(
                '<highlight>\n',
                'file = %s\n' % relpath(highlights_fpath, output_dir),
                'r0 = 1r - 50p\n',
                'r1 = 1r - 30p\n',
                '</highlight>\n',
                '</highlights>\n',
            )

        write_lines(create_housekeeping_file(chr_lengths, max_points, output_dir, data_dir, logger))
        write_lines('<plots>\n', 'layers_overflow = collapse\n')

        for _label, label_track in track_labels:
            write_lines(
                '<plot>\n',
                'track_idx = track%d\n' % label_track,
                '<<include %s>>\n' % relpath(labels_fpath, output_dir),
                '</plot>\n',
            )

        # Assembly tracks: alignment tiles, plus a mismatch histogram on the
        # same track when one exists for that assembly.
        for assembly_no, alignments_conf in enumerate(alignments_fpaths):
            write_lines(
                '<plot>\n',
                'type = tile\n',
                'thickness = 50p\n',
                'stroke_thickness = 0\n',
                'layers = 1\n',
                'file = %s\n' % relpath(alignments_conf, output_dir),
                track_radii(plot_idx),
                '</plot>\n',
            )
            if mismatches_fpaths and mismatches_fpaths[assembly_no]:
                write_lines(
                    '<plot>\n',
                    'type = histogram\n',
                    'thickness = 1\n',
                    'fill_color = vlyellow\n',
                    'file = %s\n' % relpath(mismatches_fpaths[assembly_no], output_dir),
                    track_radii(plot_idx),
                    '</plot>\n',
                )
            plot_idx += 1

        # Genes plots, one heatmap track per feature file.
        for feature_fpath in feature_fpaths:
            write_lines(
                '<plot>\n',
                'type = heatmap\n',
                'file = %s\n' % relpath(feature_fpath, output_dir),
                'color = ylorbr-9\n',
                track_radii(plot_idx),
                '</plot>\n',
            )
            plot_idx += 1

        # Coverage plot.
        if cov_data_fpath:
            write_lines(
                '<plot>\n',
                'type = histogram\n',
                'thickness = 1\n',
                'file = %s\n' % relpath(cov_data_fpath, output_dir),
                'fill_color = vlblue\n',
                track_radii(plot_idx),
                '</plot>\n',
            )
            plot_idx += 1

        # GC plot, at fixed radii rather than on a numbered track.
        write_lines(
            '<plot>\n',
            'type = heatmap\n',
            'file = %s\n' % relpath(gc_plot_fpath, output_dir),
            'color = greys-6\n',
            'scale_log_base = 1.5\n',
            'r0 = 1r - 29p\n',
            'r1 = 1r - 1p\n',
            '</plot>\n',
            '</plots>\n',
        )

    circos_legend_fpath = create_legend(assemblies, min_gc, max_gc, features_containers, cov_data_fpath, output_dir)
    return conf_fpath, circos_legend_fpath
示例#17
0
def align_and_analyze(is_cyclic,
                      index,
                      contigs_fpath,
                      output_dirpath,
                      ref_fpath,
                      reference_chromosomes,
                      ns_by_chromosomes,
                      old_contigs_fpath,
                      bed_fpath,
                      threads=1):
    """Align one assembly to the reference and analyse the resulting alignments.

    Runs ``align_contigs``, parses the coords file into ``Mapping`` objects,
    then calls ``analyze_contigs`` and ``analyze_coverage`` and writes the
    per-assembly report files into ``output_dirpath`` (all report paths are
    ``/dev/null`` when ``qconfig.space_efficient`` is set).

    ``reference_chromosomes`` is iterated as ``name -> sequence length``.
    ``bed_fpath`` is accepted but not used by this function.

    Returns a 5-tuple ``(status, result, aligned_lengths,
    misassemblies_in_contigs, aligned_lengths_by_contigs)``. If the aligner
    status is not ``AlignerStatus.OK`` the tuple is ``(status, {}, [], [], [])``;
    if analysis yields no reference alignments the status is
    ``AlignerStatus.NOT_ALIGNED``.
    """
    tmp_output_dirpath = create_minimap_output_dir(output_dirpath)
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    out_basename = join(tmp_output_dirpath, corr_assembly_label)

    logger.info('  ' + qutils.index_to_str(index) + assembly_label)

    if not qconfig.space_efficient:
        report_fname = qconfig.contig_report_fname_pattern % corr_assembly_label
        log_out_fpath = join(output_dirpath, report_fname + '.stdout')
        log_err_fpath = join(output_dirpath, report_fname + '.stderr')
        icarus_out_fpath = join(
            output_dirpath,
            qconfig.icarus_report_fname_pattern % corr_assembly_label)
        misassembly_fpath = join(output_dirpath,
                                 report_fname + '.mis_contigs.info')
        unaligned_info_fpath = join(output_dirpath,
                                    report_fname + '.unaligned.info')
    else:
        log_out_fpath = '/dev/null'
        log_err_fpath = '/dev/null'
        icarus_out_fpath = '/dev/null'
        misassembly_fpath = '/dev/null'
        unaligned_info_fpath = '/dev/null'

    icarus_out_f = open(icarus_out_fpath, 'w')
    icarus_header_cols = [
        'S1', 'E1', 'S2', 'E2', 'Reference', 'Contig', 'IDY', 'Ambiguous',
        'Best_group'
    ]
    icarus_out_f.write('\t'.join(icarus_header_cols) + '\n')
    misassembly_f = open(misassembly_fpath, 'w')

    if not qconfig.space_efficient:
        logger.info('  ' + qutils.index_to_str(index) + 'Logging to files ' +
                    log_out_fpath + ' and ' + os.path.basename(log_err_fpath) +
                    '...')
    else:
        logger.info('  ' + qutils.index_to_str(index) + 'Logging is disabled.')

    coords_fpath, coords_filtered_fpath, unaligned_fpath, used_snps_fpath = get_aux_out_fpaths(
        out_basename)
    status = align_contigs(coords_fpath, out_basename, ref_fpath,
                           contigs_fpath, old_contigs_fpath, index, threads,
                           log_out_fpath, log_err_fpath)
    if status != AlignerStatus.OK:
        # The report files opened above are never handed to a CAOutput on this
        # path, so nothing else would close them.
        icarus_out_f.close()
        misassembly_f.close()
        with open(log_err_fpath, 'a') as log_err_f:
            if status == AlignerStatus.ERROR:
                logger.error(
                    '  ' + qutils.index_to_str(index) +
                    'Failed aligning contigs ' +
                    qutils.label_from_fpath(contigs_fpath) +
                    ' to the reference (non-zero exit code). ' +
                    ('Run with the --debug flag to see additional information.'
                     if not qconfig.debug else ''))
            elif status == AlignerStatus.FAILED:
                log_err_f.write(
                    qutils.index_to_str(index) + 'Alignment failed for ' +
                    contigs_fpath + ':' + coords_fpath + ' doesn\'t exist.\n')
                logger.info('  ' + qutils.index_to_str(index) +
                            'Alignment failed for ' + '\'' + assembly_label +
                            '\'.')
            elif status == AlignerStatus.NOT_ALIGNED:
                log_err_f.write(
                    qutils.index_to_str(index) + 'Nothing aligned for ' +
                    contigs_fpath + '\n')
                logger.info('  ' + qutils.index_to_str(index) +
                            'Nothing aligned for ' + '\'' + assembly_label +
                            '\'.')
        return status, {}, [], [], []

    log_out_f = open(log_out_fpath, 'a')
    # Loading the alignment files
    log_out_f.write('Parsing coords...\n')
    aligns = {}
    with open(coords_fpath) as coords_file:
        for line in coords_file:
            mapping = Mapping.from_line(line)
            aligns.setdefault(mapping.contig, []).append(mapping)

    # Loading the reference sequences
    log_out_f.write('Loading reference...\n')  # TODO: move up
    ref_features = {}

    # Every reference chromosome counts as one whole-length region.
    # TODO: gff
    total_reg_len = 0
    total_regions = 0
    for name, seq_len in reference_chromosomes.items():
        log_out_f.write('\tLoaded [%s]\n' % name)
        total_regions += 1
        total_reg_len += seq_len
    log_out_f.write('\tTotal Regions: %d\n' % total_regions)
    log_out_f.write('\tTotal Region Length: %d\n' % total_reg_len)

    ca_output = CAOutput(stdout_f=log_out_f,
                         misassembly_f=misassembly_f,
                         coords_filtered_f=open(coords_filtered_fpath, 'w'),
                         icarus_out_f=icarus_out_f)

    log_out_f.write('Analyzing contigs...\n')
    result, ref_aligns, total_indels_info, aligned_lengths, misassembled_contigs, misassemblies_in_contigs, aligned_lengths_by_contigs =\
        analyze_contigs(ca_output, contigs_fpath, unaligned_fpath, unaligned_info_fpath, aligns, ref_features, reference_chromosomes, is_cyclic)

    log_out_f.write('Analyzing coverage...\n')
    if qconfig.show_snps:
        log_out_f.write('Writing SNPs into ' + used_snps_fpath + '\n')
    total_aligned_bases, indels_info = analyze_coverage(
        ref_aligns, reference_chromosomes, ns_by_chromosomes, used_snps_fpath)
    total_indels_info += indels_info
    cov_stats = {
        'SNPs': total_indels_info.mismatches,
        'indels_list': total_indels_info.indels_list,
        'total_aligned_bases': total_aligned_bases
    }
    result.update(cov_stats)
    result = print_results(contigs_fpath, log_out_f, used_snps_fpath,
                           total_indels_info, result)

    if not qconfig.space_efficient:
        # Output the misassembled contigs to a separate FASTA file.
        fasta = [(name, seq)
                 for name, seq in fastaparser.read_fasta(contigs_fpath)
                 if name in misassembled_contigs]
        fastaparser.write_fasta(
            join(output_dirpath,
                 qutils.name_from_fpath(contigs_fpath) + '.mis_contigs.fa'),
            fasta)

    if qconfig.is_combined_ref:
        alignment_tsv_fpath = join(
            output_dirpath, "alignments_" + corr_assembly_label + '.tsv')
        unique_contigs_fpath = join(
            output_dirpath,
            qconfig.unique_contigs_fname_pattern % corr_assembly_label)
        logger.debug('  ' + qutils.index_to_str(index) + 'Alignments: ' +
                     qutils.relpath(alignment_tsv_fpath))
        used_contigs = set()
        # Compiled once; contig names carrying "_length_<x>_cov_<y>" provide
        # the length and coverage values used below.
        len_cov_pattern = re.compile(r'_length_([\d\.]+)_cov_([\d\.]+)')
        with open(unique_contigs_fpath, 'w') as unique_contigs_f:
            with open(alignment_tsv_fpath, 'w') as alignment_tsv_f:
                for chr_name, chr_aligns in ref_aligns.items():
                    # One TSV row per chromosome: its name, then its contigs.
                    alignment_tsv_f.write(chr_name)
                    contigs = set([align.contig for align in chr_aligns])
                    for contig in contigs:
                        alignment_tsv_f.write('\t' + contig)

                    ref_name = ref_labels_by_chromosomes[chr_name]
                    align_by_contigs = defaultdict(int)
                    for align in chr_aligns:
                        align_by_contigs[align.contig] += align.len2
                    for contig, aligned_len in align_by_contigs.items():
                        # Each contig is considered only for the first
                        # chromosome it is seen on.
                        if contig in used_contigs:
                            continue
                        used_contigs.add(contig)
                        len_cov_match = len_cov_pattern.search(contig)
                        if len_cov_match:
                            contig_len, contig_cov = len_cov_match.groups()
                            # Record contigs aligned over more than 90% of
                            # the length stated in their name.
                            if aligned_len / float(contig_len) > 0.9:
                                unique_contigs_f.write(ref_name + '\t' +
                                                       str(aligned_len) +
                                                       '\t' + contig_cov +
                                                       '\n')
                    alignment_tsv_f.write('\n')

    close_handlers(ca_output)
    logger.info('  ' + qutils.index_to_str(index) + 'Analysis is finished.')
    logger.debug('')
    if not ref_aligns:
        return AlignerStatus.NOT_ALIGNED, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
    else:
        return AlignerStatus.OK, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
示例#18
0
def align_and_analyze(is_cyclic, index, contigs_fpath, output_dirpath, ref_fpath,
                      old_contigs_fpath, bed_fpath, parallel_by_chr=False, threads=1):
    """Align one assembly to the reference with nucmer and analyse the result.

    Runs ``align_contigs``, parses the coords file into ``Mapping`` objects,
    reads reference lengths from ``ref_fpath``, optionally loads SNP calls
    (``qconfig.show_snps``), then calls ``analyze_contigs`` and
    ``analyze_coverage`` and writes the per-assembly report files into
    ``output_dirpath`` (all report paths are ``/dev/null`` when
    ``qconfig.space_efficient`` is set).

    ``bed_fpath`` is accepted but not used by this function.

    Returns a 5-tuple ``(status, result, aligned_lengths,
    misassemblies_in_contigs, aligned_lengths_by_contigs)``. If the aligner
    status is not ``NucmerStatus.OK`` the tuple is ``(status, {}, [], [], [])``;
    if analysis yields no reference alignments the status is
    ``NucmerStatus.NOT_ALIGNED``.
    """
    nucmer_output_dirpath = create_nucmer_output_dir(output_dirpath)
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    nucmer_fpath = join(nucmer_output_dirpath, corr_assembly_label)

    logger.info('  ' + qutils.index_to_str(index) + assembly_label)

    if not qconfig.space_efficient:
        report_fname = qconfig.contig_report_fname_pattern % corr_assembly_label
        log_out_fpath = join(output_dirpath, report_fname + '.stdout')
        log_err_fpath = join(output_dirpath, report_fname + '.stderr')
        icarus_out_fpath = join(output_dirpath, qconfig.icarus_report_fname_pattern % corr_assembly_label)
        misassembly_fpath = join(output_dirpath, report_fname + '.mis_contigs.info')
        unaligned_info_fpath = join(output_dirpath, report_fname + '.unaligned.info')
    else:
        log_out_fpath = '/dev/null'
        log_err_fpath = '/dev/null'
        icarus_out_fpath = '/dev/null'
        misassembly_fpath = '/dev/null'
        unaligned_info_fpath = '/dev/null'

    icarus_out_f = open(icarus_out_fpath, 'w')
    icarus_header_cols = ['S1', 'E1', 'S2', 'E2', 'Reference', 'Contig', 'IDY', 'Ambiguous', 'Best_group']
    icarus_out_f.write('\t'.join(icarus_header_cols) + '\n')
    misassembly_f = open(misassembly_fpath, 'w')

    if not qconfig.space_efficient:
        logger.info('  ' + qutils.index_to_str(index) + 'Logging to files ' + log_out_fpath +
                ' and ' + os.path.basename(log_err_fpath) + '...')
    else:
        logger.info('  ' + qutils.index_to_str(index) + 'Logging is disabled.')

    coords_fpath, coords_filtered_fpath, unaligned_fpath, show_snps_fpath, used_snps_fpath = \
        get_nucmer_aux_out_fpaths(nucmer_fpath)

    nucmer_status = align_contigs(nucmer_fpath, ref_fpath, contigs_fpath, old_contigs_fpath, index,
                                  parallel_by_chr, threads, log_out_fpath, log_err_fpath)
    if nucmer_status != NucmerStatus.OK:
        # The report files opened above are never handed to a CAOutput on this
        # path, so nothing else would close them.
        icarus_out_f.close()
        misassembly_f.close()
        with open(log_err_fpath, 'a') as log_err_f:
            if nucmer_status == NucmerStatus.ERROR:
                logger.error('  ' + qutils.index_to_str(index) +
                         'Failed aligning contigs ' + qutils.label_from_fpath(contigs_fpath) +
                         ' to the reference (non-zero exit code). ' +
                         ('Run with the --debug flag to see additional information.' if not qconfig.debug else ''))
            elif nucmer_status == NucmerStatus.FAILED:
                log_err_f.write(qutils.index_to_str(index) + 'Alignment failed for ' + contigs_fpath + ':' + coords_fpath + ' doesn\'t exist.\n')
                logger.info('  ' + qutils.index_to_str(index) + 'Alignment failed for ' + '\'' + assembly_label + '\'.')
            elif nucmer_status == NucmerStatus.NOT_ALIGNED:
                log_err_f.write(qutils.index_to_str(index) + 'Nothing aligned for ' + contigs_fpath + '\n')
                logger.info('  ' + qutils.index_to_str(index) + 'Nothing aligned for ' + '\'' + assembly_label + '\'.')
        clean_tmp_files(nucmer_fpath)
        return nucmer_status, {}, [], [], []

    log_out_f = open(log_out_fpath, 'a')
    # Loading the alignment files
    log_out_f.write('Parsing coords...\n')
    aligns = {}
    # The coords file is only read here, so close it as soon as parsing ends.
    # The filtered-coords file stays open: it is handed to CAOutput below.
    with open(coords_fpath) as coords_file:
        coords_filtered_file = open(coords_filtered_fpath, 'w')
        # The first two lines are copied through unchanged.
        coords_filtered_file.write(coords_file.readline())
        coords_filtered_file.write(coords_file.readline())
        for line in coords_file:
            # Parsing stops at the first blank line.
            if line.strip() == '':
                break
            assert line[0] != '='
            mapping = Mapping.from_line(line)
            aligns.setdefault(mapping.contig, []).append(mapping)

    # Loading the reference sequences
    log_out_f.write('Loading reference...\n') # TODO: move up
    ref_lens = {}
    ref_features = {}
    for name, seq in fastaparser.read_fasta(ref_fpath):
        name = name.split()[0]  # no spaces in reference header
        ref_lens[name] = len(seq)
        log_out_f.write('\tLoaded [%s]\n' % name)

    # Loading the SNP calls: snps[ref][ctg][ref_pos] -> list of SNP records.
    used_snps_file = None
    snps = {}
    if qconfig.show_snps:
        log_out_f.write('Loading SNPs...\n')
        prev_line = None
        # NOTE(review): the handle returned by open_gzipsafe is not closed
        # explicitly here; confirm whether it supports close()/with.
        for line in open_gzipsafe(show_snps_fpath):
            line = line.split()
            # Skip blank lines and anything not starting with a position.
            if not line or not line[0].isdigit():
                continue
            # Skip a record identical to the previous kept one.
            if prev_line and line == prev_line:
                continue
            ref = line[10]
            ctg = line[11]
            pos = int(line[0])
            loc = int(line[3])

            snps.setdefault(ref, {}).setdefault(ctg, {}).setdefault(pos, []).append(
                SNP(ref_pos=pos, ctg_pos=loc, ref_nucl=line[1], ctg_nucl=line[2]))
            prev_line = line
        used_snps_file = open_gzipsafe(used_snps_fpath, 'w')

    # Loading the regions (if any)
    regions = {}
    total_reg_len = 0
    total_regions = 0
    # # TODO: gff
    # log_out_f.write('Loading regions...\n')
    # log_out_f.write('\tNo regions given, using whole reference.\n')
    for name, seq_len in ref_lens.items():
        regions.setdefault(name, []).append([1, seq_len])
        total_regions += 1
        total_reg_len += seq_len
    log_out_f.write('\tTotal Regions: %d\n' % total_regions)
    log_out_f.write('\tTotal Region Length: %d\n' % total_reg_len)

    ca_output = CAOutput(stdout_f=log_out_f, misassembly_f=misassembly_f, coords_filtered_f=coords_filtered_file,
                         used_snps_f=used_snps_file, icarus_out_f=icarus_out_f)

    log_out_f.write('Analyzing contigs...\n')
    result, ref_aligns, total_indels_info, aligned_lengths, misassembled_contigs, misassemblies_in_contigs, aligned_lengths_by_contigs =\
        analyze_contigs(ca_output, contigs_fpath, unaligned_fpath, unaligned_info_fpath, aligns, ref_features, ref_lens, is_cyclic)

    # if qconfig.large_genome:
    #     log_out_f.write('Analyzing large blocks...\n')
    #     large_misassembly_fpath = add_suffix(misassembly_fpath, 'large_blocks') if not qconfig.space_efficient else '/dev/null'
    #     ca_large_output = CAOutput(stdout_f=log_out_f, misassembly_f=open(large_misassembly_fpath, 'w'),
    #                                coords_filtered_f=coords_filtered_file, used_snps_f=open('/dev/null', 'w'), icarus_out_f=open('/dev/null', 'w'))
    #     min_alignment, extensive_mis_threshold = qconfig.min_alignment, qconfig.extensive_misassembly_threshold
    #     qconfig.min_alignment, qconfig.extensive_misassembly_threshold = qconfig.LARGE_MIN_ALIGNMENT, qconfig.LARGE_EXTENSIVE_MIS_THRESHOLD
    #     result.update(analyze_contigs(ca_large_output, contigs_fpath, '/dev/null', '/dev/null',
    #                                   aligns, ref_features, ref_lens, is_cyclic, large_misassemblies_search=True)[0])
    #     qconfig.min_alignment, qconfig.extensive_misassembly_threshold = min_alignment, extensive_mis_threshold

    log_out_f.write('Analyzing coverage...\n')
    if qconfig.show_snps:
        log_out_f.write('Writing SNPs into ' + used_snps_fpath + '\n')
    result.update(analyze_coverage(ca_output, regions, ref_aligns, ref_features, snps, total_indels_info))
    result = print_results(contigs_fpath, log_out_f, used_snps_fpath, total_indels_info, result)

    if not qconfig.space_efficient:
        # Output the misassembled contigs to a separate FASTA file.
        fasta = [(name, seq) for name, seq in fastaparser.read_fasta(contigs_fpath)
                 if name in misassembled_contigs]
        fastaparser.write_fasta(join(output_dirpath, qutils.name_from_fpath(contigs_fpath) + '.mis_contigs.fa'), fasta)

    if qconfig.is_combined_ref:
        alignment_tsv_fpath = join(output_dirpath, "alignments_" + corr_assembly_label + '.tsv')
        unique_contigs_fpath = join(output_dirpath, qconfig.unique_contigs_fname_pattern % corr_assembly_label)
        logger.debug('  ' + qutils.index_to_str(index) + 'Alignments: ' + qutils.relpath(alignment_tsv_fpath))
        used_contigs = set()
        # Compiled once; contig names carrying "_length_<x>_cov_<y>" provide
        # the length and coverage values used below.
        len_cov_pattern = re.compile(r'_length_([\d\.]+)_cov_([\d\.]+)')
        with open(unique_contigs_fpath, 'w') as unique_contigs_f:
            with open(alignment_tsv_fpath, 'w') as alignment_tsv_f:
                for chr_name, chr_aligns in ref_aligns.items():
                    # One TSV row per chromosome: its name, then its contigs.
                    alignment_tsv_f.write(chr_name)
                    contigs = set([align.contig for align in chr_aligns])
                    for contig in contigs:
                        alignment_tsv_f.write('\t' + contig)

                    ref_name = ref_labels_by_chromosomes[chr_name]
                    align_by_contigs = defaultdict(int)
                    for align in chr_aligns:
                        align_by_contigs[align.contig] += align.len2
                    for contig, aligned_len in align_by_contigs.items():
                        # Each contig is considered only for the first
                        # chromosome it is seen on.
                        if contig in used_contigs:
                            continue
                        used_contigs.add(contig)
                        len_cov_match = len_cov_pattern.search(contig)
                        if len_cov_match:
                            contig_len, contig_cov = len_cov_match.groups()
                            # Record contigs aligned over more than 90% of
                            # the length stated in their name.
                            if aligned_len / float(contig_len) > 0.9:
                                unique_contigs_f.write(ref_name + '\t' + str(aligned_len) + '\t' + contig_cov + '\n')
                    alignment_tsv_f.write('\n')

    close_handlers(ca_output)
    logger.info('  ' + qutils.index_to_str(index) + 'Analysis is finished.')
    logger.debug('')
    clean_tmp_files(nucmer_fpath)
    if not qconfig.no_gzip:
        compress_nucmer_output(logger, nucmer_fpath)
    if not ref_aligns:
        return NucmerStatus.NOT_ALIGNED, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
    else:
        return NucmerStatus.OK, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
import xml.etree.ElementTree as ET
import socket
# Process-wide default timeout, in seconds, for sockets created after this point.
socket.setdefaulttimeout(120)

# Case-insensitive regexes with named groups: silva_pattern captures `taxons`
# and `seqname`; ncbi_pattern captures `id` and `seqname`.
# NOTE(review): presumably matched against SILVA / NCBI sequence headers --
# confirm at the use sites.
silva_pattern = re.compile(r'\S+\_(?P<taxons>\S+);(?P<seqname>\S+)', re.I)
ncbi_pattern = re.compile(r'(?P<id>\S+\_[0-9.]+)[_ |](?P<seqname>\S+)', re.I)

# SILVA release 123: export URL, FASTA file name, and the local name
# ('silva.123.db') derived from the release id.
silva_db_url = 'http://www.arb-silva.de/fileadmin/silva_databases/release_123/Exports/'
silva_fname = 'SILVA_123_SSURef_Nr99_tax_silva.fasta'
silva_id = '123'
silva_downloaded_fname = 'silva.' + silva_id + '.db'

# BLAST location under QUAST_HOME/external_tools/blast/<platform_name>, and
# the URL built by appending qutils.relpath(<that dir>, QUAST_HOME) to
# qconfig.GIT_ROOT_URL.
external_tools_dirpath = join(qconfig.QUAST_HOME, 'external_tools')
blast_external_tools_dirpath = join(external_tools_dirpath, 'blast', qconfig.platform_name)
blast_filenames = ['makeblastdb', 'blastn']
blast_dirpath_url = qconfig.GIT_ROOT_URL + qutils.relpath(blast_external_tools_dirpath, qconfig.QUAST_HOME)

# Unset at import time.
# NOTE(review): presumably filled in once BLAST and the database are located.
blast_dirpath = None
blastdb_dirpath = None
db_fpath = None
# NOTE(review): presumably the expected size in bytes of the database's .nsq
# file -- confirm where it is compared.
db_nsq_fsize = 194318557

# Module-level state, shown here with its initial values.
is_quast_first_run = False
taxons_for_krona = {}
connection_errors = 0


def get_blast_fpath(fname):
    if blast_dirpath:
        blast_path = os.path.join(blast_dirpath, fname)
        if os.path.exists(blast_path):