예제 #1
0
파일: qutils.py 프로젝트: rpscruz/CS175PS
def correct_fasta(original_fpath, min_contig, corrected_fpath=None, is_reference=False):
    modified_fasta_entries = []
    used_seq_names = defaultdict(int)
    for first_line, seq in fastaparser.read_fasta(original_fpath):
        if not first_line:
            logger.error('Skipping ' + original_fpath + ' because >sequence_name field is empty.', indent='    ')
            return False
        if (len(seq) >= min_contig) or is_reference:
            corr_name = correct_name(first_line)
            uniq_name = get_uniq_name(corr_name, used_seq_names)
            used_seq_names[corr_name] += 1

            if not qconfig.no_check:
                # seq to uppercase, because we later looking only uppercase letters
                corr_seq = correct_seq(seq, original_fpath)
                if not corr_seq:
                    return False
            else:
                if re.compile(r'[^ACGTN]').search(seq):
                    logger.error('File ' + original_fpath + ' contains non-ACGTN characters. '
                                 'Please re-run QUAST without --no-check.', indent='    ', exit_with_code=1)
                    return False
                corr_seq = seq
            modified_fasta_entries.append((uniq_name, corr_seq))

    if not modified_fasta_entries:
        logger.warning('Skipping ' + original_fpath + ' because file is empty.', indent='    ')
        return False
    if corrected_fpath:
        fastaparser.write_fasta(corrected_fpath, modified_fasta_entries)
    return True
예제 #2
0
파일: qutils.py 프로젝트: student-t/quast
def broke_scaffolds(file_counter, labels, contigs_fpath, corrected_dirpath, logs):
    logs.append('  ' + index_to_str(file_counter, force=(len(labels) > 1)) + '  breaking scaffolds into contigs:')
    contigs_fname = os.path.basename(contigs_fpath)
    fname, fasta_ext = splitext_for_fasta_file(contigs_fname)
    label = labels[file_counter]
    corr_fpath = unique_corrected_fpath(os.path.join(corrected_dirpath, slugify(label) + fasta_ext))
    corr_fpath_wo_ext = os.path.join(corrected_dirpath, name_from_fpath(corr_fpath))
    broken_scaffolds_fpath = corr_fpath_wo_ext + '_broken' + fasta_ext
    broken_scaffolds_fasta = []
    contigs_counter = 0

    scaffold_counter = 0
    for scaffold_counter, (name, seq) in enumerate(fastaparser.read_fasta(contigs_fpath)):
        if contigs_counter % 100 == 0:
            pass
        if contigs_counter > 520:
            pass
        total_contigs_for_the_scaf = split_by_ns(seq, name, broken_scaffolds_fasta, qconfig.Ns_break_threshold, qconfig.min_contig)
        contigs_counter += total_contigs_for_the_scaf
    if contigs_counter > scaffold_counter + 1:
        fastaparser.write_fasta(broken_scaffolds_fpath, broken_scaffolds_fasta)
        logs.append("  " + index_to_str(file_counter, force=(len(labels) > 1)) +
                    "    %d scaffolds (%s) were broken into %d contigs (%s)" %
                    (scaffold_counter + 1,
                     label,
                     contigs_counter,
                     label + '_broken'))
        return broken_scaffolds_fpath, logs

    logs.append("  " + index_to_str(file_counter, force=(len(labels) > 1)) +
                "    WARNING: nothing was broken, skipping '%s broken' from further analysis" % label)
    return None, logs
예제 #3
0
def split_file_with_transcripts(transcripts_dict, us_threads, output_dirs):
    f_fa_pathes = []

    file_n = len(transcripts_dict)
    thread_n = file_n // us_threads

    id_transcripts = list(transcripts_dict.keys())
    for i_thread in range(us_threads):
        fpath = os.path.join(output_dirs[i_thread],
                             '{}.fasta'.format(i_thread))
        f_fa_pathes.append(fpath)
        for i_transcript in range(thread_n):
            fastaparser.write_fasta(
                fpath, [(id_transcripts[i_thread * thread_n + i_transcript],
                         transcripts_dict[id_transcripts[i_thread * thread_n +
                                                         i_transcript]])],
                mode='a')

    for i_thread in range(file_n - us_threads * thread_n):
        fastaparser.write_fasta(
            f_fa_pathes[i_thread],
            [(id_transcripts[us_threads * thread_n + i_thread],
              transcripts_dict[id_transcripts[us_threads * thread_n +
                                              i_thread]])],
            mode='a')

    return f_fa_pathes
예제 #4
0
def correct_fasta(original_fpath, min_contig, corrected_fpath=None, is_reference=False):
    modified_fasta_entries = []
    used_seq_names = defaultdict(int)
    for first_line, seq in fastaparser.read_fasta(original_fpath):
        if not first_line:
            logger.error('Skipping ' + original_fpath + ' because >sequence_name field is empty.', indent='    ')
            return False
        if (len(seq) >= min_contig) or is_reference:
            corr_name = correct_name(first_line)
            uniq_name = get_uniq_name(corr_name, used_seq_names)
            used_seq_names[corr_name] += 1

            if not qconfig.no_check:
                # seq to uppercase, because we later looking only uppercase letters
                corr_seq = correct_seq(seq, original_fpath)
                if not corr_seq:
                    return False
            else:
                corr_seq = seq
            modified_fasta_entries.append((uniq_name, corr_seq))

    if not modified_fasta_entries:
        logger.warning('Skipping ' + original_fpath + ' because file is empty.', indent='    ')
        return False
    if corrected_fpath:
        fastaparser.write_fasta(corrected_fpath, modified_fasta_entries)
    return True
예제 #5
0
def broke_scaffolds(file_counter, labels, contigs_fpath, corrected_dirpath, logs):
    logs.append('  ' + index_to_str(file_counter, force=(len(labels) > 1)) + '  breaking scaffolds into contigs:')
    contigs_fname = os.path.basename(contigs_fpath)
    fname, fasta_ext = splitext_for_fasta_file(contigs_fname)
    label = labels[file_counter]
    corr_fpath = unique_corrected_fpath(os.path.join(corrected_dirpath, slugify(label) + fasta_ext))
    corr_fpath_wo_ext = os.path.join(corrected_dirpath, name_from_fpath(corr_fpath))
    broken_scaffolds_fpath = corr_fpath_wo_ext + '_broken' + fasta_ext
    broken_scaffolds_fasta = []
    contigs_counter = 0

    scaffold_counter = 0
    for scaffold_counter, (name, seq) in enumerate(fastaparser.read_fasta(contigs_fpath)):
        if contigs_counter % 100 == 0:
            pass
        if contigs_counter > 520:
            pass
        total_contigs_for_the_scaf = split_by_ns(seq, name, broken_scaffolds_fasta, qconfig.Ns_break_threshold, qconfig.min_contig)
        contigs_counter += total_contigs_for_the_scaf
    if contigs_counter > scaffold_counter + 1:
        fastaparser.write_fasta(broken_scaffolds_fpath, broken_scaffolds_fasta)
        logs.append("  " + index_to_str(file_counter, force=(len(labels) > 1)) +
                    "    %d scaffolds (%s) were broken into %d contigs (%s)" %
                    (scaffold_counter + 1,
                     label,
                     contigs_counter,
                     label + '_broken'))
        return broken_scaffolds_fpath, logs

    logs.append("  " + index_to_str(file_counter, force=(len(labels) > 1)) +
                "    WARNING: nothing was broken, skipping '%s broken' from further analysis" % label)
    return None, logs
예제 #6
0
def fill_gaps_mate_pair(bam_fpath, ref_fpath, assembly_fpath, assembly_covered_regions, output_dir, uncovered_fpath, err_fpath):
    matepair_reads_covered_regions = parse_uncovered_fpath(uncovered_fpath, ref_fpath, return_covered_regions=True)
    final_fasta = []
    matepair_regions = connect_with_matepairs(bam_fpath, output_dir, err_fpath)
    final_assembly_fpath = add_suffix(assembly_fpath, mp_polished_suffix)
    for name, seq in fastaparser.read_fasta(ref_fpath):
        covered_regions = list(find_overlaps(assembly_covered_regions[name], matepair_reads_covered_regions[name], overlap=50))
        total_contigs = 0
        if name not in matepair_regions or len(covered_regions) == 1:
            for region in covered_regions:
                final_fasta.append((name.split()[0] + "_" + str(total_contigs + 1), seq[region[0]: region[1]]))
                total_contigs += 1
        else:
            frags_to_merge = [covered_regions.pop(0)]
            sorted_mp_intervals = sorted(matepair_regions[name])
            while covered_regions:
                region2 = covered_regions.pop(0)
                if is_overlapped(frags_to_merge[-1], region2, sorted_mp_intervals):
                    frags_to_merge.append(region2)
                else:
                    merged_seq = merge_fragments_with_ns(seq, frags_to_merge)
                    final_fasta.append((name.split()[0] + "_" + str(total_contigs + 1), merged_seq))
                    total_contigs += 1
                    frags_to_merge = [region2]
            if frags_to_merge:
                merged_seq = merge_fragments_with_ns(seq, frags_to_merge)
                final_fasta.append((name.split()[0] + "_" + str(total_contigs + 1), merged_seq))
                total_contigs += 1
    fastaparser.write_fasta(final_assembly_fpath, final_fasta)
    return final_assembly_fpath
예제 #7
0
def correct_fasta(original_fpath, corrected_fpath, min_contig,
                  is_reference=False):
    modified_fasta_entries = []
    used_seq_names = defaultdict(int)
    for first_line, seq in fastaparser.read_fasta(original_fpath):
        if not first_line:
            logger.warning('Skipping ' + original_fpath + ' because >sequence_name field is empty.',
                    indent='    ')
            return False
        if (len(seq) >= min_contig) or is_reference:
            corr_name = correct_name(first_line)
            uniq_name = get_uniq_name(corr_name, used_seq_names)
            used_seq_names[corr_name] += 1

            if not qconfig.no_check:
                # seq to uppercase, because we later looking only uppercase letters
                corr_seq = correct_seq(seq, original_fpath)
                if not corr_seq:
                    return False
            else:
                corr_seq = seq
            modified_fasta_entries.append((uniq_name, corr_seq))

    fastaparser.write_fasta(corrected_fpath, modified_fasta_entries)

    if is_reference:
        ref_len = sum(len(chr_seq) for (chr_name, chr_seq) in modified_fasta_entries)
        if ref_len > qconfig.MAX_REFERENCE_FILE_LENGTH:
            qconfig.splitted_ref = []  # important for MetaQUAST which runs QUAST multiple times
            _, fasta_ext = os.path.splitext(corrected_fpath)
            split_ref_dirpath = os.path.join(os.path.dirname(corrected_fpath), 'split_ref')
            if os.path.exists(split_ref_dirpath):
                shutil.rmtree(split_ref_dirpath, ignore_errors=True)
            os.makedirs(split_ref_dirpath)
            max_len = min(ref_len/qconfig.max_threads, qconfig.MAX_REFERENCE_LENGTH)
            cur_part_len = 0
            cur_part_num = 1
            cur_part_fpath = os.path.join(split_ref_dirpath, "part_%d" % cur_part_num) + fasta_ext

            for (chr_name, chr_seq) in modified_fasta_entries:
                cur_chr_len = len(chr_seq)
                if cur_chr_len > qconfig.MAX_REFERENCE_LENGTH:
                    logger.warning("Skipping chromosome " + chr_name + " because its length is greater than " +
                            str(qconfig.MAX_REFERENCE_LENGTH) + " (Nucmer's constraint).")
                    continue

                cur_part_len += cur_chr_len
                if cur_part_len > max_len and cur_part_len != cur_chr_len:
                    qconfig.splitted_ref.append(cur_part_fpath)
                    cur_part_len = cur_chr_len
                    cur_part_num += 1
                    cur_part_fpath = os.path.join(split_ref_dirpath, "part_%d" % cur_part_num) + fasta_ext
                fastaparser.write_fasta(cur_part_fpath, [(chr_name, chr_seq)], mode='a')
            if cur_part_len > 0:
                qconfig.splitted_ref.append(cur_part_fpath)
            if len(qconfig.splitted_ref) == 0:
                logger.warning("Skipping reference because all of its chromosomes exceeded Nucmer's constraint.")
                return False
    return True
예제 #8
0
def broke_scaffolds(file_counter, labels, contigs_fpath, corrected_dirpath, logs):
    logs.append('  ' + index_to_str(file_counter, force=(len(labels) > 1)) + '  breaking scaffolds into contigs:')
    contigs_fname = os.path.basename(contigs_fpath)
    fname, fasta_ext = splitext_for_fasta_file(contigs_fname)
    label = labels[file_counter]
    corr_fpath = unique_corrected_fpath(os.path.join(corrected_dirpath, slugify(label) + fasta_ext))
    corr_fpath_wo_ext = os.path.join(corrected_dirpath, name_from_fpath(corr_fpath))
    broken_scaffolds_fpath = corr_fpath_wo_ext + '_broken' + fasta_ext
    broken_scaffolds_fasta = []
    contigs_counter = 0

    scaffold_counter = 0
    is_broken = False
    for scaffold_counter, (name, seq) in enumerate(fastaparser.read_fasta(contigs_fpath)):
        if contigs_counter % 100 == 0:
            pass
        if contigs_counter > 520:
            pass
        cumul_contig_length = 0
        total_contigs_for_the_scaf = 0
        cur_contig_start = 0
        while (cumul_contig_length < len(seq)) and (seq.find('N', cumul_contig_length) != -1):
            start = seq.find("N", cumul_contig_length)
            end = start + 1
            while (end != len(seq)) and (seq[end] == 'N'):
                end += 1

            cumul_contig_length = end + 1
            if end - start >= qconfig.Ns_break_threshold:
                is_broken = True
                if start - cur_contig_start >= qconfig.min_contig:
                    broken_scaffolds_fasta.append(
                        (name.split()[0] + "_" +
                         str(total_contigs_for_the_scaf + 1),
                         seq[cur_contig_start:start]))
                    total_contigs_for_the_scaf += 1
                cur_contig_start = end

        if len(seq) - cur_contig_start >= qconfig.min_contig:
            broken_scaffolds_fasta.append(
                (name.split()[0] + "_" +
                 str(total_contigs_for_the_scaf + 1),
                 seq[cur_contig_start:]))
            total_contigs_for_the_scaf += 1

        contigs_counter += total_contigs_for_the_scaf
    if is_broken:
        fastaparser.write_fasta(broken_scaffolds_fpath, broken_scaffolds_fasta)
        logs.append("  " + index_to_str(file_counter, force=(len(labels) > 1)) +
                    "    %d scaffolds (%s) were broken into %d contigs (%s)" %
                    (scaffold_counter + 1,
                     label,
                     contigs_counter,
                     label + '_broken'))
        return broken_scaffolds_fpath, logs

    logs.append("  " + index_to_str(file_counter, force=(len(labels) > 1)) +
                "    WARNING: nothing was broken, skipping '%s broken' from further analysis" % label)
    return None, logs
예제 #9
0
파일: metautils.py 프로젝트: nwespe/quast
def parallel_partition_contigs(asm, assemblies_by_ref, corrected_dirpath,
                               alignments_fpath_template):
    assembly_label = qutils.label_from_fpath(asm.fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(asm.fpath)
    logger.info('  ' + 'processing ' + assembly_label)
    added_ref_asm = []
    not_aligned_fname = corr_assembly_label + '_not_aligned_anywhere.fasta'
    not_aligned_fpath = os.path.join(corrected_dirpath, not_aligned_fname)
    contigs = {}
    aligned_contig_names = set()
    aligned_contigs_for_each_ref = {}
    contigs_seq = fastaparser.read_fasta_one_time(asm.fpath)
    alignments_fpath = alignments_fpath_template % corr_assembly_label
    if os.path.exists(alignments_fpath):
        with open(alignments_fpath) as f:
            for line in f:
                values = line.split()
                if values[
                        0] in contigs_analyzer.ref_labels_by_chromosomes.keys(
                        ):
                    ref_name = contigs_analyzer.ref_labels_by_chromosomes[
                        values[0]]
                    ref_contigs_names = values[1:]
                    ref_contigs_fpath = os.path.join(
                        corrected_dirpath,
                        corr_assembly_label + '_to_' + ref_name + '.fasta')
                    if ref_name not in aligned_contigs_for_each_ref:
                        aligned_contigs_for_each_ref[ref_name] = []

                    for (cont_name, seq) in contigs_seq:
                        if not cont_name in contigs:
                            contigs[cont_name] = seq

                        if cont_name in ref_contigs_names and cont_name not in aligned_contigs_for_each_ref[
                                ref_name]:
                            # Collecting all aligned contigs names in order to further extract not aligned
                            aligned_contig_names.add(cont_name)
                            aligned_contigs_for_each_ref[ref_name].append(
                                cont_name)
                            fastaparser.write_fasta(ref_contigs_fpath,
                                                    [(cont_name, seq)], 'a')

                    ref_asm = Assembly(ref_contigs_fpath, assembly_label)
                    if ref_asm.name not in added_ref_asm:
                        if ref_name in assemblies_by_ref:
                            assemblies_by_ref[ref_name].append(ref_asm)
                            added_ref_asm.append(ref_asm.name)
        if qconfig.space_efficient:
            os.remove(alignments_fpath)

    # Extraction not aligned contigs
    all_contigs_names = set(contigs.keys())
    not_aligned_contigs_names = all_contigs_names - aligned_contig_names
    fastaparser.write_fasta(not_aligned_fpath,
                            [(name, contigs[name])
                             for name in not_aligned_contigs_names])

    not_aligned_asm = Assembly(not_aligned_fpath, asm.label)
    return assemblies_by_ref, not_aligned_asm
예제 #10
0
def glimmerHMM(tool_dir, fasta_fpath, out_fpath, gene_lengths, err_path, tmp_dir, index):
    def run(contig_path, tmp_path):
        with open(err_path, 'a') as err_file:
            return_code = qutils.call_subprocess(
                [tool_exec, contig_path, '-d', trained_dir, '-g', '-o', tmp_path],
                stdout=err_file,
                stderr=err_file,
                indent='  ' + qutils.index_to_str(index) + '  ')
            return return_code

    tool_exec = os.path.join(tool_dir, 'glimmerhmm')

    # Note: why arabidopsis? for no particular reason, really.
    trained_dir = os.path.join(tool_dir, 'trained', 'arabidopsis')

    contigs = {}
    gffs = []
    base_dir = tempfile.mkdtemp(dir=tmp_dir)
    for seq_num, (ind, seq) in enumerate(read_fasta(fasta_fpath)):
        seq_num = str(seq_num)
        ind = ind[:qutils.MAX_CONTIG_NAME_GLIMMER]
        contig_path = os.path.join(base_dir, seq_num + '.fasta')
        gff_path = os.path.join(base_dir, seq_num + '.gff')

        write_fasta(contig_path, [(ind, seq)])
        if run(contig_path, gff_path) == 0:
            gffs.append(gff_path)
            contigs[ind] = seq

    if not gffs:
        return None, None, None, None

    out_gff_fpath = out_fpath + '_genes.gff' + ('.gz' if not qconfig.no_gzip else '')
    out_gff_path = merge_gffs(gffs, out_gff_fpath)
    unique, total = set(), 0
    genes = []
    for contig, gene_id, start, end, strand in parse_gff(out_gff_path):
        total += 1
        if strand == '+':
            gene_seq = contigs[contig][start - 1:end]
        else:
            gene_seq = rev_comp(contigs[contig][start - 1:end])
        if gene_seq not in unique:
            unique.add(gene_seq)
        gene = Gene(contig=contig, start=start, end=end, strand=strand, seq=gene_seq)
        gene.is_full = gene.start > 1 and gene.end < len(contigs[contig])
        genes.append(gene)

    full_cnt = [sum([gene.end - gene.start >= threshold for gene in genes if gene.is_full]) for threshold in gene_lengths]
    partial_cnt = [sum([gene.end - gene.start >= threshold for gene in genes if not gene.is_full]) for threshold in gene_lengths]
    if OUTPUT_FASTA:
        out_fasta_fpath = out_fpath + '_genes.fasta'
        add_genes_to_fasta(genes, out_fasta_fpath)
    if not qconfig.debug:
        shutil.rmtree(base_dir)

    #return out_gff_path, out_fasta_path, len(unique), total, cnt
    return out_gff_path, genes, len(unique), total, full_cnt, partial_cnt
예제 #11
0
파일: glimmer.py 프로젝트: student-t/quast
def glimmerHMM(tool_dir, fasta_fpath, out_fpath, gene_lengths, err_path, tmp_dir, index):
    def run(contig_path, tmp_path):
        with open(err_path, 'a') as err_file:
            return_code = qutils.call_subprocess(
                [tool_exec, contig_path, '-d', trained_dir, '-g', '-o', tmp_path],
                stdout=err_file,
                stderr=err_file,
                indent='  ' + qutils.index_to_str(index) + '  ')
            return return_code

    tool_exec = os.path.join(tool_dir, 'glimmerhmm')

    # Note: why arabidopsis? for no particular reason, really.
    trained_dir = os.path.join(tool_dir, 'trained', 'arabidopsis')

    contigs = {}
    gffs = []
    base_dir = tempfile.mkdtemp(dir=tmp_dir)
    for seq_num, (ind, seq) in enumerate(read_fasta(fasta_fpath)):
        seq_num = str(seq_num)
        ind = ind[:qutils.MAX_CONTIG_NAME_GLIMMER]
        contig_path = os.path.join(base_dir, seq_num + '.fasta')
        gff_path = os.path.join(base_dir, seq_num + '.gff')

        write_fasta(contig_path, [(ind, seq)])
        if run(contig_path, gff_path) == 0:
            gffs.append(gff_path)
            contigs[ind] = seq

    if not gffs:
        return None, None, None, None

    out_gff_fpath = out_fpath + '_genes.gff' + ('.gz' if not qconfig.no_gzip else '')
    out_gff_path = merge_gffs(gffs, out_gff_fpath)
    unique, total = set(), 0
    genes = []
    for contig, gene_id, start, end, strand in parse_gff(out_gff_path):
        total += 1
        if strand == '+':
            gene_seq = contigs[contig][start - 1:end]
        else:
            gene_seq = rev_comp(contigs[contig][start - 1:end])
        if gene_seq not in unique:
            unique.add(gene_seq)
        gene = Gene(contig=contig, start=start, end=end, strand=strand, seq=gene_seq)
        gene.is_full = gene.start > 1 and gene.end < len(contigs[contig])
        genes.append(gene)

    full_cnt = [sum([gene.end - gene.start >= threshold for gene in genes if gene.is_full]) for threshold in gene_lengths]
    partial_cnt = [sum([gene.end - gene.start >= threshold for gene in genes if not gene.is_full]) for threshold in gene_lengths]
    if OUTPUT_FASTA:
        out_fasta_fpath = out_fpath + '_genes.fasta'
        add_genes_to_fasta(genes, out_fasta_fpath)
    if not qconfig.debug:
        shutil.rmtree(base_dir)

    #return out_gff_path, out_fasta_path, len(unique), total, cnt
    return out_gff_path, genes, len(unique), total, full_cnt, partial_cnt
예제 #12
0
파일: genemark.py 프로젝트: nwespe/quast
def add_genes_to_fasta(genes, fasta_fpath):
    def inner():
        for i, gene in enumerate(genes):
            length = gene.end - gene.start
            gene_id = '>gene_%d|GeneMark.hmm|%d_nt|%s|%d|%d|%s' % (
                i + 1, length, gene.strand, gene.start, gene.end, gene.contig)
            yield gene_id, gene.seq

    write_fasta(fasta_fpath, inner())
예제 #13
0
    def print_fa_transcripts(self, transcripts_dict, ids_transcripts, path_fasta, logger, transcripts_name=''):
        logger.info('    Getting {} transcripts report...'.format(transcripts_name))

        chosen_transcripts = []
        for id_transcript in ids_transcripts:
            chosen_transcripts.append((id_transcript, transcripts_dict[id_transcript]))

        fastaparser.write_fasta(path_fasta, sorted(chosen_transcripts))

        logger.info('      saved to {}'.format(path_fasta))
예제 #14
0
    def print_unannotated_transcripts(self, transcripts_dict, path_fa_unannotated, logger):
        logger.info('    Getting Unannotated transcripts report...')

        unannotated_transcripts = []
        for id_transcript in self.ids_unannotated_transcripts:
            unannotated_transcripts.append((id_transcript, transcripts_dict[id_transcript]))

        fastaparser.write_fasta(path_fa_unannotated, unannotated_transcripts)

        logger.info('      saved to {}'.format(path_fa_unannotated))
예제 #15
0
def add_genes_to_fasta(genes, fasta_fpath):
    def inner():
        for i, gene in enumerate(genes):
            length = gene.end - gene.start
            gene_id = '>gene_%d|GeneMark.hmm|%d_nt|%s|%d|%d|%s' % (
                i + 1, length, gene.strand, gene.start, gene.end, gene.contig
            )
            yield gene_id, gene.seq

    write_fasta(fasta_fpath, inner())
예제 #16
0
def broke_scaffolds(file_counter, labels, contigs_fpath, corrected_dirpath, logs):
    logs.append('  ' + index_to_str(file_counter, force=(len(labels) > 1)) + '  breaking scaffolds into contigs:')
    contigs_fname = os.path.basename(contigs_fpath)
    fname, fasta_ext = splitext_for_fasta_file(contigs_fname)
    label = labels[file_counter]
    corr_fpath = unique_corrected_fpath(os.path.join(corrected_dirpath, slugify(label) + fasta_ext))
    corr_fpath_wo_ext = os.path.join(corrected_dirpath, name_from_fpath(corr_fpath))
    broken_scaffolds_fpath = corr_fpath_wo_ext + '_broken' + fasta_ext
    broken_scaffolds_fasta = []
    contigs_counter = 0

    scaffold_counter = 0
    for scaffold_counter, (name, seq) in enumerate(fastaparser.read_fasta(contigs_fpath)):
        if contigs_counter % 100 == 0:
            pass
        if contigs_counter > 520:
            pass
        cumul_contig_length = 0
        total_contigs_for_the_scaf = 1
        cur_contig_start = 0
        while (cumul_contig_length < len(seq)) and (seq.find('N', cumul_contig_length) != -1):
            start = seq.find("N", cumul_contig_length)
            end = start + 1
            while (end != len(seq)) and (seq[end] == 'N'):
                end += 1

            cumul_contig_length = end + 1
            if (end - start) >= qconfig.Ns_break_threshold:
                broken_scaffolds_fasta.append(
                    (name.split()[0] + "_" +
                     str(total_contigs_for_the_scaf),
                     seq[cur_contig_start:start]))
                total_contigs_for_the_scaf += 1
                cur_contig_start = end

        broken_scaffolds_fasta.append(
            (name.split()[0] + "_" +
             str(total_contigs_for_the_scaf),
             seq[cur_contig_start:]))

        contigs_counter += total_contigs_for_the_scaf
    if scaffold_counter + 1 != contigs_counter:
        fastaparser.write_fasta(broken_scaffolds_fpath, broken_scaffolds_fasta)
        logs.append("  " + index_to_str(file_counter, force=(len(labels) > 1)) +
                    "    %d scaffolds (%s) were broken into %d contigs (%s)" %
                    (scaffold_counter + 1,
                     label,
                     contigs_counter,
                     label + '_broken'))
        return broken_scaffolds_fpath, logs

    logs.append("  " + index_to_str(file_counter, force=(len(labels) > 1)) +
            "    WARNING: nothing was broken, skipping '%s broken' from further analysis" % label)
    return None, logs
예제 #17
0
def fill_gaps_single(ref_fpath, assembly_fpath, assembly_covered_regions, uncovered_fpath):
    single_reads_covered_regions = parse_uncovered_fpath(uncovered_fpath, ref_fpath, return_covered_regions=True)
    final_assembly_fpath = add_suffix(assembly_fpath, single_polished_suffix)
    final_fasta = []
    for name, seq in fastaparser.read_fasta(ref_fpath):
        covered_regions = find_overlaps(assembly_covered_regions[name], single_reads_covered_regions[name], overlap=50)
        for i, region in enumerate(covered_regions):
            start, end = region
            final_fasta.append((name.split()[0] + "_" + str(i + 1), seq[start: end]))
    fastaparser.write_fasta(final_assembly_fpath, final_fasta)
    return final_assembly_fpath
예제 #18
0
def parallel_partition_contigs(asm, assemblies_by_ref, corrected_dirpath, alignments_fpath_template):
    assembly_label = qutils.label_from_fpath(asm.fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(asm.fpath)
    logger.info('  ' + 'processing ' + assembly_label)
    added_ref_asm = []
    not_aligned_fname = corr_assembly_label + '_not_aligned_anywhere.fasta'
    not_aligned_fpath = os.path.join(corrected_dirpath, not_aligned_fname)
    contigs = {}
    aligned_contig_names = set()
    aligned_contigs_for_each_ref = {}
    contigs_seq = fastaparser.read_fasta_one_time(asm.fpath)
    alignments_fpath = alignments_fpath_template % corr_assembly_label
    if os.path.exists(alignments_fpath):
        with open(alignments_fpath) as f:
            for line in f:
                values = line.split()
                if values[0] in contigs_analyzer.ref_labels_by_chromosomes.keys():
                    ref_name = contigs_analyzer.ref_labels_by_chromosomes[values[0]]
                    ref_contigs_names = values[1:]
                    ref_contigs_fpath = os.path.join(
                        corrected_dirpath, corr_assembly_label + '_to_' + ref_name + '.fasta')
                    if ref_name not in aligned_contigs_for_each_ref:
                        aligned_contigs_for_each_ref[ref_name] = []

                    for (cont_name, seq) in contigs_seq:
                        if not cont_name in contigs:
                            contigs[cont_name] = seq

                        if cont_name in ref_contigs_names and cont_name not in aligned_contigs_for_each_ref[ref_name]:
                            # Collecting all aligned contigs names in order to further extract not aligned
                            aligned_contig_names.add(cont_name)
                            aligned_contigs_for_each_ref[ref_name].append(cont_name)
                            fastaparser.write_fasta(ref_contigs_fpath, [(cont_name, seq)], 'a')

                    ref_asm = Assembly(ref_contigs_fpath, assembly_label)
                    if ref_asm.name not in added_ref_asm:
                        if ref_name in assemblies_by_ref:
                            assemblies_by_ref[ref_name].append(ref_asm)
                            added_ref_asm.append(ref_asm.name)
        if qconfig.space_efficient:
            os.remove(alignments_fpath)

    # Extraction not aligned contigs
    all_contigs_names = set(contigs.keys())
    not_aligned_contigs_names = all_contigs_names - aligned_contig_names
    fastaparser.write_fasta(not_aligned_fpath, [(name, contigs[name]) for name in not_aligned_contigs_names])

    not_aligned_asm = Assembly(not_aligned_fpath, asm.label)
    return assemblies_by_ref, not_aligned_asm
예제 #19
0
def preprocess_reference(ref_fpath, tmp_dir, uncovered_fpath):
    uncovered_regions = parse_uncovered_fpath(uncovered_fpath, ref_fpath, return_covered_regions=False)
    splitted_fasta = []
    for name, seq in fastaparser.read_fasta(ref_fpath):
        if name in uncovered_regions:
            cur_contig_start = 0
            total_contigs = 0
            for start, end in uncovered_regions[name]:
                total_contigs = split_by_ns(seq[cur_contig_start: start], name, splitted_fasta, total_contigs=total_contigs)
                cur_contig_start = end
            split_by_ns(seq[cur_contig_start:], name, splitted_fasta, total_contigs=total_contigs)
        else:
            split_by_ns(seq, name, splitted_fasta)
    processed_ref_fpath = join(tmp_dir, basename(ref_fpath))
    fastaparser.write_fasta(processed_ref_fpath, splitted_fasta)
    return processed_ref_fpath
예제 #20
0
    def _proceed_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references, ref_fpath):
        seq_fname = ref_name
        seq_fname += ref_fasta_ext

        if total_references > 1:
            corr_seq_fpath = corrected_ref_fpaths[-1]
        else:
            corr_seq_fpath = qutils.unique_corrected_fpath(os.path.join(corrected_dirpath, seq_fname))
            corrected_ref_fpaths.append(corr_seq_fpath)
        corr_seq_name = qutils.name_from_fpath(corr_seq_fpath) + '_' + seq_name
        if not qconfig.no_check:
            corr_seq = correct_seq(seq, ref_fpath)
            if not corr_seq:
                return None, None

        fastaparser.write_fasta(corr_seq_fpath, [(corr_seq_name, seq)], 'a')

        contigs_analyzer.ref_labels_by_chromosomes[corr_seq_name] = qutils.name_from_fpath(corr_seq_fpath)
        chromosomes_by_refs[ref_name].append((corr_seq_name, len(seq)))

        return corr_seq_name, corr_seq_fpath
예제 #21
0
    def _proceed_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references, ref_fpath):
        seq_fname = ref_name
        seq_fname += ref_fasta_ext

        if total_references > 1:
            corr_seq_fpath = corrected_ref_fpaths[-1]
        else:
            corr_seq_fpath = qutils.unique_corrected_fpath(os.path.join(corrected_dirpath, seq_fname))
            corrected_ref_fpaths.append(corr_seq_fpath)
        corr_seq_name = qutils.name_from_fpath(corr_seq_fpath) + '_' + seq_name
        if not qconfig.no_check:
            corr_seq = correct_seq(seq, ref_fpath)
            if not corr_seq:
                return None, None

        fastaparser.write_fasta(corr_seq_fpath, [(corr_seq_name, seq)], 'a')

        contigs_analyzer.ref_labels_by_chromosomes[corr_seq_name] = qutils.name_from_fpath(corr_seq_fpath)
        chromosomes_by_refs[ref_name].append((corr_seq_name, len(seq)))

        return corr_seq_name, corr_seq_fpath
예제 #22
0
def correct_fasta(original_fpath, corrected_fpath, min_contig,
                  is_reference=False):
    modified_fasta_entries = []
    used_seq_names = defaultdict(int)
    for first_line, seq in fastaparser.read_fasta(original_fpath):
        if not first_line:
            logger.warning('Skipping ' + original_fpath + ' because >sequence_name field is empty.',
                    indent='    ')
            return False
        if (len(seq) >= min_contig) or is_reference:
            corr_name = correct_name(first_line)
            uniq_name = get_uniq_name(corr_name, used_seq_names)
            used_seq_names[corr_name] += 1

            if not qconfig.no_check:
                # seq to uppercase, because we later looking only uppercase letters
                corr_seq = correct_seq(seq, original_fpath)
                if not corr_seq:
                    return False
            else:
                corr_seq = seq
            modified_fasta_entries.append((uniq_name, corr_seq))

    if not modified_fasta_entries:
        logger.warning('Skipping ' + original_fpath + ' because file is empty.', indent='    ')
        return False

    fastaparser.write_fasta(corrected_fpath, modified_fasta_entries)

    if is_reference:
        ref_len = sum(len(chr_seq) for (chr_name, chr_seq) in modified_fasta_entries)
        if ref_len > qconfig.MAX_REFERENCE_FILE_LENGTH:
            qconfig.splitted_ref = []  # important for MetaQUAST which runs QUAST multiple times
            _, fasta_ext = os.path.splitext(corrected_fpath)
            split_ref_dirpath = os.path.join(os.path.dirname(corrected_fpath), 'split_ref')
            if os.path.exists(split_ref_dirpath):
                shutil.rmtree(split_ref_dirpath, ignore_errors=True)
            os.makedirs(split_ref_dirpath)
            max_len = min(ref_len/qconfig.max_threads, qconfig.MAX_REFERENCE_LENGTH)
            cur_part_len = 0
            cur_part_num = 1
            cur_part_fpath = os.path.join(split_ref_dirpath, "part_%d" % cur_part_num) + fasta_ext

            for (chr_name, chr_seq) in modified_fasta_entries:
                cur_chr_len = len(chr_seq)
                if cur_chr_len > qconfig.MAX_REFERENCE_LENGTH:
                    logger.warning("Skipping chromosome " + chr_name + " because its length is greater than " +
                            str(qconfig.MAX_REFERENCE_LENGTH) + " (Nucmer's constraint).")
                    continue

                cur_part_len += cur_chr_len
                if cur_part_len > max_len and cur_part_len != cur_chr_len:
                    qconfig.splitted_ref.append(cur_part_fpath)
                    cur_part_len = cur_chr_len
                    cur_part_num += 1
                    cur_part_fpath = os.path.join(split_ref_dirpath, "part_%d" % cur_part_num) + fasta_ext
                fastaparser.write_fasta(cur_part_fpath, [(chr_name, chr_seq)], mode='a')
            if cur_part_len > 0:
                qconfig.splitted_ref.append(cur_part_fpath)
            if len(qconfig.splitted_ref) == 0:
                logger.warning("Skipping reference because all of its chromosomes exceeded Nucmer's constraint.")
                return False
    return True
예제 #23
0
def correct_meta_references(ref_fpaths, corrected_dirpath, downloaded_refs=False):
    corrected_ref_fpaths = []

    combined_ref_fpath = os.path.join(corrected_dirpath, qconfig.combined_ref_name)

    chromosomes_by_refs = {}

    def _proceed_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references, ref_fpath):
        seq_fname = ref_name
        seq_fname += ref_fasta_ext

        if total_references > 1:
            corr_seq_fpath = corrected_ref_fpaths[-1]
        else:
            corr_seq_fpath = qutils.unique_corrected_fpath(os.path.join(corrected_dirpath, seq_fname))
            corrected_ref_fpaths.append(corr_seq_fpath)
        corr_seq_name = qutils.name_from_fpath(corr_seq_fpath) + '_' + seq_name
        if not qconfig.no_check:
            corr_seq = correct_seq(seq, ref_fpath)
            if not corr_seq:
                return None, None

        fastaparser.write_fasta(corr_seq_fpath, [(corr_seq_name, seq)], 'a')

        contigs_analyzer.ref_labels_by_chromosomes[corr_seq_name] = qutils.name_from_fpath(corr_seq_fpath)
        chromosomes_by_refs[ref_name].append((corr_seq_name, len(seq)))

        return corr_seq_name, corr_seq_fpath

    ref_fnames = [os.path.basename(ref_fpath) for ref_fpath in ref_fpaths]
    ref_names = []
    for ref_fname in ref_fnames:
        ref_name, ref_fasta_ext = qutils.splitext_for_fasta_file(ref_fname)
        ref_names.append(ref_name)

    excluded_ref_fpaths = []
    ref_names = qutils.process_labels(ref_fpaths)
    for ref_fpath, ref_name in zip(ref_fpaths, ref_names):
        total_references = 0
        ref_fname = os.path.basename(ref_fpath)
        _, ref_fasta_ext = qutils.splitext_for_fasta_file(ref_fname)

        chromosomes_by_refs[ref_name] = []
        used_seq_names = defaultdict(int)

        corr_seq_fpath = None
        for i, (seq_name, seq) in enumerate(fastaparser.read_fasta(ref_fpath)):
            total_references += 1
            seq_name = correct_name(seq_name, qutils.MAX_CONTIG_NAME - len(ref_name) - 1)
            uniq_seq_name = get_uniq_name(seq_name, used_seq_names)
            used_seq_names[seq_name] += 1
            corr_seq_name, corr_seq_fpath = _proceed_seq(uniq_seq_name, seq, ref_name, ref_fasta_ext, total_references, ref_fpath)
            if not corr_seq_name:
                break
        if corr_seq_fpath:
            logger.main_info('  ' + ref_fpath + ' ==> ' + qutils.name_from_fpath(corr_seq_fpath) + '')
            fastaparser.write_fasta(combined_ref_fpath, fastaparser.read_fasta(corr_seq_fpath), 'a')
        elif downloaded_refs:
            logger.warning('Skipping ' + ref_fpath + ' because it'
                           ' is empty or contains incorrect sequences (header-only or with non-ACGTN characters)!')
            # cleaning
            for corr_seq_name, _ in chromosomes_by_refs[ref_name]:
                del contigs_analyzer.ref_labels_by_chromosomes[corr_seq_name]
            del chromosomes_by_refs[ref_name]
            corrected_ref_fpaths.pop()
            excluded_ref_fpaths.append(ref_fpath)
        else:
            logger.error('Reference file ' + ref_fpath +
                         ' is empty or contains incorrect sequences (header-only or with non-ACGTN characters)!',
                         exit_with_code=1)
    for excluded in excluded_ref_fpaths:
        ref_fpaths.remove(excluded)

    if len(chromosomes_by_refs) > 0:
        logger.main_info('  All references were combined in ' + qconfig.combined_ref_name)
    else:
        logger.warning('All references were skipped!')

    return corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_fpaths
예제 #24
0
def do(ref_fpath, original_ref_fpath, output_dirpath):
    logger.print_timestamp()
    logger.main_info("Generating Upper Bound Assembly...")

    if not reads_analyzer.compile_reads_analyzer_tools(logger):
        logger.warning(
            '  Sorry, can\'t create Upper Bound Assembly '
            '(failed to compile necessary third-party read processing tools [bwa, bedtools, minimap2]), skipping...'
        )
        return None

    if qconfig.platform_name == 'linux_32':
        logger.warning(
            '  Sorry, can\'t create Upper Bound Assembly on this platform '
            '(only linux64 and macOS are supported), skipping...')
        return None

    red_dirpath = get_dir_for_download('red', 'Red', ['Red'], logger)
    binary_fpath = download_external_tool('Red',
                                          red_dirpath,
                                          'red',
                                          platform_specific=True,
                                          is_executable=True)
    if not binary_fpath or not os.path.isfile(binary_fpath):
        logger.warning(
            '  Sorry, can\'t create Upper Bound Assembly '
            '(failed to install/download third-party repeat finding tool [Red]), skipping...'
        )
        return None

    insert_size = qconfig.optimal_assembly_insert_size
    if insert_size == 'auto' or not insert_size:
        insert_size = qconfig.optimal_assembly_default_IS

    ref_basename, fasta_ext = splitext_for_fasta_file(
        os.path.basename(ref_fpath))
    result_basename = '%s.%s.is%d.fasta' % (
        ref_basename, qconfig.optimal_assembly_basename, insert_size)
    long_reads = qconfig.pacbio_reads or qconfig.nanopore_reads
    if long_reads:
        result_basename = add_suffix(result_basename,
                                     long_reads_polished_suffix)
    elif qconfig.mate_pairs:
        result_basename = add_suffix(result_basename, mp_polished_suffix)
    result_fpath = os.path.join(output_dirpath, result_basename)

    original_ref_basename, fasta_ext = splitext_for_fasta_file(
        os.path.basename(original_ref_fpath))
    prepared_optimal_assembly_basename = '%s.%s.is%d.fasta' % (
        original_ref_basename, qconfig.optimal_assembly_basename, insert_size)
    if long_reads:
        prepared_optimal_assembly_basename = add_suffix(
            prepared_optimal_assembly_basename, long_reads_polished_suffix)
    elif qconfig.mate_pairs:
        prepared_optimal_assembly_basename = add_suffix(
            prepared_optimal_assembly_basename, mp_polished_suffix)
    ref_prepared_optimal_assembly = os.path.join(
        os.path.dirname(original_ref_fpath),
        prepared_optimal_assembly_basename)
    already_done_fpath = check_prepared_optimal_assembly(
        insert_size, result_fpath, ref_prepared_optimal_assembly)
    if already_done_fpath:
        return already_done_fpath

    uncovered_fpath = None
    reads_analyzer_dir = join(dirname(output_dirpath),
                              qconfig.reads_stats_dirname)
    if qconfig.reads_fpaths or qconfig.reference_sam or qconfig.reference_bam:
        sam_fpath, bam_fpath, uncovered_fpath = reads_analyzer.align_reference(
            ref_fpath,
            reads_analyzer_dir,
            using_reads='all',
            calculate_coverage=True)

    if qconfig.optimal_assembly_insert_size != 'auto' and qconfig.optimal_assembly_insert_size != insert_size:
        calculated_insert_size = qconfig.optimal_assembly_insert_size
        result_fpath = result_fpath.replace('is' + str(insert_size),
                                            'is' + str(calculated_insert_size))
        prepared_optimal_assembly_basename = prepared_optimal_assembly_basename.replace(
            'is' + str(insert_size), 'is' + str(calculated_insert_size))
        insert_size = calculated_insert_size
        ref_prepared_optimal_assembly = os.path.join(
            os.path.dirname(original_ref_fpath),
            prepared_optimal_assembly_basename)
        already_done_fpath = check_prepared_optimal_assembly(
            insert_size, result_fpath, ref_prepared_optimal_assembly)
        if already_done_fpath:
            return already_done_fpath

    log_fpath = os.path.join(output_dirpath, 'upper_bound_assembly.log')
    tmp_dir = os.path.join(output_dirpath, 'tmp')
    if os.path.isdir(tmp_dir):
        shutil.rmtree(tmp_dir)
    os.makedirs(tmp_dir)

    unique_covered_regions, repeats_regions = get_unique_covered_regions(
        ref_fpath,
        tmp_dir,
        log_fpath,
        binary_fpath,
        insert_size,
        uncovered_fpath,
        use_long_reads=long_reads)
    if unique_covered_regions is None:
        logger.error(
            '  Failed to create Upper Bound Assembly, see log for details: ' +
            log_fpath)
        return None

    reference = list(fastaparser.read_fasta(ref_fpath))
    result_fasta = []

    if long_reads or qconfig.mate_pairs:
        if long_reads:
            join_reads = 'pacbio' if qconfig.pacbio_reads else 'nanopore'
        else:
            join_reads = 'mp'
        sam_fpath, bam_fpath, _ = reads_analyzer.align_reference(
            ref_fpath, reads_analyzer_dir, using_reads=join_reads)
        joiners = get_joiners(qutils.name_from_fpath(ref_fpath), sam_fpath,
                              bam_fpath, tmp_dir, log_fpath, join_reads)
        uncovered_regions = parse_bed(
            uncovered_fpath) if join_reads == 'mp' else defaultdict(list)
        mp_len = calculate_read_len(sam_fpath) if join_reads == 'mp' else None
        for chrom, seq in reference:
            region_pairing = get_regions_pairing(unique_covered_regions[chrom],
                                                 joiners[chrom], mp_len)
            ref_coords_to_output = scaffolding(unique_covered_regions[chrom],
                                               region_pairing)
            get_fasta_entries_from_coords(result_fasta, (chrom, seq),
                                          ref_coords_to_output,
                                          repeats_regions[chrom],
                                          uncovered_regions[chrom])
    else:
        for chrom, seq in reference:
            for idx, region in enumerate(unique_covered_regions[chrom]):
                if region[1] - region[0] >= MIN_CONTIG_LEN:
                    result_fasta.append(
                        (chrom + '_' + str(idx), seq[region[0]:region[1]]))

    fastaparser.write_fasta(result_fpath, result_fasta)
    logger.info('  ' + 'Theoretical Upper Bound Assembly is saved to ' +
                result_fpath)
    logger.notice(
        '(on reusing *this* Upper Bound Assembly in the *future* evaluations on *the same* dataset)\n'
        '\tThe next time, you can simply provide this file as an additional assembly (you could also rename it to UpperBound.fasta for the clarity). '
        'In this case, you do not need to specify --upper-bound-assembly and provide files with reads (--pe1/pe2, etc).\n'
        '\t\tOR\n'
        '\tYou can copy ' + result_fpath + ' to ' +
        ref_prepared_optimal_assembly + '. '
        'The next time you evaluate assemblies with --upper-bound-assembly option and against the same reference ('
        + original_ref_fpath + ') and '
        'the same reads (or if you specify the insert size of the paired-end reads explicitly with --est-insert-size '
        + str(insert_size) + '), '
        'QUAST will reuse this Upper Bound Assembly.\n')

    if not qconfig.debug:
        shutil.rmtree(tmp_dir)

    logger.main_info('Done.')
    return result_fpath
예제 #25
0
def align_and_analyze(is_cyclic, index, contigs_fpath, output_dirpath, ref_fpath,
                      old_contigs_fpath, bed_fpath, parallel_by_chr=False, threads=1):
    nucmer_output_dirpath = create_nucmer_output_dir(output_dirpath)
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    nucmer_fpath = join(nucmer_output_dirpath, corr_assembly_label)

    logger.info('  ' + qutils.index_to_str(index) + assembly_label)

    if not qconfig.space_efficient:
        log_out_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.stdout')
        log_err_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.stderr')
        icarus_out_fpath = join(output_dirpath, qconfig.icarus_report_fname_pattern % corr_assembly_label)
        misassembly_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.mis_contigs.info')
        unaligned_info_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.unaligned.info')
    else:
        log_out_fpath = '/dev/null'
        log_err_fpath = '/dev/null'
        icarus_out_fpath = '/dev/null'
        misassembly_fpath = '/dev/null'
        unaligned_info_fpath = '/dev/null'

    icarus_out_f = open(icarus_out_fpath, 'w')
    icarus_header_cols = ['S1', 'E1', 'S2', 'E2', 'Reference', 'Contig', 'IDY', 'Ambiguous', 'Best_group']
    icarus_out_f.write('\t'.join(icarus_header_cols) + '\n')
    misassembly_f = open(misassembly_fpath, 'w')

    if not qconfig.space_efficient:
        logger.info('  ' + qutils.index_to_str(index) + 'Logging to files ' + log_out_fpath +
                ' and ' + os.path.basename(log_err_fpath) + '...')
    else:
        logger.info('  ' + qutils.index_to_str(index) + 'Logging is disabled.')

    coords_fpath, coords_filtered_fpath, unaligned_fpath, show_snps_fpath, used_snps_fpath = \
        get_nucmer_aux_out_fpaths(nucmer_fpath)

    nucmer_status = align_contigs(nucmer_fpath, ref_fpath, contigs_fpath, old_contigs_fpath, index,
                                  parallel_by_chr, threads, log_out_fpath, log_err_fpath)
    if nucmer_status != NucmerStatus.OK:
        with open(log_err_fpath, 'a') as log_err_f:
            if nucmer_status == NucmerStatus.ERROR:
                logger.error('  ' + qutils.index_to_str(index) +
                         'Failed aligning contigs ' + qutils.label_from_fpath(contigs_fpath) +
                         ' to the reference (non-zero exit code). ' +
                         ('Run with the --debug flag to see additional information.' if not qconfig.debug else ''))
            elif nucmer_status == NucmerStatus.FAILED:
                log_err_f.write(qutils.index_to_str(index) + 'Alignment failed for ' + contigs_fpath + ':' + coords_fpath + 'doesn\'t exist.\n')
                logger.info('  ' + qutils.index_to_str(index) + 'Alignment failed for ' + '\'' + assembly_label + '\'.')
            elif nucmer_status == NucmerStatus.NOT_ALIGNED:
                log_err_f.write(qutils.index_to_str(index) + 'Nothing aligned for ' + contigs_fpath + '\n')
                logger.info('  ' + qutils.index_to_str(index) + 'Nothing aligned for ' + '\'' + assembly_label + '\'.')
        clean_tmp_files(nucmer_fpath)
        return nucmer_status, {}, [], [], []

    log_out_f = open(log_out_fpath, 'a')
    # Loading the alignment files
    log_out_f.write('Parsing coords...\n')
    aligns = {}
    coords_file = open(coords_fpath)
    coords_filtered_file = open(coords_filtered_fpath, 'w')
    coords_filtered_file.write(coords_file.readline())
    coords_filtered_file.write(coords_file.readline())
    for line in coords_file:
        if line.strip() == '':
            break
        assert line[0] != '='
        #Clear leading spaces from nucmer output
        #Store nucmer lines in an array
        mapping = Mapping.from_line(line)
        aligns.setdefault(mapping.contig, []).append(mapping)

    # Loading the reference sequences
    log_out_f.write('Loading reference...\n') # TODO: move up
    references = {}
    ref_features = {}
    for name, seq in fastaparser.read_fasta(ref_fpath):
        name = name.split()[0]  # no spaces in reference header
        references[name] = seq
        log_out_f.write('\tLoaded [%s]\n' % name)

    #Loading the SNP calls
    if qconfig.show_snps:
        log_out_f.write('Loading SNPs...\n')

    used_snps_file = None
    snps = {}
    if qconfig.show_snps:
        prev_line = None
        for line in open_gzipsafe(show_snps_fpath):
            #print "$line";
            line = line.split()
            if not line[0].isdigit():
                continue
            if prev_line and line == prev_line:
                continue
            ref = line[10]
            ctg = line[11]
            pos = int(line[0]) # Kolya: python don't convert int<->str types automatically
            loc = int(line[3]) # Kolya: same as above

            # if (! exists $line[11]) { die "Malformed line in SNP file.  Please check that show-snps has completed succesfully.\n$line\n[$line[9]][$line[10]][$line[11]]\n"; }
            if pos in snps.setdefault(ref, {}).setdefault(ctg, {}):
                snps.setdefault(ref, {}).setdefault(ctg, {})[pos].append(SNP(ref_pos=pos, ctg_pos=loc, ref_nucl=line[1], ctg_nucl=line[2]))
            else:
                snps.setdefault(ref, {}).setdefault(ctg, {})[pos] = [SNP(ref_pos=pos, ctg_pos=loc, ref_nucl=line[1], ctg_nucl=line[2])]
            prev_line = line
        used_snps_file = open_gzipsafe(used_snps_fpath, 'w')

    # Loading the regions (if any)
    regions = {}
    ref_lens = {}
    total_reg_len = 0
    total_regions = 0
    # # TODO: gff
    # log_out_f.write('Loading regions...\n')
    # log_out_f.write('\tNo regions given, using whole reference.\n')
    for name, seq in references.items():
        regions.setdefault(name, []).append([1, len(seq)])
        ref_lens[name] = len(seq)
        total_regions += 1
        total_reg_len += ref_lens[name]
    log_out_f.write('\tTotal Regions: %d\n' % total_regions)
    log_out_f.write('\tTotal Region Length: %d\n' % total_reg_len)

    ca_output = CAOutput(stdout_f=log_out_f, misassembly_f=misassembly_f, coords_filtered_f=coords_filtered_file,
                         used_snps_f=used_snps_file, icarus_out_f=icarus_out_f)

    log_out_f.write('Analyzing contigs...\n')
    result, ref_aligns, total_indels_info, aligned_lengths, misassembled_contigs, misassemblies_in_contigs, aligned_lengths_by_contigs =\
        analyze_contigs(ca_output, contigs_fpath, unaligned_fpath, unaligned_info_fpath, aligns, ref_features, ref_lens, is_cyclic)

    log_out_f.write('Analyzing coverage...\n')
    if qconfig.show_snps:
        log_out_f.write('Writing SNPs into ' + used_snps_fpath + '\n')
    result.update(analyze_coverage(ca_output, regions, ref_aligns, ref_features, snps, total_indels_info))
    result = print_results(contigs_fpath, log_out_f, used_snps_fpath, total_indels_info, result)

    if not qconfig.space_efficient:
        ## outputting misassembled contigs to separate file
        fasta = [(name, seq) for name, seq in fastaparser.read_fasta(contigs_fpath)
                 if name in misassembled_contigs.keys()]
        fastaparser.write_fasta(join(output_dirpath, qutils.name_from_fpath(contigs_fpath) + '.mis_contigs.fa'), fasta)

    if qconfig.is_combined_ref:
        alignment_tsv_fpath = join(output_dirpath, "alignments_" + corr_assembly_label + '.tsv')
        unique_contigs_fpath = join(output_dirpath, qconfig.unique_contigs_fname_pattern % corr_assembly_label)
        logger.debug('  ' + qutils.index_to_str(index) + 'Alignments: ' + qutils.relpath(alignment_tsv_fpath))
        used_contigs = set()
        with open(unique_contigs_fpath, 'w') as unique_contigs_f:
            with open(alignment_tsv_fpath, 'w') as alignment_tsv_f:
                for chr_name, aligns in ref_aligns.items():
                    alignment_tsv_f.write(chr_name)
                    contigs = set([align.contig for align in aligns])
                    for contig in contigs:
                        alignment_tsv_f.write('\t' + contig)

                    if qconfig.is_combined_ref:
                        ref_name = ref_labels_by_chromosomes[chr_name]
                        align_by_contigs = defaultdict(int)
                        for align in aligns:
                            align_by_contigs[align.contig] += align.len2
                        for contig, aligned_len in align_by_contigs.items():
                            if contig in used_contigs:
                                continue
                            used_contigs.add(contig)
                            len_cov_pattern = re.compile(r'_length_([\d\.]+)_cov_([\d\.]+)')
                            if len_cov_pattern.findall(contig):
                                contig_len = len_cov_pattern.findall(contig)[0][0]
                                contig_cov = len_cov_pattern.findall(contig)[0][1]
                                if aligned_len / float(contig_len) > 0.9:
                                    unique_contigs_f.write(ref_name + '\t' + str(aligned_len) + '\t' + contig_cov + '\n')
                    alignment_tsv_f.write('\n')

    close_handlers(ca_output)
    logger.info('  ' + qutils.index_to_str(index) + 'Analysis is finished.')
    logger.debug('')
    clean_tmp_files(nucmer_fpath)
    if not qconfig.no_gzip:
        compress_nucmer_output(logger, nucmer_fpath)
    if not ref_aligns:
        return NucmerStatus.NOT_ALIGNED, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
    else:
        return NucmerStatus.OK, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
예제 #26
0
def align_and_analyze(is_cyclic,
                      index,
                      contigs_fpath,
                      output_dirpath,
                      ref_fpath,
                      reference_chromosomes,
                      ns_by_chromosomes,
                      old_contigs_fpath,
                      bed_fpath,
                      threads=1):
    tmp_output_dirpath = create_minimap_output_dir(output_dirpath)
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    out_basename = join(tmp_output_dirpath, corr_assembly_label)

    logger.info('  ' + qutils.index_to_str(index) + assembly_label)

    if not qconfig.space_efficient:
        log_out_fpath = join(
            output_dirpath,
            qconfig.contig_report_fname_pattern % corr_assembly_label +
            '.stdout')
        log_err_fpath = join(
            output_dirpath,
            qconfig.contig_report_fname_pattern % corr_assembly_label +
            '.stderr')
        icarus_out_fpath = join(
            output_dirpath,
            qconfig.icarus_report_fname_pattern % corr_assembly_label)
        misassembly_fpath = join(
            output_dirpath,
            qconfig.contig_report_fname_pattern % corr_assembly_label +
            '.mis_contigs.info')
        unaligned_info_fpath = join(
            output_dirpath,
            qconfig.contig_report_fname_pattern % corr_assembly_label +
            '.unaligned.info')
    else:
        log_out_fpath = '/dev/null'
        log_err_fpath = '/dev/null'
        icarus_out_fpath = '/dev/null'
        misassembly_fpath = '/dev/null'
        unaligned_info_fpath = '/dev/null'

    icarus_out_f = open(icarus_out_fpath, 'w')
    icarus_header_cols = [
        'S1', 'E1', 'S2', 'E2', 'Reference', 'Contig', 'IDY', 'Ambiguous',
        'Best_group'
    ]
    icarus_out_f.write('\t'.join(icarus_header_cols) + '\n')
    misassembly_f = open(misassembly_fpath, 'w')

    if not qconfig.space_efficient:
        logger.info('  ' + qutils.index_to_str(index) + 'Logging to files ' +
                    log_out_fpath + ' and ' + os.path.basename(log_err_fpath) +
                    '...')
    else:
        logger.info('  ' + qutils.index_to_str(index) + 'Logging is disabled.')

    coords_fpath, coords_filtered_fpath, unaligned_fpath, used_snps_fpath = get_aux_out_fpaths(
        out_basename)
    status = align_contigs(coords_fpath, out_basename, ref_fpath,
                           contigs_fpath, old_contigs_fpath, index, threads,
                           log_out_fpath, log_err_fpath)
    if status != AlignerStatus.OK:
        with open(log_err_fpath, 'a') as log_err_f:
            if status == AlignerStatus.ERROR:
                logger.error(
                    '  ' + qutils.index_to_str(index) +
                    'Failed aligning contigs ' +
                    qutils.label_from_fpath(contigs_fpath) +
                    ' to the reference (non-zero exit code). ' +
                    ('Run with the --debug flag to see additional information.'
                     if not qconfig.debug else ''))
            elif status == AlignerStatus.FAILED:
                log_err_f.write(
                    qutils.index_to_str(index) + 'Alignment failed for ' +
                    contigs_fpath + ':' + coords_fpath + 'doesn\'t exist.\n')
                logger.info('  ' + qutils.index_to_str(index) +
                            'Alignment failed for ' + '\'' + assembly_label +
                            '\'.')
            elif status == AlignerStatus.NOT_ALIGNED:
                log_err_f.write(
                    qutils.index_to_str(index) + 'Nothing aligned for ' +
                    contigs_fpath + '\n')
                logger.info('  ' + qutils.index_to_str(index) +
                            'Nothing aligned for ' + '\'' + assembly_label +
                            '\'.')
        return status, {}, [], [], []

    log_out_f = open(log_out_fpath, 'a')
    # Loading the alignment files
    log_out_f.write('Parsing coords...\n')
    aligns = {}
    with open(coords_fpath) as coords_file:
        for line in coords_file:
            mapping = Mapping.from_line(line)
            aligns.setdefault(mapping.contig, []).append(mapping)

    # Loading the reference sequences
    log_out_f.write('Loading reference...\n')  # TODO: move up
    ref_features = {}

    # Loading the regions (if any)
    regions = {}
    total_reg_len = 0
    total_regions = 0
    # # TODO: gff
    # log_out_f.write('Loading regions...\n')
    # log_out_f.write('\tNo regions given, using whole reference.\n')
    for name, seq_len in reference_chromosomes.items():
        log_out_f.write('\tLoaded [%s]\n' % name)
        regions.setdefault(name, []).append([1, seq_len])
        total_regions += 1
        total_reg_len += seq_len
    log_out_f.write('\tTotal Regions: %d\n' % total_regions)
    log_out_f.write('\tTotal Region Length: %d\n' % total_reg_len)

    ca_output = CAOutput(stdout_f=log_out_f,
                         misassembly_f=misassembly_f,
                         coords_filtered_f=open(coords_filtered_fpath, 'w'),
                         icarus_out_f=icarus_out_f)

    log_out_f.write('Analyzing contigs...\n')
    result, ref_aligns, total_indels_info, aligned_lengths, misassembled_contigs, misassemblies_in_contigs, aligned_lengths_by_contigs =\
        analyze_contigs(ca_output, contigs_fpath, unaligned_fpath, unaligned_info_fpath, aligns, ref_features, reference_chromosomes, is_cyclic)

    log_out_f.write('Analyzing coverage...\n')
    if qconfig.show_snps:
        log_out_f.write('Writing SNPs into ' + used_snps_fpath + '\n')
    total_aligned_bases, indels_info = analyze_coverage(
        ref_aligns, reference_chromosomes, ns_by_chromosomes, used_snps_fpath)
    total_indels_info += indels_info
    cov_stats = {
        'SNPs': total_indels_info.mismatches,
        'indels_list': total_indels_info.indels_list,
        'total_aligned_bases': total_aligned_bases
    }
    result.update(cov_stats)
    result = print_results(contigs_fpath, log_out_f, used_snps_fpath,
                           total_indels_info, result)

    if not qconfig.space_efficient:
        ## outputting misassembled contigs to separate file
        fasta = [(name, seq)
                 for name, seq in fastaparser.read_fasta(contigs_fpath)
                 if name in misassembled_contigs.keys()]
        fastaparser.write_fasta(
            join(output_dirpath,
                 qutils.name_from_fpath(contigs_fpath) + '.mis_contigs.fa'),
            fasta)

    if qconfig.is_combined_ref:
        alignment_tsv_fpath = join(
            output_dirpath, "alignments_" + corr_assembly_label + '.tsv')
        unique_contigs_fpath = join(
            output_dirpath,
            qconfig.unique_contigs_fname_pattern % corr_assembly_label)
        logger.debug('  ' + qutils.index_to_str(index) + 'Alignments: ' +
                     qutils.relpath(alignment_tsv_fpath))
        used_contigs = set()
        with open(unique_contigs_fpath, 'w') as unique_contigs_f:
            with open(alignment_tsv_fpath, 'w') as alignment_tsv_f:
                for chr_name, aligns in ref_aligns.items():
                    alignment_tsv_f.write(chr_name)
                    contigs = set([align.contig for align in aligns])
                    for contig in contigs:
                        alignment_tsv_f.write('\t' + contig)

                    if qconfig.is_combined_ref:
                        ref_name = ref_labels_by_chromosomes[chr_name]
                        align_by_contigs = defaultdict(int)
                        for align in aligns:
                            align_by_contigs[align.contig] += align.len2
                        for contig, aligned_len in align_by_contigs.items():
                            if contig in used_contigs:
                                continue
                            used_contigs.add(contig)
                            len_cov_pattern = re.compile(
                                r'_length_([\d\.]+)_cov_([\d\.]+)')
                            if len_cov_pattern.findall(contig):
                                contig_len = len_cov_pattern.findall(
                                    contig)[0][0]
                                contig_cov = len_cov_pattern.findall(
                                    contig)[0][1]
                                if aligned_len / float(contig_len) > 0.9:
                                    unique_contigs_f.write(ref_name + '\t' +
                                                           str(aligned_len) +
                                                           '\t' + contig_cov +
                                                           '\n')
                    alignment_tsv_f.write('\n')

    close_handlers(ca_output)
    logger.info('  ' + qutils.index_to_str(index) + 'Analysis is finished.')
    logger.debug('')
    if not ref_aligns:
        return AlignerStatus.NOT_ALIGNED, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
    else:
        return AlignerStatus.OK, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
예제 #27
0
def align_and_analyze(is_cyclic, index, contigs_fpath, output_dirpath, ref_fpath,
                      old_contigs_fpath, bed_fpath, parallel_by_chr=False, threads=1):
    nucmer_output_dirpath = create_nucmer_output_dir(output_dirpath)
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    nucmer_fpath = join(nucmer_output_dirpath, corr_assembly_label)

    logger.info('  ' + qutils.index_to_str(index) + assembly_label)

    if not qconfig.space_efficient:
        log_out_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.stdout')
        log_err_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.stderr')
        icarus_out_fpath = join(output_dirpath, qconfig.icarus_report_fname_pattern % corr_assembly_label)
        misassembly_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.mis_contigs.info')
        unaligned_info_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.unaligned.info')
    else:
        log_out_fpath = '/dev/null'
        log_err_fpath = '/dev/null'
        icarus_out_fpath = '/dev/null'
        misassembly_fpath = '/dev/null'
        unaligned_info_fpath = '/dev/null'

    icarus_out_f = open(icarus_out_fpath, 'w')
    icarus_header_cols = ['S1', 'E1', 'S2', 'E2', 'Reference', 'Contig', 'IDY', 'Ambiguous', 'Best_group']
    icarus_out_f.write('\t'.join(icarus_header_cols) + '\n')
    misassembly_f = open(misassembly_fpath, 'w')

    if not qconfig.space_efficient:
        logger.info('  ' + qutils.index_to_str(index) + 'Logging to files ' + log_out_fpath +
                ' and ' + os.path.basename(log_err_fpath) + '...')
    else:
        logger.info('  ' + qutils.index_to_str(index) + 'Logging is disabled.')

    coords_fpath, coords_filtered_fpath, unaligned_fpath, show_snps_fpath, used_snps_fpath = \
        get_nucmer_aux_out_fpaths(nucmer_fpath)

    nucmer_status = align_contigs(nucmer_fpath, ref_fpath, contigs_fpath, old_contigs_fpath, index,
                                  parallel_by_chr, threads, log_out_fpath, log_err_fpath)
    if nucmer_status != NucmerStatus.OK:
        with open(log_err_fpath, 'a') as log_err_f:
            if nucmer_status == NucmerStatus.ERROR:
                logger.error('  ' + qutils.index_to_str(index) +
                         'Failed aligning contigs ' + qutils.label_from_fpath(contigs_fpath) +
                         ' to the reference (non-zero exit code). ' +
                         ('Run with the --debug flag to see additional information.' if not qconfig.debug else ''))
            elif nucmer_status == NucmerStatus.FAILED:
                log_err_f.write(qutils.index_to_str(index) + 'Alignment failed for ' + contigs_fpath + ':' + coords_fpath + 'doesn\'t exist.\n')
                logger.info('  ' + qutils.index_to_str(index) + 'Alignment failed for ' + '\'' + assembly_label + '\'.')
            elif nucmer_status == NucmerStatus.NOT_ALIGNED:
                log_err_f.write(qutils.index_to_str(index) + 'Nothing aligned for ' + contigs_fpath + '\n')
                logger.info('  ' + qutils.index_to_str(index) + 'Nothing aligned for ' + '\'' + assembly_label + '\'.')
        clean_tmp_files(nucmer_fpath)
        return nucmer_status, {}, [], [], []

    log_out_f = open(log_out_fpath, 'a')
    # Loading the alignment files
    log_out_f.write('Parsing coords...\n')
    aligns = {}
    coords_file = open(coords_fpath)
    coords_filtered_file = open(coords_filtered_fpath, 'w')
    coords_filtered_file.write(coords_file.readline())
    coords_filtered_file.write(coords_file.readline())
    for line in coords_file:
        if line.strip() == '':
            break
        assert line[0] != '='
        #Clear leading spaces from nucmer output
        #Store nucmer lines in an array
        mapping = Mapping.from_line(line)
        aligns.setdefault(mapping.contig, []).append(mapping)

    # Loading the reference sequences
    log_out_f.write('Loading reference...\n') # TODO: move up
    ref_lens = {}
    ref_features = {}
    for name, seq in fastaparser.read_fasta(ref_fpath):
        name = name.split()[0]  # no spaces in reference header
        ref_lens[name] = len(seq)
        log_out_f.write('\tLoaded [%s]\n' % name)

    #Loading the SNP calls
    if qconfig.show_snps:
        log_out_f.write('Loading SNPs...\n')

    used_snps_file = None
    snps = {}
    if qconfig.show_snps:
        prev_line = None
        for line in open_gzipsafe(show_snps_fpath):
            #print "$line";
            line = line.split()
            if not line[0].isdigit():
                continue
            if prev_line and line == prev_line:
                continue
            ref = line[10]
            ctg = line[11]
            pos = int(line[0]) # Kolya: python don't convert int<->str types automatically
            loc = int(line[3]) # Kolya: same as above

            # if (! exists $line[11]) { die "Malformed line in SNP file.  Please check that show-snps has completed succesfully.\n$line\n[$line[9]][$line[10]][$line[11]]\n"; }
            if pos in snps.setdefault(ref, {}).setdefault(ctg, {}):
                snps.setdefault(ref, {}).setdefault(ctg, {})[pos].append(SNP(ref_pos=pos, ctg_pos=loc, ref_nucl=line[1], ctg_nucl=line[2]))
            else:
                snps.setdefault(ref, {}).setdefault(ctg, {})[pos] = [SNP(ref_pos=pos, ctg_pos=loc, ref_nucl=line[1], ctg_nucl=line[2])]
            prev_line = line
        used_snps_file = open_gzipsafe(used_snps_fpath, 'w')

    # Loading the regions (if any)
    regions = {}
    total_reg_len = 0
    total_regions = 0
    # # TODO: gff
    # log_out_f.write('Loading regions...\n')
    # log_out_f.write('\tNo regions given, using whole reference.\n')
    for name, seq_len in ref_lens.items():
        regions.setdefault(name, []).append([1, seq_len])
        total_regions += 1
        total_reg_len += seq_len
    log_out_f.write('\tTotal Regions: %d\n' % total_regions)
    log_out_f.write('\tTotal Region Length: %d\n' % total_reg_len)

    ca_output = CAOutput(stdout_f=log_out_f, misassembly_f=misassembly_f, coords_filtered_f=coords_filtered_file,
                         used_snps_f=used_snps_file, icarus_out_f=icarus_out_f)

    log_out_f.write('Analyzing contigs...\n')
    result, ref_aligns, total_indels_info, aligned_lengths, misassembled_contigs, misassemblies_in_contigs, aligned_lengths_by_contigs =\
        analyze_contigs(ca_output, contigs_fpath, unaligned_fpath, unaligned_info_fpath, aligns, ref_features, ref_lens, is_cyclic)

    # if qconfig.large_genome:
    #     log_out_f.write('Analyzing large blocks...\n')
    #     large_misassembly_fpath = add_suffix(misassembly_fpath, 'large_blocks') if not qconfig.space_efficient else '/dev/null'
    #     ca_large_output = CAOutput(stdout_f=log_out_f, misassembly_f=open(large_misassembly_fpath, 'w'),
    #                                coords_filtered_f=coords_filtered_file, used_snps_f=open('/dev/null', 'w'), icarus_out_f=open('/dev/null', 'w'))
    #     min_alignment, extensive_mis_threshold = qconfig.min_alignment, qconfig.extensive_misassembly_threshold
    #     qconfig.min_alignment, qconfig.extensive_misassembly_threshold = qconfig.LARGE_MIN_ALIGNMENT, qconfig.LARGE_EXTENSIVE_MIS_THRESHOLD
    #     result.update(analyze_contigs(ca_large_output, contigs_fpath, '/dev/null', '/dev/null',
    #                                   aligns, ref_features, ref_lens, is_cyclic, large_misassemblies_search=True)[0])
    #     qconfig.min_alignment, qconfig.extensive_misassembly_threshold = min_alignment, extensive_mis_threshold

    log_out_f.write('Analyzing coverage...\n')
    if qconfig.show_snps:
        log_out_f.write('Writing SNPs into ' + used_snps_fpath + '\n')
    result.update(analyze_coverage(ca_output, regions, ref_aligns, ref_features, snps, total_indels_info))
    result = print_results(contigs_fpath, log_out_f, used_snps_fpath, total_indels_info, result)

    if not qconfig.space_efficient:
        ## outputting misassembled contigs to separate file
        fasta = [(name, seq) for name, seq in fastaparser.read_fasta(contigs_fpath)
                 if name in misassembled_contigs.keys()]
        fastaparser.write_fasta(join(output_dirpath, qutils.name_from_fpath(contigs_fpath) + '.mis_contigs.fa'), fasta)

    if qconfig.is_combined_ref:
        alignment_tsv_fpath = join(output_dirpath, "alignments_" + corr_assembly_label + '.tsv')
        unique_contigs_fpath = join(output_dirpath, qconfig.unique_contigs_fname_pattern % corr_assembly_label)
        logger.debug('  ' + qutils.index_to_str(index) + 'Alignments: ' + qutils.relpath(alignment_tsv_fpath))
        used_contigs = set()
        with open(unique_contigs_fpath, 'w') as unique_contigs_f:
            with open(alignment_tsv_fpath, 'w') as alignment_tsv_f:
                for chr_name, aligns in ref_aligns.items():
                    alignment_tsv_f.write(chr_name)
                    contigs = set([align.contig for align in aligns])
                    for contig in contigs:
                        alignment_tsv_f.write('\t' + contig)

                    if qconfig.is_combined_ref:
                        ref_name = ref_labels_by_chromosomes[chr_name]
                        align_by_contigs = defaultdict(int)
                        for align in aligns:
                            align_by_contigs[align.contig] += align.len2
                        for contig, aligned_len in align_by_contigs.items():
                            if contig in used_contigs:
                                continue
                            used_contigs.add(contig)
                            len_cov_pattern = re.compile(r'_length_([\d\.]+)_cov_([\d\.]+)')
                            if len_cov_pattern.findall(contig):
                                contig_len = len_cov_pattern.findall(contig)[0][0]
                                contig_cov = len_cov_pattern.findall(contig)[0][1]
                                if aligned_len / float(contig_len) > 0.9:
                                    unique_contigs_f.write(ref_name + '\t' + str(aligned_len) + '\t' + contig_cov + '\n')
                    alignment_tsv_f.write('\n')

    close_handlers(ca_output)
    logger.info('  ' + qutils.index_to_str(index) + 'Analysis is finished.')
    logger.debug('')
    clean_tmp_files(nucmer_fpath)
    if not qconfig.no_gzip:
        compress_nucmer_output(logger, nucmer_fpath)
    if not ref_aligns:
        return NucmerStatus.NOT_ALIGNED, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
    else:
        return NucmerStatus.OK, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
예제 #28
0
def align_and_analyze(is_cyclic, index, contigs_fpath, output_dirpath, ref_fpath,
                      reference_chromosomes, ns_by_chromosomes, old_contigs_fpath, bed_fpath, threads=1):
    tmp_output_dirpath = create_minimap_output_dir(output_dirpath)
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    out_basename = join(tmp_output_dirpath, corr_assembly_label)

    logger.info('  ' + qutils.index_to_str(index) + assembly_label)

    if not qconfig.space_efficient:
        log_out_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.stdout')
        log_err_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.stderr')
        icarus_out_fpath = join(output_dirpath, qconfig.icarus_report_fname_pattern % corr_assembly_label)
        misassembly_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.mis_contigs.info')
        unaligned_info_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.unaligned.info')
    else:
        log_out_fpath = '/dev/null'
        log_err_fpath = '/dev/null'
        icarus_out_fpath = '/dev/null'
        misassembly_fpath = '/dev/null'
        unaligned_info_fpath = '/dev/null'

    icarus_out_f = open(icarus_out_fpath, 'w')
    icarus_header_cols = ['S1', 'E1', 'S2', 'E2', 'Reference', 'Contig', 'IDY', 'Ambiguous', 'Best_group']
    icarus_out_f.write('\t'.join(icarus_header_cols) + '\n')
    misassembly_f = open(misassembly_fpath, 'w')

    if not qconfig.space_efficient:
        logger.info('  ' + qutils.index_to_str(index) + 'Logging to files ' + log_out_fpath +
                ' and ' + os.path.basename(log_err_fpath) + '...')
    else:
        logger.info('  ' + qutils.index_to_str(index) + 'Logging is disabled.')

    coords_fpath, coords_filtered_fpath, unaligned_fpath, used_snps_fpath = get_aux_out_fpaths(out_basename)
    status = align_contigs(coords_fpath, out_basename, ref_fpath, contigs_fpath, old_contigs_fpath, index, threads,
                           log_out_fpath, log_err_fpath)
    if status != AlignerStatus.OK:
        with open(log_err_fpath, 'a') as log_err_f:
            if status == AlignerStatus.ERROR:
                logger.error('  ' + qutils.index_to_str(index) +
                         'Failed aligning contigs ' + qutils.label_from_fpath(contigs_fpath) +
                         ' to the reference (non-zero exit code). ' +
                         ('Run with the --debug flag to see additional information.' if not qconfig.debug else ''))
            elif status == AlignerStatus.FAILED:
                log_err_f.write(qutils.index_to_str(index) + 'Alignment failed for ' + contigs_fpath + ':' + coords_fpath + 'doesn\'t exist.\n')
                logger.info('  ' + qutils.index_to_str(index) + 'Alignment failed for ' + '\'' + assembly_label + '\'.')
            elif status == AlignerStatus.NOT_ALIGNED:
                log_err_f.write(qutils.index_to_str(index) + 'Nothing aligned for ' + contigs_fpath + '\n')
                logger.info('  ' + qutils.index_to_str(index) + 'Nothing aligned for ' + '\'' + assembly_label + '\'.')
        return status, {}, [], [], []

    log_out_f = open(log_out_fpath, 'a')
    # Loading the alignment files
    log_out_f.write('Parsing coords...\n')
    aligns = {}
    with open(coords_fpath) as coords_file:
        for line in coords_file:
            mapping = Mapping.from_line(line)
            aligns.setdefault(mapping.contig, []).append(mapping)

    # Loading the reference sequences
    log_out_f.write('Loading reference...\n') # TODO: move up
    ref_features = {}

    # Loading the regions (if any)
    regions = {}
    total_reg_len = 0
    total_regions = 0
    # # TODO: gff
    # log_out_f.write('Loading regions...\n')
    # log_out_f.write('\tNo regions given, using whole reference.\n')
    for name, seq_len in reference_chromosomes.items():
        log_out_f.write('\tLoaded [%s]\n' % name)
        regions.setdefault(name, []).append([1, seq_len])
        total_regions += 1
        total_reg_len += seq_len
    log_out_f.write('\tTotal Regions: %d\n' % total_regions)
    log_out_f.write('\tTotal Region Length: %d\n' % total_reg_len)

    ca_output = CAOutput(stdout_f=log_out_f, misassembly_f=misassembly_f, coords_filtered_f=open(coords_filtered_fpath, 'w'),
                         icarus_out_f=icarus_out_f)

    log_out_f.write('Analyzing contigs...\n')
    result, ref_aligns, total_indels_info, aligned_lengths, misassembled_contigs, misassemblies_in_contigs, aligned_lengths_by_contigs =\
        analyze_contigs(ca_output, contigs_fpath, unaligned_fpath, unaligned_info_fpath, aligns, ref_features, reference_chromosomes, is_cyclic)

    log_out_f.write('Analyzing coverage...\n')
    if qconfig.show_snps:
        log_out_f.write('Writing SNPs into ' + used_snps_fpath + '\n')
    total_aligned_bases, indels_info = analyze_coverage(ref_aligns, reference_chromosomes, ns_by_chromosomes, used_snps_fpath)
    total_indels_info += indels_info
    cov_stats = {'SNPs': total_indels_info.mismatches, 'indels_list': total_indels_info.indels_list, 'total_aligned_bases': total_aligned_bases}
    result.update(cov_stats)
    result = print_results(contigs_fpath, log_out_f, used_snps_fpath, total_indels_info, result)

    if not qconfig.space_efficient:
        ## outputting misassembled contigs to separate file
        fasta = [(name, seq) for name, seq in fastaparser.read_fasta(contigs_fpath)
                 if name in misassembled_contigs.keys()]
        fastaparser.write_fasta(join(output_dirpath, qutils.name_from_fpath(contigs_fpath) + '.mis_contigs.fa'), fasta)

    if qconfig.is_combined_ref:
        alignment_tsv_fpath = join(output_dirpath, "alignments_" + corr_assembly_label + '.tsv')
        unique_contigs_fpath = join(output_dirpath, qconfig.unique_contigs_fname_pattern % corr_assembly_label)
        logger.debug('  ' + qutils.index_to_str(index) + 'Alignments: ' + qutils.relpath(alignment_tsv_fpath))
        used_contigs = set()
        with open(unique_contigs_fpath, 'w') as unique_contigs_f:
            with open(alignment_tsv_fpath, 'w') as alignment_tsv_f:
                for chr_name, aligns in ref_aligns.items():
                    alignment_tsv_f.write(chr_name)
                    contigs = set([align.contig for align in aligns])
                    for contig in contigs:
                        alignment_tsv_f.write('\t' + contig)

                    if qconfig.is_combined_ref:
                        ref_name = ref_labels_by_chromosomes[chr_name]
                        align_by_contigs = defaultdict(int)
                        for align in aligns:
                            align_by_contigs[align.contig] += align.len2
                        for contig, aligned_len in align_by_contigs.items():
                            if contig in used_contigs:
                                continue
                            used_contigs.add(contig)
                            len_cov_pattern = re.compile(r'_length_([\d\.]+)_cov_([\d\.]+)')
                            if len_cov_pattern.findall(contig):
                                contig_len = len_cov_pattern.findall(contig)[0][0]
                                contig_cov = len_cov_pattern.findall(contig)[0][1]
                                if aligned_len / float(contig_len) > 0.9:
                                    unique_contigs_f.write(ref_name + '\t' + str(aligned_len) + '\t' + contig_cov + '\n')
                    alignment_tsv_f.write('\n')

    close_handlers(ca_output)
    logger.info('  ' + qutils.index_to_str(index) + 'Analysis is finished.')
    logger.debug('')
    if not ref_aligns:
        return AlignerStatus.NOT_ALIGNED, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
    else:
        return AlignerStatus.OK, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
예제 #29
0
def remove_low_complexity_tail(union_lines, union_alignments, single_transcript_lines, single_transcript_alignments,
                               transcript_seq, out_low_complexity_file, threshold_block_len, end_tail):
    block_seq = ''
    for i in range(len(union_alignments)):
        if end_tail == True:
            i_alignment = len(union_alignments) - 1 - i
        else:
            i_alignment = i
        psl_alignment = union_alignments[i_alignment]
        for j in range(psl_alignment.blocks_num):
            if end_tail == True:
                i_block = psl_alignment.blocks_num - 1 - j
            else:
                i_block = j

            start = psl_alignment.query_fragment.starts[i_block]
            end = psl_alignment.query_fragment.ends[i_block]

            if psl_alignment.strand == '+':
                block_seq += transcript_seq[start:end + 1]
            else:
                block_seq += UtilsGeneral.rev_comp(transcript_seq)[start:end + 1]

            if len(block_seq) < threshold_block_len:
                continue

            if not is_low_complexity(block_seq):
                if (end_tail == True and i_block == psl_alignment.blocks_num - 1) or (end_tail == False and i_block == 0):
                    latest_alignment = psl_alignment
                    latest_line = union_lines[i_alignment]
                else:
                    if end_tail == True:
                        latest_alignment = psl_alignment.get_split_alignment(0, i_block)
                    else:
                        latest_alignment = psl_alignment.get_split_alignment(i_block, psl_alignment.blocks_num - 1)
                    latest_line = latest_alignment.get_psl_line_from_alignment()

                if end_tail == True:
                    clear_union_alignments = union_alignments[:i_alignment] + [latest_alignment]
                    clear_union_lines = union_lines[:i_alignment] + [latest_line]
                else:
                    clear_union_alignments = [latest_alignment] + union_alignments[i_alignment + 1:]
                    clear_union_lines = [latest_line] + union_lines[i_alignment + 1:]

                # update single transcript lines / alignments:
                for i_alignment in range(len(union_lines)):
                    if union_lines[i_alignment] not in clear_union_lines:
                        single_transcript_lines.remove(union_lines[i_alignment])
                        single_transcript_alignments.remove(union_alignments[i_alignment])
                for i_alignment in range(len(clear_union_lines)):
                    if clear_union_lines[i_alignment] not in single_transcript_lines:
                        single_transcript_lines.append(clear_union_lines[i_alignment])
                        single_transcript_alignments.append(clear_union_alignments[i_alignment])

                return clear_union_lines, clear_union_alignments, single_transcript_lines, single_transcript_alignments,

            else:
                fastaparser.write_fasta(out_low_complexity_file, [('{}_block'.format(psl_alignment.query_fragment.name), block_seq)], mode='a')

            if len(block_seq) > threshold_block_len:
                block_seq = ''

    # update single transcript lines / alignments:
    for i_alignment in range(len(union_lines)):
        single_transcript_lines.remove(union_lines[i_alignment])
        single_transcript_alignments.remove(union_alignments[i_alignment])

    return [], [], single_transcript_lines, single_transcript_alignments
예제 #30
0
def do(ref_fpath, original_ref_fpath, output_dirpath):
    logger.print_timestamp()
    logger.main_info("Simulating Optimal Assembly...")

    uncovered_fpath = None
    reads_analyzer_dir = join(dirname(output_dirpath),
                              qconfig.reads_stats_dirname)
    if qconfig.reads_fpaths or qconfig.reference_sam or qconfig.reference_bam:
        sam_fpath, bam_fpath, uncovered_fpath = reads_analyzer.align_reference(
            ref_fpath,
            reads_analyzer_dir,
            using_reads='all',
            calculate_coverage=True)
    insert_size = qconfig.optimal_assembly_insert_size
    if insert_size == 'auto' or not insert_size:
        insert_size = qconfig.optimal_assembly_default_IS

    ref_basename, fasta_ext = splitext_for_fasta_file(
        os.path.basename(ref_fpath))
    result_basename = '%s.%s.is%d.fasta' % (
        ref_basename, qconfig.optimal_assembly_basename, insert_size)
    long_reads = qconfig.pacbio_reads or qconfig.nanopore_reads
    if long_reads:
        result_basename = add_suffix(result_basename,
                                     long_reads_polished_suffix)
    elif qconfig.mate_pairs:
        result_basename = add_suffix(result_basename, mp_polished_suffix)
    result_fpath = os.path.join(output_dirpath, result_basename)

    original_ref_basename, fasta_ext = splitext_for_fasta_file(
        os.path.basename(original_ref_fpath))
    prepared_optimal_assembly_basename = '%s.%s.is%d.fasta' % (
        original_ref_basename, qconfig.optimal_assembly_basename, insert_size)
    ref_prepared_optimal_assembly = os.path.join(
        os.path.dirname(original_ref_fpath),
        prepared_optimal_assembly_basename)

    if os.path.isfile(result_fpath) or os.path.isfile(
            ref_prepared_optimal_assembly):
        already_done_fpath = result_fpath if os.path.isfile(
            result_fpath) else ref_prepared_optimal_assembly
        logger.notice(
            '  Will reuse already generated Optimal Assembly with insert size %d (%s)'
            % (insert_size, already_done_fpath))
        return already_done_fpath

    if qconfig.platform_name == 'linux_32':
        logger.warning(
            '  Sorry, can\'t create Optimal Assembly on this platform, skipping...'
        )
        return None

    red_dirpath = get_dir_for_download('red', 'Red', ['Red'], logger)
    binary_fpath = download_external_tool('Red',
                                          red_dirpath,
                                          'red',
                                          platform_specific=True,
                                          is_executable=True)
    if not binary_fpath or not os.path.isfile(binary_fpath):
        logger.warning('  Sorry, can\'t create Optimal Assembly, skipping...')
        return None

    log_fpath = os.path.join(output_dirpath, 'optimal_assembly.log')
    tmp_dir = os.path.join(output_dirpath, 'tmp')
    if os.path.isdir(tmp_dir):
        shutil.rmtree(tmp_dir)
    os.makedirs(tmp_dir)

    unique_covered_regions, repeats_regions = get_unique_covered_regions(
        ref_fpath, tmp_dir, log_fpath, binary_fpath, insert_size,
        uncovered_fpath)
    if unique_covered_regions is None:
        logger.error(
            '  Failed to create Optimal Assembly, see log for details: ' +
            log_fpath)
        return None

    reference = list(fastaparser.read_fasta(ref_fpath))
    result_fasta = []

    if long_reads or qconfig.mate_pairs:
        if long_reads:
            join_reads = 'pacbio' if qconfig.pacbio_reads else 'nanopore'
        else:
            join_reads = 'mp'
        sam_fpath, bam_fpath, _ = reads_analyzer.align_reference(
            ref_fpath, reads_analyzer_dir, using_reads=join_reads)
        joiners = get_joiners(qutils.name_from_fpath(ref_fpath), sam_fpath,
                              bam_fpath, tmp_dir, log_fpath, join_reads)
        uncovered_regions = parse_uncovered_fpath(
            uncovered_fpath, ref_fpath, return_covered_regions=False
        ) if join_reads == 'mp' else defaultdict(list)
        mp_len = calculate_read_len(sam_fpath) if join_reads == 'mp' else None
        for chrom, seq in reference:
            region_pairing = get_regions_pairing(unique_covered_regions[chrom],
                                                 joiners[chrom], mp_len)
            ref_coords_to_output = scaffolding(unique_covered_regions[chrom],
                                               region_pairing)
            get_fasta_entries_from_coords(result_fasta, (chrom, seq),
                                          ref_coords_to_output,
                                          repeats_regions[chrom],
                                          uncovered_regions[chrom])
    else:
        for chrom, seq in reference:
            for idx, region in enumerate(unique_covered_regions[chrom]):
                if region[1] - region[0] >= MIN_CONTIG_LEN:
                    result_fasta.append(
                        (chrom + '_' + str(idx), seq[region[0]:region[1]]))

    fastaparser.write_fasta(result_fpath, result_fasta)
    logger.info('  ' + 'Theoretically optimal Assembly saved to ' +
                result_fpath)
    logger.notice(
        'You can copy it to ' + ref_prepared_optimal_assembly +
        ' and QUAST will reuse it in further runs against the same reference ('
        + original_ref_fpath + ')')

    if not qconfig.debug:
        shutil.rmtree(tmp_dir)

    logger.main_info('Done.')
    return result_fpath
예제 #31
0
파일: metautils.py 프로젝트: nwespe/quast
def correct_meta_references(ref_fpaths,
                            corrected_dirpath,
                            downloaded_refs=False):
    corrected_ref_fpaths = []

    combined_ref_fpath = os.path.join(corrected_dirpath,
                                      qconfig.combined_ref_name)

    chromosomes_by_refs = {}

    def _proceed_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references,
                     ref_fpath):
        seq_fname = ref_name
        seq_fname += ref_fasta_ext

        if total_references > 1:
            corr_seq_fpath = corrected_ref_fpaths[-1]
        else:
            corr_seq_fpath = qutils.unique_corrected_fpath(
                os.path.join(corrected_dirpath, seq_fname))
            corrected_ref_fpaths.append(corr_seq_fpath)
        corr_seq_name = qutils.name_from_fpath(corr_seq_fpath) + '_' + seq_name
        if not qconfig.no_check:
            corr_seq = correct_seq(seq, ref_fpath)
            if not corr_seq:
                return None, None

        fastaparser.write_fasta(corr_seq_fpath, [(corr_seq_name, seq)], 'a')

        contigs_analyzer.ref_labels_by_chromosomes[
            corr_seq_name] = qutils.name_from_fpath(corr_seq_fpath)
        chromosomes_by_refs[ref_name].append((corr_seq_name, len(seq)))

        return corr_seq_name, corr_seq_fpath

    ref_fnames = [os.path.basename(ref_fpath) for ref_fpath in ref_fpaths]
    ref_names = []
    for ref_fname in ref_fnames:
        ref_name, ref_fasta_ext = qutils.splitext_for_fasta_file(ref_fname)
        ref_names.append(ref_name)

    excluded_ref_fpaths = []
    ref_names = qutils.process_labels(ref_fpaths)
    for ref_fpath, ref_name in zip(ref_fpaths, ref_names):
        total_references = 0
        ref_fname = os.path.basename(ref_fpath)
        _, ref_fasta_ext = qutils.splitext_for_fasta_file(ref_fname)

        chromosomes_by_refs[ref_name] = []
        used_seq_names = defaultdict(int)

        corr_seq_fpath = None
        for i, (seq_name, seq) in enumerate(fastaparser.read_fasta(ref_fpath)):
            total_references += 1
            seq_name = correct_name(seq_name,
                                    qutils.MAX_CONTIG_NAME - len(ref_name) - 1)
            uniq_seq_name = get_uniq_name(seq_name, used_seq_names)
            used_seq_names[seq_name] += 1
            corr_seq_name, corr_seq_fpath = _proceed_seq(
                uniq_seq_name, seq, ref_name, ref_fasta_ext, total_references,
                ref_fpath)
            if not corr_seq_name:
                break
        if corr_seq_fpath:
            logger.main_info('  ' + ref_fpath + ' ==> ' +
                             qutils.name_from_fpath(corr_seq_fpath) + '')
            fastaparser.write_fasta(combined_ref_fpath,
                                    fastaparser.read_fasta(corr_seq_fpath),
                                    'a')
        elif downloaded_refs:
            logger.warning(
                'Skipping ' + ref_fpath + ' because it'
                ' is empty or contains incorrect sequences (header-only or with non-ACGTN characters)!'
            )
            # cleaning
            for corr_seq_name, _ in chromosomes_by_refs[ref_name]:
                del contigs_analyzer.ref_labels_by_chromosomes[corr_seq_name]
            del chromosomes_by_refs[ref_name]
            corrected_ref_fpaths.pop()
            excluded_ref_fpaths.append(ref_fpath)
        else:
            logger.error(
                'Reference file ' + ref_fpath +
                ' is empty or contains incorrect sequences (header-only or with non-ACGTN characters)!',
                exit_with_code=1)
    for excluded in excluded_ref_fpaths:
        ref_fpaths.remove(excluded)

    if len(chromosomes_by_refs) > 0:
        logger.main_info('  All references were combined in ' +
                         qconfig.combined_ref_name)
    else:
        logger.warning('All references were skipped!')

    return corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_fpaths