Пример #1
0
def _correct_refrences(ref_fpaths, corrected_dirpath):
    """Split reference FASTA files into per-sequence files plus a combined one.

    Every sequence of every file in ref_fpaths is appended to its own file in
    corrected_dirpath (the path is made unique by
    qutils.unique_corrected_fpath) and to the combined reference file
    COMBINED_REF_FNAME in the same directory.

    Returns a tuple (corrected_ref_fpaths, common_ref_fasta_ext,
    combined_ref_fpath); common_ref_fasta_ext is the extension of the last
    processed reference file ('' when ref_fpaths is empty).
    """
    combined_ref_fpath = os.path.join(corrected_dirpath, COMBINED_REF_FNAME)
    corrected_ref_fpaths = []
    common_ref_fasta_ext = ''

    def correct_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references):
        # Sequences of a multi-sequence reference get the corrected sequence
        # name (first 20 characters only) appended to the file name.
        if total_references > 1:
            name_suffix = '_' + qutils.correct_name(seq_name[:20])
        else:
            name_suffix = ''
        wanted_fpath = os.path.join(corrected_dirpath,
                                    ref_name + name_suffix + ref_fasta_ext)

        corr_seq_fpath = qutils.unique_corrected_fpath(wanted_fpath)
        corr_seq_name = qutils.name_from_fpath(corr_seq_fpath)
        corrected_ref_fpaths.append(corr_seq_fpath)

        # The same record goes to the individual file and to the combined one.
        for out_fpath in (corr_seq_fpath, combined_ref_fpath):
            fastaparser.write_fasta(out_fpath, [(corr_seq_name, seq)], 'a')

        return corr_seq_name

    for ref_fpath in ref_fpaths:
        # First pass: only count the sequences of this reference.
        total_references = sum(1 for _ in fastaparser.read_fasta(ref_fpath))
        is_multi_seq = total_references > 1

        if is_multi_seq:
            logger.info('  ' + ref_fpath + ':')

        ref_name, ref_fasta_ext = qutils.splitext_for_fasta_file(
            os.path.basename(ref_fpath))
        common_ref_fasta_ext = ref_fasta_ext

        # Second pass: write the corrected sequences.
        for seq_name, seq in fastaparser.read_fasta(ref_fpath):
            corr_seq_name = correct_seq(seq_name, seq, ref_name, ref_fasta_ext,
                                        total_references)
            if is_multi_seq:
                logger.info('    ' + corr_seq_name + '\n')
            else:
                logger.info('  ' + ref_fpath + ' ==> ' + corr_seq_name)

    logger.info('  All references combined in ' + COMBINED_REF_FNAME)

    return corrected_ref_fpaths, common_ref_fasta_ext, combined_ref_fpath
Пример #2
0
def gmhmm_p_everyGC(tool_dirpath, fasta_fpath, err_fpath, index, tmp_dirpath):
    """Run `gmhmmp` on the sequences of a FASTA file, grouped by GC content.

    Sequences are distributed over temporary FASTA files named after their GC
    value (clamped to the 30..70 range and rounded down to a multiple of 5).
    Each of these files is then passed to gmhmm_p together with the matching
    'heu_11_<GC>.mod' model from the tool's 'heuristic_mod' directory.

    Returns the list of genes parsed from the outputs of the successful runs.
    The temporary directory (created inside tmp_dirpath) is removed unless
    qconfig.debug is set.
    """
    tool_exec_fpath = os.path.join(tool_dirpath, 'gmhmmp')
    heu_dirpath = os.path.join(tool_dirpath, 'heuristic_mod')

    tmp_dirpath = tempfile.mkdtemp(dir=tmp_dirpath)
    for ind, seq in read_fasta(fasta_fpath):
        gc = min(70, max(30, gc_content(seq)))
        gc = gc - gc % 5  # rounds to a divisible by 5
        current_fname = str(gc) + '.fasta'
        current_fpath = os.path.join(tmp_dirpath, current_fname)
        # Append mode: several sequences may share the same GC bucket.
        with open(current_fpath, 'a') as current_file:
            current_file.write('>' + ind + '\n' + seq + '\n')

    genes = []
    # The built-in next() works on Python 2.6+ and 3.x, while the former
    # `.next()` method exists on Python 2 only. Only the top-level file list
    # is needed; it is taken before any output file is created.
    _, _, fnames = next(os.walk(tmp_dirpath))
    for fname in fnames:
        sub_fasta_fpath = os.path.join(tmp_dirpath, fname)
        out_fpath = sub_fasta_fpath + '.gmhmm'

        # The file name without extension is the GC bucket of its sequences.
        gc_str, ext = os.path.splitext(fname)
        heu_fpath = os.path.join(heu_dirpath, 'heu_11_' + gc_str + '.mod')
        with open(err_fpath, 'a') as err_file:
            ok = gmhmm_p(tool_exec_fpath, sub_fasta_fpath, heu_fpath,
                         out_fpath, err_file, index)
            if ok:
                genes.extend(parse_gmhmm_out(out_fpath))

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    return genes
Пример #3
0
def do(contigs_fpaths, contig_report_fpath_pattern, output_dirpath,
       ref_fpath, arcs=False, similar=False, coverage_hist=None):
    """Draw the alignment plot of the given assemblies against a reference.

    The reference chromosomes are laid out one after another, longest first,
    on a "virtual genome" with a gap of 10% of the total reference length
    between neighbours. The nucmer contig report of every assembly is parsed
    against this layout and the result is handed to draw_alignment_plot.

    Returns whatever draw_alignment_plot returns, or None as soon as one of
    the contig reports cannot be parsed.
    """
    total_genome_size = 0
    reference_chromosomes = {}
    for name, seq in fastaparser.read_fasta(ref_fpath):
        chr_name = name.split()[0]  # first word of the FASTA header
        chr_len = len(seq)
        reference_chromosomes[chr_name] = chr_len
        total_genome_size += chr_len

    virtual_genome_shift = int(0.1 * total_genome_size)
    sorted_ref_names = sorted(reference_chromosomes,
                              key=reference_chromosomes.get, reverse=True)
    sorted_ref_lengths = sorted(reference_chromosomes.values(), reverse=True)

    # Start offset of every chromosome on the virtual genome (plus one
    # trailing entry that lies one gap past the end of the last chromosome).
    cumulative_ref_lengths = [0]
    offset = 0
    for chr_len in sorted_ref_lengths:
        offset += virtual_genome_shift + chr_len
        cumulative_ref_lengths.append(offset)
    virtual_genome_size = cumulative_ref_lengths[-1] - virtual_genome_shift

    lists_of_aligned_blocks = []
    for contigs_fpath in contigs_fpaths:
        report_fpath = contig_report_fpath_pattern % qutils.name_from_fpath(contigs_fpath)
        aligned_blocks = parse_nucmer_contig_report(
            report_fpath, sorted_ref_names, cumulative_ref_lengths)
        if aligned_blocks is None:
            return None
        lists_of_aligned_blocks.append(aligned_blocks)

    return draw_alignment_plot(
        contigs_fpaths, virtual_genome_size, sorted_ref_names, sorted_ref_lengths,
        virtual_genome_shift, output_dirpath, lists_of_aligned_blocks,
        arcs, similar, coverage_hist)
Пример #4
0
def glimmerHMM(tool_dir, fasta_fpath, out_fpath, gene_lengths, err_path,
               tmp_dir, index):
    """Predict genes in every contig of a FASTA file with `glimmerhmm`.

    Each contig is written to its own FASTA file in a fresh temporary
    directory under tmp_dir and processed by a separate `glimmerhmm` run. The
    GFF outputs of the successful runs are merged into
    `out_fpath + '_genes.gff'`.

    Returns (out_gff_path, number of distinct gene sequences, total number of
    genes, cnt), where cnt[i] counts the genes whose `end - start` exceeds
    gene_lengths[i]. Returns (None, None, None, None) when no run succeeded.
    The temporary directory is removed unless qconfig.debug is set.
    """
    def run(contig_path, tmp_path):
        # Both stdout and stderr of the tool are appended to the error log.
        with open(err_path, 'a') as err_file:
            return qutils.call_subprocess(
                [tool_exec, contig_path, '-d', trained_dir, '-g', '-o', tmp_path],
                stdout=err_file,
                stderr=err_file,
                indent='  ' + qutils.index_to_str(index) + '  ')

    tool_exec = os.path.join(tool_dir, 'glimmerhmm')

    # Note: why arabidopsis? for no particular reason, really.
    trained_dir = os.path.join(tool_dir, 'trained', 'arabidopsis')

    contigs = {}
    gffs = []
    base_dir = tempfile.mkdtemp(dir=tmp_dir)
    for ind, seq in read_fasta(fasta_fpath):
        # Contig names become file names: replace path-unfriendly characters.
        ind = re.sub('[/. ]', '_', ind)
        contig_path = os.path.join(base_dir, ind + '.fasta')
        gff_path = os.path.join(base_dir, ind + '.gff')

        write_fasta(contig_path, [(ind, seq)])
        if run(contig_path, gff_path) == 0:
            gffs.append(gff_path)
            contigs[ind] = seq

    if not gffs:
        # Remove the scratch directory on this early exit as well; it used to
        # be left behind whenever every run failed.
        if not qconfig.debug:
            shutil.rmtree(base_dir)
        return None, None, None, None

    out_gff_path = merge_gffs(gffs, out_fpath + '_genes.gff')
    unique, total = set(), 0
    genes = []
    cnt = [0] * len(gene_lengths)
    for contig, gene_id, start, end, strand in parse_gff(out_gff_path):
        total += 1
        # NOTE(review): the slice treats `start` as a 0-based index and `end`
        # as inclusive -- confirm against what parse_gff yields.
        gene_seq = contigs[contig][start:end + 1]
        if strand != '+':
            gene_seq = rev_comp(gene_seq)
        unique.add(gene_seq)  # a set ignores repeated sequences by itself
        genes.append((gene_id, gene_seq))
        for idx, gene_length in enumerate(gene_lengths):
            cnt[idx] += end - start > gene_length

    if OUTPUT_FASTA:
        out_fasta_path = out_fpath + '_genes.fasta'
        write_fasta(out_fasta_path, genes)
    if not qconfig.debug:
        shutil.rmtree(base_dir)

    return out_gff_path, len(unique), total, cnt
Пример #5
0
def _correct_refrences(ref_fpaths, corrected_dirpath):
    """Write each reference sequence to its own file and to a combined file.

    All output goes to corrected_dirpath. For a reference file with a single
    sequence the output file is named after the reference; for a file with
    several sequences the corrected sequence name (first 20 characters) is
    appended. Every sequence is additionally appended to COMBINED_REF_FNAME.

    Returns (corrected_ref_fpaths, common_ref_fasta_ext, combined_ref_fpath),
    where common_ref_fasta_ext is the extension of the last reference file
    processed ('' if there were none).
    """
    corrected_ref_fpaths = []
    common_ref_fasta_ext = ''
    combined_ref_fpath = os.path.join(corrected_dirpath, COMBINED_REF_FNAME)

    def correct_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references):
        fname_parts = [ref_name]
        if total_references > 1:
            fname_parts.append('_' + qutils.correct_name(seq_name[:20]))
        fname_parts.append(ref_fasta_ext)

        corr_seq_fpath = qutils.unique_corrected_fpath(
            os.path.join(corrected_dirpath, ''.join(fname_parts)))
        corr_seq_name = qutils.name_from_fpath(corr_seq_fpath)
        corrected_ref_fpaths.append(corr_seq_fpath)

        # Append to the per-sequence file first, then to the combined file.
        fastaparser.write_fasta(corr_seq_fpath, [(corr_seq_name, seq)], 'a')
        fastaparser.write_fasta(combined_ref_fpath, [(corr_seq_name, seq)], 'a')
        return corr_seq_name

    for ref_fpath in ref_fpaths:
        # The number of sequences decides both the naming and the log format.
        total_references = len([None for _ in fastaparser.read_fasta(ref_fpath)])
        several_seqs = total_references > 1
        if several_seqs:
            logger.info('  ' + ref_fpath + ':')

        ref_fname = os.path.basename(ref_fpath)
        ref_name, ref_fasta_ext = qutils.splitext_for_fasta_file(ref_fname)
        common_ref_fasta_ext = ref_fasta_ext

        for seq_name, seq in fastaparser.read_fasta(ref_fpath):
            corr_seq_name = correct_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references)
            if several_seqs:
                message = '    ' + corr_seq_name + '\n'
            else:
                message = '  ' + ref_fpath + ' ==> ' + corr_seq_name
            logger.info(message)

    logger.info('  All references combined in ' + COMBINED_REF_FNAME)

    return corrected_ref_fpaths, common_ref_fasta_ext, combined_ref_fpath
Пример #6
0
def glimmerHMM(tool_dir, fasta_fpath, out_fpath, gene_lengths, err_path, tmp_dir, index):
    """Predict genes in every contig of a FASTA file with `glimmerhmm`.

    Each contig is written to its own FASTA file in a fresh temporary
    directory under tmp_dir and processed by a separate `glimmerhmm` run. The
    GFF outputs of the successful runs are merged into
    `out_fpath + '_genes.gff'`.

    Returns (out_gff_path, number of distinct gene sequences, total number of
    genes, cnt), where cnt[i] counts the genes whose `end - start` exceeds
    gene_lengths[i]. When no run succeeded, an error is logged and
    (None, None, None, None) is returned. The temporary directory is removed
    unless qconfig.debug is set.
    """
    def run(contig_path, tmp_path):
        # Both stdout and stderr of the tool are appended to the error log.
        with open(err_path, 'a') as err_file:
            return qutils.call_subprocess(
                [tool_exec, contig_path, '-d', trained_dir, '-g', '-o', tmp_path],
                stdout=err_file,
                stderr=err_file,
                indent='  ' + qutils.index_to_str(index) + '  ')

    tool_exec = os.path.join(tool_dir, 'glimmerhmm')

    # Note: why arabidopsis? for no particular reason, really.
    trained_dir = os.path.join(tool_dir, 'trained', 'arabidopsis')

    contigs = {}
    gffs = []
    base_dir = tempfile.mkdtemp(dir=tmp_dir)
    for ind, seq in read_fasta(fasta_fpath):
        contig_path = os.path.join(base_dir, ind + '.fasta')
        gff_path = os.path.join(base_dir, ind + '.gff')

        write_fasta(contig_path, [(ind, seq)])
        if run(contig_path, gff_path) == 0:
            gffs.append(gff_path)
            contigs[ind] = seq

    if not gffs:
        # Format the label into the first part only. `%` binds tighter than
        # `+`, so applying it to the concatenation without parentheses
        # formatted the hint (which has no placeholder) and raised TypeError.
        message = 'Glimmer failed running Glimmer for %s. ' % qutils.label_from_fpath(fasta_fpath)
        if not qconfig.debug:
            message += 'Run with the --debug option to see the command line.'
        logger.error(message)
        # Do not leave the scratch directory behind on this early exit.
        if not qconfig.debug:
            shutil.rmtree(base_dir)
        return None, None, None, None

    out_gff_path = merge_gffs(gffs, out_fpath + '_genes.gff')
    unique, total = set(), 0
    genes = []
    cnt = [0] * len(gene_lengths)
    for contig, gene_id, start, end, strand in parse_gff(out_gff_path):
        total += 1
        # NOTE(review): the slice treats `start` as a 0-based index and `end`
        # as inclusive -- confirm against what parse_gff yields.
        gene_seq = contigs[contig][start:end + 1]
        if strand != '+':
            gene_seq = rev_comp(gene_seq)
        unique.add(gene_seq)  # a set ignores repeated sequences by itself
        genes.append((gene_id, gene_seq))
        for idx, gene_length in enumerate(gene_lengths):
            cnt[idx] += end - start > gene_length

    if OUTPUT_FASTA:
        out_fasta_path = out_fpath + '_genes.fasta'
        write_fasta(out_fasta_path, genes)
    if not qconfig.debug:
        shutil.rmtree(base_dir)

    return out_gff_path, len(unique), total, cnt
Пример #7
0
def do(contigs_fpaths,
       contig_report_fpath_pattern,
       output_dirpath,
       ref_fpath,
       cov_fpath=None,
       arcs=False,
       similar=False,
       coverage_hist=None):
    """Draw the contig alignment plot and, optionally, its HTML data.

    The reference chromosomes are laid out one after another, longest first,
    on a "virtual genome" with a fixed gap of 100 between neighbours. The
    nucmer contig report of every assembly is parsed against this layout,
    every aligned block is labelled with the assembly name, and the result
    is handed to draw_alignment_plot. js_data_gen is called afterwards when
    assemblies were returned and qconfig.create_contig_alignment_html is set.

    Returns the plot path produced by draw_alignment_plot, or None as soon as
    one of the contig reports cannot be parsed.
    """
    make_output_dir(output_dirpath)

    chr_names = []  # chromosome names in the order of the reference file
    reference_chromosomes = {}
    total_genome_size = 0
    for name, seq in fastaparser.read_fasta(ref_fpath):
        chr_name = name.split()[0]  # first word of the FASTA header
        chr_len = len(seq)
        chr_names.append(chr_name)
        reference_chromosomes[chr_name] = chr_len
        total_genome_size += chr_len

    virtual_genome_shift = 100
    sorted_ref_names = sorted(reference_chromosomes,
                              key=reference_chromosomes.get,
                              reverse=True)
    sorted_ref_lengths = sorted(reference_chromosomes.values(), reverse=True)

    # Start offset of every chromosome on the virtual genome.
    cumulative_ref_lengths = [0]
    offset = 0
    for chr_len in sorted_ref_lengths:
        offset += virtual_genome_shift + chr_len
        cumulative_ref_lengths.append(offset)
    virtual_genome_size = cumulative_ref_lengths[-1] - virtual_genome_shift

    lists_of_aligned_blocks = []
    for contigs_fpath in contigs_fpaths:
        report_name = qutils.label_from_fpath_for_fname(contigs_fpath)
        aligned_blocks = parse_nucmer_contig_report(
            contig_report_fpath_pattern % report_name,
            sorted_ref_names,
            cumulative_ref_lengths)
        if aligned_blocks is None:
            return None
        for block in aligned_blocks:
            block.label = qutils.name_from_fpath(contigs_fpath)
        lists_of_aligned_blocks.append(aligned_blocks)

    plot_fpath, assemblies = draw_alignment_plot(
        contigs_fpaths, virtual_genome_size, sorted_ref_names,
        sorted_ref_lengths, virtual_genome_shift, output_dirpath,
        lists_of_aligned_blocks, arcs, similar, coverage_hist)

    if assemblies and qconfig.create_contig_alignment_html:
        js_data_gen(assemblies, contigs_fpaths, chr_names,
                    reference_chromosomes, output_dirpath, cov_fpath,
                    ref_fpath, virtual_genome_size)

    return plot_fpath
Пример #8
0
def _partition_contigs(assemblies, ref_fpaths, corrected_dirpath,
                       alignments_fpath_template):
    """Split the contigs of every assembly by the reference they align to.

    For each assembly the file `alignments_fpath_template % asm.name` is
    read; each of its lines holds a reference name followed by the names of
    the contigs aligned to that reference (whitespace separated). The listed
    contigs are appended to '<asm.name>_to_<ref_name[:40]>.fasta' in
    corrected_dirpath. Contigs not listed on any line are written to
    '<asm.name>_not_aligned_anywhere.fasta'.

    Returns (assemblies_by_ref, not_aligned_assemblies): a dict mapping each
    reference name to the list of per-reference Assembly objects, and the
    list of Assembly objects holding the not aligned contigs.
    """
    not_aligned_assemblies = []
    # array of assemblies for each reference
    assemblies_by_ref = dict((qutils.name_from_fpath(ref_fpath), [])
                             for ref_fpath in ref_fpaths)

    for asm in assemblies:
        not_aligned_fname = asm.name + '_not_aligned_anywhere.fasta'
        not_aligned_fpath = os.path.join(corrected_dirpath, not_aligned_fname)
        contigs = {}
        aligned_contig_names = set()

        with open(alignments_fpath_template % asm.name) as alignments_tsv_f:
            for line in alignments_tsv_f:
                values = line.split()
                if not values:
                    # Blank line: nothing to assign (it used to raise IndexError).
                    continue
                ref_name = values[0]
                # A set keeps the membership test inside the contig loop O(1).
                ref_contigs_names = set(values[1:])
                ref_contigs_fpath = os.path.join(
                    corrected_dirpath,
                    asm.name + '_to_' + ref_name[:40] + '.fasta')

                for (cont_name, seq) in fastaparser.read_fasta(asm.fpath):
                    # The first occurrence of a contig name wins.
                    if cont_name not in contigs:
                        contigs[cont_name] = seq

                    if cont_name in ref_contigs_names:
                        # Collecting all aligned contigs names in order to
                        # further extract the not aligned ones
                        aligned_contig_names.add(cont_name)
                        fastaparser.write_fasta(ref_contigs_fpath,
                                                [(cont_name, seq)], 'a')

                ref_asm = Assembly(ref_contigs_fpath, asm.label)
                assemblies_by_ref[ref_name].append(ref_asm)

        # Extraction of the not aligned contigs
        not_aligned_contigs_names = set(contigs) - aligned_contig_names
        fastaparser.write_fasta(not_aligned_fpath,
                                [(name, contigs[name])
                                 for name in not_aligned_contigs_names])

        not_aligned_asm = Assembly(not_aligned_fpath, asm.label)
        not_aligned_assemblies.append(not_aligned_asm)

    return assemblies_by_ref, not_aligned_assemblies
Пример #9
0
    def hasScaffolds(self, assembler):
        """Tell whether the scaffolds of *assembler* contain an 'N'.

        Only the "spades" assembler is inspected (its scaffolds file is taken
        from self._info); every other assembler yields False. The answer is
        stored in self._has_scaffolds_cache and reused on later calls.
        """
        cache = self._has_scaffolds_cache
        cached = cache.get(assembler)
        if cached is not None:
            return cached

        # any() stops at the first sequence containing an 'N'.
        found = assembler == "spades" and any(
            'N' in seq
            for _, seq in read_fasta(self._info[assembler]['scaffolds']))

        cache[assembler] = found
        return found
Пример #10
0
    def hasScaffolds(self, assembler):
        """Return True if the scaffolds file of *assembler* has an 'N' in it.

        The check is performed for the "spades" assembler only; for any other
        assembler the result is False. Results are memoized per assembler in
        self._has_scaffolds_cache.
        """
        known = self._has_scaffolds_cache.get(assembler)
        if known is not None:
            return known

        if assembler != "spades":
            has_gaps = False
        else:
            scaffolds_fn = self._info[assembler]['scaffolds']
            # Generator inside any(): reading stops at the first hit.
            has_gaps = any('N' in seq for name, seq in read_fasta(scaffolds_fn))

        self._has_scaffolds_cache[assembler] = has_gaps
        return has_gaps
Пример #11
0
def correct_fasta(original_fpath, corrected_fpath, min_contig,
                  is_reference=False):
    """Write a cleaned-up copy of a FASTA file.

    Sequences shorter than min_contig are dropped unless is_reference is set.
    Names are passed through qutils.correct_name, sequences are upper-cased
    and the ambiguity codes M, K, R, Y, W, S, V, B, H, D are replaced by 'N'.

    For a reference longer than qconfig.MAX_REFERENCE_LENGTH every chromosome
    that fits the limit is additionally written to its own file in a
    'splitted_ref' directory next to corrected_fpath and registered in
    qconfig.splitted_ref.

    Returns False (after logging a warning) when a sequence still contains
    characters other than A, C, G, T, N, or when qconfig.splitted_ref is
    empty after splitting an oversized reference; True otherwise.
    """
    # Compiled once instead of once per sequence.
    # correcting alternatives (gage can't work with alternatives)
    ambiguity_codes = re.compile(r'[MKRYWSVBHD]')
    non_acgtn = re.compile(r'[^ACGTN]')

    modified_fasta_entries = []
    for first_line, seq in fastaparser.read_fasta(original_fpath):
        if len(seq) < min_contig and not is_reference:
            continue
        corr_name = qutils.correct_name(first_line)

        # seq to uppercase, because later only uppercase letters are looked at
        corr_seq = ambiguity_codes.sub('N', seq.upper())

        # make sure that only A, C, G, T or N are in the sequence
        if non_acgtn.search(corr_seq):
            logger.warning('Skipping ' + original_fpath + ' because it contains non-ACGTN characters.',
                    indent='    ')
            return False

        modified_fasta_entries.append((corr_name, corr_seq))

    fastaparser.write_fasta(corrected_fpath, modified_fasta_entries)

    if not is_reference:
        return True

    ref_len = sum(len(chr_seq) for (chr_name, chr_seq) in modified_fasta_entries)
    if ref_len <= qconfig.MAX_REFERENCE_LENGTH:
        return True

    _, fasta_ext = os.path.splitext(corrected_fpath)
    splitted_ref_dirpath = os.path.join(os.path.dirname(corrected_fpath), 'splitted_ref')
    # os.makedirs raises if the directory is already there, e.g. when another
    # oversized reference was split into the same directory before.
    if not os.path.isdir(splitted_ref_dirpath):
        os.makedirs(splitted_ref_dirpath)

    for i, (chr_name, chr_seq) in enumerate(modified_fasta_entries):
        if len(chr_seq) > qconfig.MAX_REFERENCE_LENGTH:
            logger.warning("Skipping chromosome " + chr_name + " because it length is greater than " +
                    str(qconfig.MAX_REFERENCE_LENGTH) + " (Nucmer's constraint).")
            continue

        splitted_ref_fpath = os.path.join(splitted_ref_dirpath, "chr_" + str(i + 1)) + fasta_ext
        qconfig.splitted_ref.append(splitted_ref_fpath)
        fastaparser.write_fasta(splitted_ref_fpath, [(chr_name, chr_seq)])

    if not qconfig.splitted_ref:
        logger.warning("Skipping reference because all of its chromosomes exceeded Nucmer's constraint.")
        return False
    return True
Пример #12
0
def correct_fasta(original_fpath, corrected_fpath, min_contig,
                  is_reference=False):
    """Write a cleaned-up copy of a FASTA file.

    Sequences shorter than min_contig are dropped unless is_reference is set.
    Names are passed through qutils.correct_name, sequences are upper-cased
    and the ambiguity codes M, K, R, Y, W, S, V, B, H, D are replaced by 'N'.

    For a reference longer than qconfig.MAX_REFERENCE_LENGTH every chromosome
    that fits the limit is additionally written to its own file in a
    'splitted_ref' directory next to corrected_fpath and registered in
    qconfig.splitted_ref.

    Returns False (after logging a warning) when a sequence still contains
    characters other than A, C, G, T, N, or when qconfig.splitted_ref is
    empty after splitting an oversized reference; True otherwise.
    """
    # Compiled once instead of once per sequence.
    # correcting alternatives (gage can't work with alternatives)
    ambiguity_codes = re.compile(r'[MKRYWSVBHD]')
    non_acgtn = re.compile(r'[^ACGTN]')

    modified_fasta_entries = []
    for first_line, seq in fastaparser.read_fasta(original_fpath):
        if len(seq) < min_contig and not is_reference:
            continue
        corr_name = qutils.correct_name(first_line)

        # seq to uppercase, because later only uppercase letters are looked at
        corr_seq = ambiguity_codes.sub('N', seq.upper())

        # make sure that only A, C, G, T or N are in the sequence
        if non_acgtn.search(corr_seq):
            logger.warning('Skipping ' + original_fpath + ' because it contains non-ACGTN characters.',
                    indent='    ')
            return False

        modified_fasta_entries.append((corr_name, corr_seq))

    fastaparser.write_fasta(corrected_fpath, modified_fasta_entries)

    if not is_reference:
        return True

    ref_len = sum(len(chr_seq) for (chr_name, chr_seq) in modified_fasta_entries)
    if ref_len <= qconfig.MAX_REFERENCE_LENGTH:
        return True

    _, fasta_ext = os.path.splitext(corrected_fpath)
    splitted_ref_dirpath = os.path.join(os.path.dirname(corrected_fpath), 'splitted_ref')
    # os.makedirs raises if the directory is already there, e.g. when another
    # oversized reference was split into the same directory before.
    if not os.path.isdir(splitted_ref_dirpath):
        os.makedirs(splitted_ref_dirpath)

    for i, (chr_name, chr_seq) in enumerate(modified_fasta_entries):
        if len(chr_seq) > qconfig.MAX_REFERENCE_LENGTH:
            logger.warning("Skipping chromosome " + chr_name + " because it length is greater than " +
                    str(qconfig.MAX_REFERENCE_LENGTH) + " (Nucmer's constraint).")
            continue

        splitted_ref_fpath = os.path.join(splitted_ref_dirpath, "chr_" + str(i + 1)) + fasta_ext
        qconfig.splitted_ref.append(splitted_ref_fpath)
        fastaparser.write_fasta(splitted_ref_fpath, [(chr_name, chr_seq)])

    if not qconfig.splitted_ref:
        logger.warning("Skipping reference because all of its chromosomes exceeded Nucmer's constraint.")
        return False
    return True
Пример #13
0
def _partition_contigs(assemblies, ref_fpaths, corrected_dirpath, alignments_fpath_template):
    """Split the contigs of every assembly by the reference they align to.

    For each assembly the file `alignments_fpath_template % asm.name` is
    read; each of its lines holds a reference name followed by the names of
    the contigs aligned to that reference (whitespace separated). The listed
    contigs are appended to '<asm.name>_to_<ref_name[:40]>.fasta' in
    corrected_dirpath. Contigs not listed on any line are written to
    '<asm.name>_not_aligned_anywhere.fasta'.

    Returns (assemblies_by_ref, not_aligned_assemblies): a dict mapping each
    reference name to the list of per-reference Assembly objects, and the
    list of Assembly objects holding the not aligned contigs.
    """
    not_aligned_assemblies = []
    # array of assemblies for each reference
    assemblies_by_ref = dict((qutils.name_from_fpath(ref_fpath), []) for ref_fpath in ref_fpaths)

    for asm in assemblies:
        not_aligned_fname = asm.name + '_not_aligned_anywhere.fasta'
        not_aligned_fpath = os.path.join(corrected_dirpath, not_aligned_fname)
        contigs = {}
        aligned_contig_names = set()

        # `with` closes the alignments file; it used to be left open.
        with open(alignments_fpath_template % asm.name) as alignments_f:
            for line in alignments_f:
                values = line.split()
                if not values:
                    # Blank line: nothing to assign (it used to raise IndexError).
                    continue
                ref_name = values[0]
                # A set keeps the membership test inside the contig loop O(1).
                ref_contigs_names = set(values[1:])
                ref_contigs_fpath = os.path.join(
                    corrected_dirpath, asm.name + '_to_' + ref_name[:40] + '.fasta')

                for (cont_name, seq) in fastaparser.read_fasta(asm.fpath):
                    # The first occurrence of a contig name wins.
                    if cont_name not in contigs:
                        contigs[cont_name] = seq

                    if cont_name in ref_contigs_names:
                        # Collecting all aligned contigs names in order to
                        # further extract the not aligned ones
                        aligned_contig_names.add(cont_name)
                        fastaparser.write_fasta(ref_contigs_fpath, [(cont_name, seq)], 'a')

                ref_asm = Assembly(ref_contigs_fpath, asm.label)
                assemblies_by_ref[ref_name].append(ref_asm)

        # Extraction of the not aligned contigs
        not_aligned_contigs_names = set(contigs) - aligned_contig_names
        fastaparser.write_fasta(not_aligned_fpath, [(name, contigs[name]) for name in not_aligned_contigs_names])

        not_aligned_asm = Assembly(not_aligned_fpath, asm.label)
        not_aligned_assemblies.append(not_aligned_asm)

    return assemblies_by_ref, not_aligned_assemblies
Пример #14
0
def do(contigs_fpaths,
       contig_report_fpath_pattern,
       output_dirpath,
       ref_fpath,
       arcs=False,
       similar=False,
       coverage_hist=None):
    """Draw the alignment plot of the given assemblies against a reference.

    Chromosomes of the reference are placed on a "virtual genome" in order
    of decreasing length, separated by a gap equal to 10% of the total
    reference length. Each assembly's nucmer contig report is parsed against
    that layout and everything is passed on to draw_alignment_plot.

    Returns the value of draw_alignment_plot, or None if any contig report
    could not be parsed.
    """
    reference_chromosomes = dict()
    total_genome_size = 0
    for name, seq in fastaparser.read_fasta(ref_fpath):
        chr_name = name.split()[0]  # first word of the FASTA header
        chr_len = len(seq)
        total_genome_size += chr_len
        reference_chromosomes[chr_name] = chr_len

    virtual_genome_shift = int(0.1 * total_genome_size)
    sorted_ref_names = sorted(reference_chromosomes,
                              key=reference_chromosomes.get,
                              reverse=True)
    sorted_ref_lengths = sorted(reference_chromosomes.values(), reverse=True)

    # Running start offsets of the chromosomes on the virtual genome.
    cumulative_ref_lengths = [0]
    for chr_len in sorted_ref_lengths:
        next_start = cumulative_ref_lengths[-1] + virtual_genome_shift + chr_len
        cumulative_ref_lengths.append(next_start)
    # The gap after the last chromosome does not belong to the genome.
    virtual_genome_size = cumulative_ref_lengths[-1] - virtual_genome_shift

    lists_of_aligned_blocks = []
    for contigs_fpath in contigs_fpaths:
        assembly_name = qutils.name_from_fpath(contigs_fpath)
        aligned_blocks = parse_nucmer_contig_report(
            contig_report_fpath_pattern % assembly_name,
            sorted_ref_names,
            cumulative_ref_lengths)
        if aligned_blocks is None:
            return None
        lists_of_aligned_blocks.append(aligned_blocks)

    return draw_alignment_plot(contigs_fpaths, virtual_genome_size,
                               sorted_ref_names, sorted_ref_lengths,
                               virtual_genome_shift, output_dirpath,
                               lists_of_aligned_blocks, arcs, similar,
                               coverage_hist)
Пример #15
0
def do(contigs_fpaths, contig_report_fpath_pattern, output_dirpath,
       ref_fpath, cov_fpath=None, arcs=False, similar=False, coverage_hist=None):
    """Draw the contig alignment plot and, optionally, its HTML data.

    Chromosomes of the reference are placed on a "virtual genome" in order
    of decreasing length with a fixed gap of 100 between them. Each
    assembly's nucmer contig report is parsed against that layout, its
    blocks are labelled with the assembly name and everything is passed to
    draw_alignment_plot. When that call returns assemblies and
    qconfig.create_contig_alignment_html is set, js_data_gen is run as well.

    Returns the plot path from draw_alignment_plot, or None if any contig
    report could not be parsed.
    """
    make_output_dir(output_dirpath)

    total_genome_size = 0
    chr_names = []  # chromosome names in the order of the reference file
    reference_chromosomes = dict()
    for name, seq in fastaparser.read_fasta(ref_fpath):
        chr_name = name.split()[0]  # first word of the FASTA header
        chr_names.append(chr_name)
        chr_len = len(seq)
        total_genome_size += chr_len
        reference_chromosomes[chr_name] = chr_len

    virtual_genome_shift = 100
    sorted_ref_names = sorted(reference_chromosomes, key=reference_chromosomes.get, reverse=True)
    sorted_ref_lengths = sorted(reference_chromosomes.values(), reverse=True)

    # Running start offsets of the chromosomes on the virtual genome.
    cumulative_ref_lengths = [0]
    for chr_len in sorted_ref_lengths:
        next_start = cumulative_ref_lengths[-1] + virtual_genome_shift + chr_len
        cumulative_ref_lengths.append(next_start)
    # The gap after the last chromosome does not belong to the genome.
    virtual_genome_size = cumulative_ref_lengths[-1] - virtual_genome_shift

    lists_of_aligned_blocks = []
    for contigs_fpath in contigs_fpaths:
        report_fpath = contig_report_fpath_pattern % qutils.label_from_fpath_for_fname(contigs_fpath)
        aligned_blocks = parse_nucmer_contig_report(report_fpath, sorted_ref_names, cumulative_ref_lengths)
        if aligned_blocks is None:
            return None
        for block in aligned_blocks:
            block.label = qutils.name_from_fpath(contigs_fpath)
        lists_of_aligned_blocks.append(aligned_blocks)

    plot_fpath, assemblies = draw_alignment_plot(
        contigs_fpaths, virtual_genome_size, sorted_ref_names, sorted_ref_lengths,
        virtual_genome_shift, output_dirpath, lists_of_aligned_blocks,
        arcs, similar, coverage_hist)

    html_wanted = assemblies and qconfig.create_contig_alignment_html
    if html_wanted:
        js_data_gen(assemblies, contigs_fpaths, chr_names, reference_chromosomes,
                    output_dirpath, cov_fpath, ref_fpath, virtual_genome_size)

    return plot_fpath
Пример #16
0
REF_MARGINS = 300
REF_FNAME = "ref.fa"

if len(sys.argv) != 4:
    # Single-string call form: prints the same text on Python 2 and 3.
    print("Usage: " + sys.argv[0] + " reference pos1 pos2")
    sys.exit(0)

# The two positions may be given in any order; make sure pos1 <= pos2.
pos1, pos2 = sorted((int(sys.argv[2]), int(sys.argv[3])))

# read_fasta returns a list of FASTA entries (tuples: name, seq); only the
# sequence of the first entry is used.
reference = fastaparser.read_fasta(sys.argv[1])[0][1]
pos2 = min(pos2, len(reference))

# Write the region between the positions, extended by REF_MARGINS on both
# sides (clipped to the sequence), to REF_FNAME. `with` closes the file even
# if a write fails.
region_start = max(0, pos1 - 1 - REF_MARGINS)
region_end = min(len(reference), pos2 + REF_MARGINS)
with open(REF_FNAME, 'w') as ref_file:
    ref_file.write(">reference\n")
    ref_file.write(reference[region_start:region_end] + "\n")

# pos1 is converted to a 0-based index; pos2 is used as an exclusive end.
misassembled_site = reference[pos1 - 1:pos2]
kmers = set()

i = pos1 - 1
while i + KMER_SIZE <= pos2:
Пример #17
0
def _correct_references(ref_fpaths, corrected_dirpath):
    """Copy the reference files into corrected_dirpath and combine them.

    Every reference file gets one corrected file (path made unique by
    qutils.unique_corrected_fpath) holding all of its sequences; each
    sequence is renamed to '<corrected file name>_<corrected sequence name,
    first 20 characters>' and is also appended to COMBINED_REF_FNAME.
    References whose file names clash are labelled through
    get_label_from_par_dir_and_fname. Unless qconfig.no_check is set,
    processing of a reference stops at its first sequence that contains
    non-ACGTN characters after upper-casing and replacing ambiguity codes.

    Side effect: contigs_analyzer.ref_labels_by_chromosomes is filled with
    the corrected file name of every written sequence.

    Returns (corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs,
    ref_fpaths), where chromosomes_by_refs maps a reference name to a list
    of (sequence name, sequence length) pairs.
    """
    combined_ref_fpath = os.path.join(corrected_dirpath, COMBINED_REF_FNAME)
    corrected_ref_fpaths = []
    chromosomes_by_refs = {}

    def correct_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references,
                    ref_fpath):
        if total_references > 1:
            # Later sequences of a reference go to the file created for its
            # first sequence.
            corr_seq_fpath = corrected_ref_fpaths[-1]
        else:
            corr_seq_fpath = qutils.unique_corrected_fpath(
                os.path.join(corrected_dirpath, ref_name + ref_fasta_ext))
            corrected_ref_fpaths.append(corr_seq_fpath)

        corr_seq_name = (qutils.name_from_fpath(corr_seq_fpath) + '_' +
                         qutils.correct_name(seq_name[:20]))

        if not qconfig.no_check:
            dic = {'M': 'N', 'K': 'N', 'R': 'N', 'Y': 'N', 'W': 'N',
                   'S': 'N', 'V': 'N', 'B': 'N', 'H': 'N', 'D': 'N'}
            pat = "(%s)" % "|".join(map(re.escape, dic.keys()))
            corr_seq = re.sub(pat, lambda m: dic[m.group()], seq.upper())
            if re.search(r'[^ACGTN]', corr_seq):
                logger.warning('Skipping ' + ref_fpath +
                               ' because it contains non-ACGTN characters.',
                               indent='    ')
                return None, None

        # NOTE(review): the original `seq` is written below, not the
        # upper-cased/N-substituted `corr_seq` -- confirm this is intended.
        record = (corr_seq_name, seq)
        fastaparser.write_fasta(corr_seq_fpath, [record], 'a')
        fastaparser.write_fasta(combined_ref_fpath, [record], 'a')

        contigs_analyzer.ref_labels_by_chromosomes[corr_seq_name] = \
            qutils.name_from_fpath(corr_seq_fpath)
        chromosomes_by_refs[ref_name].append((corr_seq_name, len(seq)))

        return corr_seq_name, corr_seq_fpath

    # Reference names that occur more than once need a more specific label.
    ref_names = [
        qutils.splitext_for_fasta_file(os.path.basename(fpath))[0]
        for fpath in ref_fpaths
    ]
    dupl_ref_names = [
        ref_name for ref_name in ref_names if ref_names.count(ref_name) > 1
    ]

    for ref_fpath in ref_fpaths:
        ref_name, ref_fasta_ext = qutils.splitext_for_fasta_file(
            os.path.basename(ref_fpath))
        if ref_name in dupl_ref_names:
            ref_name = get_label_from_par_dir_and_fname(ref_fpath)

        chromosomes_by_refs[ref_name] = []

        corr_seq_fpath = None
        # seq_number is the 1-based position of the sequence in the file.
        for seq_number, (seq_name, seq) in enumerate(
                fastaparser.read_fasta(ref_fpath), 1):
            corr_seq_name, corr_seq_fpath = correct_seq(
                seq_name, seq, ref_name, ref_fasta_ext, seq_number, ref_fpath)
            if not corr_seq_name:
                break
        if corr_seq_fpath:
            logger.main_info('  ' + ref_fpath + ' ==> ' +
                             qutils.name_from_fpath(corr_seq_fpath))

    logger.main_info('  All references combined in ' + COMBINED_REF_FNAME)

    return corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_fpaths
Пример #18
0
    print("Usage: " + sys.argv[0] + " <input fasta (scaffolds)> (to get stats on sizes of Ns regions)")	
    print("Usage: " + sys.argv[0] + " <input fasta (scaffolds)> <THRESHOLD> <output fasta (contigs)> (to break contigs on Ns regions of size >= THRESHOLD)")	
    sys.exit()

# With exactly three command-line arguments the script breaks scaffolds into
# contigs; otherwise BREAK_SCAFFOLDS stays False.
BREAK_SCAFFOLDS = False
if len(sys.argv) == 4:
    BREAK_SCAFFOLDS = True

# Minimal length of a run of 'N' at which a scaffold is broken
# (second command-line argument); stays None when scaffolds are not broken.
N_NUMBER = None
counter = 0  # not used in the visible part of the script
if BREAK_SCAFFOLDS:
    N_NUMBER = int(sys.argv[2])

sizes_of_Ns_regions = dict()  # not filled in the visible part of the script
new_fasta = []  # (name, sequence) pairs of the contigs cut out so far
for id, (name, seq) in enumerate(fastaparser.read_fasta(sys.argv[1])): 
    i = 0  # position from which the next run of 'N' is searched
    cur_contig_number = 1  # 1-based suffix for the next contig name
    cur_contig_start = 0  # start of the contig being collected
    while (i < len(seq)) and (seq.find("N", i) != -1):
        # [start, end) is the next maximal run of consecutive 'N' characters
        start = seq.find("N", i)
        end = start + 1
        while (end != len(seq)) and (seq[end] == 'N'):
            end += 1        

        # seq[end] is either past the sequence or not an 'N', so the search
        # can resume one position further
        i = end + 1
        if BREAK_SCAFFOLDS and (end - start) >= N_NUMBER:
            # the run is long enough: emit the contig that precedes it, named
            # after the first word of the scaffold name plus a running number
            new_fasta.append((name.split()[0] + "_" + str(cur_contig_number), seq[cur_contig_start:start]))
            cur_contig_number += 1
            cur_contig_start = end
Пример #19
0
import sys
import os
sys.path.append(os.path.join(os.path.abspath(sys.path[0]), '../'))
import libs
from libs import fastaparser

# Script body: for every entry of the input FASTA print the subsequence from
# START to END (1-based, inclusive), optionally reverse-complemented.
if len(sys.argv) <= 3 or len(sys.argv) >= 6:
    print("Returns [reverse-complement] sequence from START to END position from each entry of input fasta")
    print("Usage: " + sys.argv[0] + " <input fasta> <START> <END, -1 for the end> [any string -- optional parameter for reverse-complement]")
    sys.exit()

inp = sys.argv[1]
start = int(sys.argv[2])
end = int(sys.argv[3])
# Any fifth argument, whatever its value, requests the reverse complement.
reverse = len(sys.argv) == 5

for name, seq in fastaparser.read_fasta(inp):
    # Clamp START into [1, len(seq)]. Without the lower bound a START of 0 or
    # less makes the slice index negative, which silently selects characters
    # counted from the END of the sequence instead of from its beginning.
    cur_start = max(1, min(start, len(seq)))
    # END == -1 means "up to the end of the sequence".
    cur_end = len(seq) if end == -1 else min(end, len(seq))

    print(">" + name + "_cropped_" + str(cur_start) + "_" + str(cur_end))
    fragment = seq[cur_start - 1:cur_end]
    print(fastaparser.rev_comp(fragment) if reverse else fragment)

Пример #20
0

# MAIN
# Select contigs from the input FASTA by id. The second argument is either a
# single contig id or the path of a file holding one id per line. Ids and
# FASTA names are compared after normalisation with get_corr_name().
if len(sys.argv) != 3:
    print("Usage: " + sys.argv[0] +
          " <input fasta> <contig id or file with list of contig ids>")
    sys.exit()

if os.path.isfile(sys.argv[2]):
    # Read the ids inside a `with` block so the file handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(sys.argv[2]) as ids_file:
        list_of_ids = [line.strip() for line in ids_file]
else:
    list_of_ids = [sys.argv[2]]

origin_fasta = fastaparser.read_fasta(sys.argv[1])
dict_of_all_contigs = dict()
selected_contigs = []
for (name, seq) in origin_fasta:
    corr_name = get_corr_name(name)
    dict_of_all_contigs[corr_name] = seq

for name in list_of_ids:
    corr_name = get_corr_name(name)
    if corr_name in dict_of_all_contigs:
        # Keep the id exactly as the user wrote it for the output header.
        selected_contigs.append((name, dict_of_all_contigs[corr_name]))
    else:
        # sys.stderr.write produces the same text as the former Python 2
        # `print >> sys.stderr, ...` statement and also runs on Python 3.
        sys.stderr.write("Contig %s (cor name: %s ) not found!\n" % (name, corr_name))

for (name, seq) in selected_contigs:
    print('>' + name)
Пример #21
0
# Number of reference characters kept on each side of the [pos1, pos2] site
# when the excerpt is written to REF_FNAME.
REF_MARGINS = 300
REF_FNAME = "ref.fa"

if len(sys.argv) != 4:
    # Single-string form prints the same text under Python 2 and Python 3.
    print("Usage: " + sys.argv[0] + " reference pos1 pos2")
    sys.exit(0)

pos1 = int(sys.argv[2])
pos2 = int(sys.argv[3])

# Order the two positions so that pos1 <= pos2.
if pos1 > pos2:
    pos1, pos2 = pos2, pos1

# Only the sequence of the first FASTA entry is used. The result of
# read_fasta() is indexed directly, so it must be a list-like of (name, seq).
reference = fastaparser.read_fasta(sys.argv[1])[0][1]
if len(reference) < pos2:
    pos2 = len(reference)

# `with` guarantees the excerpt file is flushed and closed even if a write
# raises.
with open(REF_FNAME, 'w') as ref_file:
    ref_file.write(">reference\n")
    ref_file.write(reference[max(0, pos1 - 1 - REF_MARGINS):min(len(reference), pos2 + REF_MARGINS)] + "\n")

misassembled_site = reference[pos1 - 1:pos2]
kmers = set()

# Collect every substring of length KMER_SIZE that lies entirely inside the
# site (positions are 1-based, hence the `pos1 - 1` starting offset).
i = pos1 - 1
while i + KMER_SIZE <= pos2:
    kmers.add(reference[i:i + KMER_SIZE])
    i += 1
Пример #22
0
def _parallel_correct_contigs(file_counter, contigs_fpath, corrected_dirpath,
                              labels):
    """Choose the corrected path for one assembly and optionally break its scaffolds.

    Args:
        file_counter: index of this assembly; selects its entry in ``labels``.
        contigs_fpath: path of the input FASTA file.
        corrected_dirpath: directory in which output paths are built.
        labels: labels of all assemblies.

    Returns:
        A tuple ``(corr_fpaths, broken_scaffolds, logs)``:
        ``corr_fpaths`` is ``(contigs_fpath, corr_fpath)``;
        ``broken_scaffolds`` is ``(broken_fpath, broken_fpath)`` when
        ``qconfig.scaffolds`` is set and at least one scaffold was cut,
        otherwise ``None``; ``logs`` is a list of message strings collected
        for the caller.
    """
    broken_scaffolds = None
    contigs_fname = os.path.basename(contigs_fpath)
    _, fasta_ext = qutils.splitext_for_fasta_file(contigs_fname)

    label = labels[file_counter]
    corr_fpath = qutils.unique_corrected_fpath(
        os.path.join(corrected_dirpath, label + fasta_ext))
    index_str = qutils.index_to_str(file_counter, force=(len(labels) > 1))
    logs = ['  ' + index_str + '%s ==> %s' % (contigs_fpath, label)]

    # if option --scaffolds is specified QUAST adds split version of assemblies to the comparison
    if qconfig.scaffolds:
        logger.info('  ' + index_str + '  breaking scaffolds into contigs:')
        corr_fpath_wo_ext = os.path.join(corrected_dirpath,
                                         qutils.name_from_fpath(corr_fpath))
        broken_scaffolds_fpath = corr_fpath_wo_ext + '_broken' + fasta_ext
        broken_scaffolds_fasta = []

        # Scaffolds are counted explicitly. Deriving the count from the last
        # enumerate() index reported one scaffold for an empty input file and
        # therefore treated it as "broken".
        scaffolds_total = 0
        contigs_total = 0
        for name, seq in fastaparser.read_fasta(contigs_fpath):
            scaffolds_total += 1
            contig_prefix = name.split()[0] + "_"
            pieces = 1
            piece_start = 0

            gap_start = seq.find('N')
            while gap_start != -1:
                # Extend [gap_start, gap_end) over the whole run of Ns.
                gap_end = gap_start + 1
                while gap_end != len(seq) and seq[gap_end] == 'N':
                    gap_end += 1

                if (gap_end - gap_start) >= qconfig.Ns_break_threshold:
                    broken_scaffolds_fasta.append(
                        (contig_prefix + str(pieces),
                         seq[piece_start:gap_start]))
                    pieces += 1
                    piece_start = gap_end

                # seq[gap_end] is not 'N' (or is past the end), so the search
                # can resume one character later.
                gap_start = seq.find('N', gap_end + 1)

            # The remainder after the last cut (or the whole scaffold).
            broken_scaffolds_fasta.append(
                (contig_prefix + str(pieces), seq[piece_start:]))
            contigs_total += pieces

        if contigs_total != scaffolds_total:
            fastaparser.write_fasta(broken_scaffolds_fpath,
                                    broken_scaffolds_fasta)
            logs.append(
                "  " + index_str +
                "    %d scaffolds (%s) were broken into %d contigs (%s)" %
                (scaffolds_total, label, contigs_total, label + ' broken'))
            broken_scaffolds = (broken_scaffolds_fpath, broken_scaffolds_fpath)
        else:
            logs.append(
                "  " + index_str +
                "    WARNING: nothing was broken, skipping '%s broken' from further analysis"
                % label)

    corr_fpaths = (contigs_fpath, corr_fpath)
    return corr_fpaths, broken_scaffolds, logs
Пример #23
0
Файл: quast.py Проект: ctb/quast
def correct_fasta(original_fpath, corrected_fpath, min_contig,
                  is_reference=False):
    """Write a cleaned copy of a FASTA file and split oversized references.

    Entries shorter than ``min_contig`` are dropped unless ``is_reference``
    is true. Entry names are passed through ``qutils.correct_name``. Unless
    ``qconfig.no_check`` is set, each sequence is upper-cased and the codes
    M, K, R, Y, W, S, V, B, H, D are replaced with N; if any character other
    than A, C, G, T, N remains, the whole file is rejected before anything is
    written.

    For a reference whose total length exceeds
    ``qconfig.MAX_REFERENCE_FILE_LENGTH`` the entries are additionally
    distributed over ``part_<k>`` files inside a freshly created
    ``split_ref`` directory next to ``corrected_fpath``, and the part paths
    are stored in ``qconfig.splitted_ref``.

    Returns:
        ``False`` when the file is rejected because of illegal characters or
        because every chromosome is longer than
        ``qconfig.MAX_REFERENCE_LENGTH``; ``True`` otherwise.
    """
    # Compiled once per call instead of once per sequence.
    # correcting alternatives (gage can't work with alternatives)
    ambiguity_codes = re.compile(r'[MKRYWSVBHD]')
    non_acgtn = re.compile(r'[^ACGTN]')

    modified_fasta_entries = []
    for first_line, seq in fastaparser.read_fasta(original_fpath):
        if len(seq) < min_contig and not is_reference:
            continue
        corr_name = qutils.correct_name(first_line)

        if qconfig.no_check:
            corr_seq = seq
        else:
            # seq to uppercase, because we later looking only uppercase letters
            corr_seq = ambiguity_codes.sub('N', seq.upper())

            # make sure that only A, C, G, T or N are in the sequence
            if non_acgtn.search(corr_seq):
                logger.warning('Skipping ' + original_fpath + ' because it contains non-ACGTN characters.',
                        indent='    ')
                return False
        modified_fasta_entries.append((corr_name, corr_seq))

    fastaparser.write_fasta(corrected_fpath, modified_fasta_entries)

    if not is_reference:
        return True

    ref_len = sum(len(chr_seq) for (chr_name, chr_seq) in modified_fasta_entries)
    if ref_len <= qconfig.MAX_REFERENCE_FILE_LENGTH:
        return True

    qconfig.splitted_ref = []  # important for MetaQUAST which runs QUAST multiple times
    _, fasta_ext = os.path.splitext(corrected_fpath)
    split_ref_dirpath = os.path.join(os.path.dirname(corrected_fpath), 'split_ref')
    # Start from an empty directory: parts are written in append mode below.
    if os.path.exists(split_ref_dirpath):
        shutil.rmtree(split_ref_dirpath, ignore_errors=True)
    os.makedirs(split_ref_dirpath)
    max_len = min(ref_len/qconfig.max_threads, qconfig.MAX_REFERENCE_LENGTH)

    def part_fpath(part_num):
        # Path of the part file with the given 1-based number.
        return os.path.join(split_ref_dirpath, "part_%d" % part_num) + fasta_ext

    cur_part_len = 0
    cur_part_num = 1
    cur_part_fpath = part_fpath(cur_part_num)

    for (chr_name, chr_seq) in modified_fasta_entries:
        cur_chr_len = len(chr_seq)
        if cur_chr_len > qconfig.MAX_REFERENCE_LENGTH:
            logger.warning("Skipping chromosome " + chr_name + " because its length is greater than " +
                    str(qconfig.MAX_REFERENCE_LENGTH) + " (Nucmer's constraint).")
            continue

        cur_part_len += cur_chr_len
        # Open a new part when the current one would overflow, unless this
        # chromosome is the only content of the part.
        if cur_part_len > max_len and cur_part_len != cur_chr_len:
            qconfig.splitted_ref.append(cur_part_fpath)
            cur_part_len = cur_chr_len
            cur_part_num += 1
            cur_part_fpath = part_fpath(cur_part_num)
        fastaparser.write_fasta(cur_part_fpath, [(chr_name, chr_seq)], mode='a')

    if cur_part_len > 0:
        qconfig.splitted_ref.append(cur_part_fpath)
    if not qconfig.splitted_ref:
        logger.warning("Skipping reference because all of its chromosomes exceeded Nucmer's constraint.")
        return False
    return True
Пример #24
0
Файл: quast.py Проект: ctb/quast
def _parallel_correct_contigs(file_counter, contigs_fpath, corrected_dirpath, labels):
    """Choose the corrected path for one assembly and optionally break its scaffolds.

    Args:
        file_counter: index of this assembly; selects its entry in ``labels``.
        contigs_fpath: path of the input FASTA file.
        corrected_dirpath: directory in which output paths are built.
        labels: labels of all assemblies.

    Returns:
        A tuple ``(corr_fpaths, broken_scaffolds, logs)``:
        ``corr_fpaths`` is ``(contigs_fpath, corr_fpath)``;
        ``broken_scaffolds`` is ``(broken_fpath, broken_fpath)`` when
        ``qconfig.scaffolds`` is set and at least one scaffold was cut,
        otherwise ``None``; ``logs`` is a list of message strings collected
        for the caller.
    """
    broken_scaffolds = None
    contigs_fname = os.path.basename(contigs_fpath)
    _, fasta_ext = qutils.splitext_for_fasta_file(contigs_fname)

    label = labels[file_counter]
    corr_fpath = qutils.unique_corrected_fpath(os.path.join(corrected_dirpath, label + fasta_ext))
    index_str = qutils.index_to_str(file_counter, force=(len(labels) > 1))
    logs = ['  ' + index_str + '%s ==> %s' % (contigs_fpath, label)]

    # if option --scaffolds is specified QUAST adds split version of assemblies to the comparison
    if qconfig.scaffolds:
        logger.info('  ' + index_str + '  breaking scaffolds into contigs:')
        corr_fpath_wo_ext = os.path.join(corrected_dirpath, qutils.name_from_fpath(corr_fpath))
        broken_scaffolds_fpath = corr_fpath_wo_ext + '_broken' + fasta_ext
        broken_scaffolds_fasta = []

        # Scaffolds are counted explicitly. Deriving the count from the last
        # enumerate() index reported one scaffold for an empty input file and
        # therefore treated it as "broken".
        scaffolds_total = 0
        contigs_total = 0
        for name, seq in fastaparser.read_fasta(contigs_fpath):
            scaffolds_total += 1
            contig_prefix = name.split()[0] + "_"
            pieces = 1
            piece_start = 0

            gap_start = seq.find('N')
            while gap_start != -1:
                # Extend [gap_start, gap_end) over the whole run of Ns.
                gap_end = gap_start + 1
                while gap_end != len(seq) and seq[gap_end] == 'N':
                    gap_end += 1

                if (gap_end - gap_start) >= qconfig.Ns_break_threshold:
                    broken_scaffolds_fasta.append(
                        (contig_prefix + str(pieces), seq[piece_start:gap_start]))
                    pieces += 1
                    piece_start = gap_end

                # seq[gap_end] is not 'N' (or is past the end), so the search
                # can resume one character later.
                gap_start = seq.find('N', gap_end + 1)

            # The remainder after the last cut (or the whole scaffold).
            broken_scaffolds_fasta.append((contig_prefix + str(pieces), seq[piece_start:]))
            contigs_total += pieces

        if contigs_total != scaffolds_total:
            fastaparser.write_fasta(broken_scaffolds_fpath, broken_scaffolds_fasta)
            logs.append("  " + index_str +
                        "    %d scaffolds (%s) were broken into %d contigs (%s)" %
                        (scaffolds_total,
                         label,
                         contigs_total,
                         label + ' broken'))
            broken_scaffolds = (broken_scaffolds_fpath, broken_scaffolds_fpath)
        else:
            logs.append("  " + index_str +
                    "    WARNING: nothing was broken, skipping '%s broken' from further analysis" % label)

    corr_fpaths = (contigs_fpath, corr_fpath)
    return corr_fpaths, broken_scaffolds, logs
Пример #25
0
def correct_fasta(original_fpath,
                  corrected_fpath,
                  min_contig,
                  is_reference=False):
    """Write a cleaned copy of a FASTA file and split oversized references.

    Entries shorter than ``min_contig`` are dropped unless ``is_reference``
    is true. Entry names are passed through ``qutils.correct_name``. Unless
    ``qconfig.no_check`` is set, each sequence is upper-cased and the codes
    M, K, R, Y, W, S, V, B, H, D are replaced with N; if any character other
    than A, C, G, T, N remains, the whole file is rejected before anything is
    written.

    For a reference whose total length exceeds
    ``qconfig.MAX_REFERENCE_FILE_LENGTH`` the entries are additionally
    distributed over ``part_<k>`` files inside a freshly created
    ``split_ref`` directory next to ``corrected_fpath``, and the part paths
    are stored in ``qconfig.splitted_ref``.

    Returns:
        ``False`` when the file is rejected because of illegal characters or
        because every chromosome is longer than
        ``qconfig.MAX_REFERENCE_LENGTH``; ``True`` otherwise.
    """
    # Compiled once per call instead of once per sequence.
    # correcting alternatives (gage can't work with alternatives)
    ambiguity_codes = re.compile(r'[MKRYWSVBHD]')
    non_acgtn = re.compile(r'[^ACGTN]')

    modified_fasta_entries = []
    for first_line, seq in fastaparser.read_fasta(original_fpath):
        if len(seq) < min_contig and not is_reference:
            continue
        corr_name = qutils.correct_name(first_line)

        if qconfig.no_check:
            corr_seq = seq
        else:
            # seq to uppercase, because we later looking only uppercase letters
            corr_seq = ambiguity_codes.sub('N', seq.upper())

            # make sure that only A, C, G, T or N are in the sequence
            if non_acgtn.search(corr_seq):
                logger.warning(
                    'Skipping ' + original_fpath +
                    ' because it contains non-ACGTN characters.',
                    indent='    ')
                return False
        modified_fasta_entries.append((corr_name, corr_seq))

    fastaparser.write_fasta(corrected_fpath, modified_fasta_entries)

    if not is_reference:
        return True

    ref_len = sum(
        len(chr_seq) for (chr_name, chr_seq) in modified_fasta_entries)
    if ref_len <= qconfig.MAX_REFERENCE_FILE_LENGTH:
        return True

    # important for MetaQUAST which runs QUAST multiple times
    qconfig.splitted_ref = []
    _, fasta_ext = os.path.splitext(corrected_fpath)
    split_ref_dirpath = os.path.join(os.path.dirname(corrected_fpath),
                                     'split_ref')
    # Start from an empty directory: parts are written in append mode below.
    if os.path.exists(split_ref_dirpath):
        shutil.rmtree(split_ref_dirpath, ignore_errors=True)
    os.makedirs(split_ref_dirpath)
    max_len = min(ref_len / qconfig.max_threads,
                  qconfig.MAX_REFERENCE_LENGTH)

    def part_fpath(part_num):
        # Path of the part file with the given 1-based number.
        return os.path.join(split_ref_dirpath,
                            "part_%d" % part_num) + fasta_ext

    cur_part_len = 0
    cur_part_num = 1
    cur_part_fpath = part_fpath(cur_part_num)

    for (chr_name, chr_seq) in modified_fasta_entries:
        cur_chr_len = len(chr_seq)
        if cur_chr_len > qconfig.MAX_REFERENCE_LENGTH:
            logger.warning("Skipping chromosome " + chr_name +
                           " because its length is greater than " +
                           str(qconfig.MAX_REFERENCE_LENGTH) +
                           " (Nucmer's constraint).")
            continue

        cur_part_len += cur_chr_len
        # Open a new part when the current one would overflow, unless this
        # chromosome is the only content of the part.
        if cur_part_len > max_len and cur_part_len != cur_chr_len:
            qconfig.splitted_ref.append(cur_part_fpath)
            cur_part_len = cur_chr_len
            cur_part_num += 1
            cur_part_fpath = part_fpath(cur_part_num)
        fastaparser.write_fasta(cur_part_fpath, [(chr_name, chr_seq)],
                                mode='a')

    if cur_part_len > 0:
        qconfig.splitted_ref.append(cur_part_fpath)
    if not qconfig.splitted_ref:
        logger.warning(
            "Skipping reference because all of its chromosomes exceeded Nucmer's constraint."
        )
        return False
    return True
Пример #26
0
def _correct_references(ref_fpaths, corrected_dirpath):
    """Copy every reference into ``corrected_dirpath`` and build a combined file.

    Each sequence of each reference is appended, under a new name, to a
    per-reference file and to the combined reference file. Unless
    ``qconfig.no_check`` is set, processing of a reference stops at its first
    sequence containing a character that is neither A, C, G, T, N nor one of
    the codes M, K, R, Y, W, S, V, B, H, D (case-insensitive).

    Args:
        ref_fpaths: paths of the input reference FASTA files.
        corrected_dirpath: directory that receives the output files.

    Returns:
        A tuple ``(corrected_ref_fpaths, combined_ref_fpath,
        chromosomes_by_refs, ref_fpaths)``. ``chromosomes_by_refs`` maps a
        reference name to a list of ``(sequence name, length)`` pairs;
        ``ref_fpaths`` is returned unchanged.
    """
    corrected_ref_fpaths = []

    combined_ref_fpath = os.path.join(corrected_dirpath, COMBINED_REF_FNAME)

    chromosomes_by_refs = {}

    # Compiled once instead of rebuilding a dict and pattern per sequence.
    # A character is illegal if, after upper-casing, it is neither ACGTN nor
    # one of the codes the check tolerates.
    illegal_char = re.compile(r'[^ACGTNMKRYWSVBHD]')

    def correct_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references, ref_fpath):
        # total_references is the 1-based index of this sequence within its
        # reference: the first sequence picks the output path, the following
        # ones reuse the path registered for the first.
        is_first_seq = total_references <= 1
        if is_first_seq:
            corr_seq_fpath = qutils.unique_corrected_fpath(
                os.path.join(corrected_dirpath, ref_name + ref_fasta_ext))
        else:
            corr_seq_fpath = corrected_ref_fpaths[-1]

        corr_seq_name = qutils.name_from_fpath(corr_seq_fpath)
        corr_seq_name += '_' + qutils.correct_name(seq_name[:20])

        if not qconfig.no_check and illegal_char.search(seq.upper()):
            logger.warning('Skipping ' + ref_fpath + ' because it contains non-ACGTN characters.',
                    indent='    ')
            return None, None

        # Register the output path only after validation. Previously it was
        # appended first, so a reference rejected on its first sequence left
        # a path to a never-written file in the returned list.
        if is_first_seq:
            corrected_ref_fpaths.append(corr_seq_fpath)

        # NOTE(review): the sequence is written as read, not upper-cased or
        # with ambiguity codes replaced -- confirm this is intended.
        fastaparser.write_fasta(corr_seq_fpath, [(corr_seq_name, seq)], 'a')
        fastaparser.write_fasta(combined_ref_fpath, [(corr_seq_name, seq)], 'a')

        contigs_analyzer.ref_labels_by_chromosomes[corr_seq_name] = qutils.name_from_fpath(corr_seq_fpath)
        chromosomes_by_refs[ref_name].append((corr_seq_name, len(seq)))

        return corr_seq_name, corr_seq_fpath

    # Names shared by several input files, found in one pass instead of
    # calling list.count() for every name.
    seen_ref_names = set()
    dupl_ref_names = set()
    for ref_fpath in ref_fpaths:
        ref_name, _ = qutils.splitext_for_fasta_file(os.path.basename(ref_fpath))
        if ref_name in seen_ref_names:
            dupl_ref_names.add(ref_name)
        seen_ref_names.add(ref_name)

    for ref_fpath in ref_fpaths:
        total_references = 0
        ref_fname = os.path.basename(ref_fpath)
        ref_name, ref_fasta_ext = qutils.splitext_for_fasta_file(ref_fname)
        if ref_name in dupl_ref_names:
            # Disambiguate files that share a base name.
            ref_name = get_label_from_par_dir_and_fname(ref_fpath)

        chromosomes_by_refs[ref_name] = []

        corr_seq_fpath = None
        for seq_name, seq in fastaparser.read_fasta(ref_fpath):
            total_references += 1
            corr_seq_name, corr_seq_fpath = correct_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references, ref_fpath)
            if not corr_seq_name:
                # NOTE(review): sequences of this reference written before
                # the rejected one stay in the per-reference and combined
                # files.
                break
        if corr_seq_fpath:
            logger.main_info('  ' + ref_fpath + ' ==> ' + qutils.name_from_fpath(corr_seq_fpath) + '')

    logger.main_info('  All references combined in ' + COMBINED_REF_FNAME)

    return corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_fpaths
Пример #27
0
def _correct_contigs(contigs_fpaths, corrected_dirpath, reporting, labels):
    """Correct every assembly and, with ``qconfig.scaffolds``, add a broken copy.

    Args:
        contigs_fpaths: paths of the input assembly FASTA files.
        corrected_dirpath: directory in which output paths are built.
        reporting: passed through to ``_handle_fasta``.
        labels: one label per entry of ``contigs_fpaths``.

    Returns:
        The list of output paths for which ``_handle_fasta`` returned a true
        value; the broken-scaffolds file of an assembly, when present,
        precedes the assembly's own corrected file.
    """
    ## removing from contigs' names special characters because:
    ## 1) Some embedded tools can fail on some strings with "...", "+", "-", etc
    ## 2) Nucmer fails on names like "contig 1_bla_bla", "contig 2_bla_bla" (it interprets as a contig's name only the first word of caption and gets ambiguous contigs names)
    corrected_contigs_fpaths = []

    for file_idx, contigs_fpath in enumerate(contigs_fpaths):
        contigs_fname = os.path.basename(contigs_fpath)
        _, fasta_ext = qutils.splitext_for_fasta_file(contigs_fname)

        label = labels[file_idx]
        corr_fpath = qutils.unique_corrected_fpath(os.path.join(corrected_dirpath, label + fasta_ext))
        qconfig.assembly_labels_by_fpath[corr_fpath] = label
        logger.info('  %s ==> %s' % (contigs_fpath, label))

        # if option --scaffolds is specified QUAST adds splitted version of assemblies to the comparison
        if qconfig.scaffolds:
            logger.info("  breaking scaffolds into contigs:")
            corr_fpath_wo_ext = os.path.join(corrected_dirpath, qutils.name_from_fpath(corr_fpath))
            broken_scaffolds_fpath = corr_fpath_wo_ext + '_broken' + fasta_ext
            broken_scaffolds_fasta = []

            # Dedicated counters: the former code reused `i` as file index,
            # scaffold index and scan position, so the logged number of
            # scaffolds was really an offset inside the last sequence.
            scaffolds_counter = 0
            contigs_counter = 0
            for name, seq in fastaparser.read_fasta(contigs_fpath):
                scaffolds_counter += 1
                contig_prefix = name.split()[0] + "_"
                cur_contig_number = 1
                cur_contig_start = 0

                gap_start = seq.find("N")
                while gap_start != -1:
                    # Extend [gap_start, gap_end) over the whole run of Ns.
                    gap_end = gap_start + 1
                    while gap_end != len(seq) and seq[gap_end] == 'N':
                        gap_end += 1

                    if (gap_end - gap_start) >= qconfig.Ns_break_threshold:
                        broken_scaffolds_fasta.append(
                            (contig_prefix + str(cur_contig_number),
                             seq[cur_contig_start:gap_start]))
                        cur_contig_number += 1
                        cur_contig_start = gap_end

                    # seq[gap_end] is not 'N' (or is past the end), so the
                    # search can resume one character later.
                    gap_start = seq.find("N", gap_end + 1)

                # The remainder after the last cut (or the whole scaffold).
                broken_scaffolds_fasta.append(
                    (contig_prefix + str(cur_contig_number),
                     seq[cur_contig_start:]))

                contigs_counter += cur_contig_number

            fastaparser.write_fasta(broken_scaffolds_fpath, broken_scaffolds_fasta)
            qconfig.assembly_labels_by_fpath[broken_scaffolds_fpath] = label + ' broken'
            logger.info("      %d scaffolds (%s) were broken into %d contigs (%s)" %
                        (scaffolds_counter,
                         qutils.name_from_fpath(corr_fpath),
                         contigs_counter,
                         qutils.name_from_fpath(broken_scaffolds_fpath)))

            if _handle_fasta(broken_scaffolds_fpath, broken_scaffolds_fpath, reporting):
                corrected_contigs_fpaths.append(broken_scaffolds_fpath)
                qconfig.list_of_broken_scaffolds.append(qutils.name_from_fpath(broken_scaffolds_fpath))

        if _handle_fasta(contigs_fpath, corr_fpath, reporting):
            corrected_contigs_fpaths.append(corr_fpath)

    return corrected_contigs_fpaths
Пример #28
0
def _correct_contigs(contigs_fpaths, corrected_dirpath, reporting, labels):
    """Correct every assembly and, with ``qconfig.scaffolds``, add a broken copy.

    Args:
        contigs_fpaths: paths of the input assembly FASTA files.
        corrected_dirpath: directory in which output paths are built.
        reporting: passed through to ``_handle_fasta``.
        labels: one label per entry of ``contigs_fpaths``.

    Returns:
        The list of output paths for which ``_handle_fasta`` returned a true
        value; the broken-scaffolds file of an assembly, when present,
        precedes the assembly's own corrected file.
    """
    ## removing from contigs' names special characters because:
    ## 1) Some embedded tools can fail on some strings with "...", "+", "-", etc
    ## 2) Nucmer fails on names like "contig 1_bla_bla", "contig 2_bla_bla" (it interprets as a contig's name only the first word of caption and gets ambiguous contigs names)
    corrected_contigs_fpaths = []

    for file_idx, contigs_fpath in enumerate(contigs_fpaths):
        contigs_fname = os.path.basename(contigs_fpath)
        _, fasta_ext = qutils.splitext_for_fasta_file(contigs_fname)

        label = labels[file_idx]
        corr_fpath = qutils.unique_corrected_fpath(os.path.join(corrected_dirpath, label + fasta_ext))
        qconfig.assembly_labels_by_fpath[corr_fpath] = label
        logger.info('  %s ==> %s' % (contigs_fpath, label))

        # if option --scaffolds is specified QUAST adds splitted version of assemblies to the comparison
        if qconfig.scaffolds:
            logger.info("  breaking scaffolds into contigs:")
            corr_fpath_wo_ext = os.path.join(corrected_dirpath, qutils.name_from_fpath(corr_fpath))
            broken_scaffolds_fpath = corr_fpath_wo_ext + '_broken' + fasta_ext
            broken_scaffolds_fasta = []

            # Dedicated counters: the former code reused `i` as file index,
            # scaffold index and scan position, so the logged number of
            # scaffolds was really an offset inside the last sequence.
            scaffolds_counter = 0
            contigs_counter = 0
            for name, seq in fastaparser.read_fasta(contigs_fpath):
                scaffolds_counter += 1
                contig_prefix = name.split()[0] + "_"
                cur_contig_number = 1
                cur_contig_start = 0

                gap_start = seq.find("N")
                while gap_start != -1:
                    # Extend [gap_start, gap_end) over the whole run of Ns.
                    gap_end = gap_start + 1
                    while gap_end != len(seq) and seq[gap_end] == 'N':
                        gap_end += 1

                    if (gap_end - gap_start) >= qconfig.Ns_break_threshold:
                        broken_scaffolds_fasta.append(
                            (contig_prefix + str(cur_contig_number),
                             seq[cur_contig_start:gap_start]))
                        cur_contig_number += 1
                        cur_contig_start = gap_end

                    # seq[gap_end] is not 'N' (or is past the end), so the
                    # search can resume one character later.
                    gap_start = seq.find("N", gap_end + 1)

                # The remainder after the last cut (or the whole scaffold).
                broken_scaffolds_fasta.append(
                    (contig_prefix + str(cur_contig_number),
                     seq[cur_contig_start:]))

                contigs_counter += cur_contig_number

            fastaparser.write_fasta(broken_scaffolds_fpath, broken_scaffolds_fasta)
            qconfig.assembly_labels_by_fpath[broken_scaffolds_fpath] = label + ' broken'
            logger.info("      %d scaffolds (%s) were broken into %d contigs (%s)" %
                        (scaffolds_counter,
                         qutils.name_from_fpath(corr_fpath),
                         contigs_counter,
                         qutils.name_from_fpath(broken_scaffolds_fpath)))

            if _handle_fasta(broken_scaffolds_fpath, broken_scaffolds_fpath, reporting):
                corrected_contigs_fpaths.append(broken_scaffolds_fpath)
                qconfig.list_of_broken_scaffolds.append(qutils.name_from_fpath(broken_scaffolds_fpath))

        if _handle_fasta(contigs_fpath, corr_fpath, reporting):
            corrected_contigs_fpaths.append(corr_fpath)

    return corrected_contigs_fpaths
Пример #29
0
    return qutils.correct_name(name)
    # return re.sub(r'\W', '', re.sub(r'\s', '_', name))

# MAIN
# Select contigs from the input FASTA by id. The second argument is either a
# single contig id or the path of a file holding one id per line. Ids and
# FASTA names are compared after normalisation with get_corr_name().
if len(sys.argv) != 3:
    print("Usage: " + sys.argv[0] + " <input fasta> <contig id or file with list of contig ids>")
    sys.exit()

if os.path.isfile(sys.argv[2]):
    # Read the ids inside a `with` block so the file handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(sys.argv[2]) as ids_file:
        list_of_ids = [line.strip() for line in ids_file]
else:
    list_of_ids = [sys.argv[2]]

origin_fasta = fastaparser.read_fasta(sys.argv[1])
dict_of_all_contigs = dict()
selected_contigs = []
for (name, seq) in origin_fasta:
    corr_name = get_corr_name(name)
    dict_of_all_contigs[corr_name] = seq

for name in list_of_ids:
    corr_name = get_corr_name(name)
    if corr_name in dict_of_all_contigs:
        # Keep the id exactly as the user wrote it for the output header.
        selected_contigs.append((name, dict_of_all_contigs[corr_name]))
    else:
        # sys.stderr.write produces the same text as the former Python 2
        # `print >> sys.stderr, ...` statement and also runs on Python 3.
        sys.stderr.write("Contig %s (cor name: %s ) not found!\n" % (name, corr_name))

for (name, seq) in selected_contigs:
    print('>' + name)