Пример #1
0
def dbgtilesummary(dbg_path, tiles, orfs, output):
    """Compute summary statistics on a de Bruijn graph (DBG)

    The output is written out as a YAML. This operation computes statistics of
    an ORF set and a tile set relative to a DBG, which is why it requires those
    files too.

    """
    try:
        import networkx as nx

        from pepsyn.dbg import dbg_stats
    except ImportError:
        print("dbgtilesummary requires NetworkX", file=sys.stderr)
        raise Abort()

    # SECURITY: the graph file is a pickle. Unpickling can execute arbitrary
    # code, so only graphs from trusted sources should be loaded here.
    if hasattr(nx, "read_gpickle"):
        dbg = nx.read_gpickle(dbg_path)
    else:
        # NetworkX 3.0 removed read_gpickle; its documented replacement is a
        # plain pickle.load on the file opened in binary mode.
        import pickle

        with open(dbg_path, "rb") as graph_handle:
            dbg = pickle.load(graph_handle)

    # Only the sequences are needed; names and qualities are discarded.
    with open(orfs, "r") as ip:
        orf_seqs = [seq for (_name, seq, _qual) in readfq(ip)]
    with open(tiles, "r") as ip:
        tile_seqs = [seq for (_name, seq, _qual) in readfq(ip)]

    stats = dbg_stats(dbg, orf_seqs, tile_seqs)

    print(yaml.dump(stats), file=output)
Пример #2
0
def tilesummary(tiles, orfs, output):
    """Compute summary statistics on a set of peptide tiles

    The statistics are calculated with respect to a set of ORFs, which may be
    either raw or cleaned. The result is dumped as YAML to OUTPUT.

    """

    def load_name_to_seq(path):
        # Map each record name to its sequence; a repeated name keeps the
        # sequence of the last record carrying it.
        with open(path, "r") as handle:
            return {title: residues for (title, residues, _) in readfq(handle)}

    # ORFs are read before tiles.
    orf_by_name = load_name_to_seq(orfs)
    tile_by_name = load_name_to_seq(tiles)
    summary = tile_stats(orf_by_name, tile_by_name)
    print(yaml.dump(summary), file=output)
Пример #3
0
def disambiguateaa(input, output):
    """Replace IUPAC ambiguous amino acids with unambiguous ones

    Specifically, make the following replacements:
    B => DN
    X => ACDEFGHIKLMNPQRSTVWY
    Z => EQ
    J => LI
    U => C (selenocysteine)
    O => K (pyrrolysine)

    If there are multiple possible replacements, this operation will output a
    sequence for each possible option. Use caution with sequences that are
    highly ambiguous (e.g., with many Xs), as in this case a single sequence
    could lead to an explosion in the output.

    INPUT and OUTPUT are paths to fasta files or "-" to specify STDIN/STDOUT.

    """
    for (name, ambig, qual) in readfq(input):
        n = num_disambiguated_iupac_aa(ambig)
        # Zero-pad the running index to the width of the largest index so the
        # generated names sort in order.
        digits = floor(log10(n)) + 1
        for (i, unambig) in enumerate(disambiguate_iupac_aa(ambig)):
            if n > 1:
                # Format the index directly rather than passing the record
                # name through str.format: a name containing "{" or "}" would
                # otherwise be parsed as a replacement field.
                output_title = f"{name}|disambig_{i + 1:0{digits}d}"
            else:
                # A single unambiguous result keeps the original name.
                output_title = name
            print(f">{output_title}\n{unambig}", file=output)
Пример #4
0
def fasta_handle_to_dbg(fasta_handle, k, tqdm=None, ignore_short=False):
    """Delegate to ``name_seq_pairs_to_dbg`` using records read by ``readfq``.

    Each record read from ``fasta_handle`` is reduced to a ``(name, seq)``
    pair (the third field is dropped). The lazy stream of pairs is forwarded
    together with ``k``, ``tqdm`` and ``ignore_short``, and the callee's
    result is returned unchanged.
    """
    records = readfq(fasta_handle)
    name_seq_pairs = ((title, residues) for (title, residues, _) in records)
    return name_seq_pairs_to_dbg(name_seq_pairs, k, tqdm, ignore_short)
Пример #5
0
def suffix(input, output, suffix):
    """Append a fixed suffix to every sequence

    INPUT and OUTPUT are paths to fasta files or "-" to specify STDIN/STDOUT.

    """
    for title, residues, _ in readfq(input):
        # Names are passed through untouched; only the sequence grows.
        print(f">{title}\n{residues + suffix}", file=output)
Пример #6
0
def clip(input, output, left, right):
    """Clip/truncate bases from the ends of each sequence

    LEFT bases are removed from the start and RIGHT bases from the end of
    every sequence. Clipping more bases than a sequence contains yields an
    empty sequence (the record is still written).

    INPUT and OUTPUT are paths to fasta files or "-" to specify STDIN/STDOUT.

    """
    for (name, seq, qual) in readfq(input):
        # Clamp at 0: a negative stop would be interpreted by the slice as an
        # offset from the end and return bases that should have been clipped.
        stop = max(len(seq) - right, 0)
        print(f">{name}\n{seq[left:stop]}", file=output)
Пример #7
0
def filterstop(input, output):
    """Drop every input sequence that contains a stop codon (*).

    INPUT and OUTPUT are paths to fasta files or "-" to specify STDIN/STDOUT.

    """
    for title, residues, _ in readfq(input):
        # Only stop-free sequences are written back out, unchanged.
        if "*" not in residues:
            print(f">{title}\n{residues}", file=output)
Пример #8
0
def orfsummary(input, output):
    """Compute summary statistics on a set of ORFs

    Works on raw as well as cleaned ORFs. The statistics are written to
    OUTPUT as YAML.

    INPUT and OUTPUT are paths to fasta files or "-" to specify STDIN/STDOUT.

    """
    name_to_seq = {}
    for title, residues, _ in readfq(input):
        # A repeated name keeps the sequence of its last record.
        name_to_seq[title] = residues
    print(yaml.dump(orf_stats(name_to_seq)), file=output)
Пример #9
0
def stripstop(input, output):
    """Remove trailing stop "codons" from input protein sequences.

    A stop codon is assumed to be written as "*"; every "*" at the end of a
    sequence is removed, while any "*" elsewhere is left in place.

    INPUT and OUTPUT are paths to fasta files or "-" to specify STDIN/STDOUT.

    """
    for title, residues, _ in readfq(input):
        trimmed = residues.rstrip("*")
        print(f">{title}\n{trimmed}", file=output)
Пример #10
0
def x2ggsg(input, output):
    """Replace stretches of Xs with Serine-Glycine linker (in a GGSG pattern)

    Records whose sequence was changed get "|withGSlinker" appended to their
    name; unchanged records keep their original name.

    INPUT and OUTPUT are paths to fasta files or "-" to specify STDIN/STDOUT.

    """
    for title, original, _ in readfq(input):
        linked = x_to_ggsg(original)
        # Tag the header only when the helper actually altered the sequence.
        header = title if linked == original else f"{title}|withGSlinker"
        print(f">{header}\n{linked}", file=output)
Пример #11
0
def filterlen(input, output, min_len, max_len):
    """Keep only sequences whose length lies within the given bounds.

    Both bounds are inclusive. When no maximum length is given, there is no
    upper limit.

    INPUT and OUTPUT are paths to fasta files or "-" to specify STDIN/STDOUT.

    """
    # A missing maximum is treated as "unbounded".
    upper = inf if max_len is None else max_len
    for title, residues, _ in readfq(input):
        if min_len <= len(residues) <= upper:
            print(f">{title}\n{residues}", file=output)
Пример #12
0
def uniq(input, output):
    """Filter out duplicate sequences.

    Records are compared by sequence only; names are ignored. For each
    distinct sequence the first record encountered is the one kept. The whole
    input is held in RAM before anything is written.

    INPUT and OUTPUT are paths to fasta files or "-" to specify STDIN/STDOUT.

    """
    first_name_for = {}
    for title, residues, _ in readfq(input):
        # setdefault leaves an existing entry alone, so the first name wins.
        first_name_for.setdefault(residues, title)
    for residues, title in first_name_for.items():
        print(f">{title}\n{residues}", file=output)
Пример #13
0
def pad(input, output, length, nterm):
    """Pad protein sequences to a specified length with linker residues.

    Padding is delegated to ``pad_ggsg`` and is applied at the N-terminus when
    ``nterm`` is set, otherwise at the C-terminus. Records that actually grew
    are renamed to "<name>|<N or C>-PADDED-<number of residues added>".

    INPUT and OUTPUT are paths to fasta files or "-" to specify STDIN/STDOUT.

    """
    end_label = "C"
    if nterm:
        end_label = "N"
    for title, residues, _ in readfq(input):
        padded = pad_ggsg(residues, length, end_label)
        added = len(padded) - len(residues)
        # Untouched sequences keep their original header.
        header = f"{title}|{end_label}-PADDED-{added}" if added > 0 else title
        print(f">{header}\n{padded}", file=output)
Пример #14
0
def tile(input, output, length, overlap):
    """Generate short (overlapping) sequence tiles from input sequences.

    Every sequence in the fasta input is cut into tiles of the given length
    and overlap. Each tile is written in fasta format under the name
    "<name>|<start>-<end>".

    INPUT and OUTPUT are paths to fasta files or "-" to specify STDIN/STDOUT.

    Note: this tool drops "incomplete/short" last tiles if the length/overlap
    setting does not allow a tiling to perfectly cover a sequence. We recommend
    using ``ctermpep`` to explicitly capture the last tiles.

    """
    records = tqdm(readfq(input), desc="tile", unit="seq")
    for title, residues, _ in records:
        for begin, stop, fragment in tile_op(residues, length, overlap):
            print(f">{title}|{begin}-{stop}\n{fragment}", file=output)
Пример #15
0
def ctermpep(input, output, length, add_stop):
    """Extract the C-terminal peptide from each input sequence

    An input sequence shorter than the specified length is written out in its
    entirety.

    With the ``--add-stop`` option, an asterisk is appended to the input
    sequence and counts as one of the amino acids in terms of peptide length.
    For example, if requesting 56-aa peptides with a stop codon, the output
    will code for 55 amino acids and the stop.

    Output names are "<name>|CTERM", or "<name>|CTERM|STOP" with
    ``--add-stop``.

    INPUT and OUTPUT are paths to fasta files or "-" to specify STDIN/STDOUT.

    """
    # The name tag depends only on the option, so build it once up front.
    title_tag = "|CTERM|STOP" if add_stop else "|CTERM"
    records = tqdm(readfq(input), desc="ctermpep", unit="seq")
    for title, residues, _ in records:
        peptide = cterm_oligo(residues, length, add_stop=add_stop)
        print(f">{title}{title_tag}\n{peptide}", file=output)
Пример #16
0
def findsite(input, site, clip_left, clip_right):
    """Find the location of a site in DNA sequences

    If a sequence matches the specified site, write out its name and location.
    Used as a diagnostic to confirm that a particular DNA site (e.g.,
    restriction enzyme) is absent from a set of sequences. Because there may be
    adaptor sequences that contain such a site by design, the clipping option
    allows the search to be restricted. Note that a site is searched if it
    overlaps with the valid region even by one base (i.e., a site can match if
    it is mostly outside the clipped region, as long as it overlaps the target
    search region).

    Only the first match in each sequence is reported, as
    "<name>|<site>|<0-based position in the unclipped sequence>".

    INPUT is a path to fasta file or "-" to specify STDIN.

    """
    query = str(site2dna(site))
    # A hit may hang over either edge of the valid region by up to
    # len(query) - 1 bases and still overlap it by at least one base.
    overhang = max(len(query) - 1, 0)
    for (name, seq, qual) in readfq(input):
        # Valid (unclipped) region is seq[region_start:region_end]. Clamp so
        # that over-clipping cannot turn into a negative, from-the-end index.
        region_start = max(clip_left, 0)
        region_end = min(max(len(seq) - clip_right, 0), len(seq))
        if region_start >= region_end:
            # Nothing is left to search once the clipping is applied.
            continue
        search_start = max(region_start - overhang, 0)
        search_end = min(region_end + overhang, len(seq))
        # str.find with bounds returns a position in the full sequence.
        idx = seq.find(query, search_start, search_end)
        if idx >= 0:
            print(f"{name}|{site}|{idx}", file=sys.stdout)
Пример #17
0
def greedykmercov(
    input, output, tile_size, dbg_path, kmer_cov, num_tiles, preselected_tiles_path
):
    """Select protein tiles from de Bruijn graph by maximizing k-mer coverage

    Each tile is a fragment of an observed input ORF. Either the total number of
    output tiles can be specified, or the average target k-mer coverage. If
    there is already a pre-selected set of tiles chosen through some other
    method, specifying them will initialize the de Bruijn graph to reflect the
    preexisting k-mer coverage.

    NOTE: ORFS shorter than tile-size are sampled, but ORFs shorter than
    kmer-size are ignored. (Use pepsyn filterlen to select short tiles.)

    INPUT and OUTPUT are paths to fasta files or "-" to specify STDIN/STDOUT.

    """
    # test input/context
    try:
        import networkx as nx
        from pepsyn.dbg import gen_kmers, setreduce_attr, sum_attr
    except ImportError:
        raise Abort("greedykmercov requires NetworkX")
    try:
        import numpy as np
    except ImportError:
        raise Abort("greedykmercov requires NumPy")
    if kmer_cov and num_tiles:
        raise UsageError("Set -c/--kmer-cov OR -n/--num-tiles but not both")
    if not kmer_cov and not num_tiles:
        raise UsageError("Must set one of -c/--kmer-cov OR -n/--num-tiles")

    # load orfs
    orfs = {name: seq for (name, seq, qual) in readfq(input)}

    # load dbg
    # SECURITY: the graph file is a pickle. Unpickling can execute arbitrary
    # code, so only graphs from trusted sources should be loaded here.
    if hasattr(nx, "read_gpickle"):
        dbg = nx.read_gpickle(dbg_path)
    else:
        # NetworkX 3.0 removed read_gpickle; its documented replacement is a
        # plain pickle.load on the file opened in binary mode.
        import pickle

        with open(dbg_path, "rb") as graph_handle:
            dbg = pickle.load(graph_handle)
    # every node of the graph is a k-mer, so any node gives the k-mer size
    kmer_size = len(next(iter(dbg)))
    if kmer_size > tile_size:
        raise UsageError("kmer-size > tile_size")
    kmers_remaining = len(dbg)
    num_components = nx.number_weakly_connected_components(dbg)
    if num_tiles:
        tiles_remaining = num_tiles

    # load preselected tiles
    # NOTE(review): presumably the option is optional and arrives as None when
    # omitted -- confirm against the CLI declaration.
    if preselected_tiles_path is None:
        preselected_tiles = []
    else:
        preselected_tiles = [
            seq for (name, seq, qual) in readfq(preselected_tiles_path)
        ]
    preselected_kmer_counts = Counter(
        [
            kmer
            for tile in preselected_tiles
            for kmer in gen_kmers(tile, kmer_size, yield_short=True)
        ]
    )

    # process each graph component separately
    component_iter = tqdm(
        nx.weakly_connected_components(dbg),
        unit="comp",
        desc="dbg components",
        total=num_components,
    )
    for component in component_iter:
        component_orfs = setreduce_attr(dbg, component, "orf")

        # generate all candidate tiles
        # maps tile sequence -> list of (orf name, start, end) occurrences
        tile_to_name = {}
        for name in tqdm(component_orfs, desc="generating tiles"):
            # special case short orfs
            if len(orfs[name]) < tile_size:
                tile_to_name.setdefault(orfs[name], []).append(
                    (name, 0, len(orfs[name]))
                )
            for (i, j, tile) in tile_op(orfs[name], tile_size, tile_size - 1):
                tile_to_name.setdefault(tile, []).append((name, i, j))
        candidate_tiles = list(tile_to_name.keys())

        # generate init tile scores
        # score = summed multiplicity of the tile's distinct k-mers, per residue
        tile_scores = []
        tile_lens = []
        kmer_to_idxs = {}
        for idx, tile in enumerate(tqdm(candidate_tiles, desc="init tile scores")):
            score = 0
            for kmer in set(gen_kmers(tile, kmer_size)):
                score += dbg.nodes[kmer]["multiplicity"]
                kmer_to_idxs.setdefault(kmer, set()).add(idx)
            tile_scores.append(score / len(tile))
            tile_lens.append(len(tile))
        tile_scores = np.ma.asarray(tile_scores)
        # hard mask: later writes through .data cannot unmask chosen tiles
        tile_scores.harden_mask()
        tile_lens = np.asarray(tile_lens)

        # update tile scores with previously selected tiles
        for kmer in set(preselected_kmer_counts.keys()) & set(kmer_to_idxs.keys()):
            idxs = list(kmer_to_idxs[kmer])
            # Normalise by each affected tile's own length (as the "update
            # tile scores" step below does), not by whichever tile a previous
            # loop happened to leave behind in ``tile``.
            tile_scores.data[idxs] -= (
                preselected_kmer_counts[kmer] * dbg.nodes[kmer]["multiplicity"]
            ) / tile_lens[idxs]

        # set number of tiles for this component
        if kmer_cov:
            num_component_tiles = ceil(
                len(component) * kmer_cov / (tile_size - kmer_size + 1)
            )
        if num_tiles:
            # share of the remaining tile budget proportional to this
            # component's share of the remaining k-mers
            num_component_tiles = ceil(
                len(component) / kmers_remaining * tiles_remaining
            )
            kmers_remaining -= len(component)
            tiles_remaining -= num_component_tiles

        # choose tiles
        for _ in trange(num_component_tiles, desc="choosing tiles"):
            idx = tile_scores.argmax()
            # mask the winner so it cannot be selected again
            tile_scores[idx] = np.ma.masked
            tile = candidate_tiles[idx]

            # write tile
            name, i, j = tile_to_name[tile][0]
            nterm = (
                "|NTERM" if dbg.nodes[tile[:kmer_size]].get("start_node", False) else ""
            )
            cterm = (
                "|CTERM" if dbg.nodes[tile[-kmer_size:]].get("end_node", False) else ""
            )
            print(f">{name}|{i}-{j}{nterm}{cterm}\n{tile}", file=output)

            # update tile scores
            # every tile sharing a k-mer with the chosen one loses that
            # k-mer's contribution
            for kmer in set(gen_kmers(tile, kmer_size)):
                idxs = list(kmer_to_idxs[kmer])
                tile_scores.data[idxs] -= (
                    dbg.nodes[kmer]["multiplicity"] / tile_lens[idxs]
                )