def dbgtilesummary(dbg_path, tiles, orfs, output):
    """Compute summary statistics on a de Bruijn graph (DBG)

    The statistics describe an ORF set and a tile set relative to the DBG,
    which is why all three files are required. The result is written to
    OUTPUT as YAML.
    """
    try:
        import networkx as nx
        from pepsyn.dbg import dbg_stats
    except ImportError:
        print("dbgtilesummary requires NetworkX", file=sys.stderr)
        raise Abort()

    # NOTE(review): read_gpickle unpickles the file, so only load graphs from
    # trusted sources; it was also removed in NetworkX 3.0.
    graph = nx.read_gpickle(dbg_path)

    # Only the sequences matter here; names and qualities are discarded.
    with open(orfs, "r") as orf_handle:
        orf_seqs = [seq for _name, seq, _qual in readfq(orf_handle)]
    with open(tiles, "r") as tile_handle:
        tile_seqs = [seq for _name, seq, _qual in readfq(tile_handle)]

    summary = dbg_stats(graph, orf_seqs, tile_seqs)
    print(yaml.dump(summary), file=output)
def tilesummary(tiles, orfs, output):
    """Compute summary statistics on a set of peptide tiles

    The statistics are computed relative to a set of ORFs, which may be
    either raw or cleaned. The result is written to OUTPUT as YAML.
    """
    # Both files are loaded as name -> sequence mappings.
    with open(orfs, "r") as orf_handle:
        orf_by_name = dict((name, seq) for name, seq, _qual in readfq(orf_handle))
    with open(tiles, "r") as tile_handle:
        tile_by_name = dict((name, seq) for name, seq, _qual in readfq(tile_handle))

    summary = tile_stats(orf_by_name, tile_by_name)
    print(yaml.dump(summary), file=output)
def disambiguateaa(input, output):
    """Replace IUPAC ambiguous amino acids with unambiguous ones

    Specifically, make the following replacements:

        B => DN
        X => ACDEFGHIKLMNPQRSTVWY
        Z => EQ
        J => LI
        U => C (selenocysteine)
        O => K (pyrrolysine)

    If there are multiple possible replacements, this operation will output a
    sequence for each possible option. Use caution with sequences that are
    highly ambiguous (e.g., with many Xs), as in this case a single sequence
    could lead to an explosion in the output.

    INPUT and OUTPUT are paths to fasta files or "-" to specify STDIN/STDOUT.
    """
    for name, ambig, _qual in readfq(input):
        num_variants = num_disambiguated_iupac_aa(ambig)
        # Zero-pad the variant counter to the width of the largest index.
        width = floor(log10(num_variants)) + 1
        title_template = f"{name}|disambig_{{:0{width}d}}"
        for index, unambig in enumerate(disambiguate_iupac_aa(ambig), start=1):
            # Sequences with a single resolution keep their original title.
            title = title_template.format(index) if num_variants > 1 else name
            print(f">{title}\n{unambig}", file=output)
def fasta_handle_to_dbg(fasta_handle, k, tqdm=None, ignore_short=False):
    """Parse ``fasta_handle`` and forward its records to ``name_seq_pairs_to_dbg``.

    Each record read by ``readfq`` is reduced to a ``(name, seq)`` pair (the
    quality field is dropped); ``k``, ``tqdm`` and ``ignore_short`` are passed
    through unchanged. Returns whatever ``name_seq_pairs_to_dbg`` returns.
    """
    # Lazy generator: records are parsed only as the graph builder consumes them.
    name_seq_pairs = ((name, seq) for name, seq, _qual in readfq(fasta_handle))
    return name_seq_pairs_to_dbg(name_seq_pairs, k, tqdm, ignore_short)
def suffix(input, output, suffix):
    """Add a suffix to each sequence

    Every input sequence is written out under its original name with
    ``suffix`` appended to its end.

    INPUT and OUTPUT are paths to fasta files or "-" to specify STDIN/STDOUT.
    """
    for name, seq, _qual in readfq(input):
        print(f">{name}\n{seq + suffix}", file=output)
def clip(input, output, left, right):
    """Clip/truncate bases from the ends of each sequence

    ``left`` bases are removed from the start and ``right`` bases from the end
    of every sequence. A sequence that is shorter than the requested clipping
    is written out as an empty record.

    INPUT and OUTPUT are paths to fasta files or "-" to specify STDIN/STDOUT.
    """
    for name, seq, _qual in readfq(input):
        # Clamp at zero: a negative stop would be interpreted by Python as an
        # offset from the end of the string, silently returning bases that
        # should have been clipped away when ``right`` exceeds the length.
        stop = max(len(seq) - right, 0)
        print(f">{name}\n{seq[left:stop]}", file=output)
def filterstop(input, output):
    """Filter out input sequences that contain stop codons (*).

    Sequences without any "*" character are written out unchanged.

    INPUT and OUTPUT are paths to fasta files or "-" to specify STDIN/STDOUT.
    """
    for name, seq, _qual in readfq(input):
        if "*" not in seq:
            print(f">{name}\n{seq}", file=output)
def orfsummary(input, output):
    """Compute summary statistics on a set of ORFs

    Works on raw or cleaned ORFs. The result is written to OUTPUT as YAML.

    INPUT and OUTPUT are paths to fasta files or "-" to specify STDIN/STDOUT.
    """
    # Load every record as a name -> sequence mapping.
    orf_by_name = dict((name, seq) for name, seq, _qual in readfq(input))
    print(yaml.dump(orf_stats(orf_by_name)), file=output)
def stripstop(input, output):
    """Strip stop "codons" from end of input protein sequences.

    Stop codons are assumed to be represented as "*"; every trailing "*" is
    removed.

    INPUT and OUTPUT are paths to fasta files or "-" to specify STDIN/STDOUT.
    """
    for name, seq, _qual in readfq(input):
        stripped = seq.rstrip("*")
        print(f">{name}\n{stripped}", file=output)
def x2ggsg(input, output):
    """Replace stretches of Xs with Serine-Glycine linker (in a GGSG pattern)

    Sequences that were actually modified get "|withGSlinker" appended to
    their name; unmodified sequences keep their original name.

    INPUT and OUTPUT are paths to fasta files or "-" to specify STDIN/STDOUT.
    """
    for name, seq, _qual in readfq(input):
        linked = x_to_ggsg(seq)
        title = name if linked == seq else f"{name}|withGSlinker"
        print(f">{title}\n{linked}", file=output)
def filterlen(input, output, min_len, max_len):
    """Filter sequences of a given length.

    Only sequences whose length lies between ``min_len`` and ``max_len``
    (both inclusive) are written out. A ``max_len`` of None means no upper
    bound.

    INPUT and OUTPUT are paths to fasta files or "-" to specify STDIN/STDOUT.
    """
    upper_bound = inf if max_len is None else max_len
    for name, seq, _qual in readfq(input):
        if min_len <= len(seq) <= upper_bound:
            print(f">{name}\n{seq}", file=output)
def uniq(input, output):
    """Filter out duplicate sequences.

    Only takes account of the sequences themselves; when several records
    share a sequence, a single name is kept for it. This requires loading the
    entire file into RAM.

    INPUT and OUTPUT are paths to fasta files or "-" to specify STDIN/STDOUT.
    """
    name_by_seq = {}
    for name, seq, _qual in readfq(input):
        # setdefault leaves an already-recorded sequence untouched.
        name_by_seq.setdefault(seq, name)

    for seq, name in name_by_seq.items():
        print(f">{name}\n{seq}", file=output)
def pad(input, output, length, nterm):
    """Pad protein sequence to a specified length by adding amino acids in the
    pattern of "GSGG".

    Padding is added at the N-terminus when ``nterm`` is set and at the
    C-terminus otherwise. Padded sequences are renamed to record the terminus
    and the number of residues added.

    INPUT and OUTPUT are paths to fasta files or "-" to specify STDIN/STDOUT.
    """
    if nterm:
        terminus = "N"
    else:
        terminus = "C"

    for name, seq, _qual in readfq(input):
        padded = pad_ggsg(seq, length, terminus)
        added = len(padded) - len(seq)
        # Only sequences that actually grew get the annotated title.
        title = f"{name}|{terminus}-PADDED-{added}" if added > 0 else name
        print(f">{title}\n{padded}", file=output)
def tile(input, output, length, overlap):
    """Generate short (overlapping) sequence tiles from input sequences.

    Each sequence in the fasta input is converted into short tiles with given
    length and overlap and written out in fasta format. Each tile is named
    after its source sequence and its start/end coordinates.

    INPUT and OUTPUT are paths to fasta files or "-" to specify STDIN/STDOUT.

    Note: this tool drops "incomplete/short" last tiles if the length/overlap
    setting does not allow a tiling to perfectly cover a sequence. We
    recommend using ``ctermpep`` to explicitly capture the last tiles.
    """
    records = tqdm(readfq(input), desc="tile", unit="seq")
    for name, seq, _qual in records:
        for start, end, fragment in tile_op(seq, length, overlap):
            print(f">{name}|{start}-{end}\n{fragment}", file=output)
def ctermpep(input, output, length, add_stop):
    """Extract the C-terminal peptide from each input sequence

    If an input sequence is shorter than the specified length, it will write
    out the entirety of the sequence.

    With the ``--add-stop`` option, an asterisk is appended to the input
    sequence and counts as one of the amino acids in terms of peptide length.
    For example, if requesting 56-aa peptides with a stop codon, the output
    will code for 55 amino acids and the stop.

    INPUT and OUTPUT are paths to fasta files or "-" to specify STDIN/STDOUT.
    """
    # The title suffix depends only on the options, so build it once.
    title_suffix = "|CTERM|STOP" if add_stop else "|CTERM"
    records = tqdm(readfq(input), desc="ctermpep", unit="seq")
    for name, seq, _qual in records:
        peptide = cterm_oligo(seq, length, add_stop=add_stop)
        print(f">{name}{title_suffix}\n{peptide}", file=output)
def findsite(input, site, clip_left, clip_right):
    """Find locations of a site in DNA sequences

    If a sequence matches the specified site, write out its name, the site and
    the 0-based position of the first match. Used as a diagnostic to confirm
    that a particular DNA site (e.g., restriction enzyme) is absent from a set
    of sequences.

    Because there may be adaptor sequences that contain such a site by design,
    the clipping options exclude ``clip_left`` bases from the start and
    ``clip_right`` bases from the end of each sequence. Only sites that lie
    entirely within the remaining region are reported.

    INPUT is a path to fasta file or "-" to specify STDIN.
    """
    query = str(site2dna(site))
    for name, seq, _qual in readfq(input):
        start = clip_left
        # Clamp at zero: a negative end would be read by Python as an offset
        # from the end of the string, so an over-long right clip would search
        # bases that were meant to be excluded.
        end = max(len(seq) - clip_right, 0)
        # NOTE(review): the search is confined to seq[start:end], so a site
        # that straddles a clip boundary is not reported, and only the first
        # occurrence per sequence is printed -- confirm this is intended.
        idx = seq[start:end].find(query)
        if idx >= 0:
            # Translate the offset back into full-sequence coordinates.
            print(f"{name}|{site}|{idx + start}", file=sys.stdout)
def greedykmercov(
    input, output, tile_size, dbg_path, kmer_cov, num_tiles, preselected_tiles_path
):
    """Select protein tiles from de Bruijn graph by maximizing k-mer coverage

    Each tile is a fragment of an observed input ORF. Either the total number
    of output tiles can be specified, or the average target k-mer coverage. If
    there is already a pre-selected set of tiles chosen through some other
    method, specifying them will initialize the de Bruijn graph to reflect the
    preexisting k-mer coverage.

    NOTE: ORFS shorter than tile-size are sampled, but ORFs shorter than
    kmer-size are ignored. (Use pepsyn filterlen to select short tiles.)

    INPUT and OUTPUT are paths to fasta files or "-" to specify STDIN/STDOUT.
    """
    # test input/context
    try:
        import networkx as nx
        from pepsyn.dbg import gen_kmers, setreduce_attr
    except ImportError:
        # Print the reason explicitly, mirroring dbgtilesummary.
        print("greedykmercov requires NetworkX", file=sys.stderr)
        raise Abort()
    try:
        import numpy as np
    except ImportError:
        print("greedykmercov requires NumPy", file=sys.stderr)
        raise Abort()

    if kmer_cov and num_tiles:
        raise UsageError("Set -c/--kmer-cov OR -n/--num-tiles but not both")
    if not kmer_cov and not num_tiles:
        raise UsageError("Must set one of -c/--kmer-cov OR -n/--num-tiles")

    # load orfs
    orfs = {name: seq for (name, seq, qual) in readfq(input)}

    # load dbg
    # NOTE(review): read_gpickle unpickles the file, so only load graphs from
    # trusted sources; it was also removed in NetworkX 3.0.
    dbg = nx.read_gpickle(dbg_path)
    # Nodes are the k-mers themselves, so any node reveals the k-mer size.
    kmer_size = len(next(iter(dbg)))
    if kmer_size > tile_size:
        raise UsageError("kmer-size > tile_size")
    kmers_remaining = len(dbg)
    num_components = nx.number_weakly_connected_components(dbg)
    if num_tiles:
        tiles_remaining = num_tiles

    # load preselected tiles (none when no file was supplied)
    if preselected_tiles_path is None:
        preselected_tiles = []
    else:
        preselected_tiles = [
            seq for (name, seq, qual) in readfq(preselected_tiles_path)
        ]
    preselected_kmer_counts = Counter(
        kmer
        for tile in preselected_tiles
        for kmer in gen_kmers(tile, kmer_size, yield_short=True)
    )

    # process each graph component separately
    component_iter = tqdm(
        nx.weakly_connected_components(dbg),
        unit="comp",
        desc="dbg components",
        total=num_components,
    )
    for component in component_iter:
        component_orfs = setreduce_attr(dbg, component, "orf")

        # generate all candidate tiles, remembering where each one came from
        tile_to_name = {}
        for name in tqdm(component_orfs, desc="generating tiles"):
            # special case short orfs: the whole ORF is the only candidate
            if len(orfs[name]) < tile_size:
                tile_to_name.setdefault(orfs[name], []).append(
                    (name, 0, len(orfs[name]))
                )
            for (i, j, tile) in tile_op(orfs[name], tile_size, tile_size - 1):
                tile_to_name.setdefault(tile, []).append((name, i, j))
        candidate_tiles = list(tile_to_name.keys())

        # generate init tile scores: summed k-mer multiplicity per residue
        tile_scores = []
        tile_lens = []
        kmer_to_idxs = {}
        for idx, tile in enumerate(tqdm(candidate_tiles, desc="init tile scores")):
            score = 0
            for kmer in set(gen_kmers(tile, kmer_size)):
                score += dbg.nodes[kmer]["multiplicity"]
                kmer_to_idxs.setdefault(kmer, set()).add(idx)
            tile_scores.append(score / len(tile))
            tile_lens.append(len(tile))
        # Masked array: chosen tiles get masked so argmax skips them, while
        # the underlying data can still be updated through ``.data``.
        tile_scores = np.ma.asarray(tile_scores)
        tile_scores.harden_mask()
        tile_lens = np.asarray(tile_lens)

        # update tile scores with previously selected tiles
        for kmer in set(preselected_kmer_counts.keys()) & set(kmer_to_idxs.keys()):
            idxs = list(kmer_to_idxs[kmer])
            # Normalize by each affected tile's own length (as the update in
            # the selection loop does), not by whichever tile the previous
            # loop happened to visit last.
            tile_scores.data[idxs] -= (
                preselected_kmer_counts[kmer] * dbg.nodes[kmer]["multiplicity"]
            ) / tile_lens[idxs]

        # set number of tiles for this component
        if kmer_cov:
            num_component_tiles = ceil(
                len(component) * kmer_cov / (tile_size - kmer_size + 1)
            )
        if num_tiles:
            # Share the remaining tile budget in proportion to component size.
            num_component_tiles = ceil(
                len(component) / kmers_remaining * tiles_remaining
            )
            kmers_remaining -= len(component)
            tiles_remaining -= num_component_tiles

        # choose tiles
        for _ in trange(num_component_tiles, desc="choosing tiles"):
            # Stop once every candidate has been emitted; argmax on a fully
            # masked array would otherwise keep returning index 0 (or raise
            # when there are no candidates at all).
            if tile_scores.count() == 0:
                break
            idx = tile_scores.argmax()
            tile_scores[idx] = np.ma.masked
            tile = candidate_tiles[idx]

            # write tile, named after the first ORF location that produced it
            name, i, j = tile_to_name[tile][0]
            nterm = (
                "|NTERM" if dbg.nodes[tile[:kmer_size]].get("start_node", False) else ""
            )
            cterm = (
                "|CTERM" if dbg.nodes[tile[-kmer_size:]].get("end_node", False) else ""
            )
            print(f">{name}|{i}-{j}{nterm}{cterm}\n{tile}", file=output)

            # update tile scores: every tile sharing a k-mer with the chosen
            # tile loses that k-mer's contribution
            for kmer in set(gen_kmers(tile, kmer_size)):
                idxs = list(kmer_to_idxs[kmer])
                tile_scores.data[idxs] -= (
                    dbg.nodes[kmer]["multiplicity"] / tile_lens[idxs]
                )