def test_get_moltype(self):
    """get_moltype resolves a moltype from its name, in lower or upper case"""
    known_labels = ("dna", "rna", "protein", "protein_with_stop")
    for expected in known_labels:
        # the label as written, then its upper case form
        for query in (expected, expected.upper()):
            self.assertEqual(get_moltype(query).label, expected)

    # the DNA object itself is accepted in place of a name
    self.assertEqual(get_moltype(DNA).label, "dna")

    # an unrecognised name is rejected
    with self.assertRaises(ValueError):
        _ = get_moltype("blah")
def test_strand_symmetry(self):
    """strand symmetry counts are as expected for DNA, RNA and Aligned"""
    from cogent3 import get_moltype
    from cogent3.core.alignment import Aligned

    raw = "ACGGCTGAAGCGCTCCGGGTTTAAAACG"
    single_obs = [[7, 5], [7, 9]]

    dna = DnaSequence(raw)
    result = dna.strand_symmetry(motif_length=1)
    assert_allclose(result.observed.array, single_obs)
    assert_allclose(result.expected.array, [[6, 6], [8, 8]])

    # RNA too
    rna = dna.to_rna()
    result = rna.strand_symmetry(motif_length=1)
    assert_allclose(result.observed.array, single_obs)

    # Aligned
    gap_map, ungapped = DnaSequence(raw).parse_out_gaps()
    aligned = Aligned(gap_map, ungapped)
    result = aligned.strand_symmetry(motif_length=1)
    assert_allclose(result.observed.array, single_obs)

    # a sequence made by the "text" moltype must raise TypeError
    with self.assertRaises(TypeError):
        text = get_moltype("text")
        _, text_seq = text.make_seq(raw).parse_out_gaps()
        text_seq.strand_symmetry(motif_length=1)

    # with motif_length=2
    dinucs = "AC GG CT GA AG CG CT CC GG GT TT AA AA CG".replace(" ", "")
    result = DnaSequence(dinucs).strand_symmetry(motif_length=2)
    self.assertLessEqual(len(result.observed.keys()), 8)
    assert_allclose(result.observed["AA"].to_array(), [2, 1])
    assert_allclose(result.observed["CC"].to_array(), [1, 2])
def read(filepath):
    """returns MotifFreqsArray matrix

    Parameters
    ----------
    filepath
        path of the file to read. If ``open`` raises a TypeError for this
        value, the value itself is used as the series of lines.

    Returns
    -------
    MotifFreqsArray, with states in the order given by the inferred moltype

    Notes
    -----
    Every line is split on whitespace and the resulting table transposed.
    The first transposed row is skipped; each remaining row is a state label
    followed by its values. Four states are treated as RNA when "U" is one
    of them, otherwise as DNA. Any other number of states is treated as
    protein.
    """
    try:
        # the context manager closes the handle even if reading fails
        with open(filepath) as infile:
            lines = infile.readlines()
    except TypeError:
        lines = filepath

    columns = list(zip(*[line.split() for line in lines]))
    states = [col[0] for col in columns[1:]]
    values = [[float(v) for v in col[1:]] for col in columns[1:]]
    by_state = dict(zip(states, values))

    if len(states) == 4:
        name = "rna" if "U" in states else "dna"
    else:
        name = "protein"

    # reorder the rows so they follow the state order of the moltype
    states = list(get_moltype(name))
    matrix = array([by_state[s] for s in states], dtype=float)
    return MotifFreqsArray(matrix.T, states)
def __init__(self, allowed_frac=0.99, motif_length=1, moltype=None):
    """
    Parameters
    ----------
    allowed_frac : float
        a column is excluded when its fraction of gap characters
        exceeds this value
    motif_length : int
        size of the non-overlapping tuples the sequences are split into
    moltype : str
        molecular type, restricted to DNA or RNA. A false value is stored
        unchanged, without validation.
    """
    super(omit_gap_pos, self).__init__(
        input_types=self._input_types,
        output_types=self._output_types,
        data_types=self._data_types,
    )
    # NOTE(review): kept ahead of the moltype conversion below, presumably
    # so the arguments are recorded as they were passed -- confirm
    self._formatted_params()

    resolved = moltype
    if moltype:
        resolved = get_moltype(moltype)
        label = resolved.label.lower()
        assert label in ("dna", "rna"), "Invalid moltype"

    self.moltype = resolved
    self._allowed_frac = allowed_frac
    self._motif_length = motif_length
    self.func = self.omit
def __init__(self, moltype="dna", gc=1, allow_rc=False, trim_terminal_stop=True):
    """generates aa sequences

    Parameters
    ----------
    moltype : str
        molecular type of the input, restricted to DNA or RNA
    gc
        a genetic code instance, or an identifier for one
    allow_rc : bool
        accepted, but not read anywhere in this initialiser
    trim_terminal_stop : bool
        whether a terminal stop codon is excluded from seqs

    Returns
    -------
    A sequence collection. Sequences that could not be translated are
    excluded.
    """
    super(translate_seqs, self).__init__(
        input_types=self._input_types,
        output_types=self._output_types,
        data_types=self._data_types,
    )
    self._formatted_params()

    nucleic = get_moltype(moltype)
    label = nucleic.label.lower()
    assert label in ("dna", "rna"), "Invalid moltype"

    self._moltype = nucleic
    self._gc = get_code(gc)
    self._trim_terminal_stop = trim_terminal_stop
    self.func = self.get_translated
def read(filepath):
    """returns matrixid and MotifCountsArray matrix

    A line starting with ">" supplies the identifier, as the whitespace
    split text after that character. Every other non-empty line has
    matches of ``_brackets`` removed and is then read as a state label
    followed by integer counts.
    """
    labels = []
    rows = []
    with open(filepath) as infile:
        for raw in infile:
            record = raw.strip()
            if record.startswith(">"):
                identifier = record[1:].split()
                continue
            if not record:
                continue
            fields = _brackets.sub("", record).split()
            labels.append(fields[0].upper())
            rows.append([int(count) for count in fields[1:]])

    counts_by_state = dict(zip(labels, rows))
    if len(labels) != 4:
        name = "protein"
    elif "U" in labels:
        name = "rna"
    else:
        name = "dna"

    # arrange the rows in the state order of the moltype
    states = list(get_moltype(name))
    ordered = [counts_by_state[state] for state in states]
    pwm = MotifCountsArray(array(ordered, dtype=int).T, states)
    return identifier, pwm
def __init__(self, length, motif_length=1, subtract_degen=True, moltype=None):
    """
    Parameters
    ----------
    length : int
        only alignments with this length returned, False otherwise
    motif_length : int
        when greater than 1, length is floor divided by this value
    subtract_degen : bool
        degenerate characters subtracted from sequence length calculation
    moltype
        molecular type, can be string or instance
    """
    super(min_length, self).__init__(
        input_types=self._input_types,
        output_types=self._output_types,
        data_types=self._data_types,
    )
    self._formatted_params()

    # the stored minimum is counted in units of motif_length
    self._min_length = length // motif_length if motif_length > 1 else length
    self._motif_length = motif_length
    self.func = self.if_long_enough
    self._subtract_degen = subtract_degen
    self._moltype = get_moltype(moltype) if moltype else moltype
def test_roundtrip_alphabet(self):
    """an alphabet survives a to_json / deserialise_object roundtrip"""
    dna = moltype.get_moltype("dna")
    restored = deserialise_object(dna.alphabet.to_json())
    # same class, and the same members in the same order
    self.assertEqual(type(restored), type(dna.alphabet))
    self.assertEqual(list(restored), list(dna.alphabet))
def make_unaligned_seqs(
    data, moltype=None, label_to_name=None, info=None, source=None, **kw
):
    """Initialize an unaligned collection of sequences.

    Parameters
    ----------
    data
        sequences
    moltype
        the moltype, eg DNA, PROTEIN, 'dna', 'protein'
    label_to_name
        function for converting original name into another name.
    info
        a dict from which to make an info object. Its "source" entry is
        set by this function.
    source
        origins of this data, defaults to 'unknown'
    **kw
        other keyword arguments passed to SequenceCollection. The values
        of "constructor_kw" and "kw", if present, are merged into these.
    """
    if moltype is not None:
        moltype = get_moltype(moltype)

    info = info if info else {}

    # lift any nested keyword dicts up into kw itself
    for nested_key in ("constructor_kw", "kw"):
        nested = kw.pop(nested_key, None)
        if nested:
            kw.update(nested)

    assert isinstance(info, dict), "info must be a dict"
    info["source"] = source if source else "unknown"

    return SequenceCollection(
        data=data, moltype=moltype, label_to_name=label_to_name, info=info, **kw
    )
def __init__(self, quantile=None, gap_fraction=1, moltype="dna"):
    """Returns an alignment without the sequences responsible for
    exceeding disallowed_frac.

    Parameters
    ----------
    quantile : float or None
        The number of gaps uniquely introduced by a sequence are counted.
        The value corresponding to quantile is determined and all sequences
        whose unique gap count is larger than this cutoff are excluded.
        If None, this condition is not applied.
    gap_fraction
        sequences whose proportion of gaps is >= this value are excluded, the
        default excludes sequences that are just gaps.
    moltype
        molecular type, can be string or instance. Its label must be one of
        dna, rna, protein or protein_with_stop.
    """
    super(omit_bad_seqs, self).__init__(
        input_types=self._input_types,
        output_types=self._output_types,
        data_types=self._data_types,
    )
    self._formatted_params()
    if moltype:
        moltype = get_moltype(moltype)
        # membership in a tuple of labels; a substring test against one
        # long string would also accept fragments such as "na"
        assert moltype.label.lower() in (
            "dna",
            "rna",
            "protein",
            "protein_with_stop",
        ), "moltype must be one of DNA, RNA or PROTEIN"
    self._quantile = quantile
    self._gap_fraction = gap_fraction
    self._moltype = moltype
    self.func = self.drop_bad_seqs
def translate_frames(seq, moltype=None, gc=1, allow_rc=False):
    """translates a nucleic acid sequence

    Parameters
    ----------
    seq
        the sequence to translate. When moltype is provided, seq is first
        converted with that moltype's make_seq.
    moltype
        molecular type, must be either DNA or RNA
    gc
        identifier for a genetic code or a genetic code instance
    allow_rc : bool
        includes frames sequence reverse complement

    Returns
    -------
    [(frame, translation), ..]
    Reverse complement frame numbers are negative
    """
    code = get_code(gc)
    if moltype:
        seq = get_moltype(moltype).make_seq(seq)

    frames = code.sixframes(seq)
    # without allow_rc only the first three entries are returned
    return frames if allow_rc else frames[:3]
def __init__(self, moltype=None, gap_is_degen=True, motif_length=1):
    """excludes degenerate characters from alignment

    Parameters
    ----------
    moltype : str
        molecular type, restricted to DNA or RNA. A false value is stored
        unchanged, without validation.
    gap_is_degen : bool
        include gap character in degenerate character set
    motif_length : int
        sequences split into non-overlapping tuples of this size. If a
        tuple contains a degen character at any position the entire tuple
        is excluded
    """
    super(omit_degenerates, self).__init__(
        input_types=self._input_types,
        output_types=self._output_types,
        data_types=self._data_types,
    )
    self._formatted_params()

    resolved = moltype
    if moltype:
        resolved = get_moltype(moltype)
        label = resolved.label.lower()
        assert label in ("dna", "rna"), "Invalid moltype"

    self.moltype = resolved
    # NOTE(review): this stores the omit_degenerates name itself, not a
    # flag -- confirm that is what is intended
    self._no_degen = omit_degenerates
    self._allow_gap = not gap_is_degen
    self._motif_length = motif_length
    self.func = self.filter_degenerates
def __init__(self, moltype=None, format="fasta"):
    """
    Parameters
    ----------
    moltype
        molecular type, given as a name or an instance
    format : str
        sequence file format, lower-cased before being looked up in PARSERS
    """
    super(ComposableSeq, self).__init__(
        input_types=None,
        output_types=("sequences", "serialisable"),
        data_types=(
            "DataStoreMember",
            "str",
            "Path",
            "ArrayAlignment",
            "Alignment",
            "SequenceCollection",
        ),
    )
    _seq_loader.__init__(self)
    self._formatted_params()

    # a false moltype is stored as given
    self.moltype = get_moltype(moltype) if moltype else moltype
    parser_key = format.lower()
    self._parser = PARSERS[parser_key]
def __init__(
    self,
    *positions,
    fourfold_degenerate=False,
    gc="Standard Nuclear",
    moltype="dna",
):
    """selects the indicated codon positions from an alignment

    Parameters
    ----------
    positions
        either an integer (1, 2, 3), or a tuple of position numbers,
        e.g. 3 is third position, (1,2) is first and second codon position
    fourfold_degenerate : bool
        if True, returns third positions from four-fold degenerate codons.
        Overrides positions.
    gc
        identifier for a genetic code or a genetic code instance
    moltype : str
        molecular type, must be either DNA or RNA
    """
    super(take_codon_positions, self).__init__(
        input_types=self._input_types,
        output_types=self._output_types,
        data_types=self._data_types,
    )
    self._formatted_params()

    assert moltype is not None
    nucleic = get_moltype(moltype)
    assert nucleic.label.lower() in ("dna", "rna"), "Invalid moltype"
    self._moltype = nucleic
    self._four_fold_degen = fourfold_degenerate
    self._fourfold_degen_sets = None

    if fourfold_degenerate:
        # positions are not examined in this mode
        code = get_code(gc)
        self._fourfold_degen_sets = get_fourfold_degenerate_sets(
            code, alphabet=nucleic.alphabet, as_indices=True
        )
        self.func = self.take_fourfold_positions
        return

    lowest, highest = min(positions), max(positions)
    assert 1 <= lowest <= 3 and 1 <= highest <= 3, "Invalid codon positions"

    # convert the 1-based positions to 0-based indices
    if len(positions) == 1:
        zero_based = positions[0] - 1
        self.func = self.take_codon_position
    else:
        zero_based = tuple(p - 1 for p in sorted(positions))
        self.func = self.take_codon_positions

    self._positions = zero_based
def test_count_ab(self):
    """counts of an "ab" array seq, without and with the gap character"""
    ab = get_moltype("ab")
    gapped_alphabet = ab.alphabet.with_gap_motif()
    seq = ab.make_array_seq("aaba-", alphabet=gapped_alphabet)

    # by default the gap is not counted
    without_gap = seq.counts().to_dict()
    self.assertEqual(without_gap, {"a": 3, "b": 1})

    with_gap = seq.counts(allow_gap=True).to_dict()
    self.assertEqual(with_gap, {"a": 3, "b": 1, "-": 1})
def __init__(self, length, start=0, random=False, seed=None, motif_length=1, moltype=None):
    """
    Parameters
    ----------
    length : int
        only alignments with this length returned, False otherwise
    start
        integer starting position for truncation, or 'random' in which case
        a random start is chosen (within the possible range returning an
        alignment of the specified length). Overrides `random`.
    random : bool
        random positions for the corresponding tuple are chosen.
    seed : int
        random number seed. Any value other than None is applied,
        including 0.
    motif_length : int
        length of sequence units to consider. If not 1, length and start
        are converted (reduced) if necessary to be modulo motif_length
    moltype
        molecular type, can be string or instance
    """
    super(fixed_length, self).__init__(
        input_types=self._input_types,
        output_types=self._output_types,
        data_types=self._data_types,
    )
    self._formatted_params()

    # reduce length to a multiple of motif_length
    diff = length % motif_length
    if diff != 0:
        length -= diff
    assert length % motif_length == 0
    self._length = length
    self._motif_length = motif_length

    if moltype:
        moltype = get_moltype(moltype)
    self._moltype = moltype

    if isinstance(start, str):
        assert start.lower().startswith("rand")
        # a random start overrides random sampling of positions
        random = False
    else:
        assert isinstance(start, int)
        assert start >= 0
        # reduce start to a multiple of motif_length
        diff = start % motif_length
        if diff != 0:
            start -= diff

    self._start = _GetStart(start)

    # compare against None so that a seed of 0 is honoured
    if seed is not None:
        np_random.seed(seed)

    # truncated is used only when random equals False
    self.func = {False: self.truncated}.get(random, self.sample_positions)
def test_roundtrip_seq(self):
    """a sequence survives a to_json / deserialise_object roundtrip"""
    for label in ("dna", "protein"):
        mt = moltype.get_moltype(label)
        original = mt.make_seq("ACGGTCGG", "label", info={"something": 3})
        restored = deserialise_object(original.to_json())

        # info, name, moltype and the sequence text must all be retained
        self.assertEqual(restored.info.something, 3)
        self.assertEqual(restored.name, "label")
        self.assertEqual(restored.moltype, original.moltype)
        self.assertEqual(str(restored), str(original))
def deserialise_moltype(data):
    """returns a cogent3 MolType instance, or a CodonAlphabet

    Parameters
    ----------
    data : dict
        must have "moltype" and "type" entries, plus "genetic_code" when
        the type resolves to the codon alphabet class. The dict is modified
        in place.
    """
    data.pop("version", None)
    data["moltype"] = get_moltype(data["moltype"])

    klass = _get_class(data.pop("type"))
    if klass == _CodonAlphabet:
        code = get_code(data.pop("genetic_code"))
        # whatever remains in data is passed to the constructor
        alphabet = _CodonAlphabet(**data)
        alphabet._gc = code
        return alphabet

    return data["moltype"]
def deserialise_alphabet(data):
    """returns a cogent3 Alphabet instance

    Data whose type resolves to the codon alphabet class is handed to
    deserialise_moltype. Otherwise the motifs are taken from the "data"
    entry if present, else from "motifset". The dict is modified in place.
    """
    data.pop("version", None)
    if _get_class(data.get("type")) == _CodonAlphabet:
        return deserialise_moltype(data)

    data["moltype"] = get_moltype(data["moltype"])

    motif_key = "motifset"
    if "data" in data:
        motif_key = "data"
    motifs = data.pop(motif_key)

    # remaining entries become keyword arguments of the constructor
    alphabet_class = _get_class(data.pop("type"))
    return alphabet_class(motifs, **data)
def test_jaspar(self):
    """a jaspar formatted counts matrix is loaded correctly"""
    matrix_id, counts = jaspar.read("data/sample.jaspar")
    assert matrix_id == ["PSSMid", "HGNCsymbol"], "ID line wrong"

    # note state indices are ordered by moltype
    _ = list(get_moltype("dna"))
    per_state = array(
        [
            [35, 374, 30, 121, 6, 121, 33],
            [0, 10, 0, 0, 3, 2, 44],
            [352, 3, 354, 268, 360, 222, 155],
            [2, 2, 5, 0, 10, 44, 157],
        ]
    )
    # the loaded array has one row per position, hence the transpose
    assert_array_equal(counts.array, per_state.T)
    self.assertEqual(counts[0, "A"], 352)
    self.assertEqual(counts[3, "T"], 121)
def __init__(
    self,
    ref_seq="longest",
    score_matrix=None,
    insertion_penalty=20,
    extension_penalty=2,
    moltype="dna",
):
    """
    Parameters
    ----------
    ref_seq : str
        either a name to be found in the data, or 'longest'.
        If latter, the longest sequence will be chosen as the reference
    score_matrix
        scoring dict. When not provided, DNA uses
        `make_dna_scoring_dict(10, -1, -8)` and any other moltype uses
        `make_generic_scoring_dict(10, moltype)`
    insertion_penalty
        penalty for gap insertion
    extension_penalty
        penalty for gap extension
    moltype : str
        molecular type
    """
    super(align_to_ref, self).__init__(
        input_types=self._input_types,
        output_types=self._output_types,
        data_types=self._data_types,
    )
    self._formatted_params()

    assert moltype
    self._moltype = get_moltype(moltype)

    if score_matrix:
        scores = score_matrix
    elif self._moltype.label == "dna":
        scores = make_dna_scoring_dict(10, -1, -8)
    else:
        scores = make_generic_scoring_dict(10, self._moltype)

    self._kwargs = {
        "S": scores,
        "d": insertion_penalty,
        "e": extension_penalty,
        "return_score": False,
    }

    if ref_seq.lower() == "longest":
        self.func = self.align_to_longest
    else:
        # the reference name is only recorded for a named reference
        self.func = self.align_to_named_seq
        self._ref_name = ref_seq

    self._gap_state = None  # can be character or int, depends on aligner
def Sequence(moltype=None, seq=None, name=None, filename=None, format=None):
    """Returns a single sequence object.

    Parameters
    ----------
    moltype
        if provided, resolved with get_moltype and used to make the sequence
    seq
        the sequence data. If None, it is read from filename.
    name
        assigned to the result when not None. When reading from a file and
        name is None, the name of the parsed record is used.
    filename, format
        passed to FromFilenameParser when seq is None

    Raises
    ------
    ValueError if the file yields more than one sequence.

    Notes
    -----
    Without a moltype, data lacking a ``moltype`` attribute is converted
    with ASCII.make_seq; data that has one is returned as is.
    """
    if seq is None:
        for record_name, record_seq in FromFilenameParser(filename, format):
            if seq is not None:
                raise ValueError("Multiple sequences in '%s'" % filename)
            seq = record_seq
            if name is None:
                name = record_name

    if moltype is not None:
        seq = get_moltype(moltype).make_seq(seq)
    elif not hasattr(seq, "moltype"):
        seq = ASCII.make_seq(seq)

    if name is not None:
        seq.name = name

    return seq
def __init__(self, moltype="dna", gc=1, allow_rc=False, trim_terminal_stop=True):
    """selects translatable sequences

    Sequences are truncated to modulo 3. seqs.info has a translation_errors
    entry.

    Parameters
    ----------
    moltype : str
        molecular type of the input, restricted to DNA or RNA
    gc
        a genetic code instance, or an identifier for one
    allow_rc : bool
        If False, forward strand considered only. If True, and
        best frame on rc, it will be negative
    trim_terminal_stop : bool
        whether a terminal stop codon is excluded from seqs

    Returns
    -------
    A sequence collection. Sequences that could not be translated
    are excluded.
    """
    super(select_translatable, self).__init__(
        input_types=self._input_types,
        output_types=self._output_types,
        data_types=self._data_types,
    )
    self._formatted_params()

    nucleic = get_moltype(moltype)
    label = nucleic.label.lower()
    assert label in ("dna", "rna"), "Invalid moltype"

    self._moltype = nucleic
    self._gc = get_code(gc)
    self._allow_rc = allow_rc
    self._trim_terminal_stop = trim_terminal_stop
    self.func = self.get_translatable
def __init__(self, moltype=None, format="fasta"):
    """
    Parameters
    ----------
    moltype
        molecular type, given as a name or an instance
    format : str
        sequence file format, lower-cased before being looked up in PARSERS
    """
    super(ComposableSeq, self).__init__(
        input_types=self._input_types,
        output_types=self._output_types,
        data_types=self._data_types,
    )
    _seq_loader.__init__(self)
    self._formatted_params()

    # a false moltype is stored as given
    self.moltype = get_moltype(moltype) if moltype else moltype
    parser_key = format.lower()
    self._parser = PARSERS[parser_key]
def __init__(self, moltype=None, format="fasta"):
    """
    Parameters
    ----------
    moltype
        molecular type, given as a name or an instance
    format : str
        sequence file format, lower-cased before being looked up in PARSERS
    """
    super(ComposableSeq, self).__init__(
        input_types=None,
        output_types=(SEQUENCE_TYPE, SERIALISABLE_TYPE),
        data_types=("DataStoreMember", "str", "Path"),
    )
    _seq_loader.__init__(self)
    self._formatted_params()

    # a false moltype is stored as given
    self.moltype = get_moltype(moltype) if moltype else moltype
    parser_key = format.lower()
    self._parser = PARSERS[parser_key]
def __init__(self, mask_degen=False, choose="longest", seed=None, moltype=None):
    """Returns unique sequences, adds 'dropped' key to seqs.info

    Parameters
    ----------
    mask_degen
        if True, degenerate characters are ignored
    choose
        choose a representative from sets of duplicated sequences.
        Valid values are None (all members of a duplicated set are
        excluded), 'longest', 'random'.
    seed : int
        set random number seed. Only applied if choose=='random'. Any
        value other than None is applied, including 0.
    moltype
        molecular type, can be string or instance
    """
    super(omit_duplicated, self).__init__(
        input_types=self._input_types,
        output_types=self._output_types,
        data_types=self._data_types,
    )
    # exact membership: a substring test against "longestrandom" would let
    # values like "long" through, which then silently behave as None
    assert not choose or choose in (
        "longest",
        "random",
    ), "choose must be None, 'longest' or 'random'"
    self._formatted_params()

    if moltype:
        moltype = get_moltype(moltype)
    self._moltype = moltype

    # compare against None so that a seed of 0 is honoured
    if choose == "random" and seed is not None:
        np_random.seed(seed)

    self._mask_degen = mask_degen
    if choose == "longest":
        self.func = self.choose_longest
    elif choose == "random":
        self.func = self.choose_random
    else:
        self.func = self.take_unique
def make_seq(seq, name=None, moltype=None):
    """
    Parameters
    ----------
    seq : str
        raw string to be converted to sequence object
    name : str
        sequence name, assigned to the result when not None
    moltype
        name of a moltype or moltype instance, "text" is used when this
        is not provided

    Returns
    -------
    returns a sequence object
    """
    resolved = get_moltype(moltype if moltype else "text")
    result = resolved.make_seq(seq)
    if name is not None:
        result.name = name
    return result
def deserialise_seq(data, aligned=False):
    """deserialises sequence and any annotations

    Parameters
    ----------
    data : dict
        a result of json.loads of a to_rich_dict(). It is modified in place.
    aligned
        whether sequence type is for an Alignment, in which case an Aligned
        instance will be returned. Treated as True whenever the "seq" entry
        contains "-".

    Returns
    -------
    the sequence made by the moltype, or an Aligned instance wrapping it
    """
    from cogent3.core.moltype import get_moltype

    data.pop("version", None)
    data["moltype"] = get_moltype(data.pop("moltype"))
    annotations = data.pop("annotations", None)
    builder = data["moltype"].make_seq

    # NOTE(review): the resolved class is not used here, the lookup is
    # retained in case it validates the type -- confirm
    _get_class(data.pop("type"))

    if "-" in data["seq"]:
        aligned = True

    # moltype is not an argument of make_seq, the rest of data is
    del data["moltype"]
    seq = builder(**data)

    if not aligned:
        if annotations:
            deserialise_annotation(annotations, seq)
        return seq

    # annotations are attached to the ungapped sequence, before wrapping
    gap_map, seq = seq.parse_out_gaps()
    if annotations:
        deserialise_annotation(annotations, seq)
    return Aligned(gap_map, seq)
def deserialise_seq_collections(data):
    """returns a cogent3 sequence/collection/alignment instance

    Parameters
    ----------
    data : dict
        must have "moltype", "type" and "seqs" entries; "version" and
        "annotations" are optional. The dict is modified in place.
    """
    # We first try to load moltype/alphabet using get_moltype
    from cogent3.core.moltype import get_moltype

    data.pop("version", None)
    data["moltype"] = get_moltype(data.pop("moltype"))
    annotations = data.pop("annotations", None)

    type_name = data.pop("type")
    collection_class = _get_class(type_name)
    assert "alignment" in type_name.lower(), "not alignment type"

    # every type except SequenceCollection holds aligned sequences
    is_aligned = not type_name.endswith("SequenceCollection")

    members = []
    for record in data.pop("seqs").values():
        # each member is given the moltype of the collection
        record["moltype"] = data["moltype"]
        members.append(deserialise_seq(record, aligned=is_aligned))

    collection = collection_class(members, **data)
    if annotations:
        deserialise_annotation(annotations, collection)
    return collection
def __init__(
    self,
    seq1,
    seq2,
    moltype="text",
    window=20,
    threshold=None,
    min_gap=0,
    rc=False,
    xtitle=None,
    ytitle=None,
    title=None,
    width=500,
    show_progress=False,
):
    """
    Parameters
    ----------
    seq1, seq2 : string or sequence object
    moltype : str or MolType instance
        if seq1, seq2 are strings, moltype is used to convert to sequence
        objects. Ignored when seq1 has a moltype attribute, which is then
        used instead.
    window : int
        k-mer size for comparison between sequences
    threshold : int
        windows where the sequences are identical >= threshold are a match.
        If None, a value is derived from window and the sequence lengths.
    min_gap : int
        permitted gap for joining adjacent line segments, default is no gap
        joining
    rc : bool or None
        include dotplot of reverse complement also. Only applies to Nucleic
        acids moltypes
    xtitle, ytitle
        name of the seq1, seq2. None if included as part of a
        AnnotatedDrawable
    title : str
        title for the plot
    width : int
        plot width, the height is width scaled by len(seq2) / len(seq1)
    show_progress : bool
        displays progress bar
    """
    from cogent3.core.alignment import Aligned

    moltype = seq1.moltype if hasattr(seq1, "moltype") else get_moltype(moltype)

    # evaluated on the inputs as given, before they are converted below
    is_aligned = isinstance(seq1, Aligned) and isinstance(seq2, Aligned)

    # we ensure sequences have gaps parsed and then calculate aspect ratio
    map1, seq1 = _convert_input(seq1, moltype)
    map2, seq2 = _convert_input(seq2, moltype)
    len1 = len(seq1)
    len2 = len(seq2)

    super(Dotplot, self).__init__(
        visible_axes=True,
        showlegend=True,
        width=width,
        height=width * len2 / len1,
    )

    self.seq1 = seq1
    self.seq2 = seq2
    self._aligned_coords = get_align_coords(map1, map2, aligned=is_aligned)

    self.xtitle = xtitle
    self.ytitle = ytitle
    self.title = title
    self._window = window
    self._min_gap = min_gap

    if threshold is None:
        # derive a threshold from the window and the sequence lengths
        num_windows = (len1 - window) * (len2 - window)
        noise = min(len1, len2) / window
        threshold = suitable_threshold(window, noise / num_windows)
    self._threshold = threshold

    self._fwd, self._rev = get_dotplot_coords(
        self.seq1,
        self.seq2,
        window=window,
        threshold=threshold,
        min_gap=min_gap,
        rc=rc,
        show_progress=show_progress,
    )