Exemplo n.º 1
0
    def __init__(self,
                 moltype,
                 invalid=-9,
                 alignment=None,
                 invalid_raises=False):
        """Initialise the calculator state for a single moltype.

        Parameters
        ----------
        moltype
            passed to get_moltype(); the label of the resulting moltype
            must be a member of self.valid_moltypes
        invalid
            forwarded as the ``invalid`` argument of
            get_moltype_index_array()
        alignment
            when not None, handed to self._convert_seqs_to_indices()
        invalid_raises
            stored unchanged as self._invalid_raises

        Raises
        ------
        ValueError
            if the moltype label is not in self.valid_moltypes
        """
        super(_PairwiseDistance, self).__init__()
        moltype = get_moltype(moltype)
        supported = moltype.label in self.valid_moltypes
        if not supported:
            name = self.__class__.__name__
            raise ValueError(
                f"Invalid moltype for {name}: '{moltype.label}' not "
                f"in {self.valid_moltypes}")

        # moltype-derived attributes
        self.moltype = moltype
        self.char_to_indices = get_moltype_index_array(moltype,
                                                       invalid=invalid)
        self._dim = len(list(moltype))

        # everything below starts out empty
        self._dists = self._dupes = self._duped = None
        self._invalid_raises = invalid_raises
        self.names = self.indexed_seqs = None

        if alignment is not None:
            self._convert_seqs_to_indices(alignment)

        self._func_args = []
Exemplo n.º 2
0
    def test_strand_symmetry(self):
        """correctly compute test of strand symmetry"""
        from cogent3 import get_moltype
        from cogent3.core.alignment import Aligned

        seq = DnaSequence("ACGGCTGAAGCGCTCCGGGTTTAAAACG")
        ssym = seq.strand_symmetry(motif_length=1)
        assert_allclose(ssym.observed.array, [[7, 5], [7, 9]])
        assert_allclose(ssym.expected.array, [[6, 6], [8, 8]])

        # RNA too
        seq = seq.to_rna()
        ssym = seq.strand_symmetry(motif_length=1)
        assert_allclose(ssym.observed.array, [[7, 5], [7, 9]])

        # Aligned
        seq = DnaSequence("ACGGCTGAAGCGCTCCGGGTTTAAAACG")
        m, s = seq.parse_out_gaps()
        seq = Aligned(m, s)
        ssym = seq.strand_symmetry(motif_length=1)
        assert_allclose(ssym.observed.array, [[7, 5], [7, 9]])

        # a sequence made from the "text" moltype is expected to be rejected.
        # The fixture is built outside the context manager so that only the
        # strand_symmetry() call itself can satisfy the expected TypeError;
        # a TypeError raised while constructing the sequence now fails the
        # test instead of silently passing it.
        text = get_moltype("text")
        _, s = text.make_seq(
            "ACGGCTGAAGCGCTCCGGGTTTAAAACG").parse_out_gaps()
        with self.assertRaises(TypeError):
            s.strand_symmetry(motif_length=1)

        # with motif_length=2
        seq = DnaSequence("AC GG CT GA AG CG CT CC GG GT TT AA AA CG".replace(
            " ", ""))
        ssym = seq.strand_symmetry(motif_length=2)
        self.assertLessEqual(len(ssym.observed.keys()), 8)
        assert_allclose(ssym.observed["AA"].to_array(), [2, 1])
        assert_allclose(ssym.observed["CC"].to_array(), [1, 2])
Exemplo n.º 3
0
 def test_align_to_ref_generic_moltype(self):
     """align_to_ref records the moltype and a generic scoring dict for it"""
     labels = ("text", "rna", "protein", "protein_with_stop", "bytes", "ab")
     for label in labels:
         app = align_app.align_to_ref(moltype=label)
         # the app must have resolved the requested moltype ...
         self.assertEqual(app._moltype.label, label)
         # ... and its "S" keyword must equal the generic scoring dict
         # built with a match score of 10 for that moltype
         observed = app._kwargs["S"]
         expected = make_generic_scoring_dict(10, get_moltype(label))
         self.assertEqual(observed, expected)
Exemplo n.º 4
0
    def __init__(self,
                 distance=None,
                 moltype=None,
                 fast_calc=None,
                 slow_calc=None):
        """Configure the fast and/or slow distance calculators.

        Parameters
        ----------
        distance
            name used for both fast_calc and slow_calc; cannot be combined
            with either of them
        moltype
            None, or a value passed to get_moltype()
        fast_calc
            name passed to get_distance_calculator()
        slow_calc
            name passed to get_model()

        Raises
        ------
        ValueError
            if distance is combined with fast_calc / slow_calc, if
            "hamming", "paralinear" or "logdet" is requested without a
            moltype, if neither name resolves to a calculator, or if a
            resolved calculator's moltype differs from the given moltype
        """
        super(fast_slow_dist, self).__init__(
            input_types=ALIGNED_TYPE,
            output_types=(PAIRWISE_DISTANCE_TYPE, SERIALISABLE_TYPE),
            data_types=("ArrayAlignment", "Alignment"),
        )
        self._formatted_params()
        self._moltype = moltype if moltype is None else get_moltype(moltype)
        self._sm = None

        if (fast_calc or slow_calc) and distance:
            raise ValueError("cannot combine distance and fast/slow")

        if distance:
            fast_calc = distance
            slow_calc = distance

        # fast_calc / slow_calc are rebound to calculator objects (or None)
        # below, so remember the requested names for error reporting
        fast_name, slow_name = fast_calc, slow_calc

        d = {"hamming", "paralinear", "logdet"} & {slow_name, fast_name}
        if d and not self._moltype:
            raise ValueError(f"you must provide a moltype for {d}")

        # a name that cannot be resolved is treated as "not available"
        # rather than as an error; the check that follows requires at least
        # one of the two lookups to succeed
        try:
            fast_calc = get_distance_calculator(fast_name,
                                                moltype=self._moltype)
        except (ValueError, AttributeError):
            fast_calc = None

        try:
            slow_calc = get_model(slow_name)
        except ValueError:
            slow_calc = None

        if not (fast_calc or slow_calc):
            # report what was asked for, not the failed lookup results
            raise ValueError(f"invalid values for {slow_name} or {fast_name}")

        self.fast_calc = fast_calc
        if fast_calc and self._moltype and fast_calc.moltype != self._moltype:
            raise ValueError(
                f"{self._moltype} incompatible moltype with fast calculator {fast_calc.moltype}"
            )
        elif fast_calc:
            self._moltype = fast_calc.moltype

        if slow_calc and self._moltype and slow_calc.moltype != self._moltype:
            raise ValueError("incompatible moltype with slow calculator")
        elif slow_calc:
            self._moltype = slow_calc.moltype
        self._sm = slow_calc
Exemplo n.º 5
0
def make_generic_scoring_dict(match, mtype):
    """returns scoring dict for alignment

    Every ordered pair of states obtained by iterating the moltype becomes
    a key ``(a, b)``; identical states score ``match``, all others -1.

    Parameters
    ----------
    match : int
        value for a match, mismatches are always scored -1
    mtype
        MolType instance or string that can be used to get_moltype

    Returns
    -------
    dict
        maps ``(a, b)`` tuples to their score
    """
    from cogent3 import get_moltype

    mtype = get_moltype(mtype)
    return {
        (a, b): match if a == b else -1
        for a in mtype
        for b in mtype
    }
Exemplo n.º 6
0
    def __init__(self,
                 distance=None,
                 moltype=None,
                 fast_calc=None,
                 slow_calc=None):
        """
        Parameters
        ----------
        moltype : str
            cogent3 moltype
        distance : str
            Name of a distance method available as both fast and slow calculator.
        fast_calc
            Name of a fast distance calculator. See cogent3.available_distances().
        slow_calc
            Name of a slow distance calculator. See cogent3.available_models().

        Raises
        ------
        ValueError
            If distance is combined with fast_calc / slow_calc, if a moltype
            is required but missing, if neither name resolves to a
            calculator, or if a resolved calculator's moltype differs from
            the given moltype.

        Notes
        -----
        A moltype is required when "hamming", "percent", "paralinear" or
        "logdet" is requested. distance cannot be combined with fast_calc
        or slow_calc.
        """
        super(fast_slow_dist, self).__init__(
            input_types=self._input_types,
            output_types=self._output_types,
            data_types=self._data_types,
        )
        self._formatted_params()
        self._moltype = moltype if moltype is None else get_moltype(moltype)
        self._sm = None

        if (fast_calc or slow_calc) and distance:
            raise ValueError("cannot combine distance and fast/slow")

        if distance:
            fast_calc = distance
            slow_calc = distance

        # fast_calc / slow_calc are rebound to calculator objects (or None)
        # below, so remember the requested names for error reporting
        fast_name, slow_name = fast_calc, slow_calc

        d = {"hamming", "percent", "paralinear", "logdet"} & {
            slow_name, fast_name}
        if d and not self._moltype:
            raise ValueError(f"you must provide a moltype for {d}")

        # a name that cannot be resolved is treated as "not available"
        # rather than as an error; the check that follows requires at least
        # one of the two lookups to succeed
        try:
            fast_calc = get_distance_calculator(fast_name,
                                                moltype=self._moltype)
        except (ValueError, AttributeError):
            fast_calc = None

        try:
            slow_calc = get_model(slow_name)
        except ValueError:
            slow_calc = None

        if not (fast_calc or slow_calc):
            # report what was asked for, not the failed lookup results
            raise ValueError(f"invalid values for {slow_name} or {fast_name}")

        self.fast_calc = fast_calc
        if fast_calc and self._moltype and fast_calc.moltype != self._moltype:
            raise ValueError(
                f"{self._moltype} incompatible moltype with fast calculator {fast_calc.moltype}"
            )
        elif fast_calc:
            self._moltype = fast_calc.moltype

        if slow_calc and self._moltype and slow_calc.moltype != self._moltype:
            raise ValueError("incompatible moltype with slow calculator")
        elif slow_calc:
            self._moltype = slow_calc.moltype
        self._sm = slow_calc
        self.func = self.calc_distance