def nw_metric(self, s1, s2):
    """Distance between two sequences built from global-alignment scores.

    Every alignment is scored with ``parasail.nw_stats`` using the BLOSUM62
    matrix and gap open/extend penalties of 3. The returned value is
    ``score(s1, s1) + score(s2, s2) - 2 * score(s1, s2)``.

    May or may not produce a true metric. Details in:
    E. Halperin, J. Buhler, R. Karp, R. Krauthgamer, and B. Westover.
    Detecting protein sequence conservation via metric embeddings.
    Bioinformatics, 19(Suppl. 1):i122-i129, 2003
    """
    nw_kwargs = dict(open=3, extend=3, matrix=parasail.blosum62)
    self_score_1 = parasail.nw_stats(s1, s1, **nw_kwargs).score
    self_score_2 = parasail.nw_stats(s2, s2, **nw_kwargs).score
    cross_score = parasail.nw_stats(s1, s2, **nw_kwargs).score
    return self_score_1 + self_score_2 - 2 * cross_score
def nw_metric(self, s1, s2):
    """Return ``xx + yy - 2 * xy`` from Needleman-Wunsch alignment scores.

    ``xx`` and ``yy`` are the scores of each sequence aligned against itself
    and ``xy`` is the score of ``s1`` aligned against ``s2``. All three come
    from ``parasail.nw_stats`` with ``open=3``, ``extend=3`` and the BLOSUM62
    matrix.

    May or may not produce a true metric. Details in:
    E. Halperin, J. Buhler, R. Karp, R. Krauthgamer, and B. Westover.
    Detecting protein sequence conservation via metric embeddings.
    Bioinformatics, 19(Suppl. 1):i122-i129, 2003
    """

    def _score(a, b):
        # Fixed scoring scheme shared by all three alignments.
        return parasail.nw_stats(
            a, b, open=3, extend=3, matrix=parasail.blosum62
        ).score

    first_self = _score(s1, s1)
    second_self = _score(s2, s2)
    between = _score(s1, s2)
    distance = first_self + second_self - 2 * between
    return distance
def nw_hamming_metric(s1, s2, matrix='blosum62', open=3, extend=3):
    """Hamming-style distance between two sequences after global alignment.

    The sequences are aligned with Parasail's Needleman-Wunsch routines and
    the distance is the number of alignment positions that are not matches.
    Gaps count as a mismatch. Penalties and matrix are used for alignment
    purposes, not in the distance calculation.

    Parameters
    ----------
    s1 : string
        String containing amino acid letters.
    s2 : string
        String containing amino acid letters.
    matrix : str
        Attribute of parasail that names a substitution matrix.
    open : int
        Gap-open penalty forwarded to parasail.
    extend : int
        Gap-extend penalty forwarded to parasail.

    Returns
    -------
    D : number
        Length of the traceback comparison string minus the number of
        matches reported by ``nw_stats``.

    Notes
    -----
    Two alignments are run: ``nw_stats`` supplies the match count and
    ``nw_trace`` supplies the aligned length.
    """
    substitution = getattr(parasail, matrix)
    align_kwargs = {'open': open, 'extend': extend, 'matrix': substitution}
    stats = parasail.nw_stats(s1, s2, **align_kwargs)
    trace = parasail.nw_trace(s1, s2, **align_kwargs)
    aligned_length = len(trace.traceback.comp)
    return aligned_length - stats.matches
def _try_cache(self, e): try: xx = self.sim_cache[(e, e)] except KeyError: xx = parasail.nw_stats(e, e, **self.paraParams).score self.sim_cache[(e, e)] = xx return xx
def _try_cache(self, e): try: xx = self.sim_cache[(e, e)] except KeyError: xx = parasail.nw_stats(e, e, **self.paraParams).score self.sim_cache[(e, e)] = xx return xx
def nw_metric(s1, s2, matrix='blosum62', open=3, extend=3, return_similarity=False):
    """Distance between two sequences from Needleman-Wunsch alignment scores.

    Parameters
    ----------
    s1 : string
        String containing amino acid letters.
    s2 : string
        String containing amino acid letters.
    matrix : str
        Attribute of parasail that names a substitution matrix.
    open : int
        Gap-open penalty forwarded to parasail.
    extend : int
        Gap-extend penalty forwarded to parasail.
    return_similarity : bool
        When true, return the raw ``s1``-vs-``s2`` alignment score instead
        of the distance; the two self-alignments are then skipped.

    Returns
    -------
    D : number
        ``xx + yy - 2 * xy`` where ``xx`` and ``yy`` are the self-alignment
        scores and ``xy`` is the cross-alignment score, or just ``xy`` when
        ``return_similarity`` is true.

    Notes
    -----
    May or may not produce a true metric. Details in:
    E. Halperin, J. Buhler, R. Karp, R. Krauthgamer, and B. Westover.
    Detecting protein sequence conservation via metric embeddings.
    Bioinformatics, 19 (sup 1) 2003
    """
    substitution = getattr(parasail, matrix)

    def _score(a, b):
        # All alignments share the caller's penalties and matrix.
        return parasail.nw_stats(
            a, b, open=open, extend=extend, matrix=substitution
        ).score

    cross = _score(s1, s2)
    if return_similarity:
        return cross

    self_1 = _score(s1, s1)
    self_2 = _score(s2, s2)
    return self_1 + self_2 - 2 * cross
def calculate(self, query_base_sequence, target_base_sequence, gap_open=16, gap_extend=4):
    """Return the fraction of similar positions in a global alignment.

    The two sequences are aligned with ``parasail.nw_stats`` using the
    ``parasail.dnafull`` matrix and the given gap penalties. The result is
    the alignment's ``similar`` count divided by its ``length``, as a float.
    """
    alignment = parasail.nw_stats(
        query_base_sequence,
        target_base_sequence,
        gap_open,
        gap_extend,
        parasail.dnafull,
    )
    # Force float division so the ratio is not truncated.
    return float(alignment.similar) / alignment.length
def metric(self, i1, i2):
    """Distance callable: takes two rows and returns one distance value.

    sklearn specifies that the function will receive two rows as parameters
    and return one value as distance. The first element of each row is
    mapped to a sequence through ``self.i2e``; the distance is
    ``xx + yy - 2 * xy`` computed from alignment scores.
    """
    seq_a = self.i2e[i1[0]]
    self_a = self._try_cache(seq_a)
    seq_b = self.i2e[i2[0]]
    self_b = self._try_cache(seq_b)
    # Don't need to cache the xy similarity because it doesn't have other uses.
    cross = parasail.nw_stats(seq_a, seq_b, **self.paraParams).score
    return self_a + self_b - 2 * cross
def metric(self, i1, i2):
    """Compute the alignment-score distance between two rows.

    sklearn specifies that the function will receive two rows as parameters
    and return one value as distance. ``i1[0]`` and ``i2[0]`` are looked up
    in ``self.i2e`` to obtain the sequences to compare.
    """
    first = self.i2e[i1[0]]
    first_self_score = self._try_cache(first)
    second = self.i2e[i2[0]]
    second_self_score = self._try_cache(second)
    # The cross score is not cached because it has no other uses.
    cross_result = parasail.nw_stats(first, second, **self.paraParams)
    distance = first_self_score + second_self_score - 2 * cross_result.score
    return distance
def hm_metric(s1, s2, matrix='blosum62', open=3, extend=3):
    """Hamming distance between two sequences after Needleman-Wunsch alignment.

    The sequences are aligned with Parasail and the distance is the number
    of mismatched positions in the alignment.

    Parameters
    ----------
    s1 : string
        String containing amino acid letters.
    s2 : string
        String containing amino acid letters.
    matrix : str
        Attribute of parasail that names a substitution matrix.
    open : int
        Gap-open penalty forwarded to parasail.
    extend : int
        Gap-extend penalty forwarded to parasail.

    Returns
    -------
    D : number
        Length of the traceback comparison string from ``nw_trace`` minus
        the match count from ``nw_stats``.
    """
    scoring = getattr(parasail, matrix)
    stats_result = parasail.nw_stats(
        s1, s2, open=open, extend=extend, matrix=scoring
    )
    trace_result = parasail.nw_trace(
        s1, s2, open=open, extend=extend, matrix=scoring
    )
    comparison = trace_result.traceback.comp
    return len(comparison) - stats_result.matches
def hm_matches(s1, s2, matrix=parasail.blosum62, open=3, extend=3):
    """Return the number of matching positions in a global alignment.

    Parameters
    ----------
    s1 : string
        First sequence.
    s2 : string
        Second sequence.
    matrix : parasail matrix object
        Substitution matrix passed straight to ``parasail.nw_stats``.
    open : int
        Gap-open penalty.
    extend : int
        Gap-extend penalty.

    Returns
    -------
    The ``matches`` attribute of the ``parasail.nw_stats`` result.
    """
    # Only the stats result is needed; the previous nw_trace call computed
    # a second alignment whose result was never used.
    stats = parasail.nw_stats(s1, s2, open=open, extend=extend, matrix=matrix)
    return stats.matches
def aligned_mm_metric(s1, s2, open=3, extend=3, matrix=parasail.blosum62):
    """Return the number of non-matching positions in a global alignment.

    The sequences are aligned with ``parasail.nw_stats`` and the result is
    the alignment's ``length`` minus its ``matches``.

    Parameters
    ----------
    s1 : string
        First sequence.
    s2 : string
        Second sequence.
    open : int
        Gap-open penalty.
    extend : int
        Gap-extend penalty.
    matrix : parasail matrix object
        Substitution matrix passed straight to ``parasail.nw_stats``.
    """
    stats = parasail.nw_stats(s1, s2, matrix=matrix, extend=extend, open=open)
    mismatched_positions = stats.length - stats.matches
    return mismatched_positions