예제 #1
0
파일: dbdiff.py 프로젝트: ressy/IgDiscover
def compare(a, b):
    """Return the cost of comparing a to b.

    Both arguments must have a ``sequence`` attribute. The sequences are
    compared over the length of the shorter one, once aligned at their
    starts and once aligned at their ends. The smaller of the two Hamming
    distances is weighted by 5 and the length difference is added.
    """
    seq_a = a.sequence
    seq_b = b.sequence
    l = min(len(seq_a), len(seq_b))
    length_diff = max(len(seq_a), len(seq_b)) - l
    dist_prefixes = hamming_distance(seq_a[:l], seq_b[:l])
    # Use explicit start offsets for the suffixes: seq[-l:] would be the
    # *entire* sequence when l == 0 (because -0 == 0), which hands
    # hamming_distance two strings of unequal length if one sequence is empty.
    dist_suffixes = hamming_distance(
        seq_a[len(seq_a) - l:], seq_b[len(seq_b) - l:]
    )

    return 5 * min(dist_prefixes, dist_suffixes) + length_diff
예제 #2
0
 def naive_has_similar(self, s, distance):
     """
     Return True if any string in self.strings has the same length as s and
     a Hamming distance to s of at most the given distance, else False.
     """
     return any(
         len(candidate) == len(s) and hamming_distance(candidate, s) <= distance
         for candidate in self.strings
     )
예제 #3
0
def is_similar_with_junction(s, t, mismatches, cdr3_core):
    """
    Decide whether s and t are within the allowed number of mismatches *and*
    share at least one identical junction.

    Strings of different lengths are never similar. A mismatches value
    strictly between 0 and 1 is treated as a fraction of the string length
    (minus cdr3_core.start when cdr3_core is given); any other value is an
    absolute limit on the Hamming distance. When cdr3_core is None, only the
    distance criterion is applied. Otherwise the part before cdr3_core.start
    or the part from cdr3_core.stop onwards must also be identical.
    """
    # TODO see issue #81
    if len(s) != len(t):
        return False

    has_core = cdr3_core is not None
    if 0 < mismatches < 1:
        offset = cdr3_core.start if has_core else 0
        allowed = (len(s) - offset) * mismatches
    else:
        allowed = mismatches
    within_limit = hamming_distance(s, t) <= allowed

    if not has_core or not within_limit:
        return within_limit
    if s[:cdr3_core.start] == t[:cdr3_core.start]:
        return True
    return s[cdr3_core.stop:] == t[cdr3_core.stop:]
예제 #4
0
 def is_similar(s, t):
     """
     Return whether s and t are within max_hamming mismatches of each other.

     If either string contains '-', the leading and trailing '-' characters
     of s are removed and t is cut down to the matching span first; the
     result is False when the remaining part of s is shorter than
     min_overlap.
     """
     # m = max_hamming
     if '-' not in s and '-' not in t:
         return hamming_distance(s, t) <= max_hamming

     # Remove suffix and/or prefix where sequences do not overlap
     s_without_lead = s.lstrip('-')
     t_tail = t[-len(s_without_lead):]
     s_core = s_without_lead.rstrip('-')
     if len(s_core) < min_overlap:
         return False
     t_core = t_tail[:len(s_core)]
     # TODO allowed Hamming distance should be reduced relative to the overlap length
     # m = max_hamming * len(s) / len(original_length_of_s)
     return hamming_distance(s_core, t_core) <= max_hamming
예제 #5
0
파일: dbdiff.py 프로젝트: ressy/IgDiscover
def print_similar(a, b, colored: bool):
    """Print a compact description of how the sequence of a differs from b.

    Both arguments must have ``name`` and ``sequence`` attributes. The
    sequences are compared over the length of the shorter one, aligned
    either at their starts or at their ends, whichever gives the smaller
    Hamming distance (ties go to the start alignment). The output consists
    of a header line, one line in which each mismatching position is shown
    as ``{x → y}`` and the overhanging part is rendered by format_indel,
    and a blank line.

    If colored is true, the two characters of a mismatch are passed through
    red() and green(), respectively.
    """
    seq_a = a.sequence
    seq_b = b.sequence
    l = min(len(seq_a), len(seq_b))
    # Explicit offsets instead of negative slices: with l == 0, seq[-l:] is
    # the whole sequence and seq[:-l] is empty, the opposite of what is meant.
    a_tail_start = len(seq_a) - l
    b_tail_start = len(seq_b) - l
    dist_prefixes = hamming_distance(seq_a[:l], seq_b[:l])
    dist_suffixes = hamming_distance(seq_a[a_tail_start:], seq_b[b_tail_start:])
    if dist_prefixes <= dist_suffixes:
        a_prefix = b_prefix = ''
        a_common, b_common = seq_a[:l], seq_b[:l]
        a_suffix, b_suffix = seq_a[l:], seq_b[l:]
    else:
        a_prefix, b_prefix = seq_a[:a_tail_start], seq_b[:b_tail_start]
        a_common, b_common = seq_a[a_tail_start:], seq_b[b_tail_start:]
        a_suffix = b_suffix = ''

    # Collect all output pieces in one list. The per-position edit text must
    # not reuse the name holding the already formatted prefix, otherwise the
    # prefix is lost and the last edit is emitted twice.
    pieces = [format_indel(a_prefix, b_prefix, colored)]
    for ac, bc in zip(a_common, b_common):
        if ac == bc:
            pieces.append(ac)
        elif colored:
            pieces.append('{' + red(ac) + ' → ' + green(bc) + '}')
        else:
            pieces.append('{' + ac + ' → ' + bc + '}')
    pieces.append(format_indel(a_suffix, b_suffix, colored))

    print('~', a.name, '--', b.name)
    print(''.join(pieces))
    print()
예제 #6
0
파일: parse.py 프로젝트: ressy/IgDiscover
    def _aa_mutations(self):
        """
        Return the Hamming distance between the amino-acid reference and the
        amino-acid sequence, or None if it cannot be computed reliably.
        """
        # Earlier versions of this code used edit distance to compute the number of mutations,
        # but some FR1 alignments are reported with a frameshift by IgBLAST. By requiring that
        # reference and query lengths are identical, we can filter out these cases (and use
        # Hamming distance to get some speedup)
        reference = self.aa_reference
        if not reference:
            return None
        query = self.aa_sequence
        if not query:
            return None
        if len(self.nt_sequence) != len(self.nt_reference):
            return None

        n_mismatches = hamming_distance(reference, query)

        # If the mutation rate is still obviously too high, assume something went
        # wrong and ignore the computed value
        too_divergent = n_mismatches / len(reference) >= 0.8
        return None if too_divergent else n_mismatches
예제 #7
0
def set_aa_mut_columns(record, database):
    """
    Fill in the amino acid mutation rate (in percent) for each V region and
    for V as a whole.

    For every region, record["<region>_aa_mut"] is set to the percentage of
    mismatching amino acids relative to the germline region, or to None if
    the region coordinates or the germline region are missing, the lengths
    differ, or the rate is 80% or higher. record["V_aa_mut"] is the rate
    over all regions combined (that is, excluding the CDR3) and is only set
    to a number if every region yielded a value; otherwise it is None.
    """
    region_columns = (
        ("fwr1", "FR1"),
        ("cdr1", "CDR1"),
        ("fwr2", "FR2"),
        ("cdr2", "CDR2"),
        ("fwr3", "FR3"),
    )
    summed_dist = 0
    summed_length = 0
    regions_used = 0
    for airr_col, region in region_columns:
        out_key = region + "_aa_mut"
        record[out_key] = None
        start = record[airr_col + "_start"]
        end = record[airr_col + "_end"]
        if start is None or end is None:
            continue
        query_aa = nt_to_aa(record["sequence"][start - 1 : end])
        reference_aa = database.v_regions_aa[record["v_call"]].get(region)
        # Some FR1 alignments are reported with a frameshift by IgBLAST. By requiring that
        # reference and query lengths are identical, we can filter out these cases (and use
        # Hamming distance to get some speedup)
        if reference_aa is None or len(reference_aa) != len(query_aa):
            continue
        n_diff = hamming_distance(reference_aa, query_aa)
        fraction = n_diff / len(reference_aa)
        if fraction < 0.8:
            summed_dist += n_diff
            summed_length += len(reference_aa)
            regions_used += 1
            record[out_key] = 100.0 * fraction
        # otherwise: assume something went wrong and leave the column as None

    if regions_used == len(region_columns):
        record["V_aa_mut"] = 100.0 * summed_dist / summed_length
    else:
        record["V_aa_mut"] = None
예제 #8
0
 def naive_find_all_similar(self, s, distance):
     """
     Yield every string in self.strings that has the same length as s and
     a Hamming distance to s of at most the given distance.
     """
     wanted_length = len(s)
     for candidate in self.strings:
         if len(candidate) == wanted_length and hamming_distance(candidate, s) <= distance:
             yield candidate
예제 #9
0
 def naive_has_similar(t):
     """
     Return True if at least one entry of strings has a Hamming distance
     to t of at most dist, else False.
     """
     return any(hamming_distance(s, t) <= dist for s in strings)
예제 #10
0
 def linked(s, t):
     """Return whether the Hamming distance between s and t is at most mismatches."""
     n_differences = hamming_distance(s, t)
     return n_differences <= mismatches
예제 #11
0
def test_hamming_distance_incorrect_length():
    """hamming_distance must raise IndexError for strings of unequal length."""
    shorter, longer = 'A', 'BC'
    with pytest.raises(IndexError):
        hamming_distance(shorter, longer)
예제 #12
0
def test_hamming_distance():
    """Check hamming_distance on empty, identical and differing equal-length strings."""
    expected_distances = [
        ('', '', 0),
        ('A', 'A', 0),
        ('HELLO', 'HELLO', 0),
        ('ABC', 'DEF', 3),
        ('ABCXDEF', 'ABCYDEF', 1),
    ]
    for first, second, expected in expected_distances:
        assert hamming_distance(first, second) == expected