Example #1
 def get_best(self, error):
     """Method to calculate channel model probability for errors."""
     candidates = self.gen_candidates(error.lower())
     p = [0] * len(candidates)
     for i, candidate_ in enumerate(candidates):
         candidate = candidate_[-1]
         p_ew_candidate = []
         for res in editops(candidate, error):
             editop, w_idx, e_idx = res
             if editop == 'replace':
                 e = error[e_idx]
                 w = candidate[w_idx]
             elif editop == 'insert':
                 e = error[e_idx - 1:e_idx + 1]
                 w = candidate[w_idx - 1]
             elif editop == 'delete':
                 if e_idx != 0:
                     e = error[e_idx - 1]
                     w = candidate[w_idx - 1:w_idx + 1]
                 else:
                     e = error[e_idx]
                     w = candidate[w_idx:w_idx + 2]
             else:
                 print(editops(candidate, error))
                 return error
             p_ew_candidate.append(self.pm.P_ew(editop, e, w))
         p[i] = self.pm.P_w(candidate) * reduce(
             lambda x, y: x * y, p_ew_candidate) / len(p_ew_candidate)
     try:
         best_idx = p.index(max(p))
         return (candidates[best_idx][-1])
     except ValueError:
         return error
Example #2
 def get_editops(self):
     if not self._editops:
         if self._opcodes:
             self._editops = editops(self._opcodes, self._str1, self._str2)
         else:
             self._editops = editops(self._str1, self._str2)
     return self._editops
Example #3
 def get_result(self):
     min_pair = min(itertools.combinations(self.input_content, 2),
                    key=lambda pair: len(editops(*pair)))
     pos_to_remove = [pos[1] for pos in editops(*min_pair)]
     final_str = ''.join([
         char for i, char in enumerate(min_pair[0])
         if i not in pos_to_remove
     ])
     return final_str
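A quick sketch of the idea used in get_result above (hypothetical strings, for illustration only): when two strings differ only by deletions, the source positions reported by editops are exactly the indices to drop from the first string.

from Levenshtein import editops

a, b = "abcde", "abde"
ops = editops(a, b)                      # [('delete', 2, 2)]
drop = {spos for _, spos, _ in ops}      # source indices touched by an edit
print("".join(ch for i, ch in enumerate(a) if i not in drop))  # abde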
Example #4
def find_best_pair(left_transcriptions, right_transcriptions):
    prep_left, prep_right = prepare_transcriptions_for_levenshtein(left_transcriptions[0], right_transcriptions[0])
    best_ops = prepare_ops(editops(prep_left, prep_right))
    best_phone_error_rate = (best_ops['replace'] + best_ops['delete'] + best_ops['insert']) / float(len(prep_left))
    best_pair = (left_transcriptions[0], right_transcriptions[0])
    for cur_left in left_transcriptions:
        for cur_right in right_transcriptions:
            prep_left, prep_right = prepare_transcriptions_for_levenshtein(cur_left, cur_right)
            cur_ops = prepare_ops(editops(prep_left, prep_right))
            cur_phone_error_rate = (cur_ops['replace'] + cur_ops['delete'] + cur_ops['insert']) / float(len(prep_left))
            if cur_phone_error_rate < best_phone_error_rate:
                best_phone_error_rate = cur_phone_error_rate
                best_ops = copy.copy(cur_ops)
                best_pair = (cur_left, cur_right)
    return best_pair, best_ops
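prepare_transcriptions_for_levenshtein and prepare_ops are not shown in this example; prepare_ops presumably tallies the edit operations by type so the phone error rate can be computed as (S + D + I) / N. A hypothetical stand-in, only to make the snippet self-contained:

from collections import Counter

def prepare_ops(ops):
    # hypothetical helper: count Levenshtein editops by operation name
    counts = Counter(op for op, _, _ in ops)
    return {name: counts.get(name, 0) for name in ('replace', 'delete', 'insert')}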
Example #5
	def phonetic_distance(self,other,normalized=False):
		phonemes1=self.phonemes()
		phonemes2=other.phonemes()

		p2chr={}
		for p in phonemes1+phonemes2:
			pstr=p.phon_str
			if not pstr in p2chr: p2chr[pstr]=chr(len(p2chr))

		chr1=''.join([p2chr[p.phon_str] for p in phonemes1])
		chr2=''.join([p2chr[p.phon_str] for p in phonemes2])


		from Levenshtein import editops
		dist=0.0
		for edit_type,index1,index2 in editops(chr1,chr2):
			if edit_type!='replace':
				dist+=1
				continue
			try:
				p1=phonemes1[index1]
				p2=phonemes2[index2]
				#print edit_type,p1,p2,index1,index2 #,p1.distance(p2)
				dist+=p1.distance(p2)
			except IndexError:
				dist+=1

		if normalized: return dist / float(max(len(phonemes1),len(phonemes2)))
		return dist
Example #6
 def __call__(self, word, candidate_correction):
     prob = 0.
     word = u'$' + word
     candidate_correction = u'$' + candidate_correction
     ops = editops(word, candidate_correction)
     for op, spos, dpos in ops:
         nominator, denominator = None, None
         if op == "insert":
             c2 = candidate_correction[dpos]
             c1 = word[spos - 1]
             nominator = self.insertion_cm[self.char_to_index[c1], self.char_to_index[c2]]
             denominator = float(self.counts_dict[c1])
         if op == "delete":
             c2 = word[spos]
             c1 = word[spos - 1]
             nominator = self.deletion_cm[self.char_to_index[c1], self.char_to_index[c2]]
             denominator = float(self.counts_dict[c1 + c2])
         if op == "replace":
             c2 = candidate_correction[dpos]
             c1 = word[spos]
             nominator = self.substitution_cm[self.char_to_index[c1], self.char_to_index[c2]]
             denominator = float(self.counts_dict[c1])
         if nominator is not None and denominator is not None:
             prob += np.log(nominator / denominator)
     return prob
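The leading u'$' sentinel guarantees that spos - 1 is always a valid index when the left context is looked up. A small illustration of what the reported positions mean (hypothetical strings, not part of the original class):

from Levenshtein import editops

print(editops(u'$wod', u'$word'))   # [('insert', 3, 3)]  -> c1 = 'o', inserted c2 = 'r'
print(editops(u'$word', u'$ward'))  # [('replace', 2, 2)] -> c1 = 'o' replaced by c2 = 'a'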
Example #7
def p_of_w_given_c(w, c):
    prob = 0.
    w = u'$' + w
    candidate_correction = u'$' + c
    ops = editops(w, candidate_correction)
    for op, spos, dpos in ops:
        nominator, denominator = None, None
        if op == "insert":
            c2 = candidate_correction[dpos]
            c1 = w[spos - 1]
            nominator = insertion_cm[char_to_index[c1], char_to_index[c2]]
            denominator = float(counts_dict[c1])
        if op == "delete":
            c2 = w[spos]
            c1 = w[spos - 1]
            nominator = deletion_cm[char_to_index[c1], char_to_index[c2]]
            denominator = float(counts_dict[c1 + c2])
        if op == "replace":
            c2 = candidate_correction[dpos]
            c1 = w[spos]
            nominator = substitution_cm[char_to_index[c1], char_to_index[c2]]
            denominator = float(counts_dict[c1])
        if nominator is not None and denominator is not None:
            prob += np.log(nominator / denominator)
    return prob
Example #8
def test_dependency(a, b):
    """
    Test the assumption that the number of editops is equivalent to the
    Levenshtein edit distance.
    """
    from Levenshtein import distance, editops  # type: ignore
    assert distance(a, b) == len(editops(a, b))
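For instance, the classic pair below has edit distance 3 and editops returns exactly three operations (a quick illustration, not part of the original test):

from Levenshtein import distance, editops

a, b = "kitten", "sitting"
assert distance(a, b) == 3
assert len(editops(a, b)) == 3  # one editop per unit of edit distance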
Example #9
def diffAlign(s1, s2):
    matching = matching_blocks(
        editops(s1, s2), s1, s2)
    s1 = [c for c in s1]
    s2 = [c for c in s2]
    s1n = []
    s2n = []
    popped1, added1 = 0, 0
    popped2, added2 = 0, 0
    for mb in matching:
        index1 = mb[0]
        index2 = mb[1]
        # Controllo allineamento indici
        while(popped1 < index1):
            s1n.append(s1.pop(0))
            popped1 += 1
        while(popped2 < index2):
            s2n.append(s2.pop(0))
            popped2 += 1
        while(len(s1n) < len(s2n)):
            s1n.append("$")
            added1 += 1
        while(len(s2n) < len(s1n)):
            s2n.append("$")
            added2 += 1

        seqLen = mb[2]
        for i in range(seqLen):
            s1n.append(s1.pop(0))
            s2n.append(s2.pop(0))
            popped1, popped2 = popped1 + 1, popped2 + 1

    s1n = "".join(s1n)
    s2n = "".join(s2n)
    return s1n, s2n
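diffAlign consumes the block format produced by matching_blocks: each tuple is (start in s1, start in s2, block length), followed by a zero-length sentinel block. A small check of that format (with classic python-Levenshtein; the exact return type may differ in newer releases):

from Levenshtein import editops, matching_blocks

s1, s2 = "spam", "park"
print(matching_blocks(editops(s1, s2), s1, s2))  # [(1, 0, 2), (4, 4, 0)]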
Example #10
def wordsim2(word1, word2):
    '''return a similarity score between the two words
    attempt to use a behavioral model of edit distances
    '''
    dist = damerau_levenshtein_distance(str(word1), str(word2))
    if dist > 3:
        return 0
    edit = editops(str(word1), str(word2))
    return match_model(edit)
Example #11
 def __call__(self, s1, s2):
     ops = editops(s1, s2)
     replacements = [(spos, dpos) for op_name, spos, dpos in ops if op_name == "replace"]
     count = 0
     for spos, dpos in replacements:
         if s1[spos] in diacritical_chars and diacritical_chars[s1[spos]] == s2[dpos]:
             count += 1
     base_dist = len(ops) - (1 - diacritical_error_punishment) * count
     base_dist -= self.find_all_occurrences_of_substring(u"uw", s1) * (1 - ow_punishment)
     return base_dist
Example #12
 def detect_mistakes(self, s1, s2):
     edits = editops(s1, s2)
     for edit_type, pos1, pos2 in edits:
         try:
             mistake = (edit_type, s1[pos1], s2[pos2])
             if mistake not in self.popular_mistakes:
                 mistake = (edit_type, s2[pos2], s1[pos1])
             self.popular_mistakes[mistake] += 1
         except IndexError:
             continue
Example #13
def edit_likelihood(x,y, alphabet_size=2, alpha=0.99):

    ops = editops(x,y)
    lp = log(alpha)*(len(y)-len(ops)) # all the unchanged
    for o, _, _ in ops:
        if   o == 'equal':   assert False # should never get here
        elif o == 'replace': lp += log(1.0-alpha) - log(3.0) - log(alphabet_size)
        elif o == 'insert':  lp += log(1.0-alpha) - log(3.0) - log(alphabet_size)
        elif o == 'delete':  lp += log(1.0-alpha) - log(3.0)
        else: assert False
    return lp
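A quick sanity check of the likelihood above (a sketch; assumes edit_likelihood and log from math are in scope): identical strings contribute only the "unchanged" term.

from math import log, isclose

# no editops between identical strings, so lp = len(y) * log(alpha)
assert isclose(edit_likelihood("abc", "abc", alphabet_size=2, alpha=0.99),
               3 * log(0.99))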
Example #14
def edit_likelihood(x, y, alphabet_size=2, noise=0.01):

    ops = editops(x, y)
    lp = log(1.0 - noise) * len(y)  # all the unchanged
    for o, _, _ in ops:
        if o == 'equal': lp += log(noise) - log(4.0)
        elif o == 'replace': lp += log(noise) - log(4.0) - log(alphabet_size)
        elif o == 'insert': lp += log(noise) - log(4.0) - log(alphabet_size)
        elif o == 'delete': lp += log(noise) - log(4.0)
        else: assert False
    # print lp, x, y
    return lp
Example #15
def edit_likelihood(x,y, alphabet_size=2, noise=0.01):

    ops = editops(x,y)
    lp = log(1.0-noise)*len(y) # all the unchanged
    for o, _, _ in ops:
        if   o == 'equal':   lp += log(noise) - log(4.0)
        elif o == 'replace': lp += log(noise) - log(4.0) - log(alphabet_size)
        elif o == 'insert':  lp += log(noise) - log(4.0) - log(alphabet_size)
        elif o == 'delete':  lp += log(noise) - log(4.0)
        else: assert False
    # print lp, x, y
    return lp
Example #16
def match_list(A, B, on_replace='delete'):
    """Match two lists of different sizes and return corresponding indice

    created by JR king

    Parameters
    ----------
    A: list | array, shape (n,)
        The values of the first list
    B: list | array, shape (m,)
        The values of the second list

    Returns
    -------
    A_idx : array
        The indices of the A list that match those of the B
    B_idx : array
        The indices of the B list that match those of the A
    """
    from Levenshtein import editops  #pip install python-Levenshtein

    A = np.nan_to_num(np.squeeze(A))
    B = np.nan_to_num(np.squeeze(B))
    assert A.ndim == B.ndim == 1

    unique = np.unique(np.r_[A, B])
    label_encoder = dict((k, v) for v, k in enumerate(unique))

    def int_to_unicode(array):
        return ''.join([str(chr(label_encoder[ii])) for ii in array])

    changes = editops(int_to_unicode(A), int_to_unicode(B))
    B_sel = np.arange(len(B)).astype(float)
    A_sel = np.arange(len(A)).astype(float)
    for type, val_a, val_b in changes:
        if type == 'insert':
            B_sel[val_b] = np.nan
        elif type == 'delete':
            A_sel[val_a] = np.nan
        elif on_replace == 'delete':
            # print('delete replace')
            A_sel[val_a] = np.nan
            B_sel[val_b] = np.nan
        elif on_replace == 'keep':
            # print('keep replace')
            pass
        else:
            raise NotImplementedError
    B_sel = B_sel[np.where(~np.isnan(B_sel))]
    A_sel = A_sel[np.where(~np.isnan(A_sel))]
    assert len(B_sel) == len(A_sel)
    return A_sel.astype(int), B_sel.astype(int)
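An illustrative call with toy data (hypothetical values; assumes match_list above is in scope): the second event of A has no counterpart in B, so it is dropped from both index lists.

import numpy as np

A = np.array([10, 20, 30, 40])
B = np.array([10, 30, 40])        # the 20 event is missing from B
A_idx, B_idx = match_list(A, B)
print(A_idx, B_idx)               # [0 2 3] [0 1 2]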
Example #17
def extractPair(correctw, incorrectw):
    matching = matching_blocks(editops(correctw, incorrectw), correctw,
                               incorrectw)
    correctw = [c for c in correctw]
    incorrectw = [c for c in incorrectw]
    for mb in matching:
        for i in range(mb[0], mb[0] + mb[2], 1):
            correctw[i] = None
        for i in range(mb[1], mb[1] + mb[2], 1):
            incorrectw[i] = None
    correctw = "".join([c for c in correctw if c])
    incorrectw = "".join([c for c in incorrectw if c])
    return (correctw, incorrectw)
Example #18
 def __call__(self, s1, s2):
     ops = editops(s1, s2)
     replacements = [(spos, dpos) for op_name, spos, dpos in ops
                     if op_name == "replace"]
     count = 0
     for spos, dpos in replacements:
         if s1[spos] in diacritical_chars and diacritical_chars[
                 s1[spos]] == s2[dpos]:
             count += 1
     base_dist = len(ops) - (1 - diacritical_error_punishment) * count
     base_dist -= self.find_all_occurrences_of_substring(
         u"uw", s1) * (1 - ow_punishment)
     return base_dist
Example #19
def edit_likelihood(x, y, alphabet_size=2, alpha=0.99):

    ops = editops(x, y)
    lp = log(alpha) * (len(y) - len(ops))  # all the unchanged
    for o, _, _ in ops:
        if o == 'equal': assert False  # should never get here
        elif o == 'replace':
            lp += log(1.0 - alpha) - log(3.0) - log(alphabet_size)
        elif o == 'insert':
            lp += log(1.0 - alpha) - log(3.0) - log(alphabet_size)
        elif o == 'delete':
            lp += log(1.0 - alpha) - log(3.0)
        else:
            assert False
    return lp
Example #20
 def probabilities(self, word):
     """
     count the number of occurrences of the word in a document class
     and normalize by all the words
     """
     try:
         N = 0  # all possible candidates - histogram count
         candidates = self.get_most_probable_candidates(word)
         for candidate in candidates:
             for c in self.lex.keys():
                 N += self.lex[c][candidate[0]]
         probabilities = []
         for candidate in candidates:
             Nc = 0
             for c in self.lex.keys():
                 Nc += self.lex[c][candidate[0]]
             P_c = (Nc + self.alpha) / \
                 (N + self.alpha*self.total_words)
             """
             L -> 0 -> p(c) = 1
             L -> len(c) -> p(c) = 0
             p(c) = 1-L/len(c)
             """
             lev = candidate[1]
             edits = editops(candidate[0], word)
             mistake_mod = 0
             for edit_type, pos1, pos2 in edits:
                 try:
                     mistake = (edit_type, candidate[0][pos1], word[pos2])
                     if mistake in self.popular_mistakes.keys():
                         mistake_mod += self.beta * \
                             self.popular_mistakes[mistake]
                     else:
                         mistake = (edit_type, word[pos2],
                                    candidate[0][pos1])
                         if mistake in self.popular_mistakes.keys():
                             mistake_mod += self.beta * \
                                 self.popular_mistakes[mistake]
                 except IndexError:
                     continue
             lev -= mistake_mod
             P_w_c = 1 - (lev / len(word))
             prob = P_w_c * (-1 * np.log(P_c))
             probabilities.append((candidate, prob))
         probabilities.sort(key=lambda x: x[1], reverse=True)
         return probabilities[:np.min([5, len(probabilities) - 1])]
     except KeyError:
         pass
Example #21
    def __init__(self, dump_path=None):
        with codecs.open("data/bledy.txt", "r", "utf-8") as f:
            errors, corrections = map(lambda x: list(x), zip(*[line.split(';') for line in f.read().splitlines()]))
        with open("data/alphabet.pkl", 'r') as f:
            self.alphabet = list(pickle.load(f)) + [u'$']
        n = len(self.alphabet)
        self.char_to_index = {c: i for i, c in enumerate(self.alphabet)}

        self.deletion_cm = np.ones((n, n))
        self.insertion_cm = np.ones((n, n))
        self.substitution_cm = np.ones((n, n))

        self.counts_dict = defaultdict(df)
        model = CountVectorizer(analyzer='char', ngram_range=(1, 2))

        errors = [u'$' + error for error in errors]
        corrections = [u'$' + correction for correction in corrections]

        counts_dense = model.fit_transform(errors + corrections).todense().sum(axis=0)
        for i, feature_name in enumerate(model.get_feature_names()):
            self.counts_dict[feature_name] += counts_dense[0, i]

        for error, correction in zip(errors, corrections):
            ops = editops(error, correction)
            for op, spos, dpos in ops:
                if op == 'insert':
                    c2 = correction[dpos]
                    c1 = error[spos - 1]
                    self.insertion_cm[self.char_to_index[c1], self.char_to_index[c2]] += 1
                if op == 'delete':
                    c2 = error[spos]
                    c1 = error[spos - 1]
                    self.deletion_cm[self.char_to_index[c1], self.char_to_index[c2]] += 1
                if op == 'replace':
                    c2 = correction[dpos]
                    c1 = error[spos]
                    self.substitution_cm[self.char_to_index[c1], self.char_to_index[c2]] += 1
        if dump_path is not None:
            with h5py.File(dump_path, 'w') as f:
                f.create_dataset('i', data=self.insertion_cm)
                f.create_dataset('d', data=self.deletion_cm)
                f.create_dataset('s', data=self.substitution_cm)
            with open('data/counts.pkl', 'w') as f:
                pickle.dump(self.counts_dict, f)
            with open('data/char_to_index.pkl', 'w') as f:
                pickle.dump(self.char_to_index, f)
Example #22
 def select(self, token, pos):
     """
         retrieve the best lemma: look it up if precomputed, otherwise pick the closest valid lemma by edit operations
     """
     #first retrieve precomputed results
     if (token,pos) in self.precomputed:
         return self.precomputed[(token,pos)]
     # take subset of valid lemmas
     pos_pli = self.pli[pos]
     # test on smallest edit length
     candidates_le = []
     cur_len = 10000
     tok_len = len(token)
     for candidate in pos_pli:
         lc = len(candidate)
         if lc - tok_len > cur_len:
             # too many inserts required in best case scenario, skip
             continue
         if tok_len - lc > cur_len:
             # too many deletes required in best case scenario, skip
             continue
         edits = editops(token, candidate)
         le = len(edits)
         if le < cur_len:
             candidates_le = [(candidate, edits)]
             cur_len = le
         elif le == cur_len: candidates_le.append((candidate, edits))
     if len(candidates_le) == 1 :
         self.precomputed[(token,pos)] = candidates_le[0][0]
         return candidates_le[0][0]
     # test on highest index
     cur_len = -1
     candidates_lo = []
     for candidate, edits in candidates_le:
         lo = sum(i[-1] for i in edits) / le
         if lo > cur_len:
             candidates_lo = [(candidate, edits)]
             cur_len = lo
         elif lo == cur_len: candidates_lo.append((candidate, edits))
     if len(candidates_lo) == 1 :
         self.precomputed[(token,pos)] = candidates_lo[0][0]
         return candidates_lo[0][0]
     # test on most frequent
     slct = max(candidates_lo, key=lambda c: pos_pli[c[0]])[0]
     self.precomputed[(token,pos)] = slct
     return slct
Example #23
def determine_fix_event(file_a: bytes, file_b: bytes) -> FixEvent:
    """
    For two source files with Levenshtein distance of one, this returns the
    edit that converts the first file into the second file.
    """
    src = tokens2seq(java.tokenize(file_a))
    dest = tokens2seq(java.tokenize(file_b))
    ops = editops(src, dest)
    # This only works for files with one edit!
    assert len(ops) == 1

    # Decode editop's format.
    (type_name, src_pos, dest_pos), = ops
    edit_type = to_edit_type(type_name)
    new_token = None if edit_type is Deletion else from_pua(dest[dest_pos])
    edit = Edit(edit_type, dest_pos, new_token)

    return FixEvent(edit, edit, 0)
Example #24
    def __init__(self, a: Iterable[Token], b: Iterable[Token],
                 convert: TokenConverter) -> None:
        self.src_toks = tuple(a)
        self.dest_toks = tuple(b)

        # Convert each token to an appropriate stringified representation.
        self.src_text = tuple(convert(tok) for tok in self.src_toks)
        self.dest_text = tuple(convert(tok) for tok in self.dest_toks)

        # Because python-Levenshtein calculates string distances exclusively,
        # synthesize "strings" by mapping each of the strings in the token
        # sequence to a single character.
        mapper = PrivateUseAreaMapper()
        src_str = ''.join(mapper[token] for token in self.src_text)
        dest_str = ''.join(mapper[token] for token in self.dest_text)

        # Determine the Levenshtein edit operations.
        self._edit_ops = editops(src_str, dest_str)
Example #25
def edit_dist(r_string: str, s_string: str) -> int:
    """Computes edit distance.

    ED(r, s) = number of edit operations

    Parameters
    ----------
    r_string : str
        First string.
    s_string : str
        Second string.

    Returns
    -------
    Edit distance of r and s.

    """

    return len(editops(r_string, s_string))
Example #26
    def get_editops(str_from, str_to, allow_replace=True, allow_copy=True):
        """Gets edit operations from **str_from** to **str_to** according to
        Levenshtein distance. Supported edit operations are: `'delete'`,
        `'insert'`, `'replace'`, `'copy'`.

        Args:

        **str_from** (`str`): the source string.

        **str_to** (`str`): the target string.

        **allow_replace** (`bool`; default is `True`): whether to allow the
        **replace** edit operation.

        **allow_copy** (`bool`; default is `True`): whether to allow the
        **copy** edit operation.

        Returns the `tuple` of edit operations that is needed to transform
        **str_from** to **str_to**.
        """
        res = []
        for op, idx_dst, idx_src in editops(str_from, str_to):
            if op == 'delete':
                res.append(('d', idx_dst, None))
            else:
                ch_src = str_to[idx_src]
                if op == 'replace' and allow_replace:
                    res.append(('r', idx_dst, ch_src))
                elif op in ['insert', 'replace']:
                    op_prev, idx_prev, ch_prev = res[-1] if res else [0] * 3
                    if allow_copy and idx_prev \
                                  and str_from[idx_prev - 1] == ch_src \
                                  and (op_prev == 'c' or idx_prev != idx_dst):
                        res.append(('c', idx_dst, None))
                    else:
                        res.append(('i', idx_dst, str_to[idx_src]))
                    if op == 'replace':
                        res.append(('d', idx_dst, None))
                else:
                    raise ValueError("Unexpected operation code '{}'"
                                         .format(op))
        return tuple(res)
Example #27
def process_changes(start: str, end: str, errors: dict) -> None:
    changes = editops(start, end)
    for change in changes:
        orig = ''
        update = ''
        if change[0] == 'replace':
            orig = start[change[1]]
            update = end[change[2]]
        elif change[0] == 'insert':
            orig = '*'
            update = end[change[2]]
        elif change[0] == 'delete':
            orig = start[change[1]]
            update = '*'
        else:
            print('Error: change value is ' + str(change[0]))
        k = (orig, update)
        if k in errors.keys():
            errors[k] += 1
        else:
            errors[k] = 1
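A small hypothetical driver for process_changes, showing how the per-character error counts accumulate ('*' marks the missing side of an insert or delete); it assumes the function above and its editops import are in scope:

errors = {}
process_changes('test', 'tent', errors)   # one replace: s -> n
process_changes('cat', 'cart', errors)    # one insert:  * -> r
print(errors)                             # {('s', 'n'): 1, ('*', 'r'): 1}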
Example #28
	def phonetic_distance(self,other,normalized=False):
		phonemes1=self.phonemes()
		phonemes2=other.phonemes()

		p2chr={}
		for p in phonemes1+phonemes2:
			pstr=p.phon_str
			if not pstr in p2chr: p2chr[pstr]=unichr(len(p2chr))

		chr1=u''.join([p2chr[p.phon_str] for p in phonemes1])
		chr2=u''.join([p2chr[p.phon_str] for p in phonemes2])

		# @HACK @TODO
		#return 0 if chr1==chr2 else 10
		###

		from Levenshtein import editops
		dist=0.0
		for edit_type,index1,index2 in editops(chr1,chr2):
			if edit_type!='replace':
				dist+=1
				continue
			try:
				p1=phonemes1[index1]
				p2=phonemes2[index2]
				#print edit_type,p1,p2,index1,index2 #,p1.distance(p2)
				dist+=p1.distance(p2)
			except IndexError:
				dist+=1

		## @NEW
		# add a distpoint if does not end with same phoneme?
		try:
			if phonemes1[-1]!=phonemes2[-1]: dist+=2
		except IndexError:
			# ???? @TODO
			dist+=2

		if normalized: return dist / float(max(len(phonemes1),len(phonemes2)))
		return dist
Example #29
    def edit_operations(token1, token2):
        eds = editops(token1, token2)
        s = ""
        ops = ["none"] * max(len(token2), len(token1))
        counter = 0

        for ed in eds:
            if ed[0] == "insert" or ed[0] == "replace":
                ops[ed[2]] = (ed[0] + "_" + token2[ed[2]])
            elif ed[0] == "delete":
                ops[ed[2]] = (ed[0] + "_" + token1[ed[2]])
            counter += 1

        # Remove trailing "none"s
        #if token1 == "breakbased":
        #	print ops
        #	print "==="
        for x in reversed(range(len(ops))):
            if ops[x] == "none":
                del ops[x]
            else:
                break

        #if token2 == "emergency management officer":
        #	print "EMERGENCY MANAGEMENT OFFICER\n====================="
        #	print token1
        #	print token2
        #	print ops
        #	print "==========================="
        '''
		if ops != []:
			print "<<"
			print "Tokens:      " + token1, token2
			print "Eds:         " + str(eds) 
			print "Operations:  " + str(tuple(ops))
			print ">>"
		'''

        return tuple(ops)
Example #30
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("title_file", help=ARG_HELP_STRINGS["title_file"])
    parser.add_argument("-m", "--match_threshold", type=float, default=0.9, help=ARG_HELP_STRINGS["match_threshold"])
    parser.add_argument("-a", "--ask_threshold", type=float, default=0.8, help=ARG_HELP_STRINGS["ask_threshold"])
    parser.add_argument("-c", "--colors", type=bool, default=COLORS_DEFAULT, help=ARG_HELP_STRINGS["ansi_colors"])
    parser.add_argument("--start", type=int, default=0, help=ARG_HELP_STRINGS["start"])
    parser.add_argument("--end", type=int, default=inf, help=ARG_HELP_STRINGS["end"])
    args = parser.parse_args()
    
    header = None
    additional_fields = ["doi", "similarity"]
    
    with open(args.title_file, "r") as f:
        reader = csv.DictReader(f)
        title_field = None
        for field in reader.fieldnames:
            if field.lower() in TITLE_HEADER_WL:
                print(colorise("Using column '" + field + "' as title column", "green"))
                title_field = field
                break
        else:
            print(colorise("ERROR: Could not find a column name which might denote a title column", "red"))
            sys.exit()
        header = reader.fieldnames
        for field in additional_fields:
            if field not in header:
                header.append(field)
        modified_lines = []
        ask_count = 0
        for line in reader:
            line["ask"] = False
            if reader.line_num < args.start or reader.line_num > args.end:
                continue
            print(BREAK)
            title = line[title_field]
            head = "line " + str(reader.line_num) + ", query title:"
            print(colorise(head.ljust(L_JUST) + "'" + title + "'", "blue"))
            ret = crossref_query_title(title)
            retries = 0
            while not ret['success'] and retries < MAX_RETRIES_ON_ERROR:
                retries += 1
                msg = "Error while querying CrossRef API ({}), retrying ({})...".format(ret["exception"], retries)
                print(colorise(msg, "red"))
                ret = crossref_query_title(title)
            result = ret["result"]
            msg_tail = "'{}' [{}]"
            msg_tail = msg_tail.format(result["crossref_title"], result["doi"])
            if result["similarity"] == 1.0:
                msg_head = "Perfect match found ({}):"
                msg_head = msg_head.format(round(result["similarity"], 2)).ljust(L_JUST)
                print(colorise(msg_head + msg_tail, "cyan"))
                line.update(result)
                line["ask"] = False
            elif result["similarity"] >= args.match_threshold:
                msg_head = "Good match found ({}):"
                msg_head = msg_head.format(round(result["similarity"], 2)).ljust(L_JUST)
                print(colorise(msg_head + msg_tail, "green"))
                line.update(result)
                line["ask"] = False
            elif result["similarity"] >= args.ask_threshold:
                msg_head = "Possible match found ({}):"
                msg_head = msg_head.format(round(result["similarity"], 2)).ljust(L_JUST)
                print(colorise(msg_head + msg_tail, "yellow"))
                line.update(result)
                line["line_num"] = reader.line_num
                line["ask"] = True
                ask_count += 1
            else:
                msg_head = "No match found, most similar was ({}):"
                msg_head = msg_head.format(round(result["similarity"], 2)).ljust(L_JUST)
                print(colorise(msg_head + msg_tail, "red"))
                line.update(EMPTY_RESULT)
                line["ask"] = False
            modified_lines.append(line)
        if ask_count > 0:
            print(BREAK)
            ask_msg = "{} matches found with a similarity between {} and {} will need manual confirmation:"
            ask_msg = ask_msg.format(ask_count, args.ask_threshold, args.match_threshold)
            print(colorise(ask_msg, "green"))
        for line in modified_lines:
            if line["ask"]:
                print(BREAK)
                query_t = line[title_field]
                xref_t = line["crossref_title"]
                # display matching segments in identical colors for easier recognition
                diff = matching_blocks(editops(query_t.lower(), xref_t.lower()), query_t, xref_t)
                query_print = query_t
                xref_print = xref_t
                # ANSI codes increase string length, so we need an offset to compensate
                offset = 0
                for i in range(len(diff)):
                    a, b, c = diff[i]
                    a += offset
                    b += offset
                    offset += 9
                    color = CMP_COLORS[i % len(CMP_COLORS)]
                    query_print = colorise_text_segment(query_print, a, a + c , color)
                    xref_print = colorise_text_segment(xref_print, b, b + c , color)
                query_head = colorise("line {}, query title:".format(line["line_num"]), "blue")
                xref_head = colorise("Possible match ({}):".format(round(line["similarity"], 2)), "yellow")
                print(query_head.ljust(L_JUST) + query_print)
                print(xref_head.ljust(L_JUST) + xref_print)
                answer = input("Do you want to accept the DOI for the match title? (y/n):")
                while answer not in ["y", "n"]:
                    answer = input("Please type 'y' or 'n':")
                if answer == "n":
                    line.update(EMPTY_RESULT)
                
        with open("out.csv", "w") as out:
            dialect = csv.excel
            dialect.quoting = csv.QUOTE_ALL
            writer = csv.DictWriter(out, header, extrasaction='ignore', dialect=dialect)
            writer.writeheader()
            writer.writerows(modified_lines)
Example #31
def renew_ann(old_txt_fn,
              old_ann_fn,
              new_txt_fn,
              save_new_ann_to,
              rewrite=False):
    """If we have a brat annotation for some txt-file already done, and we
    have to change that txt slightly, this method helps you to adjust the
    annotation for the new version of the txt.

    :param old_txt_fn: a path to the old txt-file.
    :param old_ann_fn: a path to the old ann-file.
    :param new_txt_fn: a path to the new txt-file.
    :param save_new_ann_to: a path where the renewed ann will be saved to.
    :param rewrite: if ``True``, allow *save_new_ann_to* to be equal to
                    *old_ann_fn*. Default is ``False``.
    """
    assert rewrite or save_new_ann_to != old_ann_fn, \
        'ERROR: use `rewrite=True` param if you really want to change ' \
        'original ann-file'

    from Levenshtein import editops

    with io.open(old_txt_fn, 'rt', encoding='utf-8', newline='') as f:
        old_txt = f.read()
    with io.open(new_txt_fn, 'rt', encoding='utf-8', newline='') as f:
        new_txt = f.read()
        if '\r' in new_txt:
            print('WARNING: The new txt file ("{}") contains "CR" symbols '
                  'that may cause errors ("nonprintable characters") in the '
                  'brat engine. Consider removing "CR" symbols from the new '
                  'txt file and renewing the annotations.'.format(new_txt_fn))
    with open(old_ann_fn, 'rt', encoding='utf-8') as f:
        old_ann = f.read().split('\n')
    transfer_map, shift, prev_idx = list(range(len(old_txt) + 1)), 0, 0
    for op, idx_src, idx_dst in editops(old_txt, new_txt):
        if shift:
            for idx in range(prev_idx, idx_src):
                transfer_map[idx] += shift
        if op == 'insert':
            shift += 1
            prev_idx = idx_src
        elif op == 'replace':
            transfer_map[idx_src] += shift
            prev_idx = idx_src + 1
        elif op == 'delete':
            transfer_map[idx_src] = None
            shift -= 1
            prev_idx = idx_src + 1
        else:
            raise ValueError('ERROR: Unknown operation "{}"'.format(op))
    if shift:
        for idx in range(prev_idx, len(old_txt) + 1):
            transfer_map[idx] += shift
    new_ann = []
    len_old_txt, len_new_txt = len(old_txt), len(new_txt)
    old_aids, new_aids, aid_map, all_spans = set(), set(), {}, {}
    for line_no, ann in enumerate(old_ann, start=1):
        ann = ann.split('\t')
        if not ann:
            continue
        aid = ann[0]
        old_aids.add(aid)
        chunks_old = ann[1].split(';') if len(ann) >= 2 else []
        chunks_new, spans, fragments = [], [], []
        for chunk_old in chunks_old:
            chunk_new, span, idx0 = [], [], None
            for token in chunk_old.split():
                if token.isdigit():
                    idx_ = int(token)
                    assert idx_ <= len_old_txt, \
                        'ERROR: Position "{}" in line {} is outside of ' \
                        'bounds of the file {}' \
                            .format(idx_, line_no, old_ann_fn)
                    # if we have a start of the fragment
                    if not span:
                        # search for transfer not to None
                        for idx in transfer_map[idx_:]:
                            if idx is not None:
                                idx0 = idx
                                if idx == len_new_txt:
                                    idx = None
                                else:
                                    # if the old fragment starts after ' ',
                                    # the new one should do, too
                                    if idx_ == 0 \
                                    or old_txt[idx_ - 1].isspace():
                                        for i in range(idx, 0, -1):
                                            if new_txt[i - 1].isspace():
                                                idx = i
                                                break
                                        else:
                                            idx = 0
                                    # anyway, we can't point to ' '
                                    for i in range(idx, len_new_txt):
                                        if not new_txt[i].isspace():
                                            idx = i
                                            break
                                    else:
                                        idx = None
                                    #idx0 = idx
                                if idx is None:
                                    token = None
                                else:
                                    span.append(idx)
                                    token = str(idx)
                                break
                        else:
                            break
                    # end of the fragment
                    else:
                        for idx in reversed(transfer_map[:idx_ + 1]):
                            if idx is not None:
                                if idx <= idx0:
                                    idx = span = None
                                else:
                                    # if the old fragment ends with ' ',
                                    # the new one should do, too
                                    if idx_ == len_old_txt \
                                    or old_txt[idx_].isspace():
                                        for i in range(idx, len_new_txt):
                                            if new_txt[i].isspace():
                                                idx = i
                                                break
                                        else:
                                            idx = len_new_txt
                                    # anyway, we don't want to have ' '
                                    # in the end
                                    for i in range(idx, 0, -1):
                                        if not new_txt[i - 1].isspace():
                                            idx = i
                                            break
                                    else:
                                        idx = None
                                if idx is None:
                                    token = None
                                else:
                                    span.append(idx)
                                    token = str(idx)
                                break
                        else:
                            break
                if token is None or (token in old_aids
                                     and token not in new_aids):
                    token = aid_map.get(token)
                    if not token:
                        chunk_new = None
                        break
                chunk_new.append(token)
            if span:
                if len(span) != 2:
                    print(span)
                assert len(span) == 2, 'ERROR: Invalid line {} in {} file' \
                                           .format(line_no, old_ann_fn)
                for span_ in spans:
                    if span[0] >= span_[0] and span[0] < span_[1]:
                        span[0] = span_[1]
                    if span[1] > span_[0] and span[1] <= span_[1]:
                        span[1] = span_[0]
                if span[1] > span[0]:
                    for i, span_ in enumerate(reversed(spans), start=1):
                        if span[0] == span_[1]:
                            span_[1] = span[1]
                        elif span[1] == span_[0]:
                            span_[0] = span[0]
                        else:
                            continue
                        ###
                        frag_start, frag_end = span
                        frags_ = new_txt[frag_start:frag_end].split('\n')
                        fragment = ''
                        chunk_new = [str(frag_start), str(frag_end)]
                        for i, frag in enumerate(frags_):
                            frag_len = len(frag)
                            if frag_len:
                                frag_end = frag_start + frag_len
                                frag_start = frag_end + 1
                                chunk_new[0] += ' ' + str(frag_end) + \
                                                ';' + str(frag_start)
                                fragment += frag + ' '
                        fragment += frags_[-1]
                        chunks_new[-i][-2:] = chunk_new
                        fragments[-i] = fragment
                        ###
                        #chunks_new[-i][-2:] = [str(span_[0]), str(span_[1])]
                        #fragments[-i] = new_txt[span_[0]:span_[1]]
                        chunk_new = None
                        break
                    else:
                        spans.append(span)
                        if len(chunk_new) > 2:
                            chunks_new.append(chunk_new[:-2])
                        ###
                        frag_start, frag_end = span
                        frags_ = new_txt[frag_start:frag_end].split('\n')
                        fragment = ''
                        chunk_new = [str(frag_start), str(frag_end)]
                        for i, frag in enumerate(frags_[:-1]):
                            frag_len = len(frag)
                            if frag_len:
                                frag_end = frag_start + frag_len
                                frag_start = frag_end + 1
                                chunk_new[0] += ' ' + str(frag_end) + \
                                                ';' + str(frag_start)
                                fragment += frag + ' '
                        fragment += frags_[-1]
                        fragments.append(fragment)
                        ###
                        #chunk_new = [str(span[0]), str(span[1])]
                        #fragments.append(new_txt[span[0]:span[1]])
                else:
                    chunk_new = None
            if chunk_new is None:
                if not chunks_new:
                    break
            else:
                chunks_new.append(chunk_new)
        if spans:
            order_ = [
                i for _, i in sorted((x, i) for i, x in enumerate(spans))
            ]
            spans = [spans[i] for i in order_]
            chunk_ = ' '.join(chunks_new[0])
            chunks_new_ = [chunks_new[i + 1] for i in order_]
            fragments_ = [fragments[i] for i in order_]
            end_pos = None
            chunks_new, fragments = [], []
            for span, chunk_new, fragment in zip(spans, chunks_new_,
                                                 fragments_):
                if end_pos:
                    for c in new_txt[end_pos:span[0]]:
                        if c != ' ':
                            chunks_new.append(chunk_new)
                            fragments.append(fragment)
                            break
                    else:
                        chunks_new[-1][1] = chunk_new[1]
                        fragments[-1] += ' ' * (span[0] - end_pos) + fragment
                else:
                    chunks_new.append(chunk_new)
                    fragments.append(fragment)
                end_pos = span[1]
            chunks_new[0].insert(0, chunk_)
        if chunks_old and chunks_new:
            if chunks_new:
                chunks_new = ';'.join(' '.join(x) for x in chunks_new)
                if chunks_new in all_spans:
                    aid_map[aid] = all_spans[chunks_new]
                    continue
                all_spans[chunks_new] = aid
                chunks_new = [chunks_new]
            else:
                chunks_new = []
            new_ann.append(
                '\t'.join([aid] + chunks_new +
                          ([' '.join(fragments)] if fragments else ann[2:])))
            new_aids.add(aid)
    with io.open(save_new_ann_to, 'wt', encoding='utf-8', newline='\n') as f:
        if new_ann:
            f.write('\n'.join(new_ann) + '\n')
Example #32
def main(data_path, model_path, epochs):
    with open(os.path.join(model_path, 'train.txt')) as train_file:
        train = [x.strip() for x in train_file.readlines()]
    with open(os.path.join(model_path, 'test.txt')) as test_file:
        test = [x.strip() for x in test_file.readlines()]
    csv_logger = CSVLogger(os.path.join(model_path, 'Log1.csv'))
    dataset = Dataset(data_path)
    signal_seq = ExampleSequence(dataset,
                                 train,
                                 name='train',
                                 batch_size=batch_size)
    test_seq = ExampleSequence(dataset,
                               test,
                               name='test',
                               batch_size=batch_size)
    model = load_model(os.path.join(model_path, 'model.h5'),
                       custom_objects={
                           '<lambda>': lambda y_true, y_pred: y_pred
                       })
    model = multi_gpu_model(model, gpus=2)
    param = {
        'lr': 0.001,
        'beta_1': 0.9,
        'beta_2': 0.999,
        'epsilon': None,
        'clipvalue': 2
    }
    adam = optimizers.Adam(**param)
    model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=adam)
    model.fit_generator(signal_seq,
                        validation_data=test_seq,
                        epochs=epochs,
                        callbacks=[csv_logger])
    model.save(os.path.join(model_path, 'model_1.h5'))
    sub_model = model.get_layer('model_2')
    sub_model = sub_model.get_layer('model_1')
    im_model = Model(inputs=sub_model.get_input_at(0),
                     outputs=sub_model.get_layer('activation_1').output)
    dists = []
    ops = []
    lens = []
    pred_lens = []
    real = []
    predicted = []
    for j in range(len(test_seq)):
        batch = test_seq[j][0]
        preds = im_model.predict_on_batch(batch)
        val = K.ctc_decode(preds,
                           np.full(batch_size, batch['input_length'][0, 0]),
                           greedy=False)
        decoded = K.eval(val[0][0])
        for i in range(decoded.shape[0]):
            real_label = batch['the_labels'][i, :batch['label_length'][i, 0]]
            real_label = ''.join([str(int(x)) for x in real_label.tolist()])
            pred_label = list(filter(lambda x: x != -1,
                                     decoded[i, :].tolist()))
            pred_label = [str(x) for x in pred_label]
            pred_label = ''.join(pred_label)
            dists.append(distance(pred_label, real_label))
            ops.append(editops(pred_label, real_label))
            lens.append(len(real_label))
            pred_lens.append(len(pred_label))
            real.append(real_label)
            predicted.append(pred_label)
    op_counts = {'insert': 0, 'replace': 0, 'delete': 0}
    for op in ops:
        for x in op:
            op_counts[x[0]] += 1
    for key in op_counts.keys():
        op_counts[key] = op_counts[key] / sum(lens)
    metrics = {
        'LER': sum(dists) / sum(lens),
        'real_mean_length': np.mean(lens),
        'predicted_mean_length': np.mean(pred_lens)
    }
    metrics.update(op_counts)
    metrics_file_path = os.path.join(model_path, 'metrics_continue.json')
    write_dict_to_file(metrics_file_path, metrics)
Example #33
def lcs(s1, s2):
    z = matching_blocks(editops(s1, s2),s1, s2)
    return np.max(list(zip(*z))[2])
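Note that lcs here returns the length of the longest contiguous matching block rather than a full longest common subsequence. A quick example (with classic python-Levenshtein, assuming the imports used by the function above):

print(lcs("spam", "park"))  # 2, from the shared block "pa"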
Example #34
 def _get_matching_blocks(query, text):
     return matching_blocks(editops(query, text), query, text)
Example #35
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("title_file", help=ARG_HELP_STRINGS["title_file"])
    parser.add_argument("-m",
                        "--match_threshold",
                        type=float,
                        default=0.9,
                        help=ARG_HELP_STRINGS["match_threshold"])
    parser.add_argument("-a",
                        "--ask_threshold",
                        type=float,
                        default=0.8,
                        help=ARG_HELP_STRINGS["ask_threshold"])
    parser.add_argument("-c",
                        "--colors",
                        type=bool,
                        default=COLORS_DEFAULT,
                        help=ARG_HELP_STRINGS["ansi_colors"])
    parser.add_argument("--start",
                        type=int,
                        default=0,
                        help=ARG_HELP_STRINGS["start"])
    parser.add_argument("--end",
                        type=int,
                        default=inf,
                        help=ARG_HELP_STRINGS["end"])
    args = parser.parse_args()

    header = None
    additional_fields = ["doi", "similarity"]

    with open(args.title_file, "r") as f:
        reader = csv.DictReader(f)
        title_field = None
        for field in reader.fieldnames:
            if field.lower() in TITLE_HEADER_WL:
                print(
                    colorise("Using column '" + field + "' as title column",
                             "green"))
                title_field = field
                break
        else:
            print(
                colorise(
                    "ERROR: Could not find a column name which might denote a title column",
                    "red"))
            sys.exit()
        header = reader.fieldnames
        for field in additional_fields:
            if field not in header:
                header.append(field)
        modified_lines = []
        ask_count = 0
        for line in reader:
            line["ask"] = False
            if reader.line_num < args.start or reader.line_num > args.end:
                continue
            print(BREAK)
            title = line[title_field]
            head = "line " + str(reader.line_num) + ", query title:"
            print(colorise(head.ljust(L_JUST) + "'" + title + "'", "blue"))
            ret = crossref_query_title(title)
            retries = 0
            while not ret['success'] and retries < MAX_RETRIES_ON_ERROR:
                retries += 1
                msg = "Error while querying CrossRef API ({}), retrying ({})...".format(
                    ret["exception"], retries)
                print(colorise(msg, "red"))
                ret = crossref_query_title(title)
            result = ret["result"]
            msg_tail = "'{}' [{}]"
            msg_tail = msg_tail.format(result["crossref_title"], result["doi"])
            if result["similarity"] == 1.0:
                msg_head = "Perfect match found ({}):"
                msg_head = msg_head.format(round(result["similarity"],
                                                 2)).ljust(L_JUST)
                print(colorise(msg_head + msg_tail, "cyan"))
                line.update(result)
                line["ask"] = False
            elif result["similarity"] >= args.match_threshold:
                msg_head = "Good match found ({}):"
                msg_head = msg_head.format(round(result["similarity"],
                                                 2)).ljust(L_JUST)
                print(colorise(msg_head + msg_tail, "green"))
                line.update(result)
                line["ask"] = False
            elif result["similarity"] >= args.ask_threshold:
                msg_head = "Possible match found ({}):"
                msg_head = msg_head.format(round(result["similarity"],
                                                 2)).ljust(L_JUST)
                print(colorise(msg_head + msg_tail, "yellow"))
                line.update(result)
                line["line_num"] = reader.line_num
                line["ask"] = True
                ask_count += 1
            else:
                msg_head = "No match found, most similar was ({}):"
                msg_head = msg_head.format(round(result["similarity"],
                                                 2)).ljust(L_JUST)
                print(colorise(msg_head + msg_tail, "red"))
                line.update(EMPTY_RESULT)
                line["ask"] = False
            modified_lines.append(line)
        if ask_count > 0:
            print(BREAK)
            ask_msg = "{} matches found with a similarity between {} and {} will need manual confirmation:"
            ask_msg = ask_msg.format(ask_count, args.ask_threshold,
                                     args.match_threshold)
            print(colorise(ask_msg, "green"))
        for line in modified_lines:
            if line["ask"]:
                print(BREAK)
                query_t = line[title_field]
                xref_t = line["crossref_title"]
                # display matching segments in identical colors for easier recognition
                diff = matching_blocks(
                    editops(query_t.lower(), xref_t.lower()), query_t, xref_t)
                query_print = query_t
                xref_print = xref_t
                # ANSI codes increase string length, so we need an offset to compensate
                offset = 0
                for i in range(len(diff)):
                    a, b, c = diff[i]
                    a += offset
                    b += offset
                    offset += 9
                    color = CMP_COLORS[i % len(CMP_COLORS)]
                    query_print = colorise_text_segment(
                        query_print, a, a + c, color)
                    xref_print = colorise_text_segment(xref_print, b, b + c,
                                                       color)
                query_head = colorise(
                    "line {}, query title:".format(line["line_num"]), "blue")
                xref_head = colorise(
                    "Possible match ({}):".format(round(line["similarity"],
                                                        2)), "yellow")
                print(query_head.ljust(L_JUST) + query_print)
                print(xref_head.ljust(L_JUST) + xref_print)
                answer = input(
                    "Do you want to accept the DOI for the match title? (y/n):"
                )
                while answer not in ["y", "n"]:
                    answer = input("Please type 'y' or 'n':")
                if answer == "n":
                    line.update(EMPTY_RESULT)

        with open("out.csv", "w") as out:
            dialect = csv.excel
            dialect.quoting = csv.QUOTE_ALL
            writer = csv.DictWriter(out,
                                    header,
                                    extrasaction='ignore',
                                    dialect=dialect)
            writer.writeheader()
            writer.writerows(modified_lines)
Example #36
def do(input_file_name, backward_target_length, dest_folder_path):
    cur_cnt = 0
    target_cnt = count_line_in_file(input_file_name)
    final_results = {}

    for target_set in open(input_file_name, 'r'):
        # Each line of the input file has the form: name : wild sequence : target.
        tmp = target_set.split(':')
        # Strip whitespace from the file name
        file_name_no_ext = tmp[0].strip()
        # Append the file extension to the file name
        file_name = '{}.txt'.format(file_name_no_ext)
        # Strip whitespace from the wild sequence
        wild_seq = tmp[1].strip()
        # Strip whitespace from the target sequence
        target = tmp[2].strip()

        # Validate the target sequence
        target = seq_validator(target)
        if not target:
            continue

        # Validate the wild sequence
        wild_seq = seq_validator(wild_seq)
        if not wild_seq:
            continue

        # Create a folder for saving the results
        result_folder_name = os.path.join(BASE_DIR, 'analyse_results')
        if not os.path.exists(result_folder_name):
            os.makedirs(result_folder_name)

        try:
            # dict for temporarily storing results
            result = {
                'total_cnt':
                count_line_in_file(os.path.join(dest_folder_path, file_name)),
                'mutated_cnt':
                0,
                'mutated_rates':
                0.0,
                'mutated_dict': {}
            }

            for line in open(os.path.join(dest_folder_path, file_name), 'r'):
                # Validate the subject sequence
                line = seq_validator(line)
                if not line:
                    continue

                # Use the wild sequence and the target to find the start and end positions
                # of the target within the wild sequence; used with editops.
                target_start_pos_in_wild = int(wild_seq.find(target))
                target_end_pos = target_start_pos_in_wild + len(target)

                # Compare the subject sequence against the wild sequence and extract
                # the editops from the Levenshtein similarity computation.
                # editops returns tuples of the form
                # (edit type, position in the wild sequence, position in the subject sequence).
                # For example, editops('test', 'teaasz') gives
                # [('insert', 2, 2), ('insert', 2, 3), ('replace', 3, 5)]:
                # op 1: insert at position 2 of the wild sequence, position 2 of the subject sequence
                # op 2: insert at position 2 of the wild sequence, position 3 of the subject sequence
                # op 3: replace at position 3 of the wild sequence, position 5 of the subject sequence
                # So as long as we know exactly where the target sits inside the wild
                # sequence, we can detect whether a mutation occurred at the user-specified
                # positions without knowing where in the subject sequence the change happened.
                for mutation_info in editops(wild_seq, line):
                    # Check the user-specified window (from the end of the target back by backward_target_length positions)
                    if target_end_pos - int(
                            backward_target_length
                    ) <= mutation_info[1] <= target_end_pos:
                        # Replacements are not counted as mutations.
                        # A non-replacement whose subject-sequence character is 'N' is not counted either.
                        if mutation_info[0] != 'replace' and line[
                                mutation_info[2]] != 'N':
                            # Reaching this point counts as a mutation; increment the counter.
                            result['mutated_cnt'] += 1
                            # Store the mutated subject sequence for the report and count identical sequences.
                            if line not in result['mutated_dict'].keys():
                                result['mutated_dict'][line] = 1
                            else:
                                result['mutated_dict'][line] += 1
                            break

            # Compute the mutation percentage
            try:
                result['mutated_rates'] = float(
                    result['mutated_cnt']) / result['total_cnt'] * 100
            except ZeroDivisionError:
                result['mutated_rates'] = 0

            # Save this file's results.
            with open(os.path.join(result_folder_name, file_name), 'w') as f:
                f.write('{}\n'.format(wild_seq))
                f.write('--------\n')
                for mutated_seq, cnt in result['mutated_dict'].items():
                    f.write('{} X {}\n'.format(mutated_seq, cnt))
                f.write('--------\n')
                f.write('mutation rates : {} %'.format(
                    result['mutated_rates']))

        except Exception as e:
            print(e)
            print(file_name, 'not found.')
        else:
            # If nothing failed, collect this file's result.
            final_results[file_name_no_ext] = result

        # Finished analysing one target; increment the progress counter.
        cur_cnt += 1
        # Display progress
        progress_percentage = float(cur_cnt) / target_cnt * 100
        print('{} % done'.format(progress_percentage))

        # Save the final results file (rewritten after each target).
        with open(os.path.join(result_folder_name, 'result_info.txt'),
                  'w') as f:
            for name, data in final_results.items():
                f.write('{} : {} : {}/{}\n'.format(name, data['mutated_rates'],
                                                   data['mutated_cnt'],
                                                   data['total_cnt']))
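
A short demonstration, with made-up sequences, of how the (operation, wild position, subject position) triples described in the comments above drive the position-gated mutation check; wild_seq, target, subject and backward_target_length are illustrative values only.

from Levenshtein import editops

# The worked example from the comments: each editop is
# (operation, index in the wild sequence, index in the subject sequence).
print(editops('test', 'teaasz'))
# -> [('insert', 2, 2), ('insert', 2, 3), ('replace', 3, 5)]

wild_seq = 'ACGTACGT'        # hypothetical wild-type sequence
target = 'TACG'              # hypothetical target inside the wild sequence
subject = 'ACGTACCGT'        # hypothetical read carrying one insertion
backward_target_length = 3   # hypothetical user-specified window

target_end = wild_seq.find(target) + len(target)
for op, w_idx, s_idx in editops(wild_seq, subject):
    if target_end - backward_target_length <= w_idx <= target_end:
        if op != 'replace' and subject[s_idx] != 'N':
            print('mutation: {} at wild position {}'.format(op, w_idx))
            break
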
Exemplo n.º 37
0
def fix_triggers(events_meg, events_behavior, event_type):
    # def fix_triggers(events_meg, events_behavior, event_type='triggTarget'):

    """ Use this function when the triggers are not identical between the meg
    and the behavioral file output by Matlab.
    """
    from nose.tools import assert_true
    from Levenshtein import editops
    # copy because can change data in place
    events_meg = np.copy(events_meg)
    events_behavior = events_behavior.copy()
    # initialize new field in behavioral file
    n_trials = len(events_behavior)
    events_type_labels = ['triggTarget', 'triggCue', 'triggProbe']
    for label in events_type_labels:
        events_behavior[label + '_missing'] = np.zeros(n_trials, bool)

    # concatenate all behavioral events into one long series of triggers
    events_behavior_triggers = np.reshape(np.vstack((
        events_behavior.triggTarget,
        events_behavior.triggCue,
        events_behavior.triggProbe)).T, [-1])

    # Identify missed, exchanged or additional trigger values in MEG as
    # compared to behavioral file

    def int_to_unicode(array):
        return ''.join(chr(int(ii)) for ii in array)

    changes = editops(int_to_unicode(events_behavior_triggers), int_to_unicode(events_meg[:, 2]))

    # for each modification
    print(changes)
    for modification, from_trigger, _ in changes:
        if modification == 'delete':
            this_trial = int(from_trigger // 3)
            this_event_type = from_trigger % 3
            # set False value to trigg[Type]_missing
            this_key = events_type_labels[this_event_type] + '_missing'
            events_behavior.set_value(this_trial, this_key, True)
        else:
            # TODO: implement other types of deletion, replacement etc error
            raise NotImplementedError()
            # TODO: remove or add elements in events_meg
    events_behavior['trial'] = range(len(events_behavior))
    # ---- make sure to have modulo 3
    events_meg = events_meg[:3 * (len(events_meg) // 3), :]  # FIXME: truncate to a multiple of 3
    assert_true((len(events_meg) % 3) == 0.)
    # delete trials absents from meg triggers:
    sel = np.where(events_behavior[event_type + '_missing'] == False)[0]
    # print sel
    events_behavior = events_behavior.iloc[sel]
    events_behavior.reset_index()

    # Returns specific types of events (Target, Cue or Probe)
    start = np.where([event_type == ii for ii in events_type_labels])[0][0]
    events_meg = events_meg[start::3, :]

    # check that same number of trials in MEG and behavior
    assert_true(len(events_meg) == len(events_behavior))

    events_behavior['meg_event_tsample'] = events_meg[:, 0]
    events_behavior['meg_file'] = events_meg[:, 1]
    events_behavior['meg_event_value'] = events_meg[:, 2]
    events_behavior = events_behavior.reset_index()
    return events_behavior
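
A minimal sketch, with invented trigger arrays, of the int_to_unicode trick used above: each integer trigger is mapped to a single character so that editops can align the expected (behavioral) and recorded (MEG) trigger streams, and a 'delete' op then pinpoints which trial and event type went missing.

import numpy as np
from Levenshtein import editops

def int_to_unicode(array):
    # One character per trigger value, so editops can align the streams.
    return ''.join(chr(int(v)) for v in array)

expected = np.array([1, 2, 3, 1, 2, 3, 1, 2, 3])  # hypothetical behavioral triggers
recorded = np.array([1, 2, 3, 1, 3, 1, 2, 3])     # hypothetical MEG triggers, one lost

for op, src, _ in editops(int_to_unicode(expected), int_to_unicode(recorded)):
    if op == 'delete':
        trial, event = divmod(src, 3)  # three triggers per trial: Target, Cue, Probe
        print('trial {} is missing event index {}'.format(trial, event))
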
Exemplo n.º 38
0
from Levenshtein import editops
from collections import Counter
import sys

orgA = open(sys.argv[1]).read().strip()
orgB = open(sys.argv[2]).read().strip()

bases = Counter(orgA) + Counter(orgB)

ops = editops(orgA, orgB)
mutations = [(orgA[a], orgB[b]) for (t, a, b) in ops if t == 'replace']
transitions = [(a, b) for (a, b) in mutations if (a, b) == ('a', 'g') or (
    a, b) == ('g', 'a') or (a, b) == ('c', 't') or (a, b) == ('t', 'c')]
a_t = [(a, b) for (a, b) in mutations
       if (a, b) == ('a', 't') or (a, b) == ('t', 'a')]
g_c = [(a, b) for (a, b) in mutations
       if (a, b) == ('c', 'g') or (a, b) == ('g', 'c')]
a_clg_t = [(a, b) for (a, b) in mutations
           if (a, b) == ('a', 'c') or (a, b) == ('c', 'a') or
           (a, b) == ('g', 't') or (a, b) == ('t', 'g')]
print('transitions', len(transitions) / len(orgA + orgB))
print('a<->t', len(a_t) / (bases['a'] + bases['t']))
print('g<->c', len(g_c) / (bases['g'] + bases['c']))
print('a<->c|g<->t', len(a_clg_t) / len(orgA + orgB))
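
The same replace-only filtering can be exercised on short in-memory strings; a toy sketch with hypothetical sequences standing in for the two input files read from sys.argv.

from Levenshtein import editops

org_a = 'gattaca'   # hypothetical stand-in for open(sys.argv[1]).read().strip()
org_b = 'gactata'   # hypothetical stand-in for open(sys.argv[2]).read().strip()
subs = [(org_a[i], org_b[j]) for op, i, j in editops(org_a, org_b) if op == 'replace']
transitions = [p for p in subs if p in {('a', 'g'), ('g', 'a'), ('c', 't'), ('t', 'c')}]
print('substitutions:', subs)
print('transition fraction:', len(transitions) / max(len(subs), 1))
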
            minimum_distance = ld
    return best_alternative


def resolve_variations(alternating, refstr):
    while True:
        resolved = resolve_single_variation(alternating, refstr)
        if resolved is False:
            return alternating
        alternating = resolved


str2 = resolve_variations(str2, str1)
fh_disambiguated.write(str2)

eops = editops(str1, str2)

stash = {
    str1: {
        'i': 0,
        'l': 0
    },
    str2: {
        'i': 0,
        'l': 0
    },
}


def get_line(offset, text):
    global stash