def choix_rememoration_index(Bs, indice, couple, indexation):
    # Variant of choice_rememoration_index below, with the index level fixed at 3.
    # Search and return the best case with index rememoration.
    result = [couple[0, 0], couple[0, 1]]
    index = 0
    for i in range(indice):
        trouve = False
        j = 0
        while not trouve and j < 3:
            d_incA = dist_inclusion(indexation[index, j], Bs)
            d_incB = dist_inclusion(indexation[i, j], Bs)
            if j == 2 and d_incA == d_incB:
                # Last index level and still a tie: break it on the distance
                # between Bs and the source sentences themselves.
                init_memo_fast_distance(Bs)
                dist_srcA = memo_fast_distance(couple[index, 0])
                dist_srcB = memo_fast_distance(couple[i, 0])
                if dist_srcB < dist_srcA:
                    result = [couple[i, 0], couple[i, 1]]
                    index = i
            else:
                if d_incB != 0 and d_incA != 0:
                    if d_incB < d_incA:
                        result = [couple[i, 0], couple[i, 1]]
                        index = i
                    trouve = True
                else:
                    if d_incA < d_incB:
                        trouve = True
            j += 1
    return result
def indirect_iteration_strategy(self, string=None, strategy='naive'):
    """
    If no Ds, then just output all triples of sentences in the order of the bicorpus.
    Else, apply the strategy selected to enumerate the source sentences in the bicorpus.
    There are three possible strategies implemented:
    naive: no sort is performed. Just output all triples of sentences
        in the order of the bicorpus.
    by distance: the As, Bs and Cs are enumerated by increasing distance
        to the sentence to be translated.
    closest: Bs is taken among the first_N sentences closest to Ds,
        As among those most similar to Bs, and Cs among those closest
        to the solution of the analogical equation Bs : As :: Ds : x.
    """
    Ds = string
    if __verbose__:
        print >> sys.stderr, 'Ds = "%s", strategy = %s' % (Ds, strategy)
    if Ds == None or strategy == 'naive':
        for triple in itertools.product(self, repeat=3):
            yield triple
    else:
        init_memo_fast_distance(Ds)
        if strategy == 'by distance':
            closest_As = sorted(self.keys(), key=lambda Xs: memo_fast_distance(Xs))[:100]
            for triple in sorted(itertools.product(closest_As, repeat=3)):
                As, Bs, Cs = triple
                if As != Bs and As != Cs and Bs != Cs:
                    if __verbose__:
                        print >> sys.stderr, '# {} : {} :: {} : {}'.format(As, Bs, Cs, Ds)
                    yield triple
        elif strategy == 'closest':
            first_N = 3
            for Bs in sorted(self.keys(), key=lambda Xs: memo_fast_distance(Xs))[:first_N]:
                init_memo_fast_distance(Bs)
                for As in sorted(self.keys(), key=lambda Xs: len(Bs) - memo_fast_similitude(Xs))[:first_N]:
                    if __verbose__:
                        print >> sys.stderr, '# {} : {} :: {} : x'.format(Bs, As, Ds)
                    CCs = solvenlg(Bs, As, Ds)
                    if CCs != None:
                        if __verbose__:
                            print >> sys.stderr, '# {} : {} :: {} : {}'.format(Bs, As, Ds, CCs)
                        init_memo_fast_distance(CCs)
                        for Cs in sorted(self.keys(), key=lambda Xs: memo_fast_distance(Xs))[:first_N]:
                            if __verbose__:
                                print >> sys.stderr, '# {} : {} :: {} : {}'.format(Bs, As, Ds, Cs)
                            yield (As, Bs, Cs)
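# A self-contained sketch of what the 'by distance' branch above enumerates:
# restrict the corpus to the sentences closest to Ds, then yield all ordered
# triples of distinct sentences among them. The helper name and the toy
# length-difference key (standing in for the LCS distance) are illustrative.

import itertools

def _toy_by_distance_triples(corpus, Ds, top_n=4):
    closest = sorted(corpus, key=lambda Xs: abs(len(Xs) - len(Ds)))[:top_n]
    for As, Bs, Cs in sorted(itertools.product(closest, repeat=3)):
        if As != Bs and As != Cs and Bs != Cs:
            yield As, Bs, Cs

# list(_toy_by_distance_triples(['a', 'ab', 'abc'], 'ab', top_n=3))
# yields the 6 ordered triples of the 3 distinct sentences.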
def cluster_to_matrix(self):
    """
    Builds a matrix representing the consistency of distances between ratios in a cluster.
    For two ratios A_i : B_i and A_j : B_j,
    if d(A_i, A_j) == d(B_i, B_j) then we fill the cell (i, j) in the matrix with a 0,
    else with a 1.
    """
    length = len(self)
    labels = [ (NlgSymbols.ratio).join(ratio) for ratio in self ]
    matrix = [ [ 1 for j in xrange(length) ] for i in xrange(length) ]
    for i, [A, B] in enumerate(self):
        dA = dict()
        init_memo_fast_distance(A)
        matrix[i][i] = 0
        for C, _ in self[i+1:]:
            dA[C] = memo_fast_distance(C)
        init_memo_fast_distance(B)
        for xj, [C, D] in enumerate(self[i+1:]):
            j = i + 1 + xj
            dBD = memo_fast_distance(D)
            if __trace__:
                print >> sys.stderr, '# %s : %s :: %s : %s, d(%s, %s) = %d %s %d = d(%s, %s)' % \
                    (A, B, C, D, A, C, dA[C], '==' if dA[C] == dBD else '=/=', dBD, B, D)
            if dA[C] == dBD:
                matrix[i][j] = matrix[j][i] = 0
    return SquareMatrix(matrix, labels=labels, visualization=__visualization__)
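# The consistency test above amounts to: for two ratios (A, B) and (C, D) in
# the same cluster, check whether d(A, C) == d(B, D). A self-contained
# illustration on a toy cluster, with a plain length difference standing in
# for the LCS distance; the helper name is illustrative.

def _toy_consistency_matrix(cluster, d=lambda X, Y: abs(len(X) - len(Y))):
    n = len(cluster)
    matrix = [ [ 1 for j in xrange(n) ] for i in xrange(n) ]
    for i, (A, B) in enumerate(cluster):
        matrix[i][i] = 0
        for xj, (C, D) in enumerate(cluster[i+1:]):
            j = i + 1 + xj
            if d(A, C) == d(B, D):
                matrix[i][j] = matrix[j][i] = 0
    return matrix

# _toy_consistency_matrix([('walk', 'walked'), ('talk', 'talked')])
# returns [[0, 0], [0, 0]]: the two ratios are mutually consistent.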
def direct_iteration_strategy(self, string=None, strategy='naive'):
    """
    If no Bs, then just iterate over the keys in the order of the dictionary.
    Else, apply the strategy selected to enumerate the source sentences in the bicorpus.
    There are three possible strategies implemented:
    naive: no sort is performed. The sentences are just enumerated
        in the order in which they appear in the bicorpus.
    by distance: the sentences are enumerated by increasing distance
        to the sentence to be translated. The LCS distance is used.
    by similitude: the sentences are enumerated by decreasing similarity
        with the sentence to be translated. Similitude is the length
        of the longest common subsequence (LCS).
    """
    Bs = string
    if __verbose__:
        print >> sys.stderr, 'Bs = "%s", strategy = %s' % (Bs, strategy)
    if Bs != None:
        init_memo_fast_distance(Bs)
    if Bs == None or strategy == 'naive':
        result = self.keys()
    elif strategy == 'by distance':
        result = sorted(self.keys(), key=lambda Xs: memo_fast_distance(Xs))
    elif strategy == 'by similitude':
        result = sorted(self.keys(), key=lambda Xs: memo_fast_similitude(Xs), reverse=True)
    for As in result:
        if Bs != None and memo_fast_distance(As) == 0:
            # Bs itself is in the bicorpus: output its entry and stop
            # when the generator is resumed.
            yield As, self[As]
            exit(0)
        yield As, self[As]
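# The two sort orders above are related but not interchangeable: for the LCS
# distance, d(A, B) = len(A) + len(B) - 2 * s(A, B), where s(A, B) is the
# length of the longest common subsequence. Sorting by increasing d(Bs, Xs)
# and by decreasing s(Bs, Xs) therefore coincide only when all the Xs have
# the same length, which is why both strategies are offered. A toy
# illustration, with shared-character counts standing in for s:

def _toy_orders(corpus, Bs):
    s = lambda Xs: len(set(Bs) & set(Xs))            # stand-in similitude
    d = lambda Xs: len(Bs) + len(Xs) - 2 * s(Xs)     # induced distance
    return (sorted(corpus, key=d),                   # 'by distance'
            sorted(corpus, key=s, reverse=True))     # 'by similitude'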
def correct(bicorpus, sentence=False, file=sys.stdin):
    if sentence != False:
        file = [sentence]
    for Bs in file:
        string = sentence
        Bs = Bs.rstrip('\n')
        if __verbose__:
            print >> sys.stderr, '\n# Translating sentence: {}'.format(Bs)
        if sentence == False:
            string = Bs
        # Index level.
        k = 3
        indice = 0
        indexation = {}
        couple = {}
        for As in bicorpus.iter(string, strategy='by distance', method='direct'):
            init_memo_fast_distance(Bs)
            # Case where the sentence is already in the case base.
            dist = memo_fast_distance(As[0])
            if dist == 0:
                print '{}'.format(As[1])
                sys.exit(0)
            else:
                a_s, b_s, c_s, e_s, pos, pos_em = single_correction(As[0], Bs, As[1])
                Bt = a_s + b_s + c_s
                dist_cible = memo_fast_distance(Bt)
                if dist_cible != 0:
                    phrase = e_s
                    indexation[indice, 0] = phrase
                    couple[indice, 0] = As[0]
                    couple[indice, 1] = As[1]
                    # Start at 1: level 0 is the substring to replace.
                    for i in range(1, k):
                        phrase = rememoration_index(As[0], phrase, pos_em)
                        indexation[indice, i] = phrase
                    indice += 1
        if indice > 0:
            result = choice_rememoration_index(Bs, indice, couple, indexation, k)
            a_s, b_s, c_s, e_s, pos, pos_em = single_correction(result[0], Bs, result[1])
            Bt = a_s + b_s + c_s
            print '{}'.format(Bt)
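# The `couple` and `indexation` tables built above are plain dicts keyed by
# tuples: couple[i, 0] holds the i-th retrieved source sentence, couple[i, 1]
# its solution, and indexation[i, j] the j-th index level of that case,
# level 0 being the substring to replace and levels 1..k-1 built by
# rememoration_index (sketched here as progressively widened contexts).
# A minimal sketch of the layout, with invented strings:

def _toy_case_base():
    couple, indexation = {}, {}
    couple[0, 0], couple[0, 1] = 'le chat dort', 'the cat sleeps'
    indexation[0, 0] = 'chat'            # level 0: substring to replace
    indexation[0, 1] = 'le chat'         # level 1: widened context
    indexation[0, 2] = 'le chat dort'    # level 2: widened further
    return couple, indexation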
def sort_by_median_ratio(strings):
    """
    Sort the strings in a set of strings, median strings first.
    The combined edit distance with all strings in the set is used.

    >>> sort_by_median_ratio(['a', 'ab', 'abcd', 'abcdef'])
    ['abcd', 'ab', 'a', 'abcdef']
    >>> sort_by_median_ratio(['a : a', 'aa : aa', 'aaaa : aaaa'])
    ['aa : aa', 'a : a', 'aaaa : aaaa']
    >>> sort_by_median_ratio(['a : aa', 'aa : aaa', 'aaa : aaaa', 'aaaa : aaaaa', 'aaaaa : aaaaaa'])[0]
    'aaa : aaaa'
    >>> sort_by_median_ratio(['', 'go', 'brew', 'study', 'overlook', 'understand'])
    ['', 'go', 'brew', 'study', 'overlook', 'understand']
    """
    dist = collections.defaultdict(int)
    # If strings contains too many members, shuffling it and then considering
    # only the first 100 members amounts to taking a sample of 100 members.
    # Caution: this introduces randomness, so the results may differ
    # between two subsequent runs of the program.
    random.shuffle(strings)
    # Compare each member of strings to only the sample of 100 members.
    for A in strings[:100]:
        init_memo_fast_distance(A)
        for B in strings:
            dist[B] += memo_fast_distance(B)
    # Sorting by combined distance is equivalent to taking the average of
    # all distances and sorting by closeness to the average.
    result = sorted(dist, key=dist.get)
    if __visualization__ and __trace__:
        visualize(dist, NlgSymbols.conformity.join(result[:2]))
    return result
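# What the function computes is essentially a medoid ordering: the element
# whose summed distance to (a sample of) all the others is smallest comes
# first. A self-contained restatement with an explicit, exhaustive distance
# computation; the helper name and the toy length-difference distance are
# illustrative, the real function uses the LCS distance:

def _toy_sort_by_median(strings, d=lambda A, B: abs(len(A) - len(B))):
    return sorted(strings, key=lambda B: sum(d(A, B) for A in strings))

# _toy_sort_by_median(['a', 'ab', 'abcd', 'abcdef'])
# returns ['ab', 'abcd', 'a', 'abcdef']: the middle-length strings come
# first, mirroring the first doctest above.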
def gamma_hypothesis(self):
    # Checking the gamma hypothesis:
    #     \gamma(A,B,C,D) == \gamma(B,A,D,C) == \gamma(C,D,A,B) == \gamma(D,C,B,A)
    # Y. Lepage, De l'analogie rendant compte..., thèse d'habilitation, 2003, p. 145--147.
    # Seems to be always verified by clusters output by this program.
    cluster = self  # the generator is assumed to run on the cluster itself
    if __verbose__:
        print >> sys.stderr, '# Checking gamma constraint...'
    if len(cluster) == 2:
        A, B, C, D = cluster[0][0], cluster[0][1], cluster[1][0], cluster[1][1]
        lenA, lenB, lenC, lenD = len(A), len(B), len(C), len(D)
        init_memo_fast_distance(A)
        dAB, dAC = memo_fast_distance(B), memo_fast_distance(C)
        sAB, sAC = lenA + lenB - 2 * dAB, lenA + lenC - 2 * dAC
        init_memo_fast_distance(D)
        dDB, dDC = memo_fast_distance(B), memo_fast_distance(C)
        sDB, sDC = lenD + lenB - 2 * dDB, lenD + lenC - 2 * dDC
        gammaA = sAB + sAC - lenA
        gammaB = sAB + sDB - lenB
        gammaC = sAC + sDC - lenC
        gammaD = sDB + sDC - lenD
        if gammaA == gammaB == gammaC == gammaD:
            yield cluster
    else:
        # Clusters of other sizes are passed through unchecked.
        yield cluster
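# A worked instance of the computation above, on the toy analogy
# aa : ab :: ba : bb, i.e. the cluster [('aa', 'ab'), ('ba', 'bb')].
# Each of the four pairwise LCS distances used (dAB, dAC, dDB, dDC) is 2,
# since each pair of strings differs by one deletion and one insertion,
# and all four strings have length 2, so:
#
#     sAB = sAC = sDB = sDC = 2 + 2 - 2 * 2 = 0
#     gammaA = gammaB = gammaC = gammaD = 0 + 0 - 2 = -2
#
# The four gamma values coincide, so this cluster satisfies the hypothesis.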
def choice_rememoration_index(Bs, indice, couple, indexation, k):
    """
    input:
        Bs = sentence to correct
        indice = number of cases to compare
        couple = the source problems paired with their solutions
        indexation = list of strings obtained by the method rememoration_index
        k = index level
    output:
        the couple chosen to apply the correction
    """
    # Search and return the best case with index rememoration.
    result = [couple[0, 0], couple[0, 1]]
    index = 0
    for i in range(indice):
        trouve = False
        j = 0
        while not trouve and j < k:
            d_incA = dist_inclusion(indexation[index, j], Bs)
            d_incB = dist_inclusion(indexation[i, j], Bs)
            if j == k - 1 and d_incA == d_incB:
                # Last index level and still a tie: break it on the distance
                # between Bs and the source sentences themselves.
                init_memo_fast_distance(Bs)
                dist_srcA = memo_fast_distance(couple[index, 0])
                dist_srcB = memo_fast_distance(couple[i, 0])
                if dist_srcB < dist_srcA:
                    result = [couple[i, 0], couple[i, 1]]
                    index = i
            else:
                if d_incB != 0 and d_incA != 0:
                    if d_incB < d_incA:
                        result = [couple[i, 0], couple[i, 1]]
                        index = i
                    trouve = True
                else:
                    if d_incA < d_incB:
                        trouve = True
            j += 1
    return result
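# dist_inclusion is not defined in this file. From its use above it takes an
# index string and the sentence Bs and returns an integer score, where
# smaller nonzero values win the comparison and zero is treated specially.
# A stand-in matching only that call signature, for experimenting with the
# selection logic; the scoring rule below is an assumption, not the
# project's definition:

def _stub_dist_inclusion(indexed, Bs):
    # Assumption: 0 when the index string is fully included in Bs,
    # otherwise a crude length-based penalty.
    return 0 if indexed in Bs else abs(len(Bs) - len(indexed))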
def memo_fast_distance(word2):
    return _fast_distance.memo_fast_distance(word2)
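# The wrapper above exposes the two-step protocol used throughout this code:
# initialize the memo once with a reference string, then query the distance
# to many candidates. A minimal pure-Python sketch of that protocol, assuming
# the distance is the LCS edit distance (insertions and deletions only);
# the py_-prefixed names are illustrative, and the sketch recomputes the full
# dynamic programme on every query, without the memoization that gives the
# real compiled module its speed:

_py_reference = None

def py_init_memo_fast_distance(word1):
    # Remember the reference string for subsequent queries.
    global _py_reference
    _py_reference = word1

def py_memo_fast_distance(word2):
    # d(A, B) = len(A) + len(B) - 2 * |LCS(A, B)|,
    # computed by dynamic programming over prefixes.
    A, B = _py_reference, word2
    prev = range(len(B) + 1)
    for i in xrange(len(A)):
        cur = [i + 1]
        for j in xrange(len(B)):
            cur.append(prev[j] if A[i] == B[j] else min(prev[j + 1], cur[j]) + 1)
        prev = cur
    return prev[len(B)]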
def translate(bicorpus, sentence=False, file=sys.stdin):
    """
    input: Bs, a sentence in the source language
    output: list of Bt, sentences in the target language, candidate translations
    data (passed as arguments):
        bicorpus = list of pairs (As, At) where At is the translation of As.
        bidictionary = bilingual dictionary of (a_s, a_t) where a_s is a word,
            and a_t its translation.
    """
    if sentence != False:
        file = [sentence]
    for Bs in file:
        # Lowest distance between the string and the cases in the dictionary.
        dist = sys.maxint
        Bs = Bs.rstrip('\n')
        if __verbose__:
            print >> sys.stderr, '\n# Translating sentence: {}'.format(Bs)
        string = sentence
        if sentence == False:
            string = Bs
        # Index level.
        k = 6
        indice = 0
        indexation = {}
        couple = {}
        for As in bicorpus.iter(string, strategy='by distance', method='direct'):
            init_memo_fast_distance(Bs)
            # Case where the sentence is already in the case base.
            dist = memo_fast_distance(As[0])
            if dist == 0:
                print '{}\t{}'.format(Bs, As[1])
                sys.exit(0)
            else:
                a_s, b_s, c_s, e_s, pos, pos_em = single_correction(As[0], Bs, As[1])
                Bt = a_s + b_s + c_s
                dist_cible = memo_fast_distance(Bt)
                if dist_cible != 0:
                    phrase = e_s
                    indexation[indice, 0] = phrase
                    couple[indice, 0] = As[0]
                    couple[indice, 1] = As[1]
                    # Start at 1: level 0 is the substring to replace.
                    for i in range(1, k):
                        phrase = rememoration_index(As[0], phrase, pos_em)
                        indexation[indice, i] = phrase
                    indice += 1
        if indice > 0:
            # Select the best case on the first 3 index levels.
            result = choice_rememoration_index(Bs, indice, couple, indexation, 3)
            a_s, b_s, c_s, e_s, pos, pos_em = single_correction(result[0], Bs, result[1])
            Bt = a_s + b_s + c_s
            print Bt
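# A minimal driver sketch for the function above. `Bicorpus` is a
# hypothetical constructor name, since the construction of the bicorpus
# object is not shown in this file:
#
#     bicorpus = Bicorpus(open('corpus.src'), open('corpus.tgt'))
#     translate(bicorpus, sentence='Je suis sur Nancy .')   # one sentence
#     translate(bicorpus)                                   # read sys.stdin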
#!/usr/bin/python
# -*- coding: utf-8 -*-

from _fast_distance import init_memo_fast_distance, memo_fast_distance

###############################################################################

__author__ = 'Yves Lepage <*****@*****.**>'
__date__, __version__ = '29/08/2017', '1.0'    # Creation.

###############################################################################

# Compute the distance between every pair of words in tmp.wrd
# (comment lines starting with '#' are skipped).
words = open('tmp.wrd').readlines()

d = dict()
for i1, word1 in enumerate(words):
    if not word1.startswith('#'):
        if word1 not in d:
            d[word1] = dict()
        init_memo_fast_distance(word1.decode('UTF-8'))
        for word2 in words[i1+1:]:
            d[word1][word2] = memo_fast_distance(word2.decode('UTF-8'))
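# One possible way to inspect the result: the nested dictionary maps each
# word to the distances to all the words that follow it in the file. The
# dump below is illustrative only.

for word1 in sorted(d):
    for word2 in sorted(d[word1]):
        print '%s\t%s\t%d' % (word1.rstrip('\n'), word2.rstrip('\n'), d[word1][word2])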