Пример #1
0
def choix_rememoration_index(Bs, indice, couple, indexation):
    """
    Choose, among the stored cases, the one whose index strings compare
    best against Bs.

    Bs: the sentence every index string is compared with.
    indice: number of candidate cases; candidates are numbered 0..indice-1.
    couple: mapping where couple[n, 0] and couple[n, 1] are the two
        members of case n.
    indexation: mapping where indexation[n, level] is the index string of
        case n at level 0, 1 or 2.

    Returns the two-element list [couple[n, 0], couple[n, 1]] of the case
    that was kept. Case 0 is the initial choice, so it is returned as is
    when indice is 0.
    """
    chosen = [couple[0, 0], couple[0, 1]]
    best = 0
    for candidate in range(indice):
        for level in range(3):
            d_best = dist_inclusion(indexation[best, level], Bs)
            d_cand = dist_inclusion(indexation[candidate, level], Bs)
            if level == 2 and d_best == d_cand:
                # Still tied on the last level: decide on the distance
                # between Bs and the first member of each case.
                init_memo_fast_distance(Bs)
                src_best = memo_fast_distance(couple[best, 0])
                src_cand = memo_fast_distance(couple[candidate, 0])
                if src_cand < src_best:
                    chosen = [couple[candidate, 0], couple[candidate, 1]]
                    best = candidate
                continue
            if d_cand == 0 or d_best == 0:
                # A zero on either side is never used to decide;
                # look at the next level instead.
                continue
            if d_cand < d_best:
                chosen = [couple[candidate, 0], couple[candidate, 1]]
                best = candidate
                break
            if d_best < d_cand:
                # The current choice wins at this level.
                break
    return chosen
Пример #2
0
def direct_iteration_strategy(self, string=None, strategy='naive'):
	"""
	Enumerate the (source sentence, self[source sentence]) pairs of the bicorpus.

	If no Bs, then just iterate over the keys in the order of the dictionary.
	Else, apply the strategy selected to enumerate the source sentences in the bicorpus.
	There are 3 possible strategies implemented:
		naive: no sort is performed.
				The sentences are just enumerated in the order
				in which they appear in the bicorpus.
		by distance: the sentences are enumerated by increasing
				value of memo_fast_distance, measured after
				init_memo_fast_distance(Bs).
		by similitude: the sentences are enumerated by decreasing
				value of memo_fast_similitude, measured after
				init_memo_fast_distance(Bs).

	When Bs is given and a key is at distance 0 from it, that key is
	yielded and the enumeration then stops.

	Raises ValueError for a strategy name that is not one of the three above
	(only when Bs is given; without Bs the strategy is not consulted).
	"""
	Bs = string
	if __verbose__: print >> sys.stderr, 'Bs = "%s", strategy = %s' % (Bs, strategy)
	if Bs is not None:
		# Initialise the memo for every strategy (including 'naive') so that
		# the exact-match test below never runs against a stale reference.
		init_memo_fast_distance(Bs)
	if Bs is None or strategy == 'naive':
		result = self.keys()
	elif strategy == 'by distance':
		result = sorted(self.keys(), key=memo_fast_distance)
	elif strategy == 'by similitude':
		result = sorted(self.keys(), key=memo_fast_similitude, reverse=True)
	else:
		# Previously fell through with `result` unbound.
		raise ValueError('unknown strategy: %r' % (strategy,))
	for As in result:
		# Evaluate the match before yielding.
		# NOTE(review): the memo appears to be module-level state; a consumer
		# that re-initialises it between two yields would change what the
		# following checks are measured against -- confirm with callers.
		exact_match = Bs is not None and memo_fast_distance(As) == 0
		yield As, self[As]
		if exact_match:
			# End the enumeration here. This used to call exit(0), which
			# terminated the whole interpreter when the generator was resumed.
			return
Пример #3
0
def sort_by_median_ratio(strings):
    """
    Sort the strings in a set of strings, median strings first.
    The combined edit distance with all strings in the set is used.

    strings: any iterable of strings; it is copied, never modified.
    Returns a list holding each distinct string once, ordered by increasing
    summed distance. Ties come out in dict iteration order.

    >>> sort_by_median_ratio(['a', 'ab', 'abcd', 'abcdef'])
    ['abcd', 'ab', 'a', 'abcdef']
    >>> sort_by_median_ratio(['a : a', 'aa : aa', 'aaaa : aaaa'])
    ['aa : aa', 'a : a', 'aaaa : aaaa']
    >>> sort_by_median_ratio(['a : aa', 'aa : aaa', 'aaa : aaaa', 'aaaa : aaaaa', 'aaaaa : aaaaaa'])[0]
    'aaa : aaaa'
    >>> sort_by_median_ratio(['', 'go', 'brew', 'study' , 'overlook', 'understand'])
    ['', 'go', 'brew', 'study', 'overlook', 'understand']
    """
    # Shuffle a private copy: shuffling `strings` itself reordered the
    # caller's list as a side effect and failed on sets and tuples.
    pool = list(strings)
    # If there are too many strings, shuffling and then considering only the
    # first 100 members is the same as taking a sample of 100 members.
    # Caution: this introduces randomness, and thus the results may not be
    # the same for two subsequent runs of the program.
    random.shuffle(pool)
    dist = collections.defaultdict(int)
    # Every member is compared to (at most) the 100 sampled members.
    for A in pool[:100]:
        init_memo_fast_distance(A)
        for B in pool:
            dist[B] += memo_fast_distance(B)
    # The sums all run over the same sample, so ordering by the sum is the
    # same as ordering by the average distance to the sample.
    result = sorted(dist, key=dist.get)
    if __visualization__ and __trace__:
        visualize(dist, NlgSymbols.conformity.join(result[:2]))
    return result
Пример #4
0
def indirect_iteration_strategy(self, string=None, strategy='naive'):
    """
    Generate triples of source sentences (As, Bs, Cs) from the bicorpus.

    If no Ds (string is None), or if strategy is 'naive', just output all
    triples of sentences in the order of the bicorpus.
    Else, apply the strategy selected. Three strategy names are handled:
        naive: no sort is performed.
                Just output all triples of sentences in the order
                of the bicorpus.
        by distance: keep the 100 keys with the smallest
                memo_fast_distance to Ds, then output every triple of
                pairwise different sentences taken from them. The
                triples themselves come out in sorted (tuple) order,
                not in order of distance.
        closest: for each of the 3 keys closest to Ds (Bs), pick 3 keys
                As, solve solvenlg(Bs, As, Ds) and, when a solution CCs
                exists, output (As, Bs, Cs) for the 3 keys Cs closest to
                CCs. This branch also prints progress on stdout.
    Any other strategy name outputs nothing.
    """
    Ds = string
    if __verbose__:
        print >> sys.stderr, 'Ds = "%s", strategy = %s' % (Ds, strategy)
    if Ds == None or strategy == 'naive':
        # Cartesian cube of the bicorpus; repeated members are not filtered out here.
        for triple in itertools.product(self, repeat=3):
            yield triple
    else:
        init_memo_fast_distance(Ds)
        if strategy == 'by distance':
            # NOTE(review): same call as two lines above -- looks redundant.
            init_memo_fast_distance(Ds)
            closest_As = sorted(self.keys(),
                                key=lambda Xs: memo_fast_distance(Xs))[:100]
            for triple in sorted(itertools.product(closest_As, repeat=3)):
                As, Bs, Cs = triple
                # Only triples of three different sentences are output.
                if As != Bs and As != Cs and Bs != Cs:
                    if __verbose__:
                        print >> sys.stderr, '# {} : {} :: {} : {}'.format(
                            As, Bs, Cs, Ds)
                    yield triple
        elif strategy == 'closest':
            # Number of candidates kept at each of the three nesting levels.
            first_N = 3
            init_memo_fast_distance(Ds)
            # `result` is only used by the print below; the same sort is
            # computed again for the loop.
            result = sorted(self.keys(), key=lambda Xs: memo_fast_distance(Xs))
            print 'RESULT', result, '\n'
            for Bs in sorted(self.keys(),
                             key=lambda Xs: memo_fast_distance(Xs))[:first_N]:
                print Bs
                # From here on the memo is initialised with Bs, no longer Ds;
                # the list of Bs candidates was already built above.
                init_memo_fast_distance(Bs)
                # Keys with the smallest len(Bs) - memo_fast_similitude(Xs);
                # presumably the similitude is measured against Bs -- TODO confirm.
                for As in sorted(self.keys(),
                                 key=lambda Xs: len(Bs) - memo_fast_similitude(
                                     Xs))[:first_N]:
                    if __verbose__:
                        print >> sys.stderr, '# {} : {} :: {} : x'.format(
                            Bs, As, Ds)
                    CCs = solvenlg(Bs, As, Ds)
                    # Nothing is output for this (Bs, As) when solvenlg returns None.
                    if CCs != None:
                        if __verbose__:
                            print >> sys.stderr, '# {} : {} :: {} : {}'.format(
                                Bs, As, Ds, CCs)
                        # The memo now refers to CCs; the As candidates were
                        # already materialised by sorted(...) above.
                        init_memo_fast_distance(CCs)
                        for Cs in sorted(self.keys(),
                                         key=lambda Xs: memo_fast_distance(Xs)
                                         )[:first_N]:
                            print Cs
                            if __verbose__:
                                print >> sys.stderr, '# {} : {} :: {} : {}'.format(
                                    Bs, As, Ds, Cs)
                            yield (As, Bs, Cs)
Пример #5
0
	def cluster_to_matrix(self):
		"""
		Build a square 0/1 matrix describing how consistent the ratios of the
		cluster are with one another.
		For two ratios A_i : B_i and A_j : B_j (i < j),
			cells (i, j) and (j, i) receive 0 when d(A_i, A_j) == d(B_i, B_j),
			and keep their initial 1 otherwise.
		The diagonal is 0.
		Returns a SquareMatrix built from the matrix, with one label per ratio
		(the two members joined by NlgSymbols.ratio).
		"""
		size = len(self)
		labels = [ (NlgSymbols.ratio).join(ratio) for ratio in self ]
		matrix = [ [1] * size for _ in xrange(size) ]
		for i, [A, B] in enumerate(self):
			matrix[i][i] = 0
			following = self[i+1:]
			# Distances from A to the first member of every later ratio...
			init_memo_fast_distance(A)
			first_dists = [ memo_fast_distance(C) for C, _ in following ]
			# ...then, with the memo switched to B, distances to the second members.
			init_memo_fast_distance(B)
			for j, ([C, D], dAC) in enumerate(zip(following, first_dists), i + 1):
				dBD = memo_fast_distance(D)
				consistent = (dAC == dBD)
				if __trace__: print >> sys.stderr, '# %s : %s :: %s : %s, d(%s, %s) = %d %s %d = d(%s, %s)' % \
						(A, B, C, D, A, C, dAC, '==' if consistent else '=/=', dBD, B, D)
				if consistent:
					# The relation is symmetric: fill both halves of the matrix.
					matrix[i][j] = matrix[j][i] = 0
		return SquareMatrix(matrix, labels=labels, visualization=__visualization__)
Пример #6
0
def correct(bicorpus, sentence=False, file=sys.stdin):
    """
    Print one corrected sentence for each input sentence.

    bicorpus: object whose iter(string, strategy=..., method=...) yields
        cases; item 0 and item 1 of each case are used.
    sentence: when not False, the only sentence processed (file is ignored).
    file: iterable of lines, read when no sentence is given.

    For each input line Bs (trailing newlines removed):
      * if a case is at distance 0 from Bs, item 1 of that case is printed
        and the program exits with status 0;
      * otherwise the cases whose single correction is not at distance 0
        from Bs are indexed on k = 3 levels, the best one is picked by
        choice_rememoration_index, and its single correction is printed.
    Nothing is printed for a line when no case was indexed.
    """
    explicit = sentence != False
    if explicit:
        file = [sentence]

    for Bs in file:
        Bs = Bs.rstrip('\n')
        if __verbose__:
            print >> sys.stderr, '\n# Translating sentence: {}'.format(Bs)
        # The explicit sentence is handed over as given; a line read from
        # `file` is handed over without its trailing newline.
        string = sentence if explicit else Bs
        # index level
        k = 3
        indice = 0
        indexation = {}
        couple = {}
        cases = bicorpus.iter(string, strategy='by distance', method='direct')
        for case in cases:
            init_memo_fast_distance(Bs)
            # Case where the sentence is already in the case base
            if memo_fast_distance(case[0]) == 0:
                print('{}'.format(case[1]))
                sys.exit(0)

            a_s, b_s, c_s, e_s, _, pos_em = single_correction(
                case[0], Bs, case[1])
            if memo_fast_distance(a_s + b_s + c_s) == 0:
                continue

            # Level 0 holds the substring to replace; levels 1..k-1 are
            # obtained by repeatedly applying rememoration_index.
            phrase = e_s
            indexation[indice, 0] = phrase
            couple[indice, 0] = case[0]
            couple[indice, 1] = case[1]
            for level in range(1, k):
                phrase = rememoration_index(case[0], phrase, pos_em)
                indexation[indice, level] = phrase
            indice += 1

        if indice > 0:
            result = choice_rememoration_index(Bs, indice, couple, indexation,
                                               k)
            a_s, b_s, c_s, e_s, _, pos_em = single_correction(
                result[0], Bs, result[1])
            print('{}'.format(a_s + b_s + c_s))
Пример #7
0
    def normalize(self):
        """
        Put the cluster in normal form; does nothing if already normalized.

        When, in the first ratio, the second member is shorter than the
        first one, the two members of every ratio are exchanged.
        When the cluster holds exactly two ratios A : B and C : D, B and C
        are exchanged if memo_fast_similitude(B) is smaller than
        memo_fast_similitude(C), both taken after init_memo_fast_distance(A).
        Sets is_normalized to True when done.
        """
        if self.is_normalized:
            return

        head = self[0]
        if len(head[1]) < len(head[0]):
            self[:] = [[right, left] for left, right in
                       ((ratio[0], ratio[1]) for ratio in self)]

        if len(self) == 2:
            # Read the members again: they may just have been exchanged.
            A = self[0][0]
            B = self[0][1]
            C = self[1][0]
            init_memo_fast_distance(A)
            sim_B = memo_fast_similitude(B)
            sim_C = memo_fast_similitude(C)
            if sim_B < sim_C:
                self[0][1] = C
                self[1][0] = B

        self.is_normalized = True
Пример #8
0
	def gamma_hypothesis(self):
		"""
		Generator yielding this cluster when it passes the gamma check.

		Checking the gamma hypothesis: \\gamma(A,B,C,D) == \\gamma(B,A,D,C) == \\gamma(C,D,A,B) == \\gamma(D,C,B,A)
		Y. Lepage, De l'analogie rendant compte..., habilitation thesis, 2003, p. 145--147.
		Seems to be always verified by clusters output by this program.

		A cluster of exactly two ratios A : B and C : D is yielded only if the
		four gamma values computed below are equal; a cluster of any other
		size is yielded without being checked.
		"""
		if __verbose__: print >> sys.stderr, '# Checking gamma constraint...'
		# The body used to read an undefined name `cluster`; the cluster being
		# checked is the object the method is called on.
		if len(self) != 2:
			yield self
			return
		A, B, C, D = self[0][0], self[0][1], self[1][0], self[1][1]
		lenA, lenB, lenC, lenD = len(A), len(B), len(C), len(D)
		# Values relative to A...
		init_memo_fast_distance(A)
		dAB, dAC = memo_fast_distance(B), memo_fast_distance(C)
		sAB, sAC = lenA + lenB - 2 * dAB, lenA + lenC - 2 * dAC
		# ...and relative to D (the memo is re-initialised).
		init_memo_fast_distance(D)
		dDB, dDC = memo_fast_distance(B), memo_fast_distance(C)
		sDB, sDC = lenD + lenB - 2 * dDB, lenD + lenC - 2 * dDC
		gammaA, gammaB, gammaC, gammaD = sAB + sAC - lenA, sAB + sDB - lenB, sAC + sDC - lenC, sDB + sDC - lenD
		if gammaA == gammaB == gammaC == gammaD:
			yield self
Пример #9
0
def choice_rememoration_index(Bs, indice, couple, indexation, k):
    """
    Choose the case to which the correction will be applied.

    input:
        Bs = sentence to correct
        indice = number of cases to compare (numbered 0..indice-1)
        couple = couple[n, 0] is the source problem of case n and
                 couple[n, 1] its solution
        indexation = indexation[n, level] is the string obtained with
                 rememoration_index for case n at that level
        k = index level (number of levels examined)
    output: the chosen couple, as the list [source problem, solution];
            case 0 when indice is 0
    """
    chosen = [couple[0, 0], couple[0, 1]]
    best = 0
    for candidate in range(indice):
        for level in range(k):
            d_best = dist_inclusion(indexation[best, level], Bs)
            d_cand = dist_inclusion(indexation[candidate, level], Bs)
            # NOTE(review): the tie-break level is the literal 2, not k - 1;
            # the two only coincide for k == 3 -- confirm this is intended.
            if level == 2 and d_best == d_cand:
                # Tie at level 2: compare the source problems with Bs.
                init_memo_fast_distance(Bs)
                src_best = memo_fast_distance(couple[best, 0])
                src_cand = memo_fast_distance(couple[candidate, 0])
                if src_cand < src_best:
                    chosen = [couple[candidate, 0], couple[candidate, 1]]
                    best = candidate
                continue
            if d_cand == 0 or d_best == 0:
                # A zero on either side does not decide anything;
                # move on to the next level.
                continue
            if d_cand < d_best:
                chosen = [couple[candidate, 0], couple[candidate, 1]]
                best = candidate
                break
            if d_best < d_cand:
                # The case chosen so far wins at this level.
                break
    return chosen
Пример #10
0
def init_memo_fast_distance(word1):
    """
    Forward word1 to _fast_distance.init_memo_fast_distance and return
    whatever that call returns.
    """
    backend_init = _fast_distance.init_memo_fast_distance
    return backend_init(word1)
Пример #11
0
def translate(bicorpus, sentence=False, file=sys.stdin):
	"""
	Print a candidate translation for each input sentence.

	input:
		bicorpus = object whose iter(string, strategy=..., method=...) yields
			pairs (As, At) where At is the translation of As.
		sentence = when not False, the only sentence processed (file is ignored).
		file = iterable of lines, read when no sentence is given.
	output (on stdout, nothing is returned), for each sentence Bs:
		* if a pair is at distance 0 from Bs: 'Bs<TAB>At', then the program
		  exits with status 0;
		* otherwise the source sentence of every pair examined, followed by
		  the correction obtained from the pair that was selected (if any).
	"""
	if sentence != False:
		file = [sentence]
	for Bs in file:
		Bs = Bs.rstrip('\n')
		if __verbose__: print >> sys.stderr, '\n# Translating sentence: {}'.format(Bs)
		string = sentence
		if sentence == False: string = Bs
		# Number of index levels stored per case.
		# NOTE(review): the selection below only reads levels 0..2.
		k = 6
		indice = 0
		indexation = {}
		couple = {}
		for As in bicorpus.iter(string, strategy='by distance', method='direct'):
			init_memo_fast_distance(Bs)
			# Case where the sentence is already in the case base
			if memo_fast_distance(As[0]) == 0:
				# The format string used to be printed verbatim ('{}\t{}'
				# followed by the values) because .format() was never called.
				print('{}\t{}'.format(Bs, As[1]))
				sys.exit(0)
			a_s, b_s, c_s, e_s, pos, pos_em = single_correction(As[0], Bs, As[1])
			Bt = a_s+b_s+c_s
			dist_cible = memo_fast_distance(Bt)
			# NOTE(review): looks like leftover debugging output; kept because
			# it is part of what the program writes to stdout.
			print(As[0])
			if dist_cible != 0:
				# Level 0 is the substring returned by single_correction;
				# the following levels come from rememoration_index.
				phrase = e_s
				indexation[indice,0] = phrase
				couple[indice,0] = As[0]
				couple[indice,1] = As[1]
				for i in range(1,k):
					phrase = rememoration_index(As[0], phrase, pos_em)
					indexation[indice,i] = phrase
				indice += 1
		if indice > 0:
			# Select the case whose index strings compare best against Bs.
			result = [couple[0,0], couple[0,1]]
			index = 0
			for i in range(indice):
				for j in range(3):
					d_incA = dist_inclusion(indexation[index,j], Bs)
					d_incB = dist_inclusion(indexation[i,j], Bs)
					if j == 2 and d_incA == d_incB:
						# Tie on the last level examined: compare the source sentences with Bs.
						init_memo_fast_distance(Bs)
						dist_srcA = memo_fast_distance(couple[index,0])
						dist_srcB = memo_fast_distance(couple[i,0])
						if dist_srcB < dist_srcA:
							result = [couple[i,0], couple[i,1]]
							index = i
					elif d_incB != 0 and d_incA != 0:
						if d_incB < d_incA:
							result = [couple[i,0], couple[i,1]]
							index = i
							break
						if d_incA < d_incB:
							break
			a_s, b_s, c_s, e_s, pos, pos_em = single_correction(result[0], Bs, result[1])
			Bt = a_s+b_s+c_s
			print(Bt)