示例#1
0
文件: patcher.py 项目: goavki/patcher
	def __init__(self, apertium, first_source_sentence, second_source_sentence, target_sentence,
		caching=False, cache_db_file=':memory:'):
		"""Store the sentences to work on and optionally set up a translation cache.

		All three sentences are lower-cased before being stored.

		Args:
			apertium: translator object; its ``s_lang`` and ``t_lang``
				attributes are read when caching is enabled.
			first_source_sentence: source sentence, stored as ``s_sentence``.
			second_source_sentence: second source sentence, stored as
				``s1_sentence``.
			target_sentence: target sentence, stored as ``t_sentence``.
			caching: when true, a ``Cacher`` is created and kept in
				``self.cacher``.
			cache_db_file: passed to ``Cacher`` as its third argument.
		"""
		self.apertium = apertium
		self.s_sentence = first_source_sentence.lower()
		self.s1_sentence = second_source_sentence.lower()
		self.t_sentence = target_sentence.lower()
		self.caching = caching
		# Always define the attribute so the instance has the same shape
		# whether or not caching is enabled.
		self.cacher = None
		if caching:
			self.cacher = Cacher(apertium.s_lang, apertium.t_lang, cache_db_file)
示例#2
0
文件: patcher.py 项目: goavki/patcher
class Patcher(object):
	"""Patch a target sentence using the differences between two source sentences.

	The instance holds a source sentence (``s_sentence``), a second source
	sentence (``s1_sentence``) and a target sentence (``t_sentence``), all
	lower-cased. ``patch()`` pairs up phrases of the two source sentences,
	translates them through ``apertium`` and, wherever the translation of a
	phrase of the first source sentence is located in the target sentence,
	replaces it with the translation of the paired phrase of the second
	source sentence.

	A patch is a 7-tuple::

		(text, features, covered_positions, src_spans, src1_spans,
		 covers_all_mismatches, traces)
	"""

	def __init__(self, apertium, first_source_sentence, second_source_sentence, target_sentence,
		caching=False, cache_db_file=':memory:'):
		"""Store the sentences and optionally set up a translation cache.

		Args:
			apertium: translator object; ``s_lang``/``t_lang`` are read when
				caching is enabled and ``translate(text, dir)`` is called
				by ``_do_translations``.
			first_source_sentence: source sentence (stored lower-cased).
			second_source_sentence: second source sentence (stored lower-cased).
			target_sentence: target sentence (stored lower-cased).
			caching: when true, translations are looked up in and written
				to a ``Cacher``.
			cache_db_file: passed to ``Cacher`` as its third argument.
		"""
		self.apertium = apertium
		self.s_sentence = first_source_sentence.lower()
		self.s1_sentence = second_source_sentence.lower()
		self.t_sentence = target_sentence.lower()
		self.caching = caching
		# Always define the attribute so the instance has the same shape
		# whether or not caching is enabled.
		self.cacher = None
		if caching:
			self.cacher = Cacher(apertium.s_lang, apertium.t_lang, cache_db_file)
		# Results of patch(); defined up front so get_best_patch() is safe to
		# call before patch() has run.
		self._s_set = []
		self._best_patch = None

	def _do_edit_distace_alignment(self, min_len, max_len):
		"""Extract phrase pairs and mismatch positions for the two source sentences.

		Sets ``self.phrases`` (from ``extract_pairs()``) and
		``self.src_mismatches`` / ``self.tgt_mismatches`` (from
		``find_non_alignments()``).
		"""
		# Do edit distance alignment
		phrase_extractor = PhraseExtractor(self.s_sentence, self.s1_sentence, min_len, max_len)
		self.phrases = phrase_extractor.extract_pairs()
		self.src_mismatches, self.tgt_mismatches = phrase_extractor.find_non_alignments()

	def _check_for_all_mismatches(self, cs, cs1):
		"""Return True when every mismatch position lies inside some span.

		Args:
			cs: inclusive ``(start, end)`` spans checked against
				``self.src_mismatches``.
			cs1: inclusive ``(start, end)`` spans checked against
				``self.tgt_mismatches``.
		"""
		def is_covered(pos, spans):
			return any(lo <= pos <= hi for (lo, hi) in spans)

		return (all(is_covered(sm, cs) for sm in self.src_mismatches)
			and all(is_covered(tm, cs1) for tm in self.tgt_mismatches))

	def _update_coverings(self, sigma, sigma1, cs, cs1):
		"""Append ``sigma`` to ``cs`` and ``sigma1`` to ``cs1`` and return both lists.

		The lists are modified in place; callers pass copies when the
		originals must stay untouched.
		"""
		cs.append(sigma)
		cs1.append(sigma1)
		return cs, cs1

	def _cache_translation(self, source, translation):
		"""Store one translation in the cache; a failure is logged, not raised."""
		import logging
		try:
			self.cacher.insert(source, translation)
		except Exception:
			# Caching is only an optimisation, so a failed insert must not
			# abort patching; leave a trace instead of hiding it entirely.
			logging.getLogger(__name__).debug(
				"Could not cache translation for %r", source, exc_info=True)

	def _do_translations(self, dir=None):
		"""Translate every extracted phrase pair and index the results.

		Fills three dictionaries:

		* ``self.mismatches_map``: ``(a, b)`` -> list of paired ``(c, d)`` spans,
		* ``self.src_trans_map``: ``(a, b)`` -> translation of ``S[a..b]``,
		* ``self.src_trans_map1``: ``(c, d)`` -> translation of ``S1[c..d]``.

		With caching enabled, the translator is skipped when every phrase is
		found in the cache; otherwise all phrases are translated in a single
		``apertium.translate`` call and, if caching is on, written back to
		the cache.

		Args:
			dir: forwarded unchanged as the second argument of
				``apertium.translate``.
		"""
		S = self.s_sentence.split()
		S1 = self.s1_sentence.split()

		self.mismatches_map = {}
		self.src_trans_map = {}
		self.src_trans_map1 = {}

		for a, b, c, d in self.phrases:
			self.mismatches_map.setdefault((a, b), []).append((c, d))

		# Surface text of both sides of each phrase pair, in phrase order.
		segments = [(' '.join(S[a: b+1]), ' '.join(S1[c: d+1]))
			for a, b, c, d in self.phrases]

		if self.caching:
			cached = []
			for str1, str2 in segments:
				tgt1 = self.cacher.retrieve(str1)
				tgt2 = self.cacher.retrieve(str2)
				if not (tgt1 and tgt2):
					# One miss is enough: everything is re-translated below.
					break
				cached.append((tgt1[0], tgt2[0]))
			else:
				# Every phrase was cached; no translator call is needed.
				for (a, b, c, d), (t, t1) in zip(self.phrases, cached):
					self.src_trans_map[(a, b)] = t
					self.src_trans_map1[(c, d)] = t1
				return

		# Each phrase is terminated by '.|'; the two halves are joined by
		# '.||.' so that a single translator call handles both sentences.
		src = ''.join(str1 + '.|' for str1, _ in segments)
		src1 = ''.join(str2 + '.|' for _, str2 in segments)
		src_combined = src+'.||.'+src1

		# Get translations for segments.
		(out, err) = self.apertium.translate(src_combined, dir)
		assertion(err == '', "Apertium error: "+err)
		(out, out1) = out.split('.||.')

		tgt_segments = out.split('.|')
		tgt1_segments = out1.split('.|')

		# The piece after the final '.|' terminator is not a phrase
		# translation, hence the [:-1].
		for (a, b, c, d), (str1, str2), t, t1 in zip(
				self.phrases, segments, tgt_segments[:-1], tgt1_segments[:-1]):
			self.src_trans_map[(a, b)] = t
			self.src_trans_map1[(c, d)] = t1
			if self.caching:
				# Insert independently so one failure does not prevent the
				# other translation from being cached.
				self._cache_translation(str1, t)
				self._cache_translation(str2, t1)

	def _do_patching(self, t_app, tau, tau1, covered_pos, grounded_only):
		"""Replace words ``tau`` of ``t_app`` with ``tau1``.

		Args:
			t_app: the text to patch (whitespace-separated words).
			tau: inclusive ``(start, end)`` word span of ``t_app`` to replace.
			tau1: replacement text.
			covered_pos: word positions already touched by earlier patches;
				this list is extended and returned, so pass a copy.
			grounded_only: when true, the patch is only accepted if the
				alignments between the replaced words and ``tau1`` start at
				the first word and end at the last word on both sides, and
				span more than one position on each side.

		Returns:
			``(patched_text, covered_positions)``, or ``(None, None)`` when
			the span overlaps ``covered_pos`` or the grounding check fails.
		"""
		(a, b) = tau
		t_app = t_app.split()

		if any(a <= c <= b for c in covered_pos):
			return None, None

		seg = ' '.join(t_app[a:b+1])
		seg_left = ' '.join(t_app[:a])
		seg_right = ' '.join(t_app[b+1:])
		new_words = tau1.split()

		# Alignment between the words being replaced and the replacement.
		# NOTE(review): computed once and reused for both the grounding check
		# and the covered positions; assumes find_alignments() is
		# deterministic for identical inputs - confirm against PhraseExtractor.
		aligns = PhraseExtractor(seg.lower(), tau1.lower()).find_alignments()

		if grounded_only:
			if not aligns:
				return None, None
			p = min(x[0] for x in aligns)
			q = max(x[0] for x in aligns)
			r = min(x[1] for x in aligns)
			s = max(x[1] for x in aligns)
			if p == q or r == s or p != 0 or q != (b-a) or r != 0 or s != len(new_words)-1:
				return None, None

		# Replacement words with no alignment are recorded as covered,
		# offset by the start of the patched span.
		tg_aligns = set(x for (_, x) in aligns)
		cp = [a+i for i in range(len(new_words)) if i not in tg_aligns]
		cp += covered_pos

		# The replacement is lower-cased unless it starts the text.
		if seg_left != '':
			tau1 = tau1.lower()
		return (seg_left + ' ' + tau1 + ' ' + seg_right).strip(), cp

	def _covers_mismatch(self, sigma):
		"""Return True when ``sigma`` is a key of ``self.mismatches_map``."""
		return sigma in self.mismatches_map

	def get_best_patch(self, cam=False):
		"""Returns the best possible patch based upon the overlap.

		Args:
			cam: when false, return the best patch found by ``patch()``.
				When true, return the best patch among those flagged as
				covering all mismatches (or ``None`` if there is none).

		The stored patch list is not modified by this call.
		"""
		if not cam:
			return self._best_patch
		# patch() removes the overall best patch from the stored list, so it
		# is added back as a candidate here - on a copy, because the stored
		# list is the very list patch() handed to its caller.
		candidates = list(self._s_set)
		if self._best_patch:
			candidates.append(self._best_patch)
		return self._find_best_patch(cam, candidates)

	def _find_best_patch(self, cam=False, candidates=None):
		"""Return the patch whose spans have the largest total length.

		The score of a patch is the sum of ``end - start`` over its two span
		lists. On ties the earliest candidate wins.

		Args:
			cam: when true, only patches flagged as covering all mismatches
				are considered.
			candidates: patches to choose from; defaults to ``self._s_set``.

		Returns:
			The best patch, or ``None`` when no candidate qualifies.
		"""
		if candidates is None:
			candidates = self._s_set
		max_sum_of_sigmas = -1
		best_patch = None
		for patch in candidates:
			(_, _, _, sc, sc1, cm, _) = patch
			if cam and not cm:
				continue
			sum_of_sigmas = sum(b - a for (a, b) in sc)
			sum_of_sigmas += sum(b - a for (a, b) in sc1)
			if sum_of_sigmas > max_sum_of_sigmas:
				best_patch = patch
				max_sum_of_sigmas = sum_of_sigmas
		return best_patch

	def patch(self, min_len=2, max_len=5, grounded_only=False, dir=None):
		"""Does the actual patching.

		Args:
			min_len: forwarded to ``PhraseExtractor``; also the minimum
				number of words in a source span considered here.
			max_len: forwarded to ``PhraseExtractor``; also the maximum
				number of words in a source span considered here.
			grounded_only: forwarded to ``_do_patching``; when true the
				unpatched sentence is also dropped from the result.
			dir: forwarded to ``apertium.translate``.

		Returns:
			The list of patches without the best one, which is available
			through ``get_best_patch()``. The same list object is kept
			internally.
		"""
		self._do_edit_distace_alignment(min_len, max_len)
		self._do_translations(dir)

		S = self.s_sentence.split()
		S1 = self.s1_sentence.split()
		TS = self.t_sentence.split()
		# Seed with the unpatched target; the empty lists hold covered
		# positions, source spans, second-source spans and traces.
		s_set = [(self.t_sentence, "unpatched", [], [], [], False, [])]

		# p is the exclusive end of the source span; spans are (j, p-1).
		for p in range(len(S) + 1):
			for j in range(max(0, p-max_len), p-min_len+1):
				sigma = (j, p-1)
				if not self._covers_mismatch(sigma):
					continue
				y = self.src_trans_map[sigma]
				T = get_subsegment_locs(y, self.t_sentence)
				if not T:
					# The translation of this span was not located in the target.
					continue

				src_phrase = ' '.join(S[sigma[0]:sigma[1]+1]).strip().lower()
				for sigma1 in self.mismatches_map[sigma]:
					tau1 = self.src_trans_map1[sigma1]
					src1_phrase = ' '.join(S1[sigma1[0]:sigma1[1]+1]).strip().lower()
					for tau in T:
						tgt_phrase = ' '.join(TS[tau[0]:tau[1]+1]).strip().lower()
						s_set_temp = []
						for (t1, _, covered, cs, cs1, c_all, traces) in s_set:
							if c_all:
								# Already covers every mismatch; do not extend it.
								continue
							t1_new, covered_new = self._do_patching(
								t1, tau, tau1, covered[:], grounded_only)
							if t1_new is None:
								continue
							features = get_features(p, sigma, self.src_mismatches, t1_new, t1, tau)
							new_cs, new_cs1 = self._update_coverings(sigma, sigma1, cs[:], cs1[:])
							cam = self._check_for_all_mismatches(new_cs, new_cs1)
							new_traces = traces + [(src_phrase, src1_phrase, tgt_phrase)]
							s_set_temp.append(
								(t1_new, features, covered_new, new_cs, new_cs1, cam, new_traces))
						# Added after the scan so that the list is not
						# extended while it is being iterated.
						s_set += s_set_temp

		if grounded_only:
			# Drop the unpatched seed entry.
			s_set.pop(0)
		self._s_set = s_set
		self._best_patch = self._find_best_patch()
		if self._best_patch:
			s_set.remove(self._best_patch)
		return s_set