class MisspellingCorrector:
    """Correct misspelled words by re-ranking dictionary candidates.

    Candidates are the dictionary entries with the smallest Levenshtein
    distance to the input word.  Each ``(word, candidate)`` pair is turned
    into features by a ``StringPairFeatureExtractor`` and scored by a
    ``Hacrf`` model; the best-scoring candidate wins.

    The model is either trained from ``infile`` and pickled to
    ``model_file`` (``needTraining=True``) or loaded from ``model_file``.

    Attributes set by the class:
        dictionary: sorted list of candidate words loaded from ``dict_file``.
        fe: the feature extractor used for training and prediction.
        m: the trained (or unpickled) model.
        ppairs_train / ppairs_test: ``(incorrect, correct)`` tuples; both
            stay empty when the model is loaded instead of trained.
    """

    def __init__(self, infile, dict_file, needTraining=False,
                 model_file='Corrector.pkl'):
        """Load the dictionary and obtain a model.

        Args:
            infile: path of the training file; only read when
                ``needTraining`` is true.
            dict_file: path of a pickled iterable of dictionary words.
            needTraining: train a new model when true, otherwise load the
                pickled model from ``model_file``.
            model_file: path the model is pickled to / loaded from.
        """
        print("**************************************")
        self.needTraining = needTraining
        self.model_file = model_file
        # SECURITY: unpickling can execute arbitrary code -- only point this
        # at dictionary/model files that come from a trusted source.
        with open(dict_file, 'rb') as handle:
            self.dictionary = sorted(cPickle.load(handle))
        self.infile = infile
        # Always defined so that test() also works on a loaded model.
        self.ppairs_train = []
        self.ppairs_test = []
        self.train()

    def _read_pairs(self, lines):
        """Return ``(incorrect, correct)`` tuples parsed from ``lines``.

        The second tab-separated field of every line is expected to look
        like ``wrong | right1 | right2``; one pair is produced for each
        word following the first one.  Blank lines are skipped.
        """
        pairs = []
        for line in lines:
            if not line.strip():
                continue
            words = line.split('\t')[1].strip().split(' | ')
            pairs.extend((words[0], other) for other in words[1:])
        return pairs

    def _negative_pairs(self, positive_pairs):
        """Build mismatched pairs by shuffling the misspelled words.

        Shuffled pairs that happen to equal a genuine pair are dropped so
        that no true correction is labelled as a negative example.
        """
        incorrect = [pair[0] for pair in positive_pairs]
        correct = [pair[1] for pair in positive_pairs]
        # NOTE(review): assumes `shuffle` works in place (random.shuffle).
        shuffle(incorrect)
        genuine = set(positive_pairs)
        return [pair for pair in zip(incorrect, correct)
                if pair not in genuine]

    def _best_candidate(self, word, n_candidates):
        """Return the highest-scoring dictionary candidate for ``word``.

        Only the ``n_candidates`` dictionary entries closest to ``word`` by
        Levenshtein distance are scored.  Returns ``None`` when the
        dictionary yields no candidate at all.
        """
        candidates = heapq.nsmallest(
            n_candidates, self.dictionary,
            key=lambda entry: levenshtein.levenshtein(word, entry))
        if not candidates:
            return None
        pairs = [(word, candidate) for candidate in candidates]
        # One row of probabilities per pair.  Genuine corrections are
        # trained with label 0, so column 0 is presumably the probability
        # of being a genuine correction.
        probabilities = self.m.predict_proba(self.fe.transform(pairs))
        best = max(zip(probabilities, pairs), key=lambda scored: scored[0][0])
        return best[1][1]

    def train(self):
        """Train a model from ``self.infile`` or load it from disk.

        When training, 200 pairs are held out as ``self.ppairs_test``
        (``test_size=200``) and the fitted model is pickled to
        ``self.model_file``.
        """
        self.fe = StringPairFeatureExtractor(match=True, numeric=True,
                                             transition=True)
        if not self.needTraining:
            # NOTE: the messages say "training" although the model is only
            # loaded; they are kept unchanged to preserve existing output.
            print("start training")
            # SECURITY: see the note in __init__ about trusted pickles.
            with open(self.model_file, 'rb') as handle:
                self.m = cPickle.load(handle)
            print("finish training")
            return

        with open(self.infile, 'r') as handle:
            ppairs = self._read_pairs(handle)

        # Positive pairs for training and for testing.
        ppairs_train, ppairs_test = train_test_split(
            ppairs, test_size=200, random_state=1)
        self.ppairs_train = [tuple(pair) for pair in ppairs_train]
        self.ppairs_test = [tuple(pair) for pair in ppairs_test]

        # Negative pairs, then the raw training set with its labels:
        # 0 = genuine correction, 1 = mismatched pair.
        npairs_train = self._negative_pairs(self.ppairs_train)
        x_raw = self.ppairs_train + npairs_train
        self.y_train = [0] * len(self.ppairs_train) + [1] * len(npairs_train)

        self.x_train = self.fe.fit_transform(x_raw)
        self.m = Hacrf(l2_regularization=10.0, optimizer=fmin_l_bfgs_b,
                       optimizer_kwargs={'maxfun': 45}, state_machine=None)
        self.m.fit(self.x_train, self.y_train, verbosity=20)
        with open(self.model_file, 'wb') as handle:
            cPickle.dump(self.m, handle)

    def test(self):
        """Evaluate the model on ``self.ppairs_test``.

        Every miss is printed as ``(incorrect, correct) prediction``
        followed by a blank line; the accuracy is printed at the end.

        Returns:
            The fraction of test pairs corrected exactly, or ``0.0`` when
            there are no test pairs (e.g. the model was loaded from disk).
        """
        total = len(self.ppairs_test)
        hits = 0
        for incorrect, correct in self.ppairs_test:
            # Rank the 100 dictionary entries closest to the misspelling.
            guess = self._best_candidate(incorrect, 100)
            if guess == correct:
                hits += 1
            else:
                print("%r %s" % ((incorrect, correct), guess))
                print("")
        accuracy = hits / float(total) if total else 0.0
        print(accuracy)
        return accuracy

    def correct(self, incorrect, fallback='gopdebate', max_distance=2):
        """Return the best correction for ``incorrect``.

        Args:
            incorrect: the word to correct.
            fallback: value returned when no acceptable candidate exists.
            max_distance: largest Levenshtein distance allowed between
                ``incorrect`` and the chosen candidate.

        Returns:
            The best of the 10 closest dictionary entries, or ``fallback``
            when the dictionary is empty or the best candidate is more
            than ``max_distance`` edits away.
        """
        guess = self._best_candidate(incorrect, 10)
        if guess is None:
            return fallback
        if levenshtein.levenshtein(incorrect, guess) > max_distance:
            return fallback
        return guess
class MisspellingCorrection:
    """Train and use a misspelling corrector from a single pairs file.

    The file supplies both the training pairs and the dictionary of
    candidate words.  Candidates are the dictionary entries with the
    smallest Levenshtein distance to the input word; each
    ``(word, candidate)`` pair is turned into features by a
    ``StringPairFeatureExtractor`` and scored by a ``Hacrf`` model.

    Attributes set by the class:
        dictionary: sorted, de-duplicated list of the correct words found
            in the input file.
        ppairs_train / ppairs_test: ``(incorrect, correct)`` tuples.
        fe: the feature extractor used for training and prediction.
        x_train / y_train: training features and labels
            (0 = genuine correction, 1 = mismatched pair).
        m: the trained model.
    """

    def __init__(self, infile):
        """Read ``infile``, build the training data and train the model.

        Args:
            infile: path of a text file whose second tab-separated field
                on every line looks like ``wrong | right1 | right2``.
        """
        with open(infile, 'r') as handle:
            ppairs = self._read_pairs(handle)

        # Candidate words are the known corrections.  Duplicates are
        # removed so they do not use up slots among the closest candidates.
        self.dictionary = sorted(set(pair[1] for pair in ppairs))

        # Positive pairs for training and for testing (200 held out).
        ppairs_train, ppairs_test = train_test_split(
            ppairs, test_size=200, random_state=1)
        self.ppairs_train = [tuple(pair) for pair in ppairs_train]
        self.ppairs_test = [tuple(pair) for pair in ppairs_test]

        # Negative pairs, then the raw training set with its labels.
        npairs_train = self._negative_pairs(self.ppairs_train)
        x_raw = self.ppairs_train + npairs_train
        self.y_train = [0] * len(self.ppairs_train) + [1] * len(npairs_train)

        self.fe = StringPairFeatureExtractor(match=True, numeric=True,
                                             transition=True)
        self.x_train = self.fe.fit_transform(x_raw)
        self.train()

    def _read_pairs(self, lines):
        """Return ``(incorrect, correct)`` tuples parsed from ``lines``.

        One pair is produced for each word following the first one in the
        ``' | '``-separated second field.  Blank lines are skipped.
        """
        pairs = []
        for line in lines:
            if not line.strip():
                continue
            words = line.split('\t')[1].strip().split(' | ')
            pairs.extend((words[0], other) for other in words[1:])
        return pairs

    def _negative_pairs(self, positive_pairs):
        """Build mismatched pairs by shuffling the misspelled words.

        Shuffled pairs that happen to equal a genuine pair are dropped so
        that no true correction is labelled as a negative example.
        """
        incorrect = [pair[0] for pair in positive_pairs]
        correct = [pair[1] for pair in positive_pairs]
        # NOTE(review): assumes `shuffle` works in place (random.shuffle).
        shuffle(incorrect)
        genuine = set(positive_pairs)
        return [pair for pair in zip(incorrect, correct)
                if pair not in genuine]

    def _best_candidate(self, word, n_candidates=100):
        """Return the highest-scoring dictionary candidate for ``word``.

        Only the ``n_candidates`` dictionary entries closest to ``word`` by
        Levenshtein distance are scored.  Returns ``None`` when the
        dictionary yields no candidate at all.
        """
        candidates = heapq.nsmallest(
            n_candidates, self.dictionary,
            key=lambda entry: levenshtein.levenshtein(word, entry))
        if not candidates:
            return None
        pairs = [(word, candidate) for candidate in candidates]
        # One row of probabilities per pair.  Genuine corrections are
        # trained with label 0, so column 0 is presumably the probability
        # of being a genuine correction.
        probabilities = self.m.predict_proba(self.fe.transform(pairs))
        best = max(zip(probabilities, pairs), key=lambda scored: scored[0][0])
        return best[1][1]

    def train(self):
        """Fit a new ``Hacrf`` model on ``self.x_train`` / ``self.y_train``."""
        self.m = Hacrf(l2_regularization=10.0, optimizer=fmin_l_bfgs_b,
                       optimizer_kwargs={'maxfun': 45}, state_machine=None)
        self.m.fit(self.x_train, self.y_train, verbosity=20)

    def test(self):
        """Evaluate the model on ``self.ppairs_test``.

        Every miss is printed as ``(incorrect, correct) prediction``
        followed by a blank line; the accuracy is printed at the end.

        Returns:
            The fraction of test pairs corrected exactly, or ``0.0`` when
            there are no test pairs.
        """
        total = len(self.ppairs_test)
        hits = 0
        for incorrect, correct in self.ppairs_test:
            guess = self._best_candidate(incorrect)
            if guess == correct:
                hits += 1
            else:
                print("%r %s" % ((incorrect, correct), guess))
                print("")
        accuracy = hits / float(total) if total else 0.0
        print(accuracy)
        return accuracy

    def correct(self, incorrect):
        """Return the best correction for ``incorrect``.

        The 100 closest dictionary entries are scored and the best one is
        returned.  With an empty dictionary there is nothing to choose
        from, so the word is returned unchanged.
        """
        guess = self._best_candidate(incorrect)
        if guess is None:
            return incorrect
        return guess