def make_P_S(model, n_p, n_s, hard_stop=False, max_attempts=1e5, max_sample_length=np.inf):
    model = LanguageModel(model)
    P = set()
    S = set()
    attempts = 0
    while (len(P) < n_p or len(S) < n_s) and attempts < max_attempts:
        attempts += 1
        w = model.sample(cutoff=max_sample_length)
        if len(P) < n_p:  # to keep from making P bigger than expected when n_s>n_p
            for i in range(len(w) + 1):
                P.add(tuple(w[:i]))
                if hard_stop and len(P) >= n_p:
                    break
        if len(S) < n_s:  # same idea, when n_p>n_s
            for i in range(len(w), -1, -1):
                S.add(tuple(w[i:]))
                if hard_stop and len(S) >= n_s:
                    break
    P = sorted([list(w) for w in P], key=len)
    S = sorted([list(w) for w in S], key=len)
    if len(P) < n_p or len(S) < n_s:
        print("attempted", max_attempts, "samples, each of max length", max_sample_length,
              ", could not get enough P/S but cutting short")
    return P, S
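# A minimal usage sketch (not from the original sources): build prefix/suffix sets from
# some LanguageModel-compatible model `rnn` and inspect them. `rnn`, the set sizes, and
# the cutoff below are illustrative assumptions.
P, S = make_P_S(rnn, n_p=500, n_s=500, hard_stop=True, max_sample_length=50)
print(len(P), len(S))  # with hard_stop=True, at most 500 prefixes and 500 suffixes
print(P[0], S[0])      # both sorted lists start with the empty sequence []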
def __init__(self):
    CLEAN_TRAIN_AB_FILE = 'clean_corpus_train1ab.txt'
    INSULT_TRAIN_AB_FILE = 'insult_corpus_train1ab.txt'
    INSULT_TEST_FILE = 'insult_corpus_test.txt'
    CLEAN_TEST_FILE = 'clean_corpus_test.txt'
    self.punctuation = set([',', ';', '\'', '"', '.', '!', '?'])
    self.dictionary = enchant.Dict("en_US")
    #self.cleanTrainSents = LanguageModel(CLEAN_TRAIN_AB_FILE).getSents()
    #self.insultTrainSents = LanguageModel(INSULT_TRAIN_FILE).getSents()
    #self.cleanTestSents = LanguageModel(CLEAN_TEST_AB_FILE).getSents()
    #self.insultTestSents = LanguageModel(INSULT_TEST_FILE).getSents()
    self.cleanSplitSpaces = LanguageModel(CLEAN_TRAIN_AB_FILE).splitBySpaces()
    self.insultSplitSpaces = LanguageModel(INSULT_TRAIN_AB_FILE).splitBySpaces()
    self.cleanTestSplitSpaces = LanguageModel(CLEAN_TEST_FILE).splitBySpaces()
    self.insultTestSplitSpaces = LanguageModel(INSULT_TEST_FILE).splitBySpaces()
def load_language_model(self, fp):
    '''
    Takes a file pointer as input, initializes the SpellChecker object's
    language_model data member to a default LanguageModel, and then loads the
    stored language model (e.g. lm.pkl) from fp into that data member.
    '''
    self.language_model = LanguageModel()
    self.language_model.load(fp)
def main():
    precisions = []
    recalls = []
    for alpha in [0.3, 0.5, 0.7, 0.9, 0.95, 0.97, 0.99, 1.00, 1.01, 1.03,
                  1.05, 1.07, 1.1, 1.3, 1.5, 1.7, 2.0, 3.0, 5.0]:
        ALPHA = alpha
        cleanLM = LanguageModel(CLEAN_TRAIN_FILE)
        insultLM = LanguageModel(INSULT_TRAIN_FILE)
        cleanTestSents = LanguageModel(CLEAN_TEST_FILE).getSents()
        insultTestSents = LanguageModel(INSULT_TEST_FILE).getSents()
        NB = baselineNaiveBayes(cleanLM, insultLM)
        NB.train()
        #print NB.genProbs(cleanTestSents, insultTestSents)
        if (STUPID_BACKOFF):
            tp, tn, fp, fn = NB.testStupidBackoff(cleanTestSents, insultTestSents, ALPHA)
        else:
            tp, tn, fp, fn = NB.testImproved1(cleanTestSents, insultTestSents, ALPHA)
        interpretResults(tp, tn, fp, fn)
    print "Precisions:\n {}".format(precisions)
    print "Recalls:\n {}".format(recalls)
def get_ndcg_samples_and_target():
    lm = LanguageModel(rnn)
    prefs = []
    while len(prefs) < args.ndcg_num_samples:
        s = lm.sample(cutoff=args.ndcg_max_len)
        prefs += [s[:i] for i in range(len(s) + 1)]
    # remove extra prefixes possibly added by the last sequence, just to keep reporting simple
    prefs = prefs[:args.ndcg_num_samples]
    prefs = list(prefs)
    with open(rnn_folder + "/ndcg_samples.txt", "w") as f:
        print(len(prefs), len(lm.input_alphabet), file=f)
        for p in prefs:
            # this is fine for the spices and for the uhls, where the tokens are ints.
            # make sure to read it right too!
            print(len(p), " ".join([str(t) for t in p]), file=f)
    target_filename = rnn_folder + "/ndcg_target.txt"
    with open(target_filename, "w") as f:
        print(args.ndcg_k, file=f)  # store what ndcg_k is being made for
        for p in prefs:
            d = lm.distribution_from_sequence(p)
            chars = sorted(list(d.keys()), key=lambda x: d[x], reverse=True)
            # log2(i+2): ndcg wants i+1 where i is the token index, but enumerate starts from zero
            optimal = np.sum([d[c] / np.log2(i + 2) for i, c in enumerate(chars[:args.ndcg_k])])
            chars_weights = [v for pair in [(c, d[c]) for c in chars] for v in pair]
            # spice scoring expects "-1" for the end-of-sequence character
            chars_weights = [(v if not v == lm.end_token else -1) for v in chars_weights]
            print(optimal, " ".join([str(t) for t in chars_weights]), file=f)
    return prefs, target_filename
def main(): print("Generating models....") print("\tcleanLM...") cleanLM = LanguageModel(CLEAN_TRAIN_FILE) print("\tinsultLM...") insultLM = LanguageModel(INSULT_TRAIN_FILE) print("\tcleanTestLM...") cleanTestSents = LanguageModel(CLEAN_TEST_FILE).getSents() print("\tinsultTestLM...") insultTestSents = LanguageModel(INSULT_TEST_FILE).getSents() NB = baselineNaiveBayes(cleanLM, insultLM) print("Training NB Model....") NB.train() #print NB.genProbs(cleanTestSents, insultTestSents) print("Testing NB Model....") if (STUPID_BACKOFF): tp, tn, fp, fn = NB.testStupidBackoff(cleanTestSents, insultTestSents) else: tp, tn, fp, fn = NB.testImproved1(cleanTestSents, insultTestSents) interpretResults(tp, tn, fp, fn)
def ComputeJacardSim(doc_term_matrix):
    print "in get_jacard_sim"
    setOfWords = LanguageModel.getSetOfWords(doc_term_matrix)
    jacard_sim_dict = LanguageModel.get_jacard_sim(setOfWords)
    print "get tag sim okay,length is:" + str(len(jacard_sim_dict))
    return jacard_sim_dict
def test():
    trainAACleanLM = LanguageModel(CLEAN_TRAIN_AA_FILE)
    trainAAInsultLM = LanguageModel(INSULT_TRAIN_AA_FILE)
    trainABCleanLM = LanguageModel(CLEAN_TRAIN_AB_FILE)
    trainABInsultLM = LanguageModel(INSULT_TRAIN_AB_FILE)
    return make_training_vectors(trainAACleanLM, trainAAInsultLM,
                                 trainABCleanLM, trainABInsultLM)
def __init__(self):
    INSULT_TRAIN_FILE = 'insult_corpus_train.txt'
    CLEAN_TRAIN_FILE = 'clean_corpus_train.txt'
    INSULT_TEST_FILE = 'insult_corpus_test.txt'
    CLEAN_TEST_FILE = 'clean_corpus_test.txt'
    self.cleanSents = LanguageModel(CLEAN_TRAIN_FILE).splitBySpaces()
    self.insultSents = LanguageModel(INSULT_TRAIN_FILE).splitBySpaces()
    self.cleanTestSents = LanguageModel(CLEAN_TEST_FILE).splitBySpaces()
    self.insultTestSents = LanguageModel(INSULT_TEST_FILE).splitBySpaces()
def loop_files(train_file, test_files):
    lm = LanguageModel('corpus/google-10000-english.txt')
    lm.load_weights('model/keras_char_rnn.500.h5')
    found = 0
    for test_file in test_files:
        print('-------------------------')
        print('Starting for ', test_file)
        result = pipeline(train_file, test_file, lm)
        print(result)
        actual = test_file.split('_')[0]
        if actual in result:
            print('Found', actual)
            found += 1
    print('Inference rate: ', found / len(test_files))
def __init__(self, dataset, sampleEach=1):
    self.path = '../data/' + dataset + '/'
    self.chars = codecs.open(self.path + 'chars.txt', 'r', 'utf8').read()
    self.wordChars = codecs.open(self.path + 'wordChars.txt', 'r', 'utf8').read()
    self.lm = LanguageModel(
        codecs.open(self.path + 'corpus.txt', 'r', 'utf8').read(),
        self.chars, self.wordChars)
    self.mats = []
    self.gts = []
    self.fns = []
    i = 0
    while True:
        fnMat = self.path + 'mat_' + str(i) + '.csv'
        fnGT = self.path + 'gt_' + str(i) + '.txt'
        i += 1
        # file not found
        if (not os.path.isfile(fnMat)) or (not os.path.isfile(fnGT)):
            break
        # ignore this sample
        if (i - 1) % sampleEach != 0:
            continue
        # put into result
        self.mats.append(fnMat)
        self.gts.append(fnGT)
        self.fns.append(fnMat + '|' + fnGT)
    self.currIdx = 0
def __init__(self, dataset, sampleEach=1): self.path = "../data/" + dataset + "/" self.chars = codecs.open(self.path + "chars.txt", "r", "utf8").read() self.wordChars = codecs.open(self.path + "wordChars.txt", "r", "utf8").read() self.lm = LanguageModel( codecs.open(self.path + "corpus.txt", "r", "utf8").read(), self.chars, self.wordChars, ) self.mats = [] self.gts = [] self.fns = [] i = 0 while True: fnMat = self.path + "mat_" + str(i) + ".csv" fnGT = self.path + "gt_" + str(i) + ".txt" i += 1 # file not found if (not os.path.isfile(fnMat)) or (not os.path.isfile(fnGT)): break # ignore this sample if (i - 1) % sampleEach != 0: continue # put into result self.mats.append(fnMat) self.gts.append(fnGT) self.fns.append(fnMat + "|" + fnGT) self.currIdx = 0
def trainOnAllLM(self):
    # open clean text files for each book and join all lines
    text = ""
    books = ["AnitaBlake01GuiltyPleasures.clean.txt",
             "AnitaBlake02LaughingCorpse.really.clean.txt",
             "AnitaBlake03CircusOfTheDamned.really.clean.txt",
             "AnitaBlake04LunaticCafe.really.clean.txt",
             "AnitaBlake05BloodyBones.really.clean.txt",
             "AnitaBlake06TheKillingDance.really.clean.txt",
             "AnitaBlake07BurntOfferings.really.clean.txt",
             "AnitaBlake08BlueMoon.really.clean.txt",
             "AnitaBlake09ObsidianButterfly.really.clean.txt",
             "AnitaBlake10NarcissusInChains.really.clean.txt",
             "AnitaBlake11CeruleanSins.really.clean.txt",
             "AnitaBlake12IncubusDreams.really.clean.txt",
             "AnitaBlake16BloodNoir.really.clean.txt",
             "AnitaBlake17SkinTrade.really.clean.txt",
             "AnitaBlake18Flirt.really.clean.txt"]
    for book in books:
        text += ''.join(open(os.path.join(os.path.dirname(__file__), '..', 'data', book)).read())
    # sentencify text
    sentences = re.split(r' *[.?!][\'")\]]* *[(\["]*', text)
    # cut out the first proper sentences - dev and test
    sentences = sentences[17:]
    # wordify the sentences
    for i, sentence in enumerate(sentences):
        sentences[i] = re.findall(r"[\w']+|[.,!?;]", sentence)
    # train LM on corpus
    self.LM = LanguageModel(sentences)
def get_wer_samples():
    def all_prefs(test_set):
        res = set()
        for p in test_set:
            # make hashable; a bit wonky to work like this, but it keeps things
            # consistent with LanguageModel expectations
            p = tuple(p)
            res.update(p[:i] for i in range(len(p) + 1))
        return list(res)

    lm = LanguageModel(rnn)
    samples = [
        lm.sample(cutoff=args.wer_max_len) for _ in range(args.wer_num_samples)
    ]
    gold_dict = lm.next_token_preds(all_prefs(samples))
    return samples, gold_dict
def print_metrics(name, model, metric):
    lm = LanguageModel(model)
    if metric == "NDCG":
        temporary_model_preds_file = lm.make_spice_preds(ndcg_samples)
        ndcg = modified_score_rankings(temporary_model_preds_file, ndcg_target_filename)
        os.remove(temporary_model_preds_file)
        print(name, "got ndcg against rnn:", clean_val(ndcg, 5), file=f)
    if metric == "WER":
        wer = lm.WER(wer_samples, gold_dict=wer_gold)
        print(name, "got wer against rnn:", clean_val(wer, 5), file=f)
    if metric == "TIME":
        print(name, "took:", lapse_str(model.creation_info["extraction time"], 1), "s", file=f)
def get_pkg_desp_dict(self):
    print "in get_merge_model"
    weight_title = 1.0
    weight_tag = 1.0
    weight_desp = 1.0
    weight_unigram = (1.0 / 1.4) * weight_desp
    weight_bigram = (0.4 / 1.4) * weight_desp
    #weight_trigram = (0.16/1.56)*weight_desp
    #pkg_title_count_dict = {}
    #pkg_tag_count_dict = {}
    pkg_unigram_count_dict = {}
    pkg_bigram_count_dict = {}
    #pkg_trigram_count_dict = {}
    #for pkg,temp_list in self.pkg_title_dict.items():
    #    pkg_title_count_dict[pkg] = {k:weight_title for k in set(temp_list) if k.strip()}
    #for pkg,temp_list in self.pkg_tag_dict.items():
    #    pkg_tag_count_dict[pkg] = {k:weight_tag for k in set(temp_list) if k.strip()}
    for pkg, temp_dict in LanguageModel.getBagOfWords(self.pkg_unigram_dict).items():
        pkg_unigram_count_dict[pkg] = {
            k: weight_unigram * v for k, v in temp_dict.items() if k.strip()
        }
    for pkg, temp_dict in LanguageModel.getBagOfWords(self.pkg_bigram_dict).items():
        pkg_bigram_count_dict[pkg] = {
            k: weight_bigram * v for k, v in temp_dict.items() if k.strip()
        }
    #for pkg,temp_dict in LanguageModel.getBagOfWords(self.pkg_trigram_dict).items():
    #    pkg_trigram_count_dict[pkg] = {k:weight_trigram*v for k,v in temp_dict.items() if k.strip()}
    pkg_desp_dict = {}
    for pkg in self.pkg_title_dict:
        #single = Counter(pkg_title_count_dict[pkg]) + Counter(pkg_tag_count_dict[pkg]) + Counter(pkg_unigram_count_dict[pkg]) + Counter(pkg_bigram_count_dict[pkg]) #+ Counter(pkg_trigram_count_dict[pkg])
        single = Counter(pkg_unigram_count_dict[pkg]) + Counter(pkg_bigram_count_dict[pkg])  #+ Counter(pkg_trigram_count_dict[pkg])
        pkg_desp_dict[pkg] = single
    print "get desp okay,pkg_desp_dict" + str(len(pkg_desp_dict))
    return pkg_desp_dict
def do_ngram():
    print("~~~running ngram extraction~~~")
    print("making samples", end=" ... ")
    sample_start = process_time()
    samples = []
    length = 0
    lmrnn = LanguageModel(rnn)
    while length < args.ngram_total_sample_length:
        s = lmrnn.sample(cutoff=args.ngram_max_sample_length)
        samples.append(s)
        length += (len(s) + 1)  # ending the sequence is also a sample
    ngrams = {}
    ngrams_folder = rnn_folder + "/ngram"
    prepare_directory(ngrams_folder)
    sample_time = process_time() - sample_start
    print("done, that took:", clock_str(sample_start))
    print("making the actual ngrams", end=" ... ")
    with open(ngrams_folder + "/samples.txt", "w") as f:
        print(len(samples), len(rnn.internal_alphabet), file=f)
        for s in samples:
            print(len(s), *s, file=f)
    for n in args.ngram_ns:
        ngram_start = process_time()
        ngram = NGram(n, rnn.input_alphabet, samples)
        ngram.creation_info = {
            "extraction time": sample_time + process_time() - ngram_start,
            "size": len(ngram._state_probs_dist),
            "n": n,
            "total samples len (including EOS)": length,
            "num samples": len(samples),
            "samples cutoff len": args.ngram_max_sample_length
        }
        overwrite_file(ngram, ngrams_folder + "/" + str(n))
        ngrams[n] = ngram
    with open(ngrams_folder + "/creation_infos.txt", "w") as f:
        print("ngrams made from", len(samples), "samples, of total length", length,
              "(including EOSs)", file=f)
        for n in ngrams:
            print("===", n, "===\n", ngrams[n].creation_info, "\n\n", file=f)
    print("done, that took overall", clock_str(sample_start))
    return ngrams
def __init__(self, fr, en, model_file_fr, model_file_en, lex_file_fr, lex_file_en, lex_weight=1):
    """Initialises the language model.

    Args:
        fr/en: Foreign/English language code.
        model_file_fr/en (str): Foreign/English LanguageModel file name.
        lex_file_fr/en (str): Foreign/English Lexicon (1 word + frequency per line).
        lex_weight (float): Weight of the lexicon vs. the character model.
    """
    self.lex_weight = lex_weight
    self.model = {}
    self.model[self.FR] = LanguageModel.load(model_file_fr, lex_file_fr, lex_weight)
    self.model[self.EN] = LanguageModel.load(model_file_en, lex_file_en, lex_weight)
def weight_avg_f1(self):
    classes = self.classified_tweets_by_class.keys()
    tuple_result_list = LanguageModel.flatten_list(self.classified_tweets_by_class.values())
    sum_weighted_f1 = 0
    for chosen_class in classes:
        chosen_class_correct_count = len([
            result for result in tuple_result_list
            if result[ClassifyTupleResult.CorrectClass.value] == chosen_class
        ])
        sum_weighted_f1 = sum_weighted_f1 + self.per_class_f1[chosen_class] * chosen_class_correct_count
    return round(sum_weighted_f1 / len(tuple_result_list), 4)
def __init__(self, dataset, languages, max_length=50, languageModels=None,
             filter_token=2, device=None):
    self.dataset = dataset
    self.languages = languages
    self.filter_token = filter_token
    self.max_length = max_length
    self.loadFiles()
    if languageModels is not None:
        self.languageModels = languageModels
    else:
        self.languageModels = {
            self.languages[0]: LanguageModel(),
            self.languages[1]: LanguageModel()
        }
        self.prepareLanguageModels()
        self.filter_unk()
    self.device = device
def compute_field_jacardsim(self, doc_term_dict, filename="", offset=0):
    '''input dict is key:term_list '''
    print "in compute_field_jacardsim"
    if os.path.exists(filename):
        jacard_sim_dict = WriteTool.load_nested_dict(filename)
        return jacard_sim_dict
    weight_dict = {}
    if offset + s.log_min_dlcount > 0:
        weight_dict = {
            pkg: (offset + math.log(value)) / (offset + self.log_max_dlcount)
            for pkg, value in self.pkg_dlcount_dict.items()
        }
    field_setOfWords = LanguageModel.getSetOfWords(doc_term_dict)
    jacard_sim_dict = LanguageModel.get_jacard_sim(field_setOfWords, weight_dict)
    if filename:
        WriteTool.write_nested_dict(jacard_sim_dict, filename, self.pkg_titlename_dict)
    print str(len(jacard_sim_dict))
    return jacard_sim_dict
def write_for_xiaoxi(self):
    temp_dict = {}
    s.pkg_unigram_tfidf_dict = LanguageModel.getTfidf(s.pkg_unigram_dict)
    for pkg, tempSet in s.pkg_setOfWords.items():
        if pkg in s.pkg_unigram_tfidf_dict:
            single = {word.strip(): 1.0 for word in tempSet if word.strip()}
            single = WriteTool.merge_dict(single, s.pkg_unigram_tfidf_dict[pkg], "max")
            temp_dict[pkg] = dict(single)
    WriteTool.write_nested_dict(temp_dict, 'data/blend_word_list_max1')
def generate(self, hypothesis):
    # order doesn't matter but they should have the same letters and the same EOS
    assert set(hypothesis.internal_alphabet) == set(self.target.internal_alphabet) and \
        (hypothesis.end_token == self.target.end_token)
    self.hypothesis = LanguageModel(hypothesis)
    self.checked = set()
    for n in range(self.n_cex_attempts):
        print("sample number:", n, "of", self.n_cex_attempts, file=self.prints_path)
        model = self.hypothesis if n % 2 == 1 else self.target
        w = model.sample(cutoff=self.max_counterexample_length, empty_sequence=())
        pref = self._find_disagreeing_pref(w)
        if pref is not None:
            print("found cex on attempt", n, "of", self.n_cex_attempts, file=self.prints_path)
            print("found by sampling:", ("hypothesis" if n % 2 == 1 else "target"), file=self.prints_path)
            return pref
    print("no counterexamples found", file=self.prints_path)
    return None
def main(argv):
    sentencesFile = "../pa6/es-en/dev/newstest2012.es"
    foreignFile = None
    nativeFile = None
    loadFile = "../pa6/save.model"
    ngramFile = None
    try:
        opts, args = getopt.getopt(argv, "is:f:n:l:g:")
    except getopt.GetoptError:
        print 'Wrong argument. Use -i for improved version'
        sys.exit(2)
    isImproved = False
    for opt, value in opts:
        if opt == '-i':
            isImproved = True
        elif opt == '-s':
            sentencesFile = value
        elif opt == '-f':
            foreignFile = value
        elif opt == '-n':
            nativeFile = value
        elif opt == '-l':
            loadFile = value
        elif opt == '-g':
            ngramFile = value
    # print "improved!" if isImproved else "Not improved!"
    # print sentencesFile
    # print foreignFile
    # print nativeFile
    # print loadFile
    if foreignFile and nativeFile:
        model = ModelOne(foreignFile, nativeFile)
    else:
        model = ModelOne(loadFile=loadFile)
    langModel = LanguageModel()
    if sentencesFile:
        sentences = []
        with open(sentencesFile) as f:
            for line in f:
                sentences.append(line.lower().strip().split())
        translated = translateSentences(sentences, model, langModel)
def trainLM(self):
    # open clean text and join all lines
    text = ''.join(open(os.path.join(os.path.dirname(__file__), '..', 'data',
                                     'AnitaBlake01GuiltyPleasures.clean.txt')).read())
    # sentencify text
    sentences = re.split(r' *[.?!][\'")\]]* *[(\["]*', text)
    # cut out the first proper sentences - dev and test
    sentences = sentences[17:]
    # wordify the sentences
    for i, sentence in enumerate(sentences):
        sentences[i] = re.findall(r"[\w']+|[.,!?;]", sentence)
    # train LM on corpus
    self.LM = LanguageModel(sentences)
def spectral_reconstruct(model, P, S, k_list, ready_hankel_things=None, print_file=None):
    f = print_file if print_file is not None else sys.stdout
    # make sure they start with the empty sequence
    assert len(P[0]) == 0
    assert len(S[0]) == 0
    model = LanguageModel(model)
    print("making spectral with P,S sizes:", len(P), len(S), file=f, flush=True)
    if ready_hankel_things is None:
        stuff = make_hankel_stuff(model, P, S, f)
    else:
        stuff = ready_hankel_things
    results = []
    total_times = []
    done_max = False
    for k in sorted(k_list):
        if k >= stuff["rank"]:
            if done_max:
                print("skipping", k, "onwards", file=f)
                break
            # this allows using a k that is 'greater' than the rank, which is important when
            # the exact rank is missed (e.g. if the rank is 15 but only k=10,20,30 are checked,
            # it will still do 20 but then skip 30)
            print("maxed out at", k, "so using k=rank=", stuff["rank"], file=f)
            # don't make something higher than there actually is, i.e. a WFA that thinks it has
            # e.g. 5 states when it really has 2
            k = stuff["rank"]
            done_max = True
        start = process_time()
        results.append(their_algorithm(stuff, k))
        total_times.append(stuff["hankel_time"] + stuff["svd_time"] + process_time() - start)
    return results, total_times, stuff["hankel_time"], stuff["svd_time"], stuff
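# A hedged sketch of how make_P_S and spectral_reconstruct above could be chained
# (an assumed workflow, not taken from the sources); `rnn` and the k values are
# illustrative assumptions.
P, S = make_P_S(rnn, n_p=200, n_s=200)
wfas, total_times, hankel_time, svd_time, stuff = spectral_reconstruct(rnn, P, S, k_list=[5, 10, 20])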
def calc_recall(self):
    # Order of classes: eu, ca, gl, es, en, pt
    per_class = dict()
    classes = self.classified_tweets_by_class.keys()
    tuple_result_list = LanguageModel.flatten_list(self.classified_tweets_by_class.values())
    for chosen_class in classes:
        tp = len([
            result for result in tuple_result_list
            if chosen_class == result[ClassifyTupleResult.CalculatedClass.value]
            and chosen_class == result[ClassifyTupleResult.CorrectClass.value]
        ])
        fn = len([
            result for result in tuple_result_list
            if chosen_class != result[ClassifyTupleResult.CalculatedClass.value]
            and chosen_class == result[ClassifyTupleResult.CorrectClass.value]
        ])
        recall = round(tp / (tp + fn), 4)
        per_class[chosen_class] = recall
    self.per_class_recall = per_class
def train(self):
    print('\nTraining our model using file: \'{0}\''.format(self.training_file_name))
    # Create a language model for each language found in the training corpus.
    tweets_by_lang = self.group_tweets_by_lang()
    num_languages = len(tweets_by_lang)
    count = 1
    start_time = time.time()
    for (language, tweets) in tweets_by_lang.items():
        model = LanguageModel(language, tweets, self.ngram_type, self.vocabulary, self.smoothing_value)
        print('[{0}% completed]: {1} language model created. {2} tweets parsed.'.format(
            round((count / num_languages) * 100),
            Language.from_str(language).name, len(tweets)))
        count = count + 1
        self.language_models.append(model)
    end_time = time.time()
    print('Training completed. A total of {0} tweets parsed. [{1}s elapsed]'.format(
        len(self.training_tweets_dict), round(end_time - start_time)))
class SpellChecker: def __init__(self, max_distance, channel_model=None, language_model=None, threshold=15): self.nlp = nlp self.channel_model = channel_model self.language_model = language_model self.max_distance = max_distance self.unknown_words = dict() self.threshold = threshold def load_channel_model(self, fp): self.channel_model = EditDistance.EditDistanceFinder() self.channel_model.load(fp) def load_language_model(self, fp): self.language_model = LanguageModel() self.language_model.load(fp) def bigram_score(self, prev_word, focus_word, next_word): bigram_prob1 = self.language_model.bigram_prob(prev_word, focus_word) bigram_prob2 = self.language_model.bigram_prob(focus_word, next_word) return (bigram_prob1 + bigram_prob2) / 2 def unigram_score(self, word): return self.language_model.unigram_prob(word) def cm_score(self, error_word, corrected_word): return self.channel_model.prob(error_word, corrected_word) def inserts(self, word): one_insert_away = [] alphabet = string.ascii_lowercase for i in range(len(word) + 1): for letter in alphabet: new_word = word[0:i] + letter + word[i:] if new_word in self.language_model: one_insert_away.append(new_word) return one_insert_away def deletes(self, word): one_delete_away = [] for i in range(len(word)): new_word = word[0:i] + word[i + 1:] if new_word in self.language_model: one_delete_away.append(new_word) return one_delete_away def substitutes(self, word): one_sub_away = [] alphabet = string.ascii_lowercase for i in range(len(word)): for letter in alphabet: if letter != word[i]: new_word = word[0:i] + letter + word[i + 1:] if new_word in self.language_model: one_sub_away.append(new_word) return one_sub_away def generate_candidates_recurse(self, word_list, max_distance): if max_distance == 0: return word_list new_list = [] for i in word_list: insert_words = self.inserts(i) delete_words = self.deletes(i) sub_words = self.substitutes(i) new_list += insert_words new_list += delete_words new_list += sub_words set_list = list(set(new_list)) return self.generate_candidates_recurse(set_list, max_distance - 1) def generate_candidates(self, word): return self.generate_candidates_recurse([word], self.max_distance) def check_sentence(self, sentence, fallback=False): return_list = [] for i in sentence: if i in self.language_model: return_list.append([i]) continue if i in self.unknown_words: if self.unknown_words[i] > self.threshold: return_list.append([i]) continue self.unknown_words[i] += 1 elif i not in self.unknown_words: self.unknown_words[i] = 1 candidates = self.generate_candidates(i) if candidates == []: if fallback: return_list.append([i]) continue else: return_list.append([]) continue candidates = sorted( candidates, key=lambda x: self.unigram_score(x) + self.cm_score(i, x), reverse=True) return_list.append(candidates) return return_list def check_line(self, text, fallback=False): sentence_doc = nlp(text) sentences = sentence_doc.sents sentences = [ self.language_model.get_tokens(sentence) for sentence in sentences ] result = [] for sentence in sentences: checked_sentence = self.check_sentence(sentence) result += checked_sentence return result def autocorrect_sentence(self, sentence): possibilities = self.check_sentence(sentence, True) possibilities = [x[0] for x in possibilities] return possibilities def autocorrect_line(self, line): doc = nlp(line) sentences = doc.sents sentences = [ self.language_model.get_tokens(sentence) for sentence in sentences ] result = [] for sentence in sentences: checked_sentence = self.autocorrect_sentence(sentence) 
result += checked_sentence return result def suggest_sentence(self, sentence, max_suggestions): possibilities = self.check_sentence(sentence, True) return_list = [] for i in possibilities: if len(i) == 1: return_list.append(i[0]) else: return_list.append(i[:max_suggestions]) return return_list def suggest_line(self, text, max_suggestions): doc = nlp(text) sentences = doc.sents sentences = [ self.language_model.get_tokens(sentence) for sentence in sentences ] result = [] for sentence in sentences: checked_sentence = self.suggest_sentence(sentence, max_suggestions) result += checked_sentence return result
def load_language_model(self, fp):
    self.language_model = LanguageModel()
    self.language_model.load(fp)
def main(): print ("Generating language models....") trainAACleanLM = LanguageModel(CLEAN_TRAIN_AA_FILE) trainAAInsultLM = LanguageModel(INSULT_TRAIN_AA_FILE) trainABCleanLM = LanguageModel(CLEAN_TRAIN_AB_FILE) trainABInsultLM = LanguageModel(INSULT_TRAIN_AB_FILE) testCleanLM = LanguageModel(CLEAN_TEST_FILE) testInsultLM = LanguageModel(INSULT_TEST_FILE) trainLabels = np.array(([0] * trainABCleanLM.getDocCount()) + ([1] * trainABInsultLM.getDocCount())) testLabels = np.array(([0] * testCleanLM.getDocCount()) + ([1] * testInsultLM.getDocCount())) ### Just baseline probabilities print ("Running baseline....") NB = baselineNaiveBayes(trainAACleanLM, trainAAInsultLM) print ("\tTraining NB....") NB.train() print ("\tTesting NB....") totalNBMatrix = np.array(NB.genProbs(trainABCleanLM.getSents(), trainABInsultLM.getSents())) trainMatrix = totalNBMatrix testMatrix = np.array(NB.genProbs(testCleanLM.getSents(), testInsultLM.getSents())) # clf = svm.SVC(kernel='linear') # print ("\tTraining SVM....") # clf.fit(trainMatrix, trainLabels) # print ("\tTesting SVM....") # output1 = clf.predict(testMatrix).tolist() ## Baseline + PoS Features print ("Running baseline + PoS Features....") cleanPosMatrix = trainABCleanLM.getPosMatrix() insultPosMatrix = trainABInsultLM.getPosMatrix() testCleanPosMatrix = testCleanLM.getPosMatrix() testInsultPosMatrix = testInsultLM.getPosMatrix() posFeatures = np.array(cleanPosMatrix + insultPosMatrix) testPosFeatures = np.array(testCleanPosMatrix + testInsultPosMatrix) trainMatrix = np.hstack((trainMatrix, posFeatures)) testMatrix = np.hstack((testMatrix, testPosFeatures)) # clf = svm.SVC(kernel='linear') # print ("\tTraining SVM....") # clf.fit(trainMatrix, trainLabels) # print ("\tTesting SVM....") # output2 = clf.predict(testMatrix).tolist() ### Baseline + PoS Features + TF-IDF Features (TODO Arun) print("Running baseline + PoS Features + TF-IDF Features") # generate list of features with TFIDF, using trainABCleanLM and trainABInsultLM # trainMatrix = np.hstack((trainMatrix, the new thing you just generated)) # do same for testMatrix # clf = svm.SVC() # print ("\tTraining SVM....") # clf.fit(trainMatrix, trainLabels) # print ("\tTesting SVM....") # output3 = clf.predict(testMatrix).tolist() # then update the output_file.txt thing below tfidf_train_features = tfidf.make_feature_vectors(trainAACleanLM, trainAAInsultLM, trainABCleanLM, trainABInsultLM) tfidf_test_features = tfidf.make_feature_vectors(trainAACleanLM, trainAAInsultLM, testCleanLM, testInsultLM) print tfidf_test_features.shape, tfidf_train_features.shape print testMatrix.shape, trainMatrix.shape trainMatrix = np.hstack((trainMatrix, tfidf_train_features)) testMatrix = np.hstack((testMatrix, tfidf_test_features)) # clf = svm.SVC(kernel='linear') # print ("\tTraining SVM....") # clf.fit(trainMatrix, trainLabels) # print ("\tTesting SVM....") # output3 = clf.predict(testMatrix).tolist() ### SENTIMENT ### print("Running baseline + PoS Features + TF-IDF Features + Sentiment Features") s = Sentiment() clean_train = np.array(s.get_clean_train_vector()) insult_train = np.array(s.get_insult_train_vector()) sentiment_train_features = np.concatenate((clean_train, insult_train), axis=0) shape = sentiment_train_features.shape sentiment_train_features = sentiment_train_features.reshape((shape[0], 1)) print sentiment_train_features.shape clean_test = np.array(s.get_clean_test_vector()) insult_test = np.array(s.get_insult_test_vector()) sentiment_test_features = np.concatenate((clean_test, insult_test), axis=0) shape = 
sentiment_test_features.shape sentiment_test_features = sentiment_test_features.reshape((shape[0], 1)) print sentiment_test_features.shape trainMatrix = np.hstack((trainMatrix, sentiment_train_features)) testMatrix = np.hstack((testMatrix, sentiment_test_features)) # clf = svm.SVC(kernel='linear') # print ("\tTraining SVM....") # clf.fit(trainMatrix, trainLabels) # print ("\tTesting SVM....") # output4 = clf.predict(testMatrix).tolist() ### MISSPELLINGS ### print("Running baseline + PoS Features + TF-IDF Features + Sentiment Features + Misspellings features") m = Misspellings() clean_train = np.array(m.get_clean_misspellings(False)) insult_train = np.array(m.get_insult_misspellings(False)) misspellings_train_features = np.concatenate((clean_train, insult_train), axis=0) shape = misspellings_train_features.shape misspellings_train_features = misspellings_train_features.reshape((shape[0], 1)) print misspellings_train_features.shape clean_test = np.array(m.get_clean_misspellings()) insult_test = np.array(m.get_insult_misspellings()) misspellings_test_features = np.concatenate((clean_test, insult_test), axis=0) shape = misspellings_test_features.shape misspellings_test_features = misspellings_test_features.reshape((shape[0], 1)) print misspellings_test_features.shape trainMatrix = np.hstack((trainMatrix, sentiment_train_features)) testMatrix = np.hstack((testMatrix, sentiment_test_features)) clf = svm.SVC(kernel='linear') print ("\tTraining SVM....") clf.fit(trainMatrix, trainLabels) print ("\tTesting SVM....") output5 = clf.predict(testMatrix).tolist() index_shuf = range(len(trainMatrix)) trainMatrix_shuf = [] trainLabel_shuf = [] shuffle(index_shuf) for i in index_shuf: trainMatrix_shuf.append(trainMatrix[i]) trainLabel_shuf.append(trainLabels[i]) train_sizes, train_scores, valid_scores = learning_curve(svm.SVC(), trainMatrix_shuf, trainLabel_shuf, train_sizes=[100, 300, 500, 700, 900], cv=2) average_train_scores = [sum(i)/float(len(i)) for i in train_scores] average_valid_scores = [sum(i)/float(len(i)) for i in valid_scores] plt.plot(train_sizes, average_train_scores) plt.plot(train_sizes, average_valid_scores) plt.legend(['Training score', 'Cross-validation score'], loc='center left', bbox_to_anchor=(0.85, 0.5)) plt.ylabel('Score') plt.xlabel('Training examples') plt.show() # with open('SVM_output_file_with_SB.txt', 'w+') as f: # f.write("Output 1\n") # f.write("{}\n".format(output1)) # interpret_results(output1, testLabels, f) # f.write("\nOutput 2\n") # f.write("{}\n".format(output2)) # interpret_results(output2, testLabels, f) # f.write("\nOutput 3\n") # f.write("{}\n".format(output3)) # interpret_results(output3, testLabels, f) # f.write("Output 4\n") # f.write("{}\n".format(output4)) # interpret_results(output4, testLabels, f) # f.write("Output 5\n") # f.write("{}\n".format(output5)) # interpret_results(output5, testLabels, f) get_pca_graph(trainMatrix, trainLabels, "train_pca.png", title="PCA of Training Set") get_pca_graph(testMatrix, testLabels, "test_pca.png", title="PCA of Test Set") get_pca_graph(trainMatrix, trainLabels, "train_pca2.png", title="PCA of Training Set (Insults Only)", plot_negative=False) get_pca_graph(testMatrix, testLabels, "test_pca2.png", title="PCA of Test Set (Insults Only)", plot_negative=False)
class HexoSpeller(MainloopFeedback): states = { "level_one": 1, # the hexagons contain groups of symbols, the group has to be picked first "level_two": 2, # the hexagons contain individual symbols } def init(self): self.send_parallel(Marker.feedback_init) self.logger.debug("HexoSpeller::init") self._last_tick_time = time.clock() self._state = self.states["level_one"] language_model_folder_path = self._create_language_model_folder_path() self.load_language_model(os.path.join(language_model_folder_path, PARAMS["language_model_file"])) self.spelled_text = [] self._sub_list_probs = [] # probability values for each symbol sublist self._selected_symbol_idx = 0 self._selected_symbol_sublist_idx = 0 self._arrow_locked = False self._arrow_locked_time = None self.lock_arrow() self._control_signal = 0 self._viz = None self._model = None def pre_mainloop(self): #print "HexoSpeller::pre_main_loop" self._model = HexoModel(PARAMS) self._model.add_arrow_length_observer(self) self._viz = HexoViz(self, VIZ_PARAMS) self._viz.hexo_controller = self self._viz.set_symbol_lists(self.symbol_list) if hasattr(ColorSchemes, VIZ_PARAMS["color_scheme"]): scheme_dictionary = getattr(ColorSchemes, VIZ_PARAMS["color_scheme"]) VIZ_PARAMS.update(scheme_dictionary) # set some public variable that can be modified from the feedback controller GUI # set all variables for which there is a setter with the corresponding name for dict in [PARAMS, VIZ_PARAMS]: for key in dict.keys(): if hasattr(self, 'set_'+key): set_method = getattr(self, 'set_'+key) set_method(dict[key]) def post_mainloop(self): """ Tries to shut down the visualization. """ self._viz.shut_down() def tick(self): """ Is called in each iteration of the main loop. This method determines how much time has passed between the current and the previous tick, and then delegates that information to the _model and the view via their tick(dt) methods. 
""" if self._viz==None or self._model==None: return # determine how much time (in seconds) has passed between this and the previous tick current_time = time.clock() dt = current_time - self._last_tick_time self._last_tick_time = current_time # delegate the tick to the back end and the front end self._viz.tick(dt) self._model.tick(dt) self._model.set_control_signal(self.get_control_signal()) # if the arrow is locked and the locking period is over, unlock it if self.is_arrow_locked(): if current_time - self._arrow_locked_time > self.arrow_locked_duration: self.unlock_arrow() def play_tick(self): if not self.is_arrow_locked(): self._model.play_tick() self._viz.play_tick() def pause_tick(self): self._viz.pause_tick() self._model.pause_tick() def on_control_event(self, data): self.logger.debug('on_control_event') self.set_control_signal(self._data) def on_interaction_event(self, data): self.logger.debug("on_interaction_event") if type(data)==type({}): # try to set the modified attributes for name in data.keys(): # if we have the attribute and the respective setter if hasattr(self, name) and hasattr(self, "set_"+name): set_method = getattr(self, "set_"+name) new_value = data[name] set_method(new_value) def on_play(self): self.send_parallel(Marker.status_change_to_play) MainloopFeedback.on_play(self) def on_pause(self): if self._MainloopFeedback__running and self._MainloopFeedback__paused: self.send_parallel(Marker.status_change_to_play) if self._MainloopFeedback__running and not self._MainloopFeedback__paused: self.send_parallel(Marker.status_change_to_pause) MainloopFeedback.on_pause(self) def on_stop(self): self.send_parallel(Marker.status_change_to_stop) MainloopFeedback.on_stop(self) def get_selected_hexagon_index(self): """ Returns the hexagon that the arrow is currently pointing at. """ return self._model.get_selected_hexagon_index() def get_arrow_length(self): return self._model.get_arrow_length() def get_phi_degrees(self): return self._model.get_phi_degrees() def arrow_at_max_length(self): """ To be called by the _model when the arrow has reached maximum length. 
""" self.logger.debug("HexoFeedback::arrow_at_max_length") if self._state == self.states["level_one"]: self.send_parallel(Marker.hex_selected_level_one) # signal the GUI to change the content of the hexagons to single symbols selected_idx = self.get_selected_hexagon_index() self.send_parallel(Marker.selected_hex_level_one[selected_idx]) self._viz.set_big_symbols(self.symbol_list[selected_idx], selected_idx) self._selected_symbol_sublist_idx = selected_idx # return the arrow to start length, but don't change the angle self.reset_arrow_model(reset_phi=False) # change to _state 'second selection' self._state = self.states["level_two"] self.lock_arrow() self._viz.start_state_change_animation() elif self._state == self.states["level_two"]: self.send_parallel(Marker.hex_selected_level_two) # get and store the selected symbol self.get_selected_symbol() self.update_symbol_list() # update the spelled word in the GUI self._viz.show_spelled_text(self.text_list_to_string(self.spelled_text)) # signal the GUI to change the content of the hexagons back to multiple symbols self._viz.set_symbol_lists(self.symbol_list) # return the arrow to start angle and start length current_phi = self._model.get_phi_degrees() new_phi = (self.get_most_probable_hexagon_index()*60 + self.hex_pre_select_bias) % 360 self.reset_arrow_model(reset_phi=True, phi=new_phi) self._state = self.states["level_one"] self.lock_arrow() self._viz.start_state_change_animation(rot_arrow=True, phi_start=current_phi, phi_end=new_phi) def reset_arrow_model(self, reset_phi=False, phi=0, control_signal=0): """ Resets the arrow length to initial length and the arrow angle and control signal value according to the given values. """ self._model.reset_arrow_length() self._model.set_control_signal(control_signal) if reset_phi: self._model.reset_phi(phi) def get_selected_symbol(self): idx = self.get_selected_hexagon_index() self._selected_symbol_idx = idx self.send_parallel(Marker.selected_hex_level_two[idx]) symbol = self._viz.get_selected_symbol(self._selected_symbol_sublist_idx, self._selected_symbol_idx) if symbol == self._language_model.delete_symbol: # if the delete symbol was selected and there is something to delete, pop the last character from the list if len(self.spelled_text) > 0: self.spelled_text.pop() elif not symbol == None: # if the symbol is not None, attach it to the spelled Text self.spelled_text.append(symbol) # send a marker idx = self._language_model.get_symbol_index(symbol) if not idx == None: self.send_parallel(Marker.selected_letter[idx]) def get_most_probable_hexagon_index(self): """ Returns the index of the hexagon that contains the most probable next letter, based on what is already written. """ return self._language_model.get_most_probable_symbol_sublist_index() def update_symbol_list(self): """ Update the order of symbols in the symbol list based on the spelled text. """ spelled_text = self.text_list_to_string(self.spelled_text) self.symbol_list = self._language_model.update_symbol_list_sorting(spelled_text) def _create_language_model_folder_path(self): """ Creates a path that points to the folder that contains the language model file. I assume that the lm files lie in a folder called "LanguageModels" which itself lies in the same folder as the HexoSpeller.py file, whose path is given by the __file__ variable. 
""" file_path = __file__ # file_path is now something like foo/bar/Feedbacks/HexoSpeller/HexoSpeller.py # remove the actual file name from the path by first reversing the string, then partitioning at the # last occourence of the path separator end reversing the tail of the partitioning reversed_file_path = file_path[::-1] (_file_name, _sep, hexospeller_dir) = reversed_file_path.partition(os.path.sep) hexospeller_dir = hexospeller_dir[::-1] # reverse it, now in correct order # now complete the path to point to the language model directory lm_path = os.path.join(hexospeller_dir,"LanguageModels") return lm_path def load_language_model(self, file_name): """ Get the language _model, preferably from file. The path should be specified in params... """ self._language_model = LanguageModel(file_name) self.symbol_list = self._language_model.get_symbol_list() def text_list_to_string(self, text_list): text = '' for c in text_list: text = text + c return text def set_control_signal(self, value): self._control_signal = value def get_control_signal(self): return self._control_signal def lock_arrow(self): self._arrow_locked = True self._arrow_locked_time = time.clock() def unlock_arrow(self): self._arrow_locked = False def is_arrow_locked(self): return self._arrow_locked #================================================================================ # Setter for the variables that should be # setable from the feedback controller GUI #================================================================================ def set_hexagon_default_color(self, rgb): self.hexagon_default_color = rgb if not self._viz == None: r,g,b, = rgb self._viz.set_hexagon_color(r,g,b) def set_hexagon_highlight_color(self, rgb): self.hexagon_highlight_color = rgb if not self._viz == None: r,g,b, = rgb self._viz.set_hexagon_highlight_color(r, g, b) def set_hexagon_text_color(self, rgb): self.hexagon_text_color = rgb if not self._viz == None: r,g,b, = rgb self._viz.set_hexagon_text_color(r, g, b, alpha=1) def set_arrow_color(self, rgb): self.arrow_color = rgb if not self._viz == None: r,g,b, = rgb self._viz.set_arrow_color(r,g,b) def set_state_change_animation_duration(self, dur): self.state_change_animation_duration = dur if not self._viz == None: self._viz.params['state_change_animation_duration'] = dur def set_arrow_growth_time(self, time): self.arrow_growth_time = time if not self._model == None: self._model.params['arrow_growth_time'] = time def set_arrow_shrinkage_time(self, time): self.arrow_shrinkage_time = time if not self._model == None: self._model.params['arrow_shrinkage_time'] = time def set_arrow_rotation_time(self, time): self.arrow_rotation_time = time if not self._model == None: self._model.params['arrow_rotation_time'] = time def set_arrow_locked_duration(self, duration): self.arrow_locked_duration = duration def set_control_signal_arrow_rotation_threshold(self, t): self.control_signal_arrow_rotation_threshold = t if not self._model == None: self._model.params["control_signal_arrow_rotation_threshold"] = t if not self._viz == None: self._viz.set_arrow_rotation_threshold(t) def set_control_signal_arrow_growth_threshold(self, t): self.control_signal_arrow_growth_threshold = t if not self._model == None: self._model.params["control_signal_arrow_growth_threshold"] = t if not self._viz == None: self._viz.set_arrow_growth_threshold(t) def set_control_signal_bar_frame_color(self, rgb): self.control_signal_bar_frame_color = rgb if not self._viz == None: r,g,b = rgb self._viz.set_control_signal_bar_frame_color(r,g,b) def 
set_control_signal_bar_color(self, rgb): self.control_signal_bar_color = rgb if not self._viz == None: r,g,b = rgb self._viz.set_control_signal_bar_color(r,g,b) def set_lm_head_factors(self, head_factors): self._language_model.head_factors = head_factors self.lm_head_factors = head_factors def set_lm_letter_factor(self, letter_factor): self._language_model.letter_factor = letter_factor self.lm_letter_factor = letter_factor def set_lm_n_pred(self, n_pred): self._language_model.n_pred = n_pred self.lm_n_pred = n_pred def set_textboard_background_color(self, rgb): self.textboard_background_color = rgb if not self._viz == None: r,g,b = rgb self._viz.set_textboard_background_color(r, g, b) def set_textboard_frame_color(self, rgb): self.textboard_frame_color = rgb if not self._viz == None: r,g,b = rgb self._viz.set_textboard_frame_color(r, g, b) def set_textboard_text_color(self, rgb): self.textboard_text_color = rgb if not self._viz == None: r,g,b = rgb self._viz.set_textboard_text_color(r, g, b) def set_background_color(self, rgb): self.backgroud_color = rgb if not self._viz == None: r,g,b = rgb self._viz.set_background_color(r,g,b) def set_hex_pre_select_bias(self, v): self.hex_pre_select_bias = v
def load_language_model(self, file_name):
    """ Get the language _model, preferably from file. The path should be specified in params... """
    self._language_model = LanguageModel(file_name)
    self.symbol_list = self._language_model.get_symbol_list()
def main(): global LM # print "training language model" # trainingCorpus = HolbrookCorpus(brown.sents()) # LM = LanguageModel(trainingCorpus) # print "training complete" # print "------------------" #testLanguageModel() #tagged_corpus = cess_esp.tagged_sents() #size = int(len(tagged_corpus) * .9) #training = tagged_corpus[:size] #print "training HiddenMarkovModelTagger" #hmm_tagger = HiddenMarkovModelTagger.train(training) #print "finished training" dict_file = "../data/dictionary.txt" sentences_file = "../corpus/corpus_test.txt" tagged_corpus_file = "../data/tagged_sentences_test.txt" dictionary_lists = loadList(dict_file) sentences_lists = loadList(sentences_file) tagged_sentences = loadList(tagged_corpus_file) print "training LM..." trainingCorpus = HolbrookCorpus(brown.sents()) LM = LanguageModel(trainingCorpus) estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2) # unigram_model = NgramModel(1, brown.words(), True, False, estimator) unigram_model = None print "finished training LM" #print sentences_lists #print dictionary_lists dictionary = dict() for entry in dictionary_lists: entry_list = entry.split() key = "" translations = [] for idx, word in enumerate(entry_list): if idx== 0: key = word.lower() else: translations.append(word) dictionary[key]=translations #print dictionary #s = 0 #for v in dictionary.values(): # s += v #s /= len(dictionary) #print "avg num values = " + str(s) #return # s = 0 # for v in dictionary.values(): # s += v # s /= len(dictionary) # print "avg num values = " + str(s) # get_translations_by_pos("momento/ncms000", {"momento": ['time/N', 'times/NP', 'moment/N', 'moments/NP']}) #testing tag method # return for idx, sentence in enumerate(sentences_lists): if sentence == "": continue #tagged_sentences.append(hmm_tagger.tag(sentence.split())) #print("") # tagged_sentences.append(hmm_tagger.tag(sentence.split())) print("") print("Sentence ",idx+1) # sentence_list = sentence.split() #tagged_list = hmm_tagger.tag(sentence.split()) tagged_sentence = tagged_sentences[idx].split() #for pair in tagged_list: # tagged_sentence.append("/".join(pair)) #print tagged_sentence sentence_list = tagged_sentence demo_translation_list = [] list_of_likely_translations = [] list_of_likely_translations= likely_translations(sentence_list,dictionary,unigram_model) list_of_likely_translations_as_strings = [] for lis in list_of_likely_translations: lis = noun_adjective_switch(lis) lis = noun_of_the_noun_switch(lis) clean_lis = remove_pos_tags_and_underscores(lis) string = ' '.join(clean_lis) list_of_likely_translations_as_strings.append(string) best = LM.n_most_likely(list_of_likely_translations_as_strings, 10) #print list_of_likely_translations ##if idx==0: # print list_of_likely_translations #for lis in list_of_likely_translations: #print lis for word in sentence.split(): # print(word) word = word.replace('.','') word = word.replace(',','') word = word.replace(':','') word = word.replace('(','') word = word.replace(')','') word = word.replace('-','') word = word.lower() #print(word) if word!='': trans = dictionary.get(word) demo_translation_list.append(trans[(idx+17)%(len(trans))-1]) #print demo_translation_list pos_free_translation_list = remove_pos_tags_and_underscores(demo_translation_list) print("Initial Translation: ",' '.join(pos_free_translation_list),) print("Final Translation: ",best[0])
# get corpus directories
corpus_root_xml = nltk.data.find('C:\\Users\\James\\PycharmProjects\\FIT3036\\xml')
corpus_root_plain = 'C:\\Users\\James\\PycharmProjects\\FIT3036\\plain_text'
# get all xml and plain text files from specified directories
corpus_xml = CHILDESCorpusReader(corpus_root_xml, '.*.xml')
corpus_plain = PlaintextCorpusReader(corpus_root_plain, '.*.cha')
# get all the words spoken by a child
all_words = [w.lower() for w in corpus_xml.words(speaker=['CHI'])]
# init wordnet and language model
corpus_ic = wn.ic(corpus_xml, True, 1.0)
lm = LanguageModel(all_words)
# collect all the features for each corpus
for j in range(len(corpus_xml.fileids())):
    current_features = []  # init empty array to store features
    # Text initialization
    text_xml = corpus_xml.fileids()[j]
    text_plain = corpus_plain.fileids()[j]
    # list of words spoken by the child in lowercase
    child_words_xml = [
        w.lower() for w in corpus_xml.words(text_xml, speaker=['CHI'])
    ]
    # list of words spoken by the child in lowercase with replaced words
    child_words_replaced_xml = [
    else:
        result = eachword + ' ' + ''.join(seq_of_rest)
    return result, max


if __name__ == '__main__':
    if len(sys.argv) < 4:
        print "Usage: python corrector.py <dev | test> <uniform | empirical> <queries file>"
        exit(0)
    queries_file = sys.argv[3]
    queries, gold, google = read_query_data(queries_file)
    kind_of_editmodel = sys.argv[2]
    # Read in unigram and bigram probs
    print >> sys.stderr, "Loading language model"
    languagemodel = LanguageModel('unigram_model', 'bigram_model')
    print >> sys.stderr, "Loading edit model"
    editmodel = EditModel(kind_of_editmodel, languagemodel)
    languagemodel.init_edit_model(editmodel)
    print >> sys.stderr, "Loading spell correct"
    spell_corrector = SpellCorrect(languagemodel, editmodel)
    answers = []
    qc = 0
    for eachquery in queries:
        answer = spell_corrector.spell_correct_query(eachquery)
        print answer
        print >> sys.stderr, "%d" % (qc)
        qc += 1
        answers.append(answer)
    # Accuracy evaluation
    wrong = 0
class SpellChecker(): def __init__(self, max_distance, channel_model=None, language_model=None): self.nlp = spacy.load('en', pipeline=["tagger", "parser"]) self.max_distance = max_distance # self.load_channel_model(channel_model) # self.load_language_model(language_model) def load_channel_model(self, fp): self.channel_model = EditDistanceFinder() self.channel_model.load(fp) def load_language_model(self, fp): self.language_model = LanguageModel() self.language_model.load(fp) def bigram_score(self, prev_word, focus_word, next_word): prevFocusScore = self.language_model.bigram_prob(prev_word, focus_word) focusNextScore = self.language_model.bigram_prob(focus_word, next_word) return (prevFocusScore + focusNextScore) / 2 def unigram_score(self, word): return self.language_model.unigram_prob(word) def cm_score(self, error_word, corrected_word): return self.channel_model.prob(error_word, corrected_word) def inserts(self, word): ''' Takes in word and return a list of words that are within one insert of word ''' # Insert every letter possibleWords = [] for letter in string.ascii_lowercase: # Every possible position for i in range(len(word) + 1): # Check if the resulting word is a word testWord = word[:i] + letter + word[i:] if self.language_model.__contains__(testWord): possibleWords.append(testWord) return possibleWords def deletes(self, word): # Delete every letter possibleWords = [] for i in range(len(word) + 1): # Check if the resulting word is a word testWord = word[:i] + word[i + 1:] if self.language_model.__contains__(testWord): possibleWords.append(testWord) return possibleWords def substitutions(self, word): # Substitute every letter possibleWords = [] for letter in string.ascii_lowercase: # At every possible position for i in range(len(word) + 1): # Check if the resulting word is a word testWord = word[:i] + letter + word[i + 1:] if self.language_model.__contains__(testWord): possibleWords.append(testWord) return possibleWords def generate_candidates(self, word): ''' Takes in a candidate word and returns words that are within self.max_distance edits of word ''' for i in range(1, self.max_distance + 1): if i == 1: candidateWords = self.inserts(word) + self.deletes( word) + self.substitutions(word) else: newWords = [] for currentWord in candidateWords: newWords += self.inserts(currentWord) + self.deletes( currentWord) + self.substitutions(currentWord) candidateWords += newWords # Get rid of duplicates return list(set(candidateWords)) def check_sentence(self, sentence, fallback=False): returnList = [] for i in range(len(sentence)): if i == 0 and i == len(sentence) - 1: prevWord = '<s>' nextWord = '</s>' elif i == 0: prevWord = '<s>' nextWord = sentence[i + 1] elif i == len(sentence) - 1: nextWord = '</s>' prevWord = sentence[i - 1] else: prevWord = sentence[i - 1] nextWord = sentence[i + 1] word = sentence[i] # If it's in the language model, add just that word if self.language_model.__contains__(word): returnList.append([word]) else: # Get all the candidates for that word candidates = self.generate_candidates(word) candidateList = [] if candidates == [] and fallback: candidateList = [word] else: for candidate in candidates: unigramScore = self.unigram_score(candidate) bigramScore = self.bigram_score( prevWord, candidate, nextWord) languageScore = (0.5*unigramScore) + \ (0.5 * bigramScore) candidateScore = languageScore + \ self.cm_score(word, candidate) candidateList.append([candidate, candidateScore]) # Sort the list by the second element candidateList.sort(key=lambda x: x[1], reverse=True) 
# Remove the second element, and append candidateList = [x[0] for x in candidateList] returnList += [candidateList] return returnList def check_text(self, text, fallback=False): ''' take a string as input, tokenize and sentence segment it with spacy, and then return the concatenation of the result of calling check_sentence on all of the resulting sentence objects. ''' tokens = self.nlp(text) sentences = list(tokens.sents) processedSentences = [] for sentence in sentences: # Convert sentence into list of lowercase words wordList = sentence.text.split() wordList = [x.lower() for x in wordList] processedSentences.append(self.check_sentence(wordList, fallback)) return processedSentences def autocorrect_sentence(self, sentence): ''' take a tokenized sentence (as a list of words) as input, call check_sentence on the sentence with fallback=True, and return a new list of tokens where each non-word has been replaced by its most likely spelling correction. ''' corrections = self.check_sentence(sentence, fallback=True) return [x[0] for x in corrections] def autocorrect_line(self, line): ''' take a string as input, tokenize and segment it with spacy, and then return the concatenation of the result of calling autocorrect_sentence on all of the resulting sentence objects. ''' tokens = self.nlp(line) sentences = list(tokens.sents) processedSentences = [] for sentence in sentences: # Convert sentence into list of lowercase words wordList = sentence.text.split() if len(wordList) == 0: continue wordList = [x.lower() for x in wordList] processedSentences.append(self.autocorrect_sentence(wordList)) return processedSentences def suggest_sentence(self, sentence, max_suggestions): ''' take a tokenized sentence (as a list of words) as input, call check_sentence on the sentence, and return a new list where: Real words are just strings in the list Non-words are lists of up to max_suggestions suggested spellings, ordered by your model’s preference for them. ''' sentenceCorrections = self.check_sentence(sentence) returnList = [] for word in sentenceCorrections: if len(word) == 1: returnList += word else: returnList.append(word[:max_suggestions]) return returnList def suggest_text(self, text, max_suggestions): ''' take a string as input, tokenize and segment it with spacy, and then return the concatenation of the result of calling suggest_sentence on all of the resulting sentence objects ''' tokens = self.nlp(text) sentences = list(tokens.sents) processedSentences = [] for sentence in sentences: # Convert sentence into list of lowercase words wordList = sentence.text.split() wordList = [x.lower() for x in wordList] # Get rid of the period if wordList[-1][-1] == '.': wordList[-1] = wordList[-1][:-1] processedSentences.append( self.suggest_sentence(wordList, max_suggestions)) return processedSentences
def main(): print ("Generating language models....") trainAACleanLM = LanguageModel(CLEAN_TRAIN_AA_FILE) trainAAInsultLM = LanguageModel(INSULT_TRAIN_AA_FILE) trainABCleanLM = LanguageModel(CLEAN_TRAIN_AB_FILE) trainABInsultLM = LanguageModel(INSULT_TRAIN_AB_FILE) testCleanLM = LanguageModel(CLEAN_TEST_FILE) testInsultLM = LanguageModel(INSULT_TEST_FILE) trainLabels = np.array(([0] * trainABCleanLM.getDocCount()) + ([1] * trainABInsultLM.getDocCount())) testLabels = np.array(([0] * testCleanLM.getDocCount()) + ([1] * testInsultLM.getDocCount())) ### Just baseline probabilities print ("Running baseline....") NB = baselineNaiveBayes(trainAACleanLM, trainAAInsultLM) print ("\tTraining NB....") NB.train() print ("\tTesting NB....") totalNBMatrix = np.array(NB.genProbs(trainABCleanLM.getSents(), trainABInsultLM.getSents())) trainMatrix = totalNBMatrix testMatrix = np.array(NB.genProbs(testCleanLM.getSents(), testInsultLM.getSents())) clf = LogisticRegression() print ("\tTraining SVM....") clf.fit(trainMatrix, trainLabels) print ("\tTesting SVM....") output1 = clf.predict(testMatrix).tolist() ### Baseline + PoS Features print ("Running baseline + PoS Features....") cleanPosMatrix = trainABCleanLM.getPosMatrix() insultPosMatrix = trainABInsultLM.getPosMatrix() testCleanPosMatrix = testCleanLM.getPosMatrix() testInsultPosMatrix = testInsultLM.getPosMatrix() posFeatures = np.array(cleanPosMatrix + insultPosMatrix) testPosFeatures = np.array(testCleanPosMatrix + testInsultPosMatrix) trainMatrix = np.hstack((trainMatrix, posFeatures)) testMatrix = np.hstack((testMatrix, testPosFeatures)) clf = LogisticRegression() print ("\tTraining SVM....") clf.fit(trainMatrix, trainLabels) print ("\tTesting SVM....") output2 = clf.predict(testMatrix).tolist() ### Baseline + PoS Features + TF-IDF Features (TODO Arun) print("Running baseline + PoS Features + TF-IDF Features") # generate list of features with TFIDF, using trainABCleanLM and trainABInsultLM # trainMatrix = np.hstack((trainMatrix, the new thing you just generated)) # do same for testMatrix # clf = svm.SVC() # print ("\tTraining SVM....") # clf.fit(trainMatrix, trainLabels) # print ("\tTesting SVM....") # output3 = clf.predict(testMatrix).tolist() # then update the output_file.txt thing below tfidf_train_features = tfidf.make_feature_vectors(trainAACleanLM, trainAAInsultLM, trainABCleanLM, trainABInsultLM) tfidf_test_features = tfidf.make_feature_vectors(trainAACleanLM, trainAAInsultLM, testCleanLM, testInsultLM) print tfidf_test_features.shape, tfidf_train_features.shape print testMatrix.shape, trainMatrix.shape trainMatrix = np.hstack((trainMatrix, tfidf_train_features)) testMatrix = np.hstack((testMatrix, tfidf_test_features)) clf = LogisticRegression() print ("\tTraining SVM....") clf.fit(trainMatrix, trainLabels) print ("\tTesting SVM....") output3 = clf.predict(testMatrix).tolist() ### SENTIMENT ### print("Running baseline + PoS Features + TF-IDF Features + Sentiment Features") s = Sentiment() clean_train = np.array(s.get_clean_train_vector()) insult_train = np.array(s.get_insult_train_vector()) sentiment_train_features = np.concatenate((clean_train, insult_train), axis=0) shape = sentiment_train_features.shape sentiment_train_features = sentiment_train_features.reshape((shape[0], 1)) print sentiment_train_features.shape clean_test = np.array(s.get_clean_test_vector()) insult_test = np.array(s.get_insult_test_vector()) sentiment_test_features = np.concatenate((clean_test, insult_test), axis=0) shape = sentiment_test_features.shape 
    ### SENTIMENT ###
    print("Running baseline + PoS Features + TF-IDF Features + Sentiment Features....")
    s = Sentiment()
    clean_train = np.array(s.get_clean_train_vector())
    insult_train = np.array(s.get_insult_train_vector())
    sentiment_train_features = np.concatenate((clean_train, insult_train), axis=0)
    shape = sentiment_train_features.shape
    sentiment_train_features = sentiment_train_features.reshape((shape[0], 1))
    print(sentiment_train_features.shape)
    clean_test = np.array(s.get_clean_test_vector())
    insult_test = np.array(s.get_insult_test_vector())
    sentiment_test_features = np.concatenate((clean_test, insult_test), axis=0)
    shape = sentiment_test_features.shape
    sentiment_test_features = sentiment_test_features.reshape((shape[0], 1))
    print(sentiment_test_features.shape)
    trainMatrix = np.hstack((trainMatrix, sentiment_train_features))
    testMatrix = np.hstack((testMatrix, sentiment_test_features))
    clf = LogisticRegression()
    print("\tTraining logistic regression....")
    clf.fit(trainMatrix, trainLabels)
    print("\tTesting logistic regression....")
    output4 = clf.predict(testMatrix).tolist()

    ### MISSPELLINGS ###
    print("Running baseline + PoS Features + TF-IDF Features + Sentiment Features + Misspellings Features....")
    m = Misspellings()
    clean_train = np.array(m.get_clean_misspellings(False))
    insult_train = np.array(m.get_insult_misspellings(False))
    misspellings_train_features = np.concatenate((clean_train, insult_train), axis=0)
    shape = misspellings_train_features.shape
    misspellings_train_features = misspellings_train_features.reshape((shape[0], 1))
    print(misspellings_train_features.shape)
    clean_test = np.array(m.get_clean_misspellings())
    insult_test = np.array(m.get_insult_misspellings())
    misspellings_test_features = np.concatenate((clean_test, insult_test), axis=0)
    shape = misspellings_test_features.shape
    misspellings_test_features = misspellings_test_features.reshape((shape[0], 1))
    print(misspellings_test_features.shape)
    trainMatrix = np.hstack((trainMatrix, misspellings_train_features))
    testMatrix = np.hstack((testMatrix, misspellings_test_features))
    clf = LogisticRegression()
    print("\tTraining logistic regression....")
    clf.fit(trainMatrix, trainLabels)
    print("\tTesting logistic regression....")
    output5 = clf.predict(testMatrix).tolist()

    with open('LOG_REG_output_file_w_SB.txt', 'w+') as f:
        f.write("Output 1\n")
        f.write("{}\n".format(output1))
        interpret_results(output1, testLabels, f)
        f.write("\nOutput 2\n")
        f.write("{}\n".format(output2))
        interpret_results(output2, testLabels, f)
        f.write("\nOutput 3\n")
        f.write("{}\n".format(output3))
        interpret_results(output3, testLabels, f)
        f.write("\nOutput 4\n")
        f.write("{}\n".format(output4))
        interpret_results(output4, testLabels, f)
        f.write("\nOutput 5\n")
        f.write("{}\n".format(output5))
        interpret_results(output5, testLabels, f)
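The helper interpret_results is called above but not shown in this excerpt. A plausible sketch of such a function follows, assuming the predictions and testLabels are 0/1 sequences with 1 = insult and that summary statistics are written to the already-open file handle; the project's real implementation may report different statistics.

def interpret_results(predictions, labels, f):
    # Hypothetical sketch; assumes 0 = clean, 1 = insult.
    tp = sum(1 for p, y in zip(predictions, labels) if p == 1 and y == 1)
    tn = sum(1 for p, y in zip(predictions, labels) if p == 0 and y == 0)
    fp = sum(1 for p, y in zip(predictions, labels) if p == 1 and y == 0)
    fn = sum(1 for p, y in zip(predictions, labels) if p == 0 and y == 1)
    precision = tp / float(tp + fp) if (tp + fp) else 0.0
    recall = tp / float(tp + fn) if (tp + fn) else 0.0
    accuracy = (tp + tn) / float(len(labels)) if len(labels) else 0.0
    f.write("precision={:.3f} recall={:.3f} accuracy={:.3f}\n".format(
        precision, recall, accuracy))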
class MT:
    def __init__(self, file, ngrams):
        self.dictionary = self.read_json(file)
        self.ngrams = self.read_json(ngrams)
        self.stemmer = SnowballStemmer("german")
        self.tagger = POStagger()
        self.trainOnAllLM()

    def translate(self, file):
        translated_sentences = []
        sentences = self.read_json(file)
        dev = self.tagger.tag(sentences['test'])
        # print(dev)
        for line in dev:
            clauses = self.split_line(line)
            translated_clauses = []
            for clause in clauses:
                LL = []
                clause = self.reorder_dependent_clause(clause)
                clause = self.reorder_obj_subj(clause)
                clause = self.reorder_participles(clause)
                clause = self.reorder_modals(clause)
                clause = self.recombine_sep_prefixes(clause)
                # words = self.reorder_adverbs(words)
                clause = self.interpolate_idioms(clause)
                clause = self.split_compounds(clause)
                for word in clause:
                    word = word.replace('.', '')
                    LL.append(self.lookup(word))
                    # print(LL[-1])
                translated_clauses.append(self.refine_word_choice(LL))
            translated_sentences.append(translated_clauses)
            # engSent.append(LL)
        translation = []
        for sentence in translated_sentences:
            trans = ""
            for clause in sentence:
                clauz = ""
                for word in clause:
                    clauz += " " + word
                trans += ", " + clauz
            translation.append(trans)
        return translation
        # return translated_sentences

    def refine_word_choice(self, LL):
        output = [[]]
        for wordList in LL:
            numPrefix = len(output)
            numWords = len(wordList)
            if len(wordList) > 1:
                tmp = [None] * (numWords * numPrefix)
                for i in range(numWords):
                    for k in range(numPrefix):
                        tmp[i * numPrefix + k] = copy.copy(output[k])
                output = tmp
            for i, word in enumerate(wordList):
                for itr in range(numPrefix):
                    output[i * numPrefix + itr].append(word)
        bestScore = float("-inf")
        index = 0
        for i, sent in enumerate(output):
            currScore = self.LM.score(sent)
            if currScore > bestScore:
                bestScore = currScore
                index = i
        print(bestScore)
        return output[index]

    def reorder_dependent_clause(self, words):
        pairs = [x.split('_') for x in words]
        if re.match('V[VAM]FIN', pairs[-1][-1]):
            changed = False
            """
            First look for an auxiliary verb to which to attach our end verb.
            """
            # for i in range(len(pairs) - 1, -1, -1):
            #     if '_VAFIN' in pairs[i][-1]:
            #         words = words[:i + 1] + [words[-1]] + words[i + 1:-1]
            #         changed = True
            #         break
            #     elif '_KON' in pairs[i][-1]:
            #         # Give up at the first conjunction
            #         break
            if not changed:
                """
                Find where to put it.  Look for a pair of noun phrases or
                articles or prepositions and put it between them.  Note that
                we can't cross a conjunction, though.
                """
                first = -1
                second = -1
                conjunction = -1
                # First we assume that articles do not substitute for nouns
                for i, pair in enumerate(pairs[:-1]):
                    if pair[-1] == 'PPER' or pair[-1] == 'PPOSS' or pair[-1] == 'PWS' or \
                            pair[-1] == 'NN' or pair[-1] == 'NE' or pair[-1] == 'PDS':
                        if first < 0:
                            first = i
                        elif second < 0:
                            second = i
                        else:
                            print("Three subjects/objects in " + str(words))
                            break
                    elif pair[-1] == 'KON' or pair[-1] == 'KOUS' or pair[-1] == 'KOKOM':
                        first = -1
                        second = -1
                        conjunction = i
                if first >= 0 and second < 0:
                    first = -1
                    conjunction = -1
                    first_is_article = False
                    second_is_article = False
                    # An article may be substituting for the second noun.  Try again
                    for i, pair in enumerate(pairs[:-1]):
                        if pair[-1] == 'ART' or pair[-1] == 'CARD':
                            if first < 0:
                                first = i
                                first_is_article = True
                            elif second < 0:
                                second = i
                                second_is_article = True
                            else:
                                print("Three subjects/objects in " + str(words))
                                break
                        elif pair[-1] == 'PPER' or pair[-1] == 'PPOSS' or pair[-1] == 'PWS' or \
                                pair[-1] == 'NN' or pair[-1] == 'NE' or pair[-1] == 'PDS':
                            if second < 0 and (first < 0 or first_is_article):
                                first = i
                                first_is_article = False
                            elif second < 0 or second_is_article:
                                second = i
                                second_is_article = False
                            else:
                                print("Three subjects/objects in " + str(words))
                                break
                        elif pair[-1] == 'KON' or pair[-1] == 'KOUS' or pair[-1] == 'KOKOM':
                            first = -1
                            second = -1
                            conjunction = i
                if second < 0:
                    # Nope. Maybe an article is subbing for the first noun. Try again.
                    first = -1
                    conjunction = -1
                    first_is_article = False
                    for i, pair in enumerate(pairs[:-1]):
                        if pair[-1] == 'ART' or pair[-1] == 'CARD':
                            if first < 0:
                                first = i
                            elif second < 0:
                                second = i
                                second_is_article = True
                            else:
                                print("Three subjects/objects in " + str(words))
                                break
                        elif pair[-1] == 'PPER' or pair[-1] == 'PPOSS' or pair[-1] == 'PWS' or \
                                pair[-1] == 'NN' or pair[-1] == 'NE' or pair[-1] == 'PDS':
                            if first < 0:
                                first = i
                            elif second < 0 or second_is_article:
                                second = i
                                second_is_article = False
                            else:
                                print("Three subjects/objects in " + str(words))
                                break
                        elif pair[-1] == 'KON' or pair[-1] == 'KOUS' or pair[-1] == 'KOKOM':
                            first = -1
                            second = -1
                            conjunction = i
                if first < 0 and conjunction < 0:
                    # Move the verb into the first position
                    words = words[-1:] + words[:-1]
                elif first < 0:
                    # Move the verb to just after the conjunction
                    words = words[:conjunction + 1] + words[-1:] + words[conjunction + 1:-1]
                else:
                    words = words[:first + 1] + words[-1:] + words[first + 1:-1]
        return words

    @staticmethod
    def read_json(file):
        with codecs.open(file, encoding='utf8') as f:
            sentences = json.load(f)
        return sentences

    @staticmethod
    def split_line(line):
        phrases = []
        words = []
        for word in re.findall(r'[^\s]+_(?:[A-Z]+|\$[.,(])', line):
            if len(word) > 0 and word[-2] == '$':
                phrases.append(words)
                words = []
            elif len(word) > 0:
                words.append(word)
        if len(words) > 0:
            phrases.append(words)
        return phrases

    """
    Find phrases in the source sentence and replace them with idiomatic translations.
    """
    def interpolate_idioms(self, words):
        new = words
        changed = True
        while changed:
            changed = False
            for i in range(len(new) - 1):
                for j in range(len(new), i + 1, -1):
                    phrase = ' '.join([x.split('_')[0] for x in new[i:j]])
                    if phrase in self.dictionary['idioms']:
                        new = new[:i] + [self.dictionary['idioms'][phrase] + '_IDIOM'] + new[j:]
                        changed = True
                        break
                if changed:
                    # restart the scan from scratch, since indices have shifted
                    break
        return new

    def lookup(self, word):
        parts = word.split('_')
        translation = [parts[0]]
        if parts[1] != 'IDIOM':
            if parts[0] in self.dictionary['words']:
                translation = self.dictionary['words'][parts[0]]
            elif parts[0].lower() in self.dictionary['words']:
                translation = self.dictionary['words'][parts[0].lower()]
            if parts[1].startswith('V'):
                translation = [self.from_tense(w, self.get_tense(parts[0], parts[1]))
                               for w in translation]
        return translation
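    # Illustrative sketch of the assumed dictionary layout used by lookup and
    # interpolate_idioms (the real JSON may differ; entries here are made up):
    #   {"words":  {"Hund": ["dog", "hound"], ...},
    #    "idioms": {"es tut mir leid": "I am sorry", ...},
    #    "verbs":  {...},
    #    "tenses": {...}}
    # With such a dictionary, lookup("Hund_NN") returns ["dog", "hound"], and
    # interpolate_idioms(["es_PPER", "tut_VVFIN", "mir_PPER", "leid_ADJD"])
    # collapses the matched span to ["I am sorry_IDIOM"].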
    def verb_stem(self, verb):
        stem = verb
        m = REGULAR_PATTERN.match(verb)
        if m:
            stem = m.group(1)
        else:
            # Must be irregular present or past (1S or 3S), but how did we
            # not already find it in the dictionary?
            raise Exception("Didn't find %s in the dictionary" % verb)
        return stem

    def from_tense(self, verb, tense):
        words = verb.split(' ')
        rest = []
        if len(words) > 1:
            verb = words[0]
            rest = words[1:]
        new = verb
        if verb in self.dictionary['verbs'] and len(self.dictionary['verbs'][verb]) == 3 and tense == '1':
            new = self.dictionary['verbs'][verb][0][0]
        elif verb in self.dictionary['verbs'] and len(self.dictionary['verbs'][verb]) == 3 and tense == '2':
            new = self.dictionary['verbs'][verb][0][1]
        elif verb in self.dictionary['verbs'] and len(self.dictionary['verbs'][verb]) == 3 and tense == '3':
            new = self.dictionary['verbs'][verb][0][2]
        elif verb in self.dictionary['verbs'] and len(self.dictionary['verbs'][verb]) == 3 and tense[-1] == '+':
            new = self.dictionary['verbs'][verb][0][3]
        elif verb in self.dictionary['verbs'] and tense == 'PP':
            new = self.dictionary['verbs'][verb][-1]
        elif verb in self.dictionary['verbs'] and type(self.dictionary['verbs'][verb][-2]) == list and tense == '1P':
            new = self.dictionary['verbs'][verb][-2][0]
        elif verb in self.dictionary['verbs'] and type(self.dictionary['verbs'][verb][-2]) == list and tense == '2P':
            new = self.dictionary['verbs'][verb][-2][1]
        elif verb in self.dictionary['verbs'] and type(self.dictionary['verbs'][verb][-2]) == list and tense == '3P':
            new = self.dictionary['verbs'][verb][-2][2]
        elif verb in self.dictionary['verbs'] and type(self.dictionary['verbs'][verb][-2]) == list and tense[-2:] == '+P':
            new = self.dictionary['verbs'][verb][-2][3]
        elif verb in self.dictionary['verbs'] and tense[-1] == 'P':
            new = self.dictionary['verbs'][verb][-2]
        elif tense[-1] == 'P' and verb[-1] == 'e':
            new = verb + 'd'
        elif tense[-1] == 'P' and re.match('.*[^aeiou][aeiou][b-df-hj-np-tvwyz]$', verb):
            new = verb + verb[-1] + 'ed'
        elif tense[-1] == 'P':
            new = verb + 'ed'
        # elif tense == 'I' and verb[-1] == 'e':
        #     new = verb[:-1] + 'ing'
        # elif tense == 'I':
        #     new = verb + 'ing'
        elif tense == '2' and verb[-1] == 'o':
            new = verb + 'es'
        elif tense == '2':
            new = verb + 's'
        return ' '.join([new] + rest)
    def get_tense(self, verb, tag):
        words = verb.split(' ')
        if len(words) > 1:
            verb = words[0]
        tense = None
        if tag.endswith('PP'):
            tense = 'P'
        elif tag.endswith('INF'):
            tense = 'I'
        elif verb in self.dictionary['tenses']:
            tense = self.dictionary['tenses'][verb]
            if tense == 'P':
                tense = '1P'  # 1st or 3rd, we don't care which
            elif tense == 'I':
                tense = '1+'  # 1st or 3rd, we don't care which
        else:
            m = REGULAR_PATTERN.match(verb)
            if m and m.group(1) in self.dictionary['tenses']:
                # If it's in the dictionary, it must be strong simple past
                tense = self.dictionary['tenses'][m.group(1)]
                if tense == 'P' and m.group(2) == 't':
                    tense = '2+P'
                elif tense == 'P' and m.group(2) == 'st':
                    tense = '2P'
                elif tense == 'P' and (m.group(2) == 'en' or m.group(2) == 'n'):
                    tense = '1+P'  # 1st or 3rd, we don't care which
            elif m:
                # And now we're left with present or weak simple past
                if m.group(2) == 'e':
                    tense = '1'
                elif m.group(2) == 't':
                    # This could also be 2nd plural, in which case we'll get it wrong
                    tense = '3'
                elif m.group(2) == 'st':
                    tense = '2'
                elif m.group(2) == 'en':
                    tense = '1+'  # 1st or 3rd, we don't care which
                elif m.group(2) == 'ete':
                    tense = '1P'
                elif m.group(2) == 'etet':
                    # This could also be 2nd plural, in which case we'll get it wrong for weak verbs!
                    tense = '3P'
                elif m.group(2) == 'etest':
                    tense = '2P'
                elif m.group(2) == 'eten':
                    tense = '1+P'  # 1st or 3rd, we don't care which
                else:
                    raise Exception('unexpected verb ending: %s/%s' % (m.group(1), m.group(2)))
            else:
                raise Exception("verb doesn't match pattern: %s" % verb)
        return tense

    def split_compounds(self, words):
        split = []
        for word in words:
            parts = word.split('_')
            if parts[0] not in self.dictionary['words'] and parts[0].lower() not in self.dictionary['words']:
                if parts[1] == 'NN':
                    split.extend(self.split_noun(parts[0], parts[1]))
                elif parts[1] == 'APPRART':
                    split.extend(self.split_preposition(parts[0], parts[1]))
                else:
                    split.append(word)
            else:
                split.append(word)
        return split

    def split_noun(self, word, tag):
        words = []
        i = len(word)
        while i > 0:
            if word[:i] in self.dictionary['words']:
                if i < len(word):
                    rest = self.split_noun(word[i].upper() + word[i+1:], tag)
                    if rest:
                        words.append(word[0:i] + '_' + tag)
                        words.extend(rest)
                        break
                else:
                    words.append(word[0:i] + '_' + tag)
                    break
            i -= 1
        if not words:
            words.append(word + '_' + tag)
        return words

    def split_preposition(self, preposition, tag):
        split = []
        if preposition in COMPOUND_PREPOSITIONS:
            split.extend(COMPOUND_PREPOSITIONS[preposition])
        else:
            split.append(preposition + '_' + tag)
        return split

    def reorder_participles(self, words):
        # find a clause that ends with VVPP, VAPP, or VMPP
        new_words = words
        check = ["VVPP", "VAPP", "VMPP"]
        for c in check:
            if c in words[-1]:
                # find the preceding VA*
                for i, word in enumerate(words[:-1]):
                    if "VA" in word:
                        # move the last word into the position after the preceding VA*
                        new_words = words[:i+1]
                        new_words.append(words[-1])
                        new_words.extend(words[i+1:-1])
                        break
        return new_words

    def reorder_modals(self, words):
        # find a clause that ends with VVINF, VAINF, or VMINF
        new_words = words
        check = ["_VVINF", "_VAINF", "_VMINF"]
        for c in check:
            if c in words[-1]:
                # find the preceding VM*
                for i, word in enumerate(words[:-1]):
                    if "_VM" in word:
                        # move the last word into the position after the preceding VM*
                        new_words = words[:i+1]
                        new_words.append(words[-1])
                        new_words.extend(words[i+1:-1])
                        break
        return new_words

    def reorder_obj_subj(self, words):
        new_words = words
        # find the first verb
        for i, word in enumerate(words[:-2]):
            if '_V' in word:
                # We can only be certain about ich, du, and er.
                if words[i + 1] == 'ich_PPER' or words[i + 1] == 'du_PPER' or words[i + 1] == 'er_PPER':
                    new_words = [words[i + 1]] + [words[i]] + words[:i] + words[i + 2:]
                break
        return new_words

    def recombine_sep_prefixes(self, words):
        # find a clause that ends with PTKVZ
        new_words = words
        if "_PTKVZ" in words[-1]:
            # find the preceding VVFIN
            for i, word in enumerate(words[:-1]):
                if "_VVFIN" in word:
                    # merge the separated prefix back onto the preceding VVFIN
                    new_words = words[:-1]
                    new_words[i] = words[i].split('_')[0] + ' ' + words[-1].split('_')[0] + '_VVFIN'
                    # new_words = words[:i + 1]
                    # new_words.append(words[-1])
                    # new_words.extend(words[i + 1:-1])
                    break
        return new_words
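    # Worked example of the clause-reordering rules above (illustrative tokens only,
    # not taken from the project's data):
    #   reorder_participles(["er_PPER", "hat_VAFIN", "den_ART", "Ball_NN", "gesehen_VVPP"])
    #     -> ["er_PPER", "hat_VAFIN", "gesehen_VVPP", "den_ART", "Ball_NN"]
    #   recombine_sep_prefixes(["ich_PPER", "rufe_VVFIN", "dich_PPER", "an_PTKVZ"])
    #     -> ["ich_PPER", "rufe an_VVFIN", "dich_PPER"]
    # The clause-final participle is moved up behind its auxiliary, and a separated
    # verb prefix is glued back onto its finite verb, mirroring English word order.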
    def reorder_adverbs(self, words):
        # find any ADV that follows any V*
        new_words = words
        for i, word in enumerate(words):
            if "ADV" in word:
                for w in words[:i]:
                    if "_V" in w:
                        # find the preceding VV*
                        for j, wurd in enumerate(words[:i]):
                            if "VV" in wurd:
                                # move the ADV to just before the preceding VV*
                                new_words = words[:j]
                                new_words.append(words[i])
                                new_words.extend(words[j:i])
                                new_words.extend(words[i+1:])
                                break
                        break
        return new_words

    def trainLM(self):
        # open the clean text and read it in
        text = open(os.path.join(os.path.dirname(__file__), '..', 'data',
                                 'AnitaBlake01GuiltyPleasures.clean.txt')).read()
        # split the text into sentences
        sentences = re.split(r' *[.?!][\'")\]]* *[(\["]*', text)
        # skip the first 17 sentences, which are held out for dev and test
        sentences = sentences[17:]
        # tokenize the sentences into words
        for i, sentence in enumerate(sentences):
            sentences[i] = re.findall(r"[\w']+|[.,!?;]", sentence)
        # train the LM on the corpus
        self.LM = LanguageModel(sentences)

    def trainOnAllLM(self):
        # open the clean text file for each book and concatenate them
        text = ""
        books = ["AnitaBlake01GuiltyPleasures.clean.txt",
                 "AnitaBlake02LaughingCorpse.really.clean.txt",
                 "AnitaBlake03CircusOfTheDamned.really.clean.txt",
                 "AnitaBlake04LunaticCafe.really.clean.txt",
                 "AnitaBlake05BloodyBones.really.clean.txt",
                 "AnitaBlake06TheKillingDance.really.clean.txt",
                 "AnitaBlake07BurntOfferings.really.clean.txt",
                 "AnitaBlake08BlueMoon.really.clean.txt",
                 "AnitaBlake09ObsidianButterfly.really.clean.txt",
                 "AnitaBlake10NarcissusInChains.really.clean.txt",
                 "AnitaBlake11CeruleanSins.really.clean.txt",
                 "AnitaBlake12IncubusDreams.really.clean.txt",
                 "AnitaBlake16BloodNoir.really.clean.txt",
                 "AnitaBlake17SkinTrade.really.clean.txt",
                 "AnitaBlake18Flirt.really.clean.txt"]
        for book in books:
            text += open(os.path.join(os.path.dirname(__file__), '..', 'data', book)).read()
        # split the text into sentences
        sentences = re.split(r' *[.?!][\'")\]]* *[(\["]*', text)
        # skip the first 17 sentences, which are held out for dev and test
        sentences = sentences[17:]
        # tokenize the sentences into words
        for i, sentence in enumerate(sentences):
            sentences[i] = re.findall(r"[\w']+|[.,!?;]", sentence)
        # train the LM on the corpus
        self.LM = LanguageModel(sentences)

    def permutationTester(self, sentence):
        # generate all order permutations of the words in the sentence
        orig = sentence
        sentences = list(itertools.permutations(orig, len(orig)))
        # score each permutation and pick the best
        best = [self.LM.score(orig), orig]
        for sentence in sentences:
            score = self.LM.score(sentence)
            if score > best[0]:
                best = [score, sentence]
        print("\n Best Sentence:")
        print(best)
        return best[1]
    '''