def test_preprocess_sent():
    s = "I live in New York City"
    assert preprocess_sent(s) == ["live", "new", "york", "city"]
    s = "How old are you ?"
    assert preprocess_sent(s) == ["how", "old"]
    s = "apples, pears and oranges"
    assert preprocess_sent(s) == ["apple", "pear", "orange"]
    s = "let's run a race together!"
    assert preprocess_sent(s) == ["let", "'s", "run", "race", "together"]
def select_synonym_with_context(self, word, sentence):
    # preprocess the new sentence and get the list of tokens
    s_tok = preprocess_sent(sentence)
    if word not in s_tok:
        return []
    choices = select_relevant_contexts(word, self.syn_dict, self.context_list)
    if not choices:
        # no contexts found for the word's synonyms
        return []
    if len(choices) == 1:
        # only one potential synonym to pick --> meaningless, return the source word as well
        result = choices[0][0]
        return [result, word]
    # get the new context
    new_context = extract_context_from_sent(word, s_tok)
    # build a dict of all synonyms and their scores
    scores_d = dict()
    for row in choices:
        syn, context = row[0], row[1:]
        s = score_context(new_context, context)
        # don't keep null scores
        if s > 0:
            scores_d[syn] = s
    if not scores_d:
        # every candidate scored zero
        return []
    best = max(scores_d.values())
    result = [k for k, v in scores_d.items() if v == best]
    return result
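# A minimal, self-contained sketch of the tie-aware argmax used at the end of
# select_synonym_with_context: every synonym whose score equals the maximum is
# kept, so ties can produce more than one suggestion. The helper name and the
# scores below are illustrative only, not part of the module.
def _all_argmax(scores_d):
    if not scores_d:
        return []
    best = max(scores_d.values())
    return [k for k, v in scores_d.items() if v == best]

assert sorted(_all_argmax({"large": 2, "big": 2, "huge": 1})) == ["big", "large"]
assert _all_argmax({}) == []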
def preprocess_enron_text():
    """
    Read the enron raw data file and write the preprocessed sentences to a text file.
    """
    with open(enron_text_file) as f, open(enron_text_preprocessed, "w") as outfile:
        for index, line in enumerate(f):
            print("processing line {0}".format(index))
            tokens = preprocess_sent(line)
            outfile.write(" ".join(tokens) + "\n")
def get_sentences_for_word(word, syn_dict, pos):
    """
    Collect WordNet example sentences containing word, each tagged with the
    synonym (lemma) of the synset it came from; when a synset has no lemma in
    syn_dict[word], at most one of its examples is kept, labelled with the word
    itself. Returns [] unless more than one sentence is found.
    """
    result = []
    clear = False
    for ss in wn.synsets(word, pos=pos):
        ss_added = False
        for lemma in ss.lemma_names():
            if lemma in syn_dict[word]:
                ss_added = True
                for sent in ss.examples():
                    if word in preprocess_sent(sent):
                        row = [word, sent, lemma, pos]
                        result.append(row)
        if not ss_added and not clear:
            # keep at most one fallback example labelled with the word itself
            for sent in ss.examples():
                if word in preprocess_sent(sent):
                    clear = True
                    result.append([word, sent, word, pos])
                    break
    if len(result) > 1:
        return result
    else:
        return []
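# Usage sketch for get_sentences_for_word (assumptions: wn is nltk.corpus.wordnet,
# as the synsets/lemma_names/examples calls suggest, and preprocess_sent is
# importable from this module; the synonym dictionary below is made up).
from nltk.corpus import wordnet as wn

example_syn_dict = {"run": ["run", "race", "rush", "operate"]}
rows = get_sentences_for_word("run", example_syn_dict, wn.VERB)
# each row is [word, example_sentence, matching_lemma, pos]
for row in rows:
    print(row)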
def select_synonym_with_vectors(word, sentence, syn_dict, pos):
    s_tok = preprocess_sent(sentence)
    if word not in s_tok:
        return
    context_v = nlp(u" ".join(extract_context_from_sent(word, s_tok)))
    scores_d = dict()
    for syn in syn_dict[word]:
        scores_d[syn] = context_v.similarity(nlp(syn))
    result = max(scores_d, key=scores_d.get)
    # suggest a synonym only if its score is high enough
    if scores_d[result] > 0.3:
        return result
    else:
        return
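# Usage sketch for select_synonym_with_vectors (assumptions: nlp is a spaCy model
# with word vectors, e.g. en_core_web_md, as the similarity() call suggests, and
# preprocess_sent / extract_context_from_sent come from this module; the synonym
# dictionary and the "a" POS tag below are illustrative).
example_syn_dict = {"big": ["large", "huge", "enormous"]}
suggestion = select_synonym_with_vectors(
    "big", "they bought a big house near the park", example_syn_dict, "a"
)
# suggestion is the synonym whose vector is closest to the context words,
# or None when the best similarity does not exceed the 0.3 threshold
print(suggestion)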