def test_preprocess_sent():
	s = "I live in New York City"
	assert preprocess_sent(s) == ["live", "new", "york", "city"]
	s = "How old are you ?"
	assert preprocess_sent(s) == ["how", "old"]
	s = "apples, pears and oranges"
	assert preprocess_sent(s) == ["apple", "pear", "orange"]
	s = "let's run a race together!"
	assert preprocess_sent(s) == ["let", "'s", "run", "race", "together"]
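For reference, a rough sketch of the preprocessing behaviour these assertions imply: tokenise, lowercase, drop punctuation and stopwords, then lemmatise. The tokenizer, stopword list, and lemmatizer below are assumptions (NLTK-based), not the project's actual preprocess_sent, and may not reproduce the exact token lists above.

# Sketch only: an NLTK-based approximation of the behaviour the tests describe.
import string
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

_punct = set(string.punctuation)
_stopwords = {"i", "in", "are", "you", "and", "a"}  # assumed subset, inferred from the tests
_lemmatizer = WordNetLemmatizer()

def preprocess_sent_sketch(sentence):
	tokens = word_tokenize(sentence.lower())
	tokens = [t for t in tokens if t not in _punct and t not in _stopwords]
	return [_lemmatizer.lemmatize(t) for t in tokens]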
Example #2
    def select_synonym_with_context(self, word, sentence):
        # preprocess new sentence and get list of tokens
        s_tok = preprocess_sent(sentence)
        if word not in s_tok:
            return []

        choices = select_relevant_contexts(word, self.syn_dict,
                                           self.context_list)
        if not choices:
            # no stored contexts found for any of the word's synonyms
            return []
        if len(choices) == 1:
            # only one candidate synonym, so scoring is pointless: return it along with the source word
            result = choices[0][0]
            return [result, word]
        # get new context
        new_context = extract_context_from_sent(word, s_tok)
        # build a dict mapping each synonym to its score against the new context
        scores_d = dict()
        for row in choices:
            syn, context = row[0], row[1:]
            s = score_context(new_context, context)
            # discard null scores
            if s > 0:
                scores_d[syn] = s
        if not scores_d:
            # every candidate scored zero, so there is nothing to suggest
            return []
        # return all synonyms tied for the best score
        best = max(scores_d.values())
        return [syn for syn, score in scores_d.items() if score == best]
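The scoring step above relies on two helpers not shown in this listing. Below is a minimal sketch of one possible shape, assuming the context is a small token window around the target word and the score is simple token overlap; the actual extract_context_from_sent and score_context may work differently.

# Sketch only: window-based context extraction and overlap scoring (assumptions).
def extract_context_from_sent_sketch(word, tokens, window=2):
    # take up to `window` preprocessed tokens on each side of the target word
    i = tokens.index(word)
    return tokens[max(0, i - window):i] + tokens[i + 1:i + 1 + window]

def score_context_sketch(new_context, stored_context):
    # count how many tokens the new context shares with a stored context
    return len(set(new_context) & set(stored_context))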
def preprocess_enron_text():
	""" reads the Enron raw data file and writes preprocessed sentences to a text file """
	with open(enron_text_preprocessed, "w+") as outfile, open(enron_text_file) as f:
		for index, line in enumerate(f):
			print("processing line {0}".format(index))
			tokens = preprocess_sent(line)
			outfile.write(" ".join(tokens) + "\n")
Example #4
from nltk.corpus import wordnet as wn

def get_sentences_for_word(word, syn_dict, pos):
    """ collects WordNet example sentences containing `word`, each paired with the matching synonym lemma """
    result = []
    clear = False
    for ss in wn.synsets(word, pos=pos):
        ss_added = False
        # prefer lemmas that belong to the word's known synonym list
        for lemma in ss.lemma_names():
            if lemma in syn_dict[word]:
                ss_added = True
                for sent in ss.examples():
                    if word in preprocess_sent(sent):
                        row = [word, sent, lemma, pos]
                        result.append(row)
        # otherwise keep at most one example sentence, labelled with the word itself
        if not ss_added and not clear:
            for sent in ss.examples():
                if word in preprocess_sent(sent):
                    clear = True
                    result.append([word, sent, word, pos])
                    break
    # only useful when there is more than one sentence to compare
    if len(result) > 1:
        return result
    else:
        return []
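A hypothetical call, assuming syn_dict maps each word to a collection of accepted synonym lemmas (its exact structure is inferred from the membership test above):

# Hypothetical usage; "car" and its synonym set are made-up illustration values.
from nltk.corpus import wordnet as wn

syn_dict = {"car": {"auto", "automobile", "machine"}}
for word, sent, lemma, pos in get_sentences_for_word("car", syn_dict, wn.NOUN):
    print(word, "|", lemma, "|", sent)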
def select_synonym_with_vectors(word, sentence, syn_dict, pos):
	# nlp: spaCy language model with word vectors, loaded at module level
	s_tok = preprocess_sent(sentence)
	if word not in s_tok:
		return
	# embed the context window around the word
	context_v = nlp(u" ".join(extract_context_from_sent(word, s_tok)))
	scores_d = dict()
	for syn in syn_dict[word]:
		scores_d[syn] = context_v.similarity(nlp(syn))
	if not scores_d:
		return
	result = max(scores_d, key=scores_d.get)
	# suggest the best synonym only if its score is high enough
	if scores_d[result] > 0.3:
		return result
	else:
		return
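A hypothetical call, assuming nlp is a spaCy model with word vectors (the model name here is an assumption) and that syn_dict and pos follow the same conventions as above:

# Hypothetical usage; the model name, sentence, and synonym list are illustration values.
import spacy

nlp = spacy.load("en_core_web_md")
syn_dict = {"car": ["automobile", "vehicle", "bicycle"]}
suggestion = select_synonym_with_vectors("car", "I parked my car outside", syn_dict, "n")
print(suggestion)  # a synonym string if its similarity exceeds 0.3, otherwise None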