def annotate(self):
    """Suggest simpler replacements for the difficult words of a sentence.

    Reads the JSON request body from ``cherrypy.request.json``, takes its
    ``"s"`` field (empty string when absent) as the sentence and parses it
    with ``self.nlp_parse_request``.  Every token that is outside a named
    entity, is not punctuation and scores a difficulty above 7 is looked up
    in WordNet; its synonym lemmas, then its hypernym lemmas, are tried as
    replacements.

    A candidate is accepted when it differs from the original word, it (or
    its inflected form) is in ``self.easy_words``, and
    ``self.nlp_wordprob_request`` reports a log-probability higher than the
    token's own.  The first accepted candidate wins.

    Returns:
        A list of ``((start_idx, end_idx), info)`` pairs, one per token for
        which a replacement was found, e.g.
        ``((0, 7), {"raw": "foo", "difficulty": 12, "replacement": "bar",
        "definition": "blah blah", "url": "http://blah"})``.
        ``definition`` and ``url`` are placeholder values.
    """
    data = cherrypy.request.json
    sentence = data.get("s", u"")
    parse_result = self.nlp_parse_request(sentence)
    print >> sys.stderr, parse_result

    # Placeholder metadata; identical for every token.
    url = "http://blah"
    definition = "blah blah"

    response = []
    for token in parse_result:
        # Rarer words have a more negative log-probability, hence the sign flip.
        difficulty = -int(token["log_prob"])
        raw = token["as_is"]
        if raw.lower() in self.hard_words:
            difficulty += 3

        is_plain_word = token["entity_position"] == 'O' and not token["is_punct"]
        if not (is_plain_word and difficulty > 7):
            continue

        synset_words = []
        hypernym_synset_words = []
        for synset in wn.synsets(raw.lower()):
            synset_words += synset.lemma_names()
            for hypernym in synset.hypernyms():
                hypernym_synset_words += hypernym.lemma_names()
        print >> sys.stderr, "synset_words:", synset_words
        print >> sys.stderr, "hypernym_synset_words:", hypernym_synset_words
        print >> sys.stderr, "part_of_speech:", token["pos"]

        # Synonyms are listed before hypernyms and the search stops at the
        # first accepted candidate, so hypernyms are only reached when no
        # synonym qualifies.  Lemmas repeated across synsets are skipped so
        # each one is inflected and scored at most once.
        replacement = ""
        seen = set()
        for w in synset_words + hypernym_synset_words:
            if w in seen:
                continue
            seen.add(w)

            inflected_form = conjugate.find_form(raw, token["lemma"], w, token["pos"])
            print >> sys.stderr, "ALTERNATE:", w, inflected_form

            if w == raw or inflected_form == raw:
                continue
            if w not in self.easy_words and inflected_form not in self.easy_words:
                continue

            logprob = self.nlp_wordprob_request(w)
            print >> sys.stderr, "LOGPROB", logprob
            # Only accept a candidate that is more probable than the original.
            if logprob is not None and logprob > token["log_prob"]:
                replacement = inflected_form
                break

        if replacement:
            result_obj = {"raw": raw,
                          "difficulty": difficulty,
                          "replacement": replacement,
                          "definition": definition,
                          "url": url}
            response.append(((token["start_idx"], token["end_idx"]), result_obj))
    return response
def test_verb_form():
    """find_form should give the substitute verb the original verb's inflection."""
    # Each entry: (arguments passed to find_form, expected result).
    expectations = (
        ((u'singing', u'sing', u'ring', u'VERB'), u'ringing'),
        ((u'drank', u'drink', u'think', u'VERB'), u'thought'),
        ((u'drink', u'drink', u'think', u'VERB'), u'think'),
    )
    for arguments, expected in expectations:
        assert find_form(*arguments) == expected
def test_noun_form():
    """find_form should carry the original noun's number over to the substitute."""
    # NB: pattern.en pluralizes 'hippopotamus' as 'hippopotamuss', so that
    # word is only exercised in the singular here.
    expectations = (
        ((u'cats', u'cat', u'child', u'NOUN'), u'children'),
        ((u'children', u'child', u'cat', u'NOUN'), u'cats'),
        ((u'cat', u'cat', u'hippopotamus', u'NOUN'), u'hippopotamus'),
    )
    for arguments, expected in expectations:
        assert find_form(*arguments) == expected
def test_adjective_form():
    """find_form should carry comparative/superlative degree over to the substitute."""
    # Each entry: (arguments passed to find_form, expected result).
    expectations = (
        ((u'bigger', u'big', u'absolute', u'ADJ'), u'more absolute'),
        ((u'biggest', u'big', u'small', u'ADJ'), u'smallest'),
    )
    for arguments, expected in expectations:
        assert find_form(*arguments) == expected