# Assumed to be provided by the surrounding quepy modules (not shown in
# this excerpt): `Word`, `TaggingError`, `assert_valid_encoding` and
# `FREELING_FUNCTION_OUTPUT_REGEX`.
import sys

import nltk

_penn_to_morphy_tag = {}


def run_nltktagger(string, nltk_data_path=None):
    """
    Runs nltk tagger on `string` and returns a list of
    :class:`quepy.freeling.Word` objects.
    """
    assert_valid_encoding(string)
    global _penn_to_morphy_tag

    if nltk_data_path:
        nltk.data.path = nltk_data_path

    from nltk.corpus import wordnet

    if not _penn_to_morphy_tag:
        _penn_to_morphy_tag = {
            u'NN': wordnet.NOUN,
            u'JJ': wordnet.ADJ,
            u'VB': wordnet.VERB,
            u'RB': wordnet.ADV,
        }

    # The recommended tokenizer doesn't handle non-ascii characters very well.
    # tokens = nltk.word_tokenize(string)
    tokens = nltk.wordpunct_tokenize(string)
    tags = nltk.pos_tag(tokens)

    words = []
    for token, pos in tags:
        word = Word(token)
        # Eliminate multi-tags like "JJ|CC", keeping only the first tag.
        # On Python 2, decode to unicode; Penn-like POS tags are plain ascii.
        if sys.version_info[0] == 3:
            word.pos = pos.split("|")[0]
        else:
            word.pos = pos.split("|")[0].decode("ascii")

        mtag = penn_to_morphy_tag(word.pos)
        # Nice shooting, son. What's your name?
        lemma = wordnet.morphy(word.token, pos=mtag)
        if isinstance(lemma, str) and sys.version_info[0] == 2:
            # A `str` lemma here is example-based (looked up, not
            # rule-derived): a rule-based result would already be unicode,
            # since the input was unicode. English is ascii, so the
            # decode is safe.
            lemma = lemma.decode("ascii")
        word.lemma = lemma
        if word.lemma is None:
            word.lemma = word.token.lower()

        words.append(word)
    return words
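
# `penn_to_morphy_tag` is called above but not defined in this excerpt.
# A minimal sketch of it, assuming it maps a Penn Treebank tag to the
# wordnet constant whose Penn prefix it matches; this implementation is
# an assumption, not necessarily the original.
def penn_to_morphy_tag(tag):
    # e.g. u"NNS" starts with u"NN", so it maps to wordnet.NOUN; unknown
    # tags yield None, which `wordnet.morphy` accepts as "try every part
    # of speech".
    for penn, morphy in _penn_to_morphy_tag.items():
        if tag.startswith(penn):
            return morphy
    return None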
def _read_line(text):
    """
    Parses a line of the freeling command line output.
    """
    assert_valid_encoding(text)
    assert u"#" in text

    start, text = text.split(u"#", 1)
    start = start.strip().rsplit(u" ", 1)[0]
    text = text.strip()

    token_has_spaces = False
    if start.count(u" ") > 2:
        token = FREELING_FUNCTION_OUTPUT_REGEX.match(start)
        assert token is not None
        token = token.group()
        token_has_spaces = True
    else:
        token = start.split(u" ")[0]

    if token_has_spaces:
        text = text.replace(token, u"<token>")

    # After the "#" the line is a flat sequence of 4-field analyses:
    # lemma, POS tag, probability, sense.
    text = text.split(u" ")
    assert len(text) % 4 == 0

    best_word = None
    while text:
        word = Word(token)
        word.sense = text.pop()
        try:
            word.prob = float(text.pop())
        except ValueError:
            raise TaggingError(u"The probability field of a"
                               u" word was non-numerical")
        if word.prob < 0 or word.prob > 1:
            raise TaggingError(u"The probability field of a"
                               u" word was not a probability")
        word.pos = text.pop()
        word.lemma = text.pop()

        if word.pos in (u"NNP", u"MR"):
            word.token = word.token.replace(u"_", u" ")
        if word.token == u"?" and word.pos == u"Fit":
            word.pos = u"."

        if not best_word or word.prob > best_word.prob:
            best_word = word

    return best_word
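
# A hypothetical usage sketch for `_read_line`. The sample line follows the
# "token extra # lemma pos prob sense" layout implied by the parser above,
# with one four-field analysis after the "#"; it is illustrative input, not
# captured freeling output.
if __name__ == "__main__":
    word = _read_line(u"dogs dogs # dog NNS 1 02084071-n")
    # Expected: token u"dogs", lemma u"dog", pos u"NNS", prob 1.0
    print("%s %s %s %s" % (word.token, word.lemma, word.pos, word.prob))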