def _load_data(self):
    """Loads data for POS from SENNA dump"""
    metadata = Metadata.load_from_file('pos')
    self.nn, word_dict, suffixes = load_network()

    # Build the reader and its converter from the dumped vocabulary.
    reader = POSReader()
    reader.word_dict = word_dict
    reader.create_converter(metadata)
    self.reader = reader
    self.itd = reader.get_inverse_tag_dictionary()

    # Sentence padding: two copies of the converter's padding on each side.
    self.nn.padding_left = reader.converter.get_padding_left()
    self.nn.padding_right = reader.converter.get_padding_right()
    self.nn.pre_padding = np.array([self.nn.padding_left] * 2)
    self.nn.pos_padding = np.array([self.nn.padding_right] * 2)

    # Rebuild the suffix code table so it matches the dumped model.
    Suffix.codes = dict((s, i) for i, s in enumerate(suffixes))
    Suffix.other = Suffix.codes['NOSUFFIX']
class SennaPOSTagger(Tagger):
    """A POSTagger loads the models and performs POS tagging on text."""

    def _load_data(self):
        """Loads data for POS from SENNA dump"""
        md = Metadata.load_from_file('pos')
        self.nn, word_dict, suff = load_network()
        self.reader = POSReader()
        self.reader.word_dict = word_dict
        self.reader.create_converter(md)
        self.itd = self.reader.get_inverse_tag_dictionary()
        # Sentence padding: two copies of the converter's padding per side.
        self.nn.padding_left = self.reader.converter.get_padding_left()
        self.nn.padding_right = self.reader.converter.get_padding_right()
        self.nn.pre_padding = np.array([self.nn.padding_left] * 2)
        self.nn.pos_padding = np.array([self.nn.padding_right] * 2)
        # Rebuild the suffix code table so it matches the dumped model.
        Suffix.codes = {}
        for i, s in enumerate(suff):
            Suffix.codes[s] = i
        Suffix.other = Suffix.codes['NOSUFFIX']

    def tag(self, text=None):
        """
        Tags the given text. If no text is given, reads TSV-formatted
        sentences from stdin (one token per line, blank line between
        sentences; only the first column of each line is used).

        :param text: a string or unicode object. Strings assumed to be utf-8
        :returns: a list of lists (sentences with tokens).
            Each sentence has (token, tag) tuples.
        """
        result = []
        if text:
            tokens = utils.tokenize(text, clean=False)
            for sent in tokens:
                tags = self.tag_tokens(sent)
                result.append(zip(sent, tags))
        else:
            # read tsv from stdin
            sent = []
            for line in sys.stdin:
                line = line.decode('utf-8').strip()
                if line:
                    sent.append(line.split()[0])
                else:
                    # blank line terminates a sentence
                    tags = self.tag_tokens(sent)
                    result.append(zip(sent, tags))
                    sent = []
            # BUG FIX: flush the last sentence when the input does not end
            # with a blank line; previously it was silently dropped.
            if sent:
                tags = self.tag_tokens(sent)
                result.append(zip(sent, tags))
        return result

    def tag_tokens(self, tokens):
        """
        Tags a given list of tokens.

        Tokens should be produced with the nlpnet tokenizer in order to
        match the entries in the vocabulary. If you have non-tokenized text,
        use POSTagger.tag(text).

        :param tokens: a list of strings
        :returns: a list of strings (the tags)
        """
        converter = self.reader.converter
        # do not use clean_text. Attardi
        converted_tokens = converter.convert(tokens)
        answer = self.nn.tag_sentence(converted_tokens)
        tags = [self.itd[tag] for tag in answer]
        return tags