Пример #1
0
 def _load_data(self):
     """Set up the network, reader and suffix table for POS tagging.

     Reads the 'pos' metadata and the network returned by
     ``load_network()``, builds a ``POSReader`` around the loaded word
     dictionary, copies the converter's padding values onto the network
     and rebuilds the class-level ``Suffix`` code table.
     """
     md = Metadata.load_from_file('pos')
     network, word_dict, suff = load_network()
     self.nn = network

     # Reader shares the vocabulary that came with the network.
     self.reader = reader = POSReader()
     reader.word_dict = word_dict
     reader.create_converter(md)
     self.itd = reader.get_inverse_tag_dictionary()

     # Padding values come from the converter; the pre/pos padding arrays
     # simply repeat the corresponding value twice.
     converter = reader.converter
     network.padding_left = converter.get_padding_left()
     network.padding_right = converter.get_padding_right()
     network.pre_padding = np.array([network.padding_left, network.padding_left])
     network.pos_padding = np.array([network.padding_right, network.padding_right])

     # Map each suffix to its position in the loaded suffix sequence.
     Suffix.codes = {suffix: index for index, suffix in enumerate(suff)}
     # 'NOSUFFIX' must be present in the loaded suffixes (KeyError otherwise).
     Suffix.other = Suffix.codes['NOSUFFIX']
Пример #2
0
class SennaPOSTagger(Tagger):
    """A POSTagger loads the models and performs POS tagging on text."""

    def _load_data(self):
        """Loads data for POS from SENNA dump.

        Reads the 'pos' metadata and the network returned by
        ``load_network()``, builds a ``POSReader`` around the loaded word
        dictionary, copies the converter's padding values onto the network
        and rebuilds the class-level ``Suffix`` code table.
        """
        md = Metadata.load_from_file('pos')
        self.nn, word_dict, suff = load_network()
        self.reader = POSReader()
        self.reader.word_dict = word_dict
        self.reader.create_converter(md)
        self.itd = self.reader.get_inverse_tag_dictionary()
        self.nn.padding_left = self.reader.converter.get_padding_left()
        self.nn.padding_right = self.reader.converter.get_padding_right()
        # The pre/pos padding arrays repeat the corresponding value twice.
        self.nn.pre_padding = np.array([self.nn.padding_left] * 2)
        self.nn.pos_padding = np.array([self.nn.padding_right] * 2)
        # Map each suffix to its position in the loaded suffix sequence.
        Suffix.codes = {s: i for i, s in enumerate(suff)}
        # 'NOSUFFIX' must be present in the loaded suffixes (KeyError otherwise).
        Suffix.other = Suffix.codes['NOSUFFIX']

    def tag(self, text=None):
        """
        Tags the given text.

        :param text: a string or unicode object. Strings assumed to be utf-8.
            When empty or None, tokens are read from standard input instead:
            one token per line (only the first whitespace-separated column
            is used), with blank lines separating sentences.
        :returns: a list of lists (sentences with tokens).
            Each sentence has (token, tag) tuples.
        """
        if text:
            # list(zip(...)) keeps the documented list-of-tuples shape even
            # where zip returns an iterator.
            return [list(zip(sent, self.tag_tokens(sent)))
                    for sent in utils.tokenize(text, clean=False)]

        # read tsv from stdin
        result = []
        sent = []
        for line in sys.stdin:
            # Only decode when the stream yields byte strings; text streams
            # already deliver decoded lines.
            if isinstance(line, bytes):
                line = line.decode('utf-8')
            line = line.strip()
            if line:
                sent.append(line.split()[0])
            elif sent:
                # Blank line closes the current sentence. Repeated or
                # leading blank lines are ignored rather than tagged as
                # empty sentences.
                result.append(list(zip(sent, self.tag_tokens(sent))))
                sent = []

        # Input may end without a trailing blank line; do not drop the
        # final sentence in that case.
        if sent:
            result.append(list(zip(sent, self.tag_tokens(sent))))

        return result

    def tag_tokens(self, tokens):
        """
        Tags a given list of tokens.

        Tokens should be produced with the nlpnet tokenizer in order to
        match the entries in the vocabulary. If you have non-tokenized text,
        use SennaPOSTagger.tag(text).

        :param tokens: a list of strings
        :returns: a list of strings (the tags)
        """
        converter = self.reader.converter
        # Tokens are deliberately passed to the converter as-is, without
        # utils.clean_text (Attardi).
        converted_tokens = converter.convert(tokens)
        answer = self.nn.tag_sentence(converted_tokens)
        # Translate the network's output codes back to tag names.
        return [self.itd[tag] for tag in answer]