Example #1
    def __call__(self, text):
        # Tokenize and lemmatize the input text, then predict a BIO tag for every token
        tokens = list(self.tokenizer(text))
        tokens_lemmas = lemmatize([t.text for t in tokens], self._morph)
        tags = self.network.predict_for_token_batch([tokens_lemmas])[0]

        previous_tag = null_tag = 'O'
        previous_tokens = []

        # Merge consecutive B-/I- tags into entity spans; the artificial trailing
        # (None, 'O') pair flushes an entity that ends on the last token
        for token, current_tag in zip(
                itertools.chain(tokens, [None]),
                itertools.chain(tags, [null_tag])
        ):
            if current_tag.startswith('I'):
                previous_tokens.append(token)
            elif previous_tag != null_tag:
                yield Match(
                    previous_tokens,
                    Span(
                        previous_tokens[0].span[0],
                        previous_tokens[-1].span[1],
                    ),
                    previous_tag[-3:]
                )
            if current_tag.startswith('B'):
                previous_tokens = [token]
            previous_tag = current_tag
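The loop above is a standard BIO merge: a B- tag opens an entity, following I- tags extend it, and the next O/B- tag (or the artificial trailing pair) flushes it as a Match. A minimal self-contained sketch of the same idea, using plain tuples instead of the Token/Span/Match objects assumed above:

def merge_bio(tokens, tags):
    # Collect consecutive B-/I- tokens into (entity_type, tokens) pairs
    entity, entity_tag = [], 'O'
    for token, tag in zip(tokens + [None], tags + ['O']):
        if tag.startswith('I'):
            entity.append(token)
        elif entity_tag != 'O':
            yield entity_tag[2:], entity
            entity = []
        if tag.startswith('B'):
            entity = [token]
        entity_tag = tag

print(list(merge_bio(['Иван', 'живет', 'в', 'Москве'],
                     ['B-PER', 'O', 'O', 'B-LOC'])))
# [('PER', ['Иван']), ('LOC', ['Москве'])]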
Example #2
def print_predict(sentence):
    # Split sentence into tokens
    tokens = tokenize(sentence)

    # Lemmatize every token
    # Example: был -> быть, его -> он
    tokens_lemmas = lemmatize(tokens)

    tags = network.predict_for_token_batch([tokens_lemmas])[0]
    for token, tag in zip(tokens, tags):
        print(token, tag)
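The lemmatize helper here is project-specific and is not shown on this page; for Russian text it is commonly built on top of pymorphy2. A hypothetical minimal version (the actual implementation may differ):

import pymorphy2

_morph = pymorphy2.MorphAnalyzer()

def lemmatize(tokens):
    # Take the most probable parse of each token and return its normal form,
    # e.g. 'был' -> 'быть'
    return [_morph.parse(token)[0].normal_form for token in tokens]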
Example #3
def print_predict(sentence, network, f=sys.stdout, threshold=0.2):
    # Split sentence into tokens
    tokens = tokenize(sentence)

    # Lemmatize every token
    tokens_lemmas = lemmatize(tokens)

    tags, logits = network.predict_for_token_batch([tokens_lemmas])
    tags, logits = tags[0], logits[0]
    o_idx = network.corpus.tag_dict.toks2idxs(['O'])

    predicted_tags = []
    last_number = None
    last_tk = None
    for token, tag, l in zip(tokens, tags, logits):
        if is_number(token.lower()):
            last_number = normalize_num(token)
        second_best = np.argsort(l)[-2]
        third_best = np.argsort(l)[-3]
        if tag == 'O':
            # Fall back to the second-best tag when the model is not confident in 'O'
            ratio = l[second_best] / l[third_best]
            if (ratio * l[second_best] > threshold
                    and token not in '.?,\':!'
                    and token not in stopwords.words('english')):
                tag = network.corpus.tag_dict.idxs2toks([second_best])[0]
        elif tag.startswith('B') and (token in stopwords.words('english')
                                      or token in '.?,\':!'):
            # Do not start an entity on a stopword or punctuation
            tag = 'O'

        # print(token, tag, file=f)
        if (tag.startswith('B') and token not in stopwords.words('english')
                and token not in '.?,\':!'):
            if 'calendric' in tag and 'night' in token and last_number is not None:
                predicted_tags.append(('nights', last_number))
                last_number = None
            elif 'people' in tag and last_number is not None:
                predicted_tags.append(('people', last_number))
                last_number = None
            elif 'performers' in tag and last_number is not None:
                predicted_tags.append(('stars', last_number))
                last_number = None
            elif 'gpe' in tag:
                if last_tk == 'to':
                    predicted_tags.append(('toloc.city_name', token))
                else:
                    predicted_tags.append(('fromloc.city_name', token))
            else:
                predicted_tags.append((tag[2:], token))
        last_tk = token
    return predicted_tags
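The 'O' branch above replaces a low-confidence 'O' prediction with the second most probable tag when the ratio of the second- to third-best scores is large enough. The selection itself is plain numpy, roughly (hypothetical scores for three tags):

import numpy as np

l = np.array([0.50, 0.35, 0.15])   # scores for ['O', 'B-gpe', 'B-people']

order = np.argsort(l)              # tag indices from worst to best
second_best, third_best = order[-2], order[-3]
ratio = l[second_best] / l[third_best]
if ratio * l[second_best] > 0.2:   # same form of threshold check as above
    print('fall back to tag index', second_best)   # -> 1, i.e. 'B-gpe'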
Example #4
    def _preprocess_task(self, task):
        # tokens = tokenize(task)
        tokens = task.split(' ')
        tokens_lemmas = lemmatize(tokens)
        return tokens_lemmas
Example #5
    def _preprocess_humaninput(self, task):
        tokens = nltk.tokenize.wordpunct_tokenize(task)
        tokens_lemmas = lemmatize(tokens)
        return tokens_lemmas
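The two preprocessing variants differ only in tokenization: task.split(' ') leaves punctuation attached to words, while nltk.tokenize.wordpunct_tokenize splits it off before lemmatization. For example:

from nltk.tokenize import wordpunct_tokenize

print("two nights, please".split(' '))
# ['two', 'nights,', 'please']
print(wordpunct_tokenize("two nights, please"))
# ['two', 'nights', ',', 'please']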