示例#1
0
def prepare_ngrams_in_text(text: str, n: int) \
        -> Generator[Tuple[List[int], List[str], int, int], None, None]:
    """Yield one ``(features, word, word_start_pos, word_end_pos)`` tuple per token.

    The text is tokenized, the tokens are aligned back to character offsets in
    *text*, and POS-tagged.  Each yielded feature vector is the concatenation
    of the per-word feature vectors of the tokens at indices
    ``[i - n, i + n - 1)``; positions falling outside the token list are
    padded with ``address_features.ZERO_FEATURES``.

    :param text: raw input text.
    :param n: window radius used to select neighbouring tokens.
    :return: generator of (features, word, word_start_pos, word_end_pos).
    """
    tokens = TOKENIZER.tokenize(text)
    token_spans = align_tokens(tokens, text)
    tagged_words = nltk.pos_tag(tokens)

    # Per-token records: (surface word, (start, end), POS tag, feature vector).
    # The surface form is re-sliced from `text` rather than taken from the
    # token list, because tokenizers may normalize tokens (e.g. quote marks).
    words2 = []
    for (start, end), (_token, pos) in zip(token_spans, tagged_words):
        word = text[start:end]
        words2.append((word, (start, end), pos, address_features.get_word_features(word, pos)))

    for i, (word, (word_start_pos, word_end_pos), _pos, _own_features) in enumerate(words2):
        features: List[int] = []
        # NOTE(review): this window is asymmetric — it spans n tokens to the
        # left of i but only n - 2 to the right, and for n == 1 it excludes
        # token i itself.  Preserved as-is; confirm the intent with the model
        # that consumes these features before changing it.
        for j in range(i - n, i + n - 1):
            if 0 <= j < len(words2):
                features.extend(words2[j][3])
            else:
                features.extend(address_features.ZERO_FEATURES)
        yield features, word, word_start_pos, word_end_pos
示例#2
0
def prepare_ngrams_in_text(text: str, window_half_width: int, window_step: int) \
        -> Generator[Tuple[List[int], List[str], int, int], None, None]:
    """Yield ``(features, word, word_start_pos, word_end_pos)`` for every
    ``window_step``-th token of *text*.

    Each yielded feature vector is the concatenation of the per-word feature
    vectors of the tokens at indices ``[i - window_half_width,
    i + window_half_width)``; positions falling outside the token list are
    padded with ``address_features.ZERO_FEATURES``.

    :param text: raw input text.
    :param window_half_width: number of neighbouring tokens taken on the left
        (and, less one, on the right) of the current token.
    :param window_step: stride between consecutive yielded tokens; must be
        positive (a zero step now raises ``ValueError`` instead of looping
        forever).
    :return: generator of (features, word, word_start_pos, word_end_pos).
    """
    # Per-token records: (word, POS token, start, exclusive end, feature vector).
    words2 = []
    for word, pos_token, word_start_pos, word_end_pos in TOKENIZER.get_token_spans(text):
        features = address_features.get_word_features(word, pos_token)
        # The tokenizer reports an inclusive end offset; store the exclusive
        # one so that text[word_start_pos:word_end_pos] == word.
        words2.append((word, pos_token, word_start_pos, word_end_pos + 1, features))

    for i in range(0, len(words2), window_step):
        # Bug fix: the original unpacked `features` from words2[i] and then
        # immediately overwrote it with an empty list — a dead, misleading
        # binding.  The token's own features are only pulled in (if at all)
        # via the window loop below, like every other position.
        word, _pos_token, word_start_pos, word_end_pos = words2[i][:4]
        features: List[int] = []
        # NOTE(review): the window [i - w, i + w) is asymmetric — it includes
        # the token at i - w but not the one at i + w.  Preserved as-is;
        # confirm the intent before changing it.
        for j in range(i - window_half_width, i + window_half_width):
            if 0 <= j < len(words2):
                features.extend(words2[j][4])
            else:
                features.extend(address_features.ZERO_FEATURES)
        yield features, word, word_start_pos, word_end_pos