Example #1
import data_helpers

def getWords(data):
    """Collect every token from the example sentences and their defined words."""
    words_arr = []
    for _def in data:
        # Clean and tokenize the example sentence.
        sentence = _def['sent']
        sentence = data_helpers.preprocess_data(sentence)
        words_arr += data_helpers.basic_tokenizer(sentence)

        # Clean the defined word and append it as a single token.
        def_word = _def['def_word']
        def_word = data_helpers.preprocess_data(def_word)
        words_arr += [def_word]

    return words_arr
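
A minimal usage sketch, assuming each entry of `data` is a dict with 'sent' and 'def_word' keys and that `data_helpers` provides the `preprocess_data` and `basic_tokenizer` functions used above; the sample entries are hypothetical, not taken from the real dataset:

# Hypothetical input records for illustration only.
data = [
    {'sent': 'A cat sat on the mat.', 'def_word': 'cat'},
    {'sent': 'Dogs bark loudly.', 'def_word': 'dog'},
]
words = getWords(data)
print(len(words), 'tokens collected')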
Example #2
import data_helpers

def prepare_sentence_tokens(sentences):
    """Tokenize each sentence and record the length of the longest token sequence."""
    print('Tokenizing sentences...')
    in_sent_arr = []
    in_token_arr = []

    max_seq_len = 0

    for line in sentences:
        tokenized_in = data_helpers.basic_tokenizer(line)

        # Track the longest tokenized sentence seen so far.
        if len(tokenized_in) > max_seq_len:
            max_seq_len = len(tokenized_in)

        in_sent_arr.append(line)
        in_token_arr.append(tokenized_in)

    print('Done tokenizing')
    print('max_seq_len:', max_seq_len)

    return in_sent_arr, in_token_arr, max_seq_len
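
A minimal usage sketch, assuming `sentences` is an iterable of raw strings and that `data_helpers.basic_tokenizer` splits a sentence into a list of tokens; the input lines here are hypothetical placeholders:

# Hypothetical sentences; in practice these come from the dataset loader.
sentences = ['the quick brown fox', 'jumps over the lazy dog today']
sent_arr, token_arr, max_len = prepare_sentence_tokens(sentences)
print(max_len)  # length of the longest tokenized sentence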