import data_helpers


def getWords(data):
    """Collect tokens from every example sentence plus each defined word."""
    words_arr = []
    for _def in data:
        # Preprocess and tokenize the example sentence.
        sentence = data_helpers.preprocess_data(_def['sent'])
        words_arr += data_helpers.basic_tokenizer(sentence)
        # Append the preprocessed defined word itself.
        def_word = data_helpers.preprocess_data(_def['def_word'])
        words_arr += [def_word]
    return words_arr
def prepare_sentence_tokens(sentences):
    """Tokenize each sentence and track the longest token sequence."""
    print('Tokenizing sentences...')
    in_sent_arr = []
    in_token_arr = []
    max_seq_len = 0
    for line in sentences:
        tokenized_in = data_helpers.basic_tokenizer(line)
        if len(tokenized_in) > max_seq_len:
            max_seq_len = len(tokenized_in)
        in_sent_arr.append(line)
        in_token_arr.append(tokenized_in)
    print(f'Done tokenizing; max_seq_len = {max_seq_len}')
    return in_sent_arr, in_token_arr, max_seq_len
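
# Minimal usage sketch. Assumptions: `data` is a list of dicts with 'sent' and
# 'def_word' keys (as read above), and `data_helpers` exposes the
# preprocess_data / basic_tokenizer functions already referenced. The sample
# records below are hypothetical and only illustrate the expected shape.
if __name__ == '__main__':
    sample_data = [
        {'def_word': 'gravity', 'sent': 'Gravity pulls objects toward the earth.'},
        {'def_word': 'photosynthesis', 'sent': 'Plants convert light into energy.'},
    ]
    vocab = getWords(sample_data)
    sents, tokens, max_len = prepare_sentence_tokens([d['sent'] for d in sample_data])
    print(len(vocab), len(sents), max_len)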