import data_helpers


def getWords(data):
    """Collect all tokens from the example sentences plus their defined words."""
    words_arr = []
    for _def in data:
        # Tokenize the example sentence after normalization.
        sentence = data_helpers.preprocess_data(_def['sent'])
        words_arr += data_helpers.basic_tokenizer(sentence)
        # The defined word itself is appended as a single token.
        def_word = data_helpers.preprocess_data(_def['def_word'])
        words_arr += [def_word]
    return words_arr
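

# A minimal usage sketch, not part of the original module: the dict layout of
# `sample` ({'sent': ..., 'def_word': ...}) is inferred from getWords above,
# and running it still requires the project's data_helpers module.
def _demo_get_words():
    from collections import Counter
    sample = [{'sent': 'A cat sat on the mat.', 'def_word': 'cat'}]
    # Count token frequencies to sketch a vocabulary build.
    vocab = Counter(getWords(sample))
    return vocab.most_common(10)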


def load_data_plain(data_path):
    """Load training phrases and sentiment labels from a tab-separated file.

    Expects a header row followed by lines of the form:
    PhraseId <tab> SentenceId <tab> Phrase <tab> Sentiment
    """
    x_arr = []
    y_arr = []
    with open(data_path, 'r') as f:
        # Skip the header row.
        lines = f.readlines()[1:]
        for line in lines:
            # The file is tab-separated; splitting on spaces would cut
            # multi-word phrases apart and break the label column.
            cols = line.strip().split('\t')
            sent = data_helpers.preprocess_data(cols[2])
            x_arr.append(sent)
            y_arr.append(int(cols[3]))
    return x_arr, y_arr
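

# A minimal usage sketch, not part of the original module: 'data/train.tsv' is
# a hypothetical path, and the expected layout (header row, then tab-separated
# PhraseId, SentenceId, Phrase, Sentiment) is inferred from the parser above.
def _demo_load_train():
    x_train, y_train = load_data_plain('data/train.tsv')
    print('%d phrases, first label: %d' % (len(x_train), y_train[0]))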


def load_test_data_plain(data_path):
    """Load test phrases and their ids from a tab-separated file.

    Expects a header row followed by lines of the form:
    PhraseId <tab> SentenceId <tab> Phrase
    """
    x_arr = []
    phrase_arr = []
    with open(data_path, 'r') as f:
        # Skip the header row.
        lines = f.readlines()[1:]
        for line in lines:
            cols = line.strip().split('\t')  # the file is tab-separated
            # Some rows have an empty phrase; strip() then drops the trailing
            # tab, leaving only two columns, so fall back to "".
            sent = cols[2] if len(cols) == 3 else ""
            x_arr.append(data_helpers.preprocess_data(sent))
            phrase_arr.append(cols[0])
    return x_arr, phrase_arr
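

# A minimal usage sketch, not part of the original module: 'data/test.tsv' is
# a hypothetical path. Pairing phrases with their PhraseIds like this is how
# predictions would typically be written back out for a submission file.
def _demo_load_test():
    x_test, phrase_ids = load_test_data_plain('data/test.tsv')
    for pid, sent in zip(phrase_ids[:3], x_test[:3]):
        print(pid, sent)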