def load_test_data(self, test_file, max_length, vocabulary=None, config=None):
    """Load a test file and encode it with a previously saved vocabulary.

    Args:
        test_file: path to the test text, one sample per line. When
            ``config`` is given, each line is "<label> <> <sentence>".
        max_length: sentences are truncated (and later padded) to this
            many tokens.
        vocabulary: path to a pickled word->index dict (presumably the
            one built at training time — confirm against the caller).
        config: optional path to a label->id mapping file; when None the
            input is treated as unlabeled.

    Returns:
        (x, x_text_len, contents, labels, y_text): x is the padded index
        matrix, x_text_len the truncated sentence lengths, contents the
        raw lines, labels the raw label strings (empty when config is
        None), y_text the numeric label array or None.
    """
    contents = util.read_txt(test_file)
    labels = []
    x_text = []
    x_text_len = []
    y_text = None

    if config is None:
        # Unlabeled input: each line is just a sentence.
        for line in contents:
            tokens = line.split()[:max_length]
            x_text.append(tokens)
            x_text_len.append(len(tokens))
    else:
        # Labeled input: "<label> <> <sentence>" per line.
        label_dict = util.read_txt_to_dict(config)
        y = []
        for line in contents:
            parts = line.split(' <> ')
            tokens = parts[1].split()[:max_length]
            x_text.append(tokens)
            x_text_len.append(len(tokens))
            labels.append(parts[0])
            y.append(label_dict[parts[0].strip()])
        y_text = np.array(y)

    sentences_padded = util.pad_sentences(x_text, max_length)
    vocabulary = util.read_pickle(vocabulary)
    # Out-of-vocabulary words fall back to index 0.
    x = np.array([[vocabulary.get(word, 0) for word in sentence]
                  for sentence in sentences_padded])
    return x, np.array(x_text_len), contents, labels, y_text
def get_voc_idx(self, ques, rela):
    """Pad question/relation sentences and convert words to vocab indices.

    Args:
        ques: list of question sentences (each a list of tokens).
        rela: list of lists of relation sentences — one sentence list
            per question (shape inferred from the nested mapping; confirm
            against callers).

    Returns:
        (question_indices, relation_indices): the padded inputs with each
        word replaced by its index in ``self.word_dict``; words not in
        the vocabulary map to the index of the "unk" token.
    """
    # Pad every sentence to the fixed length self.max_sent_len.
    # NOTE: the original used Python 2 idioms (dict.has_key and list-returning
    # map); rewritten with `in` tests and list comprehensions so the same
    # list-of-lists values are produced on Python 3.
    pad = lambda sent: util.pad_sentences(sent, self.max_sent_len)
    self.ques_pad = [pad(q) for q in ques]
    self.rela_pad = [[pad(r) for r in rel] for rel in rela]

    # Look up "unk" lazily (only when a word is actually missing), matching
    # the original conditional-expression behavior.
    to_idx = lambda words: [
        self.word_dict[w] if w in self.word_dict else self.word_dict["unk"]
        for w in words
    ]
    ques_idx = [to_idx(sent) for sent in self.ques_pad]
    rela_idx = [[to_idx(sent) for sent in sent_list]
                for sent_list in self.rela_pad]
    return ques_idx, rela_idx
def load_data(self, train_file, config, max_length, vocabulary=None):
    """Load a training file and turn it into padded, index-encoded inputs.

    Returns a list [x, x_len, y, vocabulary, vocabulary_inv, n_class];
    when a vocabulary is supplied by the caller, vocabulary_inv is None.
    """
    sents, lengths, labels, n_class = self.load_data_and_labels(
        train_file, config, max_length)
    padded = util.pad_sentences(sents, max_length)

    # Build a fresh vocabulary only when the caller did not provide one.
    inv_vocab = None
    if vocabulary is None:
        vocabulary, inv_vocab = self.build_vocab(padded)

    x, y = self.build_input_data(padded, labels, vocabulary)
    return [x, np.array(lengths), y, vocabulary, inv_vocab, n_class]
# Demo inputs: one ham message and one spam message to run through the model.
x_text = [
    "I'm gonna be home soon and i don't want to talk about this stuff anymore tonight, k? I've cried enough today.",
    "SMS SERVICES. for your inclusive text credits, pls goto www.comuk.net login= 3qxj9 unsubscribe wit h STOP, no extra charge. help 08702840625.COMUK. 220-CM2 9AE"
]

# Char-level features, padded to the fixed training length, then encoded
# with the training vocabulary.
lines_chars_level_features = generate_char_level_features(
    x_text, params['max_chars_features'])
lines_chars_level_features = np.array(lines_chars_level_features)
seq_len = params['max_chars_features']
x = pad_sentences(lines_chars_level_features,
                  max_sequence_length=seq_len,
                  is_max_sequence_length_modifiable=False)
x = text_to_sequence(x, vocabulary)

print("Generate predictions")
predictions = model.predict(x)
# zip pairs each input text with its spam probability (replaces the
# manual `count` index counter).
for text, prob in zip(x_text, predictions):
    print("Text is: \t", text)
    if prob > 0.5:
        print("predicted spam with spam prob ", prob)
    else:
        print("predicted ham with spam prob ", prob)
# Extract char-level features, then widen max_chars_features to the
# longest feature sequence actually produced.
lines_chars_level_features = generate_char_level_features(
    sentences, params['max_chars_features'])
params['max_chars_features'] = max(map(len, lines_chars_level_features))
lines_chars_level_features = np.array(lines_chars_level_features)

# Build the character vocabulary.
print("Build the vocabulary")
vocabulary = build_vocab(lines_chars_level_features, max_vocab_size=10000)
#print(vocabulary)

# Pad every sequence to the (updated) maximum length.
print("Padding sentences...")
x_text = pad_sentences(lines_chars_level_features,
                       max_sequence_length=params['max_chars_features'])
seq_len = len(x_text[0])
print("The sequence length is: ", seq_len)

# Encode each sentence as a sequence of character indices.
x = text_to_sequence(x_text, vocabulary)

# Shuffle features and labels with the same permutation.
#np.random.seed(1)  #same shuffling each time
shuffle_indices = np.random.permutation(len(labels))
x = x[shuffle_indices]
labels = labels[shuffle_indices]

# Build CNN model