def get_input(): f = FileIO(os.path.join(FLAGS.buckets, "imdb/texts.pkl"), mode='r+') texts = pickle.load(f) f.close() tokenizer = Tokenizer(nb_words=num_words) tokenizer.fit_on_texts(texts[0:25000]) sequences = tokenizer.texts_to_sequences(texts) sequences_reverse = [list(reversed(seq)) for seq in sequences] x = pad_sequences(sequences, maxlen=max_len) x_reverse=pad_sequences(sequences_reverse, maxlen=max_len) word_index = tokenizer.word_index embeddings_index = {} wordX = np.load(FileIO(os.path.join(FLAGS.buckets, "glove/embedding.300d.npy"), mode='r+')) allwords = pickle.load(FileIO(os.path.join(FLAGS.buckets, "glove/words.pkl"), mode='r+')) for i in range(len(allwords)): embeddings_index[allwords[i]] = wordX[i, :] embedding_matrix = np.zeros((num_words, 300)) for word, i in word_index.items(): embedding_vector = embeddings_index.get(word) if embedding_vector is not None and i < num_words: embedding_matrix[i] = embedding_vector y_train = np.zeros((25000,), dtype=np.float32) y_test = np.zeros((25000,), dtype=np.float32) y_train[12500:25000] = np.ones((12500,), dtype=np.float32) y_test[12500:25000] = np.ones((12500,), dtype=np.float32) x_seq= np.zeros((50000, (max_len - 3) * 4), dtype=np.int) for i in range(50000): for j in range(max_len - 3): x_seq[i, j * 4] = x[i, j] x_seq[i, j * 4 + 1] = x[i][j + 1] + num_words x_seq[i, j * 4 + 2] = x[i][j + 2] + num_words * 2 x_seq[i, j * 4 + 3] = x[i][j + 3] + num_words * 3 x_train_0 = x[:25000] x_train_1 = x_reverse[:25000] x_train_2=x_seq[:25000] x_test_0 = x[25000:] x_test_1 = x_reverse[25000:] x_test_2=x_seq[25000:] result=[] indice = np.arange(25000) np.random.shuffle(indice) result.append(x_train_0[indice]) result.append(x_train_1[indice]) result.append(x_train_2[indice]) result.append(x_test_0[indice]) result.append(x_test_1[indice]) result.append(x_test_2[indice]) result.append(y_train[indice]) result.append(y_test[indice]) result.append(embedding_matrix) return result
def update_datasets(self, filter=None):
    if filter is None:
        filter = self._filter
    file_list = []  # unused
    log.info("Updating datasets from file list: %s", self._source_file)
    if self._source_file.startswith("gs://"):
        log.info("Using tensorflow for IO")
        from tensorflow.python.lib.io.file_io import FileIO
        input_file = FileIO(self._source_file, "r")
        log.info("Tensorflow reported size: %d", input_file.size())
    else:
        input_file = open(self._source_file)
    lines = input_file.readlines()
    for line in lines:
        fpath = line.strip()
        parts = fpath.split("/")
        file_name = parts[-1]
        directory_name = "/".join(parts[:-1])  # unused
        match = self._re.match(file_name)
        if not match:
            continue
        match_components = match.groupdict()
        dataset_path = self._prepend_path + fpath
        dataset_id = self.update_dataset(match_components=match_components,
                                         dataset_path=dataset_path)
        dataset = self.get_dataset_by_id(dataset_id)
        if not filter(dataset_id, match_components, dataset):
            self.remove_dataset_by_id(dataset_id)
    input_file.close()
def update_datasets(self, filter=None):
    if filter is None:
        filter = self._filter
    close_file = True
    log.info("Updating datasets from file list: %s", self._input_source)
    if hasattr(self._input_source, 'read'):
        # Already an open file-like object; do not close it on exit.
        input_file = self._input_source
        close_file = False
    elif isinstance(self._input_source, str) and self._input_source.startswith("gs://"):
        log.info("Using tensorflow for IO")
        from tensorflow.python.lib.io.file_io import FileIO
        input_file = FileIO(self._input_source, "r")
        log.info("Tensorflow reported size: %d", input_file.size())
    else:
        input_file = open(self._input_source)
    lines = input_file.readlines()
    for line in lines:
        fpath = line.strip()
        parts = fpath.split("/")
        file_name = parts[-1]
        match = self._re.match(file_name)
        if not match:
            continue
        match_components = match.groupdict()
        dataset_path = self._prepend_path + fpath
        dataset_id = self.update_dataset(match_components=match_components,
                                         dataset_path=dataset_path)
        dataset = self.get_dataset_by_id(dataset_id)
        if not filter(dataset_id, match_components, dataset):
            self.remove_dataset_by_id(dataset_id)
    if close_file:
        input_file.close()
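# Hedged usage sketch: a minimal stub class (entirely hypothetical, for
# illustration only) showing that this variant accepts any object with a
# read()/readlines() interface, so tests can feed it an in-memory buffer
# instead of a path. The regex, prepend path, and method bodies are assumptions.
import re


class _StubCatalog(object):
    update_datasets = update_datasets  # reuse the method defined above

    def __init__(self, source):
        self._input_source = source
        self._filter = lambda dataset_id, components, dataset: True
        self._re = re.compile(r"(?P<name>.+)\.csv$")
        self._prepend_path = "gs://bucket/"
        self._datasets = {}

    def update_dataset(self, match_components, dataset_path):
        self._datasets[dataset_path] = match_components
        return dataset_path

    def get_dataset_by_id(self, dataset_id):
        return self._datasets[dataset_id]

    def remove_dataset_by_id(self, dataset_id):
        del self._datasets[dataset_id]


# import io
# _StubCatalog(io.StringIO("data/run1/a.csv\nnotes.txt\n")).update_datasets()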
def prepare_train():
    print("prepare training data")
    f = FileIO(os.path.join(FLAGS.buckets, 'texts.pkl'), 'rb')
    text1 = pickle.load(f)
    text1 = text1[:25000]
    f.close()
    f = FileIO(os.path.join(FLAGS.buckets, 'texts_unsup.pkl'), 'rb')
    text2 = pickle.load(f)
    f.close()
    texts = text1 + text2
    tokenizer = Tokenizer(num_words=vocab_size)
    tokenizer.filters = ''
    tokenizer.fit_on_texts(texts)
    sequence = tokenizer.texts_to_sequences(texts)
    sequence_pad = pad_sequences(sequence, maxlen=MAX_DOCUMENT_LENGTH + 1,
                                 dtype=np.int32, padding='post', truncating='post')
    seq_len = []
    for i in range(len(sequence)):
        r = len(sequence[i])
        if r < MAX_DOCUMENT_LENGTH:
            seq_len.append(r)
        else:
            seq_len.append(MAX_DOCUMENT_LENGTH)
    x_1 = sequence_pad[:, :-1]
    y_ = sequence_pad[:, 1:]
    return x_1, seq_len, y_
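# Tiny illustration (toy data, not from the original script) of the one-step
# shift prepare_train() returns: inputs are tokens 0..L-1, targets are tokens
# 1..L, the standard next-word language-modelling setup.
_demo = np.array([[7, 2, 9, 4, 0]])    # one padded row of length L + 1
_x, _y = _demo[:, :-1], _demo[:, 1:]   # _x = [[7, 2, 9, 4]], _y = [[2, 9, 4, 0]]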
def get_input(): f = FileIO(os.path.join(FLAGS.buckets, "texts.pkl"), mode='r+') texts = pickle.load(f) f.close() tokenizer = Tokenizer(nb_words=num_words) tokenizer.fit_on_texts(texts[0:25000]) # sequences = tokenizer.texts_to_sequences(texts) word_index = tokenizer.word_index sequences = [] for i in range(50000): t = [] tokens = texts[i].lower().split(' ') for j in range(len(tokens)): index = word_index.get(tokens[j], 0) if index < num_words: t.append(index) else: t.append(0) sequences.append(t) print('Found %s unique tokens.' % len(word_index)) data1 = pad_sequences(sequences[0:25000], maxlen=max_len) data2 = pad_sequences(sequences[25000:50000], maxlen=max_len) Ytrain = np.zeros((25000,), dtype=np.float32) Ytest = np.zeros((25000,), dtype=np.float32) Ytrain[12500:25000] = np.ones((12500,), dtype=np.float32) Ytest[12500:25000] = np.ones((12500,), dtype=np.float32) Xtrain = np.zeros((25000, (max_len - 3) * 4), dtype=np.int) Xtest = np.zeros((25000, (max_len - 3) * 4), dtype=np.int) for i in range(25000): for j in range(max_len - 3): Xtrain[i, j * 4] = data1[i, j] Xtrain[i, j * 4 + 1] = data1[i][j + 1] + num_words Xtrain[i, j * 4 + 2] = data1[i][j + 2] + num_words * 2 Xtrain[i, j * 4 + 3] = data1[i][j + 3] + num_words * 3 for i in range(25000): for j in range(max_len - 3): Xtest[i, j * 4] = data2[i, j] Xtest[i, j * 4 + 1] = data2[i][j + 1] + num_words Xtest[i, j * 4 + 2] = data2[i][j + 2] + num_words * 2 Xtest[i, j * 4 + 3] = data2[i][j + 3] + num_words * 3 indice = np.arange(25000) np.random.shuffle(indice) Xtrain = Xtrain[indice] Ytrain = Ytrain[indice] Xtest = Xtest[indice] Ytest = Ytest[indice] return Xtrain, Ytrain, Xtest, Ytest
def get_input(): f = FileIO(os.path.join(FLAGS.buckets, "texts.pkl"), mode='r+') texts = pickle.load(f) f.close() tokenizer = Tokenizer(nb_words=num_words) tokenizer.fit_on_texts(texts[0:25000]) sequences = tokenizer.texts_to_sequences(texts) # word_index = tokenizer.word_index # sequences = [] # for i in range(50000): # t = [] # tokens = texts[i].lower().split(' ') # for j in range(len(tokens)): # index = word_index.get(tokens[j], 0) # if index < num_words: # t.append(index) # else: # t.append(0) # sequences.append(t) data1 = pad_sequences(sequences[0:25000], maxlen=max_len) data2 = pad_sequences(sequences[25000:50000], maxlen=max_len) Ytrain = np.zeros((25000,), dtype=np.float32) Ytest = np.zeros((25000,), dtype=np.float32) Ytrain[12500:25000] = np.ones((12500,), dtype=np.float32) Ytest[12500:25000] = np.ones((12500,), dtype=np.float32) Xtrain = np.zeros((25000, (max_len - 1) * 2), dtype=np.int) Xtest = np.zeros((25000, (max_len - 1) * 2), dtype=np.int) for i in range(25000): for j in range(max_len - 1): Xtrain[i, j * 2] = data1[i, j] Xtrain[i, j * 2 + 1] = data1[i][j + 1] + num_words for i in range(25000): for j in range(max_len - 1): Xtest[i, j * 2] = data2[i, j] Xtest[i, j * 2 + 1] = data2[i][j + 1] + num_words indice = np.arange(25000) np.random.shuffle(indice) Xtrain = Xtrain[indice] Ytrain = Ytrain[indice] Xtest = Xtest[indice] Ytest = Ytest[indice] return Xtrain, Ytrain, Xtest, Ytest
def get_input(): f = FileIO(os.path.join(FLAGS.buckets, "rt/text.pkl"), mode='r+') texts = pickle.load(f) f.close() tokenizer = Tokenizer(num_words=num_words) tokenizer.fit_on_texts(texts[0:25000]) sequences = tokenizer.texts_to_sequences(texts) sequences_reverse = [list(reversed(seq)) for seq in sequences] x = pad_sequences(sequences, maxlen=max_len) x_reverse=pad_sequences(sequences_reverse, maxlen=max_len) word_index = tokenizer.word_index embeddings_index = {} wordX = np.load(FileIO(os.path.join(FLAGS.buckets, "glove/embedding.300d.npy"), mode='r+')) allwords = pickle.load(FileIO(os.path.join(FLAGS.buckets, "glove/words.pkl"), mode='r+')) for i in range(len(allwords)): embeddings_index[allwords[i]] = wordX[i, :] embedding_matrix = np.zeros((num_words, 300)) for word, i in word_index.items(): embedding_vector = embeddings_index.get(word) if embedding_vector is not None and i < num_words: embedding_matrix[i] = embedding_vector y = np.zeros((num_data,), dtype=np.float32) y[5331:] = np.ones((5331,), dtype=np.float32) x_seq= np.zeros((num_data, (max_len - 2) * 3), dtype=np.int) for i in range(num_data): for j in range(max_len - 2): x_seq[i, j * 3] = x[i, j] x_seq[i, j * 3 + 1] = x[i][j + 1] + num_words x_seq[i, j * 3 + 2] = x[i][j + 2] + num_words * 2 result=[] indice = np.arange(num_data) np.random.shuffle(indice) result.append(x[indice]) result.append(x_reverse[indice]) result.append(x_seq[indice]) result.append(y[indice]) result.append(embedding_matrix) return result
def get_input(): f = FileIO(os.path.join(FLAGS.buckets, "20news/texts.pkl"), mode='r+') texts = pickle.load(f) f.close() tokenizer = Tokenizer(nb_words=num_words) tokenizer.fit_on_texts(texts[:num_train]) sequences = tokenizer.texts_to_sequences(texts) sequences_reverse = [list(reversed(seq)) for seq in sequences] x = pad_sequences(sequences, maxlen=max_len) x_reverse = pad_sequences(sequences_reverse, maxlen=max_len) word_index = tokenizer.word_index embeddings_index = {} wordX = np.load( FileIO(os.path.join(FLAGS.buckets, "glove/embedding.300d.npy"), mode='r+')) allwords = pickle.load( FileIO(os.path.join(FLAGS.buckets, "glove/words.pkl"), mode='r+')) for i in range(len(allwords)): embeddings_index[allwords[i]] = wordX[i, :] embedding_matrix = np.zeros((num_words, 300)) for word, i in word_index.items(): embedding_vector = embeddings_index.get(word) if embedding_vector is not None and i < num_words: embedding_matrix[i] = embedding_vector y_train = np.load( FileIO(os.path.join(FLAGS.buckets, "20news/Ytrain.npy"), mode='r+')) y_train = to_categorical(y_train) y_test = np.load( FileIO(os.path.join(FLAGS.buckets, "20news/Ytest.npy"), mode='r+')) y_test = to_categorical(y_test) x_seq = np.zeros((num_train + num_test, (max_len - 2) * 3), dtype=np.int) for i in range(num_train + num_test): for j in range(max_len - 2): x_seq[i, j * 3] = x[i, j] x_seq[i, j * 3 + 1] = x[i][j + 1] + num_words x_seq[i, j * 3 + 2] = x[i][j + 2] + num_words * 2 x_train_0 = x[:num_train] x_train_1 = x_reverse[:num_train] x_train_2 = x_seq[:num_train] x_test_0 = x[num_train:] x_test_1 = x_reverse[num_train:] x_test_2 = x_seq[num_train:] result = [] indice1 = np.arange(num_train) np.random.shuffle(indice1) indice2 = np.arange(num_test) np.random.shuffle(indice2) result.append(x_train_0[indice1]) result.append(x_train_1[indice1]) result.append(x_train_2[indice1]) result.append(x_test_0[indice2]) result.append(x_test_1[indice2]) result.append(x_test_2[indice2]) result.append(y_train[indice1]) result.append(y_test[indice2]) result.append(embedding_matrix) return result
# Imports for the model-building snippets below (Keras 2 API).
from keras.layers import (AveragePooling1D, Dense, Embedding,
                          GlobalMaxPooling1D, Input, concatenate)
from keras.models import Model


def main():
    f = FileIO(os.path.join(FLAGS.buckets, "texts.pkl"), mode='rb')
    texts = pickle.load(f)
    f.close()
    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.filters = ''
    tokenizer.fit_on_texts(texts[0:25000])
    # sequences = tokenizer.texts_to_sequences(texts)
    # Manual tokenization: out-of-vocabulary words map to index 0.
    word_index = tokenizer.word_index
    sequences = []
    for i in range(50000):
        t = []
        tokens = texts[i].lower().split(' ')
        for j in range(len(tokens)):
            index = word_index.get(tokens[j], 0)
            if index < num_words:
                t.append(index)
            else:
                t.append(0)
        sequences.append(t)
    print('Found %s unique tokens.' % len(word_index))

    data1 = pad_sequences(sequences[0:25000], maxlen=max_len)
    data2 = pad_sequences(sequences[25000:50000], maxlen=max_len)
    Ytrain = np.zeros((25000,), dtype=np.float32)
    Ytest = np.zeros((25000,), dtype=np.float32)
    Ytrain[12500:25000] = np.ones((12500,), dtype=np.float32)
    Ytest[12500:25000] = np.ones((12500,), dtype=np.float32)

    # Trigram features for the first branch.
    Xtrain1 = np.zeros((25000, (max_len - 2) * 3), dtype=np.int32)
    Xtest1 = np.zeros((25000, (max_len - 2) * 3), dtype=np.int32)
    for i in range(25000):
        for j in range(max_len - 2):
            Xtrain1[i, j * 3] = data1[i, j]
            Xtrain1[i, j * 3 + 1] = data1[i, j + 1] + num_words
            Xtrain1[i, j * 3 + 2] = data1[i, j + 2] + num_words * 2
    for i in range(25000):
        for j in range(max_len - 2):
            Xtest1[i, j * 3] = data2[i, j]
            Xtest1[i, j * 3 + 1] = data2[i, j + 1] + num_words
            Xtest1[i, j * 3 + 2] = data2[i, j + 2] + num_words * 2

    # Bigram features for the second branch.
    Xtrain2 = np.zeros((25000, (max_len - 1) * 2), dtype=np.int32)
    Xtest2 = np.zeros((25000, (max_len - 1) * 2), dtype=np.int32)
    for i in range(25000):
        for j in range(max_len - 1):
            Xtrain2[i, j * 2] = data1[i, j]
            Xtrain2[i, j * 2 + 1] = data1[i, j + 1] + num_words
    for i in range(25000):
        for j in range(max_len - 1):
            Xtest2[i, j * 2] = data2[i, j]
            Xtest2[i, j * 2 + 1] = data2[i, j + 1] + num_words

    indice1 = np.arange(25000)
    np.random.shuffle(indice1)
    Xtrain1 = Xtrain1[indice1]
    Xtrain2 = Xtrain2[indice1]
    Ytrain = Ytrain[indice1]
    indice2 = np.arange(25000)
    np.random.shuffle(indice2)
    Xtest1 = Xtest1[indice2]
    Xtest2 = Xtest2[indice2]
    Ytest = Ytest[indice2]

    print('begin to build model ...')
    # Two-branch model: average-pool each n-gram window down to one vector,
    # take the feature-wise max over windows, then concatenate both branches.
    input1 = Input(shape=((max_len - 2) * 3,))
    embedding1 = Embedding(num_words * 3, embedding_dimension,
                           input_length=(max_len - 2) * 3,
                           embeddings_initializer='orthogonal')(input1)
    x = AveragePooling1D(pool_size=3)(embedding1)
    x = GlobalMaxPooling1D()(x)
    input2 = Input(shape=((max_len - 1) * 2,))
    embedding2 = Embedding(num_words * 2, embedding_dimension,
                           input_length=(max_len - 1) * 2,
                           embeddings_initializer='orthogonal')(input2)
    y = AveragePooling1D(pool_size=2, strides=2)(embedding2)
    y = GlobalMaxPooling1D()(y)
    z = concatenate([x, y])
    # model.add(Dropout(0.5))
    output = Dense(1, activation='sigmoid')(z)
    model = Model(inputs=[input1, input2], outputs=output)
    model.compile(loss='binary_crossentropy', optimizer='nadam',
                  metrics=['accuracy'])
    model.fit([Xtrain1, Xtrain2], Ytrain, batch_size=32, epochs=20, verbose=2,
              validation_data=([Xtest1, Xtest2], Ytest))
def main():
    global ngram
    f = FileIO(os.path.join(FLAGS.buckets, "texts.pkl"), mode='rb')
    texts = pickle.load(f)
    f.close()
    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.filters = ''
    tokenizer.fit_on_texts(texts[0:25000])
    # texts_to_sequences is used directly here; the manual-tokenization
    # variant appears in an earlier snippet.
    sequences = tokenizer.texts_to_sequences(texts)
    data1 = pad_sequences(sequences[0:25000], maxlen=max_len)
    data2 = pad_sequences(sequences[25000:50000], maxlen=max_len)
    Ytrain = np.zeros((25000,), dtype=np.float32)
    Ytest = np.zeros((25000,), dtype=np.float32)
    Ytrain[12500:25000] = np.ones((12500,), dtype=np.float32)
    Ytest[12500:25000] = np.ones((12500,), dtype=np.float32)

    # Vectorized n-gram construction: for each position i within a window,
    # write all windows at once with an id offset of num_words * i.
    Xtrain = np.zeros((25000, (max_len - ngram + 1) * ngram), dtype=np.int32)
    Xtest = np.zeros((25000, (max_len - ngram + 1) * ngram), dtype=np.int32)
    id_range = np.arange(max_len - ngram + 1)
    for i in range(ngram):
        Xtrain[:, id_range * ngram + i] = data1[:, id_range + i] + num_words * i
        Xtest[:, id_range * ngram + i] = data2[:, id_range + i] + num_words * i

    print('begin to build model ...')
    main_input = Input(shape=((max_len - ngram + 1) * ngram,))
    # embedding1 = Embedding(num_words * ngram, word_dim,
    #                        embeddings_initializer=keras.initializers.Orthogonal())(main_input)
    embedding1 = Embedding(num_words * ngram, word_dim)(main_input)
    x = AveragePooling1D(pool_size=ngram)(embedding1)
    x = GlobalMaxPooling1D()(x)
    # Fixed, untrainable output head: the first half of the feature dimensions
    # votes positive (+1) and the second half negative (-1).
    weight = np.ones((word_dim, 1), dtype=np.float32)
    weight[word_dim // 2:] = -1.0
    output = Dense(1, weights=[weight, np.zeros([1])], trainable=False,
                   activation='sigmoid')(x)
    model = Model(inputs=main_input, outputs=output)
    model.compile(loss='binary_crossentropy', optimizer='nadam',
                  metrics=['accuracy'])
    model.fit([Xtrain], Ytrain, batch_size=32, shuffle=True, epochs=15,
              verbose=2, validation_data=([Xtest], Ytest))
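# Sanity sketch (toy sizes, illustrative only): the vectorized id_range
# construction above matches the nested-loop version used in the earlier
# snippets.
def _check_vectorized_ngrams(V=5, L=6, n=3):
    data = np.random.randint(0, V, size=(2, L))
    idx = np.arange(L - n + 1)
    vec = np.zeros((2, (L - n + 1) * n), dtype=np.int32)
    for k in range(n):
        vec[:, idx * n + k] = data[:, idx + k] + V * k
    loop = np.zeros_like(vec)
    for i in range(2):
        for j in range(L - n + 1):
            for k in range(n):
                loop[i, j * n + k] = data[i, j + k] + V * k
    assert np.array_equal(vec, loop)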