import numpy as np
from keras.datasets import imdb
from keras.preprocessing.sequence import pad_sequences


def __preprocess():
    # Load the IMDB dataset, keeping only the 10,000 most frequent words.
    (train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)
    word_index = imdb.get_word_index()

    # Pad/truncate every review to a fixed length of 500 tokens.
    max_len = 500
    x_train = pad_sequences(train_data, maxlen=max_len)
    x_test = pad_sequences(test_data, maxlen=max_len)

    # Binary sentiment labels as float32 for training.
    y_train = np.asarray(train_labels).astype('float32')
    y_test = np.asarray(test_labels).astype('float32')

    return (x_train, y_train), (x_test, y_test), word_index, max_len
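# Minimal usage sketch (not part of the original pipeline): the returned
# word_index can be inverted to turn a padded integer sequence back into text.
# Keras' imdb.load_data offsets word indices by 3 (0 = padding, 1 = start,
# 2 = unknown), hence the `i - 3` lookup below.
def _decode_first_review():
    (x_train, y_train), _, word_index, _ = __preprocess()
    reverse_word_index = {index: word for word, index in word_index.items()}
    decoded = ' '.join(reverse_word_index.get(i - 3, '?')
                       for i in x_train[0] if i != 0)
    print(decoded[:200], '| label:', y_train[0])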
from sklearn.metrics import precision_recall_fscore_support


# Keras Callback hook: computes micro-averaged precision/recall/F1 on the
# validation data at the end of every epoch. For a multi-input model it treats
# validation_data[-3] as the targets and everything before it as the list of
# input arrays (the trailing entries being sample weights and, depending on
# the Keras version, a learning-phase flag).
def on_epoch_end(self, batch, logs=None):
    val_targ = self.validation_data[-3]
    val_value = [x for x in self.validation_data[0:-3]]
    y_pred = np.asarray(self.model.predict(val_value))
    precision, recall, f_score, _ = precision_recall_fscore_support(
        val_targ, (y_pred > 0.5).astype(int), average='micro')
    print("— val_f1: %f — val_precision: %f — val_recall: %f"
          % (f_score, precision, recall))
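# Standalone sketch (illustrative, not in the original code): with
# average='micro' the counts are aggregated over all samples, and
# (y_prob > 0.5) thresholds sigmoid outputs into hard 0/1 predictions,
# mirroring what the hook above does.
def _micro_f1_demo():
    y_true = np.array([1, 0, 1, 1])
    y_prob = np.array([0.9, 0.4, 0.3, 0.8])
    p, r, f, _ = precision_recall_fscore_support(
        y_true, (y_prob > 0.5).astype(int), average='micro')
    print('precision %.2f recall %.2f f1 %.2f' % (p, r, f))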
from sklearn.metrics import f1_score


# Alternative Callback hook for a multi-class model: predictions and targets
# are one-hot encoded, so both are converted back to class indices with argmax
# before computing a macro-averaged F1 score. (Per-class recall/precision could
# be tracked the same way with recall_score / precision_score.)
def on_epoch_end(self, epoch, logs=None):
    val_predict = np.argmax(
        np.asarray(self.model.predict(self.validation_data[0])), axis=1)
    val_targ = np.argmax(self.validation_data[1], axis=1)
    _val_f1 = f1_score(val_targ, val_predict, average='macro')
    self.val_f1s.append(_val_f1)
    print(' — val_f1:', _val_f1)
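# Sketch of how an on_epoch_end hook like the ones above is normally used
# (assumed wrapper, not in the original code): subclass keras.callbacks.Callback,
# initialise the metric history in on_train_begin, and pass an instance to
# model.fit via the callbacks argument. Note that older Keras versions expose
# self.validation_data on callbacks; newer tf.keras releases removed it, so the
# validation arrays may need to be handed to the callback explicitly.
from keras.callbacks import Callback


class F1History(Callback):
    def on_train_begin(self, logs=None):
        self.val_f1s = []  # history that on_epoch_end appends to

    def on_epoch_end(self, epoch, logs=None):
        val_predict = np.argmax(self.model.predict(self.validation_data[0]), axis=1)
        val_targ = np.argmax(self.validation_data[1], axis=1)
        _val_f1 = f1_score(val_targ, val_predict, average='macro')
        self.val_f1s.append(_val_f1)
        print('val_f1:', _val_f1)


# model.fit(X_train, y_train, validation_data=(X_test, y_test),
#           callbacks=[F1History()])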
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import (Activation, BatchNormalization, Conv1D, Dense,
                          Dropout, Embedding, GlobalMaxPooling1D, MaxPooling1D)
from keras.optimizers import Adam
from keras.callbacks import CSVLogger, ModelCheckpoint


def main(args):
    # set parameters:
    max_features = 5000
    maxlen = 400
    batch_size = 32
    embedding_dims = 128
    filters = 250
    kernel_size = 3
    hidden_dims = 250
    epochs = 10

    print('Loading data...')
    # load_split_data is project-specific and defined elsewhere; it returns the
    # raw sequences, labels, and the fitted tokenizers for the two splits.
    X_train, y_train, X_test, y_test, tokenizer_train, tokenizer_test = load_split_data(args)
    y_train = np.asarray(y_train).astype('float32')
    y_test = np.asarray(y_test).astype('float32')
    vocab_size_train = len(tokenizer_train.word_index) + 1
    vocab_size_test = len(tokenizer_test.word_index) + 1
    print(len(X_train), 'train sequences')
    print(len(X_test), 'test sequences')

    print('Pad sequences (samples x time)')
    X_train = sequence.pad_sequences(X_train, padding='post')
    X_test = sequence.pad_sequences(X_test, maxlen=len(X_train[0]), padding='post')
    print('x_train shape:', X_train.shape)
    print('x_test shape:', X_test.shape)
    print('vocab_size_train', vocab_size_train)
    seqX_len = len(X_train[0])
    print('seqX_len', seqX_len)
    seqY_len = len(y_train[0])

    print('Build model...')
    model = Sequential()

    # We start off with an efficient embedding layer which maps our vocab
    # indices into embedding_dims dimensions.
    model.add(Embedding(input_dim=vocab_size_train,
                        output_dim=embedding_dims,
                        input_length=seqX_len))
    model.add(Dropout(0.2))

    # We add a Conv1D, which will learn word group filters of size kernel_size:
    model.add(Conv1D(filters, kernel_size, padding='same', strides=1))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    # we use max pooling:
    model.add(MaxPooling1D(strides=1))

    model.add(Conv1D(filters, kernel_size, padding='same', strides=1))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    # we use global max pooling:
    model.add(GlobalMaxPooling1D())

    # We add a vanilla hidden layer:
    model.add(Dense(hidden_dims))
    model.add(Dropout(0.2))
    model.add(BatchNormalization())
    model.add(Activation('relu'))

    # We project onto a 5-unit output layer and squash it with a sigmoid
    # (softmax is the more usual pairing with categorical_crossentropy):
    model.add(Dense(units=5))
    model.add(Activation('sigmoid'))

    optimizer = Adam(lr=0.000001)
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])

    checkpointer = ModelCheckpoint(filepath='./drive/text_cnn'
                                   + '.{epoch:02d}-{val_loss:.2f}.hdf5',
                                   verbose=1,
                                   save_best_only=True,
                                   monitor='val_acc',
                                   mode='max')
    csv_logger = CSVLogger('./drive/text_cnn.log')
    model.fit(X_train, y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_data=(X_test, y_test),
              callbacks=[checkpointer, csv_logger])

    with open('./drive/text_cnn_imdb_model.json', 'w') as f:
        f.write(model.to_json())
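# Hypothetical entry point (not in the original code): main() only forwards
# args to load_split_data, whose expected arguments are defined elsewhere in
# the project, so the --data_path flag below is an illustrative placeholder.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Train the text CNN.')
    parser.add_argument('--data_path', default='./drive/data',
                        help='illustrative placeholder; match load_split_data')
    main(parser.parse_args())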