def infer(j, y_test, model_name, stamp, window, categories, k, models, text_P,
          subset_ratio, subset_seed, min_len, max_len, min_tokens,
          categories_mode, return_overall, max_words, vectorizer, test_size,
          test_random_state, data_len, test_len):
    category = categories[j]
    print('Predicting category `{}`...'.format(category))
    y_pred = np.zeros((len(y_test),), dtype=np.int32)
    for i in range(len(y_test)):
        P = text_P[i]
        q_pred = base.predict_ordinal(models, P, k)
        label_pred = max(q_pred)
        y_pred[i] = label_pred
    base_fname = '{}_{:d}_{:d}w'.format(stamp, j, window)
    logs_path = folders.ensure(os.path.join(folders.LOGS_PATH, model_name))
    with open(os.path.join(logs_path, '{}.txt'.format(base_fname)), 'w') as fd:
        fd.write('HYPERPARAMETERS\n')
        fd.write('\nText\n')
        fd.write('subset_ratio={}\n'.format(str(subset_ratio)))
        fd.write('subset_seed={}\n'.format(str(subset_seed)))
        fd.write('min_len={:d}\n'.format(min_len))
        fd.write('max_len={:d}\n'.format(max_len))
        fd.write('min_tokens={:d}\n'.format(min_tokens))
        fd.write('\nLabels\n')
        fd.write('categories_mode=\'{}\'\n'.format(categories_mode))
        fd.write('return_overall={}\n'.format(return_overall))
        fd.write('\nVectorization\n')
        fd.write('max_words={:d}\n'.format(max_words))
        fd.write('vectorizer={}\n'.format(vectorizer.__class__.__name__))
        fd.write('\nTraining\n')
        fd.write('test_size={}\n'.format(str(test_size)))
        fd.write('test_random_state={:d}\n'.format(test_random_state))
        fd.write('\nRESULTS\n\n')
        fd.write('Data size: {:d}\n'.format(data_len))
        fd.write('Test size: {:d}\n\n'.format(test_len))
        evaluation.write_confusion_and_metrics(y_test, y_pred, fd, category)
    predictions_path = folders.ensure(
        os.path.join(folders.PREDICTIONS_PATH, model_name))
    with open(os.path.join(predictions_path, '{}.txt'.format(base_fname)),
              'w') as fd:
        evaluation.write_predictions(y_test, y_pred, fd, category)
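
# Hedged sketch of how `base.predict_ordinal`, called above, is assumed to
# behave: each of the k - 1 binary classifiers answers "is the label greater
# than threshold t?", and summing the positive answers yields an ordinal label
# in {0, ..., k - 1}. Illustrative only; the real implementation lives in the
# project's `base` module.
def predict_ordinal_sketch(models, P, k):
    assert len(models) == k - 1  # one binary classifier per threshold
    # Column t holds classifier t's decision: 1 where label > t.
    binary = np.column_stack([model.predict(P) for model in models])
    return binary.sum(axis=1).astype(np.int32)  # shape: (len(P),)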

def train(skip_models=False):
    max_words = shared_parameters.TEXT_MAX_WORDS
    start_time = int(time.time())
    if 'SLURM_JOB_ID' in os.environ:
        stamp = int(os.environ['SLURM_JOB_ID'])
    else:
        stamp = start_time
    print('Time stamp: {:d}'.format(stamp))

    # Load data.
    print('Retrieving texts...')
    source = 'paragraph_tokens'
    subset_ratio = .1  # shared_parameters.DATA_SUBSET_RATIO
    subset_seed = shared_parameters.DATA_SUBSET_SEED
    min_len = shared_parameters.DATA_PARAGRAPH_MIN_LEN
    max_len = shared_parameters.DATA_PARAGRAPH_MAX_LEN
    min_tokens = shared_parameters.DATA_MIN_TOKENS
    remove_stopwords = False
    categories_mode = shared_parameters.DATA_CATEGORIES_MODE
    return_overall = shared_parameters.DATA_RETURN_OVERALL
    inputs, Y, categories, category_levels = \
        bookcave.get_data({source},
                          subset_ratio=subset_ratio,
                          subset_seed=subset_seed,
                          min_len=min_len,
                          max_len=max_len,
                          min_tokens=min_tokens,
                          remove_stopwords=remove_stopwords,
                          categories_mode=categories_mode,
                          return_overall=return_overall)
    text_source_tokens = list(zip(*inputs[source]))[0]
    print('Retrieved {:d} texts.'.format(len(text_source_tokens)))

    # Create vectorized representations of the book texts.
    print('Vectorizing text...')
    text_tokens = []
    for source_tokens in text_source_tokens:
        all_tokens = []
        for tokens in source_tokens:
            all_tokens.extend(tokens)
        text_tokens.append(all_tokens)
    vectorizer = tokenizers.get_vectorizer_or_fit(max_words,
                                                  remove_stopwords,
                                                  text_tokens=text_tokens)
    X = vectorizer.transform(text_tokens)
    print('Vectorized text with {:d} unique words.'.format(
        len(vectorizer.get_feature_names())))

    # Split data set.
    test_size = shared_parameters.EVAL_TEST_SIZE  # b
    test_random_state = shared_parameters.EVAL_TEST_RANDOM_STATE
    Y_T = Y.transpose()  # (n, c)
    X_train, X_test, Y_train_T, Y_test_T = train_test_split(
        X, Y_T, test_size=test_size, random_state=test_random_state)
    Y_train = Y_train_T.transpose()  # (c, n * (1 - b))
    Y_test = Y_test_T.transpose()  # (c, n * b)

    create_funcs = [
        create_k_nearest_neighbors,
        create_logistic_regression,
        create_multi_layer_perceptron,
        create_multinomial_naive_bayes,
        create_random_forest,
        create_svm
    ]
    model_names = [
        'k_nearest_neighbors',
        'logistic_regression',
        'multi_layer_perceptron',
        'multinomial_naive_bayes',
        'random_forest',
        'svm'
    ]
    for m, create_func in enumerate(create_funcs):
        model_name = model_names[m]
        model_path = folders.ensure(
            os.path.join(folders.MODELS_PATH, model_name))
        print('Training model `{}`...'.format(model_name))
        for j, category in enumerate(categories):
            print('Classifying category `{}`...'.format(category))
            y_train = Y_train[j]  # (n * (1 - b))
            k = len(category_levels[j])
            classifiers = fit_ordinal(create_func, X_train, y_train, k)
            y_pred = predict_ordinal(classifiers, X_test, k)  # (n * b)
            y_test = Y_test[j]
            base_fname = '{:d}_{:d}'.format(stamp, j)
            logs_path = folders.ensure(
                os.path.join(folders.LOGS_PATH, model_name))
            with open(os.path.join(logs_path, '{}.txt'.format(base_fname)),
                      'w') as fd:
                fd.write('HYPERPARAMETERS\n')
                fd.write('\nText\n')
                fd.write('subset_ratio={}\n'.format(str(subset_ratio)))
                fd.write('subset_seed={}\n'.format(str(subset_seed)))
                fd.write('min_len={:d}\n'.format(min_len))
                fd.write('max_len={:d}\n'.format(max_len))
                fd.write('min_tokens={:d}\n'.format(min_tokens))
                fd.write('\nLabels\n')
                fd.write('categories_mode=\'{}\'\n'.format(categories_mode))
                fd.write('return_overall={}\n'.format(return_overall))
                fd.write('\nVectorization\n')
                fd.write('max_words={:d}\n'.format(max_words))
                fd.write('vectorizer={}\n'.format(
                    vectorizer.__class__.__name__))
                fd.write('\nTraining\n')
                fd.write('test_size={}\n'.format(str(test_size)))
                fd.write('test_random_state={:d}\n'.format(test_random_state))
                fd.write('\nRESULTS\n\n')
                fd.write('Data size: {:d}\n'.format(X.shape[0]))
                fd.write('Train size: {:d}\n'.format(X_train.shape[0]))
                fd.write('Test size: {:d}\n\n'.format(X_test.shape[0]))
                evaluation.write_confusion_and_metrics(y_test, y_pred, fd,
                                                       category)
            predictions_path = folders.ensure(
                os.path.join(folders.PREDICTIONS_PATH, model_name))
            with open(
                    os.path.join(predictions_path,
                                 '{}.txt'.format(base_fname)), 'w') as fd:
                evaluation.write_predictions(y_test, y_pred, fd, category)
            if not skip_models:
                models_path = folders.ensure(
                    os.path.join(model_path, base_fname))
                for i, classifier in enumerate(classifiers):
                    with open(
                            os.path.join(models_path,
                                         'model{:d}.pickle'.format(i)),
                            'wb') as fd:
                        pickle.dump(classifier,
                                    fd,
                                    protocol=pickle.HIGHEST_PROTOCOL)
    print('Done.')
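
# Hedged sketch of the ordinal decomposition that `fit_ordinal` above is
# assumed to implement (Frank & Hall style): for each threshold t in
# {0, ..., k - 2}, fit one binary classifier on the indicator y > t.
# `create_func` is assumed to be a zero-argument factory returning a fresh
# scikit-learn estimator; this sketch is illustrative, not the real code.
def fit_ordinal_sketch(create_func, X_train, y_train, k):
    classifiers = []
    for t in range(k - 1):
        classifier = create_func()
        # Binary target: does this instance's label exceed threshold t?
        classifier.fit(X_train, (y_train > t).astype(np.int32))
        classifiers.append(classifier)
    return classifiers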

def main():
    parser = ArgumentParser(
        description='Classify the maturity level of a book by its text.',
        formatter_class=RawTextHelpFormatter)
    parser.add_argument('category_index',
                        type=int,
                        help='The category index.\n {}'.format('\n '.join([
                            '{:d} {}'.format(
                                j, bookcave.CATEGORY_NAMES[category])
                            for j, category in enumerate(bookcave.CATEGORIES)
                        ])))
    parser.add_argument('--source_mode',
                        default='paragraph',
                        choices=['paragraph', 'sentence'],
                        help='The source of text. Default is `paragraph`.')
    parser.add_argument('--net_mode',
                        default='cnn',
                        choices=['rnn', 'cnn', 'rnncnn'],
                        help='The type of neural network. Default is `cnn`.')
    parser.add_argument('--remove_stopwords',
                        action='store_true',
                        help='Remove stop-words from text. Default is False.')
    parser.add_argument(
        '--agg_mode',
        default='maxavg',
        choices=['max', 'avg', 'maxavg', 'rnn'],
        help='The way the network will aggregate paragraphs or sentences. '
             'Default is `maxavg`.')
    parser.add_argument('--label_mode',
                        default=shared_parameters.LABEL_MODE_ORDINAL,
                        choices=[
                            shared_parameters.LABEL_MODE_ORDINAL,
                            shared_parameters.LABEL_MODE_CATEGORICAL,
                            shared_parameters.LABEL_MODE_REGRESSION
                        ],
                        help='The way that labels will be interpreted. '
                             'Default is `{}`.'.format(
                                 shared_parameters.LABEL_MODE_ORDINAL))
    parser.add_argument(
        '--remove_classes',
        type=str,
        help='Remove classes altogether. Can be used when the minority class '
             'is severely tiny. Like `<class1>[,<class2>,...]` as in `3` or '
             '`3,0`. Optional.')
    parser.add_argument(
        '--class_weight_p',
        default=2,
        type=int,
        help='Power with which to scale class weights. Default is 2.')
    parser.add_argument(
        '--embedding_trainable',
        action='store_true',
        help='Flag to allow the model to optimize the word embeddings. '
             'Default is False.')
    parser.add_argument(
        '--book_dense_units',
        default='128',
        help='The number of neurons in the final fully-connected layers, '
             'comma separated. Default is `128`.')
    parser.add_argument(
        '--book_dropout',
        default=0.5,
        type=float,
        help='Dropout probability before final classification layer. '
             'Default is 0.5.')
    parser.add_argument(
        '--plateau_patience',
        default=16,
        type=int,
        help='Number of epochs to wait before dividing the learning rate '
             'by 2. Default is 16.')
    parser.add_argument(
        '--early_stopping_patience',
        default=32,
        type=int,
        help='Number of epochs without improvement to wait before stopping '
             'training early. Default is 32.')
    parser.add_argument('--epochs',
                        default=1,
                        type=int,
                        help='Epochs. Default is 1.')
    parser.add_argument(
        '--save_model',
        action='store_true',
        help='Save the model and its weights. Default is False.')
    parser.add_argument(
        '--note',
        help='An optional note that will be appended to the names of '
             'generated files.')
    args = parser.parse_args()

    classifier_name = '{}_{}_{}_{}'.format(args.source_mode, args.net_mode,
                                           args.agg_mode, args.label_mode)
    start_time = int(time.time())
    if 'SLURM_JOB_ID' in os.environ:
        stamp = int(os.environ['SLURM_JOB_ID'])
    else:
        stamp = start_time
    print('Time stamp: {:d}'.format(stamp))
    if args.note is not None:
        print('Note: {}'.format(args.note))
        base_fname = '{:d}_{}_{:d}'.format(stamp, args.note,
                                           args.category_index)
    else:
        base_fname = '{:d}_{:d}'.format(stamp, args.category_index)

    # Load data.
    print('Retrieving texts...')
    if args.source_mode == 'paragraph':
        source = 'paragraph_tokens'
        min_len = shared_parameters.DATA_PARAGRAPH_MIN_LEN
        max_len = shared_parameters.DATA_PARAGRAPH_MAX_LEN
    else:  # args.source_mode == 'sentence':
        source = 'sentence_tokens'
        min_len = shared_parameters.DATA_SENTENCE_MIN_LEN
        max_len = shared_parameters.DATA_SENTENCE_MAX_LEN
    subset_ratio = shared_parameters.DATA_SUBSET_RATIO
    subset_seed = shared_parameters.DATA_SUBSET_SEED
    min_tokens = shared_parameters.DATA_MIN_TOKENS
    categories_mode = shared_parameters.DATA_CATEGORIES_MODE
    return_overall = shared_parameters.DATA_RETURN_OVERALL
    inputs, Y, categories, category_levels = \
        bookcave.get_data({source},
                          subset_ratio=subset_ratio,
                          subset_seed=subset_seed,
                          min_len=min_len,
                          max_len=max_len,
                          min_tokens=min_tokens,
                          remove_stopwords=args.remove_stopwords,
                          categories_mode=categories_mode,
                          return_overall=return_overall)
    text_source_tokens = list(zip(*inputs[source]))[0]
    print('Retrieved {:d} texts.'.format(len(text_source_tokens)))

    # Reduce labels to the specified category.
    y = Y[args.category_index]
    category = categories[args.category_index]
    levels = category_levels[args.category_index]
    k = len(levels)
    k_train = k

    # Tokenize.
    print('Tokenizing...')
    max_words = shared_parameters.TEXT_MAX_WORDS
    split = '\t'
    tokenizer = tokenizers.get_tokenizer_or_fit(
        max_words,
        args.source_mode,
        args.remove_stopwords,
        text_source_tokens=text_source_tokens)

    # Convert to sequences.
    print('Converting texts to sequences...')
    if args.source_mode == 'paragraph':
        if not args.remove_stopwords:
            n_tokens = shared_parameters.TEXT_N_PARAGRAPH_TOKENS
        else:
            n_tokens = shared_parameters.TEXT_N_PARAGRAPH_TOKENS_NO_STOPWORDS
    else:  # args.source_mode == 'sentence':
        n_tokens = shared_parameters.TEXT_N_SENTENCE_TOKENS
    padding = shared_parameters.TEXT_PADDING
    truncating = shared_parameters.TEXT_TRUNCATING
    X = [
        np.array(
            pad_sequences(tokenizer.texts_to_sequences(
                [split.join(tokens) for tokens in source_tokens]),
                          maxlen=n_tokens,
                          padding=padding,
                          truncating=truncating))
        for source_tokens in text_source_tokens
    ]

    # Load embedding.
    print('Loading embedding matrix...')
    embedding_path = folders.EMBEDDING_GLOVE_300_PATH
    embedding_matrix = load_embeddings.load_embedding(tokenizer,
                                                      embedding_path,
                                                      max_words)

    # Create model.
    print('Creating model...')
    net_params = dict()
    if args.net_mode == 'rnn' or args.net_mode == 'rnncnn':
        net_params['rnn'] = CuDNNGRU if tf.test.is_gpu_available(
            cuda_only=True) else GRU
        net_params['rnn_units'] = 128
        net_params['rnn_l2'] = .001
        net_params['rnn_dense_units'] = 64
        net_params['rnn_dense_activation'] = 'elu'
        net_params['rnn_dense_l2'] = .001
        net_params['rnn_agg'] = 'attention'
    if args.net_mode == 'cnn' or args.net_mode == 'rnncnn':
        net_params['cnn_filters'] = 16
        net_params['cnn_filter_sizes'] = [1, 2, 3, 4]
        net_params['cnn_activation'] = 'elu'
        net_params['cnn_l2'] = .001
    agg_params = dict()
    if args.agg_mode == 'rnn':
        agg_params['rnn'] = CuDNNGRU if tf.test.is_gpu_available(
            cuda_only=True) else GRU
        agg_params['rnn_units'] = 64
        agg_params['rnn_l2'] = .001
    book_dense_units = [
        int(units) for units in args.book_dense_units.split(',')
    ]
    book_dense_activation = LeakyReLU(alpha=.1)
    book_dense_l2 = .001
    book_dropout = args.book_dropout
    model = create_model(n_tokens, embedding_matrix, args.embedding_trainable,
                         args.net_mode, net_params, args.agg_mode, agg_params,
                         book_dense_units, book_dense_activation,
                         book_dense_l2, book_dropout, k, category,
                         args.label_mode)
    lr = 2**-16
    optimizer = Adam(lr=lr)
    if args.label_mode == shared_parameters.LABEL_MODE_ORDINAL:
        loss = 'binary_crossentropy'
        metric = 'binary_accuracy'
    elif args.label_mode == shared_parameters.LABEL_MODE_CATEGORICAL:
        loss = 'categorical_crossentropy'
        metric = 'categorical_accuracy'
    else:  # args.label_mode == shared_parameters.LABEL_MODE_REGRESSION:
        loss = 'mse'
        metric = 'accuracy'
    model.compile(optimizer, loss=loss, metrics=[metric])

    # Split data set.
    print('Splitting data set...')
    test_size = shared_parameters.EVAL_TEST_SIZE  # b
    test_random_state = shared_parameters.EVAL_TEST_RANDOM_STATE
    val_size = shared_parameters.EVAL_VAL_SIZE  # v
    val_random_state = shared_parameters.EVAL_VAL_RANDOM_STATE
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=test_size,
                         random_state=test_random_state)
    X_train, X_val, y_train, y_val = \
        train_test_split(X_train, y_train, test_size=val_size,
                         random_state=val_random_state)
    y_val_transform = shared_parameters.transform_labels(
        y_val, k, args.label_mode)
    y_test_transform = shared_parameters.transform_labels(
        y_test, k, args.label_mode)

    # Remove classes from training set, if specified.
    if args.remove_classes is not None:
        remove_classes = sorted(list(
            map(int, args.remove_classes.strip().split(','))),
                                reverse=True)
        for class_ in remove_classes:
            y_train[y_train >= class_] -= 1
            k_train -= 1

    # Create generators.
    print('Creating data generators...')
    train_generator = TransformBalancedBatchGenerator(
        np.arange(len(X_train)).reshape((len(X_train), 1)),
        y_train,
        transform_X=transform_X,
        transform_y=transform_y,
        batch_size=1,
        X_data=[np.array([x]) for x in X_train],
        k=k,
        label_mode=args.label_mode)
    val_generator = SingleInstanceBatchGenerator(X_val,
                                                 y_val_transform,
                                                 shuffle=False)
    test_generator = SingleInstanceBatchGenerator(X_test,
                                                  y_test_transform,
                                                  shuffle=False)

    # Get class weight.
    class_weight = shared_parameters.get_class_weight(k_train,
                                                      args.label_mode,
                                                      p=args.class_weight_p)

    # Train.
    print('Training for up to {:d} epoch{}...'.format(
        args.epochs, 's' if args.epochs != 1 else ''))
    plateau_monitor = 'val_loss'
    plateau_factor = .5
    early_stopping_monitor = 'val_loss'
    early_stopping_min_delta = 2**-10
    plateau_patience = args.plateau_patience
    early_stopping_patience = args.early_stopping_patience
    callbacks = [
        ReduceLROnPlateau(monitor=plateau_monitor,
                          factor=plateau_factor,
                          patience=plateau_patience),
        EarlyStopping(monitor=early_stopping_monitor,
                      min_delta=early_stopping_min_delta,
                      patience=early_stopping_patience)
    ]
    if args.save_model:
        models_path = folders.ensure(
            os.path.join(folders.MODELS_PATH, classifier_name))
        model_path = os.path.join(models_path, '{}.h5'.format(base_fname))
        model_checkpoint = ModelCheckpoint(model_path,
                                           monitor='val_loss',
                                           save_best_only=True,
                                           mode='min')
        callbacks.append(model_checkpoint)
    else:
        model_path = None
    history = model.fit_generator(train_generator,
                                  epochs=args.epochs,
                                  verbose=0,
                                  callbacks=callbacks,
                                  validation_data=val_generator,
                                  class_weight=class_weight)
    epochs_complete = len(history.history.get('val_loss'))

    # Save the history to visualize loss over time.
    print('Saving training history...')
    history_path = folders.ensure(
        os.path.join(folders.HISTORY_PATH, classifier_name))
    with open(os.path.join(history_path, '{}.txt'.format(base_fname)),
              'w') as fd:
        for key in history.history.keys():
            values = history.history.get(key)
            fd.write('{} {}\n'.format(
                key, ' '.join(str(value) for value in values)))

    # Predict test instances.
    print('Predicting test instances...')
    y_pred_transform = model.predict_generator(test_generator)
    if args.label_mode == shared_parameters.LABEL_MODE_ORDINAL:
        y_pred = ordinal.from_multi_hot_ordinal(y_pred_transform,
                                                threshold=.5)
    elif args.label_mode == shared_parameters.LABEL_MODE_CATEGORICAL:
        y_pred = np.argmax(y_pred_transform, axis=1)
    else:  # args.label_mode == shared_parameters.LABEL_MODE_REGRESSION:
        y_pred = np.maximum(0, np.minimum(k - 1,
                                          np.round(y_pred_transform * k)))

    # Calculate elapsed time.
    end_time = int(time.time())
    elapsed_s = end_time - start_time
    elapsed_m, elapsed_s = elapsed_s // 60, elapsed_s % 60
    elapsed_h, elapsed_m = elapsed_m // 60, elapsed_m % 60

    # Write results.
    print('Writing results...')
    logs_path = folders.ensure(
        os.path.join(folders.LOGS_PATH, classifier_name))
    with open(os.path.join(logs_path, '{}.txt'.format(base_fname)),
              'w') as fd:
        if args.note is not None:
            fd.write('{}\n\n'.format(args.note))
        fd.write('PARAMETERS\n\n')
        fd.write('category_index={:d}\n'.format(args.category_index))
        fd.write('epochs={:d}\n'.format(args.epochs))
        fd.write('\nHYPERPARAMETERS\n')
        fd.write('\nText\n')
        fd.write('subset_ratio={}\n'.format(str(subset_ratio)))
        fd.write('subset_seed={}\n'.format(str(subset_seed)))
        fd.write('min_len={:d}\n'.format(min_len))
        fd.write('max_len={:d}\n'.format(max_len))
        fd.write('min_tokens={:d}\n'.format(min_tokens))
        fd.write('remove_stopwords={}\n'.format(args.remove_stopwords))
        fd.write('\nLabels\n')
        fd.write('categories_mode=\'{}\'\n'.format(categories_mode))
        fd.write('return_overall={}\n'.format(return_overall))
        if args.remove_classes is not None:
            fd.write('remove_classes={}\n'.format(args.remove_classes))
        else:
            fd.write('No classes removed.\n')
        fd.write('class_weight_p={:d}\n'.format(args.class_weight_p))
        fd.write('\nTokenization\n')
        fd.write('max_words={:d}\n'.format(max_words))
        fd.write('n_tokens={:d}\n'.format(n_tokens))
        fd.write('padding=\'{}\'\n'.format(padding))
        fd.write('truncating=\'{}\'\n'.format(truncating))
        fd.write('\nWord Embedding\n')
        fd.write('embedding_path=\'{}\'\n'.format(embedding_path))
        fd.write('embedding_trainable={}\n'.format(args.embedding_trainable))
        fd.write('\nModel\n')
        if args.net_mode == 'rnn' or args.net_mode == 'rnncnn':
            fd.write('rnn={}\n'.format(net_params['rnn'].__name__))
            fd.write('rnn_units={:d}\n'.format(net_params['rnn_units']))
            fd.write('rnn_l2={}\n'.format(str(net_params['rnn_l2'])))
            fd.write('rnn_dense_units={:d}\n'.format(
                net_params['rnn_dense_units']))
            fd.write('rnn_dense_activation=\'{}\'\n'.format(
                net_params['rnn_dense_activation']))
            fd.write('rnn_dense_l2={}\n'.format(
                str(net_params['rnn_dense_l2'])))
            fd.write('rnn_agg={}\n'.format(net_params['rnn_agg']))
        if args.net_mode == 'cnn' or args.net_mode == 'rnncnn':
            fd.write('cnn_filters={:d}\n'.format(net_params['cnn_filters']))
            fd.write('cnn_filter_sizes={}\n'.format(
                str(net_params['cnn_filter_sizes'])))
            fd.write('cnn_activation=\'{}\'\n'.format(
                net_params['cnn_activation']))
            fd.write('cnn_l2={}\n'.format(str(net_params['cnn_l2'])))
        if args.agg_mode == 'rnn':
            fd.write('agg_rnn={}\n'.format(agg_params['rnn'].__name__))
            fd.write('agg_rnn_units={:d}\n'.format(agg_params['rnn_units']))
            fd.write('agg_rnn_l2={}\n'.format(str(agg_params['rnn_l2'])))
        fd.write('book_dense_units={}\n'.format(args.book_dense_units))
        fd.write('book_dense_activation={} {}\n'.format(
            book_dense_activation.__class__.__name__,
            book_dense_activation.__dict__))
        fd.write('book_dense_l2={}\n'.format(str(book_dense_l2)))
        fd.write('book_dropout={}\n'.format(str(book_dropout)))
        model.summary(print_fn=lambda x: fd.write('{}\n'.format(x)))
        fd.write('\nTraining\n')
        fd.write('optimizer={}\n'.format(optimizer.__class__.__name__))
        fd.write('lr={}\n'.format(str(lr)))
        fd.write('loss=\'{}\'\n'.format(loss))
        fd.write('metric=\'{}\'\n'.format(metric))
        fd.write('test_size={}\n'.format(str(test_size)))
        fd.write('test_random_state={:d}\n'.format(test_random_state))
        fd.write('val_size={}\n'.format(str(val_size)))
        fd.write('val_random_state={:d}\n'.format(val_random_state))
        fd.write('plateau_monitor={}\n'.format(plateau_monitor))
        fd.write('plateau_factor={}\n'.format(str(plateau_factor)))
        fd.write('plateau_patience={:d}\n'.format(plateau_patience))
        fd.write('early_stopping_monitor={}\n'.format(
            early_stopping_monitor))
        fd.write('early_stopping_min_delta={}\n'.format(
            str(early_stopping_min_delta)))
        fd.write('early_stopping_patience={:d}\n'.format(
            early_stopping_patience))
        fd.write('\nRESULTS\n\n')
        fd.write('Data size: {:d}\n'.format(len(X)))
        fd.write('Train size: {:d}\n'.format(len(X_train)))
        fd.write('Validation size: {:d}\n'.format(len(X_val)))
        fd.write('Test size: {:d}\n'.format(len(X_test)))
        if model_path is not None:
            fd.write('Model path: \'{}\'\n'.format(model_path))
        else:
            fd.write('Model not saved.\n')
        fd.write('Epochs completed: {:d}\n'.format(epochs_complete))
        fd.write('Time elapsed: {:d}h {:d}m {:d}s\n\n'.format(
            elapsed_h, elapsed_m, elapsed_s))
        evaluation.write_confusion_and_metrics(y_test, y_pred, fd, category)

    # Write predictions.
    print('Writing predictions...')
    predictions_path = folders.ensure(
        os.path.join(folders.PREDICTIONS_PATH, classifier_name))
    with open(os.path.join(predictions_path, '{}.txt'.format(base_fname)),
              'w') as fd:
        evaluation.write_predictions(y_test, y_pred, fd, category)

    print('Done.')
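
# Hedged sketch of the decoding assumed for `ordinal.from_multi_hot_ordinal`
# above: an ordinal prediction arrives as [p(y > 0), p(y > 1), ...], and the
# label is the number of leading probabilities that clear the threshold.
# Stopping at the first sub-threshold entry is one common convention; the
# real module may differ. Illustrative only.
def from_multi_hot_ordinal_sketch(y_transform, threshold=.5):
    y = np.zeros((len(y_transform),), dtype=np.int32)
    for i, probs in enumerate(y_transform):
        label = 0
        for p in probs:
            if p <= threshold:
                break
            label += 1
        y[i] = label
    return y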

def main(argv):
    if len(argv) < 2 or len(argv) > 3:
        raise ValueError('Usage: <steps_per_epoch> <epochs> [note]')
    steps_per_epoch = int(argv[0])
    epochs = int(argv[1])
    note = None
    if len(argv) > 2:
        note = argv[2]

    script_name = os.path.basename(__file__)
    classifier_name = script_name[:script_name.index('.')]
    start_time = int(time.time())
    if 'SLURM_JOB_ID' in os.environ:
        stamp = int(os.environ['SLURM_JOB_ID'])
    else:
        stamp = start_time
    print('Time stamp: {:d}'.format(stamp))
    if note is not None:
        print('Note: {}'.format(note))
        base_fname = '{:d}_{}'.format(stamp, note)
    else:
        base_fname = format(stamp, 'd')

    # Load data.
    print('Loading data...')
    subset_ratio = shared_parameters.DATA_SUBSET_RATIO
    subset_seed = shared_parameters.DATA_SUBSET_SEED
    min_len = shared_parameters.DATA_SENTENCE_MIN_LEN
    max_len = shared_parameters.DATA_SENTENCE_MAX_LEN
    min_tokens = shared_parameters.DATA_MIN_TOKENS
    categories_mode = shared_parameters.DATA_CATEGORIES_MODE
    inputs, Y, categories, category_levels = \
        bookcave.get_data({'sentence_tokens'},
                          subset_ratio=subset_ratio,
                          subset_seed=subset_seed,
                          min_len=min_len,
                          max_len=max_len,
                          min_tokens=min_tokens,
                          categories_mode=categories_mode)
    text_sentence_tokens, text_section_ids, text_paragraph_ids = zip(
        *inputs['sentence_tokens'])
    print('Retrieved {:d} texts.'.format(len(text_sentence_tokens)))

    # Tokenize.
    print('Tokenizing...')
    max_words = shared_parameters.TEXT_MAX_WORDS
    split = '\t'
    tokenizer = Tokenizer(num_words=max_words, split=split)
    all_sentences = []
    for sentence_tokens in text_sentence_tokens:
        for tokens in sentence_tokens:
            all_sentences.append(split.join(tokens))
    tokenizer.fit_on_texts(all_sentences)
    print('Done.')

    # Convert to sequences.
    print('Converting texts to sequences...')
    n_sentences = shared_parameters.TEXT_N_SENTENCES
    n_sentence_tokens = shared_parameters.TEXT_N_SENTENCE_TOKENS
    padding = shared_parameters.TEXT_PADDING
    truncating = shared_parameters.TEXT_TRUNCATING
    text_sentence_sequences = [
        pad_sequences(tokenizer.texts_to_sequences(
            [split.join(tokens) for tokens in sentence_tokens]),
                      maxlen=n_sentence_tokens,
                      padding=padding,
                      truncating=truncating)
        for sentence_tokens in text_sentence_tokens
    ]
    X = []
    for text_i, sentence_sequences in enumerate(text_sentence_sequences):
        section_ids = text_section_ids[text_i]
        paragraph_ids = text_paragraph_ids[text_i]
        n_paragraphs = len(
            np.unique(list(
                zip(text_section_ids[text_i], text_paragraph_ids[text_i])),
                      axis=0))
        # [paragraph_i][sentence_i][token_i]
        x = np.zeros((n_paragraphs, n_sentences, n_sentence_tokens))
        paragraph_i = 0
        sentence_i = 0
        last_section_paragraph_id = None
        for sequence_i, sentence_sequence in enumerate(sentence_sequences):
            section_paragraph_id = (section_ids[sequence_i],
                                    paragraph_ids[sequence_i])
            if last_section_paragraph_id is not None and \
                    section_paragraph_id != last_section_paragraph_id:
                paragraph_i += 1
                sentence_i = 0
            if sentence_i < n_sentences:
                x[paragraph_i, sentence_i] = sentence_sequence
                sentence_i += 1
            last_section_paragraph_id = section_paragraph_id
        X.append(x)
    print('Done.')

    # Load embedding.
    print('Loading embedding matrix...')
    embedding_path = folders.EMBEDDING_GLOVE_300_PATH
    embedding_matrix = load_embeddings.load_embedding(tokenizer,
                                                      embedding_path,
                                                      max_words)
    print('Done.')

    # Create model.
    print('Creating model...')
    category_k = [len(levels) for levels in category_levels]
    embedding_trainable = False
    sent_rnn = CuDNNGRU if tf.test.is_gpu_available(cuda_only=True) else GRU
    sent_rnn_units = 128
    sent_rnn_l2 = .01
    sent_dense_units = 64
    sent_dense_activation = 'elu'
    sent_dense_l2 = .01
    para_rnn = CuDNNGRU if tf.test.is_gpu_available(cuda_only=True) else GRU
    para_rnn_units = 128
    para_rnn_l2 = .01
    para_dense_units = 64
    para_dense_activation = 'elu'
    para_dense_l2 = .01
    book_dense_units = 128
    book_dense_activation = tf.keras.layers.LeakyReLU(alpha=.1)
    book_dense_l2 = .01
    book_dropout = .5
    label_mode = shared_parameters.LABEL_MODE_ORDINAL
    sentence_encoder, paragraph_encoder, model = create_model(
        n_sentences, n_sentence_tokens, embedding_matrix,
        embedding_trainable, sent_rnn, sent_rnn_units, sent_rnn_l2,
        sent_dense_units, sent_dense_activation, sent_dense_l2, para_rnn,
        para_rnn_units, para_rnn_l2, para_dense_units,
        para_dense_activation, para_dense_l2, book_dense_units,
        book_dense_activation, book_dense_l2, book_dropout, category_k,
        categories, label_mode)
    lr = 2**-16
    optimizer = Adam(lr=lr)
    if label_mode == shared_parameters.LABEL_MODE_ORDINAL:
        loss = 'binary_crossentropy'
        metric = 'binary_accuracy'
    elif label_mode == shared_parameters.LABEL_MODE_CATEGORICAL:
        loss = 'categorical_crossentropy'
        metric = 'categorical_accuracy'
    elif label_mode == shared_parameters.LABEL_MODE_REGRESSION:
        loss = 'mse'
        metric = 'accuracy'
    else:
        raise ValueError(
            'Unknown value for `label_mode`: {}'.format(label_mode))
    model.compile(optimizer, loss=loss, metrics=[metric])
    print('Done.')

    # Split data set.
    test_size = shared_parameters.EVAL_TEST_SIZE  # b
    test_random_state = shared_parameters.EVAL_TEST_RANDOM_STATE
    val_size = shared_parameters.EVAL_VAL_SIZE  # v
    val_random_state = shared_parameters.EVAL_VAL_RANDOM_STATE
    Y_T = Y.transpose()  # (n, c)
    X_train, X_test, Y_train_T, Y_test_T = \
        train_test_split(X, Y_T, test_size=test_size,
                         random_state=test_random_state)
    X_train, X_val, Y_train_T, Y_val_T = \
        train_test_split(X_train, Y_train_T, test_size=val_size,
                         random_state=val_random_state)
    Y_train = Y_train_T.transpose()  # (c, n * (1 - b) * (1 - v))
    Y_val = Y_val_T.transpose()  # (c, n * (1 - b) * v)
    Y_test = Y_test_T.transpose()  # (c, n * b)

    # Transform labels based on the label mode.
    Y_train = shared_parameters.transform_labels(Y_train, category_k,
                                                 label_mode)
    Y_val = shared_parameters.transform_labels(Y_val, category_k, label_mode)

    # Calculate class weights.
    use_class_weights = True
    class_weight_f = 'inverse'
    if use_class_weights:
        category_class_weights = shared_parameters.get_category_class_weights(
            Y_train, label_mode, f=class_weight_f)
    else:
        category_class_weights = None

    # Create generators.
    shuffle = True
    train_generator = SingleInstanceBatchGenerator(X_train,
                                                   Y_train,
                                                   shuffle=shuffle)
    val_generator = SingleInstanceBatchGenerator(X_val, Y_val, shuffle=False)
    test_generator = SingleInstanceBatchGenerator(X_test,
                                                  Y_test,
                                                  shuffle=False)

    # Train.
    plateau_monitor = 'val_loss'
    plateau_factor = .5
    plateau_patience = 3
    early_stopping_monitor = 'val_loss'
    early_stopping_min_delta = 2**-10
    early_stopping_patience = 6
    callbacks = [
        ReduceLROnPlateau(monitor=plateau_monitor,
                          factor=plateau_factor,
                          patience=plateau_patience),
        EarlyStopping(monitor=early_stopping_monitor,
                      min_delta=early_stopping_min_delta,
                      patience=early_stopping_patience)
    ]
    history = model.fit_generator(
        train_generator,
        steps_per_epoch=steps_per_epoch if steps_per_epoch > 0 else None,
        epochs=epochs,
        validation_data=val_generator,
        class_weight=category_class_weights,
        callbacks=callbacks)

    # Save the history to visualize loss over time.
    print('Saving training history...')
    if not os.path.exists(folders.HISTORY_PATH):
        os.mkdir(folders.HISTORY_PATH)
    history_path = os.path.join(folders.HISTORY_PATH, classifier_name)
    if not os.path.exists(history_path):
        os.mkdir(history_path)
    with open(os.path.join(history_path, '{}.txt'.format(base_fname)),
              'w') as fd:
        for key in history.history.keys():
            values = history.history.get(key)
            fd.write('{} {}\n'.format(
                key, ' '.join(str(value) for value in values)))
    print('Done.')

    # Predict test instances.
    print('Predicting test instances...')
    Y_pred = model.predict_generator(test_generator)
    if label_mode == shared_parameters.LABEL_MODE_ORDINAL:
        Y_pred = [
            ordinal.from_multi_hot_ordinal(y, threshold=.5) for y in Y_pred
        ]
    elif label_mode == shared_parameters.LABEL_MODE_CATEGORICAL:
        Y_pred = [np.argmax(y, axis=1) for y in Y_pred]
    elif label_mode == shared_parameters.LABEL_MODE_REGRESSION:
        Y_pred = [
            np.maximum(0, np.minimum(k - 1, np.round(Y_pred[i] * k)))
            for i, k in enumerate(category_k)
        ]
    else:
        raise ValueError(
            'Unknown value for `label_mode`: {}'.format(label_mode))
    print('Done.')

    # Save model.
    save_model = False
    if save_model:
        models_path = os.path.join(folders.MODELS_PATH, classifier_name)
        label_mode_path = os.path.join(models_path, label_mode)
        model_path = os.path.join(label_mode_path, '{}.h5'.format(base_fname))
        print('Saving model to `{}`...'.format(model_path))
        if not os.path.exists(folders.MODELS_PATH):
            os.mkdir(folders.MODELS_PATH)
        if not os.path.exists(models_path):
            os.mkdir(models_path)
        if not os.path.exists(label_mode_path):
            os.mkdir(label_mode_path)
        model.save(model_path)
        print('Done.')
    else:
        model_path = None

    # Calculate elapsed time.
    end_time = int(time.time())
    elapsed_s = end_time - start_time
    elapsed_m, elapsed_s = elapsed_s // 60, elapsed_s % 60
    elapsed_h, elapsed_m = elapsed_m // 60, elapsed_m % 60

    # Write results.
    print('Writing results...')
    if not os.path.exists(folders.LOGS_PATH):
        os.mkdir(folders.LOGS_PATH)
    logs_path = os.path.join(folders.LOGS_PATH, classifier_name)
    if not os.path.exists(logs_path):
        os.mkdir(logs_path)
    with open(os.path.join(logs_path, '{}.txt'.format(base_fname)),
              'w') as fd:
        if note is not None:
            fd.write('Note: {}\n\n'.format(note))
        fd.write('PARAMETERS\n\n')
        fd.write('steps_per_epoch={:d}\n'.format(steps_per_epoch))
        fd.write('epochs={:d}\n'.format(epochs))
        fd.write('\nHYPERPARAMETERS\n')
        fd.write('\nText\n')
        fd.write('subset_ratio={}\n'.format(str(subset_ratio)))
        fd.write('subset_seed={}\n'.format(str(subset_seed)))
        fd.write('min_len={:d}\n'.format(min_len))
        fd.write('max_len={:d}\n'.format(max_len))
        fd.write('min_tokens={:d}\n'.format(min_tokens))
        fd.write('\nLabels\n')
        fd.write('categories_mode=\'{}\'\n'.format(categories_mode))
        fd.write('\nTokenization\n')
        fd.write('max_words={:d}\n'.format(max_words))
        fd.write('n_sentences={:d}\n'.format(n_sentences))
        fd.write('n_sentence_tokens={:d}\n'.format(n_sentence_tokens))
        fd.write('padding=\'{}\'\n'.format(padding))
        fd.write('truncating=\'{}\'\n'.format(truncating))
        fd.write('\nWord Embedding\n')
        fd.write('embedding_path=\'{}\'\n'.format(embedding_path))
        fd.write('embedding_trainable={}\n'.format(embedding_trainable))
        fd.write('\nModel\n')
        fd.write('sent_rnn={}\n'.format(sent_rnn.__name__))
        fd.write('sent_rnn_units={:d}\n'.format(sent_rnn_units))
        fd.write('sent_rnn_l2={}\n'.format(str(sent_rnn_l2)))
        fd.write('sent_dense_units={:d}\n'.format(sent_dense_units))
        fd.write('sent_dense_activation=\'{}\'\n'.format(
            sent_dense_activation))
        fd.write('sent_dense_l2={}\n'.format(str(sent_dense_l2)))
        fd.write('para_rnn={}\n'.format(para_rnn.__name__))
        fd.write('para_rnn_units={:d}\n'.format(para_rnn_units))
        fd.write('para_rnn_l2={}\n'.format(str(para_rnn_l2)))
        fd.write('para_dense_units={:d}\n'.format(para_dense_units))
        fd.write('para_dense_activation=\'{}\'\n'.format(
            para_dense_activation))
        fd.write('para_dense_l2={}\n'.format(str(para_dense_l2)))
        fd.write('book_dense_units={:d}\n'.format(book_dense_units))
        fd.write('book_dense_activation={} {}\n'.format(
            book_dense_activation.__class__.__name__,
            book_dense_activation.__dict__))
        fd.write('book_dense_l2={}\n'.format(str(book_dense_l2)))
        fd.write('book_dropout={:.1f}\n'.format(book_dropout))
        fd.write('label_mode={}\n'.format(label_mode))
        model.summary(print_fn=lambda x: fd.write('{}\n'.format(x)))
        fd.write('\nTraining\n')
        fd.write('optimizer={}\n'.format(optimizer.__class__.__name__))
        fd.write('lr={}\n'.format(str(lr)))
        fd.write('loss=\'{}\'\n'.format(loss))
        fd.write('metric=\'{}\'\n'.format(metric))
        fd.write('test_size={:.2f}\n'.format(test_size))
        fd.write('test_random_state={:d}\n'.format(test_random_state))
        fd.write('val_size={:.2f}\n'.format(val_size))
        fd.write('val_random_state={:d}\n'.format(val_random_state))
        fd.write('use_class_weights={}\n'.format(use_class_weights))
        if use_class_weights:
            fd.write('class_weight_f={}\n'.format(class_weight_f))
        fd.write('shuffle={}\n'.format(shuffle))
        fd.write('plateau_monitor={}\n'.format(plateau_monitor))
        fd.write('plateau_factor={}\n'.format(str(plateau_factor)))
        fd.write('plateau_patience={:d}\n'.format(plateau_patience))
        fd.write('early_stopping_monitor={}\n'.format(
            early_stopping_monitor))
        fd.write('early_stopping_min_delta={}\n'.format(
            str(early_stopping_min_delta)))
        fd.write('early_stopping_patience={:d}\n'.format(
            early_stopping_patience))
        fd.write('\nRESULTS\n\n')
        fd.write('Data size: {:d}\n'.format(len(X)))
        fd.write('Train size: {:d}\n'.format(len(X_train)))
        fd.write('Validation size: {:d}\n'.format(len(X_val)))
        fd.write('Test size: {:d}\n'.format(len(X_test)))
        if save_model:
            fd.write('Model path: \'{}\'\n'.format(model_path))
        else:
            fd.write('Model not saved.\n')
        fd.write('Time elapsed: {:d}h {:d}m {:d}s\n\n'.format(
            elapsed_h, elapsed_m, elapsed_s))
        evaluation.write_confusion_and_metrics(Y_test, Y_pred, fd, categories)

    if not os.path.exists(folders.PREDICTIONS_PATH):
        os.mkdir(folders.PREDICTIONS_PATH)
    predictions_path = os.path.join(folders.PREDICTIONS_PATH,
                                    classifier_name)
    if not os.path.exists(predictions_path):
        os.mkdir(predictions_path)
    with open(os.path.join(predictions_path, '{}.txt'.format(base_fname)),
              'w') as fd:
        evaluation.write_predictions(Y_test, Y_pred, fd, categories)

    print('Done.')
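
# Hedged sketch of `SingleInstanceBatchGenerator` as used above: because each
# book has a different number of paragraphs, instances cannot be stacked into
# one tensor, so every batch contains exactly one book. The real class is
# defined elsewhere in this repo; `Y` is assumed to be a per-category list of
# label arrays, matching the multi-output model. Illustrative only.
class SingleInstanceBatchGeneratorSketch(tf.keras.utils.Sequence):

    def __init__(self, X, Y, shuffle=True):
        self.X = X
        self.Y = Y
        self.shuffle = shuffle
        self.indices = np.arange(len(X))

    def __len__(self):
        return len(self.X)

    def __getitem__(self, i):
        index = self.indices[i]
        # Prepend a batch axis of size 1 to the single instance.
        x = np.expand_dims(self.X[index], axis=0)
        y = [np.expand_dims(y_j[index], axis=0) for y_j in self.Y]
        return x, y

    def on_epoch_end(self):
        if self.shuffle:
            np.random.shuffle(self.indices)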

def main():
    parser = ArgumentParser(
        description='Classify the maturity level of a book by its paragraphs.',
        formatter_class=RawTextHelpFormatter)
    parser.add_argument('classifier_name',
                        help='The name of the classifier.')
    parser.add_argument('model_file_name',
                        help='The file name of the model to load.')
    parser.add_argument('window',
                        type=int,
                        help='The paragraph window size.')
    args = parser.parse_args()

    source_mode = 'paragraph'
    remove_stopwords = False
    start_time = int(time.time())
    model_file_base_name = \
        args.model_file_name[:args.model_file_name.rindex('.')]
    category_index = int(model_file_base_name[-1])
    base_fname = '{}_{:d}w'.format(model_file_base_name, args.window)

    # Load data.
    print('Retrieving texts...')
    source = 'paragraph_tokens'
    min_len = shared_parameters.DATA_PARAGRAPH_MIN_LEN
    max_len = shared_parameters.DATA_PARAGRAPH_MAX_LEN
    subset_ratio = shared_parameters.DATA_SUBSET_RATIO
    subset_seed = shared_parameters.DATA_SUBSET_SEED
    min_tokens = shared_parameters.DATA_MIN_TOKENS
    categories_mode = shared_parameters.DATA_CATEGORIES_MODE
    return_overall = shared_parameters.DATA_RETURN_OVERALL
    inputs, Y, categories, category_levels = \
        bookcave.get_data({source},
                          subset_ratio=subset_ratio,
                          subset_seed=subset_seed,
                          min_len=min_len,
                          max_len=max_len,
                          min_tokens=min_tokens,
                          remove_stopwords=remove_stopwords,
                          categories_mode=categories_mode,
                          return_overall=return_overall)
    text_source_tokens = list(zip(*inputs[source]))[0]
    print('Retrieved {:d} texts.'.format(len(text_source_tokens)))

    # Reduce labels to the specified category.
    y = Y[category_index]
    category = categories[category_index]

    # Tokenize.
    print('Tokenizing...')
    max_words = shared_parameters.TEXT_MAX_WORDS
    split = '\t'
    tokenizer = tokenizers.get_tokenizer_or_fit(
        max_words,
        source_mode,
        remove_stopwords,
        text_source_tokens=text_source_tokens)

    # Convert to sequences.
    print('Converting texts to sequences...')
    n_tokens = shared_parameters.TEXT_N_PARAGRAPH_TOKENS
    padding = shared_parameters.TEXT_PADDING
    truncating = shared_parameters.TEXT_TRUNCATING
    X = [
        np.array(
            pad_sequences(tokenizer.texts_to_sequences(
                [split.join(tokens) for tokens in source_tokens]),
                          maxlen=n_tokens,
                          padding=padding,
                          truncating=truncating))
        for source_tokens in text_source_tokens
    ]

    # Load model.
    print('Loading model...')
    model_path = os.path.join(folders.MODELS_PATH, args.classifier_name,
                              args.model_file_name)
    if 'rnn' in args.classifier_name:
        # Since `keras` was used with the custom layer, we have to reload it
        # with `keras`.
        # https://github.com/keras-team/keras/issues/10907
        custom_objects = {'AttentionWithContext': AttentionWithContext}
        model = keras.models.load_model(model_path,
                                        custom_objects=custom_objects)
    else:
        model = tf.keras.models.load_model(model_path)

    # Split data set.
    print('Splitting data set...')
    test_size = shared_parameters.EVAL_TEST_SIZE  # b
    test_random_state = shared_parameters.EVAL_TEST_RANDOM_STATE
    _, X_test, _, y_test = \
        train_test_split(X, y, test_size=test_size,
                         random_state=test_random_state)

    # Predict instances.
    print('Predicting labels...')
    y_pred = np.zeros((len(X_test),), dtype=np.int32)
    for i, x in enumerate(X_test):
        # Slide a window of `args.window` consecutive paragraphs over the book.
        P = np.zeros((len(x) - args.window + 1, args.window, *x.shape[1:]))
        for w in range(len(P)):
            P[w] = x[w:w + args.window]
        q_pred_transform = model.predict(P)
        q_pred = ordinal.from_multi_hot_ordinal(q_pred_transform,
                                                threshold=.5)
        # The book's label is the maximum label over all of its windows.
        label_pred = max(q_pred)
        y_pred[i] = label_pred

    # Calculate elapsed time.
    end_time = int(time.time())
    elapsed_s = end_time - start_time
    elapsed_m, elapsed_s = elapsed_s // 60, elapsed_s % 60
    elapsed_h, elapsed_m = elapsed_m // 60, elapsed_m % 60

    # Write results.
    print('Writing results...')
    logs_path = folders.ensure(
        os.path.join(folders.LOGS_PATH, args.classifier_name))
    with open(os.path.join(logs_path, '{}.txt'.format(base_fname)),
              'w') as fd:
        fd.write('PARAMETERS\n\n')
        fd.write('classifier_name={}\n'.format(args.classifier_name))
        fd.write('model_file_name={}\n'.format(args.model_file_name))
        fd.write('window={:d}\n'.format(args.window))
        fd.write('\nHYPERPARAMETERS\n')
        fd.write('\nText\n')
        fd.write('subset_ratio={}\n'.format(str(subset_ratio)))
        fd.write('subset_seed={}\n'.format(str(subset_seed)))
        fd.write('min_len={:d}\n'.format(min_len))
        fd.write('max_len={:d}\n'.format(max_len))
        fd.write('min_tokens={:d}\n'.format(min_tokens))
        fd.write('remove_stopwords={}\n'.format(remove_stopwords))
        fd.write('\nLabels\n')
        fd.write('categories_mode=\'{}\'\n'.format(categories_mode))
        fd.write('return_overall={}\n'.format(return_overall))
        fd.write('\nTokenization\n')
        fd.write('max_words={:d}\n'.format(max_words))
        fd.write('n_tokens={:d}\n'.format(n_tokens))
        fd.write('padding=\'{}\'\n'.format(padding))
        fd.write('truncating=\'{}\'\n'.format(truncating))
        fd.write('\nRESULTS\n\n')
        fd.write('Data size: {:d}\n'.format(len(X)))
        fd.write('Test size: {:d}\n'.format(len(X_test)))
        fd.write('Time elapsed: {:d}h {:d}m {:d}s\n\n'.format(
            elapsed_h, elapsed_m, elapsed_s))
        evaluation.write_confusion_and_metrics(y_test, y_pred, fd, category)

    # Write predictions.
    print('Writing predictions...')
    predictions_path = folders.ensure(
        os.path.join(folders.PREDICTIONS_PATH, args.classifier_name))
    with open(os.path.join(predictions_path, '{}.txt'.format(base_fname)),
              'w') as fd:
        evaluation.write_predictions(y_test, y_pred, fd, category)

    print('Done.')
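
# The window loop above assumes every test book has at least `window`
# paragraphs; for a shorter book, `len(x) - window + 1` is non-positive and
# `P` would be empty. A hedged variant that pads short books so at least one
# window exists (illustrative; not part of this script):
def make_windows_sketch(x, window):
    if len(x) < window:
        # Pad with all-zero paragraphs, which the model treats as padding.
        pad = np.zeros((window - len(x), *x.shape[1:]), dtype=x.dtype)
        x = np.concatenate([x, pad], axis=0)
    P = np.zeros((len(x) - window + 1, window, *x.shape[1:]), dtype=x.dtype)
    for w in range(len(P)):
        P[w] = x[w:w + window]
    return P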

def main():
    script_name = os.path.basename(__file__)
    classifier_name = script_name[:script_name.rindex('.')]
    start_time = int(time.time())
    if 'SLURM_JOB_ID' in os.environ:
        stamp = int(os.environ['SLURM_JOB_ID'])
    else:
        stamp = start_time

    # Load data.
    print('Retrieving labels...')
    subset_ratio = shared_parameters.DATA_SUBSET_RATIO
    subset_seed = shared_parameters.DATA_SUBSET_SEED
    min_len = shared_parameters.DATA_PARAGRAPH_MIN_LEN
    max_len = shared_parameters.DATA_PARAGRAPH_MAX_LEN
    min_tokens = shared_parameters.DATA_MIN_TOKENS
    categories_mode = shared_parameters.DATA_CATEGORIES_MODE
    return_overall = shared_parameters.DATA_RETURN_OVERALL
    _, Y, categories, category_levels = \
        bookcave.get_data({'paragraph_tokens'},
                          subset_ratio=subset_ratio,
                          subset_seed=subset_seed,
                          min_len=min_len,
                          max_len=max_len,
                          min_tokens=min_tokens,
                          categories_mode=categories_mode,
                          return_overall=return_overall)
    print('Retrieved {:d} labels.'.format(Y.shape[1]))

    # Split data set.
    test_size = shared_parameters.EVAL_TEST_SIZE  # b
    test_random_state = shared_parameters.EVAL_TEST_RANDOM_STATE
    Y_T = Y.transpose()  # (n, c)
    Y_train_T, Y_test_T = train_test_split(Y_T,
                                           test_size=test_size,
                                           random_state=test_random_state)
    Y_train = Y_train_T.transpose()  # (c, n * (1 - b))
    Y_test = Y_test_T.transpose()  # (c, n * b)

    for j, category in enumerate(categories):
        levels = category_levels[j]
        y_train = Y_train[j]
        y_test = Y_test[j]
        # Predict the most common class seen in the training data.
        y_pred = [np.argmax(np.bincount(y_train, minlength=len(levels)))
                  ] * len(y_test)
        base_fname = '{:d}_{:d}'.format(stamp, j)
        logs_path = folders.ensure(
            os.path.join(folders.LOGS_PATH, classifier_name))
        with open(os.path.join(logs_path, '{}.txt'.format(base_fname)),
                  'w') as fd:
            fd.write('HYPERPARAMETERS\n')
            fd.write('\nText\n')
            fd.write('subset_ratio={}\n'.format(str(subset_ratio)))
            fd.write('subset_seed={}\n'.format(str(subset_seed)))
            fd.write('min_len={:d}\n'.format(min_len))
            fd.write('max_len={:d}\n'.format(max_len))
            fd.write('min_tokens={:d}\n'.format(min_tokens))
            fd.write('\nLabels\n')
            fd.write('categories_mode=\'{}\'\n'.format(categories_mode))
            fd.write('return_overall={}\n'.format(return_overall))
            fd.write('\nTraining\n')
            fd.write('test_size={}\n'.format(str(test_size)))
            fd.write('test_random_state={:d}\n'.format(test_random_state))
            fd.write('\nRESULTS\n\n')
            fd.write('Data size: {:d}\n'.format(Y.shape[1]))
            fd.write('Train size: {:d}\n'.format(Y_train.shape[1]))
            fd.write('Test size: {:d}\n'.format(Y_test.shape[1]))
            fd.write('\n')
            evaluation.write_confusion_and_metrics(y_test, y_pred, fd,
                                                   category)
        predictions_path = folders.ensure(
            os.path.join(folders.PREDICTIONS_PATH, classifier_name))
        with open(os.path.join(predictions_path,
                               '{}.txt'.format(base_fname)), 'w') as fd:
            evaluation.write_predictions(y_test, y_pred, fd, category)

    print('Done.')
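
# Hedged sketch of what `evaluation.write_confusion_and_metrics`, used
# throughout these scripts, is assumed to write: a confusion matrix plus a
# few standard metrics per category. The real module may report more; the
# sklearn calls below are standard.
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

def write_confusion_and_metrics_sketch(y_test, y_pred, fd, category):
    fd.write('Confusion matrix for `{}`:\n{}\n'.format(
        category, confusion_matrix(y_test, y_pred)))
    fd.write('accuracy={:.4f}\n'.format(accuracy_score(y_test, y_pred)))
    fd.write('f1_macro={:.4f}\n'.format(
        f1_score(y_test, y_pred, average='macro')))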