def get_run_components(run_dir):
    # Load args
    config = utils.load_json(os.path.join(run_dir, 'config.json'))
    args = Namespace(**config)

    # Load tokenizers
    X_tokenizer = data.Tokenizer.load(
        fp=os.path.join(run_dir, 'X_tokenizer.json'))
    y_tokenizer = data.LabelEncoder.load(
        fp=os.path.join(run_dir, 'y_tokenizer.json'))

    # Load model
    model = models.TextCNN(embedding_dim=args.embedding_dim,
                           vocab_size=len(X_tokenizer) + 1,
                           num_filters=args.num_filters,
                           filter_sizes=args.filter_sizes,
                           hidden_dim=args.hidden_dim,
                           dropout_p=args.dropout_p,
                           num_classes=len(y_tokenizer.classes))
    model.load_state_dict(torch.load(os.path.join(run_dir, 'model.pt')))
    device = torch.device('cuda' if (
        torch.cuda.is_available() and args.cuda) else 'cpu')
    model = model.to(device)

    return args, model, X_tokenizer, y_tokenizer
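For orientation, here is a minimal usage sketch of this loader. The run directory path and the sample sentence are hypothetical stand-ins, not artifacts from any actual run.

args, model, X_tokenizer, y_tokenizer = get_run_components(
    run_dir='experiments/run_1')  # hypothetical run directory
model.eval()  # inference mode: disables dropout
sample = X_tokenizer.texts_to_sequences(['the gold medal was won in the final lap'])
print(len(sample[0]), len(y_tokenizer.classes))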
def predict(experiment_id, text):
    """Predict the class for a text using a trained model from an experiment."""
    # Get experiment config
    experiment_dir = os.path.join(config.EXPERIMENTS_DIR, experiment_id)
    experiment_config = utilities.load_json(
        os.path.join(experiment_dir, 'config.json'))
    args = Namespace(**experiment_config)

    # Preprocess
    texts = [text]
    X_tokenizer = data.Tokenizer.load(
        fp=os.path.join(experiment_dir, 'X_tokenizer.json'))
    y_tokenizer = data.LabelEncoder.load(
        fp=os.path.join(experiment_dir, 'y_tokenizer.json'))
    preprocessed_texts = data.preprocess_texts(
        texts, lower=args.lower, filters=args.filters)

    # Create dataset
    X_infer = np.array(X_tokenizer.texts_to_sequences(preprocessed_texts))
    y_filler = np.array([0] * len(X_infer))
    infer_set = data.TextDataset(
        X=X_infer, y=y_filler, batch_size=args.batch_size,
        max_filter_size=max(args.filter_sizes))

    # Load model
    model = models.TextCNN(
        embedding_dim=args.embedding_dim,
        vocab_size=len(X_tokenizer),
        num_filters=args.num_filters,
        filter_sizes=args.filter_sizes,
        hidden_dim=args.hidden_dim,
        dropout_p=args.dropout_p,
        num_classes=len(y_tokenizer.classes))
    model.load_state_dict(torch.load(os.path.join(experiment_dir, 'model.h5')))
    device = torch.device('cuda' if (
        torch.cuda.is_available() and args.cuda) else 'cpu')
    model = model.to(device)

    # Predict
    results = []
    y_prob, conv_outputs = predict_step(
        model=model, dataset=infer_set,
        filter_sizes=args.filter_sizes, device=device)
    for index in range(len(X_infer)):
        results.append({
            'raw_input': texts[index],
            'preprocessed_input': X_tokenizer.sequences_to_texts([X_infer[index]])[0],
            'probabilities': get_probability_distribution(y_prob[index], y_tokenizer.classes),
            'top_n_grams': get_top_n_grams(
                tokens=preprocessed_texts[index].split(' '),
                conv_outputs={k: v[index] for k, v in conv_outputs.items()},
                filter_sizes=args.filter_sizes)})

    return results
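A hedged example of calling this predict function; the experiment id and the input text are made up for illustration, and the keys mirror the result dictionary built above.

results = predict(experiment_id='TextCNN_2020-05-01-12:00:00',  # made-up id
                  text='What a great touchdown pass!')
print(results[0]['probabilities'])  # class → probability, per get_probability_distribution
print(results[0]['top_n_grams'])    # most influential n-grams per filter size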
def get_run_components(run_dir):
    # Load args
    config = utils.load_json(os.path.join(run_dir, 'config.json'))
    args = Namespace(**config)

    # Load tokenizers
    with open(os.path.join(run_dir, 'X_tokenizer.json'), 'r') as fp:
        X_tokenizer = tokenizer_from_json(json.load(fp))
    y_tokenizer = LabelEncoder()
    y_tokenizer.classes_ = np.load(os.path.join(run_dir, 'y_tokenizer.npy'),
                                   allow_pickle=True)

    # Load model
    model = models.TextCNN(embedding_dim=args.embedding_dim,
                           vocab_size=len(X_tokenizer.word_index) + 1,
                           num_filters=args.num_filters,
                           filter_sizes=args.filter_sizes,
                           hidden_dim=args.hidden_dim,
                           dropout_p=args.dropout_p,
                           num_classes=len(y_tokenizer.classes_))
    model.summary(input_shape=(10,))  # build it
    model_path = os.path.join(run_dir, 'model/cp.ckpt')
    model.load_weights(model_path)

    # Conv output model
    conv_outputs_model = models.ConvOutputsModel(
        vocab_size=len(X_tokenizer.word_index) + 1,
        embedding_dim=args.embedding_dim,
        filter_sizes=args.filter_sizes,
        num_filters=args.num_filters)
    conv_outputs_model.summary(input_shape=(10,))  # build it

    # Set weights
    conv_outputs_model.layers[0].set_weights(model.layers[0].get_weights())
    conv_layer_start_num = 1
    for layer_num in range(conv_layer_start_num,
                           conv_layer_start_num + len(args.filter_sizes)):
        conv_outputs_model.layers[layer_num].set_weights(
            model.layers[layer_num].get_weights())

    return args, model, conv_outputs_model, X_tokenizer, y_tokenizer
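Since the conv-outputs branch is only meaningful if the copied weights match, here is a quick sanity check one might run after loading. It assumes, as the loop above implies, that layers 1 through len(filter_sizes) are the conv layers, and the run directory path is hypothetical.

import numpy as np

args, model, conv_outputs_model, X_tokenizer, y_tokenizer = get_run_components(
    run_dir='experiments/run_1')  # hypothetical run directory
for layer_num in range(1, 1 + len(args.filter_sizes)):
    for w_model, w_conv in zip(model.layers[layer_num].get_weights(),
                               conv_outputs_model.layers[layer_num].get_weights()):
        assert np.allclose(w_model, w_conv)  # copied conv weights must match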
embeddings_file = os.path.join(config.EMBEDDINGS_DIR,
                               f'glove.6B.{args.embedding_dim}d.txt')
glove_embeddings = utils.load_glove_embeddings(
    embeddings_file=embeddings_file)
embedding_matrix = utils.make_embeddings_matrix(
    embeddings=glove_embeddings,
    token_to_index=X_tokenizer.token_to_index,
    embedding_dim=args.embedding_dim)
config.logger.info("→ GloVe Embeddings:\n"
                   f"{embedding_matrix.shape}")

# Initialize model
model = models.TextCNN(embedding_dim=args.embedding_dim,
                       vocab_size=len(X_tokenizer),
                       num_filters=args.num_filters,
                       filter_sizes=args.filter_sizes,
                       hidden_dim=args.hidden_dim,
                       dropout_p=args.dropout_p,
                       num_classes=len(y_tokenizer.classes),
                       pretrained_embeddings=embedding_matrix,
                       freeze_embeddings=args.freeze_embeddings)
model = model.to(device)
config.logger.info("→ Model:\n"
                   f"  {model.named_parameters}")

# Define optimizer & scheduler
optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.1, patience=3)

# Model dir
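The training loop itself is elided in this excerpt. A minimal sketch of how this optimizer/scheduler pair is typically driven follows; train_step, val_step, train_set, val_set, and args.num_epochs are assumed names, not defined above.

for epoch in range(args.num_epochs):
    train_loss = train_step(model=model, dataset=train_set,
                            optimizer=optimizer, device=device)
    val_loss = val_step(model=model, dataset=val_set, device=device)
    scheduler.step(val_loss)  # mode='min': cut LR by 10x after 3 stale epochs
    config.logger.info(
        f"Epoch {epoch}: train_loss={train_loss:.3f}, val_loss={val_loss:.3f}, "
        f"lr={optimizer.param_groups[0]['lr']:.2E}")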
"Embedding dim must be in (50, 100, 200, 300) is using GloVe.") embeddings_file = os.path.join(config.EMBEDDINGS_DIR, f'glove.6B.{args.embedding_dim}d.txt') glove_embeddings = utils.load_glove_embeddings( embeddings_file=embeddings_file) embedding_matrix = utils.make_embeddings_matrix( embeddings=glove_embeddings, token_to_index=X_tokenizer.word_index, embedding_dim=args.embedding_dim) config.logger.info("→ Embeddings:\n" f"{embedding_matrix.shape}") # Initialize model model = models.TextCNN(vocab_size=vocab_size, embedding_dim=args.embedding_dim, filter_sizes=args.filter_sizes, num_filters=args.num_filters, hidden_dim=args.hidden_dim, dropout_p=args.dropout_p, num_classes=len(y_tokenizer.classes_), freeze_embeddings=args.freeze_embeddings) model.summary(input_shape=(10, )) # build it # Set GloVe embeddings if args.use_glove: model.layers[0].set_weights([embedding_matrix]) # Model dir experiment_id = f'TextCNN_{datetime.now().strftime("%Y-%m-%d-%H:%M:%S")}' experiment_dir = os.path.join(config.EXPERIMENTS_DIR, experiment_id) utilities.create_dirs(dirpath=experiment_dir) model_path = os.path.join(experiment_dir, 'model/cp.ckpt')
def predict(experiment_id, text):
    """Predict the class for a text using a trained model from an experiment."""
    # Get experiment config
    experiment_dir = os.path.join(config.EXPERIMENTS_DIR, experiment_id)
    experiment_config = utilities.load_json(
        os.path.join(experiment_dir, 'config.json'))
    args = Namespace(**experiment_config)

    # Tokenizers
    texts = [text]
    with open(os.path.join(experiment_dir, 'X_tokenizer.json'), 'r') as fp:
        X_tokenizer = tokenizer_from_json(json.load(fp))
    y_tokenizer = LabelEncoder()
    y_tokenizer.classes_ = np.load(os.path.join(experiment_dir, 'y_tokenizer.npy'),
                                   allow_pickle=True)

    # Create dataset generator
    X_infer = np.array(X_tokenizer.texts_to_sequences(texts))
    preprocessed_texts = X_tokenizer.sequences_to_texts(X_infer)
    y_filler = np.array([0] * len(X_infer))
    inference_generator = data.DataGenerator(X=X_infer, y=y_filler,
                                             batch_size=args.batch_size,
                                             max_filter_size=max(args.filter_sizes))

    # Load model
    model = models.TextCNN(embedding_dim=args.embedding_dim,
                           vocab_size=len(X_tokenizer.word_index) + 1,
                           num_filters=args.num_filters,
                           filter_sizes=args.filter_sizes,
                           hidden_dim=args.hidden_dim,
                           dropout_p=args.dropout_p,
                           num_classes=len(y_tokenizer.classes_))
    model.summary(input_shape=(10,))  # build it
    model_path = os.path.join(experiment_dir, 'model/cp.ckpt')
    model.load_weights(model_path)

    # Conv output model
    conv_outputs_model = models.ConvOutputsModel(
        vocab_size=len(X_tokenizer.word_index) + 1,
        embedding_dim=args.embedding_dim,
        filter_sizes=args.filter_sizes,
        num_filters=args.num_filters)
    conv_outputs_model.summary(input_shape=(10,))  # build it

    # Set weights
    conv_outputs_model.layers[0].set_weights(model.layers[0].get_weights())
    conv_layer_start_num = 1
    for layer_num in range(conv_layer_start_num,
                           conv_layer_start_num + len(args.filter_sizes)):
        conv_outputs_model.layers[layer_num].set_weights(
            model.layers[layer_num].get_weights())

    # Predict
    results = []
    y_prob = model.predict(x=inference_generator, verbose=1)
    conv_outputs = conv_outputs_model.predict(x=inference_generator, verbose=1)
    for index in range(len(X_infer)):
        results.append({
            'raw_input': texts[index],
            'preprocessed_input': preprocessed_texts[index],
            'probabilities': get_probability_distribution(y_prob[index], y_tokenizer.classes_),
            'top_n_grams': get_top_n_grams(tokens=preprocessed_texts[index].split(' '),
                                           conv_outputs=conv_outputs,
                                           filter_sizes=args.filter_sizes)})

    return results
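An illustrative invocation of the Keras-side predict. The experiment id is hypothetical, and iterating top_n_grams as a mapping from filter size to n-grams is an assumption based on the arguments get_top_n_grams receives above.

result = predict(experiment_id='TextCNN_2020-05-01-12:00:00',  # made-up id
                 text='The defending champion withdrew before the final.')[0]
print(json.dumps(result['probabilities'], indent=2))
for filter_size, n_grams in result['top_n_grams'].items():  # assumed dict layout
    print(filter_size, n_grams)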