def predict(experiment_id, text):
    """Predict the class for a text using a trained model from an experiment.

    Args:
        experiment_id: Name of the experiment directory (under
            ``config.EXPERIMENTS_DIR``) holding the saved config,
            tokenizers, and model weights.
        text: Raw input text to classify.

    Returns:
        A list with one result dict per input text, each containing the raw
        input, the preprocessed input, the per-class probability
        distribution, and the top n-grams per filter size.
    """
    # Restore the training-time hyperparameters for this experiment.
    experiment_dir = os.path.join(config.EXPERIMENTS_DIR, experiment_id)
    experiment_config = utilities.load_json(
        os.path.join(experiment_dir, 'config.json'))
    args = Namespace(**experiment_config)

    # Preprocess with the exact tokenizers/filters saved at training time.
    texts = [text]
    X_tokenizer = data.Tokenizer.load(
        fp=os.path.join(experiment_dir, 'X_tokenizer.json'))
    y_tokenizer = data.LabelEncoder.load(
        fp=os.path.join(experiment_dir, 'y_tokenizer.json'))
    preprocessed_texts = data.preprocess_texts(
        texts, lower=args.lower, filters=args.filters)

    # Create dataset; true labels are unknown at inference, so fill with 0s.
    X_infer = np.array(X_tokenizer.texts_to_sequences(preprocessed_texts))
    y_filler = np.array([0]*len(X_infer))
    infer_set = data.TextDataset(
        X=X_infer, y=y_filler, batch_size=args.batch_size,
        max_filter_size=max(args.filter_sizes))

    # Resolve the device BEFORE loading weights so the checkpoint can be
    # remapped: without map_location, a checkpoint saved on GPU fails to
    # deserialize on a CPU-only host.
    device = torch.device('cuda' if (
        torch.cuda.is_available() and args.cuda) else 'cpu')

    # Load model: rebuild the architecture from the saved hyperparameters,
    # then restore the trained weights onto the chosen device.
    model = models.TextCNN(
        embedding_dim=args.embedding_dim, vocab_size=len(X_tokenizer),
        num_filters=args.num_filters, filter_sizes=args.filter_sizes,
        hidden_dim=args.hidden_dim, dropout_p=args.dropout_p,
        num_classes=len(y_tokenizer.classes))
    model.load_state_dict(torch.load(
        os.path.join(experiment_dir, 'model.h5'), map_location=device))
    model = model.to(device)

    # Predict and assemble one result dict per input text.
    results = []
    y_prob, conv_outputs = predict_step(
        model=model, dataset=infer_set, filter_sizes=args.filter_sizes,
        device=device)
    for index in range(len(X_infer)):
        results.append({
            'raw_input': texts[index],
            'preprocessed_input': X_tokenizer.sequences_to_texts(
                [X_infer[index]])[0],
            'probabilities': get_probability_distribution(
                y_prob[index], y_tokenizer.classes),
            'top_n_grams': get_top_n_grams(
                tokens=preprocessed_texts[index].split(' '),
                conv_outputs={k: v[index] for k, v in conv_outputs.items()},
                filter_sizes=args.filter_sizes)})
    return results
def predict(inputs, args, model, X_tokenizer, y_tokenizer):
    """Predict the class for a text using a trained model from an experiment.

    Args:
        inputs: List of dicts, each with a 'text' entry to classify.
        args: Namespace of experiment hyperparameters (lower, filters,
            filter_sizes, batch_size).
        model: Trained model ready for inference.
        X_tokenizer: Fitted input tokenizer.
        y_tokenizer: Fitted label encoder (provides ``classes``).

    Returns:
        A list of result dicts, one per input, with the raw input, the
        preprocessed input, the class probability distribution, and the
        top n-grams per filter size.
    """
    # Clean the raw texts the same way the training pipeline did.
    raw_texts = [sample['text'] for sample in inputs]
    clean_texts = data.preprocess_texts(
        raw_texts, lower=args.lower, filters=args.filters)

    # Encode to integer sequences; labels are dummies at inference time.
    sequences = np.array(X_tokenizer.texts_to_sequences(clean_texts))
    dummy_labels = np.array([0] * len(sequences))
    dataset = data.TextDataset(
        X=sequences, y=dummy_labels,
        max_filter_size=max(args.filter_sizes))
    loader = dataset.create_dataloader(batch_size=args.batch_size)

    # Forward pass over all batches.
    y_prob, conv_outputs = predict_step(
        model=model, dataloader=loader,
        filter_sizes=args.filter_sizes, device='cpu')

    # Assemble one result dict per input sample.
    results = []
    for i, (raw, clean) in enumerate(zip(raw_texts, clean_texts)):
        sample_convs = {name: outs[i] for name, outs in conv_outputs.items()}
        results.append({
            'raw_input': raw,
            'preprocessed_input': X_tokenizer.sequences_to_texts(
                [sequences[i]])[0],
            'probabilities': get_probability_distribution(
                y_prob[i], y_tokenizer.classes),
            'top_n_grams': get_top_n_grams(
                tokens=clean.split(' '),
                conv_outputs=sample_convs,
                filter_sizes=args.filter_sizes),
        })
    return results
config.logger.info(f"→ Using {args.experiment_id}") # Preprocess texts = [args.text] X_tokenizer = data.Tokenizer.load( fp=os.path.join(experiment_dir, 'X_tokenizer.json')) y_tokenizer = data.LabelEncoder.load( fp=os.path.join(experiment_dir, 'y_tokenizer.json')) preprocessed_texts = data.preprocess_texts( texts, lower=args.lower, filters=args.filters) # Create dataset X_infer = np.array(X_tokenizer.texts_to_sequences(preprocessed_texts)) y_filler = np.array([0]*len(X_infer)) infer_set = data.TextDataset( X=X_infer, y=y_filler, batch_size=args.batch_size, max_filter_size=max(args.filter_sizes)) # Load model model = models.TextCNN( embedding_dim=args.embedding_dim, vocab_size=len(X_tokenizer), num_filters=args.num_filters, filter_sizes=args.filter_sizes, hidden_dim=args.hidden_dim, dropout_p=args.dropout_p, num_classes=len(y_tokenizer.classes)) model.load_state_dict(torch.load(os.path.join(experiment_dir, 'model.h5'))) device = torch.device('cuda' if ( torch.cuda.is_available() and args.cuda) else 'cpu') model = model.to(device) # Predict results = []
# Fragment of a training routine: the X_*/y_* splits, `y_tokenizer`, `class_`,
# `args`, and `config.logger` are all defined before this view begins.
# Encode string labels to integer indices for all three splits.
y_train = y_tokenizer.transform(y_train)
y_val = y_tokenizer.transform(y_val)
y_test = y_tokenizer.transform(y_test)
config.logger.info("→ Labels to indices:\n"
                   f" {class_} → {y_train[0]}")

# Class weights: inverse-frequency weighting per class index.
# NOTE(review): 1.0 / count raises ZeroDivisionError if any class index is
# absent from y_train — confirm every class appears in the training split.
counts = np.bincount(y_train)
class_weights = {i: 1.0 / count for i, count in enumerate(counts)}
config.logger.info("→ class counts:\n"
                   f" {counts}\n"
                   "→ class weights:\n"
                   f" {class_weights}")

# Create datasets; max_filter_size presumably enforces a minimum sequence
# length for the conv filters — confirm in data.TextDataset.
train_set = data.TextDataset(X=X_train, y=y_train,
                             batch_size=args.batch_size,
                             max_filter_size=max(args.filter_sizes))
val_set = data.TextDataset(X=X_val, y=y_val,
                           batch_size=args.batch_size,
                           max_filter_size=max(args.filter_sizes))
test_set = data.TextDataset(X=X_test, y=y_test,
                            batch_size=args.batch_size,
                            max_filter_size=max(args.filter_sizes))
# Pull one batch so it can be shown in the diagnostics log.
batch_X, batch_y = next(iter(train_set.generate_batches()))
# NOTE(review): this info() call is truncated at the end of the visible
# chunk; its remaining arguments and closing paren continue beyond this view.
config.logger.info("→ Data splits:\n"
                   f" Train set:{train_set.__str__()}\n"
                   f" Val set: {val_set.__str__()}\n"
                   f" Test set: {test_set.__str__()}\n"
                   "→ Sample point:\n"
# Fragment of a training routine (DataLoader-based variant): the splits,
# `y_tokenizer`, `class_`, `args`, and `config.logger` come from outside this
# view; y_train/y_val are presumably transformed just above it.
y_test = y_tokenizer.transform(y_test)
config.logger.info(
    "Labels to indices:\n"
    f" {class_} → {y_train[0]}")

# Class weights: inverse-frequency weighting per class index.
# NOTE(review): 1.0/count raises ZeroDivisionError if any class index is
# absent from y_train — confirm every class appears in the training split.
counts = np.bincount(y_train)
class_weights = {i: 1.0/count for i, count in enumerate(counts)}
config.logger.info(
    "class counts:\n"
    f" {counts}\n"
    "class weights:\n"
    f" {class_weights}")

# Create datasets; batching is deferred to create_dataloader() below rather
# than handled by the dataset itself.
train_dataset = data.TextDataset(
    X=X_train, y=y_train, max_filter_size=max(args.filter_sizes))
val_dataset = data.TextDataset(
    X=X_val, y=y_val, max_filter_size=max(args.filter_sizes))
test_dataset = data.TextDataset(
    X=X_test, y=y_test, max_filter_size=max(args.filter_sizes))
config.logger.info(
    "Data splits:\n"
    f" Train dataset:{train_dataset.__str__()}\n"
    f" Val dataset: {val_dataset.__str__()}\n"
    f" Test dataset: {test_dataset.__str__()}\n"
    "Sample point:\n"
    f" {train_dataset[0]}")

# Create dataloaders (val/test loaders presumably follow beyond this view).
train_dataloader = train_dataset.create_dataloader(
    batch_size=args.batch_size)