# Sentiment-classification experiment on the SST dataset.
# NOTE(review): this chunk arrived with its line structure collapsed onto one
# line; statement grouping (e.g. what sits inside the `if`) was reconstructed
# from the obvious reading — confirm against the original file.
print("Running test for model: {}".format(args.m))

# Our input $x$
TEXT = torchtext.data.Field()
# Our labels $y$
LABEL = torchtext.data.Field(sequential=False, unk_token=None)

# Generate train/val/test splits from the SST dataset, filtering out
# examples labeled 'neutral' so the task is binary.
train, val, test = torchtext.datasets.SST.splits(
    TEXT, LABEL,
    filter_pred=lambda ex: ex.label != 'neutral')

# Vocabularies are built from the training split only.
TEXT.build_vocab(train)
LABEL.build_vocab(train)

# device=-1 keeps batches on CPU; repeat=False stops iteration after one epoch.
train_iter, val_iter, test_iter = torchtext.data.BucketIterator.splits(
    (train, val, test), batch_size=10, device=-1, repeat=False)

if args.m == "NaiveBayes":
    alpha = 1  # Laplace (add-one) smoothing parameter
    model = NaiveBayes(alpha, TEXT, LABEL)
    model.train(train_iter, val_iter)

    # Evaluate on training set
    train_acc = model.evaluate(train_iter)
    print('final train_acc (should be very high): ', train_acc)

    # Evaluate on testing set
    # test_code_NB(model, test_iter)
    test_acc = model.evaluate(test_iter)
    print('final test_acc: ', test_acc)
# Spam-classification experiment: split the data, report class balance,
# then train/evaluate two NaiveBayes models with different vocabularies.
# NOTE(review): this chunk was pasted with its line structure collapsed AND
# its first statement cut mid-call; the standard sklearn train_test_split
# unpacking below is assumed from the X_train/X_test/y_train/y_test names
# used later — TODO confirm against the original file.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['is_spam'], test_size=0.2, random_state=191)

# Class balance for the full data set and both splits.
print('Data set:')
print('{} total'.format(df.shape[0]))
for t, t_name in zip(targets, target_names):
    print('{} {}'.format(len(df[df['is_spam'] == t]), t_name))

print('\nTraining set:')
print('{} total'.format(len(X_train)))
for t, t_name in zip(targets, target_names):
    print('{} {}'.format(sum([y == t for y in y_train]), t_name))

print('\nTest set:')
print('{} total'.format(len(X_test)))
for t, t_name in zip(targets, target_names):
    print('{} {}'.format(sum([y == t for y in y_test]), t_name))
print('')

# Build Classifier
gvoc_model = NaiveBayes('General Vocabulary', X_train, y_train,
                        targets, target_names)
gvoc_model.train()
gvoc_model.evaluate(X_test, y_test, show_top_features=10)

# Same model with the vocabulary capped at 200 features, for comparison.
rvoc_model = NaiveBayes('Reduced Vocabulary', X_train, y_train,
                        targets, target_names, max_features=200)
rvoc_model.train()
rvoc_model.evaluate(X_test, y_test, show_top_features=10)
def evaluate_naivebayes():
    """Build a NaiveBayes model in evaluate mode from command-line arguments,
    run its evaluate() on the second argument, and pass the output to
    process() under the label 'Naive Bayes'.

    NOTE(review): the exact meaning of sys.argv[1] (presumably a model or
    data path) and sys.argv[2] (presumably an evaluation input) is not
    visible here — confirm against NaiveBayes's constructor.
    """
    nb = NaiveBayes(sys.argv[1], evaluate=True)
    out = nb.evaluate(sys.argv[2])
    process(out, 'Naive Bayes')