def test(toxic_data, model=None):
    # Load the test data
    ids, test_data = toxic_data.load_test()
    assert not test_data.output_labels
    logging.info("Test Data: %s samples" % len(test_data))
    # And create the model if one was not passed in
    if model is None:
        model = load_model(args.model)
    # K-fold prediction: average the predictions from each fold's model
    if args.kfold:
        predictions = 0.
        for i in range(args.kfold):
            logging.info("Predicting fold %s" % i)
            # Get the predictions for this fold
            predictions = predictions + test_model(model, TRAIN_ID + "_fold%s" % i,
                                                   test_data, args.test_batch_size)
        predictions = predictions / args.kfold
    else:
        predictions = test_model(model, TRAIN_ID, test_data, args.test_batch_size)

    model_file, submission_file, log_file = create_filenames(TRAIN_ID)
    # Run the postprocessing
    logging.info("Running postprocessing: %s" % args.postprocessing)
    predictions = postprocessing[args.postprocessing](predictions)
    ToxicData.save_submission(submission_file, ids, predictions)
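
# `create_filenames` and the `postprocessing` registry are referenced above but
# not defined in this excerpt. A minimal sketch of what they are assumed to look
# like; the directory layout and the postprocessing options here are assumptions,
# not the repository's actual definitions.
import os
import numpy as np

def create_filenames(train_id):
    model_file = os.path.join("../models", train_id + ".state")
    submission_file = os.path.join("../submissions", train_id + ".csv")
    log_file = os.path.join("../logs", train_id + ".log")
    return model_file, submission_file, log_file

# A name -> callable mapping so --postprocessing can select a transform by name.
postprocessing = {
    "none": lambda preds: preds,
    "clip": lambda preds: np.clip(preds, 1e-6, 1 - 1e-6),
}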
def load_dataset(data_path):
    train_path = os.path.join(data_path, "train.npz")
    test_path = os.path.join(data_path, "test.npz")
    dictionary_path = os.path.join(data_path, "word_index.pkl")
    # Load the data
    toxic = ToxicData(train_path, test_path, dictionary_path)
    train_ids, train_dataset = toxic.load_train(mode="sup")
    return train_dataset
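
# Hypothetical usage, assuming the preprocessed .npz/.pkl artifacts live under
# ../processed_data/ (the directory name is an assumption):
train_dataset = load_dataset("../processed_data/")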
def test(toxic_data):
    # Load the test data
    ids, test_data = toxic_data.load_test()
    assert not test_data.output_labels
    # And create the generators
    testgen = DatasetGenerator(test_data, batch_size=args.test_batch_size, shuffle=False)
    # Initialize the model and restore its trained weights
    model = load_model(args.model)
    model.load_state(MODEL_FILE)
    # Get the predictions
    predictions = model.predict_generator(testgen, testgen.steps_per_epoch, verbose=1)
    ToxicData.save_submission(SUBMISSION_FILE, ids, predictions)
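
# `load_model` is used throughout these scripts but not defined in this excerpt.
# A plausible sketch, assuming models are registered by name and built on demand;
# the registry and decorator are hypothetical, not the repository's actual API.
MODEL_REGISTRY = {}

def register_model(name):
    def decorator(build_fn):
        MODEL_REGISTRY[name] = build_fn
        return build_fn
    return decorator

def load_model(model_name):
    # Look the builder up by the name passed on the command line
    return MODEL_REGISTRY[model_name]()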
best_val = float("inf") best_C = None for C in Cs: logging.info("Fitting {} with C={}".format(LABEL_NAMES[i], C)) model = NbSvmClassifier(C=C, verbose=0).fit(trn_term_doc, train_labels[:, i]) # Evaluate the model val_preds = model.predict_proba(val_term_doc)[:, 1] score = log_loss(val_labels[:, i], val_preds) logging.info("Model had val score of %s" % score) if score < best_val: logging.info("New minimum score improved from {} to {}".format( best_val, score)) best_models[i] = model best_val = score best_C = C scores[i] = best_val logging.info("Best score for {} with C={} is {}".format( LABEL_NAMES[i], best_C, scores[i])) logging.info("Average Val Score is %s" % np.average(scores)) # Now with the best models, we run on the test data and produce our submission test_preds = np.empty((test_ids.shape[0], len(LABEL_NAMES))) logging.info("Creating test predictions...") for i in range(train_labels.shape[1]): test_preds[:, i] = best_models[i].predict_proba(test_term_doc)[:, 1] # Create the submission logging.info("Saving the predictions in a submission file") ToxicData.save_submission(os.path.join("../submissions", TRAIN_ID + ".csv"), test_ids, test_preds)
if __name__ == "__main__":
    # Create the paths for the data
    train_path = os.path.join(args.data, "train.npz")
    test_path = os.path.join(args.data, "test.npz")
    dictionary_path = os.path.join(args.data, "word_index.pkl")
    # Load the data
    toxic = ToxicData(train_path, test_path, dictionary_path)

    if args.train:
        train(toxic)
    if args.test:
        test(toxic)
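
# The module-level `args` used throughout these scripts is assumed to come from
# an argparse parser along these lines. The flag names mirror the attributes
# accessed above (args.data, args.train, args.test, args.model,
# args.test_batch_size); the defaults are assumptions.
import argparse

parser = argparse.ArgumentParser(description="Train/test a toxic-comment model")
parser.add_argument("--data", default="../processed_data/",
                    help="Directory holding train.npz, test.npz, word_index.pkl")
parser.add_argument("--train", action="store_true", help="Run training")
parser.add_argument("--test", action="store_true", help="Run inference")
parser.add_argument("--model", default="gru", help="Name of the model to build")
parser.add_argument("--test_batch_size", type=int, default=256)
args = parser.parse_args()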
    # Read in all the submission values
    submissions = [
        pd.read_csv(sub_fname)[LABEL_NAMES].values
        for sub_fname in submission_fnames
    ]
    # Combine them based on their respective weights
    combined = 0
    for weight, sub in zip(weights, submissions):
        combined = combined + weight * sub
    return ids, combined


if __name__ == "__main__":
    # Load the ensemble config
    logging.info("Opening the ensemble configs")
    ensemble_config_dict = load_ensemble_configs()
    ensemble_config = get_ensemble_config(ensemble_config_dict, args.ensemble_id)
    # Gather the filenames
    submission_fnames = [
        os.path.join("../submissions/", fname + ".csv")
        for fname in ensemble_config["files"]
    ]
    logging.info("Files: {}".format(submission_fnames))
    # Ensemble the submissions
    ids, combined = ensemble_submissions(submission_fnames,
                                         weights=ensemble_config["weights"])
    ToxicData.save_submission(
        os.path.join("../submissions/", "ensemble_" + args.ensemble_id + ".csv"),
        ids, combined)
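
# `load_ensemble_configs` and `get_ensemble_config` are not shown. A minimal
# sketch, assuming the configs live in a JSON file keyed by ensemble id (the
# file name is an assumption). Each entry carries the submission files to blend
# and their weights; for the weighted average above to remain a valid
# probability, the weights should sum to 1.
import json

def load_ensemble_configs(path="../ensemble_configs.json"):
    with open(path) as f:
        return json.load(f)

def get_ensemble_config(config_dict, ensemble_id):
    return config_dict[ensemble_id]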
    def load_submissions(self):
        submission_fnames = [
            os.path.join("../submissions", model_name + ".csv")
            for model_name in self.submodel_names
        ]
        logging.info("Using submissions {submission_fnames}".format(
            submission_fnames=submission_fnames))
        # Get the ids
        ids = pd.read_csv(submission_fnames[0])['id'].values
        # Stack each submission's label columns into (samples, models, labels)
        submissions = np.stack([
            pd.read_csv(sub_fname)[LABEL_NAMES].values
            for sub_fname in submission_fnames
        ], axis=1)
        return ids, submissions

    def test(self, *args, **kwargs):
        ids, submissions = self.load_submissions()
        test_preds = self.predict(submissions, *args, **kwargs)
        return ids, test_preds


if __name__ == "__main__":
    args = parser.parse_args()
    parallel = Parallel(args.kfold, backend="threading", verbose=0)
    trainer = EnsembleTrainer(args.ensemble_id, args.seed, args.kfold)
    trainer.fit(parallel=parallel)
    trainer.save_model()
    trainer.save_validation()
    ids, test_preds = trainer.test()
    ToxicData.save_submission(
        os.path.join("../submissions", args.ensemble_id + ".csv"),
        ids, test_preds)
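
# `EnsembleTrainer.fit` receives a joblib Parallel object, which suggests the
# per-fold fits are dispatched through it. A hypothetical sketch of that
# dispatch pattern; the class and `_fit_fold` helper are invented for
# illustration and are not the repository's actual code.
from joblib import delayed

class _EnsembleTrainerSketch:
    def __init__(self, kfold):
        self.kfold = kfold

    def _fit_fold(self, fold):
        # Fit the ensemble members on one cross-validation fold
        raise NotImplementedError

    def fit(self, parallel):
        # One task per fold; the threading backend keeps the models in-process
        self.fold_models = parallel(
            delayed(self._fit_fold)(fold) for fold in range(self.kfold))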
if __name__ == "__main__":
    # Create the paths for the data
    train_path = os.path.join(args.data, "train.npz")
    test_path = os.path.join(args.data, "test.npz")
    dictionary_path = os.path.join(args.data, "word_index.pkl")
    # Optionally include the augmented training files
    if args.use_augmented:
        augmented_path = os.path.join(args.data, "train_*.npz")
    else:
        augmented_path = ""
    # Load the data
    toxic = ToxicData(train_path, test_path, dictionary_path,
                      augmented_path=augmented_path,
                      original_prob=args.original_prob,
                      fixed_len=args.fixed_len)

    model = None
    if args.train:
        if args.kfold:
            model = kfold(toxic)
        else:
            model = train(toxic)
    if args.test:
        test(toxic, model=model)
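
# The "train_*.npz" pattern is presumably expanded inside ToxicData. A sketch of
# that expansion using the standard library; the "../data" directory is a
# stand-in, and the sorting is an assumption:
import glob
import os

augmented_files = sorted(glob.glob(os.path.join("../data", "train_*.npz")))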
ensemble_config_dict = load_ensemble_configs()
ensemble_config = get_ensemble_config(ensemble_config_dict, args.ensemble_id)
pred_savepath = safe_open_dir("../superlearner_preds/")
# Get the out-of-fold predictions to train the meta-learner on
pred_x, pred_y = create_preds(ensemble_config["files"], ensemble_config["data"],
                              batch_size=args.batch_size, k=args.kfold,
                              seed=SEED, savedir=pred_savepath)
# Train the meta-learner
if args.superlearn:
    if args.use_sklearn:
        logging.info("Training superlearner with scikit-learn")
        # Keep the mus/sigmas returned here; they are passed to
        # ensemble_submissions for normalization below
        weights, mus, sigmas = train_superlearner_sklearn(pred_x, pred_y,
                                                          reg_type=args.penalty,
                                                          C=args.C)
    elif args.use_xgboost:
        logging.info("Training superlearner with xgboost")
        gbm = train_superlearner_xgboost(pred_x, pred_y, ensemble_config['params'])
    else:
        weights = train_superlearner(pred_x, pred_y)
        mus, sigmas = None, None

# Run the ensembling
submission_fnames = [os.path.join("../submissions/", fname + ".csv")
                     for fname in ensemble_config["files"]]
logging.info("Using submission_fnames: " + str(submission_fnames))
if args.use_xgboost:
    test_ids, combined_preds = ensemble_submissions2(submission_fnames, gbm)
else:
    test_ids, combined_preds = ensemble_submissions(submission_fnames, weights,
                                                    mus, sigmas)
logging.info("Combined preds shape: {}".format(combined_preds.shape))
ToxicData.save_submission(
    os.path.join("../submissions/", "superlearner_" + args.ensemble_id + ".csv"),
    test_ids, combined_preds)
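
# `train_superlearner` is not shown. A common choice for super learner weights
# is a convex combination fit by constrained optimization on the out-of-fold
# predictions; a minimal sketch under that assumption, taking pred_x as a
# (samples, models) matrix for a single label and pred_y as binary targets
# (the real shapes may differ):
import numpy as np
from scipy.optimize import minimize
from sklearn.metrics import log_loss

def train_superlearner(pred_x, pred_y):
    n_models = pred_x.shape[1]

    def loss(w):
        # Log loss of the weighted blend, clipped away from 0 and 1
        return log_loss(pred_y, np.clip(pred_x @ w, 1e-7, 1 - 1e-7))

    # Weights constrained to the probability simplex: non-negative, sum to 1
    res = minimize(loss,
                   x0=np.full(n_models, 1.0 / n_models),
                   method="SLSQP",
                   bounds=[(0.0, 1.0)] * n_models,
                   constraints=({"type": "eq", "fun": lambda w: w.sum() - 1.0},))
    return res.x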