示例#1
0
def test(toxic_data, model=None):
    """Predict on the test split and write the result as a submission.

    The test split is loaded from ``toxic_data``; predictions come from
    ``test_model`` and are averaged over ``args.kfold`` folds when that
    setting is truthy. The postprocessing step selected by
    ``args.postprocessing`` is applied before the predictions are saved
    with ``ToxicData.save_submission``.

    Args:
        toxic_data: Object whose ``load_test()`` returns ``(ids, test_data)``.
        model: Model to predict with. When ``None`` it is created with
            ``load_model(args.model)``.
    """
    ids, test_data = toxic_data.load_test()
    # The loaded split is expected to have falsy ``output_labels``.
    assert not test_data.output_labels
    logging.info("Test Data: %s samples" % len(test_data))

    if model is None:
        model = load_model(args.model)

    if not args.kfold:
        # Single run identified by TRAIN_ID.
        predictions = test_model(model, TRAIN_ID, test_data,
                                 args.test_batch_size)
    else:
        # Sum the per-fold predictions, then divide by the fold count.
        predictions = 0.
        for fold in range(args.kfold):
            logging.info("Predicting fold %s" % fold)
            fold_id = TRAIN_ID + "_fold%s" % fold
            fold_predictions = test_model(model, fold_id, test_data,
                                          args.test_batch_size)
            predictions = predictions + fold_predictions
        predictions = predictions / args.kfold

    # Only the submission path is used out of the three generated names.
    _, submission_file, _ = create_filenames(TRAIN_ID)

    logging.info("Running postprocessing: %s" % args.postprocessing)
    postprocess = postprocessing[args.postprocessing]
    predictions = postprocess(predictions)
    ToxicData.save_submission(submission_file, ids, predictions)
示例#2
0
def load_dataset(data_path):
    """Return the training dataset built from the files under ``data_path``.

    Args:
        data_path: Directory onto which the file names ``train.npz``,
            ``test.npz`` and ``word_index.pkl`` are joined.

    Returns:
        The dataset element of ``ToxicData.load_train(mode="sup")``; the ids
        returned alongside it are discarded.
    """
    train_path, test_path, dictionary_path = (
        os.path.join(data_path, filename)
        for filename in ("train.npz", "test.npz", "word_index.pkl"))
    toxic = ToxicData(train_path, test_path, dictionary_path)
    _, train_dataset = toxic.load_train(mode="sup")
    return train_dataset
def test(toxic_data):
    """Predict on the test split with the saved model and save a submission.

    Args:
        toxic_data: Object whose ``load_test()`` returns ``(ids, test_data)``.
    """
    ids, test_data = toxic_data.load_test()
    assert not test_data.output_labels

    # Batches are not shuffled (presumably so predictions line up with ids).
    generator_options = dict(batch_size=args.test_batch_size, shuffle=False)
    testgen = DatasetGenerator(test_data, **generator_options)

    # Build the model, then restore its state from MODEL_FILE.
    model = load_model(args.model)
    model.load_state(MODEL_FILE)

    num_steps = testgen.steps_per_epoch
    predictions = model.predict_generator(testgen, num_steps, verbose=1)
    ToxicData.save_submission(SUBMISSION_FILE, ids, predictions)
示例#4
0
    # Body of a per-label loop whose header is outside this view; `i` is the
    # label index supplied by that loop. For each candidate C, fit a model
    # and keep the one with the lowest validation log loss.
    best_val = float("inf")
    best_C = None
    for C in Cs:
        logging.info("Fitting {} with C={}".format(LABEL_NAMES[i], C))
        model = NbSvmClassifier(C=C, verbose=0).fit(trn_term_doc,
                                                    train_labels[:, i])
        # Evaluate the model
        # Column 1 of predict_proba is taken as the prediction for label i
        # (presumably the positive-class probability -- confirm).
        val_preds = model.predict_proba(val_term_doc)[:, 1]
        score = log_loss(val_labels[:, i], val_preds)
        logging.info("Model had val score of %s" % score)
        # Strict '<' means that on a tie the earlier C is kept.
        if score < best_val:
            logging.info("New minimum score improved from {} to {}".format(
                best_val, score))
            best_models[i] = model
            best_val = score
            best_C = C
    # Record the best validation score found for this label.
    scores[i] = best_val
    logging.info("Best score for {} with C={} is {}".format(
        LABEL_NAMES[i], best_C, scores[i]))
# Runs once the per-label loop has finished: report the mean of the
# per-label best scores.
logging.info("Average Val Score is %s" % np.average(scores))

# Now with the best models, we run on the test data and produce our submission
# One row per test id, one column per label name.
test_preds = np.empty((test_ids.shape[0], len(LABEL_NAMES)))
logging.info("Creating test predictions...")
# NOTE(review): the column count comes from len(LABEL_NAMES) above but the
# loop bound is train_labels.shape[1]; np.empty leaves unwritten columns
# uninitialised if the two ever differ -- confirm they always match.
for i in range(train_labels.shape[1]):
    test_preds[:, i] = best_models[i].predict_proba(test_term_doc)[:, 1]

# Create the submission
logging.info("Saving the predictions in a submission file")
ToxicData.save_submission(os.path.join("../submissions", TRAIN_ID + ".csv"),
                          test_ids, test_preds)
    # Tail of a function whose `def` line is outside this view; `test_data`
    # and `ids` are bound earlier in that function.
    # And create the generators
    # shuffle=False keeps the batches in dataset order.
    testgen = DatasetGenerator(test_data,
                               batch_size=args.test_batch_size,
                               shuffle=False)

    # Initialize the model
    # The model is built from args.model and its state is then loaded from
    # MODEL_FILE.
    model = load_model(args.model)
    model.load_state(MODEL_FILE)

    # Get the predictions
    # Runs for testgen.steps_per_epoch steps and writes the result to
    # SUBMISSION_FILE together with the ids.
    predictions = model.predict_generator(testgen,
                                          testgen.steps_per_epoch,
                                          verbose=1)
    ToxicData.save_submission(SUBMISSION_FILE, ids, predictions)


if __name__ == "__main__":
    # Resolve the three data files relative to args.data
    train_path, test_path, dictionary_path = (
        os.path.join(args.data, filename)
        for filename in ("train.npz", "test.npz", "word_index.pkl"))

    # Wrap the files in the shared data object
    toxic = ToxicData(train_path, test_path, dictionary_path)

    # Training and testing are independent switches; both may run
    if args.train:
        train(toxic)
    if args.test:
        test(toxic)
    # Tail of a function whose `def` line is outside this view;
    # `submission_fnames`, `weights` and `ids` are bound before this point.
    # Read in all the submission values
    # Each entry holds the LABEL_NAMES columns of one CSV file.
    submissions = [
        pd.read_csv(sub_fname)[LABEL_NAMES].values
        for sub_fname in submission_fnames
    ]
    # Combine them based on their respective weights
    # zip() stops at the shorter input, so a weights/files length mismatch
    # silently drops the extras.
    # NOTE(review): weights are applied as given; no normalisation is visible
    # here -- confirm they are expected to sum to 1.
    combined = 0
    for weight, sub in zip(weights, submissions):
        combined = combined + weight * sub
    return ids, combined


if __name__ == "__main__":
    # Script entry point: build a weighted ensemble of existing submission
    # files and save it as a new submission.
    # Load the ensemble config
    logging.info("Opening the ensemble configs")
    ensemble_config_dict = load_ensemble_configs()
    # Select the configuration named by args.ensemble_id.
    ensemble_config = get_ensemble_config(ensemble_config_dict,
                                          args.ensemble_id)
    # Gather the filenames
    # Each configured name maps to ../submissions/<name>.csv
    submission_fnames = [
        os.path.join("../submissions/", fname + ".csv")
        for fname in ensemble_config["files"]
    ]
    logging.info("Files: {}".format(submission_fnames))
    # Ensemble the submissions
    ids, combined = ensemble_submissions(submission_fnames,
                                         weights=ensemble_config["weights"])
    # Written to ../submissions/ensemble_<ensemble_id>.csv
    ToxicData.save_submission(
        os.path.join("../submissions/",
                     "ensemble_" + args.ensemble_id + ".csv"), ids, combined)
示例#7
0
            for model_name in self.submodel_names
        ]
        logging.info("Using submissions {submission_fnames}".format(
            submission_fnames=submission_fnames))
        # Get the ids
        ids = pd.read_csv(submission_fnames[0])['id'].values
        submissions = np.stack([
            pd.read_csv(sub_fname)[LABEL_NAMES].values
            for sub_fname in submission_fnames
        ],
                               axis=1)
        return ids, submissions

    def test(self, *args, **kwargs):
        """Predict on the loaded submissions.

        Any positional and keyword arguments are forwarded unchanged to
        ``self.predict`` after the loaded submissions.

        Returns:
            Tuple ``(ids, test_preds)``: the ids from
            ``self.load_submissions()`` and the output of ``self.predict``.
        """
        ids, stacked_submissions = self.load_submissions()
        return ids, self.predict(stacked_submissions, *args, **kwargs)


if __name__ == "__main__":
    # Script entry point: fit the ensemble trainer, persist the model and its
    # validation output, then predict and save a submission.
    args = parser.parse_args()
    # Thread-backed Parallel; the first argument is args.kfold
    # (presumably joblib's n_jobs -- confirm).
    parallel = Parallel(args.kfold, backend="threading", verbose=0)
    trainer = EnsembleTrainer(args.ensemble_id, args.seed, args.kfold)
    trainer.fit(parallel=parallel)
    trainer.save_model()
    trainer.save_validation()
    ids, test_preds = trainer.test()
    # Written to ../submissions/<ensemble_id>.csv
    ToxicData.save_submission(
        os.path.join("../submissions", args.ensemble_id + ".csv"), ids,
        test_preds)
示例#8
0
    ToxicData.save_submission(submission_file, ids, predictions)


if __name__ == "__main__":
    # Locations of the data files under args.data
    train_path, test_path, dictionary_path = (
        os.path.join(args.data, filename)
        for filename in ("train.npz", "test.npz", "word_index.pkl"))
    # An empty string is passed when augmentation is switched off
    augmented_path = (os.path.join(args.data, "train_*.npz")
                      if args.use_augmented else "")

    # Build the shared data object
    toxic = ToxicData(train_path,
                      test_path,
                      dictionary_path,
                      augmented_path=augmented_path,
                      original_prob=args.original_prob,
                      fixed_len=args.fixed_len)

    # A model trained in this run (if any) is handed straight to test();
    # otherwise test() receives None
    model = None
    if args.train:
        model = kfold(toxic) if args.kfold else train(toxic)
    if args.test:
        test(toxic, model=model)
示例#9
0
    # Interior of an entry point whose header is outside this view: build
    # base-model predictions, optionally train a meta-learner on them, and
    # write the combined test predictions as a submission.
    ensemble_config_dict = load_ensemble_configs()
    ensemble_config = get_ensemble_config(ensemble_config_dict, args.ensemble_id)
    pred_savepath = safe_open_dir("../superlearner_preds/")
    # Get the predictions
    pred_x, pred_y = create_preds(ensemble_config["files"], ensemble_config["data"], batch_size=args.batch_size,
                                  k=args.kfold, seed=SEED, savedir=pred_savepath)

    # Train the meta-learner
    # Nothing below runs (and no submission is written) unless
    # args.superlearn is truthy.
    if args.superlearn:
        if args.use_sklearn:
            logging.info("Training superlearner with scikit-learn")
            weights, mus, sigmas = train_superlearner_sklearn(pred_x, pred_y, reg_type=args.penalty, C=args.C)
            # NOTE(review): the mus/sigmas returned on the previous line are
            # discarded immediately -- confirm this is intentional and not a
            # leftover from debugging.
            mus, sigmas = None, None
        elif args.use_xgboost:
            logging.info("Training superlearner with xgboost")
            # This branch binds only `gbm`; weights/mus/sigmas stay unset and
            # are not read on the xgboost path below.
            gbm = train_superlearner_xgboost(pred_x, pred_y, ensemble_config['params'])
        else:
            weights = train_superlearner(pred_x, pred_y)
            mus, sigmas = None, None

        # Run the ensembling
        submission_fnames = [os.path.join("../submissions/", fname + ".csv") for fname in ensemble_config["files"]]
        logging.info("Using submission_fnames: " + str(submission_fnames))
        # NOTE(review): use_sklearn is tested before use_xgboost above, so if
        # both flags are set `gbm` is never bound and this branch raises
        # NameError -- confirm the flags are mutually exclusive.
        if args.use_xgboost:
            test_ids, combined_preds = ensemble_submissions2(submission_fnames, gbm)
        else:
            test_ids, combined_preds = ensemble_submissions(submission_fnames, weights, mus, sigmas)
        logging.info("Combined preds shape: {}".format(combined_preds.shape))
        # Written to ../submissions/superlearner_<ensemble_id>.csv
        ToxicData.save_submission(os.path.join("../submissions/", "superlearner_" + args.ensemble_id + ".csv"),
                                  test_ids, combined_preds)