Example #1
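All examples assume the following module-level imports. The config names (train, test, seed, train_columns, the *_submission paths) and project helpers (log_start, log_end, create_datasets, make_submission_df, ...) come from the surrounding repository and are not shown here.

import multiprocessing as mp
import random

import numpy as np
import pandas as pd
import xgboost as xgb
from joblib import Parallel, delayed
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm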
def run_random_model():
    log_start()
    random.seed(seed)  # seed the RNG; assigning to random.seed would replace the function
    create_datasets()
    x_train = pd.read_csv(train)
    x_test = pd.read_csv(test)

    y = x_train["species_glc_id"]

    species_map = list(np.unique(y))
    test_glc_ids = x_test["patch_id"]
    print("Run model...")
    species_count = len(species_map)
    # descending fake probabilities: 1.0 for the first species down to 1/n
    fake_probabilities = [(species_count - i) / species_count
                          for i in range(species_count)]
    test_predictions = []
    for _ in tqdm(range(len(x_test.index))):
        test_predictions.append(
            _get_random_prediction(species_map, species_count,
                                   fake_probabilities))
    print("Finished.")

    print("Create test submission...")
    df = make_submission_df(TOP_N_SUBMISSION_RANKS, species_map,
                            test_predictions, test_glc_ids)

    print("Save test submission...")
    df.to_csv(random_submission, index=False, sep=";", header=None)
    print("Finished.", random_submission)

    log_end("Random Model", "Suffix: {}\n".format(get_suffix_pro()))
Example #2
def run_probability_model():
    log_start()
    create_datasets()
    x_test = pd.read_csv(test)
    extract_species_occurences()
    species_occ = load_species_occurences()
    species = list(species_occ['species'])
    percents = list(species_occ['percents'])
    species_count = len(species)
    # create descending fake probabilities (1.0 down to 1/n)
    fake_probabilities = [(species_count - i) / species_count for i in range(species_count)]
    # sort species by occurrence percentage, descending
    _, species_sorted = zip(*reversed(sorted(zip(percents, species))))
    # re-sort by species id ascending, carrying the fake probabilities along
    species_map, probabilities_sorted = zip(*sorted(zip(species_sorted, fake_probabilities)))
    test_glc_ids = x_test["patch_id"]

    test_predictions = []
    for _ in tqdm(range(len(x_test.index))):
        test_predictions.append(probabilities_sorted)
    print("Finished.")

    print("Create test submission...")
    df = make_submission_df(TOP_N_SUBMISSION_RANKS, species_map, test_predictions, test_glc_ids)

    print("Save test submission...")
    df.to_csv(probability_submission, index=False, sep=";", header=None)
    print("Finished.", probability_submission)

    log_end("Probability Model")
Example #3
def run_single_model():
    log_start()
    print("Running xgboost single model...")
    create_datasets()
    x_train = pd.read_csv(train)
    x_test = pd.read_csv(test)
    y = x_train["species_glc_id"]
    species_map = np.unique(y)
    species_count = len(species_map)

    x_train, x_valid, y_train, y_valid = train_test_split(x_train, y, test_size=train_val_split, random_state=seed)

    test_glc_ids = list(x_test['patch_id'])
    valid_glc_ids = list(x_valid['patch_id'])
    x_test = x_test[train_columns]
    x_train = x_train[train_columns]
    x_valid = x_valid[train_columns]

    # create data matrix for the datasets
    le = LabelEncoder().fit(y_train)
    training_labels = le.transform(y_train)
    validation_labels = le.transform(y_valid)
    d_train = xgb.DMatrix(x_train, label=training_labels)
    d_valid = xgb.DMatrix(x_valid, label=validation_labels)

    watchlist = [
        #(d_train, 'train'), 
        (d_valid, 'validation'),
    ]
            
    evaluator = top_k_error_eval(species_map, y_valid, k=20)
    # bst = xgb.Booster(model_file=path)
    
    # setting the parameters for xgboost
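    # note: num_boost_round and verbose_eval below are not booster parameters;
    # they are read back out of this dict and handed to xgb.train()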
    params = {
        'objective': 'multi:softprob',
        'max_depth': 2,
        'seed': 4242,
        'silent': 0,
        'eval_metric': 'merror',
        'num_class': len(species_map),
        'num_boost_round': 180,
        'early_stopping_rounds': 10,
        'verbose_eval': 1,
        'updater': 'grow_gpu',
        'predictor': 'gpu_predictor',
        'tree_method': 'gpu_hist'
    }

    print("Training model...")
    bst = xgb.train(
        params,
        d_train, 
        num_boost_round=params["num_boost_round"], 
        verbose_eval=params["verbose_eval"],
        #feval=evaluator.evaluate, 
        evals=watchlist, 
        #early_stopping_rounds=params["early_stopping_rounds"]
        #callbacks=[save_after_it]
    )

    print("Save model...")
    bst.save_model(xgb_model)
    bst.dump_model(xgb_model_dump)

    #plt_features(bst, d_train)

    print("Predict test set and create submission...")    
    d_test = xgb.DMatrix(x_test)
    # early stopping is commented out in xgb.train above, so best_ntree_limit
    # is never set; predict with all trees
    test_predictions = bst.predict(d_test)
    df = make_submission_df(TOP_N_SUBMISSION_RANKS, species_map, test_predictions, test_glc_ids)
    df.to_csv(xgb_singlemodel_submission, index=False, sep=";", header=None)
    print("Finished.", xgb_singlemodel_submission)

    print("Predict & evaluate validation set...")    
    valid_predictions = bst.predict(d_valid)
    print(evaluator.evaluate(valid_predictions, y_valid))
    subm = _make_submission(species_count, species_map, valid_predictions, valid_glc_ids)
    ranks = get_ranks(subm, y_valid, species_count)
    score = mrr_score(ranks)
    print("MRR-Score:", score * 100, "%")
    log_end_xgb("XGBoost Single Model", train_columns, params, score)
Example #4
def run_vector_model(use_multithread=True):
    log_start()
    print("Run model for testdata...")
    create_datasets()
    x_train = pd.read_csv(train)
    x_test = pd.read_csv(test)

    y = x_train["species_glc_id"]
    species = sorted(np.unique(y))
    species_count = len(species)
    print("Count of species", species_count)

    test_glc_ids = x_test["patch_id"]
    x_train = x_train[train_columns]
    x_test = x_test[train_columns]

    # .values instead of the removed DataFrame.as_matrix()
    x_train_matrix = x_train.values
    to_predict_matrix = x_test.values
    fake_probabilities = [(species_count - i) / species_count
                          for i in range(species_count)]
    count_of_rows = len(to_predict_matrix)

    if use_multithread:
        num_cores = mp.cpu_count()
        print("Cpu count:", str(num_cores))
        predictions = []
        pool = mp.Pool(processes=num_cores)
        for row in range(count_of_rows):
            pool.apply_async(predict_row,
                             args=(
                                 row,
                                 to_predict_matrix,
                                 x_train_matrix,
                                 fake_probabilities,
                                 y,
                             ),
                             callback=predictions.append)
        pool.close()
        pool.join()
    else:
        predictions = []
        for row in tqdm(range(count_of_rows)):
            predictions.append(
                predict_row(row, to_predict_matrix, x_train_matrix,
                            fake_probabilities, y))

    # sort by row index so the async results line up with the test rows
    predictions = sorted(predictions)
    _, props = zip(*predictions)
    result = np.array(props)
    assert len(predictions) == len(x_test.index)
    print("Finished.")

    print("Create test submission...")
    df = make_submission_df(TOP_N_SUBMISSION_RANKS, species, result,
                            test_glc_ids)

    print("Save test submission...")
    df.to_csv(vector_submission, index=False, sep=";", header=None)
    print("Finished.", vector_submission)

    log_end(
        "Vector Model",
        "Suffix: {}\nTraincolumns: {}\n".format(get_suffix_pro(),
                                                ", ".join(train_columns)))
Example #5
def run_multi_model_with_groups(use_multithread=True):
    log_start()
    print("Running xgboost multi model with groups...")
    create_datasets()
    extract_groups()
    x_train = pd.read_csv(train_with_groups)
    extract_species_occurences()
    species_occ = load_species_occurences()
    n_groups = np.load(named_groups)
    species_occ_dict = {}
    for _, row in species_occ.iterrows():
        species_occ_dict[row["species"]] = row["percents"]

    x_test = pd.read_csv(test)
    y = x_train["species_glc_id"]

    class_names = np.unique(y)

    x_train, x_valid, y_train, y_valid = train_test_split(
        x_train, y, test_size=train_val_split, random_state=seed)
    test_glc_ids = list(x_test["patch_id"])
    valid_glc_ids = list(x_valid["patch_id"])
    x_train = x_train[train_columns]
    x_valid = x_valid[train_columns]
    x_test = x_test[train_columns]

    if use_multithread:
        num_cores = mp.cpu_count()
        print("Cpu count:", str(num_cores))
        result = Parallel(n_jobs=num_cores)(
            delayed(predict_species)(class_name, x_train, x_valid, x_test,
                                     y_train, y_valid)
            for class_name in tqdm(class_names))
    else:
        result = []
        for class_name in tqdm(class_names):
            result.append(
                predict_species(class_name, x_train, x_valid, x_test, y_train,
                                y_valid))

    species = np.array([x for x, _, _ in result])
    # transpose so each species becomes a column (rows are samples)
    predictions = np.array([y for _, y, _ in result]).T
    test_predictions = np.array([z for _, _, z in result]).T

    species_map = species
    species_count = len(species_map)
    valid_predictions = predictions

    assert len(valid_predictions) == len(y_valid.index)
    assert len(test_predictions) == len(x_test.index)
    assert len(valid_predictions[0]) == species_count
    assert len(test_predictions[0]) == species_count

    print("Create test submission...")
    df = make_submission_groups_df(TOP_N_SUBMISSION_RANKS, species_map,
                                   test_predictions, test_glc_ids, n_groups,
                                   species_occ_dict)
    df.to_csv(xgb_multimodel_groups_submission,
              index=False,
              sep=";",
              header=None)
    print("Finished.", xgb_multimodel_groups_submission)

    print("Evaluate validation set...")
    subm = _make_submission_groups(TOP_N_SUBMISSION_RANKS, species_map,
                                   valid_predictions, valid_glc_ids, n_groups,
                                   species_occ_dict)
    ranks = get_ranks(subm, y_valid, TOP_N_SUBMISSION_RANKS)
    score = mrr_score(ranks)
    print("MRR-Score:", score * 100, "%")
    log_end_xgb("XGBoost Multi Model With Groups", train_columns, params,
                score)
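predict_species also comes from elsewhere in the project; the unpacking above implies it returns (species, validation scores, test scores). A hypothetical one-vs-rest sketch consistent with that shape:

def predict_species(class_name, x_train, x_valid, x_test, y_train, y_valid):
    # Hypothetical: train one binary booster per species and return its
    # positive-class probability for every validation and test row.
    # y_valid is accepted for signature parity with the caller.
    d_train = xgb.DMatrix(x_train, label=(y_train == class_name).astype(int))
    booster = xgb.train({'objective': 'binary:logistic', 'max_depth': 3},
                        d_train, num_boost_round=50)
    valid_probs = booster.predict(xgb.DMatrix(x_valid))
    test_probs = booster.predict(xgb.DMatrix(x_test))
    return class_name, valid_probs, test_probs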