def run_random_model():
    log_start()
    random.seed(seed)  # fix: random.seed is a function, not an attribute to assign
    create_datasets()
    x_train = pd.read_csv(train)
    x_test = pd.read_csv(test)
    y = x_train["species_glc_id"]
    species_map = list(np.unique(y))
    test_glc_ids = x_test["patch_id"]

    print("Run model...")
    species_count = len(species_map)
    # descending fake probabilities: 1.0, (n-1)/n, ..., 1/n
    fake_probabilities = [(species_count - i) / species_count for i in range(species_count)]
    test_predictions = []
    for _ in tqdm(range(len(x_test.index))):
        test_predictions.append(
            _get_random_prediction(species_map, species_count, fake_probabilities))
    print("Finished.")

    print("Create test submission...")
    df = make_submission_df(TOP_N_SUBMISSION_RANKS, species_map, test_predictions, test_glc_ids)
    print("Save test submission...")
    df.to_csv(random_submission, index=False, sep=";", header=None)
    print("Finished.", random_submission)
    log_end("Random Model", "Suffix: {}\n".format(get_suffix_pro()))
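
# NOTE: _get_random_prediction is implemented elsewhere in this module. The
# sketch below is a hypothetical illustration of the contract the caller above
# relies on (one probability per species, in random rank order); it is an
# assumption, not the repo's actual implementation.
def _get_random_prediction_sketch(species_map, species_count, fake_probabilities):
    # Shuffle the descending fake probabilities so every call produces a
    # different random ranking over the same species_map order.
    probs = list(fake_probabilities)
    random.shuffle(probs)
    return probs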
def run_probability_model():
    log_start()
    create_datasets()
    x_test = pd.read_csv(test)
    extract_species_occurences()
    species_occ = load_species_occurences()
    species = list(species_occ['species'])
    percents = list(species_occ['percents'])
    species_count = len(species)
    # create descending fake probabilities
    fake_probabilities = [(species_count - i) / species_count for i in range(species_count)]
    # sort species by occurrence percentage, descending
    _, species_sorted = zip(*reversed(sorted(zip(percents, species))))
    # re-sort by species id ascending, carrying the fake probabilities along
    species_map, probabilities_sorted = zip(*sorted(zip(species_sorted, fake_probabilities)))
    test_glc_ids = x_test["patch_id"]

    # every test row gets the same prior ranking
    test_predictions = []
    for _ in tqdm(range(len(x_test.index))):
        test_predictions.append(probabilities_sorted)
    print("Finished.")

    print("Create test submission...")
    df = make_submission_df(TOP_N_SUBMISSION_RANKS, species_map, test_predictions, test_glc_ids)
    print("Save test submission...")
    df.to_csv(probability_submission, index=False, sep=";", header=None)
    print("Finished.", probability_submission)
    log_end("Probability Model")
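
# A small worked example of the double sort above (hypothetical values):
#   species  = [7, 3, 9], percents = [0.2, 0.5, 0.3]
#   sorted by percents descending      -> species_sorted = (3, 9, 7)
#   descending fake probabilities      -> (1.0, 2/3, 1/3) attach to that order
#   re-sorted by species id ascending  -> species_map = (3, 7, 9),
#                                         probabilities_sorted = (1.0, 1/3, 2/3)
# so the most frequent species always carries the highest fake probability.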
def run_single_model():
    log_start()
    print("Running xgboost single model...")
    create_datasets()
    x_train_full = pd.read_csv(train)
    x_test = pd.read_csv(test)
    y = x_train_full["species_glc_id"]
    species_map = np.unique(y)
    species_count = len(species_map)
    x_train, x_valid, y_train, y_valid = train_test_split(
        x_train_full, y, test_size=train_val_split, random_state=seed)
    test_glc_ids = list(x_test['patch_id'])
    valid_glc_ids = list(x_valid['patch_id'])
    x_test = x_test[train_columns]
    x_train = x_train[train_columns]
    x_valid = x_valid[train_columns]

    # create data matrices for the datasets; fit the encoder on all labels so
    # the class indices line up with species_map even if a rare species is
    # missing from the training split
    le = LabelEncoder().fit(y)
    training_labels = le.transform(y_train)
    validation_labels = le.transform(y_valid)
    d_train = xgb.DMatrix(x_train, label=training_labels)
    d_valid = xgb.DMatrix(x_valid, label=validation_labels)
    watchlist = [
        #(d_train, 'train'),
        (d_valid, 'validation'),
    ]
    evaluator = top_k_error_eval(species_map, y_valid, k=20)
    # bst = xgb.Booster(model_file=path)

    # parameters for xgboost
    params = {
        'objective': 'multi:softprob',
        'max_depth': 2,
        'seed': 4242,
        'silent': 0,
        'eval_metric': 'merror',
        'num_class': len(species_map),
        'num_boost_round': 180,
        'early_stopping_rounds': 10,
        'verbose_eval': 1,
        'updater': 'grow_gpu',
        'predictor': 'gpu_predictor',
        'tree_method': 'gpu_hist',
    }

    print("Training model...")
    bst = xgb.train(
        params,
        d_train,
        num_boost_round=params["num_boost_round"],
        verbose_eval=params["verbose_eval"],
        #feval=evaluator.evaluate,
        evals=watchlist,
        #early_stopping_rounds=params["early_stopping_rounds"]
        #callbacks=[save_after_it]
    )

    print("Save model...")
    bst.save_model(xgb_model)
    bst.dump_model(xgb_model_dump)
    #plt_features(bst, d_train)

    print("Predict test set and create submission...")
    d_test = xgb.DMatrix(x_test)
    # best_ntree_limit is only set when early stopping is active, which is
    # commented out above, so predict over all trees
    test_predictions = bst.predict(d_test)
    df = make_submission_df(TOP_N_SUBMISSION_RANKS, species_map, test_predictions, test_glc_ids)
    df.to_csv(xgb_singlemodel_submission, index=False, sep=";", header=None)
    print("Finished.", xgb_singlemodel_submission)

    print("Predict & evaluate validation set...")
    valid_predictions = bst.predict(d_valid)
    print(evaluator.evaluate(valid_predictions, y_valid))
    subm = _make_submission(species_count, species_map, valid_predictions, valid_glc_ids)
    ranks = get_ranks(subm, y_valid, species_count)
    score = mrr_score(ranks)
    print("MRR-Score:", score * 100, "%")
    log_end_xgb("XGBoost Single Model", train_columns, params, score)
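
# mrr_score and get_ranks are repo helpers. For reference, Mean Reciprocal
# Rank itself is mean(1 / rank): a minimal sketch, assuming `ranks` holds the
# 1-based position of the true species in each row's ranked prediction.
def _mrr_sketch(ranks):
    return sum(1.0 / r for r in ranks) / len(ranks)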
def run_vector_model(use_multithread=True):
    log_start()
    print("Run model for testdata...")
    create_datasets()
    x_train = pd.read_csv(train)
    x_test = pd.read_csv(test)
    y = x_train["species_glc_id"]
    species = sorted(np.unique(y))
    species_count = len(species)
    print("Count of species", species_count)
    test_glc_ids = x_test["patch_id"]
    x_train = x_train[train_columns]
    x_test = x_test[train_columns]
    # DataFrame.as_matrix() was removed in pandas 1.0; use to_numpy() instead
    x_train_matrix = x_train.to_numpy()
    to_predict_matrix = x_test.to_numpy()
    fake_probabilities = [(species_count - i) / species_count for i in range(species_count)]
    count_of_rows = len(to_predict_matrix)

    if use_multithread:
        num_cores = mp.cpu_count()
        print("Cpu count:", str(num_cores))
        predictions = []
        pool = mp.Pool(processes=num_cores)
        for row in range(count_of_rows):
            pool.apply_async(predict_row, args=(
                row,
                to_predict_matrix,
                x_train_matrix,
                fake_probabilities,
                y,
            ), callback=predictions.append)
        pool.close()
        pool.join()
    else:
        predictions = []
        for row in tqdm(range(count_of_rows)):
            predictions.append(
                predict_row(row, to_predict_matrix, x_train_matrix, fake_probabilities, y))

    # async callbacks arrive out of order, so sort by row index first
    predictions = sorted(predictions)
    _, props = zip(*predictions)
    result = np.array(props)
    assert len(predictions) == len(x_test.index)
    print("Finished.")

    print("Create test submission...")
    df = make_submission_df(TOP_N_SUBMISSION_RANKS, species, result, test_glc_ids)
    print("Save test submission...")
    df.to_csv(vector_submission, index=False, sep=";", header=None)
    print("Finished.", vector_submission)
    log_end(
        "Vector Model",
        "Suffix: {}\nTraincolumns: {}\n".format(get_suffix_pro(), ", ".join(train_columns)))
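
# predict_row is implemented elsewhere in the repo. From its use above we only
# know its contract: it must return a (row_index, probabilities) tuple so the
# async results can be sorted back into row order. A hypothetical stub with
# that shape (not the actual scoring logic):
def _predict_row_stub(row, to_predict_matrix, x_train_matrix, fake_probabilities, y):
    # ...score to_predict_matrix[row] against the training rows here...
    return row, fake_probabilities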
def run_multi_model_with_groups(use_multithread=True):
    log_start()
    print("Running xgboost multi model with groups...")
    create_datasets()
    extract_groups()
    x_train_full = pd.read_csv(train_with_groups)
    extract_species_occurences()
    species_occ = load_species_occurences()
    n_groups = np.load(named_groups)
    species_occ_dict = {}
    for _, row in species_occ.iterrows():
        species_occ_dict[row["species"]] = row["percents"]
    x_test = pd.read_csv(test)
    y = x_train_full["species_glc_id"]
    class_names = np.unique(y)
    x_train, x_valid, y_train, y_valid = train_test_split(
        x_train_full, y, test_size=train_val_split, random_state=seed)
    test_glc_ids = list(x_test["patch_id"])
    valid_glc_ids = list(x_valid["patch_id"])
    x_train = x_train[train_columns]
    x_valid = x_valid[train_columns]
    x_test = x_test[train_columns]

    # train one model per species, in parallel if requested
    if use_multithread:
        num_cores = mp.cpu_count()
        print("Cpu count:", str(num_cores))
        result = Parallel(n_jobs=num_cores)(
            delayed(predict_species)(class_name, x_train, x_valid, x_test, y_train, y_valid)
            for class_name in tqdm(class_names))
    else:
        result = []
        for class_name in tqdm(class_names):
            result.append(
                predict_species(class_name, x_train, x_valid, x_test, y_train, y_valid))

    species_map = np.array([s for s, _, _ in result])
    species_count = len(species_map)
    # transpose because each species is a column
    valid_predictions = np.array([v for _, v, _ in result]).T
    test_predictions = np.array([t for _, _, t in result]).T
    assert len(valid_predictions) == len(y_valid.index)
    assert len(test_predictions) == len(x_test.index)
    assert len(valid_predictions[0]) == species_count
    assert len(test_predictions[0]) == species_count

    print("Create test submission...")
    df = make_submission_groups_df(TOP_N_SUBMISSION_RANKS, species_map, test_predictions,
                                   test_glc_ids, n_groups, species_occ_dict)
    df.to_csv(xgb_multimodel_groups_submission, index=False, sep=";", header=None)
    print("Finished.", xgb_multimodel_groups_submission)

    print("Evaluate validation set...")
    subm = _make_submission_groups(TOP_N_SUBMISSION_RANKS, species_map, valid_predictions,
                                   valid_glc_ids, n_groups, species_occ_dict)
    ranks = get_ranks(subm, y_valid, TOP_N_SUBMISSION_RANKS)
    score = mrr_score(ranks)
    print("MRR-Score:", score * 100, "%")
    # `params` is not defined in this function; it is assumed to be the
    # module-level xgboost parameter dict shared with the other models
    log_end_xgb("XGBoost Multi Model With Groups", train_columns, params, score)
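
# predict_species is a repo helper that apparently trains one model per
# species (a one-vs-rest setup). From the unpacking above we only know its
# contract: it returns (species_id, valid_probs, test_probs), where the
# probability vectors hold one score per row of x_valid / x_test for that
# single species. A hypothetical stub with that shape (not the actual
# training code):
def _predict_species_stub(class_name, x_train, x_valid, x_test, y_train, y_valid):
    valid_probs = np.zeros(len(x_valid))  # placeholder per-row scores
    test_probs = np.zeros(len(x_test))
    return class_name, valid_probs, test_probs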