def update_best(task, score, ans):
    """Persist *ans* as the best answer for *task* if *score* beats the stored one.

    Best answers live at ``best/<task>@<score>@.txt`` — at most one file per
    task is assumed; only the first glob match is compared and replaced.

    NOTE(review): here ``write_answer`` is called as ``write_answer(ans, path)``,
    while elsewhere in this file it is called as
    ``write_answer(fname, ID, pred, print_prob=...)`` — confirm which signature
    is current.
    """
    existing = glob(f"best/{task}@*@.txt")
    logger.debug(f"Have files for {existing}")
    if not existing:
        write_answer(ans, f"best/{task}@{score}@.txt")
        return
    # Stored score is the middle '@'-delimited field of the filename.
    if int(existing[0].split('@')[1]) < score:
        os.remove(existing[0])
        write_answer(ans, f"best/{task}@{score}@.txt")
def main():
    """Train an xgboost model (cross-validated) and a random forest on the
    training set, then write the random forest's predicted labels and
    positive-class probabilities for the test set.

    Reads module-level globals: ``train_fname``, ``test_fname``,
    ``output_fname``.
    """
    train_x, train_y = read_data(train_fname)
    ID, test_x = read_data(test_fname, train_mode=False)
    # Quick sanity output on the loaded data.
    print(ID)
    print(train_x)
    print(np.sum(train_y))
    # BUG FIX: removed `ind = random.shuffle(range(train_x[0]))` — shuffle()
    # operates in place and returns None, a range cannot be shuffled in
    # Python 3, and `ind` was never used anyway.
    # train_x, train_y, val_x, val_y = split_data(train_x, train_y, val_ratio=0.2)
    tr = xgb.DMatrix(train_x, label=train_y)
    # val = xgb.DMatrix(val_x, label=val_y)
    te = xgb.DMatrix(test_x)
    param = {
        'bst:max_depth': 3,
        'bst:eta': 0.1,
        'silent': 0,
        'objective': 'binary:logistic',
    }
    param['nthread'] = 16
    param['eval_metric'] = 'auc'
    num_round = 100
    xgb.cv(param, tr, num_round, nfold=5, metrics={"error"})
    # NOTE(review): early_stopping_rounds has no effect without an `evals`
    # watchlist — confirm whether a validation set should be passed here.
    model = xgb.train(param, tr, num_round, early_stopping_rounds=10)
    # BUG FIX: original predicted on the undefined name `xgmat` (NameError);
    # the test DMatrix built above is `te`.
    ypred = model.predict(te, ntree_limit=model.best_ntree_limit)

    clf = RandomForestClassifier(n_estimators=500, criterion='entropy',
                                 max_features='sqrt', max_depth=None,
                                 oob_score=False, n_jobs=-1, verbose=1)
    clf.fit(X=train_x, y=train_y)
    # Training-set accuracy (optimistic; printed only as a sanity check).
    print(clf.score(train_x, train_y))
    pred = clf.predict(X=test_x)
    # Probability of the positive class (column 1).
    pred_prob = clf.predict_proba(X=test_x)[:, 1]
    write_answer(output_fname, ID, pred, print_prob=False)
    write_answer("prob_" + output_fname, ID, pred_prob, print_prob=True)
def main():
    """Fit a random forest on the training data and write the predicted
    labels and positive-class probabilities for the test set.

    Reads module-level globals: ``train_fname``, ``test_fname``,
    ``output_fname``.

    NOTE(review): this is the second ``main()`` defined in this file and
    shadows the earlier xgboost+RF version — confirm that is intended.
    """
    train_x, train_y = read_data(train_fname)
    ID, test_x = read_data(test_fname, train_mode=False)

    # Quick sanity output on the loaded data.
    print(ID)
    print(train_x)
    print(np.sum(train_y))

    forest = RandomForestClassifier(
        n_estimators=500,
        criterion='entropy',
        max_features='sqrt',
        max_depth=None,
        oob_score=False,
        n_jobs=-1,
        verbose=1,
    )
    forest.fit(X=train_x, y=train_y)
    # Training-set accuracy (optimistic; printed only as a sanity check).
    print(forest.score(train_x, train_y))

    labels = forest.predict(X=test_x)
    probabilities = forest.predict_proba(X=test_x)[:, 1]
    write_answer(output_fname, ID, labels, print_prob=False)
    write_answer("prob_" + output_fname, ID, probabilities, print_prob=True)