def main(kmax=1, ntree=100, nruns=1, nfolds=5, tag=TAG):
    """Train an XGBoost classifier on value-count features and write a submission.

    Out-of-fold CV predictions are saved under *tag* (for later stacking),
    then the model is refit on the full training set to produce the final
    test predictions.
    """
    # Load train/test as DataFrames so the count features can be joined.
    xtr, ytr, xte = util.load_data(as_pandas=True)

    # Value counts are computed over train+test combined, then applied to each.
    combined = pd.concat([xtr, xte])
    xtr = count_features.range_counts(combined, xtr, kmax)
    xte = count_features.range_counts(combined, xte, kmax)
    xtr, xte = xtr.values, xte.values

    model = XGBClassifier(n_estimators=ntree, learning_rate=0.02, gamma=1,
                          max_depth=20, min_child_weight=0.1, subsample=0.9,
                          colsample_bytree=0.5, seed=1)

    # Out-of-fold predictions, persisted for the stacking stage.
    cv_preds = models.cv_loop(xtr, ytr, model, nfolds, nruns, SEED)
    util.save_cv_preds(cv_preds, tag)

    # Refit on everything and write the test-set submission.
    final_preds = models.rerun(xtr, ytr, xte, model, nruns, SEED)
    util.write_submission(final_preds, tag)
def predict(validation, prediction):
    """Fit a linear model on *validation* and write predictions for *prediction*.

    Args:
        validation: DataFrame containing a 'SalePrice' label column plus features.
        prediction: DataFrame of feature rows to predict.

    Writes:
        "result.csv" via util.write_submission.

    Fix: the original deleted the 'SalePrice' column from the caller's
    DataFrame in place (`del validation['SalePrice']`); features are now
    taken from a dropped copy so the caller's frame is left untouched.
    """
    lr = LinearRegression()
    v_label = validation['SalePrice']
    # drop() returns a new frame — no mutation of the argument.
    v_features = validation.drop('SalePrice', axis=1)
    lr.fit(v_features, v_label)
    p = lr.predict(prediction)
    util.write_submission("result.csv", p)
def main():
    """Benchmark submission: each user's events in a random order."""
    _, test = u.get_train_test_df()
    events_by_user = u.get_user_events_dict(test)
    # Shuffle in sorted-user order so the RNG sequence is reproducible.
    for user in sorted(events_by_user):
        random.shuffle(events_by_user[user])
    u.write_submission("random_benchmark.csv", events_by_user)
def run_full():
    """Train BigModel on all training tags and write exp-transformed test predictions."""
    train_X, train_Y, _ = get_db_data(tags=['train1', 'train2', 'train3'])
    test_X, _, IDs = get_db_data(tags=['test'])
    model = BigModel(columns, n_est=300)
    raw = model.train_test(train_X, train_Y, test_X)
    # Targets were modelled in log space; undo the transform for submission.
    write_submission("bigmodelv6.csv", np.exp(raw), IDs)
def main():
    """Benchmark submission: rank each user's events by 'yes'-response count."""
    _, test = u.get_train_test_df()
    events_by_user = u.get_user_events_dict(test)
    attendees = u.get_event_attendees()
    yes_responses = u.get_event_responses_dict(attendees["event"], attendees["yes"])

    def popularity(event):
        # An event's popularity is the number of attendees who said yes.
        return len(yes_responses[event])

    for user in events_by_user:
        events_by_user[user] = sorted(events_by_user[user],
                                      key=popularity, reverse=True)
    u.write_submission("event_popularity_benchmark.csv", events_by_user)
def predict(test_fea, rfs, errors):
    """Write an error-weighted blend of the per-year forest predictions.

    NOTE(review): two things here look suspect and should be confirmed
    against the caller before touching this code:
      * `errors` is sliced from the FRONT (`errors[:size]`) while `rfs` is
        sliced from the BACK (`rfs[-size:]`) — if the two lists are aligned
        by year, the weights end up paired with the wrong models.
      * weights are PROPORTIONAL to error (errors[i]/error_sum), so the
        worst model gets the largest weight — usually one wants the inverse.
    """
    # Keep only the models whose error is under 0.5 ("recent years").
    size = len([e for e in errors if e<0.5])
    print 'only use recent years %d' % (size)
    errors, rfs = errors[:size], rfs[-size:]
    error_sum = np.array(errors).sum()
    p = None
    # Accumulate the weighted predictions; first iteration initialises p.
    for i in range(0,len(errors)):
        print 'weight %f' % (errors[i]/error_sum)
        if i==0:
            p = rfs[i].predict(test_fea)*(errors[i]/error_sum)
        else:
            p = p + rfs[i].predict(test_fea)*(errors[i]/error_sum)
    util.write_submission("result.csv", np.array(p))
def predict(self, test_data):
    """Run the model over *test_data* batches and write predictions to self.csv.

    Args:
        test_data: iterable of input tensors (e.g. a DataLoader yielding
            feature batches only).

    Improvements over the original:
      * inference runs under torch.no_grad(), so no autograd graph is built;
      * batch outputs are collected in a list and concatenated once instead
        of re-concatenating the running tensor each step (avoids O(n^2) copies);
      * an empty loader no longer crashes on `None.detach()` — an empty
        array is written instead.
    """
    self.model.eval()
    outputs = []
    with torch.no_grad():
        for test_x in test_data:
            test_x = test_x.to(self.device)
            outputs.append(self.model(test_x))
    if outputs:
        pred = torch.cat(outputs, 0).detach().cpu().numpy()
    else:
        pred = torch.empty(0).numpy()
    write_submission(pred, self.csv)
def avg_run_all(n_models, base_model, infile_base, passes, bits, submit_id):
    '''
    Runs a batch of linear models over the data, with the input files
    presented to each in a random order. Writes a submission based on
    the models averaged predictions.

    Args:
        n_models - the number of models to produce
        base_model - a model that is cloned to produce the models
        infile_base - bare input data name without path or extension
        passes - number of passes over data in training
        bits - the feature space should be of dimension 2**bits
        submit_id - the result is written as
            submissions/submission_<submit_id>.csv

    Writes:
        A submission at paths.SUBMIT/submisssion_<submit_id>.csv
    '''
    models = []
    orders = []
    # Python 2: range() returns a list, which random.shuffle mutates in place.
    l = range(5)
    for k in range(n_models):
        # Clone the base model by re-instantiating its class with its params.
        model_k = base_model.__class__()
        model_k.set_params(**base_model.get_params())
        models.append(model_k)
        # Each model gets its own random presentation order of files 0..4;
        # l[:] copies the list so later shuffles don't alias earlier orders.
        random.shuffle(l)
        orders.append(l[:])
    model_orders = zip(models, orders)
    for k in range(passes):
        print 'Pass %d' % k
        for (m, order) in model_orders:
            for file_num in order:
                train_set_name = '%s.%d' % (infile_base, file_num)
                print 'loading training file: ' + train_set_name
                x, y = util.load_sparse(train_set_name, n_features=2**bits,
                                        verbose=False)
                # Online update so each file can be streamed independently.
                m.partial_fit(x, y, classes=[0., 1.])
    # File 5 is held out as the test split.
    test_set_name = infile_base + '.5'
    print 'loading test set...'
    x, y = util.load_sparse(test_set_name, n_features=2**bits, verbose=False)
    # One decision-function column per model; the submission is their mean.
    dvs = np.zeros((len(y), n_models))
    for (k, m) in enumerate(models):
        dvs[:, k] = m.decision_function(x)
    dv = dvs.mean(axis=1)
    util.write_submission(dv, submit_id)
def run_all(model, infile_base, passes, bits, submit_id):
    '''
    Takes model and trains it on 0.zip...4.zip, then predicts on 5.zip.
    Writes predictions out as a valid submission identified by submit_id.

    Args:
        model - the model to train and test.
        infile_base - bare input data name without path or extension
        passes - number of passes over the data in training
        bits - the feature space should be of dimension 2**bits
        submit_id - the result is written as
            submissions/submission_<submit_id>.csv

    Writes:
        A submission at paths.SUBMIT/submisssion_<submit_id>.csv
    '''
    train(model, infile_base, passes, bits)
    predictions = test(model, infile_base, bits)
    util.write_submission(predictions, submit_id)
def main(k=2, C=0.7, nruns=1, nfolds=5, tag=TAG):
    """Logistic regression over one-hot encoded 1..k-way interaction features.

    Saves out-of-fold CV predictions under *tag* for stacking, then refits
    on the full training set and writes the test submission.
    """
    xtr, ytr, xte = util.load_data()
    # Expand features to one-hot encodings of all 1..k-way interactions.
    xtr, xte = features.range_combos(xtr, xte, k)
    model = LogisticRegression(C=C)
    # Out-of-fold predictions, kept for the later stacking stage.
    util.save_cv_preds(models.cv_loop(xtr, ytr, model, nfolds, nruns, SEED), tag)
    # Final fit on all training data; predict the full test set.
    util.write_submission(models.rerun(xtr, ytr, xte, model, nruns, SEED), tag)
def main(ntree=100, nfolds=5, nruns=1, tag=TAG):
    """Second-level stacker: extra-trees over first-level model predictions."""
    # Only the labels come from the raw data; the features are the saved
    # CV predictions (train side) and submissions (test side) of base models.
    _, ytr, _ = util.load_data()
    xtr = util.reload_cv_predictions(COLS)
    xte = util.reload_submissions(COLS)
    stacker = ExtraTreesClassifier(n_estimators=ntree, criterion='entropy',
                                   max_depth=9, max_features=6, n_jobs=3,
                                   random_state=1)
    # CV predictions of the stacker itself, saved for further stacking.
    cv_preds = models.cv_loop(xtr, ytr, stacker, nfolds, nruns, SEED)
    util.save_cv_preds(cv_preds, tag)
    # Refit on all training rows and write the final submission.
    final_preds = models.rerun(xtr, ytr, xte, stacker, nruns, SEED)
    util.write_submission(final_preds, tag)
# Random-forest benchmark: date features plus label-encoded categoricals.
train, test = util.get_train_test_df()

# Feature columns: everything except the id, the target, and the raw date.
columns = set(train.columns) - {"SalesID", "SalePrice", "saledate"}

train_fea = get_date_dataframe(train["saledate"])
test_fea = get_date_dataframe(test["saledate"])

for col in columns:
    if str in set(type(x) for x in train[col]):
        # Map each distinct training string to an integer code; values never
        # seen in training fall back to -1 via the defaultdict.
        codes = defaultdict(lambda: -1,
                            ((value, code)
                             for code, value in enumerate(set(x for x in train[col]))))
        train_fea = train_fea.join(
            pd.DataFrame({col: [codes[x] for x in train[col]]}, index=train.index))
        test_fea = test_fea.join(
            pd.DataFrame({col: [codes[x] for x in test[col]]}, index=test.index))
    else:
        # Numeric columns pass through unchanged.
        train_fea = train_fea.join(train[col])
        test_fea = test_fea.join(test[col])

rf = RandomForestRegressor(n_estimators=50, n_jobs=1, compute_importances=True)
rf.fit(train_fea, train["SalePrice"])
predictions = rf.predict(test_fea)

# Report features in descending order of importance.
ranked = sorted(zip(train_fea.columns, rf.feature_importances_),
                key=lambda pair: pair[1], reverse=True)
for fea in ranked:
    print(fea)

util.write_submission("random_forest_benchmark.csv", predictions)
def main():
    """Baseline submission: each user's events in their given order."""
    _, test = u.get_train_test_df()
    events_by_user = u.get_user_events_dict(test)
    u.write_submission("given_order.csv", events_by_user)
print "Learn Gradient Boosting" # Slower gbm = GradientBoostingRegressor(max_depth=10, subsample = .80, min_samples_split = 12, min_samples_leaf = 5, n_estimators = 200) #n_estimators = 100 by default # Cannot parallelize # if testing, this is part of training set. gbm.fit(train_fea, train_Y) print "Fitting" predictions = gbm.predict(test_fea) train_predict = gbm.predict(train_fea) rmse = np.sqrt(mean_squared_error(train_Y, train_predict)) logger.write("GBM Oob RMSE:" + str(rmse)+ "\n") print "Train Set RMSE:", rmse logger.write("Train Set RMSE:" + str(rmse)+ "\n") cPickle.dump(gbm,open( 'gbm_obj.csv','w')) if testing == 0: util.write_submission("submit_gbm_" + comment + ".csv", np.exp(predictions)) if testing == 1: csv_w_both = csv.writer(open('predictions.csv','wb')) for x in xrange(len(predictions)): csv_w_both.writerow([np.exp(predictions[x]), np.exp(test_Y[x])]) imp = sorted(zip(train_fea.columns, gbm.feature_importances_), key=lambda tup: tup[1], reverse=True) csv_w = csv.writer(open('out/rf_features_gbm_' + comment + '.csv','wb')) for fea in imp: csv_w.writerow([fea[0],fea[1]]) print "# of features", len(imp) for fea in imp: if fea[1] > 0.01: print fea[0], "|", fea[1]
} }
# NOTE(review): this chunk begins mid-file — the two braces above close a
# nested dict literal (presumably the max_features table) whose opening lies
# before this excerpt; do not reformat them without the full file in view.
# Train one random forest per (data_type, group, feat_type) combination and
# write a submission for each.
for data_type in data_types:
    for i, group in enumerate(groups):
        tr = util.get_data(fname='tr-' + group + data_type + '.csv')
        ts = util.get_data(fname='ts-' + group + data_type + '.csv')
        for feat_type in feat_types:
            print data_type, group, feat_type
            train, test, y_tr, y_ts = format_data(group, tr, ts, feat_type)
            # max_features is a per-group tuned value indexed by group position.
            rf = RandomForestRegressor(
                n_estimators=800,
                n_jobs=4,
                min_samples_split=25,
                max_features=max_features[data_type][feat_type][i],
                compute_importances=True)
            rf.fit(train, y_tr)
            p = rf.predict(test)
            # Echo feature importances, most important first.
            imp = sorted(zip(train.columns, rf.feature_importances_),
                         key=lambda tup: tup[1], reverse=True)
            for fea in imp:
                print(fea)
            util.write_submission(
                "rf" + data_type + '-' + group + '-' + feat_type + ".csv",
                p.tolist())
# Build date-derived features plus label-encoded categoricals, then fit a
# random-forest benchmark and write a submission.
train_fea = get_date_dataframe(train["saledate"])
test_fea = get_date_dataframe(test["saledate"])

for col in columns:
    if str in set(type(x) for x in train[col]):
        # Encode strings as integer codes learned from the training column;
        # unseen test values map to -1 through the defaultdict.
        encoder = defaultdict(lambda: -1)
        for code, value in enumerate(set(x for x in train[col])):
            encoder[value] = code
        train_fea = train_fea.join(
            pd.DataFrame({col: [encoder[x] for x in train[col]]},
                         index=train.index))
        test_fea = test_fea.join(
            pd.DataFrame({col: [encoder[x] for x in test[col]]},
                         index=test.index))
    else:
        # Numeric columns pass through unchanged.
        train_fea = train_fea.join(train[col])
        test_fea = test_fea.join(test[col])

rf = RandomForestRegressor(n_estimators=50, n_jobs=1, compute_importances=True)
rf.fit(train_fea, train["SalePrice"])
predictions = rf.predict(test_fea)

# Report features in descending order of importance.
ranked = sorted(zip(train_fea.columns, rf.feature_importances_),
                key=lambda pair: pair[1], reverse=True)
for fea in ranked:
    print(fea)

util.write_submission("random_forest_benchmark.csv", predictions)
# NOTE(review): this excerpt starts inside format_data(); the matching `if`
# branch and the function's signature lie before this chunk.
    else:
        # Non-string columns join through unchanged.
        tr_feats = tr_feats.join(tr[col])
        ts_feats = ts_feats.join(ts[col])
    # Returns (train features, test features, train labels, test labels).
    return tr_feats, ts_feats, tr["SalePrice"], ts["SalePrice"]


if __name__ == '__main__':
    data_types = ['-orig6', '-rank6']
    feat_types = ['base', 'many']
    groups = ['TTT', 'WL', 'TEX', 'BL', 'MG', 'SSL']
    # Tuned max_features per data_type and feat_type, indexed by group position.
    max_features = {'-orig6': {'base': [7, 5, 6, 6, 10, 6],
                               'many': [8, 7, 5, 8, 11, 6]},
                    '-rank6': {'base': [8, 6, 5, 5, 12, 6],
                               'many': [8, 5, 5, 5, 10, 6]}}
    # One random forest (and one submission file) per combination.
    for data_type in data_types:
        for i, group in enumerate(groups):
            tr = util.get_data(fname='tr-' + group + data_type + '.csv')
            ts = util.get_data(fname='ts-' + group + data_type + '.csv')
            for feat_type in feat_types:
                print data_type, group, feat_type
                train, test, y_tr, y_ts = format_data(group, tr, ts, feat_type)
                rf = RandomForestRegressor(n_estimators=800, n_jobs=4,
                                           min_samples_split=25,
                                           max_features=max_features[data_type][feat_type][i],
                                           compute_importances=True)
                rf.fit(train, y_tr)
                p = rf.predict(test)
                # Echo feature importances, most important first.
                imp = sorted(zip(train.columns, rf.feature_importances_),
                             key=lambda tup: tup[1], reverse=True)
                for fea in imp:
                    print(fea)
                util.write_submission("rf" + data_type + '-' + group + '-' + feat_type + ".csv",
                                      p.tolist())
import numpy as np
import pandas as pd

import util

# Benchmark: predict the training-set median sale price for every test row.
train, test = util.get_train_test_df()

median_price = np.median(train["SalePrice"])
print("The median price is %0.2f" % median_price)

util.write_submission("median_benchmark.csv", [median_price] * len(test))
import numpy as np

import util

# Benchmark: predict the training-set mean sale price for every test row.
train, test = util.get_train_test_df()

mean_price = np.mean(train["SalePrice"])
print("The mean price is %0.2f" % mean_price)

util.write_submission("mean_benchmark.csv", [mean_price] * len(test))