#["Title", "FullDescription", "LocationRaw"],
#extra_features)
#validation_features = dio.join_features("%s_" + type_v + "_count_vector_matrix_max_f_200",
#["Title", "FullDescription", "LocationRaw"],
#extra_valid_features).astype(np.int64)

# Join the matrices selected by the name template and `tfidf_columns` with
# the extra features: once for `type_n`, once for `type_v`.
features = dio.join_features(
    "%s_" + type_n + "_" + short_id + "_matrix",
    tfidf_columns,
    extra_features)
validation_features = dio.join_features(
    "%s_" + type_v + "_" + short_id + "_matrix",
    tfidf_columns,
    extra_valid_features)

# NOTE(review): the salaries are requested with log=True and then cast to
# int64; if the result is floating point the cast drops the fractional
# part -- confirm this is intended.
salaries = dio.get_salaries(type_n, log=True).astype(np.int64)
if not submission:
    # NOTE(review): the extent of this `if` body was reconstructed from a
    # flattened line -- confirm that only this assignment is guarded.
    valid_salaries = dio.get_salaries(type_v, log=True)

# Stored predictions of an earlier model for the "valid" split.
best_predictions = dio.get_prediction(
    model_name="ExtraTree_min_sample2_20trees_200f_noNorm_categoryTimeType_log",
    type_n="valid")
par = " classed from 0-11500 then 4 classes to 100 000 and to end NoNormal classTypeTime"


def encode_salaries(salaries, bins):
    """Map salaries to integer class indices over equal-width bins.

    The interval 11500..100000 is cut into `bins` equal-width bins and the
    bin widths are printed.  Because the lookup uses side="right", the
    result is 0 for values below 11500, 1..bins for values inside the
    bins, and bins + 1 for values of 100000 and above.

    The edges lie between 11500 and 100000, so the input is expected on
    that scale (presumably raw, non-log salaries -- confirm with callers).

    Args:
        salaries: array-like of salary values to classify.
        bins: number of equal-width bins between 11500 and 100000.

    Returns:
        The insertion indices from np.searchsorted, one per input value.
    """
    bin_edges = np.linspace(11500.0, 100000, bins + 1, endpoint=True)
    #hist, bin_edges = np.histogram(salaries, bins)
    # Single-argument print(...) gives the same output on Python 2 and 3.
    print(np.diff(bin_edges))
    idxs = np.searchsorted(bin_edges, salaries, side="right")
    return idxs

#salaries_enc = encode_salaries(salaries, 4)
#valid_salaries_enc = encode_salaries(valid_salaries, 4)
# `n_trees` and `model_names` are bound before this fragment.
name = "ExtraTree_min_sample2_%dtrees_200f_noNorm_categoryTimeType_new" % (n_trees)
model_names.append(name)
name = "ExtraTree_min_sample2_%dtrees_200f_noNorm_categoryTimeType_tfidfl2_new_log" % (n_trees)
model_names.append(name)
model_names.append("vowpall")
model_names.append("vowpall_loc5")

#fit_predict(model2)
#fit_predict(model1)
#fit_predict(model3)
#fit_predict(model6, "", "", "")

# Load each model's stored predictions for `type_v`, bring them onto a
# common log scale, and report each model's MAE against `valid_salaries`.
all_model_predictions = []
for model_name in model_names:
    model_predictions = dio.get_prediction(model_name=model_name, type_n=type_v)
    #print model_predictions[0]
    # Only models whose name neither ends in "log" nor starts with
    # "vowpall" are log-transformed here (presumably the others are already
    # stored on the log scale -- confirm).
    if not model_name.endswith("log") and not model_name.startswith("vowpall"):
        model_predictions = np.log(model_predictions)
    #if model_name.startswith("vowpall"):
        #model_predictions = np.log(model_predictions)
    #print model_predictions[0]
    # Single-argument print(...) gives the same output on Python 2 and 3.
    print("%s\nMAE: %f\n" % (
        model_name,
        mean_absolute_error(valid_salaries, np.exp(model_predictions))))
    all_model_predictions.append(model_predictions)

# Stack the per-model vectors and transpose (one column per model, assuming
# each prediction vector is 1-D), then undo the log transform.
predictions = np.vstack(all_model_predictions).T
predictions = np.exp(predictions)
#predictions = np.random.randint(0,5, size=(10,3))
print(predictions.shape)
print(predictions[1:10, :])
indexes = range(0, len(model_names))
#salaries_enc = encode_salaries(salaries, 4)
#valid_salaries_enc = encode_salaries(valid_salaries, 4)
# Single-argument print(...) gives the same output on Python 2 and 3.
print(salaries.shape)
metric = dio.error_metric
for bins in [4]:  #range(10,15):
    n_trees = 10
    #salaries_enc = encode_salaries(salaries, bins)
    #valid_salaries_enc = encode_salaries(valid_salaries, bins)
    # The class labels come from stored predictions of a fixed "..._bin4"
    # model; the model name does not vary with `bins`.
    salaries_enc = dio.get_prediction(
        model_name="randomForest_tfidf_titleFullLoc_bin4",
        type_n="train_classes")
    valid_salaries_enc = dio.get_prediction(
        model_name="randomForest_tfidf_titleFullLoc_bin4",
        type_n="valid_classes")
    par = " classed from 0-11500 then %d classes to 100 000 and to end NoNormal classTypeTime salaries and valid predicted with randomForest_tfidf_titleFullLoc_bin4" % (bins,)
    # `min_samples_split` is bound before this fragment.
    name = "ExtraTree_min_sample%d_%dtrees_200f_noNorm_categoryTimeType_count_rf10_%dsplit_new_log" % (
        min_samples_split, n_trees, bins)
    print(name)
    num_classes = salaries_enc.max()
    # "%s %s" reproduces the space Python 2's `print a, b` puts between items.
    print("%s %s" % ("classes:", num_classes))

    # NOTE(review): nesting of this def inside the loop was reconstructed
    # from a flattened line, and the function may continue beyond this
    # fragment -- confirm against the full file.
    def predict(class_id):
        print("%s %s" % ("predicting: ", class_id))
        # Positions whose class label equals `class_id`, in each label array.
        salaries_idx = np.where(salaries_enc == class_id)
        valid_idx = np.where(valid_salaries_enc == class_id)
        # Nothing to do when either label array has no rows for this class.
        if len(salaries_idx[0]) == 0 or len(valid_idx[0]) == 0:
            return [], None
#fit_predict(model5)
#fit_predict(model4, features, salaries, validation_features, type_n="test_subm")

# Import two externally produced prediction files and store them through
# `dio` under the names "vowpall_submission" and "vowpall_loc5".
model_name = "predictions_submit_test.txt"
predictions = np.loadtxt(
    path_join(dio.data_dir, "code", "from_fastml", "optional", model_name))
dio.save_prediction("vowpall_submission", predictions, type_v)
model_name = "predictions_submit_test_loc5.txt"
predictions = np.loadtxt(
    path_join(dio.data_dir, "code", "from_fastml", "optional", model_name))
dio.save_prediction("vowpall_loc5", predictions, type_v)

# Load every model's stored "test_full" predictions onto a common log scale.
all_model_predictions = []
for model_name in model_names:
    #fit_predict(model_name, features, salaries, validation_features, type_n="test_subm")
    model_predictions = dio.get_prediction(model_name=model_name, type_n="test_full")
    # Only models whose name neither ends in "log" nor starts with
    # "vowpall" are log-transformed here (presumably the others are already
    # stored on the log scale -- confirm).
    if not model_name.endswith("log") and not model_name.startswith("vowpall"):
        model_predictions = np.log(model_predictions)
    # "%s %s" reproduces the space Python 2's `print a, b` puts between items.
    print("%s %s" % ("modelp", model_predictions.shape))
    #print "%s\nMAE: %f\n" % (model_name, log_mean_absolute_error(np.log(valid_salaries), model_predictions))
    all_model_predictions.append(model_predictions)

# Stack the per-model vectors and transpose (one column per model, assuming
# each prediction vector is 1-D), then undo the log transform.
predictions = np.vstack(all_model_predictions).T
predictions = np.exp(predictions)
#predictions = np.random.randint(0,5, size=(10,3))
print(predictions.shape)
print(predictions[1:10, :])

# Blend: arithmetic mean across the model columns.
result = predictions.mean(axis=1)
# The joined name is immediately replaced by the hand-written one below.
model_name = "-".join(model_names)
model_name = "vowpal_loc5-extra30_40log-extra40-extra40tfidf2log-mean-test"
#["Title", "FullDescription", "LocationRaw"],
#extra_features)
#validation_features = dio.join_features("%s_" + type_v + "_count_vector_matrix_max_f_200",
#["Title", "FullDescription", "LocationRaw"],
#extra_valid_features).astype(np.int64)

# Join the matrices selected by the name template and `tfidf_columns` with
# the extra features: once for `type_n`, once for `type_v`.
features = dio.join_features(
    "%s_" + type_n + "_" + short_id + "_matrix",
    tfidf_columns,
    extra_features)
validation_features = dio.join_features(
    "%s_" + type_v + "_" + short_id + "_matrix",
    tfidf_columns,
    extra_valid_features)

# NOTE(review): the salaries are requested with log=True and then cast to
# int64; if the result is floating point the cast drops the fractional
# part -- confirm this is intended.
salaries = dio.get_salaries(type_n, log=True).astype(np.int64)
if not submission:
    # NOTE(review): the extent of this `if` body was reconstructed from a
    # flattened line -- confirm that only this assignment is guarded.
    valid_salaries = dio.get_salaries(type_v, log=True)

# Stored predictions of an earlier model for the "valid" split.
best_predictions = dio.get_prediction(
    model_name="ExtraTree_min_sample2_20trees_200f_noNorm_categoryTimeType_log",
    type_n="valid")
par = " classed from 0-11500 then 4 classes to 100 000 and to end NoNormal classTypeTime"


def encode_salaries(salaries, bins):
    """Map salaries to integer class indices over equal-width bins.

    The interval 11500..100000 is cut into `bins` equal-width bins and the
    bin widths are printed.  Because the lookup uses side="right", the
    result is 0 for values below 11500, 1..bins for values inside the
    bins, and bins + 1 for values of 100000 and above.

    The edges lie between 11500 and 100000, so the input is expected on
    that scale (presumably raw, non-log salaries -- confirm with callers).

    Args:
        salaries: array-like of salary values to classify.
        bins: number of equal-width bins between 11500 and 100000.

    Returns:
        The insertion indices from np.searchsorted, one per input value.
    """
    bin_edges = np.linspace(11500.0, 100000, bins + 1, endpoint=True)
    #hist, bin_edges = np.histogram(salaries, bins)
    # Single-argument print(...) gives the same output on Python 2 and 3.
    print(np.diff(bin_edges))
    idxs = np.searchsorted(bin_edges, salaries, side="right")
    return idxs

#salaries_enc = encode_salaries(salaries, 4)
#valid_salaries_enc = encode_salaries(valid_salaries, 4)
n_trees) model_names.append(name) name = "ExtraTree_min_sample2_%dtrees_200f_noNorm_categoryTimeType_tfidfl2_new_log" % ( n_trees) model_names.append(name) model_names.append("vowpall") model_names.append("vowpall_loc5") #fit_predict(model2) #fit_predict(model1) #fit_predict(model3) #fit_predict(model6, "", "", "") all_model_predictions = [] for model_name in model_names: model_predictions = dio.get_prediction(model_name=model_name, type_n=type_v) #print model_predictions[0] if not model_name.endswith("log") and not model_name.startswith("vowpall"): model_predictions = np.log(model_predictions) #if model_name.startswith("vowpall"): #model_predictions = np.log(model_predictions) #print model_predictions[0] print "%s\nMAE: %f\n" % (model_name, mean_absolute_error(valid_salaries, np.exp(model_predictions))) all_model_predictions.append(model_predictions) predictions = np.vstack(all_model_predictions).T predictions = np.exp(predictions) #predictions = np.random.randint(0,5, size=(10,3)) print predictions.shape print predictions[1:10, :]
idxs = np.searchsorted(bin_edges, salaries, side="right") return idxs #salaries_enc = encode_salaries(salaries, 4) #valid_salaries_enc = encode_salaries(valid_salaries, 4) print salaries.shape metric = dio.error_metric for bins in [4]: #range(10,15): n_trees = 10 #salaries_enc = encode_salaries(salaries, bins) #valid_salaries_enc = encode_salaries(valid_salaries, bins) salaries_enc = dio.get_prediction( model_name="randomForest_tfidf_titleFullLoc_bin4", type_n="train_classes") valid_salaries_enc = dio.get_prediction( model_name="randomForest_tfidf_titleFullLoc_bin4", type_n="valid_classes") par = " classed from 0-11500 then %d classes to 100 000 and to end NoNormal classTypeTime salaries and valid predicted with randomForest_tfidf_titleFullLoc_bin4" % ( bins, ) name = "ExtraTree_min_sample%d_%dtrees_200f_noNorm_categoryTimeType_count_rf10_%dsplit_new_log" % ( min_samples_split, n_trees, bins) print name num_classes = salaries_enc.max() print "classes:", num_classes def predict(class_id): print "predicting: ", class_id salaries_idx = np.where(salaries_enc == class_id)