# Imports used by the snippets below (DataIO is a project-local helper; its
# import path is assumed here, as is path_join being os.path.join).
import numpy as np
from os.path import join as path_join
from pprint import pprint
from time import time
from sklearn.externals import joblib
from sklearn.ensemble import ExtraTreesRegressor, RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from data_io import DataIO


def tfidf_cloud(n_trees):
    dio = DataIO("/data/Settings_cloud.json")
    submission = False
    min_samples_split = 2
    param = """Normal count vector with max 200. New submission which is repeatable and nicer.

    count_vector_titles = TfidfVectorizer(
        read_column(train_filename, column_name),
        max_features=200,
        norm='l1',
        smooth_idf=True,
        sublinear_tf=False,
        use_idf=True)
    """
    if submission:
        type_n = "train_full"
        type_v = "valid_full"
    else:
        type_n = "train"
        type_v = "valid"

    #features = dio.join_features("%s_" + type_n + "_tfidf_matrix_max_f_200",
                                 #["Title", "FullDescription", "LocationRaw"],
                                 #extra_features)
    #validation_features = dio.join_features("%s_" + type_v + "_tfidf_matrix_max_f_200",
                                            #["Title", "FullDescription", "LocationRaw"],
                                            #extra_valid_features)
    #np.save("train_200f_noNorm_categoryTimeType_tfidfl2_features", features)
    #np.save("train_200f_noNorm_categoryTimeType_tfidfl2_valid_features", validation_features)

    def load(filename):
        return joblib.load(path_join("/data", filename))

    features = load("train_200f_noNorm_categoryTimeType_tfidfl1_features_jl")
    validation_features = load(
        "train_200f_noNorm_categoryTimeType_tfidfl1_valid_features_jl")

    print "features", features.shape
    print "valid features", validation_features.shape

    #salaries = dio.get_salaries(type_n, log=True)
    #if not submission:
        #valid_salaries = dio.get_salaries(type_v, log=True)
    #np.save("train_200f_noNorm_categoryTimeType_tfidfl2_salaries", salaries)
    #np.save("train_200f_noNorm_categoryTimeType_tfidfl2_valid_salaries", valid_salaries)
    #joblib.dump(salaries, "train_200f_noNorm_categoryTimeType_tfidfl2_salaries_jl", compress=5)
    #joblib.dump(valid_salaries, "train_200f_noNorm_categoryTimeType_tfidfl2_valid_salaries_jl", compress=5)

    #TODO: valid salaries were dumped incorrectly
    salaries = load("train_200f_noNorm_categoryTimeType_tfidfl2_salaries_jl")
    valid_salaries = load(
        "train_200f_noNorm_categoryTimeType_tfidfl2_valid_salaries_jl")
    dio.is_log = True

    print salaries.shape

    name = "ExtraTree_min_sample%d_%dtrees_200f_noNorm_categoryTimeType_tfidfl1_new_log" % (
        min_samples_split, n_trees)
    print name
    #dio.save_prediction("testni", np.array([1, 2, 3]), type_n="testno")

    classifier = ExtraTreesRegressor(
        n_estimators=n_trees,
        verbose=2,
        n_jobs=4,  # 2 jobs on submission / 4 on valid test
        oob_score=False,
        min_samples_split=min_samples_split,
        random_state=3465343)
    #dio.save_model(classifier, "testni_model", 99.)

    classifier.fit(features, salaries)
    predictions = classifier.predict(validation_features)
    if submission:
        dio.save_prediction(name, predictions, type_n=type_v)
        dio.write_submission(name + ".csv", predictions=predictions)
    else:
        dio.compare_valid_pred(valid_salaries, predictions)
        metric = dio.error_metric
        mae = metric(valid_salaries, predictions)
        print "MAE validation: ", mae
        dio.save_model(classifier, name, mae)
        dio.save_prediction(name, predictions, type_n=type_v)
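
# dio.is_log is set above because the salaries were saved log-transformed, so
# the error metric has to map predictions back to pounds before computing MAE.
# The sketch below is only a guess at what dio.error_metric does in that case;
# the real implementation lives in the project-local DataIO class.
from sklearn.metrics import mean_absolute_error

def log_mae_sketch(y_true_log, y_pred_log):
    """Hypothetical log-space MAE: undo the log, then take the plain MAE."""
    return mean_absolute_error(np.exp(y_true_log), np.exp(y_pred_log))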
#if bin_n > 4:
    #make_grid_search(MultinomialNB(), {"alpha": [0.01, 0.1, 0.5, 1]}, "multinomialnb" + nm, "Multinomial NB" + par)
    #make_grid_search(SGDClassifier(), {'n_iter': [50, 100, 150], 'penalty': ['l2', 'l1']}, "sgd_class" + nm, "SGDClassifier classes" + par)
    ##make_grid_search(KNeighborsClassifier(), {'n_neighbors': range(4, 100, 20)}, "kneighbour" + nm, "Kneighbour classes" + par)
    #make_grid_search(RandomForestClassifier(random_state=42), {'min_samples_split': [2, 30]}, "randomForest" + nm, "Random Forest" + par)
    ##make_grid_search(GradientBoostingClassifier(), {'learning_rate': [0.1, 0.5], 'subsample': [1, 0.8, 0.6], 'n_estimators': [100, 150]}, "GBM" + nm, "Gradient Boosting Machines " + par)

bin_n = 4
salaries_enc = encode_salaries(salaries, bin_n)
valid_salaries_enc = encode_salaries(valid_salaries, bin_n)
nm = "_tfidf_titleFullLoc_bin%d" % bin_n
model_name = "randomForest" + nm
par = " classed from 0-11500 then %d classes to 100 000 and to end\n Tfidf of Title full and location Raw" % bin_n

classifier = RandomForestClassifier(min_samples_split=2, random_state=42)
classifier.fit(features, salaries_enc)

prediction = classifier.predict(validation_features)
dio.save_prediction(model_name, prediction, "valid_classes")
dio.save_model(classifier, model_name, parameters=par)
print(classification_report(valid_salaries_enc, prediction))
print confusion_matrix(valid_salaries_enc, prediction)

prediction = classifier.predict(features)
dio.save_prediction(model_name, prediction, "train_classes")
print(classification_report(salaries_enc, prediction))
print confusion_matrix(salaries_enc, prediction)
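
# encode_salaries() is defined elsewhere in the project. Judging by the `par`
# note ("classed from 0-11500 then %d classes to 100 000 and to end"), it bins
# the raw salaries into class labels, roughly like the sketch below; the exact
# bin edges and edge handling are an assumption, not the original code.
def encode_salaries_sketch(salaries, bin_n):
    """Hypothetical binning: class 0 below 11 500, bin_n equal-width classes
    up to 100 000, and one final class for everything above."""
    edges = np.linspace(11500, 100000, bin_n + 1)
    return np.digitize(salaries, edges)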
ppara = "Curr MAE: %0.2f Best MAE: %0.2f %s\n" % ( mae_pred, mae_best_pred, isbetter) print ppara param += ppara if submission: dio.save_prediction(name, predictions, type_n=type_v) dio.write_submission(name + ".csv", predictions=predictions) else: dio.compare_valid_pred(valid_salaries, predictions) metric = dio.error_metric mae = metric(valid_salaries, predictions) print "MAE validation: ", mae dio.save_model(ExtraTreesRegressor(n_estimators=n_trees, min_samples_split=min_samples_split, random_state=3465343), name, mae, parameters=param) dio.save_prediction(name, predictions, type_n=type_v) #oob_predictions = classifier.oob_prediction_ #mae_oob = mean_absolute_error(salaries, oob_predictions) #print "MAE OOB: ", mae_oob #classifier1 = ExtraTreesRegressor(n_estimators=n_trees, #verbose=1, #n_jobs=3, #oob_score=False, #min_samples_split=min_samples_split, #random_state=3465343) #scores = cross_val_score(classifier1, features, salaries, cv=3, score_func=metric, verbose=1) #print "Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() / 2) #mae_cv = "%0.2f (+/- %0.2f)" % (scores.mean(), scores.std() / 2)
    min_samples_split, n_trees)
classifier = ExtraTreesRegressor(
    n_estimators=n_trees,
    #classifier = RandomForestRegressor(n_estimators=n_trees,
    verbose=2,
    n_jobs=4,  # 2 jobs on submission / 4 on valid test
    oob_score=True,
    min_samples_split=min_samples_split,
    random_state=3465343)
classifier.fit(features, salaries)
#classifier = dio.load_model(name)
predictions = classifier.predict(validation_features)
metric = dio.error_metric
mae = metric(valid_salaries, predictions)
print "MAE validation: ", mae
dio.save_model(classifier, name, mae, parameters=param)
dio.save_prediction(name, predictions, type_n=type_v)

importances = classifier.feature_importances_
std = np.std([tree.feature_importances_ for tree in classifier.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]
f_names = map(lambda x: "Title(%d)" % x, range(1, 201))
f_names.extend(map(lambda x: "Desc(%d)" % x, range(1, 201)))
f_names.extend(map(lambda x: "LocR(%d)" % x, range(1, 201)))
f_names.extend(columns)
num_feat = len(f_names)

# Print the feature ranking
print "Feature ranking:"
min_samples_split = 2
name = "ExtraTrees_min_sample%d_%dtrees_tfidf-05d_BoW-titleFullRaw-AllColumns_new_log" % (
    min_samples_split, n_trees)
classifier = ExtraTreesRegressor(n_estimators=n_trees,
                                 #classifier = RandomForestRegressor(n_estimators=n_trees,
                                 verbose=2,
                                 n_jobs=4,  # 2 jobs on submission / 4 on valid test
                                 oob_score=True,
                                 min_samples_split=min_samples_split,
                                 random_state=3465343)
classifier.fit(features, salaries)
#classifier = dio.load_model(name)
predictions = classifier.predict(validation_features)
metric = dio.error_metric
mae = metric(valid_salaries, predictions)
print "MAE validation: ", mae
dio.save_model(classifier, name, mae, parameters=param)
dio.save_prediction(name, predictions, type_n=type_v)

importances = classifier.feature_importances_
std = np.std([tree.feature_importances_ for tree in classifier.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]
f_names = map(lambda x: "Title(%d)" % x, range(1, 201))
f_names.extend(map(lambda x: "Desc(%d)" % x, range(1, 201)))
f_names.extend(map(lambda x: "LocR(%d)" % x, range(1, 201)))
f_names.extend(columns)
num_feat = len(f_names)

# Print the feature ranking
print "Feature ranking:"
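# The actual ranking loop is cut off above. Something in the spirit of the
# scikit-learn feature-importances example would print it roughly like this
# (the original formatting is not shown, so this is an approximation):
for f in range(num_feat):
    print "%d. %s (%f) +/- %f" % (f + 1, f_names[indices[f]],
                                  importances[indices[f]], std[indices[f]])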
    return predictions_part, valid_idx

predictions = np.zeros_like(valid_salaries)
for cur_class_id in range(num_classes + 1):
    predictions_part, idx = predict(cur_class_id)
    if idx is not None:
        predictions[idx] = predictions_part
        print "Part MAE: ", metric(valid_salaries[idx], predictions_part)

if submission:
    dio.save_prediction(name, predictions, type_n=type_v)
    dio.write_submission(name + ".csv", predictions=predictions)
else:
    dio.compare_valid_pred(valid_salaries, predictions)
    metric = dio.error_metric
    mae = metric(valid_salaries, predictions)
    print "MAE validation: ", mae
    dio.save_model(ExtraTreesRegressor(), name, mae)
    dio.save_prediction(name, predictions, type_n=type_v)

#oob_predictions = classifier.oob_prediction_
#mae_oob = mean_absolute_error(salaries, oob_predictions)
#print "MAE OOB: ", mae_oob
#classifier1 = ExtraTreesRegressor(n_estimators=n_trees,
                                  #verbose=1,
                                  #n_jobs=3,
                                  #oob_score=False,
                                  #min_samples_split=min_samples_split,
                                  #random_state=3465343)
#scores = cross_val_score(classifier1, features, salaries, cv=3, score_func=metric, verbose=1)
#print "Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() / 2)
#mae_cv = "%0.2f (+/- %0.2f)" % (scores.mean(), scores.std() / 2)
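
# predict(cur_class_id) is defined above this excerpt; only its final return
# statement survives here. The loop implies a classify-then-regress scheme: a
# classifier first assigns each validation row to a salary class, then a
# per-class regressor predicts the salary for the rows of that class. A
# hypothetical version (valid_class_pred and load_class_regressor are made-up
# names, not part of the original code) could look like:
def predict_sketch(cur_class_id):
    # rows the classifier placed in this salary class
    valid_idx = np.where(valid_class_pred == cur_class_id)[0]
    if len(valid_idx) == 0:
        return None, None
    regressor = load_class_regressor(cur_class_id)  # hypothetical helper
    predictions_part = regressor.predict(validation_features[valid_idx])
    return predictions_part, valid_idx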
isbetter = "DA" else: isbetter = "nope" ppara = "Curr MAE: %0.2f Best MAE: %0.2f %s\n" % (mae_pred, mae_best_pred, isbetter) print ppara param += ppara if submission: dio.save_prediction(name, predictions, type_n=type_v) dio.write_submission(name + ".csv", predictions=predictions) else: dio.compare_valid_pred(valid_salaries, predictions) metric = dio.error_metric mae = metric(valid_salaries, predictions) print "MAE validation: ", mae dio.save_model(ExtraTreesRegressor(n_estimators=n_trees, min_samples_split=min_samples_split, random_state=3465343), name, mae, parameters=param) dio.save_prediction(name, predictions, type_n=type_v) #oob_predictions = classifier.oob_prediction_ #mae_oob = mean_absolute_error(salaries, oob_predictions) #print "MAE OOB: ", mae_oob #classifier1 = ExtraTreesRegressor(n_estimators=n_trees, #verbose=1, #n_jobs=3, #oob_score=False, #min_samples_split=min_samples_split, #random_state=3465343) #scores = cross_val_score(classifier1, features, salaries, cv=3, score_func=metric, verbose=1) #print "Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() / 2) #mae_cv = "%0.2f (+/- %0.2f)" % (scores.mean(), scores.std() / 2) #dio.save_model(classifier, name, mae_cv=mae_cv, parameters=param)
print("parameters:") pprint(parameters) t0 = time() grid_search.fit(locraw_corpus_csc, salaries) print("done in %0.3fs" % (time() - t0)) print() print("Best score: %0.3f" % grid_search.best_score_) print("Best parameters set:") best_parameters = grid_search.best_params_ for param_name in sorted(parameters.keys()): print("\t%s: %r" % (param_name, best_parameters[param_name])) best_estimator = pipeline.set_params(**best_parameters) dio.save_model(best_estimator, model_name, mae_cv=grid_search.best_score_, parameters="GridCV") print grid_search.cv_scores_ #title #[CVScoreTuple(parameters={'pca__n_components': 50}, mean_validation_score=8114.354436894354, cv_validation_scores=array([ 7832.02927486, 8142.59464648, 8368.43938934])), CVScoreTuple(parameters={'pca__n_components': 100}, mean_validation_score=8050.372232184578, cv_validation_scores=array([ 7805.2436147 , 8050.13238353, 8295.74069832])), CVScoreTuple(parameters={'pca__n_components': 150}, mean_validation_score=7992.0067620542031, cv_validation_scores=array([ 7713.16260219, 8011.95811042, 8250.89957355])), CVScoreTuple(parameters={'pca__n_components': 200}, mean_validation_score=7969.954716946886, cv_validation_scores=array([ 7686.53479712, 7994.52939814, 8228.79995559])), CVScoreTuple(parameters={'pca__n_components': 250}, mean_validation_score=7951.4574455230977, cv_validation_scores=array([ 7667.32401519, 7986.80421311, 8200.24410827])), CVScoreTuple(parameters={'pca__n_components': 300}, mean_validation_score=7944.4481204410049, cv_validation_scores=array([ 7679.24885978, 7956.03557116, 8198.05993038])), CVScoreTuple(parameters={'pca__n_components': 350}, mean_validation_score=7944.1183320537721, cv_validation_scores=array([ 7676.60423802, 7978.78605684, 8176.9647013 ])), CVScoreTuple(parameters={'pca__n_components': 400}, mean_validation_score=7899.1156367965059, cv_validation_scores=array([ 7645.1559318 , 7910.82353156, 8141.36744703])), CVScoreTuple(parameters={'pca__n_components': 450}, mean_validation_score=7912.5157328026426, cv_validation_scores=array([ 7640.30124334, 7911.32395596, 8185.92199911])), CVScoreTuple(parameters={'pca__n_components': 500}, mean_validation_score=7895.2892790734322, cv_validation_scores=array([ 7605.04498066, 7907.65668509, 8173.16617146]))] #Best score: 7895.289 #Best parameters set: #pca__n_components: 500 #description #Best score: 9004.720 #Best parameters set: #pca__n_components: 200 #[CVScoreTuple(parameters={'pca__n_components': 100}, mean_validation_score=9030.5959214303093, cv_validation_scores=array([ 8729.14388367, 9085.7485522 , 9276.89532842])), CVScoreTuple(parameters={'pca__n_components': 200}, mean_validation_score=9004.7195356220382, cv_validation_scores=array([ 8712.73164292, 9081.87764045, 9219.54932349])), CVScoreTuple(parameters={'pca__n_components': 300}, mean_validation_score=9064.9514615674289, cv_validation_scores=array([ 8783.89079244, 9134.31360085, 9276.64999141])), CVScoreTuple(parameters={'pca__n_components': 400}, mean_validation_score=9076.2734308900635, cv_validation_scores=array([ 8776.89542476, 9160.65547469, 9291.26939321])), CVScoreTuple(parameters={'pca__n_components': 500}, mean_validation_score=9135.1105593660341, cv_validation_scores=array([ 8850.89803432, 9207.27553593, 9347.15810784])), CVScoreTuple(parameters={'pca__n_components': 600}, mean_validation_score=9124.6361021371431, cv_validation_scores=array([ 8837.96896581, 9150.07743191, 9385.8619087 ]))] #locraw