Example #1
0
def tfidf_cloud(n_trees):
    """Train an ExtraTrees salary regressor on precomputed TF-IDF features.

    Loads joblib-dumped feature matrices and log-scaled salary targets from
    /data, fits an ExtraTreesRegressor with ``n_trees`` estimators, and either
    writes a submission CSV (submission mode) or reports validation MAE and
    persists the model and predictions through the DataIO helper.

    Parameters
    ----------
    n_trees : int
        Number of trees (``n_estimators``) for the ExtraTreesRegressor.
    """
    dio = DataIO("/data/Settings_cloud.json")
    # False => local train/valid evaluation run; True => full-data submission.
    submission = False
    min_samples_split = 2
    # Free-form experiment description stored alongside the saved model.
    param = """Normal count vector with max 200. New submission which is repeatable.
    and nicer

    count_vector_titles = TfidfVectorizer(
        read_column(train_filename, column_name),
        max_features=200, norm='l1', smooth_idf=True, sublinear_tf=False, use_idf=True)
    """

    if submission:
        type_n = "train_full"
        type_v = "valid_full"
    else:
        type_n = "train"
        type_v = "valid"

#features = dio.join_features("%s_" + type_n + "_tfidf_matrix_max_f_200",
#["Title", "FullDescription", "LocationRaw"],
#extra_features)
#validation_features = dio.join_features("%s_" + type_v + "_tfidf_matrix_max_f_200",
#["Title", "FullDescription", "LocationRaw"],
#extra_valid_features)

#np.save("train_200f_noNorm_categoryTimeType_tfidfl2_features", features)
#np.save("train_200f_noNorm_categoryTimeType_tfidfl2_valid_features", validation_features)

    def load(filename):
        # Shortcut for loading a joblib dump from the /data directory.
        return joblib.load(path_join("/data", filename))

    features = load("train_200f_noNorm_categoryTimeType_tfidfl1_features_jl")
    validation_features = load(
        "train_200f_noNorm_categoryTimeType_tfidfl1_valid_features_jl")

    print "features", features.shape
    print "valid features", validation_features.shape

    #salaries = dio.get_salaries(type_n, log=True)
    #if not submission:
    #valid_salaries = dio.get_salaries(type_v, log=True)

    #np.save("train_200f_noNorm_categoryTimeType_tfidfl2_salaries", salaries)
    #np.save("train_200f_noNorm_categoryTimeType_tfidfl2_valid_salaries", valid_salaries)

    #joblib.dump(salaries, "train_200f_noNorm_categoryTimeType_tfidfl2_salaries_jl", compress=5)
    #joblib.dump(valid_salaries, "train_200f_noNorm_categoryTimeType_tfidfl2_valid_salaries_jl", compress=5)

    #TODO: valid salaries were dumped incorrectly

    # NOTE(review): features come from the *tfidfl1* dumps while targets come
    # from the *tfidfl2* dumps -- presumably the targets are identical either
    # way, but worth confirming.
    salaries = load("train_200f_noNorm_categoryTimeType_tfidfl2_salaries_jl")
    valid_salaries = load(
        "train_200f_noNorm_categoryTimeType_tfidfl2_valid_salaries_jl")
    # Targets were dumped log-transformed; tell DataIO so it can undo that.
    dio.is_log = True

    print salaries.shape

    name = "ExtraTree_min_sample%d_%dtrees_200f_noNorm_categoryTimeType_tfidfl1_new_log" % (
        min_samples_split, n_trees)
    print name
    #dio.save_prediction("testni", np.array([1,2,3]), type_n="testno")
    classifier = ExtraTreesRegressor(
        n_estimators=n_trees,
        verbose=2,
        n_jobs=4,  # 2 jobs on submission / 4 on valid test
        oob_score=False,
        min_samples_split=min_samples_split,
        random_state=3465343)  # fixed seed so runs are repeatable

    #dio.save_model(classifier, "testni_model", 99.)
    classifier.fit(features, salaries)
    predictions = classifier.predict(validation_features)
    if submission:
        dio.save_prediction(name, predictions, type_n=type_v)
        dio.write_submission(name + ".csv", predictions=predictions)
    else:
        # Local evaluation: report MAE against held-out salaries and persist
        # both the fitted model and its predictions under ``name``.
        dio.compare_valid_pred(valid_salaries, predictions)
        metric = dio.error_metric
        mae = metric(valid_salaries, predictions)
        print "MAE validation: ", mae
        dio.save_model(classifier, name, mae)
        dio.save_prediction(name, predictions, type_n=type_v)
    #if bin_n > 4 :
        #make_grid_search(MultinomialNB(), {"alpha": [0.01, 0.1, 0.5, 1]}, "multinomialnb" + nm, "Multinomial NB" + par)
        #make_grid_search(SGDClassifier(), {'n_iter': [50, 100, 150], 'penalty': ['l2', 'l1']}, "sgd_class" + nm, "SGDClassifier classes" + par)
    ##make_grid_search(KNeighborsClassifier(), {'n_neighbors': range(4,100,20)}, "kneighbour" + nm, "Kneighbour classes" + par)
    #make_grid_search(RandomForestClassifier(random_state=42), {'min_samples_split': [2, 30]}, "randomForest" + nm, "Random Forest" + par)
    ##make_grid_search(GradientBoostingClassifier(), {'learning_rate': [0.1, 0.5], 'subsample': [1,0.8,0.6], 'n_estimators':[100,150]}, "GBM" + nm, "Gradient Boosting Machines " + par)

# Salary-bucket classification experiment: discretise the salaries into
# bin_n ordinal classes and train a RandomForest classifier on the same
# TF-IDF feature matrices, reporting per-class metrics for valid and train.
bin_n = 4
salaries_enc = encode_salaries(salaries, bin_n)
valid_salaries_enc = encode_salaries(valid_salaries, bin_n)
nm = "_tfidf_titleFullLoc_bin%d" % bin_n
model_name = "randomForest" + nm
# Experiment description persisted with the saved model.
par = " classed from 0-11500 then %d classes to 100 000 and to end\n Tfidf of Title full and location Raw" % (bin_n)

classifier = RandomForestClassifier(min_samples_split=2, random_state=42)
classifier.fit(features, salaries_enc)
prediction = classifier.predict(validation_features)
dio.save_prediction(model_name, prediction, "valid_classes")
dio.save_model(classifier, model_name, parameters=par)

# Validation-set per-class report and confusion matrix.
print (classification_report(valid_salaries_enc, prediction))

print confusion_matrix(valid_salaries_enc, prediction)

# Train-set report (useful to eyeball overfitting).
prediction = classifier.predict(features)
dio.save_prediction(model_name, prediction, "train_classes")

print (classification_report(salaries_enc, prediction))

print confusion_matrix(salaries_enc, prediction)
            ppara = "Curr MAE: %0.2f Best MAE: %0.2f %s\n" % (
                mae_pred, mae_best_pred, isbetter)
            print ppara
            param += ppara

    if submission:
        dio.save_prediction(name, predictions, type_n=type_v)
        dio.write_submission(name + ".csv", predictions=predictions)
    else:
        dio.compare_valid_pred(valid_salaries, predictions)
        metric = dio.error_metric
        mae = metric(valid_salaries, predictions)
        print "MAE validation: ", mae
        dio.save_model(ExtraTreesRegressor(n_estimators=n_trees,
                                           min_samples_split=min_samples_split,
                                           random_state=3465343),
                       name,
                       mae,
                       parameters=param)
        dio.save_prediction(name, predictions, type_n=type_v)
#oob_predictions = classifier.oob_prediction_
#mae_oob = mean_absolute_error(salaries, oob_predictions)
#print "MAE OOB: ", mae_oob
#classifier1 = ExtraTreesRegressor(n_estimators=n_trees,
#verbose=1,
#n_jobs=3,
#oob_score=False,
#min_samples_split=min_samples_split,
#random_state=3465343)
#scores = cross_val_score(classifier1, features, salaries, cv=3, score_func=metric, verbose=1)
#print "Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() / 2)
#mae_cv = "%0.2f (+/- %0.2f)" % (scores.mean(), scores.std() / 2)
Example #4
0
    min_samples_split, n_trees)
classifier = ExtraTreesRegressor(
    n_estimators=n_trees,
    #classifier = RandomForestRegressor(n_estimators=n_trees,
    verbose=2,
    n_jobs=4,  # 2 jobs on submission / 4 on valid test
    oob_score=True,
    min_samples_split=min_samples_split,
    random_state=3465343)
classifier.fit(features, salaries)
#classifier = dio.load_model(name)
#predictions = classifier.predict(validation_features)
metric = dio.error_metric
mae = metric(valid_salaries, predictions)
print "MAE validation: ", mae
dio.save_model(classifier, name, mae, parameters=param)
dio.save_prediction(name, predictions, type_n=type_v)
importances = classifier.feature_importances_
std = np.std([tree.feature_importances_ for tree in classifier.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]
f_names = map(lambda x: "Title(%d)" % x, range(1, 201))
f_names.extend(map(lambda x: "Desc(%d)" % x, range(1, 201)))
f_names.extend(map(lambda x: "LocR(%d)" % x, range(1, 201)))
f_names.extend(columns)

num_feat = len(f_names)

# Print the feature ranking
print "Feature ranking:"
Example #5
0
min_samples_split=2
name = "ExtraTrees_min_sample%d_%dtrees_tfidf-05d_BoW-titleFullRaw-AllColumns_new_log" % (min_samples_split, n_trees)
classifier = ExtraTreesRegressor(n_estimators=n_trees,
#classifier = RandomForestRegressor(n_estimators=n_trees,
                                verbose=2,
                                n_jobs=4, # 2 jobs on submission / 4 on valid test
                                oob_score=True,
                                min_samples_split=min_samples_split,
                                random_state=3465343)
classifier.fit(features, salaries)
#classifier = dio.load_model(name)
#predictions = classifier.predict(validation_features)
metric = dio.error_metric
mae = metric(valid_salaries, predictions)
print "MAE validation: ", mae
dio.save_model(classifier, name, mae, parameters=param)
dio.save_prediction(name, predictions, type_n=type_v)
importances = classifier.feature_importances_
std = np.std([tree.feature_importances_ for tree in classifier.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]
f_names = map(lambda x: "Title(%d)" % x,range(1,201))
f_names.extend(map(lambda x: "Desc(%d)" % x,range(1,201)))
f_names.extend(map(lambda x: "LocR(%d)" % x,range(1,201)))
f_names.extend(columns)

num_feat = len(f_names)

# Print the feature ranking
print "Feature ranking:"
        return predictions_part, valid_idx
    predictions = np.zeros_like(valid_salaries)
    for cur_class_id in range(num_classes + 1):
        predictions_part, idx = predict(cur_class_id)
        if idx is not None:
            predictions[idx] = predictions_part
            print "Part MAE: ", metric(valid_salaries[idx], predictions_part)
    if submission:
        dio.save_prediction(name, predictions, type_n=type_v)
        dio.write_submission(name + ".csv", predictions=predictions)
    else:
        dio.compare_valid_pred(valid_salaries, predictions)
        metric = dio.error_metric
        mae = metric(valid_salaries, predictions)
        print "MAE validation: ", mae
        dio.save_model(ExtraTreesRegressor(), name, mae)
        dio.save_prediction(name, predictions, type_n=type_v)
#oob_predictions = classifier.oob_prediction_
#mae_oob = mean_absolute_error(salaries, oob_predictions)
#print "MAE OOB: ", mae_oob
        #classifier1 = ExtraTreesRegressor(n_estimators=n_trees,
                                            #verbose=1,
                                            #n_jobs=3,
                                            #oob_score=False,
                                            #min_samples_split=min_samples_split,
                                            #random_state=3465343)
        #scores = cross_val_score(classifier1, features, salaries, cv=3, score_func=metric, verbose=1)
        #print "Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() / 2)
        #mae_cv = "%0.2f (+/- %0.2f)" % (scores.mean(), scores.std() / 2)
        #dio.save_model(classifier, name, mae_cv=mae_cv, parameters=param)
#make_grid_search(MultinomialNB(), {"alpha": [0.01, 0.1, 0.5, 1]}, "multinomialnb" + nm, "Multinomial NB" + par)
#make_grid_search(SGDClassifier(), {'n_iter': [50, 100, 150], 'penalty': ['l2', 'l1']}, "sgd_class" + nm, "SGDClassifier classes" + par)
##make_grid_search(KNeighborsClassifier(), {'n_neighbors': range(4,100,20)}, "kneighbour" + nm, "Kneighbour classes" + par)
#make_grid_search(RandomForestClassifier(random_state=42), {'min_samples_split': [2, 30]}, "randomForest" + nm, "Random Forest" + par)
##make_grid_search(GradientBoostingClassifier(), {'learning_rate': [0.1, 0.5], 'subsample': [1,0.8,0.6], 'n_estimators':[100,150]}, "GBM" + nm, "Gradient Boosting Machines " + par)

# Salary-bucket classification experiment (duplicate sample): discretise the
# salaries into bin_n ordinal classes, train a RandomForest classifier on the
# TF-IDF feature matrices, and report per-class metrics for valid and train.
bin_n = 4
salaries_enc = encode_salaries(salaries, bin_n)
valid_salaries_enc = encode_salaries(valid_salaries, bin_n)
nm = "_tfidf_titleFullLoc_bin%d" % bin_n
model_name = "randomForest" + nm
# Experiment description persisted with the saved model.
par = " classed from 0-11500 then %d classes to 100 000 and to end\n Tfidf of Title full and location Raw" % (
    bin_n)

classifier = RandomForestClassifier(min_samples_split=2, random_state=42)
classifier.fit(features, salaries_enc)
prediction = classifier.predict(validation_features)
dio.save_prediction(model_name, prediction, "valid_classes")
dio.save_model(classifier, model_name, parameters=par)

# Validation-set per-class report and confusion matrix.
print(classification_report(valid_salaries_enc, prediction))

print confusion_matrix(valid_salaries_enc, prediction)

# Train-set report (useful to eyeball overfitting).
prediction = classifier.predict(features)
dio.save_prediction(model_name, prediction, "train_classes")

print(classification_report(salaries_enc, prediction))

print confusion_matrix(salaries_enc, prediction)
def tfidf_cloud(n_trees):
    """Train an ExtraTrees salary regressor on precomputed TF-IDF features.

    Second scraped copy of the same routine: loads joblib-dumped feature
    matrices and log-scaled salary targets from /data, fits an
    ExtraTreesRegressor with ``n_trees`` estimators, and either writes a
    submission CSV or reports validation MAE and saves model/predictions.

    Parameters
    ----------
    n_trees : int
        Number of trees (``n_estimators``) for the ExtraTreesRegressor.
    """
    dio = DataIO("/data/Settings_cloud.json")
    # False => local train/valid evaluation run; True => full-data submission.
    submission = False
    min_samples_split = 2
    # Free-form experiment description stored alongside the saved model.
    param = """Normal count vector with max 200. New submission which is repeatable.
    and nicer

    count_vector_titles = TfidfVectorizer(
        read_column(train_filename, column_name),
        max_features=200, norm='l1', smooth_idf=True, sublinear_tf=False, use_idf=True)
    """

    if submission:
        type_n = "train_full"
        type_v = "valid_full"
    else:
        type_n = "train"
        type_v = "valid"



#features = dio.join_features("%s_" + type_n + "_tfidf_matrix_max_f_200",
                                #["Title", "FullDescription", "LocationRaw"],
                                #extra_features)
#validation_features = dio.join_features("%s_" + type_v + "_tfidf_matrix_max_f_200",
                                            #["Title", "FullDescription", "LocationRaw"],
                                            #extra_valid_features)

#np.save("train_200f_noNorm_categoryTimeType_tfidfl2_features", features)
#np.save("train_200f_noNorm_categoryTimeType_tfidfl2_valid_features", validation_features)
    def load(filename):
        # Shortcut for loading a joblib dump from the /data directory.
        return joblib.load(path_join("/data", filename))

    features = load("train_200f_noNorm_categoryTimeType_tfidfl1_features_jl")
    validation_features = load("train_200f_noNorm_categoryTimeType_tfidfl1_valid_features_jl")

    print "features", features.shape
    print "valid features", validation_features.shape

#salaries = dio.get_salaries(type_n, log=True)
#if not submission:
        #valid_salaries = dio.get_salaries(type_v, log=True)

#np.save("train_200f_noNorm_categoryTimeType_tfidfl2_salaries", salaries)
#np.save("train_200f_noNorm_categoryTimeType_tfidfl2_valid_salaries", valid_salaries)

#joblib.dump(salaries, "train_200f_noNorm_categoryTimeType_tfidfl2_salaries_jl", compress=5)
#joblib.dump(valid_salaries, "train_200f_noNorm_categoryTimeType_tfidfl2_valid_salaries_jl", compress=5)

#TODO: valid salaries were dumped incorrectly

    # NOTE(review): features come from the *tfidfl1* dumps while targets come
    # from the *tfidfl2* dumps -- presumably the targets are identical either
    # way, but worth confirming.
    salaries = load("train_200f_noNorm_categoryTimeType_tfidfl2_salaries_jl")
    valid_salaries = load("train_200f_noNorm_categoryTimeType_tfidfl2_valid_salaries_jl")
    # Targets were dumped log-transformed; tell DataIO so it can undo that.
    dio.is_log = True

    print salaries.shape


    name = "ExtraTree_min_sample%d_%dtrees_200f_noNorm_categoryTimeType_tfidfl1_new_log" % (min_samples_split, n_trees)
    print name
    #dio.save_prediction("testni", np.array([1,2,3]), type_n="testno")
    classifier = ExtraTreesRegressor(n_estimators=n_trees,
                                    verbose=2,
                                    n_jobs=4, # 2 jobs on submission / 4 on valid test
                                    oob_score=False,
                                    min_samples_split=min_samples_split,
                                    random_state=3465343)

    #dio.save_model(classifier, "testni_model", 99.)
    classifier.fit(features, salaries)
    predictions = classifier.predict(validation_features)
    if submission:
        dio.save_prediction(name, predictions, type_n=type_v)
        dio.write_submission(name + ".csv", predictions=predictions)
    else:
        # Local evaluation: report MAE against held-out salaries and persist
        # both the fitted model and its predictions under ``name``.
        dio.compare_valid_pred(valid_salaries, predictions)
        metric = dio.error_metric
        mae = metric(valid_salaries, predictions)
        print "MAE validation: ", mae
        dio.save_model(classifier, name, mae)
        dio.save_prediction(name, predictions, type_n=type_v)
                isbetter = "DA"
            else:
                isbetter = "nope"
            ppara = "Curr MAE: %0.2f Best MAE: %0.2f %s\n" % (mae_pred, mae_best_pred, isbetter)
            print ppara
            param += ppara

    if submission:
        dio.save_prediction(name, predictions, type_n=type_v)
        dio.write_submission(name + ".csv", predictions=predictions)
    else:
        dio.compare_valid_pred(valid_salaries, predictions)
        metric = dio.error_metric
        mae = metric(valid_salaries, predictions)
        print "MAE validation: ", mae
        dio.save_model(ExtraTreesRegressor(n_estimators=n_trees, min_samples_split=min_samples_split, random_state=3465343), name, mae, parameters=param)
        dio.save_prediction(name, predictions, type_n=type_v)
#oob_predictions = classifier.oob_prediction_
#mae_oob = mean_absolute_error(salaries, oob_predictions)
#print "MAE OOB: ", mae_oob
        #classifier1 = ExtraTreesRegressor(n_estimators=n_trees,
                                            #verbose=1,
                                            #n_jobs=3,
                                            #oob_score=False,
                                            #min_samples_split=min_samples_split,
                                            #random_state=3465343)
        #scores = cross_val_score(classifier1, features, salaries, cv=3, score_func=metric, verbose=1)
        #print "Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() / 2)
        #mae_cv = "%0.2f (+/- %0.2f)" % (scores.mean(), scores.std() / 2)
        #dio.save_model(classifier, name, mae_cv=mae_cv, parameters=param)
Example #10
0
    predictions = np.zeros_like(valid_salaries)
    for cur_class_id in range(num_classes + 1):
        predictions_part, idx = predict(cur_class_id)
        if idx is not None:
            predictions[idx] = predictions_part
            print "Part MAE: ", metric(valid_salaries[idx], predictions_part)
    if submission:
        dio.save_prediction(name, predictions, type_n=type_v)
        dio.write_submission(name + ".csv", predictions=predictions)
    else:
        dio.compare_valid_pred(valid_salaries, predictions)
        metric = dio.error_metric
        mae = metric(valid_salaries, predictions)
        print "MAE validation: ", mae
        dio.save_model(ExtraTreesRegressor(), name, mae)
        dio.save_prediction(name, predictions, type_n=type_v)
#oob_predictions = classifier.oob_prediction_
#mae_oob = mean_absolute_error(salaries, oob_predictions)
#print "MAE OOB: ", mae_oob
#classifier1 = ExtraTreesRegressor(n_estimators=n_trees,
#verbose=1,
#n_jobs=3,
#oob_score=False,
#min_samples_split=min_samples_split,
#random_state=3465343)
#scores = cross_val_score(classifier1, features, salaries, cv=3, score_func=metric, verbose=1)
#print "Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() / 2)
#mae_cv = "%0.2f (+/- %0.2f)" % (scores.mean(), scores.std() / 2)
#dio.save_model(classifier, name, mae_cv=mae_cv, parameters=param)
Example #11
0
    print("parameters:")
    pprint(parameters)
    t0 = time()
    grid_search.fit(locraw_corpus_csc, salaries)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_params_
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

    best_estimator = pipeline.set_params(**best_parameters)
    dio.save_model(best_estimator,
                   model_name,
                   mae_cv=grid_search.best_score_,
                   parameters="GridCV")
    print grid_search.cv_scores_
#title
#[CVScoreTuple(parameters={'pca__n_components': 50}, mean_validation_score=8114.354436894354, cv_validation_scores=array([ 7832.02927486,  8142.59464648,  8368.43938934])), CVScoreTuple(parameters={'pca__n_components': 100}, mean_validation_score=8050.372232184578, cv_validation_scores=array([ 7805.2436147 ,  8050.13238353,  8295.74069832])), CVScoreTuple(parameters={'pca__n_components': 150}, mean_validation_score=7992.0067620542031, cv_validation_scores=array([ 7713.16260219,  8011.95811042,  8250.89957355])), CVScoreTuple(parameters={'pca__n_components': 200}, mean_validation_score=7969.954716946886, cv_validation_scores=array([ 7686.53479712,  7994.52939814,  8228.79995559])), CVScoreTuple(parameters={'pca__n_components': 250}, mean_validation_score=7951.4574455230977, cv_validation_scores=array([ 7667.32401519,  7986.80421311,  8200.24410827])), CVScoreTuple(parameters={'pca__n_components': 300}, mean_validation_score=7944.4481204410049, cv_validation_scores=array([ 7679.24885978,  7956.03557116,  8198.05993038])), CVScoreTuple(parameters={'pca__n_components': 350}, mean_validation_score=7944.1183320537721, cv_validation_scores=array([ 7676.60423802,  7978.78605684,  8176.9647013 ])), CVScoreTuple(parameters={'pca__n_components': 400}, mean_validation_score=7899.1156367965059, cv_validation_scores=array([ 7645.1559318 ,  7910.82353156,  8141.36744703])), CVScoreTuple(parameters={'pca__n_components': 450}, mean_validation_score=7912.5157328026426, cv_validation_scores=array([ 7640.30124334,  7911.32395596,  8185.92199911])), CVScoreTuple(parameters={'pca__n_components': 500}, mean_validation_score=7895.2892790734322, cv_validation_scores=array([ 7605.04498066,  7907.65668509,  8173.16617146]))]
#Best score: 7895.289
#Best parameters set:
#pca__n_components: 500

#description
#Best score: 9004.720
#Best parameters set:
#pca__n_components: 200
#[CVScoreTuple(parameters={'pca__n_components': 100}, mean_validation_score=9030.5959214303093, cv_validation_scores=array([ 8729.14388367,  9085.7485522 ,  9276.89532842])), CVScoreTuple(parameters={'pca__n_components': 200}, mean_validation_score=9004.7195356220382, cv_validation_scores=array([ 8712.73164292,  9081.87764045,  9219.54932349])), CVScoreTuple(parameters={'pca__n_components': 300}, mean_validation_score=9064.9514615674289, cv_validation_scores=array([ 8783.89079244,  9134.31360085,  9276.64999141])), CVScoreTuple(parameters={'pca__n_components': 400}, mean_validation_score=9076.2734308900635, cv_validation_scores=array([ 8776.89542476,  9160.65547469,  9291.26939321])), CVScoreTuple(parameters={'pca__n_components': 500}, mean_validation_score=9135.1105593660341, cv_validation_scores=array([ 8850.89803432,  9207.27553593,  9347.15810784])), CVScoreTuple(parameters={'pca__n_components': 600}, mean_validation_score=9124.6361021371431, cv_validation_scores=array([ 8837.96896581,  9150.07743191,  9385.8619087 ]))]

#locraw