示例#1
0
#["Title", "FullDescription", "LocationRaw"],
#extra_valid_features)
# Join the TF-IDF text columns with the extra encoded features into one
# sparse design matrix for the training split.  NOTE(review): dio, type_n,
# type_v, short_id, tfidf_columns, extra_features, extra_valid_features and
# submission are defined outside this fragment.
features = dio.join_features("%s_" + type_n + "_" + short_id + "_matrix",
                             tfidf_columns,
                             extra_features,
                             sparse=True)
# Same join for the validation split.
validation_features = dio.join_features("%s_" + type_v + "_" + short_id +
                                        "_matrix",
                                        tfidf_columns,
                                        extra_valid_features,
                                        sparse=True)

print features.shape
print validation_features.shape

# Log-transformed salary targets for the training split.
salaries = dio.get_salaries(type_n, log=True)
if not submission:
    valid_salaries = dio.get_salaries(type_v, log=True)

print salaries.shape
# One fixed bootstrap resample: 60% train / 40% test indices.
bs = Bootstrap(len(salaries), random_state=45, train_size=0.6)
train_index, test_index = next(iter(bs))
param = """Normal count vector with max 200. New submission which is repeatable.
 and nicer

Bag of Words: %s\n

Encoded cols: %s\n

Logged
                                        #["Title", "FullDescription", "LocationRaw"],
                                        #extra_valid_features).astype(np.int64)
# Identifier for the cached TF-IDF matrices (200 features, L2-normalised).
short_id = "tfidf_200f_l2"
# Vectorise the text columns and persist count matrices for both splits.
# NOTE(review): vectorizer, dio, type_n/type_v, tfidf_columns and submission
# are defined outside this fragment.
dio.make_counts(vectorizer, short_id, tfidf_columns, type_n, type_v)
# No extra label-encoded columns are joined in this run.
extra_features = []
extra_valid_features = []
features = dio.join_features("%s_" + type_n + "_" + short_id + "_matrix",
                             tfidf_columns,
                             extra_features)
validation_features = dio.join_features("%s_" + type_v + "_" + short_id + "_matrix",
                                        tfidf_columns,
                                        extra_valid_features)

print features.max()
print features.min()
# Raw (non-log) salaries, cast to int64 for integer class labels.
salaries = dio.get_salaries(type_n, log=False).astype(np.int64)
if not submission:
    valid_salaries = dio.get_salaries(type_v, log=False)

def encode_salaries(salaries, bins):
    """Bucket salaries into equal-width classes over [11500, 100000].

    Builds ``bins + 1`` evenly spaced edges and returns, for each salary,
    the index of the bin it falls into (``searchsorted`` with
    ``side="right"``): values below 11500 map to class 0 and values above
    100000 map to class ``bins + 1``.  Prints the (constant) bin widths
    as a sanity check.
    """
    bin_edges = np.linspace(11500.0, 100000, bins + 1, endpoint=True)
    # print() with a single argument behaves identically on Python 2 and 3;
    # the original bare `print` statement was Python-2-only syntax.
    print(np.diff(bin_edges))
    idxs = np.searchsorted(bin_edges, salaries, side="right")
    return idxs

#bin_edges.insert(0, 0)
#bin_edges.append(salaries.max() + 1)

#print "hist", hist
#print "edges", bin_edges
# Select one encoded column as a standalone per-category feature.
# NOTE(review): col_index, extra_features and extra_valid_features are
# defined outside this fragment.
feature_category = extra_features[col_index]
validation_features_category = extra_valid_features[col_index]

#features = dio.join_features("%s_" + type_n + "_count_vector_matrix_max_f_200",
#["Title", "FullDescription", "LocationRaw"],
#extra_features)
#validation_features = dio.join_features("%s_" + type_v + "_count_vector_matrix_max_f_200",
#["Title", "FullDescription", "LocationRaw"],
#extra_valid_features).astype(np.int64)
# Join TF-IDF text columns with the extra encoded features (dense matrices).
# NOTE(review): dio, type_n/type_v, short_id, tfidf_columns and submission
# are defined outside this fragment.
features = dio.join_features("%s_" + type_n + "_" + short_id + "_matrix",
                             tfidf_columns, extra_features)
validation_features = dio.join_features(
    "%s_" + type_v + "_" + short_id + "_matrix", tfidf_columns,
    extra_valid_features)

# Log-salary targets cast to int64 (integer class labels).
salaries = dio.get_salaries(type_n, log=True).astype(np.int64)
if not submission:
    valid_salaries = dio.get_salaries(type_v, log=True)
# Previously stored validation predictions of the ExtraTrees model,
# loaded for comparison/stacking.
best_predictions = dio.get_prediction(
    model_name="ExtraTree_min_sample2_20trees_200f_noNorm_categoryTimeType_log",
    type_n="valid")

# Free-text description of this experiment (presumably logged with results).
par = " classed from 0-11500 then 4 classes to 100 000 and to end NoNormal classTypeTime"


def encode_salaries(salaries, bins):
    """Bucket salaries into equal-width classes over [11500, 100000].

    Returns, for each salary, the index of its bin among ``bins + 1``
    evenly spaced edges (``searchsorted`` with ``side="right"``): values
    below 11500 map to class 0, values above 100000 to ``bins + 1``.
    Prints the (constant) bin widths as a sanity check.
    """
    bin_edges = np.linspace(11500.0, 100000, bins + 1, endpoint=True)
    # Single-argument print() is valid and identical on Python 2 and 3;
    # the original bare `print` statement was Python-2-only syntax.
    print(np.diff(bin_edges))
    idxs = np.searchsorted(bin_edges, salaries, side="right")
    return idxs
示例#4
0
#columns = ["Category", "ContractTime", "ContractType"]
#le_features = dio.get_le_features(columns, "train_full")
#extra_features = dio.get_features(columns, type_n, le_features)
#extra_valid_features = dio.get_features(columns, type_v, le_features)
#features = dio.join_features("%s_" + type_n + "_tfidf_matrix_max_f_200",
                             #["Title", "FullDescription", "LocationRaw"],
                             #extra_features)
#validation_features = dio.join_features("%s_" + type_v + "_tfidf_matrix_max_f_200",
                                        #["Title", "FullDescription", "LocationRaw"],
                                        #extra_valid_features)
#print "features", features.shape
#print "valid features", validation_features.shape

#salaries = dio.get_salaries(type_n, log=True)
# Raw (non-log) validation salaries used for scoring predictions.
# NOTE(review): submission, dio and type_v are defined outside this fragment.
if not submission:
    valid_salaries = dio.get_salaries(type_v, log=False)
print valid_salaries.shape

# Names of previously trained models whose stored predictions are to be
# loaded: three ExtraTrees naming variants per forest size (20/30/40 trees)
# plus two Vowpal Wabbit runs.
model_names = []
for n_trees in [20, 30, 40]:
    name = "ExtraTree_min_sample2_%dtrees_200f_noNorm_categoryTimeType_new_log" % (n_trees)
    model_names.append(name)
    name = "ExtraTree_min_sample2_%dtrees_200f_noNorm_categoryTimeType_new" % (n_trees)
    model_names.append(name)
    name = "ExtraTree_min_sample2_%dtrees_200f_noNorm_categoryTimeType_tfidfl2_new_log" % (n_trees)
    model_names.append(name)
model_names.append("vowpall")
model_names.append("vowpall_loc5")


#fit_predict(model2)
示例#5
0
                             #["Title", "FullDescription", "LocationRaw"],
                             #extra_features)
#validation_features = dio.join_features("%s_" + type_v + "_count_vector_matrix_max_f_200",
                                        #["Title", "FullDescription", "LocationRaw"],
                                        #extra_valid_features)
# Join the TF-IDF text columns with the extra encoded features into one
# sparse design matrix per split.  NOTE(review): dio, type_n/type_v,
# short_id, tfidf_columns, extra_features, extra_valid_features and
# submission are defined outside this fragment.
features = dio.join_features("%s_" + type_n + "_" + short_id + "_matrix",
                             tfidf_columns,
                             extra_features, sparse=True)
validation_features = dio.join_features("%s_" + type_v + "_" + short_id + "_matrix",
                                        tfidf_columns,
                                        extra_valid_features, sparse=True)

print features.shape
print validation_features.shape

# Log-transformed salary targets.
salaries = dio.get_salaries(type_n, log=True)
if not submission:
    valid_salaries = dio.get_salaries(type_v, log=True)

print salaries.shape
# One fixed bootstrap resample: 60% train / 40% test indices.
bs = Bootstrap(len(salaries), random_state=45, train_size=0.6)
train_index, test_index = next(iter(bs))
param = """Normal count vector with max 200. New submission which is repeatable.
 and nicer

Bag of Words: %s\n

Encoded cols: %s\n

Logged
from data_io import DataIO
from sklearn.decomposition import RandomizedPCA
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.base import clone
from sklearn.cross_validation import cross_val_score
import numpy as np

# Project I/O helper configured from Settings.json.
dio = DataIO("Settings.json")

# Bag-of-words corpus of NLTK-filtered job titles, read from a Matrix
# Market file (presumably a sparse matrix — confirm in DataIO).
title_corpus = dio.read_gensim_corpus("train_title_nltk_filtered.corpus.mtx")
pca = RandomizedPCA(random_state=3465343)
salaries = dio.get_salaries("train", log=True)

# Label-encode the categorical columns with encoders fit on train_full.
columns = ["Category", "ContractTime", "ContractType"]
le_features = dio.get_le_features(columns, "train_full")
extra_features = dio.get_features(columns, "train", le_features)
#extra_valid_features = dio.get_features(columns, "valid", le_features)

# Experiment description string (logged with the results).
param = "RandomizedPCA title 200 Fulldescription 200 " + ",".join(columns)
print map(len, extra_features)
# Reshape each 1-D feature list into an (n, 1) column so it can be
# stacked alongside the PCA output.
extra_features = map(lambda x: np.reshape(np.array(x), (len(x), 1)),
                     extra_features)

print type(title_corpus)
print title_corpus.shape

# Reduce the title corpus to its first 200 principal components.
title_pca = clone(pca)
title_pca.set_params(n_components=200)
title_corpus_pca = title_pca.fit_transform(title_corpus)

print type(title_corpus_pca)
示例#7
0
#columns = ["Category", "ContractTime", "ContractType"]
#le_features = dio.get_le_features(columns, "train_full")
#extra_features = dio.get_features(columns, type_n, le_features)
#extra_valid_features = dio.get_features(columns, type_v, le_features)
#features = dio.join_features("%s_" + type_n + "_tfidf_matrix_max_f_200",
#["Title", "FullDescription", "LocationRaw"],
#extra_features)
#validation_features = dio.join_features("%s_" + type_v + "_tfidf_matrix_max_f_200",
#["Title", "FullDescription", "LocationRaw"],
#extra_valid_features)
#print "features", features.shape
#print "valid features", validation_features.shape

#salaries = dio.get_salaries(type_n, log=True)
# Raw (non-log) validation salaries used for scoring predictions.
# NOTE(review): submission, dio and type_v are defined outside this fragment.
if not submission:
    valid_salaries = dio.get_salaries(type_v, log=False)
print valid_salaries.shape

# Names of previously trained models whose stored predictions are to be
# loaded: three ExtraTrees naming variants per forest size (20/30/40 trees)
# plus two Vowpal Wabbit runs.
model_names = []
for n_trees in [20, 30, 40]:
    name = "ExtraTree_min_sample2_%dtrees_200f_noNorm_categoryTimeType_new_log" % (
        n_trees)
    model_names.append(name)
    name = "ExtraTree_min_sample2_%dtrees_200f_noNorm_categoryTimeType_new" % (
        n_trees)
    model_names.append(name)
    name = "ExtraTree_min_sample2_%dtrees_200f_noNorm_categoryTimeType_tfidfl2_new_log" % (
        n_trees)
    model_names.append(name)
model_names.append("vowpall")
model_names.append("vowpall_loc5")
from data_io import DataIO
from sklearn.decomposition import RandomizedPCA
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.base import clone
from sklearn.cross_validation import cross_val_score
import numpy as np

# Project I/O helper configured from Settings.json.
dio = DataIO("Settings.json")

# Bag-of-words corpus of NLTK-filtered job titles, read from a Matrix
# Market file (presumably a sparse matrix — confirm in DataIO).
title_corpus = dio.read_gensim_corpus("train_title_nltk_filtered.corpus.mtx")
pca = RandomizedPCA(random_state=3465343)
salaries = dio.get_salaries("train", log=True)

# Label-encode the categorical columns with encoders fit on train_full.
columns = ["Category", "ContractTime", "ContractType"]
le_features = dio.get_le_features(columns, "train_full")
extra_features = dio.get_features(columns, "train", le_features)
#extra_valid_features = dio.get_features(columns, "valid", le_features)

# Experiment description string (logged with the results).
param = "RandomizedPCA title 200 Fulldescription 200 " + ",".join(columns)
print map(len, extra_features)
# Reshape each 1-D feature list into an (n, 1) column for later stacking.
extra_features = map(lambda x: np.reshape(np.array(x),(len(x),1)),extra_features)



print type(title_corpus)
print title_corpus.shape


# Reduce the title corpus to its first 200 principal components.
title_pca = clone(pca)
title_pca.set_params(n_components=200)
title_corpus_pca = title_pca.fit_transform(title_corpus)
示例#9
0
# Stored model predictions to analyse; the commented-out entries are
# variants that were tried and excluded from this run.
model_names = [
    "ExtraTree_min_sample2_40trees_200f_noNorm_categoryTimeType_log",
    "vowpall",
    #"ExtraTree_min_sample2_10trees_200f_noNorm_categoryTimeType_count_fake_14split_new_log",
    #"ExtraTree_min_sample2_10trees_200f_noNorm_categoryTimeType_count_fake_split_new_log",
    #"ExtraTree_min_sample2_10trees_200f_noNorm_categoryTimeType_count_rf10_4split_new1_log"
    #"ExtraTree_min_sample2_20trees_200f_noNorm_categoryTimeType_count_rf10_4split_newOKsalPredictValid_log",
    "ExtraTree_min_sample2_20trees_200f_noNorm_categoryTimeType_count_exTre20_4split_new_faked_log",
    #"ExtraTree_min_sample2_20trees_200f_noNorm_categoryTimeType_count_exTre20_4split_newOKsalPredictValid_log",
    #"ExtraTree_min_sample2_20trees_200f_noNorm_categoryTimeType_count_exTre20_4split_newPredictsalPredictValid_log",
    "Ridge_tfidf_05d_log"
    #"ExtraTree_min_sample2_20trees_200f_noNorm_categoryTimeType_count_exTre20_4split_newPredictsalPredictValid1_log",
]

# True validation salaries (raw scale) to compare predictions against.
valid_salaries = dio.get_salaries("valid", log=False)

# Axis limits / grid flag, presumably for prediction-error plots drawn
# elsewhere — confirm against the plotting code.
ylim = (0, 8000)
xlim = (-50000, 50000)
grid = True

def encode_salaries(salaries, bins):
    """Bucket salaries into equal-width classes over [11500, 100000].

    Builds ``bins + 1`` evenly spaced edges and returns a pair
    ``(idxs, bin_edges)`` where ``idxs`` holds, for each salary, the bin
    index from ``searchsorted`` with ``side="right"``: values below 11500
    map to class 0, values above 100000 to ``bins + 1``.  Prints the
    (constant) bin widths as a sanity check.
    """
    bin_edges = np.linspace(11500.0, 100000, bins + 1, endpoint=True)
    # Single-argument print() is valid and identical on Python 2 and 3;
    # the original bare `print` statement was Python-2-only syntax.
    print(np.diff(bin_edges))
    idxs = np.searchsorted(bin_edges, salaries, side="right")
    return idxs, bin_edges