Example #1
from os.path import join as path_join
from sklearn.feature_extraction.text import CountVectorizer
import joblib


paths = get_paths("Settings.json")
data_dir = paths["data_path"]
cache_dir = path_join(data_dir, "tmp")
# mem = Memory(cachedir=path_join(data_dir, "tmp"))
train_filename = paths["train_data_path"]
valid_filename = paths["valid_data_path"]
# count_vectorizer = mem.cache(CountVectorizer)

# build a word-count matrix for each of the selected text columns
for column_name in ["Title", "FullDescription", "LocationRaw", "LocationNormalized"]:
    # the column text is passed to fit_transform below, not to the constructor
    count_vector_titles = CountVectorizer(max_features=200)
    titles = count_vector_titles.fit_transform(read_column(train_filename, column_name))
    joblib.dump(count_vector_titles.vocabulary_, path_join(cache_dir, column_name + "count_vectorizer_vocabulary"))
    joblib.dump(count_vector_titles.stop_words_, path_join(cache_dir, column_name + "count_vectorizer_stop_words"))
    print joblib.dump(titles, path_join(cache_dir, column_name + "_train_count_vector_matrix_max_f_200"))
    titles_valid = count_vector_titles.transform(read_column(valid_filename, column_name))
    print joblib.dump(titles_valid, path_join(cache_dir, column_name + "_valid_count_vector_matrix_max_f_200"))

# print titles

# counter = 0
# from collections import Counter
# times = Counter()
# for line in read_column(train_filename, "Category"):
#     times[line.lower().strip()] += 1
# print times.most_common(10)
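
Every example here relies on two project helpers that are not shown, get_paths and read_column. They are not part of scikit-learn; a minimal sketch of what they might look like, assuming Settings.json is a flat JSON mapping of names to paths and the data files are CSVs with a header row:

# Hypothetical reconstruction of the missing helpers; the real project
# versions may differ (different JSON keys, CSV dialect, caching, etc.).
import csv
import json


def get_paths(settings_file="Settings.json"):
    # assumed: keys such as "data_path" and "train_data_path" map to paths
    with open(settings_file) as f:
        return json.load(f)


def read_column(filename, column_name):
    # assumed: yield one cell per row from the named CSV column
    with open(filename) as f:
        for row in csv.DictReader(f):
            yield row[column_name]
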
Example #2
from os.path import join as path_join
from sklearn.feature_extraction.text import CountVectorizer
import joblib

paths = get_paths("Settings.json")
data_dir = paths["data_path"]
cache_dir = path_join(data_dir, "tmp")
#mem = Memory(cachedir=path_join(data_dir, "tmp"))
train_filename = paths["train_data_path"]
valid_filename = paths["valid_data_path"]
#count_vectorizer = mem.cache(CountVectorizer)

#build a word-count matrix for each of the selected text columns
for column_name in [
        "Title", "FullDescription", "LocationRaw", "LocationNormalized"
]:
    # the column text is passed to fit_transform below, not to the constructor
    count_vector_titles = CountVectorizer(max_features=200)
    titles = count_vector_titles.fit_transform(
        read_column(train_filename, column_name))
    joblib.dump(
        count_vector_titles.vocabulary_,
        path_join(cache_dir, column_name + "count_vectorizer_vocabulary"))
    joblib.dump(
        count_vector_titles.stop_words_,
        path_join(cache_dir, column_name + "count_vectorizer_stop_words"))
    print joblib.dump(
        titles,
        path_join(cache_dir,
                  column_name + "_train_count_vector_matrix_max_f_200"))
    titles_valid = count_vector_titles.transform(
        read_column(valid_filename, column_name))
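
The commented-out lines in Examples #3 and #6 suggest how the dumped vocabulary is later reused on the test set; a sketch along those lines, reusing paths, cache_dir, and the imports from the example above and assuming a test_data_path entry in Settings.json:

# Sketch only: rebuild a vectorizer from the cached vocabulary and transform
# the test column, mirroring the commented-out code further down.
for column_name in ["Title", "FullDescription", "LocationRaw"]:
    vocabulary = joblib.load(
        path_join(cache_dir, column_name + "count_vectorizer_vocabulary"))
    count_vector_titles = CountVectorizer(max_features=200,
                                          vocabulary=vocabulary)
    titles_test = count_vector_titles.transform(
        read_column(paths["test_data_path"], column_name))
    joblib.dump(
        titles_test,
        path_join(cache_dir,
                  column_name + "_test_count_vector_matrix_max_f_200"))
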
Example #3
# the snippet begins inside a metric helper; a plausible header (name assumed):
def log_mae(y_true, y_pred):
    # targets are trained on np.log(salaries), so exponentiate before scoring
    return mean_absolute_error(np.exp(y_true), np.exp(y_pred))


paths = get_paths("Settings_loc5.json")
data_dir = paths["data_path"]
cache_dir = path_join(data_dir, "tmp")
prediction_dir = path_join(data_dir, "predictions")


names = ["Category", "ContractTime", "ContractType", "Loc1", "Loc2", "Loc3", "Loc4", "Loc5", "Company", "SourceName"]
le_features = map(lambda x: label_encode_column_fit_only(
    x, file_id="train_full_data_path", type_n="train_full"), names)

features = map(lambda (le, name): label_encode_column_transform(le, name, file_id="train_data_path", type_n="train"), zip(le_features, names))

description_length = map(len, read_column(paths["train_data_path"], "FullDescription"))
title_length = map(len, read_column(paths["train_data_path"], "Title"))

features.append(description_length)
features.append(title_length)


#le_features, features = zip(*features_les)

validation_features = map(lambda (le, name): label_encode_column_transform(le, name, file_id="valid_data_path", type_n="valid"), zip(le_features, names))

description_length = map(len, read_column(paths["valid_data_path"], "FullDescription"))
title_length = map(len, read_column(paths["valid_data_path"], "Title"))

validation_features.append(description_length)
validation_features.append(title_length)
#stop_words = joblib.load(path_join(cache_dir, column_name + "count_vectorizer_stop_words"))

#count_vector_titles = CountVectorizer(max_features=200, vocabulary=vocabulary, stop_words=stop_words)
#titles_valid = count_vector_titles.transform(
#    read_column(paths["test_data_path"], column_name))
#print joblib.dump(titles_valid, path_join(cache_dir, column_name + "_test_count_vector_matrix_max_f_200"))
validation_features = join_features("%s_valid_full_count_vector_matrix_max_f_200",#valid_tfidf_matrix_max_f_200
#validation_features = join_features("%s_valid_tfidf_matrix_max_f_200",
                                    ["Title", "FullDescription", "LocationRaw"],
                                    data_dir,
                                    [contractTime_valid, contractType_valid, category_valid])
print "features", features.shape
print "valid features", validation_features.shape


salaries = np.array(list(read_column(paths["train_data_path"], "SalaryNormalized"))).astype(np.float64)
#valid_salaries = np.array(list(read_column(paths["valid_data_path"], "SalaryNormalized"))).astype(np.float64)
salaries = np.log(salaries)
print salaries.shape
#valid_salaries = np.log(valid_salaries)
#print valid_salaries.shape

model1 = "ExtraTree_min_sample2_20trees_200f_noNorm_categoryTimeType_log"
model2 = "vowpall_submission"
model3 = "Random_forest_min_sample2_20trees_200f_noNorm_categoryTimeType_log"
model4 = "ExtraTree_min_sample2_40trees_200f_noNorm_categoryTimeType_log"
model5 = "Random_forest_min_sample2_40trees_200f_noNorm_categoryTimeType_log"
model6 = "vowpall_loc5"
#model_names = [model2, model4]
model_names = [model1, model6, model4]
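
label_encode_column_fit_only and label_encode_column_transform are further project helpers. A plausible sketch built on scikit-learn's LabelEncoder, assuming the first fits an encoder on one file's column and the second applies a fitted encoder to another file's column (type_n presumably only feeds into cache file naming):

# Hypothetical reconstruction of the label-encoding helpers used above.
from sklearn.preprocessing import LabelEncoder


def label_encode_column_fit_only(column_name, file_id="train_data_path",
                                 type_n="train"):
    # fit an encoder on every value seen in the given column
    le = LabelEncoder()
    le.fit(list(read_column(paths[file_id], column_name)))
    return le


def label_encode_column_transform(le, column_name, file_id="train_data_path",
                                  type_n="train"):
    # map a column from another split to the integer ids learned above;
    # values unseen at fit time would raise here, which the real helper
    # may handle differently
    return le.transform(list(read_column(paths[file_id], column_name)))
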
Example #5
contractType_valid = label_encode_column_transform(le_contractType, "ContractType")


features = join_features("%strain_count_vector_matrix_max_f_100", #train_tfidf_matrix_max_f_200
                         ["Title", "FullDescription", "LocationRaw"],
                         data_dir,
                         [contractTime_train, contractType_train, category_train])
validation_features = join_features("%svalid_count_vector_matrix_max_f_100",#valid_tfidf_matrix_max_f_200
                                    ["Title", "FullDescription", "LocationRaw"],
                                    data_dir,
                                    [contractTime_valid, contractType_valid, category_valid])
print "features", features.shape
print "valid features", validation_features.shape


salaries = np.array(list(read_column(paths["train_data_path"], "SalaryNormalized"))).astype(np.float64)
valid_salaries = np.array(list(read_column(paths["valid_data_path"], "SalaryNormalized"))).astype(np.float64)
salaries = np.log(salaries)
print salaries.shape
#classifier = RandomForestRegressor(n_estimators=10,
#                                   verbose=2,
#                                   n_jobs=1,
#                                   oob_score=True,
#                                   min_samples_split=30,
#                                   random_state=3465343)
for n_trees in range(10,11,10):
    for min_samples_split in [2, 30]:
        print n_trees
        #name = "ExtraTree_min_sample%d_%dtrees_200f_noNorm_categoryTimeType_log" % (min_samples_split, n_trees)
        name = "adaBoost_ExtraTree-2-10tr_%dtrees_100f_noNorm_categoryTimeType_log" % (n_trees)
        print name
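
join_features is also not shown; judging from its filename pattern argument, it loads the matrices dumped into the cache by the earlier examples and stacks them with the extra encoded columns. A sketch under those assumptions (sparse matrices cached under data_dir/tmp, one row per sample):

# Possible shape of join_features: load each cached per-column matrix and
# hstack it with the label-encoded / extra feature columns passed in.
import numpy as np
import joblib
from scipy import sparse


def join_features(filename_pattern, column_names, data_dir, extra_features):
    matrices = [
        joblib.load(path_join(data_dir, "tmp", filename_pattern % name))
        for name in column_names
    ]
    # each extra feature is a flat per-row array; reshape it into a column
    extra = [np.asarray(f).reshape(-1, 1) for f in extra_features]
    return sparse.hstack(matrices + extra).tocsr()
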
Example #6
def getstream(self):
    logger.info("getting stream")
    reader = read_column(self.input, self.column)
    return reader
#count_vector_titles = CountVectorizer(max_features=200, vocabulary=vocabulary, stop_words=stop_words)
#titles_valid = count_vector_titles.transform(
#read_column(paths["test_data_path"], column_name))
#print joblib.dump(titles_valid, path_join(cache_dir, column_name + "_test_count_vector_matrix_max_f_200"))
validation_features = join_features(
    "%s_valid_full_count_vector_matrix_max_f_200",  #valid_tfidf_matrix_max_f_200
    #validation_features = join_features("%s_valid_tfidf_matrix_max_f_200",
    ["Title", "FullDescription", "LocationRaw"],
    data_dir,
    [contractTime_valid, contractType_valid, category_valid])
print "features", features.shape
print "valid features", validation_features.shape

salaries = np.array(
    list(read_column(paths["train_data_path"],
                     "SalaryNormalized"))).astype(np.float64)
#valid_salaries = np.array(list(read_column(paths["valid_data_path"], "SalaryNormalized"))).astype(np.float64)
salaries = np.log(salaries)
print salaries.shape
#valid_salaries = np.log(valid_salaries)
#print valid_salaries.shape

model1 = "ExtraTree_min_sample2_20trees_200f_noNorm_categoryTimeType_log"
model2 = "vowpall_submission"
model3 = "Random_forest_min_sample2_20trees_200f_noNorm_categoryTimeType_log"
model4 = "ExtraTree_min_sample2_40trees_200f_noNorm_categoryTimeType_log"
model5 = "Random_forest_min_sample2_40trees_200f_noNorm_categoryTimeType_log"
model6 = "vowpall_loc5"
#model_names = [model2, model4]
model_names = [model1, model6, model4]
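
How the model name strings are actually used is not shown in these snippets. One plausible use, assuming each model's validation predictions were dumped (in log space, matching the np.log(salaries) target) into the prediction_dir defined in Example #3 under its model name:

# Speculative sketch: score each named model's cached validation predictions
# with MAE in the original salary units (predictions assumed to be log-space).
from sklearn.metrics import mean_absolute_error

valid_salaries = np.array(
    list(read_column(paths["valid_data_path"],
                     "SalaryNormalized"))).astype(np.float64)
for name in model_names:
    predictions = joblib.load(path_join(prediction_dir, name))  # assumed layout
    print name, mean_absolute_error(valid_salaries, np.exp(predictions))
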
Example #8
names = [
    "Category", "ContractTime", "ContractType", "Loc1", "Loc2", "Loc3", "Loc4",
    "Loc5", "Company", "SourceName"
]
le_features = map(
    lambda x: label_encode_column_fit_only(
        x, file_id="train_full_data_path", type_n="train_full"), names)

features = map(
    lambda (le, name): label_encode_column_transform(
        le, name, file_id="train_data_path", type_n="train"),
    zip(le_features, names))

description_length = map(
    len, read_column(paths["train_data_path"], "FullDescription"))
title_length = map(len, read_column(paths["train_data_path"], "Title"))

features.append(description_length)
features.append(title_length)

#le_features, features = zip(*features_les)

validation_features = map(
    lambda (le, name): label_encode_column_transform(
        le, name, file_id="valid_data_path", type_n="valid"),
    zip(le_features, names))

description_length = map(
    len, read_column(paths["valid_data_path"], "FullDescription"))
title_length = map(len, read_column(paths["valid_data_path"], "Title"))
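
Note that the lambda (le, name): ... parameter unpacking used in these map calls is Python 2 only (removed in Python 3 by PEP 3113), and map itself returns a lazy iterator in Python 3; under Python 3 the same step would read roughly:

# Python 3 equivalent of the map/lambda calls above; a list comprehension
# replaces both the tuple-unpacking lambda and the eager map().
features = [
    label_encode_column_transform(le, name,
                                  file_id="train_data_path", type_n="train")
    for le, name in zip(le_features, names)
]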