from os.path import join as path_join

import joblib
from sklearn.feature_extraction.text import CountVectorizer

paths = get_paths("Settings.json")
data_dir = paths["data_path"]
cache_dir = path_join(data_dir, "tmp")
#mem = Memory(cachedir=path_join(data_dir, "tmp"))
train_filename = paths["train_data_path"]
valid_filename = paths["valid_data_path"]
#count_vectorizer = mem.cache(CountVectorizer)

# Build a bag-of-words matrix (top 200 terms) for each text column on the
# training split, cache the fitted vocabulary and stop words, and reuse the
# fitted vectorizer to transform the validation split.
for column_name in ["Title", "FullDescription", "LocationRaw",
                    "LocationNormalized"]:
    # max_features=200 keeps the 200 most frequent terms. (The original passed
    # the column iterator as CountVectorizer's first argument, which is the
    # `input` mode flag, not data; the data belongs in fit_transform.)
    count_vector_titles = CountVectorizer(max_features=200)
    titles = count_vector_titles.fit_transform(
        read_column(train_filename, column_name))
    joblib.dump(count_vector_titles.vocabulary_,
                path_join(cache_dir, column_name + "count_vectorizer_vocabulary"))
    joblib.dump(count_vector_titles.stop_words_,
                path_join(cache_dir, column_name + "count_vectorizer_stop_words"))
    print joblib.dump(titles, path_join(
        cache_dir, column_name + "_train_count_vector_matrix_max_f_200"))
    titles_valid = count_vector_titles.transform(
        read_column(valid_filename, column_name))
    print joblib.dump(titles_valid, path_join(
        cache_dir, column_name + "_valid_count_vector_matrix_max_f_200"))

#print titles
#counter = 0
#from collections import Counter
#times = Counter()
#for line in read_column(train_filename, "Category"):
#    times[line.lower().strip()] += 1
#print times.most_common(10)
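# get_paths and read_column are project helpers that this snippet assumes; a
# minimal sketch of what they appear to do, judging from the call sites (the
# implementations here are illustrative, not the project's actual code):
import csv
import json

def get_paths(settings_file):
    # Settings.json is assumed to map keys like "data_path" and
    # "train_data_path" to locations on disk.
    with open(settings_file) as f:
        return json.load(f)

def read_column(filename, column_name):
    # Lazily yield one named column of a headered CSV file.
    with open(filename, "rb") as f:
        reader = csv.reader(f)
        header = reader.next()
        index = header.index(column_name)
        for row in reader:
            yield row[index]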
from os.path import join as path_join

import numpy as np
from sklearn.metrics import mean_absolute_error


def log_mean_absolute_error(y_true, y_pred):
    # Targets are modelled in log space; undo the log so the metric is MAE in
    # actual salary units. (The def line is reconstructed; only the return
    # statement survived in the original.)
    return mean_absolute_error(np.exp(y_true), np.exp(y_pred))


paths = get_paths("Settings_loc5.json")
data_dir = paths["data_path"]
cache_dir = path_join(data_dir, "tmp")
prediction_dir = path_join(data_dir, "predictions")

names = ["Category", "ContractTime", "ContractType", "Loc1", "Loc2", "Loc3",
         "Loc4", "Loc5", "Company", "SourceName"]

# Fit one label encoder per categorical column on the full training file,
# then transform the train and validation splits with the same encoders.
le_features = map(lambda x: label_encode_column_fit_only(
    x, file_id="train_full_data_path", type_n="train_full"), names)
features = map(lambda (le, name): label_encode_column_transform(
    le, name, file_id="train_data_path", type_n="train"),
    zip(le_features, names))

# Two simple numeric features: character length of description and title.
description_length = map(len, read_column(paths["train_data_path"],
                                          "FullDescription"))
title_length = map(len, read_column(paths["train_data_path"], "Title"))
features.append(description_length)
features.append(title_length)
#le_features, features = zip(*features_les)

validation_features = map(lambda (le, name): label_encode_column_transform(
    le, name, file_id="valid_data_path", type_n="valid"),
    zip(le_features, names))
description_length = map(len, read_column(paths["valid_data_path"],
                                          "FullDescription"))
title_length = map(len, read_column(paths["valid_data_path"], "Title"))
validation_features.append(description_length)
validation_features.append(title_length)
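# Sketch of the label-encoding helpers used above, which are assumed to wrap
# sklearn's LabelEncoder (names and signatures are taken from the call sites;
# type_n is assumed to be only a cache/log tag):
from sklearn.preprocessing import LabelEncoder

def label_encode_column_fit_only(column_name, file_id="train_data_path",
                                 type_n="train"):
    # Fit an encoder on every value of one column.
    le = LabelEncoder()
    le.fit(list(read_column(paths[file_id], column_name)))
    return le

def label_encode_column_transform(le, column_name,
                                  file_id="train_data_path", type_n="train"):
    # Map the column to integer codes; values unseen during fit would raise,
    # which is why fitting happens on the full data file above.
    return le.transform(list(read_column(paths[file_id], column_name)))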
# Kept for reference: rebuild the vectorizer from the cached vocabulary and
# stop words, then transform and dump the test column.
#stop_words = joblib.load(path_join(cache_dir, column_name + "count_vectorizer_stop_words"))
#count_vector_titles = CountVectorizer(max_features=200, vocabulary=vocabulary,
#                                      stop_words=stop_words)
#titles_valid = count_vector_titles.transform(
#    read_column(paths["test_data_path"], column_name))
#print joblib.dump(titles_valid, path_join(
#    cache_dir, column_name + "_test_count_vector_matrix_max_f_200"))

# Stack the cached bag-of-words matrices with the encoded categorical columns.
#validation_features = join_features("%s_valid_tfidf_matrix_max_f_200",
validation_features = join_features(
    "%s_valid_full_count_vector_matrix_max_f_200",
    ["Title", "FullDescription", "LocationRaw"],
    data_dir,
    [contractTime_valid, contractType_valid, category_valid])
print "features", features.shape
print "valid features", validation_features.shape

salaries = np.array(list(read_column(paths["train_data_path"],
                                     "SalaryNormalized"))).astype(np.float64)
#valid_salaries = np.array(list(read_column(paths["valid_data_path"], "SalaryNormalized"))).astype(np.float64)
salaries = np.log(salaries)  # train in log space
print salaries.shape
#valid_salaries = np.log(valid_salaries)
#print valid_salaries.shape

# Names under which earlier models' predictions were saved; the list picks
# which of them to blend.
model1 = "ExtraTree_min_sample2_20trees_200f_noNorm_categoryTimeType_log"
model2 = "vowpall_submission"
model3 = "Random_forest_min_sample2_20trees_200f_noNorm_categoryTimeType_log"
model4 = "ExtraTree_min_sample2_40trees_200f_noNorm_categoryTimeType_log"
model5 = "Random_forest_min_sample2_40trees_200f_noNorm_categoryTimeType_log"
model6 = "vowpall_loc5"
#model_names = [model2, model4]
model_names = [model1, model6, model4]
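# Sketch of the blending step this file appears to build toward (the
# prediction file name pattern and the plain average are assumptions; the
# saved predictions are assumed to be in the same log space as the targets):
valid_salaries = np.log(np.array(list(read_column(
    paths["valid_data_path"], "SalaryNormalized"))).astype(np.float64))
predictions = [joblib.load(path_join(prediction_dir, name + "_prediction"))
               for name in model_names]
blend = np.mean(np.array(predictions), axis=0)
print "blend MAE:", log_mean_absolute_error(valid_salaries, blend)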
contractType_valid = label_encode_column_transform(le_contractType,
                                                   "ContractType")

# Stack the cached 100-term bag-of-words matrices with the encoded
# categorical columns for the train and validation splits.
features = join_features(
    "%strain_count_vector_matrix_max_f_100",  # or "%strain_tfidf_matrix_max_f_200"
    ["Title", "FullDescription", "LocationRaw"],
    data_dir,
    [contractTime_train, contractType_train, category_train])
validation_features = join_features(
    "%svalid_count_vector_matrix_max_f_100",  # or "%svalid_tfidf_matrix_max_f_200"
    ["Title", "FullDescription", "LocationRaw"],
    data_dir,
    [contractTime_valid, contractType_valid, category_valid])
print "features", features.shape
print "valid features", validation_features.shape

salaries = np.array(list(read_column(paths["train_data_path"],
                                     "SalaryNormalized"))).astype(np.float64)
valid_salaries = np.array(list(read_column(paths["valid_data_path"],
                                           "SalaryNormalized"))).astype(np.float64)
salaries = np.log(salaries)  # train in log space; valid_salaries stay raw
print salaries.shape

#classifier = RandomForestRegressor(n_estimators=10,
#                                   verbose=2,
#                                   n_jobs=1,
#                                   oob_score=True,
#                                   min_samples_split=30,
#                                   random_state=3465343)

# Grid over boosting rounds and split sizes; range(10, 11, 10) currently
# yields the single value n_trees=10.
for n_trees in range(10, 11, 10):
    for min_samples_split in [2, 30]:
        print n_trees
        #name = "ExtraTree_min_sample%d_%dtrees_200f_noNorm_categoryTimeType_log" % (min_samples_split, n_trees)
        name = "adaBoost_ExtraTree-2-10tr_%dtrees_100f_noNorm_categoryTimeType_log" % n_trees
        print name
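        # Sketch of the loop body the name points to: an AdaBoost ensemble
        # over small ExtraTrees regressors. The estimator settings are read
        # off the name "ExtraTree-2-10tr" and are assumptions, and the
        # imports would normally sit at the top of the file.
        from sklearn.ensemble import AdaBoostRegressor, ExtraTreesRegressor
        from sklearn.metrics import mean_absolute_error
        classifier = AdaBoostRegressor(
            base_estimator=ExtraTreesRegressor(
                n_estimators=10, min_samples_split=min_samples_split),
            n_estimators=n_trees,
            random_state=3465343)
        classifier.fit(features, salaries)
        predictions = classifier.predict(validation_features)
        # Predictions come back in log space; compare in salary units.
        print name, mean_absolute_error(valid_salaries, np.exp(predictions))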
def getstream(self):
    logger.info("getting stream")
    reader = read_column(self.input, self.column)
    return reader
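# getstream reads like a method of a streaming-corpus class in which
# self.input is a CSV path and self.column the column to read. A minimal
# hypothetical context and usage (the class name and the tokenization are
# illustrative only):
import logging
logger = logging.getLogger(__name__)

class ColumnCorpus(object):
    def __init__(self, input, column):
        self.input = input    # path to the CSV file
        self.column = column  # name of the column to stream

    def getstream(self):
        logger.info("getting stream")
        return read_column(self.input, self.column)

    def __iter__(self):
        # Tokenize lazily, one document at a time.
        for doc in self.getstream():
            yield doc.lower().split()

#for tokens in ColumnCorpus(paths["train_data_path"], "Title"):
#    print tokens[:5]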