import numpy as np
from sklearn.metrics import mean_absolute_error
# get_paths, path_join, read_column, label_encode_column_fit_only and
# label_encode_column_transform are assumed to come from this project's own
# data/IO helper module.


def log_mean_absolute_error(y_true, y_pred):
    # Targets are modelled in log space, so exponentiate before scoring MAE.
    return mean_absolute_error(np.exp(y_true), np.exp(y_pred))


paths = get_paths("Settings_loc5.json")
data_dir = paths["data_path"]
cache_dir = path_join(data_dir, "tmp")
prediction_dir = path_join(data_dir, "predictions")

names = ["Category", "ContractTime", "ContractType",
         "Loc1", "Loc2", "Loc3", "Loc4", "Loc5",
         "Company", "SourceName"]

# Fit one label encoder per categorical column on the full training file,
# then transform the train and validation splits with the same encoders.
le_features = [label_encode_column_fit_only(name,
                                            file_id="train_full_data_path",
                                            type_n="train_full")
               for name in names]
features = [label_encode_column_transform(le, name,
                                          file_id="train_data_path",
                                          type_n="train")
            for le, name in zip(le_features, names)]

# Simple text-length features for the training split.
description_length = [len(text) for text in read_column(paths["train_data_path"], "FullDescription")]
title_length = [len(text) for text in read_column(paths["train_data_path"], "Title")]
features.append(description_length)
features.append(title_length)
#le_features, features = zip(*features_les)

validation_features = [label_encode_column_transform(le, name,
                                                     file_id="valid_data_path",
                                                     type_n="valid")
                       for le, name in zip(le_features, names)]
description_length = [len(text) for text in read_column(paths["valid_data_path"], "FullDescription")]
title_length = [len(text) for text in read_column(paths["valid_data_path"], "Title")]
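# The label_encode_column_* helpers are not defined in this file. Below is a
# minimal sketch of what they are assumed to do, built on sklearn's
# LabelEncoder and the read_column/paths objects above. Names and signatures
# are illustrative only, not the project's actual implementation; type_n is
# presumably used as a cache key by the real helpers and is unused here.

from sklearn.preprocessing import LabelEncoder


def label_encode_column_fit_only_sketch(column_name,
                                        file_id="train_full_data_path",
                                        type_n="train_full"):
    # Fit an encoder on every value the column takes in the full training file.
    le = LabelEncoder()
    le.fit(read_column(paths[file_id], column_name))
    return le


def label_encode_column_transform_sketch(le, column_name,
                                         file_id="train_data_path",
                                         type_n="train"):
    # Map the requested split's column to the integer codes learned above.
    return le.transform(read_column(paths[file_id], column_name))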
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_absolute_error
import joblib
import numpy as np
# get_paths, path_join, label_encode_column_fit, label_encode_column_transform
# and join_features are assumed to come from this project's own helper module.


def log_mean_absolute_error(y_true, y_pred):
    # Targets are modelled in log space, so exponentiate before scoring MAE.
    return mean_absolute_error(np.exp(y_true), np.exp(y_pred))


paths = get_paths("Settings.json")
data_dir = paths["data_path"]
cache_dir = path_join(data_dir, "tmp")
prediction_dir = path_join(data_dir, "predictions")

# Label-encode the categorical columns: each fit returns the encoder plus the
# encoded training column, and the same encoder is reused for validation.
le_category, category_train = label_encode_column_fit("Category")
category_valid = label_encode_column_transform(le_category, "Category")

le_contractTime, contractTime_train = label_encode_column_fit("ContractTime")
contractTime_valid = label_encode_column_transform(le_contractTime, "ContractTime")

le_contractType, contractType_train = label_encode_column_fit("ContractType")
contractType_valid = label_encode_column_transform(le_contractType, "ContractType")

# Stack the cached count-vector matrices for the text columns with the
# label-encoded categorical features, for both splits.
features = join_features("%strain_count_vector_matrix_max_f_100",  # train_tfidf_matrix_max_f_200
                         ["Title", "FullDescription", "LocationRaw"],
                         data_dir,
                         [contractTime_train, contractType_train, category_train])
validation_features = join_features("%svalid_count_vector_matrix_max_f_100",  # valid_tfidf_matrix_max_f_200
                                    ["Title", "FullDescription", "LocationRaw"],
                                    data_dir,
                                    [contractTime_valid, contractType_valid, category_valid])
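# The rest of this script is not shown. A minimal sketch of how these feature
# matrices would typically be used with the imports above: fit an
# AdaBoostRegressor on log-transformed salaries and report the log-space MAE
# on the validation split. The target column name ("SalaryNormalized"), the
# float conversion, and the use of the read_column project helper here are
# assumptions, not taken from this file.

salaries = np.array([float(s) for s in read_column(paths["train_data_path"], "SalaryNormalized")])
valid_salaries = np.array([float(s) for s in read_column(paths["valid_data_path"], "SalaryNormalized")])

model = AdaBoostRegressor(n_estimators=50)
model.fit(features, np.log(salaries))            # train in log space
predictions = model.predict(validation_features)  # predictions are log salaries
print(log_mean_absolute_error(np.log(valid_salaries), predictions))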
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.feature_extraction.text import CountVectorizer
import joblib
import numpy as np
# get_paths, path_join, label_encode_column_fit, label_encode_column_transform
# and join_features are assumed to come from this project's own helper module.


def log_mean_absolute_error(y_true, y_pred):
    # Targets are modelled in log space, so exponentiate before scoring MAE.
    return mean_absolute_error(np.exp(y_true), np.exp(y_pred))


paths = get_paths("Settings_submission.json")
data_dir = paths["data_path"]
cache_dir = path_join(data_dir, "tmp")
prediction_dir = path_join(data_dir, "predictions")

le_category, category_train = label_encode_column_fit("Category")
category_valid = label_encode_column_transform(le_category, "Category")

le_contractTime, contractTime_train = label_encode_column_fit("ContractTime")
contractTime_valid = label_encode_column_transform(le_contractTime, "ContractTime")

le_contractType, contractType_train = label_encode_column_fit("ContractType")
contractType_valid = label_encode_column_transform(le_contractType, "ContractType")

# Stack the cached full-training count-vector matrices for the text columns
# with the label-encoded categorical features.
features = join_features("%s_train_full_count_vector_matrix_max_f_200",  # train_tfidf_matrix_max_f_200
                         # features = join_features("%s_train_tfidf_matrix_max_f_200",
                         ["Title", "FullDescription", "LocationRaw"],
                         data_dir,
                         [contractTime_train, contractType_train, category_train])
#for column_name in ["Title", "FullDescription", "LocationRaw"]:
#    vocabulary = joblib.load(path_join(cache_dir, column_name + "count_vectorizer_vocabulary"))
#    stop_words = joblib.load(path_join(cache_dir, column_name + "count_vectorizer_stop_words"))
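# The commented-out loop above hints at how the cached CountVectorizer state
# would be reused on new text: rebuild a vectorizer from the stored vocabulary
# and transform the corresponding column. A minimal sketch under that
# assumption; paths["test_data_path"] and the use of the read_column project
# helper are illustrative, not taken from this file.

test_text_features = []
for column_name in ["Title", "FullDescription", "LocationRaw"]:
    vocabulary = joblib.load(path_join(cache_dir, column_name + "count_vectorizer_vocabulary"))
    vectorizer = CountVectorizer(vocabulary=vocabulary)  # reuse the fitted vocabulary, no refit needed
    test_text_features.append(vectorizer.transform(read_column(paths["test_data_path"], column_name)))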