#le_features = dio.get_le_features(columns, "train_full")
#extra_features = dio.get_features(columns, type_n, le_features)
#extra_valid_features = dio.get_features(columns, type_v, le_features)

#features = dio.join_features("%s_" + type_n + "_count_vector_matrix_max_f_200",
                             #["Title", "FullDescription", "LocationRaw"],
                             #extra_features)
#validation_features = dio.join_features("%s_" + type_v + "_count_vector_matrix_max_f_200",
                                        #["Title", "FullDescription", "LocationRaw"],
                                        #extra_valid_features).astype(np.int64)
# Tag embedded in every matrix name requested from dio below.
short_id = "tfidf_200f_l2"
dio.make_counts(vectorizer, short_id, tfidf_columns, type_n, type_v)

# This run joins no extra features alongside the text matrices.
extra_features, extra_valid_features = [], []

# The leading "%s" is passed through unexpanded; presumably join_features
# fills it in once per entry of tfidf_columns -- confirm against DataIO.
matrix_suffix = "_" + short_id + "_matrix"
features = dio.join_features(
    "%s_" + type_n + matrix_suffix, tfidf_columns, extra_features)
validation_features = dio.join_features(
    "%s_" + type_v + matrix_suffix, tfidf_columns, extra_valid_features)

# Quick look at the value range of the joined training features.
# (A single parenthesised expression prints identically under the
# Python 2 print statement.)
print(features.max())
print(features.min())

# Training targets are loaded un-logged and cast to 64-bit integers.
raw_salaries = dio.get_salaries(type_n, log=False)
salaries = raw_salaries.astype(np.int64)

# Validation targets are only loaded when this is not a submission run;
# note they are left without the integer cast applied above.
if not submission:
    valid_salaries = dio.get_salaries(type_v, log=False)

def encode_salaries(salaries, bins):
    """Print the spacing of ``bins`` evenly spaced salary bin edges.

    The visible body builds ``bins + 1`` evenly spaced edges running from
    11500.0 to 100000 (both endpoints included) and prints the differences
    between consecutive edges.

    NOTE(review): in the visible body ``salaries`` is never read and nothing
    is returned. The definition looks cut off after the print -- confirm
    against the complete file before relying on this description.

    Args:
        salaries: Unused in the visible body.
        bins: Number of bins; ``bins + 1`` edges are generated.
    """
    # Fixed-range, equal-width edges (used instead of the data-driven
    # histogram edges left commented out below).
    bin_edges = np.linspace(11500.0, 100000, bins + 1, endpoint=True)
    #hist, bin_edges = np.histogram(salaries, bins)
    # Debug output: spacing between consecutive edges.
    print np.diff(bin_edges)
# Example no. 2
# 0
# Project data-access helper, configured from Settings.json.
dio = DataIO("Settings.json")

# Split names and the tag embedded in the matrix names requested below.
type_n = "train"
type_v = "valid"
short_id = "tfidf_200f_l1"

# TF-IDF over at most 200 terms, l1-normalised, smoothed IDF, raw
# (non-sublinear) term frequency.
vectorizer = TfidfVectorizer(
    max_features=200,
    norm='l1',
    smooth_idf=True,
    sublinear_tf=False,
    use_idf=True,
)
dio.make_counts(vectorizer, short_id, tfidf_columns, type_n, type_v)

# Extra columns joined next to the text matrices. le_features is obtained
# from "train_full" and reused for both splits.
columns = ["Category", "ContractTime", "ContractType"]
le_features = dio.get_le_features(columns, "train_full")
extra_features = dio.get_features(columns, type_n, le_features)
extra_valid_features = dio.get_features(columns, type_v, le_features)

# The leading "%s" is passed through unexpanded; presumably join_features
# fills it in once per entry of tfidf_columns -- confirm against DataIO.
matrix_suffix = "_" + short_id + "_matrix"
features = dio.join_features(
    "%s_" + type_n + matrix_suffix, tfidf_columns, extra_features)
validation_features = dio.join_features(
    "%s_" + type_v + matrix_suffix, tfidf_columns, extra_valid_features)

# Show both matrix shapes so the operator can sanity-check them before the
# script continues past this point.
print(features.shape)
print(validation_features.shape)

# Only an exact upper-case "Y" continues; any other answer stops the script.
run = raw_input("OK (Y/N)?")
print(run)
if run != "Y":
    # The standard os module has no exit() function, so the previous
    # os.exit() call failed with AttributeError instead of stopping cleanly.
    # Raising SystemExit is what sys.exit() does and needs no extra import.
    raise SystemExit

# Persist the training feature matrix with joblib at compression level 9.
# The value returned by joblib.dump is kept in `files`.
train_features_path = "train_200f_noNorm_categoryTimeType_tfidfl1_features_jl"
files = joblib.dump(features, train_features_path, compress=9)
files1 = joblib.dump(