示例#1
0
import os

tfidf_columns = ["Title", "FullDescription", "LocationRaw"]
dio = DataIO("Settings.json")

vectorizer = TfidfVectorizer(max_features=200,
                             norm='l1',
                             smooth_idf=True,
                             sublinear_tf=False,
                             use_idf=True)
short_id = "tfidf_200f_l1"
type_n = "train"
type_v = "valid"
dio.make_counts(vectorizer, short_id, tfidf_columns, "train", "valid")
columns = ["Category", "ContractTime", "ContractType"]
le_features = dio.get_le_features(columns, "train_full")
extra_features = dio.get_features(columns, type_n, le_features)
extra_valid_features = dio.get_features(columns, type_v, le_features)
features = dio.join_features("%s_" + type_n + "_" + short_id + "_matrix",
                             tfidf_columns, extra_features)
validation_features = dio.join_features(
    "%s_" + type_v + "_" + short_id + "_matrix", tfidf_columns,
    extra_valid_features)

print features.shape
print validation_features.shape
run = raw_input("OK (Y/N)?")
print run
if run != "Y":
    os.exit()
    type_v = "valid_full"
else:
    type_n = "train"
    type_v = "valid"


vectorizer = CountVectorizer(
    max_features=200,
)
short_id = "count_200f"
tfidf_columns = ["Title", "FullDescription", "LocationRaw"]
#dio.make_counts(vectorizer, short_id, tfidf_columns, type_n, type_v)


columns = ["Category", "ContractTime", "ContractType"]
le_features = dio.get_le_features(columns, "train_full")
extra_features = dio.get_features(columns, type_n, le_features)
extra_valid_features = dio.get_features(columns, type_v, le_features)

#features = dio.join_features("%s_" + type_n + "_count_vector_matrix_max_f_200",
                             #["Title", "FullDescription", "LocationRaw"],
                             #extra_features)
#validation_features = dio.join_features("%s_" + type_v + "_count_vector_matrix_max_f_200",
                                        #["Title", "FullDescription", "LocationRaw"],
                                        #extra_valid_features).astype(np.int64)
features = dio.join_features("%s_" + type_n + "_" + short_id + "_matrix",
                             tfidf_columns,
                             extra_features)
validation_features = dio.join_features("%s_" + type_v + "_" + short_id + "_matrix",
                                        tfidf_columns,
                                        extra_valid_features)
else:
    type_n = "train"
    type_v = "valid"


vectorizer = CountVectorizer(
    max_features=200,
)
#short_id = "tfidf_200f_l1"
short_id = "count_200f"
tfidf_columns = ["Title", "FullDescription", "LocationRaw"]
dio.make_counts(vectorizer, short_id, tfidf_columns, type_n, type_v)


columns = ["Category", "ContractTime", "ContractType"]
le_features = dio.get_le_features(columns, "train_and_test")
extra_features = dio.get_features(columns, type_n, le_features)
extra_valid_features = dio.get_features(columns, type_v, le_features)
#features = dio.join_features("%s_" + type_n + "_count_vector_matrix_max_f_200",
                             #["Title", "FullDescription", "LocationRaw"],
                             #extra_features)
#validation_features = dio.join_features("%s_" + type_v + "_count_vector_matrix_max_f_200",
                                        #["Title", "FullDescription", "LocationRaw"],
                                        #extra_valid_features)
features = dio.join_features("%s_" + type_n + "_" + short_id + "_matrix",
                             tfidf_columns,
                             extra_features)
validation_features = dio.join_features("%s_" + type_v + "_" + short_id + "_matrix",
                                        tfidf_columns,
                                        extra_valid_features)
print "features", features.shape
    type_n = "train"
    type_v = "valid"

vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    max_df=0.5,
    stop_words='english'
)
#short_id = "tfidf_200f_l1"
short_id = "tfidf_05df_stopwords"
tfidf_columns = ["Title", "FullDescription", "LocationRaw"]
dio.make_counts(vectorizer, short_id, tfidf_columns, type_n, type_v)


columns = ["Category", "ContractTime", "ContractType", "Company", "SourceName"]
le_features = dio.get_le_features(columns, "train_and_valid")
extra_features = dio.get_features(columns, type_n, le_features)
extra_valid_features = dio.get_features(columns, type_v, le_features)

features = dio.join_features("%s_" + type_n + "_" + short_id + "_matrix",
                             tfidf_columns,
                             extra_features, sparse=True)
validation_features = dio.join_features("%s_" + type_v + "_" + short_id + "_matrix",
                                        tfidf_columns,
                                        extra_valid_features, sparse=True)

print features.shape
print validation_features.shape

salaries = dio.get_salaries(type_n, log=True)
if not submission:
    type_n = "train_full"
    type_v = "valid_full"
else:
    type_n = "train"
    type_v = "valid"

vectorizer = TfidfVectorizer(sublinear_tf=True,
                             max_df=0.5,
                             stop_words='english')
#short_id = "tfidf_200f_l1"
short_id = "tfidf_05df_stopwords"
tfidf_columns = ["Title", "FullDescription", "LocationRaw"]
dio.make_counts(vectorizer, short_id, tfidf_columns, type_n, type_v)

columns = ["Category", "ContractTime", "ContractType", "Company", "SourceName"]
le_features = dio.get_le_features(columns, "train_and_valid")
extra_features = dio.get_features(columns, type_n, le_features)
extra_valid_features = dio.get_features(columns, type_v, le_features)

features = dio.join_features("%s_" + type_n + "_" + short_id + "_matrix",
                             tfidf_columns,
                             extra_features,
                             sparse=True)
validation_features = dio.join_features("%s_" + type_v + "_" + short_id +
                                        "_matrix",
                                        tfidf_columns,
                                        extra_valid_features,
                                        sparse=True)

print features.shape
print validation_features.shape