from data_io import DataIO
from sklearn.externals import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
import sys

dio = DataIO("Settings.json")

# TF-IDF features (200 per column, L1-normalised) over the free-text columns
tfidf_columns = ["Title", "FullDescription", "LocationRaw"]
vectorizer = TfidfVectorizer(max_features=200,
                             norm='l1',
                             smooth_idf=True,
                             sublinear_tf=False,
                             use_idf=True)
short_id = "tfidf_200f_l1"
type_n = "train"
type_v = "valid"
dio.make_counts(vectorizer, short_id, tfidf_columns, "train", "valid")

# Label-encoded categorical features, fitted on the full training set
columns = ["Category", "ContractTime", "ContractType"]
le_features = dio.get_le_features(columns, "train_full")
extra_features = dio.get_features(columns, type_n, le_features)
extra_valid_features = dio.get_features(columns, type_v, le_features)

# Join the per-column TF-IDF matrices with the categorical features
features = dio.join_features("%s_" + type_n + "_" + short_id + "_matrix",
                             tfidf_columns, extra_features)
validation_features = dio.join_features("%s_" + type_v + "_" + short_id + "_matrix",
                                        tfidf_columns, extra_valid_features)
print features.shape
print validation_features.shape

# Ask for confirmation before writing the matrices to disk
run = raw_input("OK (Y/N)?")
print run
if run != "Y":
    sys.exit()
files = joblib.dump(features,
from data_io import DataIO
from sklearn.decomposition import RandomizedPCA
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.base import clone
from sklearn.cross_validation import cross_val_score
import numpy as np

dio = DataIO("Settings.json")

# Gensim-built term matrix for the NLTK-filtered Title column
title_corpus = dio.read_gensim_corpus("train_title_nltk_filtered.corpus.mtx")
pca = RandomizedPCA(random_state=3465343)
salaries = dio.get_salaries("train", log=True)

# Label-encoded categorical features, fitted on the full training set
columns = ["Category", "ContractTime", "ContractType"]
le_features = dio.get_le_features(columns, "train_full")
extra_features = dio.get_features(columns, "train", le_features)
#extra_valid_features = dio.get_features(columns, "valid", le_features)
param = "RandomizedPCA title 200 FullDescription 200 " + ",".join(columns)

print map(len, extra_features)
# Reshape each categorical feature list into an (n_samples, 1) column vector
extra_features = map(lambda x: np.reshape(np.array(x), (len(x), 1)), extra_features)

print type(title_corpus)
print title_corpus.shape

# Reduce the Title corpus to 200 components with a fresh copy of the PCA estimator
title_pca = clone(pca)
title_pca.set_params(n_components=200)
title_corpus_pca = title_pca.fit_transform(title_corpus)
print type(title_corpus_pca)
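# A minimal sketch (not part of the original excerpt) of how the pieces prepared
# above are presumably combined: the imports of ExtraTreesRegressor and
# cross_val_score suggest the PCA-reduced Title features are stacked with the
# reshaped categorical columns and cross-validated against the log salaries.
# The hstack layout and the estimator settings below are illustrative
# assumptions, not the author's confirmed configuration.
combined = np.hstack([title_corpus_pca] + extra_features)
model = ExtraTreesRegressor(n_estimators=10, random_state=3465343)
scores = cross_val_score(model, combined, salaries, cv=3)
print param
print scores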
else:
    type_n = "train"
    type_v = "valid"

# Plain term counts (200 features per column) instead of TF-IDF
vectorizer = CountVectorizer(max_features=200)
short_id = "count_200f"
tfidf_columns = ["Title", "FullDescription", "LocationRaw"]
#dio.make_counts(vectorizer, short_id, tfidf_columns, type_n, type_v)

# Label-encoded categorical features, fitted on the full training set
columns = ["Category", "ContractTime", "ContractType"]
le_features = dio.get_le_features(columns, "train_full")
extra_features = dio.get_features(columns, type_n, le_features)
extra_valid_features = dio.get_features(columns, type_v, le_features)

#features = dio.join_features("%s_" + type_n + "_count_vector_matrix_max_f_200",
#                             ["Title", "FullDescription", "LocationRaw"],
#                             extra_features)
#validation_features = dio.join_features("%s_" + type_v + "_count_vector_matrix_max_f_200",
#                                        ["Title", "FullDescription", "LocationRaw"],
#                                        extra_valid_features).astype(np.int64)

# Join the per-column count matrices with the categorical features
features = dio.join_features("%s_" + type_n + "_" + short_id + "_matrix",
                             tfidf_columns, extra_features)
validation_features = dio.join_features("%s_" + type_v + "_" + short_id + "_matrix",
                                        tfidf_columns, extra_valid_features)
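# A minimal sketch (not part of the original fragment) of how the joined
# train/validation matrices could be used for holdout evaluation; the
# estimator, its parameters, and the get_salaries calls for both splits are
# assumptions for illustration, not the author's confirmed setup.
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_absolute_error

train_salaries = dio.get_salaries(type_n, log=True)
valid_salaries = dio.get_salaries(type_v, log=True)

model = ExtraTreesRegressor(n_estimators=10, random_state=3465343)
model.fit(features, train_salaries)
predictions = model.predict(validation_features)
print mean_absolute_error(valid_salaries, predictions)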