from data_io import DataIO
from sklearn.decomposition import RandomizedPCA
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.base import clone
from sklearn.cross_validation import cross_val_score
import numpy as np

dio = DataIO("Settings.json")

title_corpus = dio.read_gensim_corpus("train_title_nltk_filtered.corpus.mtx")
pca = RandomizedPCA(random_state=3465343)
salaries = dio.get_salaries("train", log=True)

columns = ["Category", "ContractTime", "ContractType"]
le_features = dio.get_le_features(columns, "train_full")
extra_features = dio.get_features(columns, "train", le_features)
#extra_valid_features = dio.get_features(columns, "valid", le_features)

param = "RandomizedPCA title 200 Fulldescription 200 " + ",".join(columns)
print map(len, extra_features)
extra_features = map(lambda x: np.reshape(np.array(x),(len(x),1)),extra_features)



print type(title_corpus)
print title_corpus.shape


title_pca = clone(pca)
title_pca.set_params(n_components=200)
title_corpus_pca = title_pca.fit_transform(title_corpus)
from data_io import DataIO
from sklearn.decomposition import RandomizedPCA
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.base import clone
from sklearn.cross_validation import cross_val_score
import numpy as np

dio = DataIO("Settings.json")

title_corpus = dio.read_gensim_corpus("train_title_nltk_filtered.corpus.mtx")
pca = RandomizedPCA(random_state=3465343)
salaries = dio.get_salaries("train", log=True)

columns = ["Category", "ContractTime", "ContractType"]
le_features = dio.get_le_features(columns, "train_full")
extra_features = dio.get_features(columns, "train", le_features)
#extra_valid_features = dio.get_features(columns, "valid", le_features)

param = "RandomizedPCA title 200 Fulldescription 200 " + ",".join(columns)
print map(len, extra_features)
extra_features = map(lambda x: np.reshape(np.array(x), (len(x), 1)),
                     extra_features)

print type(title_corpus)
print title_corpus.shape

title_pca = clone(pca)
title_pca.set_params(n_components=200)
title_corpus_pca = title_pca.fit_transform(title_corpus)

print type(title_corpus_pca)
Exemplo n.º 3
0
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from pprint import pprint
from time import time
from sklearn.ensemble import ExtraTreesRegressor

dio = DataIO("Settings_loc5.json")

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

salaries = dio.get_salaries("train", log=True)

#title_corpus_csc = dio.read_gensim_corpus("train_title_nltk_filtered.corpus.mtx")
#desc_corpus_csc = dio.read_gensim_corpus("train_desc_nltk_filtered.corpus.mtx")
locraw_corpus_csc = dio.read_gensim_corpus(
    "train_locraw_nltk_filtered.corpus.mtx")

#print title_corpus_csc.shape
print locraw_corpus_csc.shape

pipeline = Pipeline([
    ('pca', RandomizedPCA(random_state=3465343)),
    ('trees',
     ExtraTreesRegressor(min_samples_split=2, n_estimators=10, n_jobs=4)),
])

parameters = {
    'pca__n_components': range(100, 601, 100),
}

metric = dio.error_metric