def update_event(self, input_called=-1):
    """Node event handler: on exec input 0, fit a LinearSVR and emit it.

    Reads optional hyper-parameters from input 1 (a dict, or None),
    training features from input 2 and targets from input 3, fits the
    model, places the fitted estimator on output value 1 and fires exec
    output 0.

    Args:
        input_called: index of the exec input that triggered the event;
            only index 0 is handled here.
    """
    if input_called == 0:
        clf = LinearSVR()
        # Bind once instead of calling self.input(1) twice; use
        # 'is not None' (identity) rather than '!= None' (equality),
        # the correct way to test for None.
        params = self.input(1)
        if params is not None:
            clf.set_params(**params)
        X = self.input(2)
        y = self.input(3)
        clf.fit(X, y)
        self.set_output_val(1, clf)
        self.exec_output(0)
class LinearSVRPermuteCoef:
    """LinearSVR wrapper that tracks coefficient extremes across fits.

    Every call to ``fit`` appends the max and min of the learned
    coefficient vector to the module-level ``coeffs_state`` store, so
    repeated (permutation) fits can be summarised afterwards via the
    static accessors.
    """

    def __init__(self, **kwargs):
        # All keyword arguments are forwarded verbatim to LinearSVR.
        self.model = LinearSVR(**kwargs)

    def fit(self, X, y):
        """Fit the wrapped model, mirror coef_/intercept_, log extremes."""
        self.model.fit(X, y)
        self.coef_ = self.model.coef_
        self.intercept_ = self.model.intercept_
        # Same recording order as always: max first, then min.
        coeffs_state['max'].append(np.max(self.coef_))
        coeffs_state['min'].append(np.min(self.coef_))
        return self

    def get_params(self, deep=True):
        """Delegate to the wrapped estimator."""
        return self.model.get_params(deep)

    def set_params(self, **kwargs):
        """Delegate to the wrapped estimator; returns self for chaining."""
        self.model.set_params(**kwargs)
        return self

    def predict(self, X):
        """Predict with the wrapped estimator."""
        return self.model.predict(X)

    def score(self, X, y, sample_weight=None):
        """Score via the wrapped estimator, honouring optional weights."""
        if sample_weight is None:
            return self.model.score(X, y)
        return self.model.score(X, y, sample_weight)

    @staticmethod
    def permute_min_coefs():
        """Return the recorded per-fit minimum coefficients."""
        return coeffs_state['min']

    @staticmethod
    def permute_max_coefs():
        """Return the recorded per-fit maximum coefficients."""
        return coeffs_state['max']

    @staticmethod
    def reset_perm_coefs():
        """Clear all recorded coefficient extremes."""
        coeffs_state['min'] = []
        coeffs_state['max'] = []
class LinearSVRPermuteCoef:
    """Wrapper around LinearSVR that records min/max coefficients per fit.

    Extremes are accumulated in the module-level ``coeffs_state`` dict and
    exposed/reset through the static helpers below.
    """

    def __init__(self, **kwargs):
        self.model = LinearSVR(**kwargs)

    def fit(self, X, y):
        """Train the wrapped model and log coefficient extremes."""
        self.model.fit(X, y)
        self.coef_ = self.model.coef_
        self.intercept_ = self.model.intercept_

        def log_extreme(bucket, reducer):
            # Append one summary value of the coefficient vector.
            bucket.append(reducer(self.coef_))

        log_extreme(coeffs_state['max'], np.max)
        log_extreme(coeffs_state['min'], np.min)
        return self

    def get_params(self, deep=True):
        """Forward to the underlying estimator."""
        return self.model.get_params(deep)

    def set_params(self, **kwargs):
        """Forward to the underlying estimator and return self."""
        self.model.set_params(**kwargs)
        return self

    def predict(self, X):
        """Forward prediction to the underlying estimator."""
        return self.model.predict(X)

    def score(self, X, y, sample_weight=None):
        """Forward scoring, only passing weights when supplied."""
        if sample_weight is not None:
            return self.model.score(X, y, sample_weight)
        return self.model.score(X, y)

    @staticmethod
    def permute_min_coefs():
        """All minimum coefficients recorded so far."""
        return coeffs_state['min']

    @staticmethod
    def permute_max_coefs():
        """All maximum coefficients recorded so far."""
        return coeffs_state['max']

    @staticmethod
    def reset_perm_coefs():
        """Drop every recorded extreme."""
        coeffs_state['min'] = []
        coeffs_state['max'] = []
class Learner:
    """
    Trains models for an instrument, cross-validates the fit and makes
    rate predictions with the trained model.
    """

    def __init__(self, instrument, predictor):
        """
        Set up the learner for an instrument/predictor pair.

        Args:
            instrument: Instrument object.
            predictor: Predictor object.
        """
        self.instrument = instrument
        self.predictor = predictor
        self.init_learning_model()

    def init_learning_model(self):
        """
        Create the underlying estimator named by the predictor.

        Args:
            None.
        """
        name = self.predictor.name
        if name == 'treeRegressor':
            self.model = DecisionTreeRegressor()
        if name == 'linearSVMRegressor':
            self.model = LinearSVR()

    def get_training_samples(self, end_date):
        """
        Retrieve all training samples dated before the end date.

        Args:
            end_date: Date object, or None for no upper bound.

        Returns:
            List of TrainingSample, ordered by date.
        """
        # Samples dated on end_date itself are excluded (strictly before).
        last_date = (end_date - datetime.timedelta(1)
                     if end_date is not None else None)
        return ts.get_samples(instrument=self.instrument,
                              end=last_date,
                              order_by=['date'])

    def learn(self, **kwargs):
        """
        Build the learning model from this instrument's training samples.

        Args: Named arguments.
            cv_fold: Integer. Number of folds for cross validation.
            before: Date object. Use samples before this date.

        Returns:
            float. Mean cross validation score from learning.
        """
        samples = self.get_training_samples(kwargs.get('before'))
        features = [sample.features for sample in samples]
        targets = [sample.target for sample in samples]
        self.model.set_params(**self.predictor.parameters)
        scores = cross_val_score(self.model, features, targets,
                                 cv=kwargs.get('cv_fold'))
        ave_score = sum(scores) / len(scores)
        # Refit on the full sample set after cross-validation.
        self.model.fit(features, targets)
        return ave_score

    def predict(self, features):
        """
        Use the trained model to predict profitable change.

        Args:
            features: List of floats.

        Returns:
            Decimal. Predicted profitable change, quantized to two places.
        """
        row = np.asarray(features).reshape(1, -1)
        prediction = self.model.predict(row)
        return decimal.Decimal(float(prediction)).quantize(TWO_PLACES)
class TextRegressor:
    """Text regression pipeline: n-gram vectorizer + linear regressor.

    Hyper-parameters live in a flat ``par`` dict whose keys are routed to
    the document analyzer, the vectorizer or the regressor as appropriate.
    """

    # Defaults for the flat parameter namespace shared by analyzer,
    # vectorizer and regressor.
    param_defaults = {'min_df': 1, 'c_ngmin': 1, 'c_ngmax': 1,
                      'w_ngmax': 1, 'w_ngmin': 1, 'lowercase': 'word',
                      'alpha': 1.0, 'C': 1.0, 'mix': 1.0}

    def __init__(self, regressor='ridge', vectorizer='tf-idf'):
        """Build the requested components and snapshot their defaults.

        Args:
            regressor: one of 'ridge', 'SVR', 'linearsvr'.
            vectorizer: currently only 'tf-idf'.
        """
        if regressor == 'ridge':
            from sklearn.linear_model import Ridge
            self.reg = Ridge()
        elif regressor == 'SVR':
            from sklearn.svm import SVR
            self.reg = SVR()
        elif regressor == 'linearsvr':
            from sklearn.svm import LinearSVR
            self.reg = LinearSVR()
        if vectorizer == 'tf-idf':
            from sklearn.feature_extraction.text import TfidfVectorizer
            self.vec = TfidfVectorizer()
        self.vec_params_default = self.vec.get_params()
        self.reg_params_default = self.reg.get_params()
        self._reset()

    def _reset(self):
        """Restore analyzer/vectorizer/regressor parameters to defaults."""
        self.par = dict(self.param_defaults)
        # BUG FIX: copy the default dicts. The original aliased them
        # (self.vec_params = self.vec_params_default), so set_params()'s
        # .update() silently mutated the defaults and later resets no
        # longer restored the initial state.
        self.vec_params = dict(self.vec_params_default)
        self.vec.set_params(**self.vec_params)
        self.reg_params = dict(self.reg_params_default)
        self.reg.set_params(**self.reg_params)

    def set_params(self, **params):
        """Reset, then apply ``params``, routing each key to its target."""
        self._reset()
        self.par.update(params)
        ngram_analyzer = DocAnalyzer(
            lowercase=self.par.get('lowercase'),
            c_ngmin=self.par.get('c_ngmin'),
            c_ngmax=self.par.get('c_ngmax'),
            w_ngmin=self.par.get('w_ngmin'),
            w_ngmax=self.par.get('w_ngmax'))
        # Forward only the keys each component actually understands.
        self.vec_params.update(
            {k: self.par[k] for k in self.par.keys() & self.vec_params.keys()})
        self.vec.set_params(**self.vec_params)
        self.vec.set_params(analyzer=ngram_analyzer)
        self.reg_params.update(
            {k: self.par[k] for k in self.par.keys() & self.reg_params.keys()})
        self.reg.set_params(**self.reg_params)

    def get_params(self):
        """Return the flat parameter dict."""
        return self.par

    def fit(self, text, outcome):
        """Vectorize ``text`` (optionally with extra numerics) and fit.

        NOTE(review): a (text, num) pair is detected via ``len(text) == 2``,
        which would misfire on a plain 2-document corpus — confirm callers
        always pass either a longer sequence or a 2-tuple.
        """
        num = None
        if len(text) == 2:
            text, num = text
        x = self.vec.fit_transform(text)
        if num is not None:
            # Mix scaled numeric features alongside the sparse n-grams.
            x = hstack((x, self.par['mix'] * num), format='csr')
        self.reg.fit(x, outcome)

    def predict(self, text, gold=None, gold_rank=None, rank_dir=-1,
                return_score=False):
        """Predict outcomes; optionally also score against ``gold``."""
        num = None
        if len(text) == 2:
            text, num = text
        x = self.vec.transform(text)
        if num is not None:
            x = hstack((x, self.par['mix'] * num), format='csr')
        pred = self.reg.predict(x)
        if return_score:
            return pred, self._score(gold, pred, gold_rank, rank_dir)
        else:
            return pred

    def _score(self, gold, pred, gold_rank=None, rank_dir=-1, verbose=False):
        """Compute r2, rmse, Pearson corr and rank corr of pred vs gold."""
        r2 = r2_score(gold, pred)
        rmse = np.sqrt(mean_squared_error(gold, pred))
        if gold_rank is None:
            gold_rank = rankdata(rank_dir * gold, method='ordinal')
        pred_rank = rankdata(rank_dir * pred, method='ordinal')
        corr, _ = pearsonr(gold, pred)
        rank_corr, _ = pearsonr(gold_rank, pred_rank)
        if verbose:
            fmt = ("{}: n={}, min={:.4f}, max={:.4f}, mean={:.4f}, "
                   "var={:.4f}, skew={:.4f}, kurtosis={:.4f}")
            gold_dsc = describe(gold)
            pred_dsc = describe(pred)
            print(fmt.format('gold', gold_dsc[0], *gold_dsc[1], *gold_dsc[2:]))
            print(fmt.format('pred', pred_dsc[0], *pred_dsc[1], *pred_dsc[2:]))
        return {'r2': r2, 'rmse': rmse, 'rank_corr': rank_corr, 'corr': corr}

    def score(self, text, gold, gold_rank=None, rank_dir=-1, verbose=False):
        """Predict on ``text`` and score the predictions against ``gold``."""
        pred = self.predict(text)
        return self._score(gold, pred, gold_rank, rank_dir, verbose=verbose)
class Baseline:
    # Python 2 class: trains a LinearSVR distance-regression baseline for
    # one destination type in one city, then saves/loads the model and
    # writes per-image predictions to a pickle.

    def __init__(self, city, dest_name):
        # city: dataset subdirectory name; dest_name: key into dest_to_idx.
        self.city = city
        self.dest_name = dest_name
        print 'Baseline implementation for {:s} : {:s}'.format(
            self.city, self.dest_name)
        # NOTE(review): 'gas_station' and 'high_school' both map to column 3
        # and column 2 is unused -- looks like a typo ('gas_station': 2?).
        # Confirm against the label file's column order before changing.
        dest_to_idx = {
            'bofa': 0,
            'church': 1,
            'gas_station': 3,
            'high_school': 3,
            'mcdonalds': 4
        }
        self.idx = dest_to_idx[self.dest_name]
        self.base_dir = osp.join('../data/dataset', city)
        self.train_label_filename = osp.join(self.base_dir, 'distance',
                                             'train_labels.h5')
        self.train_im_list_filename = osp.join(self.base_dir, 'distance',
                                               'train_im_list.txt')
        self.test_label_filename = osp.join(self.base_dir, 'distance',
                                            'test_labels.h5')
        self.test_im_list_filename = osp.join(self.base_dir, 'distance',
                                              'test_im_list.txt')
        # dual=False + squared_epsilon_insensitive: solves the primal, which
        # suits n_samples > n_features.
        self.svr = LinearSVR(verbose=1, epsilon=0, dual=False, tol=1e-3,
                             max_iter=50000,
                             loss='squared_epsilon_insensitive')
        # copy=False: scale feature matrices in place to save memory.
        self.scaler = StandardScaler(copy=False)
        self.model_filename = osp.join(self.base_dir, 'distance',
                                       '{:s}.pkl'.format(self.dest_name))

    def collect_train_data(self):
        # Load image names, features (transposed to samples-by-features)
        # and the label column for this destination, then filter rows.
        with open(self.train_im_list_filename, 'r') as train_f_im:
            train_im_names = [l.rstrip() for l in train_f_im]
        print 'Loading train data...'
        with h5py.File('../data/dataset/train_feats1.mat', 'r') as f:
            self.train_X = np.asarray(f['train_features'],
                                      dtype=np.float32).T
        with h5py.File(self.train_label_filename, 'r') as train_f_label:
            self.train_y = train_f_label['label'][:, self.idx].astype(
                np.float32)
        # select cities and remove rogue labels
        # idx = [i for i,n in enumerate(train_im_names) if ((('boston' in n)) and self.train_y[i] < 1e3)]
        # Keep only samples whose distance label is below 1e3 (rogue-label
        # cutoff).
        idx = [
            i for i, n in enumerate(train_im_names) if self.train_y[i] < 1e3
        ]
        self.train_X = self.train_X[idx, :]
        self.train_y = self.train_y[idx]
        assert (self.train_y.shape[0] == self.train_X.shape[0])
        print 'Done, using {:d} images for training'.format(
            self.train_X.shape[0])

    def train(self, C=1.0):
        # Standardize features (in place), set the requested C, and fit.
        print 'Scaling...'
        self.train_X = self.scaler.fit_transform(self.train_X)
        print 'Training with C = {:f}'.format(C)
        p = self.svr.get_params()
        p['C'] = C
        self.svr.set_params(**p)
        self.svr.fit(self.train_X, self.train_y)

    def save_predictions(self):
        # Load test features and filenames, restore the persisted
        # model+scaler, predict, and pickle a {key: prediction} dict.
        with h5py.File('../data/dataset/test_feats.mat', 'r') as f:
            print 'Loading feats...'
            self.test_X = np.asarray(f['test_features'], dtype=np.float32).T
        with open('../data/dataset/test_filenames.txt', 'r') as f:
            im_names = [n.rstrip() for n in f]
        keys = [get_key(im_name) for im_name in im_names]
        assert (len(im_names) == self.test_X.shape[0])
        print 'Loading models...'
        # Overwrites the in-memory svr/scaler with the saved ones.
        d = joblib.load(self.model_filename)
        self.svr = d['svr']
        self.scaler = d['scaler']
        print 'Scaling...'
        self.test_X = self.scaler.transform(self.test_X)
        print 'Predicting...'
        preds = self.svr.predict(self.test_X)
        print 'Done!'
        pred_dict = {key: pred for (key, pred) in zip(keys, preds)}
        fn = '../data/dataset/test_preds_{:s}.pk'.format(self.dest_name)
        with open(fn, 'w') as f:
            pickle.dump(pred_dict, f)
        print 'Saved', fn

    def save_current_model(self):
        # Persist the current svr and scaler together for later reuse.
        joblib.dump({
            'svr': self.svr,
            'scaler': self.scaler
        }, self.model_filename)
        print self.model_filename, 'saved'
params = {'max_features':['auto','sqrt','log2']} RF_model = GridSearchCV(RF_est, params) RF_model.fit(X,y) print('Best {}'.format(RF_model.best_params_)) print('Performing grid search on GBR') n_features = X.shape[1] params = {'max_features':['auto','sqrt','log2'], 'max_depth':[2, 3]} GBR_model = GridSearchCV(GBR_est, params) GBR_model.fit(X,y) print('Best {}'.format(GBR_model.best_params_)) else: Lin_model = Lin_est.set_params(alpha=100.0) SVR_model = svr_est.set_params(C=1.0) RF_model = RF_est.set_params(max_features='auto') GBR_model = GBR_est.set_params(max_features='auto', max_depth=3) #%% Specify set of models to test model_set = [('Null',LCM.rand_pick_mod()), ('Lin', Lin_model), ('Lin_SVR',SVR_model), ('GBR',GBR_model), ('RF', RF_model)] # model_set = [('Null',LCM.rand_pick_mod()), # ('Lin', Lin_model), # ('RF', RF_model)]
def split_and_encode_Xy(X, y, encoding='le', feat_scaler=True,
                        tgt_scaler=True, freqs=None, dummy_cols=10,
                        ohe_dates=False, test_size=.25, feat_select=True,
                        shuffle=True, enc_Xy=False, X_test=None,
                        scoring='r2'):
    """
    Splits X, y into train and test sub sets, encode them
    ---
    shuffle: set it to False to preserve items order
    enc_Xy: if True, treat the whole of X/y as the training set (no split)
        and only transform the externally supplied X_test.
    Returns a dict with keys "data" (X_train, X_test, y_train, y_test),
    "scalers" (feature scaler, target scaler) and optionally "f_selector".
    """
    X_train, y_train, y_test = (None, None, None)
    # do not shuffle the data before splitting to respect row order
    if not enc_Xy:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, shuffle=shuffle)
    else:
        print()
        print("Encoding full data set 'X' -> 'X_train'")
        X_train = X
        y_train = y
        print("Let's have a look at the first row and output")
        print("X_train\n", X_train.head())
        print("y_train\n", y_train.head())
        print()

    # Derive calendar features for any datetime columns before encoding.
    if list(X.select_dtypes(include=["datetime"]).columns):
        print("datetime type found.")
        X_train = lc.get_date_features(X_train, freqs)
        if X_test is not None:
            X_test = lc.get_date_features(X_test, freqs)

    if encoding == 'le':
        X_train = lc.dummy_encode(X_train.copy()).astype(np.float32)
        if X_test is not None:
            X_test = lc.dummy_encode(X_test.copy()).astype(np.float32)
    elif encoding == 'ohe':
        # mixed label-onehot encoding; test columns are re-ordered to
        # match the train frame afterwards
        X_train = lc.get_dummies_or_label_encode(
            X_train.copy(), dummy_cols=dummy_cols,
            ohe_dates=ohe_dates).astype(np.float32)
        if X_test is not None:
            X_test = lc.get_dummies_or_label_encode(
                X_test.copy(), dummy_cols=dummy_cols,
                ohe_dates=ohe_dates).astype(np.float32)
            X_test = eu.reorder_ohencoded_X_test_columns(X_train, X_test)
    else:
        raise ValueError("%r is not a valid value for var 'encoding', \n"
                         "valid values are in ['le', 'ohe']" % encoding)
    print()

    # Median-impute any NaNs introduced by encoding.
    if X_train.isnull().values.any():
        X_train = X_train.fillna(X_train.median())
    if X_test is not None and X_test.isnull().values.any():
        X_test = X_test.fillna(X_test.median())

    print("After encoding, first row and output")
    print("X_train\n", X_train.head())
    print("X_train.columns\n", list(X_train.columns))
    print("y_train\n", y_train.head())
    print()

    scalers = (None, None)
    data_and_scalers = {"scalers": scalers}

    if feat_scaler:
        print("scaling train and test data")
        scaler = StandardScaler()
        # when enc_Xy is set, scaling is deferred to training time
        # before finalization
        if not enc_Xy:
            X_train_scaled = scaler.fit_transform(X_train)
            X_train = DataFrame(data=X_train_scaled,
                                columns=X_train.columns,
                                index=X_train.index)
            print()
            print("X_train shape:", X_train.shape)
            if X_test is not None:
                X_test_scaled = scaler.transform(X_test)
                X_test = DataFrame(data=X_test_scaled,
                                   columns=X_test.columns,
                                   index=X_test.index)
                print("X_test shape:", X_test.shape)
            print()
            print("After scaling...")
            print("X_train\n", X_train[:1])
            print("X_train type", type(X_train))
            if X_test is not None:
                print("X_test\n", X_test[:1])
                print("X_test type", type(X_test))
            print()
        scalers = (scaler, None)
        data_and_scalers["scalers"] = scalers

    print("scoring:", scoring)
    # standard scaling introduces negative values, which can't be fed
    # to log, hence to rmsle -- so neg_rmsle uses MinMaxScaler instead
    if tgt_scaler:
        print("Scaling target...")
        if scoring != 'neg_rmsle':
            y_scaler = StandardScaler()
            y_train = y_scaler.fit_transform(
                y_train.values.reshape(-1, 1)).ravel()
        else:
            y_scaler = MinMaxScaler()
            y_train = y_scaler.fit_transform(y_train.values.reshape(-1, 1))
        print("y_train and its type\n", (y_train[:1], type(y_train)))
        if not enc_Xy:
            if scoring != 'neg_rmsle':
                y_test = y_scaler.transform(
                    y_test.values.reshape(-1, 1)).ravel()
            else:
                # BUG FIX: was fit_transform, which re-fitted the target
                # scaler on the test set (leakage + train/test scales
                # no longer comparable); transform with the train-fitted
                # scaler instead.
                y_test = y_scaler.transform(y_test.values.reshape(-1, 1))
            print("y_test and its type\n", (y_test[:3], type(y_test)))
        scalers = (scalers[0], y_scaler)
        data_and_scalers["scalers"] = scalers
    print()

    # this works for classifiers
    # featsel_tuple = eu.create_feature_selector(X_train, None, seed)
    if feat_select and X_train.shape[1] > 10:
        # BUG FIX: max_iter must be an int; 1e4 is a float and is
        # rejected by sklearn's parameter validation.
        lsvr = LinearSVR(max_iter=10000)
        lsvr = lsvr.set_params(C=0.01,
                               loss="squared_epsilon_insensitive",
                               dual=False)
        # threshold=[1e-2, 1e-1] or in ["mean", "median"]
        thsd = "median"
        featselector = SelectFromModel(lsvr, threshold=thsd)
        # tscv_fs = TimeSeriesSplit(n_splits=5)
        # featselector = RFECV(lsvr, step=1, cv=tscv_fs)
        data_and_scalers["f_selector"] = featselector
        if not enc_Xy:
            X_train_selected = featselector.fit_transform(X_train, y_train)
            xtr_indices = featselector.get_support()
            X_train = DataFrame(data=X_train_selected,
                                columns=X_train.columns[xtr_indices],
                                index=X_train.index)
            print("After feature selection...")
            print("X_train shape:", X_train.shape)
            if X_test is not None:
                X_test_selected = featselector.transform(X_test)
                xtt_indices = featselector.get_support()
                X_test = DataFrame(data=X_test_selected,
                                   columns=X_test.columns[xtt_indices],
                                   index=X_test.index)
                print("X_test shape:", X_test.shape)

    data_and_scalers["data"] = (X_train, X_test, y_train, y_test)
    return data_and_scalers
print('Performing grid search on RF') n_features = X.shape[1] params = {'max_features': ['auto', 'sqrt', 'log2']} RF_model = GridSearchCV(RF_est, params) RF_model.fit(X, y) print('Best {}'.format(RF_model.best_params_)) print('Performing grid search on GBR') n_features = X.shape[1] params = {'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [2, 3]} GBR_model = GridSearchCV(GBR_est, params) GBR_model.fit(X, y) print('Best {}'.format(GBR_model.best_params_)) else: Lin_model = Lin_est.set_params(alpha=100.0) SVR_model = svr_est.set_params(C=1.0) RF_model = RF_est.set_params(max_features='auto') GBR_model = GBR_est.set_params(max_features='auto', max_depth=3) #%% Specify set of models to test model_set = [('Null', LCM.rand_pick_mod()), ('Lin', Lin_model), ('Lin_SVR', SVR_model), ('GBR', GBR_model), ('RF', RF_model)] # model_set = [('Null',LCM.rand_pick_mod()), # ('Lin', Lin_model), # ('RF', RF_model)] leg_titles = { 'Null': 'Random\nPicking', 'Lin': 'Linear\nModel', 'Lin_SVR': 'Linear SVM', 'GBR': 'Gradient\nBoosting',