def update_event(self, input_called=-1):
        if input_called == 0:
            clf = LinearSVR()
            # optional dict of hyperparameters arriving on input 1
            if self.input(1) is not None:
                clf.set_params(**self.input(1))
            X = self.input(2)  # feature matrix
            y = self.input(3)  # regression targets

            clf.fit(X, y)
            self.set_output_val(1, clf)

            self.exec_output(0)
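# A minimal stand-alone sketch of what the node above computes, assuming
# plain scikit-learn outside any dataflow framework; X_demo and y_demo are
# made-up illustration data, not part of the original node.
import numpy as np
from sklearn.svm import LinearSVR

X_demo = np.random.rand(20, 3)
y_demo = np.random.rand(20)
clf_demo = LinearSVR(max_iter=10000)
clf_demo.fit(X_demo, y_demo)
print(clf_demo.predict(X_demo[:2]))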
    class LinearSVRPermuteCoef:
        """LinearSVR wrapper that records the extreme coefficients of every
        fit in a shared ``coeffs_state`` dict, assumed to be defined in an
        enclosing scope as ``{'min': [], 'max': []}``."""

        def __init__(self, **kwargs):
            self.model = LinearSVR(**kwargs)

        def fit(self, X, y):
            self.model.fit(X, y)

            # mirror the fitted attributes so this wrapper behaves like
            # a regular sklearn estimator
            self.coef_ = self.model.coef_
            self.intercept_ = self.model.intercept_

            def add_coef(arr, fn):
                arr.append(fn(self.coef_))

            # record the largest and smallest coefficient of this fit
            add_coef(coeffs_state['max'], np.max)
            add_coef(coeffs_state['min'], np.min)

            return self

        def get_params(self, deep=True):
            return self.model.get_params(deep=deep)

        def set_params(self, **kwargs):
            self.model.set_params(**kwargs)
            return self

        def predict(self, X):
            return self.model.predict(X)

        def score(self, X, y, sample_weight=None):
            return self.model.score(X, y, sample_weight=sample_weight)

        @staticmethod
        def permute_min_coefs():
            return coeffs_state['min']

        @staticmethod
        def permute_max_coefs():
            return coeffs_state['max']

        @staticmethod
        def reset_perm_coefs():
            coeffs_state['min'] = []
            coeffs_state['max'] = []
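# A hedged usage sketch for LinearSVRPermuteCoef; it assumes the class is
# reachable at module level, that np is imported as the class requires, and
# that coeffs_state is the dict of lists the class expects (all assumptions,
# as are X_demo/y_demo).
coeffs_state = {'min': [], 'max': []}
X_demo = np.random.rand(30, 4)
y_demo = np.random.rand(30)

est = LinearSVRPermuteCoef(max_iter=10000)
est.fit(X_demo, y_demo)
print(est.permute_min_coefs(), est.permute_max_coefs())
est.reset_perm_coefs()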
Example No. 4
class Learner:
    """ Class responsible for training models, finding the best fit and making
        rate predictions based on the best fit model.
    """
    def __init__(self, instrument, predictor):
        """ Initialize the Learner class based on a predictor and instrument.

            Args:
                instrument: Instrument object.
                predictor: Predictor object.
        """
        self.instrument = instrument
        self.predictor = predictor
        self.init_learning_model()

    def init_learning_model(self):
        """ Initialize the learning model according to the given predictor.

            Args:
                None.
        """
        if self.predictor.name == 'treeRegressor':
            self.model = DecisionTreeRegressor()
        elif self.predictor.name == 'linearSVMRegressor':
            self.model = LinearSVR()
        else:
            raise ValueError(
                'Unknown predictor: {}'.format(self.predictor.name))

    def get_training_samples(self, end_date):
        """ Retrieve all training samples before the end date.

            Args:
                end_date: Date object. Retrieve samples dated before end_date.

            Returns:
                all_samples: List of TrainingSample.
        """
        last_date = None
        if end_date is not None:
            last_date = end_date - datetime.timedelta(1)
        all_samples = ts.get_samples(instrument=self.instrument,
                                     end=last_date,
                                     order_by=['date'])
        return all_samples

    def learn(self, **kwargs):
        """ Use the training samples for the given instrument to build a
            learning model for the learner.

            Args:
                Named arguments.
                    cv_fold: Integer. Number of folds for cross validation.
                    before: Date object. Use samples before this date.

            Returns:
                best_score: float. Best cross validation score from learning.
        """
        cv_fold = kwargs.get('cv_fold')
        end_date = kwargs.get('before')

        all_training_samples = self.get_training_samples(end_date)
        features = [x.features for x in all_training_samples]
        targets = [x.target for x in all_training_samples]

        self.model.set_params(**self.predictor.parameters)
        scores = cross_val_score(self.model, features, targets, cv=cv_fold)
        ave_score = sum(scores) / len(scores)

        self.model.fit(features, targets)

        return ave_score

    def predict(self, features):
        """ Use trained model to predict profitable change given the features.

            Args:
                features: List of floats.

            Returns:
                Decimal. Predicted profitable change.
        """
        features = np.asarray(features).reshape(1, -1)
        predicted = self.model.predict(features)
        # predict() returns a length-1 array; take its single element
        return decimal.Decimal(float(predicted[0])).quantize(TWO_PLACES)
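# A hedged sketch of driving Learner; _StubPredictor is a made-up stand-in
# for the project's Predictor object, and learn() still needs the real
# ts.get_samples backend, so that call is left commented out.
class _StubPredictor:
    name = 'linearSVMRegressor'
    parameters = {'C': 1.0, 'max_iter': 10000}

learner = Learner(instrument=None, predictor=_StubPredictor())
# best_score = learner.learn(cv_fold=5, before=None)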
Example No. 5
class TextRegressor:
    param_defaults = {'min_df': 1, 'c_ngmin': 1, 'c_ngmax': 1,
                      'w_ngmax': 1, 'w_ngmin': 1, 'lowercase': 'word',
                      'alpha': 1.0, 'C': 1.0, 'mix': 1.0}
    def __init__(self, regressor='ridge', vectorizer='tf-idf'):
        if regressor == 'ridge':
            from sklearn.linear_model import Ridge
            self.reg = Ridge()
        elif regressor == 'SVR':
            from sklearn.svm import SVR
            self.reg = SVR()
        elif regressor == 'linearsvr':
            from sklearn.svm import LinearSVR
            self.reg = LinearSVR()
        if vectorizer == 'tf-idf':
            from sklearn.feature_extraction.text import TfidfVectorizer
            self.vec = TfidfVectorizer()
        self.vec_params_default = self.vec.get_params()
        self.reg_params_default = self.reg.get_params()
        self._reset()

    def _reset(self):
        self.par = dict(self.param_defaults)
        # copy the defaults so later update() calls cannot mutate them
        self.vec_params = dict(self.vec_params_default)
        self.vec.set_params(**self.vec_params)
        self.reg_params = dict(self.reg_params_default)
        self.reg.set_params(**self.reg_params)

    def set_params(self, **params):
        self._reset()
        self.par.update(params)
        ngram_analyzer = DocAnalyzer(
                    lowercase=self.par.get('lowercase'),
                    c_ngmin=self.par.get('c_ngmin'),
                    c_ngmax=self.par.get('c_ngmax'),
                    w_ngmin=self.par.get('w_ngmin'),
                    w_ngmax=self.par.get('w_ngmax'))
        self.vec_params.update(
            {k:self.par[k] for k in self.par.keys() & self.vec_params.keys()})
        self.vec.set_params(**self.vec_params)
        self.vec.set_params(analyzer=ngram_analyzer)
        self.reg_params.update(
            {k:self.par[k] for k in self.par.keys() & self.reg_params.keys()})
        self.reg.set_params(**self.reg_params)

    def get_params(self):
        return self.par

    def fit(self, text, outcome):
        # 'text' is either a sequence of documents or a
        # (documents, numeric_features) pair
        num = None
        if isinstance(text, tuple) and len(text) == 2:
            text, num = text
        x = self.vec.fit_transform(text)
        if num is not None:
            x = hstack((x, self.par['mix'] * num), format='csr')
        self.reg.fit(x, outcome)

    def predict(self, text,
                gold=None, gold_rank=None, rank_dir=-1, return_score=False):
        num = None
        if isinstance(text, tuple) and len(text) == 2:
            text, num = text
        x = self.vec.transform(text)
        if num is not None:
            x = hstack((x, self.par['mix'] * num), format='csr')
        pred = self.reg.predict(x)
        if return_score:
            return pred, self._score(gold, pred, gold_rank, rank_dir)
        else:
            return pred

    def _score(self, gold, pred, gold_rank=None, rank_dir=-1,
            verbose=False):
        r2 = r2_score(gold, pred)
        rmse = np.sqrt(mean_squared_error(gold, pred))
        if gold_rank is None:
            gold_rank = rankdata(rank_dir * gold, method='ordinal')
        pred_rank = rankdata(rank_dir * pred, method='ordinal')
        corr, _ = pearsonr(gold, pred)
        rank_corr, _ = pearsonr(gold_rank, pred_rank)
        if verbose:
            fmt = ("{}: n={}, min={:.4f}, max={:.4f}, mean={:.4f}, "
                   "var={:.4f}, skew={:.4f}, kurtosis={:.4f}")
            gold_dsc = describe(gold)
            pred_dsc = describe(pred)
            print(fmt.format('gold',
                gold_dsc[0], *gold_dsc[1], *gold_dsc[2:]))
            print(fmt.format('pred',
                pred_dsc[0], *pred_dsc[1], *pred_dsc[2:]))
        return {'r2': r2, 'rmse': rmse, 'rank_corr': rank_corr, 'corr': corr}

    def score(self, text, gold, gold_rank=None, rank_dir=-1,
            verbose=False):
        pred = self.predict(text)
        return self._score(gold, pred, gold_rank, rank_dir,
                verbose=verbose)
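# A hedged usage sketch for TextRegressor; the toy corpus and targets are
# made up, and DocAnalyzer plus the metric imports (r2_score,
# mean_squared_error, rankdata, pearsonr) are assumed available in the
# module, as the class itself requires.
docs = ['a short text', 'another longer example text', 'yet more words here']
gold = [1.0, 2.0, 3.0]

tr = TextRegressor(regressor='ridge', vectorizer='tf-idf')
tr.fit(docs, gold)
print(tr.predict(docs))
print(tr.score(docs, gold))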
Example No. 6
class Baseline:
    def __init__(self, city, dest_name):
        self.city = city
        self.dest_name = dest_name
        print('Baseline implementation for {:s} : {:s}'.format(
            self.city, self.dest_name))
        dest_to_idx = {
            'bofa': 0,
            'church': 1,
            'gas_station': 2,
            'high_school': 3,
            'mcdonalds': 4
        }
        self.idx = dest_to_idx[self.dest_name]
        self.base_dir = osp.join('../data/dataset', city)
        self.train_label_filename = osp.join(self.base_dir, 'distance',
                                             'train_labels.h5')
        self.train_im_list_filename = osp.join(self.base_dir, 'distance',
                                               'train_im_list.txt')
        self.test_label_filename = osp.join(self.base_dir, 'distance',
                                            'test_labels.h5')
        self.test_im_list_filename = osp.join(self.base_dir, 'distance',
                                              'test_im_list.txt')
        self.svr = LinearSVR(verbose=1,
                             epsilon=0,
                             dual=False,
                             tol=1e-3,
                             max_iter=50000,
                             loss='squared_epsilon_insensitive')
        self.scaler = StandardScaler(copy=False)
        self.model_filename = osp.join(self.base_dir, 'distance',
                                       '{:s}.pkl'.format(self.dest_name))

    def collect_train_data(self):
        with open(self.train_im_list_filename, 'r') as train_f_im:
            train_im_names = [l.rstrip() for l in train_f_im]

        print('Loading train data...')
        with h5py.File('../data/dataset/train_feats1.mat', 'r') as f:
            self.train_X = np.asarray(f['train_features'], dtype=np.float32).T

        with h5py.File(self.train_label_filename, 'r') as train_f_label:
            self.train_y = train_f_label['label'][:,
                                                  self.idx].astype(np.float32)

        # select cities and remove rogue labels
        # idx = [i for i,n in enumerate(train_im_names) if ((('boston' in n)) and self.train_y[i] < 1e3)]
        idx = [
            i for i, n in enumerate(train_im_names) if self.train_y[i] < 1e3
        ]

        self.train_X = self.train_X[idx, :]
        self.train_y = self.train_y[idx]

        assert (self.train_y.shape[0] == self.train_X.shape[0])
        print('Done, using {:d} images for training'.format(
            self.train_X.shape[0]))

    def train(self, C=1.0):
        print('Scaling...')
        self.train_X = self.scaler.fit_transform(self.train_X)
        print('Training with C = {:f}'.format(C))
        p = self.svr.get_params()
        p['C'] = C
        self.svr.set_params(**p)
        self.svr.fit(self.train_X, self.train_y)

    def save_predictions(self):
        with h5py.File('../data/dataset/test_feats.mat', 'r') as f:
            print('Loading feats...')
            self.test_X = np.asarray(f['test_features'], dtype=np.float32).T

        with open('../data/dataset/test_filenames.txt', 'r') as f:
            im_names = [n.rstrip() for n in f]
        keys = [get_key(im_name) for im_name in im_names]

        assert (len(im_names) == self.test_X.shape[0])

        print('Loading models...')
        d = joblib.load(self.model_filename)
        self.svr = d['svr']
        self.scaler = d['scaler']

        print('Scaling...')
        self.test_X = self.scaler.transform(self.test_X)
        print('Predicting...')
        preds = self.svr.predict(self.test_X)
        print('Done!')
        pred_dict = {key: pred for (key, pred) in zip(keys, preds)}
        fn = '../data/dataset/test_preds_{:s}.pk'.format(self.dest_name)
        with open(fn, 'wb') as f:  # pickle needs a binary-mode file
            pickle.dump(pred_dict, f)
        print('Saved', fn)

    def save_current_model(self):
        joblib.dump({
            'svr': self.svr,
            'scaler': self.scaler
        }, self.model_filename)
        print(self.model_filename, 'saved')
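# A hedged driver sketch for Baseline; the city and destination values and
# the on-disk .h5/.mat layout are assumptions taken from the hard-coded
# paths in the class above.
if __name__ == '__main__':
    baseline = Baseline(city='boston', dest_name='bofa')
    baseline.collect_train_data()
    baseline.train(C=1.0)
    baseline.save_current_model()
    baseline.save_predictions()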
Example No. 8
def split_and_encode_Xy(X,
                        y,
                        encoding='le',
                        feat_scaler=True,
                        tgt_scaler=True,
                        freqs=None,
                        dummy_cols=10,
                        ohe_dates=False,
                        test_size=.25,
                        feat_select=True,
                        shuffle=True,
                        enc_Xy=False,
                        X_test=None,
                        scoring='r2'):
    """
    Splits X, y into train and test sub sets, encode them

    ---

    shuffle: set it to False to preserve items order
    """
    X_train, y_train, y_test = (None, None, None)
    # do not shuffle the data before splitting to respect row order
    if not enc_Xy:
        # check X, y are valid dataframes or numpy arrays...
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, shuffle=shuffle)
    else:
        print()
        print("Encoding full data set 'X' -> 'X_train'")
        X_train = X
        y_train = y

    print("Let's have a look at the first row and output")
    print("X_train\n", X_train.head())
    print("y_train\n", y_train.head())
    print()

    if list(X.select_dtypes(include=["datetime"]).columns):
        print("datetime type found.")
        X_train = lc.get_date_features(X_train, freqs)
        if X_test is not None:
            X_test = lc.get_date_features(X_test, freqs)

        # print(X_train["Month"].head(3))

    if encoding == 'le':
        X_train = lc.dummy_encode(X_train.copy()).astype(np.float32)
        if X_test is not None:
            X_test = lc.dummy_encode(X_test.copy()).astype(np.float32)
    elif encoding == 'ohe':
        # do this for mixed label-onehot encoding !
        # X_train.reset_index(drop=True, inplace=True)
        # X_test.reset_index(drop=True, inplace=True)

        X_train = lc.get_dummies_or_label_encode(X_train.copy(),
                                                 dummy_cols=dummy_cols,
                                                 ohe_dates=ohe_dates).astype(
                                                     np.float32)
        # print("oheencoded X_train['month'] \n", X_train["Month"].head(3))

        if X_test is not None:
            X_test = lc.get_dummies_or_label_encode(
                X_test.copy(), dummy_cols=dummy_cols,
                ohe_dates=ohe_dates).astype(np.float32)

            X_test = eu.reorder_ohencoded_X_test_columns(X_train, X_test)
    else:
        raise ValueError("%r is not a valid value for var 'encoding', \n"
                         "valid values are in ['le', 'ohe']" % encoding)

    print()

    if X_train.isnull().values.any():
        X_train = X_train.fillna(X_train.median())

    if X_test is not None and X_test.isnull().values.any():
        X_test = X_test.fillna(X_test.median())

    print("After encoding, first row and output")
    print("X_train\n", X_train.head())
    print("X_train.columns\n", list(X_train.columns))
    print("y_train\n", y_train.head())
    print()

    scalers = (None, None)
    data_and_scalers = {"scalers": scalers}

    if feat_scaler:

        print("scaling train and test data")

        scaler = StandardScaler()
        # you're going to perform scaling at training time before finalization
        if not enc_Xy:
            X_train_scaled = scaler.fit_transform(X_train)
            X_train = DataFrame(data=X_train_scaled,
                                columns=X_train.columns,
                                index=X_train.index)

            print()
            print("X_train shape:", X_train.shape)
            if X_test is not None:
                X_test_scaled = scaler.transform(X_test)
                X_test = DataFrame(data=X_test_scaled,
                                   columns=X_test.columns,
                                   index=X_test.index)
                print("X_test shape:", X_test.shape)

            print()
            print("After scaling...")
            print("X_train\n", X_train[:1])
            print("X_train type", type(X_train))
            if X_test is not None:
                print("X_test\n", X_test[:1])
                print("X_test type", type(X_test))
            print()

        scalers = (scaler, None)
        data_and_scalers["scalers"] = scalers

    print("scoring:", scoring)
    # tgt_scaler = False if scoring == 'neg_rmsle' else True
    # standard scaling introduces negative values,
    # which can't be fed to log, hence to rmsle

    if tgt_scaler:
        print("Scaling target...")

        if scoring != 'neg_rmsle':
            y_scaler = StandardScaler()
            y_train = y_scaler.fit_transform(y_train.values.reshape(
                -1, 1)).ravel()
        else:
            y_scaler = MinMaxScaler()
            y_train = y_scaler.fit_transform(y_train.values.reshape(-1, 1))

        print("y_train and its type\n", (y_train[:1], type(y_train)))

        if not enc_Xy:
            if scoring != 'neg_rmsle':
                y_test = y_scaler.transform(y_test.values.reshape(-1,
                                                                  1)).ravel()
            else:
                # transform (not fit_transform): the scaler is already fit
                y_test = y_scaler.transform(y_test.values.reshape(-1, 1))

            print("y_test and its type\n", (y_test[:3], type(y_test)))

        scalers = (scalers[0], y_scaler)
        data_and_scalers["scalers"] = scalers

        print()

    # this works for classifiers
    # featsel_tuple = eu.create_feature_selector(X_train, None, seed)

    if feat_select and X_train.shape[1] > 10:

        lsvr = LinearSVR(max_iter=10000)
        lsvr = lsvr.set_params(C=0.01,
                               loss="squared_epsilon_insensitive",
                               dual=False)
        # threshold can be a float like 1e-2 or 1e-1, or "mean"/"median"
        thsd = "median"
        featselector = SelectFromModel(lsvr, threshold=thsd)
        # tscv_fs = TimeSeriesSplit(n_splits=5)
        # featselector = RFECV(lsvr, step=1, cv=tscv_fs)

        data_and_scalers["f_selector"] = featselector

        if not enc_Xy:
            # featselector = featsel_tuple[1]
            X_train_selected = featselector.fit_transform(X_train, y_train)
            xtr_indices = featselector.get_support()
            X_train = DataFrame(data=X_train_selected,
                                columns=X_train.columns[xtr_indices],
                                index=X_train.index)

            print("After feature selection...")
            print("X_train shape:", X_train.shape)
            if X_test is not None:
                X_test_selected = featselector.transform(X_test)
                xtt_indices = featselector.get_support()
                X_test = DataFrame(data=X_test_selected,
                                   columns=X_test.columns[xtt_indices],
                                   index=X_test.index)

                print("X_test shape:", X_test.shape)

    data_and_scalers["data"] = (X_train, X_test, y_train, y_test)

    return data_and_scalers
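# A hedged usage sketch for split_and_encode_Xy; the toy frame is made up,
# and the 'lc'/'eu' helper modules the function calls must be importable
# exactly as in the original file for this to run.
import numpy as np
import pandas as pd

X_toy = pd.DataFrame({'a': np.random.rand(40), 'b': np.random.rand(40)})
y_toy = pd.Series(np.random.rand(40))

result = split_and_encode_Xy(X_toy, y_toy, encoding='le', feat_select=False)
X_train, X_test, y_train, y_test = result['data']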
Example No. 9
    print('Performing grid search on RF')
    n_features = X.shape[1]
    params = {'max_features': ['auto', 'sqrt', 'log2']}
    RF_model = GridSearchCV(RF_est, params)
    RF_model.fit(X, y)
    print('Best {}'.format(RF_model.best_params_))

    print('Performing grid search on GBR')
    n_features = X.shape[1]
    params = {'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [2, 3]}
    GBR_model = GridSearchCV(GBR_est, params)
    GBR_model.fit(X, y)
    print('Best {}'.format(GBR_model.best_params_))
else:
    Lin_model = Lin_est.set_params(alpha=100.0)
    SVR_model = svr_est.set_params(C=1.0)
    RF_model = RF_est.set_params(max_features='auto')
    GBR_model = GBR_est.set_params(max_features='auto', max_depth=3)

#%% Specify set of models to test
model_set = [('Null', LCM.rand_pick_mod()), ('Lin', Lin_model),
             ('Lin_SVR', SVR_model), ('GBR', GBR_model), ('RF', RF_model)]
# model_set = [('Null',LCM.rand_pick_mod()),
#             ('Lin', Lin_model),
#              ('RF', RF_model)]
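# A hedged sketch of comparing the models in model_set; X and y are assumed
# to be the feature/target arrays prepared earlier in the (elided) script.
from sklearn.model_selection import cross_val_score

for name, model in model_set:
    scores = cross_val_score(model, X, y, cv=5)
    print('{}: mean CV score {:.3f}'.format(name, scores.mean()))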

leg_titles = {
    'Null': 'Random\nPicking',
    'Lin': 'Linear\nModel',
    'Lin_SVR': 'Linear SVM',
    'GBR': 'Gradient\nBoosting',