Example #1
def fit(self, X, y, sample_weight=None, **fit_params):
    log.info("hyperparameter optimizing: " + str(self.model))
    # Bayesian optimization over self.hyperparameter_bounds; each candidate is scored
    # by cross-validated, sample-weighted evaluation of a fresh copy of self.model.
    bo = bayes_opt.BayesianOptimization(
        functools.partial(cv_weighted_instantiated_model, self.model, X, y,
                          sample_weight, self.kf, [self.metric], self.parallel),
        self.hyperparameter_bounds)
    bo.maximize(init_points=5, n_iter=30, acq="ei", xi=1e-4)  # go greedy (low xi) b/c this takes a long time
    # bo.res["max"] is the pre-1.0 bayes_opt result format; cast each optimal value
    # back to the type of its bound (e.g. int-valued hyperparameters stay ints).
    optimal_hyperparameters = {
        hyperparameter: bo.res["max"]["max_params"][hyperparameter].astype(
            type(self.hyperparameter_bounds[hyperparameter][0]))
        for hyperparameter in self.hyperparameter_bounds}
    log.info("optimal: " + str(optimal_hyperparameters))
    self.model.set_params(**optimal_hyperparameters)
    self.model.fit(X, y, sample_weight=sample_weight)
    return self.model
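The extraction above targets the pre-1.0 bayes_opt result format (bo.res["max"]["max_params"]). In bayes_opt 1.x and later, the best point is exposed as the dict bo.max instead; a minimal sketch of the equivalent extraction under that newer API (acquisition settings omitted here):

bo.maximize(init_points=5, n_iter=30)
optimal_hyperparameters = {
    name: type(bounds[0])(bo.max["params"][name])  # cast back to the bound's type (int/float)
    for name, bounds in self.hyperparameter_bounds.items()
}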
Example #2
def fit_model_(model, X, y, **fit_params):
    ''' Fits a pickled model.  Handy for multiprocessing and such '''
    import ml_battery.log as log
    name, estimator = model
    log.info("training " + name)
    start = time.time()
    estimator = pickle.loads(estimator)
    estimator = estimator.fit(X,y,**fit_params) # in case of hyperparameter optimized thingies, this returns the underlying estimator
    estimator = pickle.dumps(estimator)
    end = time.time()
    log.info("finished training " + name)
    return ((name, estimator), (name, end-start))
Example #3
def fit(self, X, y, **fit_params):
    if self.parallel:
        # Fit every estimator in its own process; estimators travel as pickled bytes.
        fitted_models_and_times = joblib.Parallel(n_jobs=32)(
            joblib.delayed(fit_model_)(model, X, y, **fit_params)
            for model in pickle_estimators(self.estimators))
        fitted_models, times = zip(*fitted_models_and_times)
        self.estimators, self.fit_times = unpickle_estimators(fitted_models), dict(times)
        return self
    else:
        self.fit_times = {}
        for i, (name, estimator) in enumerate(self.estimators):
            log.info("training: " + name)
            log.info("opened sessions: " + str(n_opened_sessions()))
            start = time.time()
            self.estimators[i] = (name, estimator.fit(X, y, **fit_params))  # in case of hyperparameter optimized thingies, this returns the underlying estimator
            end = time.time()
            self.fit_times[name] = end - start
        return self
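Examples #2 and #3 rely on pickle_estimators and unpickle_estimators helpers that turn the (name, estimator) pairs into bytes and back so they can cross process boundaries with joblib. Those helpers aren't shown on this page; a minimal sketch of what they might look like (the names match the calls above, but the bodies are an assumption):

import pickle

def pickle_estimators(estimators):
    # (name, estimator) pairs -> (name, bytes) pairs handed to the worker processes
    return [(name, pickle.dumps(estimator)) for name, estimator in estimators]

def unpickle_estimators(estimators):
    # (name, bytes) pairs coming back from the workers -> (name, estimator) pairs
    return [(name, pickle.loads(blob)) for name, blob in estimators]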
Example #4
def fit(self, X, y=None, **fit_params):
    # y is ignored
    log.info("fitting codebook")
    self.label_encoders = {}
    self.onehot_encoders = {}
    self.numeric_columns = []
    # If the full set of possible values is supplied, fit the encoders on it so that
    # categories missing from this particular X still get a column at transform time.
    encoder_X = self.X_possible_values if self.X_possible_values is not None else X
    for col in X.columns:
        if (col in self.codebook.index) and (self.codebook.loc[col]["Type"] == "C"):
            # categorical column according to the codebook: label-encode the values,
            # then one-hot encode the resulting label ids
            label_encoder = sklearn.preprocessing.LabelEncoder().fit(encoder_X[[col]].astype(str))
            onehot_encoder = sklearn.preprocessing.OneHotEncoder().fit(np.arange(len(label_encoder.classes_)).reshape((-1, 1)))
            self.label_encoders[col] = label_encoder
            self.onehot_encoders[col] = onehot_encoder
        else:
            self.numeric_columns.append(col)
    if self.scale and self.numeric_columns:
        self.scaler = sklearn.preprocessing.MinMaxScaler()
        self.scaler.fit(X[self.numeric_columns])
    return self
Example #5
def transform(self, X):
    onehotted_dataframes = []
    for col in X.columns:
        log.info("transforming column: " + str(col))
        if (col in self.codebook.index) and (self.codebook.loc[col]["Type"] == "C"):
            # label-encode the categorical column, then one-hot encode the label ids
            onehot_encoded = self.onehot_encoders[col].transform(
                self.label_encoders[col].transform(
                    X[[col]].astype(str)
                ).reshape((-1, 1))).toarray()
            onehotted_dataframes.append(
                pd.DataFrame(
                    onehot_encoded,
                    columns=[col + self.sep + cls for cls in self.label_encoders[col].classes_],
                    index=X.index
                ))
    # drop the original categorical columns; their one-hot versions are joined back below
    X = X.drop(list(self.onehot_encoders), axis=1)
    if self.scale and self.numeric_columns:
        X = pd.DataFrame(self.scaler.transform(X[self.numeric_columns]), columns=self.numeric_columns, index=X.index)
    return X.join(onehotted_dataframes)
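Taken together, examples #4 and #5 implement a codebook-driven one-hot encoder: columns the codebook marks with Type "C" are label- and then one-hot-encoded, everything else is treated as numeric and optionally min-max scaled. A small usage sketch, with the class and constructor parameter names assumed rather than taken from this page:

import pandas as pd

codebook = pd.DataFrame({"Type": ["C"]}, index=["color"])              # hypothetical codebook
X = pd.DataFrame({"color": ["red", "blue", "red"], "age": [10, 20, 30]})

enc = CodebookOneHotEncoder(codebook=codebook, scale=True)             # hypothetical class name/params
X_t = enc.fit(X).transform(X)
# "age" ends up min-max scaled; "color" becomes one column per class, named col + sep + class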
Example #6
    def fit_(self, X, y, sample_weight=None, feed_dict_extras={}):
        ''' Trains for a number of epochs.  Model input must be in self.model.x, output in self.model.y, loss in self.model.loss, and training using self.model.train_step '''
        if not hasattr(self, "batch_size"):
            self.batch_size = None
        if sample_weight is None:
            # default to uniform weights so the feed_dict below always has something to feed
            sample_weight = np.ones(X.shape[0])

        for epoch in range(self.n_epochs):
            batcher = np_batcher(X.shape[0], self.batch_size)
            for batch in batcher:
                feed_dict = {
                    self.model.x: X[batch],
                    self.model.y: y[batch],
                    self.model.sample_weight: sample_weight[batch]
                }
                feed_dict.update(feed_dict_extras)
                loss, _ = self.sess.run(
                    (self.model.loss, self.model.train_step), feed_dict)

                log.info("epoch: " + str(epoch) + " :::: loss: " + str(loss.sum()))

        return self
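This training loop assumes an np_batcher(n, batch_size) helper that yields index batches covering one pass over the data; the helper isn't shown on this page, but a sketch consistent with how it is used above (batch_size=None meaning a single full-data batch; the shuffling is an extra assumption) could be:

import numpy as np

def np_batcher(n, batch_size=None):
    # Yield arrays of row indices covering range(n), batch_size rows at a time.
    if batch_size is None:
        batch_size = n
    indices = np.random.permutation(n)
    for start in range(0, n, batch_size):
        yield indices[start:start + batch_size]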
Example #7
    def fit(self, X, y, sample_weight=None):
        start = time.time()
    
        self.set_output_shape_(y)
        #first, fit the multiestimator, to do the whole hyperparameter thing
        log.info("training multiestimator")
        self.multiestimator.fit(X,y,sample_weight=sample_weight)
        
        # check X, y, sample_weight... the built-in CV splitting doesn't play nicely with pandas objects
        X,y = sklearn.utils.check_X_y(X,y)
        
        if sample_weight is not None:
            sample_weight = np.array(sample_weight)
            
        self.kfold.get_n_splits(X)
        #for each cv split, train the models on the train splits, predict on the test split, and keep those as new features for the meta estimator
        new_features = np.zeros((X.shape[0], len(self.multiestimator.estimators)*self.output_shape_))
        for train_index, test_index in self.kfold.split(X,y):
            X_meta_train, X_meta_test = X[train_index], X[test_index]
            y_meta_train, y_meta_test = y[train_index], y[test_index]
            if sample_weight is not None:
                sample_weight_meta_train, sample_weight_meta_test = sample_weight[train_index], sample_weight[test_index]
            else:
                sample_weight_meta_train, sample_weight_meta_test = None, None
            cloned_multi = sklearn.base.clone(self.multiestimator)
            log.info("cv training")
            cloned_multi.fit(X_meta_train, y_meta_train, sample_weight=sample_weight_meta_train)
            new_features[test_index] = self.get_stacked_features_(cloned_multi, X_meta_test) 
            cloned_multi.cleanup() #clean up temporary estimators

        new_X = np.hstack((X,new_features))
        log.info("training metaestimator")
        self.metaestimator.fit(new_X, y, sample_weight=sample_weight)
        log.info("finished training metaestimator")
        
        end = time.time()
        self.fit_time = end-start
        
        return self   
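The stacking above leans on self.get_stacked_features_(cloned_multi, X_meta_test), which isn't shown here. Given that new_features is allocated as (n_samples, n_estimators * output_shape_), a plausible sketch is to stack each sub-estimator's out-of-fold predictions side by side; this is an assumption about the original method, not its actual code:

    def get_stacked_features_(self, multi, X):
        # One block of output_shape_ columns per sub-estimator, in estimator order.
        # Assumes classifiers expose predict_proba and regressors fall back to predict.
        blocks = []
        for name, estimator in multi.estimators:
            if hasattr(estimator, "predict_proba"):
                blocks.append(estimator.predict_proba(X))
            else:
                blocks.append(estimator.predict(X).reshape((-1, 1)))
        return np.hstack(blocks)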
Example #8
def transform(self, X, *args, **kwargs):
    log.info("performing feature selection")
    new_X = super().transform(X, *args, **kwargs)
    # keep the pandas column names of the features that survived selection
    return pd.DataFrame(new_X, columns=X.columns[self.get_support()])
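This transform is meant to sit on an sklearn feature selector so the selected columns keep their original names; a minimal sketch of how such a class could be assembled, assuming the base class is simply one of sklearn's selectors such as SelectKBest (the actual base class isn't shown on this page):

import pandas as pd
import sklearn.feature_selection

class NamedSelectKBest(sklearn.feature_selection.SelectKBest):  # hypothetical subclass
    def transform(self, X, *args, **kwargs):
        new_X = super().transform(X, *args, **kwargs)
        return pd.DataFrame(new_X, columns=X.columns[self.get_support()])

# X is a DataFrame with named columns; the k best columns keep their names:
# X_selected = NamedSelectKBest(k=10).fit(X, y).transform(X)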
Example #9
def __setstate__(self, d):
    log.info("unpickling tf model")
    if "model" in d:
        model = d["model"]
        del d["model"]
        self.__dict__.update(d)
        name = PickleableTFModel.TEMP_MODEL_FILE_ + str(random.random())
        path = os.path.join(os.getcwd(), name)
        files = []
        try:
            log.info("writing model bytes")
            with open(path, "wb") as f:
                f.write(model)
            log.info("extracting zip file")
            with zipfile.ZipFile(path, "r") as zf:
                zf.extractall()
                files = zf.namelist()
            # the checkpoint name is the extracted file name minus its extension
            ckpt = ".".join(files[0].split(".")[:-1])
            log.info("building model")
            self.build_model()
            with self.model.graph_.as_default():
                saver = tf.train.Saver(self.__getvariables__())
                log.info("creating session")
                self.sess = tf.Session(config=TF_CONFIG_,
                                       graph=self.model.graph_)
                log.info("restoring checkpoint: " + ckpt)
                saver.restore(self.sess, os.path.join(".", ckpt))
            log.info("tf model restored!")
        except Exception as e:
            log.info(e)
        finally:
            log.info("destroying the evidence")
            # remove the temporary zip and everything extracted from it, even on failure
            if os.path.exists(path):
                os.remove(path)
            for f in files:
                os.remove(f)
    else:
        self.__dict__.update(d)
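Because __setstate__ rebuilds the graph and restores the checkpoint from the stored bytes, a fitted model can round-trip through pickle like any other estimator (the matching __getstate__, which presumably zips the checkpoint files into d["model"], isn't shown on this page). A usage sketch:

import pickle

blob = pickle.dumps(fitted_tf_model)   # hypothetical fitted PickleableTFModel instance
restored = pickle.loads(blob)          # __setstate__ above rebuilds the graph and session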