def fit(self, tree, model_name):
    """Fit a tree-based classifier, evaluate it, persist it, and report
    ROC curves plus feature importances.

    Args:
        tree: sklearn estimator class (e.g. DecisionTree/RandomForest classifier).
        model_name: label used for logging, file naming, and to decide which
            constructor arguments apply ('DT' means a single decision tree).
    """
    self.initialize(model_name)
    self.load()
    self.log_params()

    # A single decision tree takes no ensemble/bootstrap/parallelism options.
    if model_name == 'DT':
        model = tree(max_depth=self.max_depth,
                     min_samples_split=self.min_samples_split,
                     min_samples_leaf=self.min_samples_leaf,
                     max_features=self.max_features)
    else:
        model = tree(n_estimators=self.n_estimators,
                     max_depth=self.max_depth,
                     min_samples_split=self.min_samples_split,
                     min_samples_leaf=self.min_samples_leaf,
                     max_features=self.max_features,
                     bootstrap=self.bootstrap,
                     n_jobs=self.n_jobs,
                     verbose=self.verbose)

    model.fit(self.X_train.values, self.Y_train.values)
    print(f"{model_name} is fitted")

    # Optional k-fold cross validation over the full data set.
    if self.should_cross_val:
        scores = cross_val_score(model, self.X, self.Y, cv=self.k, verbose=0)
        self.log.info(
            f"---- Cross validation with {self.k} groups----\n\nThe results on each split"
            + str(scores) + "\n")
        self.log.info(
            f"The average of the cross validation is {np.mean(scores):.2f}\n")
        print(
            f"|- Cross validation is done for {model_name}. Accuracy: {np.mean(scores):.2f} -|")

    evaluate_classification(
        ['OnTrain', self.X_train, self.Y_train, self.dates_train],
        ['OnTest', self.X_test, self.Y_test, self.dates_test],
        direc=self.directory,
        model=model,
        model_name=model_name,
        logger=self.log,
        slicer=1)

    joblib.dump(model, self.directory + f"/{model_name}.pkl")

    plot_roc(pd.get_dummies(self.Y_test, drop_first=False).values,
             model.predict_proba(self.X_test),
             self.dl.classes_,
             self.directory)

    # Plotting the Importances
    report_feature_importance(self.directory, model.feature_importances_,
                              self.X, self.Y, self.n_top_features,
                              model_name, self.log)
def fit(self):
    """Fit an L1-regularized logistic regression, evaluate it on the train
    and test splits, plot ROC, persist the model, and report
    coefficient-based feature importances."""
    model = LogisticRegression(C=100, fit_intercept=True,
                               penalty='l1', solver='liblinear')
    model.fit(self.X_train, self.Y_train)
    self.dl.log.info("---Model Coeffs---\n" + str(model.coef_))
    # NOTE(review): removed a locally built {feature: coef} dict (plus the
    # intercept under key 'c') that was never read after construction —
    # dead code; the raw coefficients are already logged above.
    evaluate_classification(
        ['OnTrain', self.X_train, self.Y_train, self.dates_train],
        ['OnTest', self.X_test, self.Y_test, self.dates_test],
        direc=self.directory,
        model=model,
        model_name=self.model_name,
        logger=self.log,
        slicer=1)
    plot_roc(pd.get_dummies(self.Y_test, drop_first=False).values,
             model.predict_proba(self.X_test),
             self.dl.classes_,
             self.directory)
    # f-string had no placeholders; plain literal is equivalent.
    joblib.dump(model, self.directory + "/Logit.pkl")
    # Plotting the Importances
    report_feature_importance(self.directory, model.coef_[0], self.X, self.Y,
                              self.n_top_features, "Logit", self.log)
def fit(self):
    """Fit an SVC with the configured hyper-parameters, evaluate it on the
    train and test splits, persist it, and — for a linear kernel only —
    report coefficient-based feature importances."""
    self.set_params()
    self.log_params()
    self.model = SVC(C=self.C, kernel=self.kernel, gamma=self.gamma)
    self.model.fit(self.X_train, self.Y_train)
    evaluate_classification(
        ['OnTrain', self.X_train, self.Y_train, self.dates_train],
        ['OnTest', self.X_test, self.Y_test, self.dates_test],
        direc=self.directory,
        model=self.model,
        model_name=self.model_name,
        logger=self.log,
        slicer=1)
    joblib.dump(self.model, self.directory + f"/{self.model_name}.pkl")
    # Plotting the Importances
    # coef_ is only defined for a linear kernel.
    if self.kernel == 'linear':
        # FIX(review): the other call sites pass
        # (direc, importances, X, Y, n_top_features, name, log); this one
        # passed X_train.columns in place of X and dropped Y entirely,
        # which cannot match the same signature. Aligned with the sibling
        # calls — confirm against report_feature_importance's definition.
        report_feature_importance(self.directory, self.model.coef_[0],
                                  self.X, self.Y, self.n_top_features,
                                  self.model_name, self.log)
def fit(self, n=5):
    """Fit a K-nearest-neighbours classifier and evaluate it on the train
    and test splits.

    Args:
        n: number of neighbours (default 5).
    """
    self.log.info(f'KNN Classifier is about to be fit on {self.name} with n = {n}')
    model = KNeighborsClassifier(n_neighbors=n, n_jobs=-1)
    model.fit(self.X_train, self.Y_train)
    # FIX: the original passed a bare `model_name`, which is undefined in
    # this scope and raised NameError at runtime; sibling classes pass
    # self.model_name here.
    evaluate_classification(
        ['OnTrain', self.X_train, self.Y_train, self.dates_train],
        ['OnTest', self.X_test, self.Y_test, self.dates_test],
        direc=self.directory,
        model=model,
        model_name=self.model_name,
        logger=self.log,
        slicer=1)
def get_report(self):
    """Load the persisted model, predict on both splits, and log the
    train/test classification reports (predictions supplied explicitly)."""
    self.load_model()
    train_pred = self.predict_set(self.X_train)
    test_pred = self.predict_set(self.X_test)
    train_pack = ['OnTrain', self.X_train, self.Y_original_train,
                  self.dates_train, train_pred]
    test_pack = ['OnTest', self.X_test, self.Y_original_test,
                 self.dates_test, test_pred]
    evaluate_classification(train_pack, test_pack,
                            direc=self.directory,
                            model=self.model,
                            model_name=self.model_name,
                            logger=self.log,
                            slicer=1)
def fit(self):
    """Baseline 'random guess' classifier: draw labels i.i.d. from the
    empirical class distribution of Y and evaluate those draws on the
    train and test splits (no model object is produced)."""
    class_probs = self.Y.value_counts()
    class_probs = class_probs / class_probs.sum()

    def _draw(size):
        # Sample `size` labels weighted by the observed class frequencies.
        return np.random.choice(class_probs.index, size=size,
                                p=class_probs.values)

    # Draw train first, then test — same RNG consumption order as before.
    train_guess = _draw(len(self.Y_train))
    test_guess = _draw(len(self.Y_test))

    evaluate_classification(
        ['OnTrain', self.X_train, self.Y_train, self.dates_train, train_guess],
        ['OnTest', self.X_test, self.Y_test, self.dates_test, test_guess],
        direc=self.directory,
        model_name=self.model_name,
        logger=self.log,
        slicer=1)