def main():
    """Demonstrate shift detection on a single wine-quality feature.

    Fits a random-forest pipeline on the ``volatile_acidity`` column, feeds
    one half of the held-out data to the shift detector unchanged, multiplies
    part of the other half by 3, plots both halves, and prints the detector's
    verdict.
    """
    frame = wine.wine_quality_red_csv()
    print(frame.columns)

    # Single-feature design matrix of shape (n_samples, 1).
    feature = frame['volatile_acidity'].values.reshape(-1, 1)
    labels = frame['class']

    X_train, X_holdout, y_train, y_holdout = split(
        feature, labels, test_size=0.2, random_state=0)
    # Halve the hold-out set: a reference half and a half to perturb.
    reference, to_perturb, _, _ = split(
        X_holdout, y_holdout, test_size=.5, random_state=0)

    model = (WineQualityPipeline()
             .with_estimator(RandomForest())
             .fit(X_train, y_train))

    detector = SklearnDataShiftDetector(model, n_bins=30)
    detector.iteration(reference)

    # Work on a copy so the untouched half stays available for the mask.
    shifted = deepcopy(to_perturb)
    in_band = np.logical_and(to_perturb > .4, to_perturb < 1.)
    shifted[in_band] *= 3.

    plt.plot(range(reference.shape[0]), reference, 'go')
    plt.plot(range(shifted.shape[0]), shifted, 'r^')
    plt.show()

    detector.iteration(shifted)
    print(detector.data_is_shifted())
def _load_reviews(path):
    """Read a JSON-lines review file and return ``(texts, ratings)``.

    Every line must be a JSON object holding a ``"reviewText"`` string and
    an ``"overall"`` rating; ratings are converted to ``float``.
    """
    texts, ratings = [], []
    # The context manager closes the file even when a line fails to parse.
    with open(path, encoding='utf-8') as handle:
        for line in handle:
            record = json.loads(line)
            texts.append(record["reviewText"])
            ratings.append(float(record["overall"]))
    return texts, ratings


def main():
    """Check for a data shift between two Amazon review categories.

    Fits a hashing + random-forest pipeline on the first ``size``
    Electronics reviews, then feeds the first ``size`` Electronics and Books
    reviews to the shift detector and prints whether a shift was detected.
    """
    folder = os.path.join(get_resource_path(), 'data/amazon')
    X_train, y_train = _load_reviews(
        os.path.join(folder, 'Electronics_5.json'))
    # Only the review texts of the second category are needed.
    X_test, _ = _load_reviews(os.path.join(folder, 'Books_5.json'))

    # One knob for both the training subset and the detector batches.
    size = 100
    pipeline = HashingPipeline(RandomForest())
    model = pipeline.fit(X_train[:size], y_train[:size])

    shift_detector = SklearnDataShiftDetector(model, n_bins=1000)
    shift_detector.iteration(X_train[:size])
    shift_detector.iteration(X_test[:size])
    print(shift_detector.data_is_shifted())
def test_typos_in_data_with_random_forest(self):
    """Typos injected into random columns must produce critical range findings."""
    forest = RandomForest()
    model = self.pipeline.with_estimator(forest).fit(self.X_train,
                                                     self.y_train)

    typo_injector = Typos()
    # Drawing without replacement raises when fewer than 3 features exist.
    n_corrupted = 3
    corrupted_columns = np.random.choice(self.features, n_corrupted,
                                         replace=False)
    corrupted_X_test = typo_injector.run(self.X_test,
                                         columns=corrupted_columns)

    pipeline_profile = SklearnPipelineProfiler().on(model)
    configured_suite = self.automated_suite.with_profiles(self.data_profile,
                                                          pipeline_profile)
    tests, warnings = configured_suite.on(corrupted_X_test)

    # NOTE(review): profiles are paired with the drawn columns by position,
    # not by column name -- confirm this is intended.
    for _, profile in zip(corrupted_columns, self.data_profile.profiles):
        if profile.scale == DataScale.NOMINAL:
            expected_test = Test(Severity.CRITICAL).is_in_range(profile)
            self.assertIn(expected_test, tests)

            text = Message().not_in_range % (profile.column_name,
                                             str(profile.range))
            expected_warning = Warning(ErrorType.NOT_IN_RANGE,
                                       Severity.CRITICAL, text)
            self.assertIn(expected_warning, warnings)
def _print_warnings(header, warnings):
    """Print *header*, then each warning under a banner when any exist."""
    print(header)
    if warnings:
        print("======= WARNINGS =======")
        for warn in warnings:
            print(warn)


def main():
    """Run the manual and automated test suites on the wine-quality data.

    Fits a random-forest pipeline, prints its accuracy, then reports the
    warnings raised by a hand-written suite on clean and corrupted test data
    and by the automated suite on the corrupted test data.
    """
    data = wine.wine_quality_red_csv()
    print(data.shape)
    print(data.columns)

    target = "class"
    X, y = data[[col for col in data.columns if col != target]], data[target]
    X_train, X_test, y_train, y_test = split(X, y,
                                             test_size=0.2,
                                             random_state=0)

    pipeline = WineQualityPipeline()
    classifier = RandomForest(size=40)
    model = pipeline.with_estimator(classifier).fit(X_train, y_train)

    prediction = model.predict(X_test)
    print(accuracy_score(y_test, prediction))

    suite = TestSuite()
    automated_suite = AutomatedTestSuite()
    data_profile = DataFrameProfiler().on(X_train)
    pipeline_profile = SklearnPipelineProfiler().on(model)

    # Hand-written checks: completeness of one column, range of another.
    suite.add(Test().is_complete(
        data_profile.for_column('volatile_acidity')).is_in_range(
            data_profile.for_column('alcohol')))

    warnings = suite.on(X_test)
    _print_warnings("*** TEST_SUITE, X_TEST", warnings)

    # Inject explicit missing values into the column the suite checks.
    error_generator = ExplicitMissingValues()
    corrupted_X_test = error_generator.run(X_test, ['volatile_acidity'])

    warnings = suite.on(corrupted_X_test)
    _print_warnings("*** TEST_SUITE, CORRUPTED_X_TEST", warnings)

    tests, warnings = (automated_suite.with_profiles(
        data_profile, pipeline_profile).run(corrupted_X_test))
    _print_warnings("*** AUTOMATED_TEST_SUITE, CORRUPTED_X_TEST", warnings)
def comparing_models(X_train, X_test, y_train, y_test):
    """Run every model routine, in a fixed order, on the same data split."""
    dataset = (X_train, X_test, y_train, y_test)
    AdaBoost(*dataset)
    Logistic_Regression(*dataset)
    NaiveBayes(*dataset)
    XGBoost(*dataset)
    RandomForest(*dataset)
    SVM(*dataset)
    NeuralNetwork(*dataset)
def main():
    """Profile the credit-g CSV and a CreditG pipeline with a random forest.

    The pipeline is profiled as built; it is not fitted here.
    """
    from pipelines import CreditGPipeline
    from models import RandomForest

    credit_frame = pd.read_csv('resources/data/dataset_31_credit-g.csv')
    DataFrameProfiler().on(credit_frame)

    credit_pipeline = CreditGPipeline()
    pipeline_profiler = SklearnPipelineProfiler()
    pipeline_profiler.on(credit_pipeline.with_estimator(RandomForest(40)))
def trainWithHotEncoding(hot_encoded_train_features, hot_encoded_train_labels,
                         hot_encoded_test_features, hot_encoded_test_labels,
                         results, algorithms, isTesting):
    """Train, save and evaluate a random forest on hot-encoded features.

    Args:
        hot_encoded_train_features: Training feature matrix.
        hot_encoded_train_labels: Training labels; must support ``flatten()``.
        hot_encoded_test_features: Test feature matrix.
        hot_encoded_test_labels: Test labels.
        results: Mapping that receives the test-curve result under the key
            ``"RANDOM FOREST HOT ENCODING"``.
        algorithms: Mapping that receives the trained wrapper under the same
            key.
        isTesting: When truthy, the parameter grid is emptied so only one
            model is trained.

    Returns:
        Tuple ``(rf_hot_encoding, results, algorithms)``.
    """
    # TRAIN RANDOM FOREST
    flat_train_labels = hot_encoded_train_labels.flatten()
    # ``classes`` and ``y`` are keyword-only in current scikit-learn; the
    # keyword form is also accepted by older releases.
    class_weights = class_weight.compute_class_weight(
        'balanced',
        classes=np.unique(flat_train_labels),
        y=flat_train_labels)

    param_grid = {
        'max_features': [
            # None,
            # "sqrt",
            "log2"
        ],
        'n_estimators': [
            # 1000,
            2000,
            # 3000
        ]
    }
    if isTesting:
        print("TESTING MODE RF: ONLY TRAINING 1 MODEL")
        param_grid = {}

    rf_hot_encoding = RandomForest(class_weights=class_weights,
                                   param_grid=param_grid)
    rf_hot_encoding.title = "RANDOM FOREST HOT ENCODING TRAIN"
    rf_hot_encoding.train(hot_encoded_train_features, hot_encoded_train_labels)

    model_name = 'RF-HOT.model'
    print("SAVING MODEL: ", model_name)
    # Saving is best-effort: a failed dump must not abort the evaluation.
    try:
        joblib.dump(rf_hot_encoding.model, model_name)
    except Exception as e:
        print("Cannot save {} because: \n\n".format(model_name), str(e))

    algorithms["RANDOM FOREST HOT ENCODING"] = rf_hot_encoding

    # Curves on the training data are drawn for inspection only.
    rf_hot_encoding.drawCurves(X=hot_encoded_train_features,
                               y=hot_encoded_train_labels)

    # The title is switched before the test curves are drawn and stored.
    rf_hot_encoding.title = "RANDOM FOREST HOT ENCODING TEST"
    results["RANDOM FOREST HOT ENCODING"] = rf_hot_encoding.drawCurves(
        X=hot_encoded_test_features, y=hot_encoded_test_labels)

    return rf_hot_encoding, results, algorithms
def trainWithFrecuencies(tetra_freq_train_features, tetra_freq_train_labels,
                         tetra_freq_test_features, tetra_freq_test_labels,
                         results, algorithms, isTesting):
    """Train, save and evaluate a random forest on tetra-frequency features.

    Args:
        tetra_freq_train_features: Training feature matrix.
        tetra_freq_train_labels: Training labels; must support ``flatten()``.
        tetra_freq_test_features: Test feature matrix.
        tetra_freq_test_labels: Test labels.
        results: Mapping that receives the test-curve result under the key
            ``"RANDOM FOREST"``.
        algorithms: Mapping that receives the trained wrapper under the same
            key.
        isTesting: When truthy, the parameter grid is emptied so only one
            model is trained.

    Returns:
        Tuple ``(rf, results, algorithms)``.
    """
    flat_train_labels = tetra_freq_train_labels.flatten()
    # ``classes`` and ``y`` are keyword-only in current scikit-learn; the
    # keyword form is also accepted by older releases.
    class_weights = class_weight.compute_class_weight(
        'balanced',
        classes=np.unique(flat_train_labels),
        y=flat_train_labels)

    param_grid = {
        'max_features': [
            # None,
            # "sqrt",
            "log2"
        ],
        'n_estimators': [
            # 1000,
            2000,
            # 3000
        ]
    }
    if isTesting:
        print("TESTING MODE RF: ONLY TRAINING 1 MODEL")
        param_grid = {}

    rf = RandomForest(class_weights=class_weights, param_grid=param_grid)
    rf.title = "RANDOM FOREST TETRA NUCLEOTIDE FREQUENCY TRAIN"
    rf.train(tetra_freq_train_features, tetra_freq_train_labels)

    model_name = 'RF-TETRA.model'
    print("SAVING MODEL USING JOBLIB: ", model_name)
    # Saving is best-effort: a failed dump must not abort the evaluation.
    try:
        joblib.dump(rf.model, model_name)
    except Exception as e:
        print("Cannot save {} because: \n\n".format(model_name), str(e))

    algorithms["RANDOM FOREST"] = rf

    # Curves on the training data are drawn for inspection only.
    rf.drawCurves(X=tetra_freq_train_features, y=tetra_freq_train_labels)

    # The title is switched before the test curves are drawn and stored.
    rf.title = "RANDOM FOREST TETRA NUCLEOTIDE FREQUENCY TEST"
    results["RANDOM FOREST"] = rf.drawCurves(X=tetra_freq_test_features,
                                             y=tetra_freq_test_labels)

    return rf, results, algorithms
def main(grid):
    """Run every model on the cleaned data, printing any failure.

    Linear regression uses the raw target; every later model uses the
    binarized target. An exception in one model is printed and does not stop
    the remaining models.
    """

    def attempt(step):
        # Each step runs in isolation: print the error and carry on.
        try:
            step()
        except Exception as err:
            print(err)

    # Get Clean Data
    X, Y = read_clean_data()

    # Linear Regression
    attempt(lambda: LinearRegression(X, Y, grid))

    # Binarize Y
    Y_binary = BinaryY(Y)

    # Names are resolved inside each lambda so that a missing model is
    # reported by ``attempt`` like any other failure.
    classifier_steps = (
        lambda: LogisticRegression(X, Y_binary, grid),  # Logistic Regression
        lambda: DecisionTree(X, Y_binary, grid),        # Decision Tree
        lambda: SVM(X, Y_binary, grid),                 # Support Vector Machine
        lambda: RandomForest(X, Y_binary, grid),        # Random Forest
        lambda: Bagging(X, Y_binary, grid),             # Bagging Classifier
        lambda: NeuralNet(X, Y_binary, grid),           # Neural Network
    )
    for step in classifier_steps:
        attempt(step)
def test_missing_values_in_data_with_random_forest(self):
    """Missing values in random columns must produce critical completeness findings."""
    forest = RandomForest()
    model = self.pipeline.with_estimator(forest).fit(self.X_train,
                                                     self.y_train)

    missing_injector = ExplicitMissingValues()
    # Drawing without replacement raises when fewer than 3 features exist.
    n_corrupted = 3
    corrupted_columns = np.random.choice(self.features, n_corrupted,
                                         replace=False)
    corrupted_X_test = missing_injector.run(self.X_test,
                                            columns=corrupted_columns)

    pipeline_profile = SklearnPipelineProfiler().on(model)
    configured_suite = self.automated_suite.with_profiles(self.data_profile,
                                                          pipeline_profile)
    tests, warnings = configured_suite.on(corrupted_X_test)

    # NOTE(review): profiles are paired with the drawn columns by position,
    # not by column name -- confirm this is intended.
    pairs = zip(corrupted_columns, self.data_profile.profiles)
    for column_name, profile in pairs:
        expected_test = Test(Severity.CRITICAL).is_complete(profile)
        self.assertIn(expected_test, tests)

        expected_warning = Warning(ErrorType.MISSING_VALUE,
                                   Severity.CRITICAL,
                                   Message().not_complete % column_name)
        self.assertIn(expected_warning, warnings)
def __init__(self):
    """Build the registries of datasets, classifiers, error generators and
    profile filters, plus the table that collects the results."""

    def is_numeric(profile):
        return profile.dtype in [DataType.INTEGER, DataType.FLOAT]

    def is_string(profile):
        return profile.dtype == DataType.STRING

    def accept_all(profile):
        return True

    def is_nominal_numeric(profile):
        return (profile.scale == DataScale.NOMINAL
                and profile.dtype in [DataType.INTEGER, DataType.FLOAT])

    self.resource_folder = get_resource_path()

    # name -> (data file, presumably the target column, pipeline instance)
    self.pipelines = {
        'credit-g': ('credit-g/dataset_31_credit-g.csv', 'class',
                     CreditGPipeline()),
        'wine-quality': ('wine-quality/wine-quality-red.csv', 'class',
                         WineQualityPipeline()),
        'wq-missing': ('wine-quality/wine-quality-red.csv', 'class',
                       WineQualityMissingPipeline()),
        'abalone': ('abalone/abalone.csv', 'Rings', AbalonePipeline()),
        'adult': ('adult/adult.csv', 'class', AdultPipeline()),
        'adult-missing': ('adult/adult.csv', 'class',
                          AdultMissingPipeline()),
        'heart': ('heart/heart.csv', 'class', HeartPipeline()),
    }

    # short name -> classifier instance
    self.classifiers = {
        'dtc': DecisionTree(),
        'rfc40': RandomForest(size=40),
        'ertc40': ExtremelyRandomizedTrees(size=40),
        'xgb': XGB(),
        'svm': SVM(),
        'lsvm': LinearSVM(),
        'knn': KNN(n_neighbors=7),
        'logreg': LogRegression(),
        'gaus': GausNB(),
        'brfc40': BaggingRandomForest(size=40),
        'mlpc': MLPC(input_size=[16, 32, 16, 8]),
    }

    # label -> (error generator, predicate over a column profile)
    self.error_gens = {
        'numeric anomalies': (Anomalies(), is_numeric),
        'typos': (Typos(), is_string),
        'explicit misvals': (ExplicitMissingValues(), accept_all),
        'implicit misvals': (ImplicitMissingValues(), accept_all),
        'swap fields': (SwapFields(), accept_all),
    }

    # Presumably the corruption fractions to sweep -- TODO confirm.
    self.params = [0.01, 0.05, 0.1, 0.2, 0.3, 0.5, 0.8]

    # NOTE(review): 'num disc' and 'num cont' use the same predicate --
    # confirm whether 'num cont' should test a different scale.
    self.tests = {
        'num disc': is_nominal_numeric,
        'num cont': is_nominal_numeric,
        'string': is_string,
    }

    self.results = Table(rows=sorted(self.pipelines),
                         columns=sorted(self.classifiers),
                         subrows=self.tests.keys(),
                         subcolumns=self.error_gens.keys())
# Forecast over the span of the Prophet model's test index, then report the
# RMSE and plot the evaluation.
forecast_pr = model_pr.predict(start=model_pr.test.index[0], end=model_pr.test.index[-1])
eval_pr = EvaluateModel(model_pr.test, forecast_pr)
print('RMSE for Prophet model: ', eval_pr.rmse())
eval_pr.plot(label='Prophet')

# ======================================================================================================================
# Random Forest
# ======================================================================================================================
FE = FeatureEngineering(DL)
# Presumably adds lag-1 and lag-12 copies of 'Customers' -- TODO confirm.
FE.generate_lags(features_to_lag=['Customers'], lags=[1, 12])
# The return value is discarded here; presumably the call is made for its
# effect on FE -- TODO confirm.
FE.split_features_target(type='train')
model_rf = RandomForest(FE)
model_rf.fit(trend='additive', seasonality=12)
X_test, y_test = FE.split_features_target(type='test')
# Restrict the test features to the columns in model_rf.selected_features.
X_test = X_test[model_rf.selected_features]
forecast_rf = model_rf.predict(X_test)
# Re-index the forecast in place with the test index before evaluating it.
forecast_rf.set_index(keys=model_rf.test.index, inplace=True)
eval_rf = EvaluateModel(model_rf.test[model_rf.target], forecast_rf)
print('RMSE for Random Forest model: ', eval_rf.rmse())
eval_rf.plot(label='Random Forest')

# ======================================================================================================================
# Extra-tree regressor
# ======================================================================================================================
# A fresh FeatureEngineering instance replaces the one used above.
FE = FeatureEngineering(DL)
'[+] Loading last saved BaseLine class. Delete it if you want to train a new model!' ) curr_dir = os.path.dirname(os.path.abspath(__file__)) BaseLine_path = os.path.abspath(curr_dir + f"/utils/model/BaseLine.bcls") BaseLine_file = open(BaseLine_path, 'rb') model = pickle.load(BaseLine_file) else: logging.info('[+] Training on selected model...') if model == "sgd_classifier": model = _SGDClassifier(training_dataloader=training_dataloader) elif model == "naive_bayesian": model = NaiveBayesian(training_dataloader=training_dataloader) elif model == "support_vector_machine": model = SupportVectorMachine(training_dataloader=training_dataloader) elif model == "random_forest": model = RandomForest(training_dataloader=training_dataloader) elif model == "logistic_regression": model = _LogisticRegression(training_dataloader=training_dataloader) else: print("[?] Invalid Model!") sys.exit(1) model.train() # ------------ testing and statistical process based on pre-trained model # -------------------------------------------------------------------------- statistics = model.stat() print("\t- Accuracy : ", statistics["accuracy"]) print("\t- Precision : ", statistics["precision"]) print("\t- Recall : ", statistics["recall"]) print("\t- f1-score : ", statistics["f1_score"]) print("\t- ROC AUC score : ", statistics["roc_auc_score"])
test = { 'outlook': 'sunny', 'temp': 'hot', 'humidity': 'normal', 'windy': False } return df, target, test if __name__ == '__main__': X_tr, Y_tr, X_ts, Y_ts = load_data() # decision tree model = DecisionTree(n_attrs=4) model.fit(X_tr, Y_tr) y_pred = model.predict(X_ts) assert accuracy(y_pred, Y_ts) == 0.7837837837837838 # random forest model = RandomForest() model.fit(X_tr, Y_tr) y_pred = model.predict(X_ts) assert accuracy(y_pred, Y_ts) == 1 # decision tree categorical df, attr_targe, record_test = load_data_categorical() tree = DecicionTreeCategorical() tree.fit(df, attr_targe) assert tree.predict_one(record_test) == {'yes': 1.0}
y = model.predict(X) preds_to_lab(y, param['hop_size'], param['fs'], category, save_path, song_name) if __name__ == '__main__': parser = get_train_rf_parser() args = parser.parse_args(sys.argv[1:]) log.info('Arguments:\n' + pformat(args.__dict__)) # prepare train dataset params, y_size, y_ind = get_params_by_category(args.category) conv_root = args.conv_root if args.use_librosa: conv_root = conv_root + '/librosa/' else: conv_root = conv_root + '/mauch/' conv_list = args.conv_list if not conv_list: conv_list = gen_train_data(args.songs_list, args.audio_root, args.gt_root, params, conv_root, args.subsong_len, args.song_len, use_librosa=args.use_librosa) model = RandomForest(criterion=args.criterion, max_features=args.max_features, n_estimators=args.n_estimators) model = train_rf(model, conv_list)