def __init__(self): """ Constructor """ # preprocessor instance self.__pre_process = PreProcessor() self.__train, self.__y_train = self.__pre_process.get_train_data() # Tuning Parameters self.__n_folds = 3 # Cross-validation with k-folds # Models self.__lasso = make_pipeline( RobustScaler(), Lasso(alpha=0.0005, random_state=1)) self.__ENet = make_pipeline(RobustScaler(), ElasticNet( alpha=0.0005, l1_ratio=.9, random_state=3)) self.__KRR = KernelRidge( alpha=0.6, kernel='polynomial', degree=2, coef0=2.5) self.__GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05, max_depth=4, max_features='sqrt', min_samples_leaf=15, min_samples_split=10, loss='huber', random_state=5) self.__model_xgb = xgb.XGBRegressor(colsample_bytree=0.2, gamma=0.0, learning_rate=0.05, max_depth=6, min_child_weight=1.5, n_estimators=7200, reg_alpha=0.9, reg_lambda=0.6, subsample=0.2, seed=42, silent=1, random_state=7)
def test_bptt():
    file_path = "\\data\\reddit.csv"
    preprocessor = PreProcessor()
    preprocessor.process_word_index(file_path)
    X_train = preprocessor.X_train[10]
    Y_train = preprocessor.Y_train[10]
    vocabulary_size = BaseConfig.vocabulary_size
    np.random.seed(10)
    rnn = SimpleRNN(vocabulary_size)
    print(rnn.bptt(X_train, Y_train))
def worker(X_train, y_train, X_test, y_test, pm, rs):
    p = PreProcessor(stop_words=pm[0], tf=pm[1], idf=pm[2], scale=pm[3])
    print('Test', p.get_name())
    m = SGDClassifier()
    m.fit(p.fit_training(X_train), y_train)
    y_pred = m.predict(p.fit_test(X_test))
    f1 = metrics.f1_score(y_test, y_pred)
    rs[float(f1)] = p.get_name()
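# Hedged usage sketch (not from the original source): one way worker() could be
# driven over every combination of the four preprocessing flags. The shared
# results dict via multiprocessing.Manager and the run_grid name are assumptions.
import itertools
import multiprocessing as mp


def run_grid(X_train, y_train, X_test, y_test):
    manager = mp.Manager()
    results = manager.dict()   # maps F1 score -> preprocessor name
    combos = list(itertools.product([True, False], repeat=4))
    jobs = [mp.Process(target=worker,
                       args=(X_train, y_train, X_test, y_test, pm, results))
            for pm in combos]
    for job in jobs:
        job.start()
    for job in jobs:
        job.join()
    best = max(results.keys())
    print('Best F1 {:.3f} with {}'.format(best, results[best]))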
def __init__(self): """ Constructor """ # preprocessor instance self.__pre_process = PreProcessor() self.__train_data, self.__train_targets = self.__pre_process.get_train_data( ) # print(self.__train_data) # Tuning Parameters self.__n_folds = 5 # Cross-validation with k-folds self.__num_epochs = 400
def __init__(self): """ constructor """ self.__pre_processor = PreProcessor() self.__trainer = Trainer() self.__predictor = Predictor()
class TestPreprocess(unittest.TestCase):
    def setUp(self):
        self.pp = PreProcessor()
        self.pp.parse_file("../data/samples.txt")

    def test_file_parser(self):
        """ Test routine for file parser """
        count = self.pp.trans_count
        self.assertEqual(count, 10, "Sample file size must be 10")
        print("PreProcessor::file_parser")

    def test_unique_counts(self):
        """ Test unique field counters """
        uq = self.pp.unique
        # self.assertEqual(uq['RACE_IS_HISP_RC'], 1, "Must be equal")
        # self.assertEqual(uq['RACE_IS_BLACK'], 1, "Must be equal")
        self.assertEqual(uq['RACE_IS_WHITE'], 8, "Must be equal")
        self.assertEqual(uq['SEX_IS_FEMALE'], 3, "Must be equal")
        self.assertEqual(uq['SEX_IS_MALE'], 7, "Must be equal")
        print("PreProcessor.unique_counts")

    def test_mapping(self):
        """ Test discretize/binarize here """
        import collections
        self.pp._print_transactions()
        trans = self.pp.get_transactions()
        t1 = trans[0]
        is_others = self.pp.mapper.race['OTHERS'] is not None
        self.assertEqual(t1['ID'], 1, "Must be first transaction")
        if is_others:
            self.assertEqual(
                t1['ITEMS'],
                collections.OrderedDict([('RACE_IS_OTHERS', True),
                                         ('SCORE_IS_[44-57]', True),
                                         ('SEX_IS_MALE', True)]),
                "Must be first transaction")
        else:
            self.assertEqual(
                t1['ITEMS'],
                collections.OrderedDict([('RACE_IS_HISP_RC', True),
                                         ('SCORE_IS_[44-57]', True),
                                         ('SEX_IS_MALE', True)]),
                "Must be first transaction")
        print("PreProcessor::mappers")
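# Hedged note: the test case above is presumably executed with the standard
# unittest runner, e.g. from the file that defines TestPreprocess:
if __name__ == '__main__':
    unittest.main()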
def main():
    pre_process = PreProcessor()
    X, y = pre_process.get_train_data()
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.3,
                                                        shuffle=True,
                                                        random_state=42,
                                                        stratify=y)
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    xgb_params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
    }
    # Validation data used during training
    evals = [(dtrain, 'train'), (dtest, 'eval')]
    # Dictionary for recording the training history
    evals_result = {}
    bst = xgb.train(xgb_params,
                    dtrain,
                    num_boost_round=1000,  # use a generous number of rounds
                    evals=evals,
                    evals_result=evals_result,
                    )
    y_pred_proba = bst.predict(dtest)
    y_pred = np.where(y_pred_proba > 0.5, 1, 0)
    acc = accuracy_score(y_test, y_pred)
    print('Accuracy:', acc)
    # Plot the training history as a line graph
    train_metric = evals_result['train']['logloss']
    plt.plot(train_metric, label='train logloss')
    eval_metric = evals_result['eval']['logloss']
    plt.plot(eval_metric, label='eval logloss')
    plt.grid()
    plt.legend()
    plt.xlabel('rounds')
    plt.ylabel('logloss')
    plt.show()
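# Hedged sketch (an assumption, not from the original script): with
# num_boost_round=1000 the eval logloss in main() can keep degrading past its
# minimum; xgb.train() accepts early_stopping_rounds, which stops boosting once
# the last eval set stops improving. Names mirror those used in main() above.
def train_with_early_stopping(dtrain, dtest, xgb_params):
    evals = [(dtrain, 'train'), (dtest, 'eval')]
    evals_result = {}
    bst = xgb.train(xgb_params,
                    dtrain,
                    num_boost_round=1000,
                    evals=evals,
                    evals_result=evals_result,
                    early_stopping_rounds=50,  # stop 50 rounds after the best eval logloss
                    verbose_eval=False)
    print('best iteration:', bst.best_iteration)
    return bst, evals_result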
def setUp(self):
    pp = PreProcessor()
    pp.parse_file("../data/samples.txt")
    transactions = pp.get_transactions()
    uniques = pp.get_uniques()
    sup = 2.0
    conf = 0.374
    self.apriori = Apriori(transactions, uniques, sup, conf)
class Predictor(object):
    """ Runs the prediction step """

    def __init__(self):
        """ constructor """
        # preprocessor instance
        self.__pre_process = PreProcessor()

    def predict(self, model):
        """
        Predict and write data for submit
        :param model:
        :return:
        """
        test = self.__pre_process.get_test_data()
        predict_data = model.predict(test)
        return predict_data

    @staticmethod
    def write_file_submit(predict_data):
        """
        Write predict data to submit file
        :param predict_data:
        :return:
        """
        submit = pd.read_csv("./Data/sample_submit.csv", header=None)
        submit[1] = predict_data
        print(submit[1])
        now = datetime.datetime.now()
        now_str = '{}_{}_{}_{}_{}'.format(now.year, now.month, now.day,
                                          now.hour, now.minute)
        submit_file = './Data/submit/submit_{}.csv'.format(now_str)
        submit.to_csv(submit_file, header=None, index=None)

    @staticmethod
    def essemble_results(file1, file2):
        """ Average the predictions from two submit files """
        data1 = pd.read_csv(file1, sep="\t", header=None)
        data2 = pd.read_csv(file2, sep="\t", header=None)
        predict_data = (np.array(data1[1]) + np.array(data2[1])) / 2
        return predict_data
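# Hedged usage sketch (assumed wiring, not from the original source): the
# Predictor above presumably receives a fitted model from a Trainer class
# (see the Trainer snippets later in this collection) and writes a submit file.
if __name__ == '__main__':
    trainer = Trainer()
    predictor = Predictor()
    model = trainer.fit_model()
    Predictor.write_file_submit(predictor.predict(model))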
from timeit import default_timer as timer
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from PIL import Image
from ModelLoader import ModelLoader
from PostProcessor import PostProcessor
from preprocess import PreProcessor
import json

model_loader = ModelLoader()
img = Image.open('/home/bilal/Downloads/foto_van_yosra1.jpg')

# print("---------------------------------")
# print("0 for tinyYolo")
# val = input("give your neural network architecture type: ")
preprocessor = PreProcessor(0, img)
img_data = preprocessor.preprocess()

# print("---------------------------------------------")
# load a simple model
session = model_loader.load_session(1)
begin = timer()

# see the input name and shape
input_name = session.get_inputs()[0].name
# print("input name = ", input_name)
input_shape = session.get_inputs()[0].shape
# print("input shape =", input_shape)
input_type = session.get_inputs()[0].type
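# Hedged continuation sketch (assumed, not from the original script): if
# load_session() returns an onnxruntime InferenceSession (its get_inputs() API
# suggests so), inference on the preprocessed image would look like this; the
# PostProcessor step is omitted because its interface is unknown.
outputs = session.run(None, {input_name: img_data})
end = timer()
print('inference took {:.3f}s'.format(end - begin))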
from preprocess import PreProcessor
import os
import facenet
import numpy as np

raw_img_folder = './data/raw_img'
train_img_folder = './data/train_img'
pp = PreProcessor()


# process all subfolders (people) under the raw image folder
def processAllFolder():
    dataset = facenet.get_dataset(raw_img_folder)
    # loop over the subfolders for all people
    for subfolder in dataset:
        output_class_dir = os.path.join(train_img_folder, subfolder.name)
        # create the training image folder if it does not exist
        if not os.path.exists(output_class_dir):
            os.makedirs(output_class_dir)
        # align all images for each person and save them in the train_img folder
        num_image = pp.align_dir(subfolder, output_class_dir)
        print("Aligned %s images from %s folder" % (num_image, subfolder.name))
    print("Image Preprocess All Done!")


# make sure the folder is placed under raw_img_folder
# e.g. processOneFolder('Jason Jia')
def processOneFolder(folder_name):
tf = bool(row[3])
idf = bool(row[4])
scale = bool(row[5])
params[category] = [stop_words, tf, idf, scale]

with open(RESULT, 'w') as out:
    rows = csv.writer(out)
    for category in categories:
        source = os.path.join(TRAINING_CATEGORIES, category)
        if os.path.exists(source):
            X_train, y_train = load(source, category)
            pms = params[category]
            p = PreProcessor(stop_words=pms[0], tf=pms[1], idf=pms[2],
                             scale=pms[3])
            X_train = p.fit_training(X_train)
            print('.fit()')
            m = SGDClassifier()
            m.fit(X_train, y_train)
            print('.predict()')
            for test_id, text in test_entries():
                text = p.fit_test(np.array([text]))
                result = [category, test_id, m.predict(text)[0]]
                rows.writerow(result)
from preprocess import PreProcessor
import numpy as np

pre_processor = PreProcessor()


def load_data(n=0):
    training_sets, validation_sets, test_sets = pre_processor.read_dataset()
    train_set = [i for i in training_sets[n]]
    validation_set = [i for i in validation_sets[n]]
    test_set = [i for i in test_sets]
    train_data = [i[0] for i in train_set]
    train_label = [i[1] for i in train_set]
    validation_data = [i[0] for i in validation_set]
    validation_label = [i[1] for i in validation_set]
    test_data = [i[0] for i in test_set]
    test_label = [i[1] for i in test_set]
    # transform to np arrays
    train_data = np.array(train_data)
    train_label = np.array(train_label)
    validation_data = np.array(validation_data)
    validation_label = np.array(validation_label)
    test_data = np.array(test_data)
    test_label = np.array(test_label)
    return ((train_data, train_label),
            (validation_data, validation_label),
            (test_data, test_label))
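# Hedged usage sketch: unpacking the nested tuples returned by load_data().
# The fold index 0 is arbitrary; the shapes depend on whatever
# PreProcessor.read_dataset() provides.
(train_data, train_label), (val_data, val_label), (test_data, test_label) = load_data(n=0)
print(train_data.shape, val_data.shape, test_data.shape)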
def __init__(self): """ constructor """ # preprocessor instance self.__pre_process = PreProcessor()
parser = argparse.ArgumentParser()
parser.add_argument('-data', help='location of dataset',
                    default='data/out_split.pk')
parser.add_argument('-We', help='location of word embeddings',
                    default='data/glove.6B.300d.txt')
parser.add_argument('-model', help='model to run: nbow or dan',
                    default='nbow')
parser.add_argument('-wd', help='use word dropout or not', default='y')
args = vars(parser.parse_args())

pp = PreProcessor(args['data'], args['We'])
pp.tokenize()
data, labels, data_val, labels_val = pp.make_data()
embedding_matrix = pp.get_word_embedding_matrix(embedding_dim)

model = Sequential()
if args['We'] == "rand":
    model.add(
        Embedding(len(pp.word_index) + 1,
                  embedding_dim,
                  input_length=pp.MAX_SEQUENCE_LENGTH,
                  trainable=False))
else:
    model.add(
class Trainer(object):
    """ Train Class """

    def __init__(self):
        """ Constructor """
        # preprocessor instance
        self.__pre_process = PreProcessor()
        self.__train_data, self.__train_targets = self.__pre_process.get_train_data()
        # print(self.__train_data)

        # Tuning Parameters
        self.__n_folds = 5  # Cross-validation with k-folds
        self.__num_epochs = 400

    def build_model(self):
        """
        Build the model
        :return:
        """
        # NN model
        model = models.Sequential()
        model.add(
            layers.Dense(256,
                         activation='relu',
                         kernel_initializer='normal',
                         input_shape=(self.__train_data.shape[1], )))
        model.add(
            layers.Dense(256, activation='relu', kernel_initializer='normal'))
        model.add(
            layers.Dense(256, activation='relu', kernel_initializer='normal'))
        model.add(
            layers.Dense(1, kernel_initializer='normal', activation='linear'))
        model.compile(optimizer='adam', loss="mse", metrics=['mape'])
        model.summary()
        return model

    def fit_model(self):
        """
        Fit the model
        :return:
        """
        # Build the Keras model (already compiled)
        model = self.build_model()
        # Fit the model in silent mode (verbose=0)
        model.fit(self.__train_data,
                  self.__train_targets,
                  epochs=self.__num_epochs,
                  batch_size=16,
                  verbose=0)
        return model

    def evaluate_cross(self):
        """
        Cross-validation evaluation
        :return:
        """
        all_scores = []
        num_val_samples = int(len(self.__train_data) / self.__n_folds)
        for i in range(self.__n_folds):
            print('processing fold # {}'.format(i))
            # Prepare the validation data
            val_data = self.__train_data[i * num_val_samples:(i + 1) *
                                         num_val_samples]
            val_targets = self.__train_targets[i * num_val_samples:(i + 1) *
                                               num_val_samples]
            # Prepare the training data
            partial_train_data = np.concatenate([
                self.__train_data[:i * num_val_samples],
                self.__train_data[(i + 1) * num_val_samples:]
            ], axis=0)
            partial_targets_data = np.concatenate([
                self.__train_targets[:i * num_val_samples],
                self.__train_targets[(i + 1) * num_val_samples:]
            ], axis=0)
            # Build the Keras model (already compiled)
            model = self.build_model()
            # Fit the model in silent mode (verbose=0)
            model.fit(partial_train_data,
                      partial_targets_data,
                      epochs=self.__num_epochs,
                      batch_size=16,
                      verbose=0)
            # Evaluate the model on the validation data
            val_mse, val_mape = model.evaluate(val_data, val_targets,
                                               verbose=0)
            all_scores.append(val_mape)
        print(all_scores)
        return np.mean(all_scores)

    def visualize_k_folds(self):
        """
        Visualize k-fold cross-validation
        :return:
        """
        all_mape_histories = []
        num_val_samples = int(len(self.__train_data) / self.__n_folds)
        for i in range(self.__n_folds):
            print('processing fold # {}'.format(i))
            # Prepare the validation data
            val_data = self.__train_data[i * num_val_samples:(i + 1) *
                                         num_val_samples]
            val_targets = self.__train_targets[i * num_val_samples:(i + 1) *
                                               num_val_samples]
            # Prepare the training data
            partial_train_data = np.concatenate([
                self.__train_data[:i * num_val_samples],
                self.__train_data[(i + 1) * num_val_samples:]
            ], axis=0)
            partial_targets_data = np.concatenate([
                self.__train_targets[:i * num_val_samples],
                self.__train_targets[(i + 1) * num_val_samples:]
            ], axis=0)
            # Build the Keras model (already compiled)
            model = self.build_model()
            # Fit the model in silent mode (verbose=0), keeping the history
            history = model.fit(partial_train_data,
                                partial_targets_data,
                                validation_data=(val_data, val_targets),
                                epochs=self.__num_epochs,
                                batch_size=16,
                                verbose=0)
            # Record the validation MAPE history for this fold
            mape_history = history.history[
                'val_mean_absolute_percentage_error']
            all_mape_histories.append(mape_history)
        print(all_mape_histories)
        average_mape_history = [
            np.mean([x[i] for x in all_mape_histories])
            for i in range(self.__num_epochs)
        ]
        plt.plot(range(1, len(average_mape_history) + 1),
                 average_mape_history)
        plt.xlabel('Epochs')
        plt.ylabel('Validation MAPE')
        plt.show()
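# Hedged usage sketch (assumed driver, not from the original source): run the
# k-fold evaluation of the Keras Trainer above, then fit a final model on the
# full training set.
if __name__ == '__main__':
    trainer = Trainer()
    mean_mape = trainer.evaluate_cross()
    print('mean validation MAPE: {:.2f}'.format(mean_mape))
    final_model = trainer.fit_model()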
class Trainer(object):
    """ Train Class """

    def __init__(self):
        """ Constructor """
        # preprocessor instance
        self.__pre_process = PreProcessor()
        self.__train, self.__y_train = self.__pre_process.get_train_data()

        # Tuning Parameters
        self.__n_folds = 3  # Cross-validation with k-folds

        # Models
        self.__lasso = make_pipeline(
            RobustScaler(), Lasso(alpha=0.0005, random_state=1))
        self.__ENet = make_pipeline(
            RobustScaler(),
            ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))
        self.__KRR = KernelRidge(
            alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
        self.__GBoost = GradientBoostingRegressor(n_estimators=3000,
                                                  learning_rate=0.05,
                                                  max_depth=4,
                                                  max_features='sqrt',
                                                  min_samples_leaf=15,
                                                  min_samples_split=10,
                                                  loss='huber',
                                                  random_state=5)
        self.__model_xgb = xgb.XGBRegressor(colsample_bytree=0.2,
                                            gamma=0.0,
                                            learning_rate=0.05,
                                            max_depth=6,
                                            min_child_weight=1.5,
                                            n_estimators=7200,
                                            reg_alpha=0.9,
                                            reg_lambda=0.6,
                                            subsample=0.2,
                                            seed=42,
                                            silent=1,
                                            random_state=7)
        # self.__model_lgb = lgb.LGBMRegressor(objective='regression', num_leaves=5,
        #                                      learning_rate=0.05, n_estimators=720,
        #                                      max_bin=55, bagging_fraction=0.8,
        #                                      bagging_freq=5, feature_fraction=0.2319,
        #                                      feature_fraction_seed=9, bagging_seed=9,
        #                                      min_data_in_leaf=6, min_sum_hessian_in_leaf=11)

    def get_scores(self):
        """
        Compute and print cross-validated scores for each model
        :return:
        """
        score = self.rmsle_cv(self.__lasso)
        print("\nLasso score: {:.4f} ({:.4f})\n".format(
            score.mean(), score.std()))
        score = self.rmsle_cv(self.__ENet)
        print("ElasticNet score: {:.4f} ({:.4f})\n".format(
            score.mean(), score.std()))
        score = self.rmsle_cv(self.__KRR)
        print("Kernel Ridge score: {:.4f} ({:.4f})\n".format(
            score.mean(), score.std()))
        score = self.rmsle_cv(self.__GBoost)
        print("Gradient Boosting score: {:.4f} ({:.4f})\n".format(
            score.mean(), score.std()))
        score = self.rmsle_cv(self.__model_xgb)
        print("Xgboost score: {:.4f} ({:.4f})\n".format(
            score.mean(), score.std()))
        # score = self.rmsle_cv(self.__model_lgb)
        # print("LGBM score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

    def mean_absolute_percentage_error(self, y_true, y_pred):
        y_true, y_pred = np.array(y_true), np.array(y_pred)
        return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

    def adaboost(self):
        regr = AdaBoostRegressor(random_state=0, n_estimators=100)
        regr.fit(self.__train, self.__y_train)
        score = regr.score(self.__train, self.__y_train)
        print(score)
        return regr

    def fit_model(self):
        """
        Fit the model
        :return:
        """
        # model = self.train_model(self.__train, self.__y_train)
        test_size = 1 / self.__n_folds

        # Split the training data into an extra test set
        x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(
            self.__train, self.__y_train, test_size=test_size, random_state=0)
        print(np.shape(x_train_split), np.shape(x_test_split),
              np.shape(y_train_split), np.shape(y_test_split))

        lasso = LassoCV(alphas=[0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006,
                                0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1],
                        max_iter=50000, cv=10)
        # lasso = RidgeCV(alphas=[0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006,
        #                         0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1], cv=10)
        # lasso = ElasticNetCV(cv=10, random_state=0)
        # lasso.fit(x_train_split, y_train_split)
        # y_predicted = lasso.predict(X=x_test_split)
        # mape = self.mean_absolute_percentage_error(y_test_split, y_predicted)
        # print(mape)

        # Create the xgboost model
        reg = xgb.XGBRegressor()

        # Hyperparameter search
        reg_cv = GridSearchCV(reg,
                              {'max_depth': [2, 4, 6],
                               'n_estimators': [50, 100, 200]},
                              verbose=1)
        reg_cv.fit(x_train_split, y_train_split)
        print(reg_cv.best_params_, reg_cv.best_score_)

        # Retrain with the best parameters
        reg = xgb.XGBRegressor(**reg_cv.best_params_)
        reg.fit(x_train_split, y_train_split)

        # Save / load the trained model
        # import pickle
        # pickle.dump(reg, open("model.pkl", "wb"))
        # reg = pickle.load(open("model.pkl", "rb"))

        # Evaluate the trained model
        pred_train = reg.predict(x_train_split)
        pred_test = reg.predict(x_test_split)
        # print(self.mean_absolute_percentage_error(y_train_split, pred_train))
        print(self.mean_absolute_percentage_error(y_test_split, pred_test))

        # import pandas as pd
        # import matplotlib.pyplot as plt
        # importances = pd.Series(reg.feature_importances_, index=boston.feature_names)
        # importances = importances.sort_values()
        # importances.plot(kind="barh")
        # plt.title("importance in the xgboost Model")
        # plt.show()
        return reg

    def train_model(self, X, y):
        """ Performs grid search over the 'max_depth' parameter for a
            decision tree regressor trained on the input data [X, y]. """
        # Create cross-validation sets from the training data
        cv_sets = ShuffleSplit(n_splits=10, test_size=0.20, random_state=0)
        # Create a decision tree regressor object
        regressor = DecisionTreeRegressor()
        # Create a dictionary for the parameter 'max_depth' with a range from 1 to 10
        params = {'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}
        # Transform 'performance_metric' into a scoring function using 'make_scorer'
        scoring_fnc = make_scorer(self.r2_score)
        # Create the grid search cv object --> GridSearchCV()
        grid = GridSearchCV(estimator=regressor,
                            param_grid=params,
                            scoring=scoring_fnc,
                            cv=cv_sets)
        # Fit the grid search object to the data to compute the optimal model
        grid = grid.fit(X, y)
        # Return the optimal model after fitting the data
        return grid.best_estimator_

    def rmsle_cv(self, model):
        """
        Calculate RMSE for cross-validation
        :return:
        """
        kf = KFold(self.__n_folds, shuffle=True,
                   random_state=42).get_n_splits(self.__train.values)
        rmse = np.sqrt(-cross_val_score(model,
                                        self.__train.values,
                                        self.__y_train,
                                        scoring="neg_mean_squared_error",
                                        cv=kf))
        return rmse

    @staticmethod
    def r2_score(y_true, y_predict):
        """ Calculates and returns the performance score between true (y_true)
            and predicted (y_predict) values based on the metric chosen. """
        score = r2_score(y_true, y_predict)
        # Return the score
        return score
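# Hedged usage sketch (assumed driver, not from the original source): compare
# the base models with cross-validated RMSE, then fit the grid-searched
# xgboost model from fit_model().
if __name__ == '__main__':
    trainer = Trainer()
    trainer.get_scores()              # prints Lasso/ENet/KRR/GBoost/xgboost CV scores
    best_model = trainer.fit_model()  # returns the xgboost regressor refit on the best params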