def model_training(Y_train, output_path, training_start_date, training_end_date, chain_len):
    """Train a CRF on the [training_start_date, training_end_date] window and pickle it.

    Y_train: frame with 'date' and 'Y' columns (labels).
    output_path: directory prefix the model file name is appended to.
    chain_len: sequence length used when converting rows into CRF chains.
    """
    # Load and clean the feature matrix for the training window.
    features = loadX(training_start_date, training_end_date)
    features = dataFillNA(features)  # fill missing values

    feature_cols = [c for c in features.columns.tolist() if c != 'date']

    # Align features and labels on the dates they share.
    merged = features.merge(Y_train, on='date', how='inner')
    aligned_X = merged[feature_cols]
    aligned_y = merged['Y']
    del merged
    gc.collect()

    # Convert point-wise rows/labels into fixed-length chains for the CRF.
    chain_X = Xpoint2Set(aligned_X, chain_len)
    chain_y = Ypoint2Set(aligned_y, chain_len)

    model = sklearn_crfsuite.CRF(algorithm='lbfgs',
                                 c1=0.1,
                                 c2=0.1,
                                 max_iterations=100,
                                 all_possible_transitions=True)
    model.fit(chain_X, chain_y)

    # Persist the fitted model.
    with open(output_path + 'crf_model.pkl', 'wb') as model_file:
        pickle.dump(model, model_file)
def model_training(Y_train, output_path, training_start_date, training_end_date, chain_len):
    """Train a CRF with selected features discretized into bins, then pickle it.

    Y_train: frame with 'date' and 'Y' columns (labels).
    output_path: directory prefix the model file name is appended to.
    chain_len: sequence length used when converting rows into CRF chains.

    NOTE(review): this redefines model_training from earlier in the file; the
    definition that appears last wins at import time — confirm which variant
    is meant to be active.
    NOTE(review): unlike the sibling variant, there is no dataFillNA() call
    here — confirm getDiscreteFeatures tolerates missing values.
    """
    X_train = loadX(training_start_date, training_end_date)

    tmp_columns = X_train.columns.tolist()
    tmp_columns.remove('date')

    # Align features and labels on the dates they share.
    all_data = X_train.merge(Y_train, on='date', how='inner')

    # Discretize the listed continuous features; their raw columns are then
    # replaced by the generated bin columns in the training column list.
    cols_to_process = ['OUTSTANDING_CASH_TO_FREE_CAP']
    # The cutoff points returned by getDiscreteFeatures are not needed for
    # training, so the local is explicitly marked unused (was tot_cutoff_points).
    all_data, tot_bin_cols, _cutoff_points = getDiscreteFeatures(
        all_data, cols_to_process)
    tmp_columns = [x for x in tmp_columns if x not in cols_to_process]
    tmp_columns = tmp_columns + tot_bin_cols

    X_train = all_data[tmp_columns]
    Y_train = all_data['Y']
    del all_data
    gc.collect()

    # Convert point-wise rows/labels into fixed-length chains for the CRF.
    X_train = Xpoint2Set(X_train, chain_len)
    y_train = Ypoint2Set(Y_train, chain_len)

    crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                               c1=0.1,
                               c2=0.1,
                               max_iterations=100,
                               all_possible_transitions=True)
    crf.fit(X_train, y_train)

    # Persist the fitted model.
    with open(output_path + 'crf_model.pkl', 'wb') as tmp_fo:
        pickle.dump(crf, tmp_fo)
def model_testing(Y_test, output_path, testing_start_date, testing_end_date, chain_len):
    """Evaluate the pickled CRF on the [testing_start_date, testing_end_date] window.

    Y_test: frame with 'date' and 'Y' columns (true labels).
    output_path: directory prefix where 'crf_model.pkl' was dumped.
    chain_len: sequence length used when converting rows into CRF chains.
    Returns (prediction frame with 'date' and 'predict' columns, micro precision).
    """
    # Load and clean the feature matrix for the testing window.
    features = loadX(testing_start_date, testing_end_date)
    features = dataFillNA(features)  # fill missing values

    feature_cols = features.columns.tolist()
    feature_cols.remove('date')

    # Align features and labels on the dates they share.
    merged = features.merge(Y_test, on='date', how='inner')
    eval_X = merged[feature_cols]
    eval_y = merged['Y']
    test_dates = merged['date']
    del merged
    gc.collect()

    chain_X = Xpoint2Set(eval_X, chain_len)
    chain_Y = Ypoint2Set(eval_y, chain_len)

    # Load the previously trained model.
    with open(output_path + 'crf_model.pkl', 'rb') as model_file:
        crf = pickle.load(model_file)

    # Chain-level (pairwise) evaluation.
    y_pred = crf.predict(chain_X)
    labels = ['-1.0', '1.0']
    print(
        metrics.flat_classification_report(chain_Y,
                                           y_pred,
                                           labels=labels,
                                           digits=3))

    # Flatten chain predictions back to one label per date: the first chain
    # contributes all but its last label, then every chain contributes its
    # element at index 1.
    single_preds = list(y_pred[0][:-1])
    single_preds.extend(chain[1] for chain in y_pred)
    real_singles = eval_y.astype('str').tolist()
    prsc = precision_score(real_singles,
                           single_preds,
                           labels=labels,
                           average='micro')
    print('%s to %s weighted precision: %f' %
          (testing_start_date, testing_end_date, prsc))
    print('f1 score: %f, precision: %f' %
          (metrics.flat_f1_score(
              chain_Y, y_pred, labels=labels, average='weighted'),
           metrics.flat_precision_score(
               chain_Y, y_pred, labels=labels, average='micro')))

    prediction = pd.DataFrame(test_dates)
    prediction.loc[:, 'predict'] = single_preds
    return prediction, prsc
def model_training(y_train, output_path, training_start_date, training_end_date):
    """Cross-validate the chain length, train a CRF with the best params, pickle it.

    y_train: frame with 'date' and 'Y' columns (labels).
    output_path: directory prefix the model file name is appended to.
    Returns the best parameter dict found by cross validation
    (includes 'chain_len', 'c1', 'c2', 'cv_score').
    """
    # objective() reads these module-level globals during cross validation,
    # so they must be assigned before the loop below.
    global X_train, Y_train
    X_train = loadX(training_start_date, training_end_date)
    X_train = dataFillNA(X_train)  # fill missing values
    Y_train = y_train

    # ==== cross validation over chain length (2..10)
    # BUGFIX: the score was initialized to 0 with a strict '>' comparison, so
    # when every cv_score <= 0 the loop finished without ever binding
    # best_params, crashing below with NameError. Start from -inf and guard
    # explicitly instead.
    best_cv_score = float('-inf')
    best_params = None
    for tmp_chain_len in range(2, 11):  # chain 2~10
        tmp_results = objective(tmp_chain_len)
        if tmp_results['cv_score'] > best_cv_score:
            best_cv_score = tmp_results['cv_score']
            best_params = tmp_results.copy()
            best_params['chain_len'] = tmp_chain_len
    if best_params is None:
        # Only reachable if every cv_score was NaN / incomparable.
        raise RuntimeError('cross validation produced no usable parameters')

    print('best cv score:', best_params['cv_score'])
    print('best params:', best_params)

    # ==== train with the best params on the full training window
    tmp_columns = X_train.columns.tolist()
    tmp_columns.remove('date')

    # Align features and labels on the dates they share.
    all_data = X_train.merge(Y_train, on='date', how='inner')
    chain_X_train = all_data[tmp_columns]
    chain_Y_train = all_data['Y']

    # Convert point-wise rows/labels into fixed-length chains for the CRF.
    chain_X_train = Xpoint2Set(chain_X_train, best_params['chain_len'])
    chain_Y_train = Ypoint2Set(chain_Y_train, best_params['chain_len'])

    crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                               c1=best_params['c1'],
                               c2=best_params['c2'],
                               max_iterations=100,
                               all_possible_transitions=True)
    crf.fit(chain_X_train, chain_Y_train)

    # Persist the fitted model.
    with open(output_path + 'crf_model.pkl', 'wb') as tmp_fo:
        pickle.dump(crf, tmp_fo)

    return best_params