Example #1
import gc
import pickle

import sklearn_crfsuite


def model_training(Y_train, output_path, training_start_date,
                   training_end_date, chain_len):
    X_train = loadX(training_start_date, training_end_date)
    X_train = dataFillNA(X_train)  # fill missing values
    tmp_columns = X_train.columns.tolist()
    tmp_columns.remove('date')

    all_data = X_train.merge(Y_train, on='date', how='inner')
    X_train = all_data[tmp_columns]
    Y_train = all_data['Y']
    del all_data
    gc.collect()

    # convert point-wise rows into overlapping chains of length chain_len
    X_train = Xpoint2Set(X_train, chain_len)
    y_train = Ypoint2Set(Y_train, chain_len)

    crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                               c1=0.1,
                               c2=0.1,
                               max_iterations=100,
                               all_possible_transitions=True)
    crf.fit(X_train, y_train)

    with open(output_path + 'crf_model.pkl', 'wb') as tmp_fo:  # dump model
        pickle.dump(crf, tmp_fo)
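
The helpers loadX, dataFillNA, Xpoint2Set and Ypoint2Set are not shown in the snippet. sklearn_crfsuite.CRF.fit expects X as a list of sequences of per-point feature dicts and y as a list of matching label sequences, so Xpoint2Set / Ypoint2Set presumably slice the point-wise data into overlapping chains of length chain_len. A minimal sketch of what they might look like; the sliding-window layout is an assumption:

def Xpoint2Set(X_df, chain_len):
    # hypothetical sketch: encode each row as a feature dict and group the
    # rows into overlapping chains of length chain_len
    points = [{k: str(v) for k, v in row.items()}
              for _, row in X_df.iterrows()]
    return [points[i:i + chain_len]
            for i in range(len(points) - chain_len + 1)]


def Ypoint2Set(Y_series, chain_len):
    # hypothetical sketch: matching label chains, labels as strings
    # ('-1.0' / '1.0', as used in model_testing below)
    labels = Y_series.astype('str').tolist()
    return [labels[i:i + chain_len]
            for i in range(len(labels) - chain_len + 1)]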
Example #2
import gc
import pickle

import sklearn_crfsuite


def model_training(Y_train, output_path, training_start_date,
                   training_end_date, chain_len):
    X_train = loadX(training_start_date, training_end_date)
    tmp_columns = X_train.columns.tolist()
    tmp_columns.remove('date')

    all_data = X_train.merge(Y_train, on='date', how='inner')

    # discretize the listed columns and replace them with the binned versions
    cols_to_process = ['OUTSTANDING_CASH_TO_FREE_CAP']
    all_data, tot_bin_cols, tot_cutoff_points = getDiscreteFeatures(
        all_data, cols_to_process)
    tmp_columns = [x for x in tmp_columns if x not in cols_to_process]
    tmp_columns = tmp_columns + tot_bin_cols

    X_train = all_data[tmp_columns]
    Y_train = all_data['Y']
    del all_data
    gc.collect()

    X_train = Xpoint2Set(X_train, chain_len)
    y_train = Ypoint2Set(Y_train, chain_len)

    crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                               c1=0.1,
                               c2=0.1,
                               max_iterations=100,
                               all_possible_transitions=True)
    crf.fit(X_train, y_train)

    with open(output_path + 'crf_model.pkl', 'wb') as tmp_fo:  # dump model
        pickle.dump(crf, tmp_fo)
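
Example #2 differs from Example #1 only in the discretization step. getDiscreteFeatures is not shown; from its call site it takes the merged frame plus a list of columns and returns the frame with new binned columns, the binned column names, and the cutoff points. A rough sketch of that assumed contract, using pandas quantile binning (the bin count and the '_BIN' naming are guesses):

import pandas as pd


def getDiscreteFeatures(df, cols, n_bins=10):
    # hypothetical sketch: for each column add a '<col>_BIN' column with the
    # quantile-bin index, and collect the bin edges so the same cutoffs can
    # be replayed at test time
    bin_cols, cutoff_points = [], {}
    for col in cols:
        binned, edges = pd.qcut(df[col], q=n_bins, labels=False,
                                retbins=True, duplicates='drop')
        bin_col = col + '_BIN'
        df[bin_col] = binned
        bin_cols.append(bin_col)
        cutoff_points[col] = edges.tolist()
    return df, bin_cols, cutoff_points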
Example #3
import gc
import pickle

import pandas as pd
import sklearn_crfsuite
from sklearn.metrics import precision_score
from sklearn_crfsuite import metrics


def model_testing(Y_test, output_path, testing_start_date, testing_end_date,
                  chain_len):
    X_test = loadX(testing_start_date, testing_end_date)
    X_test = dataFillNA(X_test)  # fill missing values
    tmp_columns = X_test.columns.tolist()
    tmp_columns.remove('date')

    all_data = X_test.merge(Y_test, on='date', how='inner')
    X_test = all_data[tmp_columns]
    Y_test = all_data['Y']
    test_dates = all_data['date']
    del all_data
    gc.collect()

    X_test = Xpoint2Set(X_test, chain_len)
    Y_test_pair = Ypoint2Set(Y_test, chain_len)

    with open(output_path + 'crf_model.pkl', 'rb') as tmp_fi:  # load model
        crf = pickle.load(tmp_fi)

    y_pred = crf.predict(X_test)

    # test pair: evaluate predictions at the chain level
    labels = ['-1.0', '1.0']
    print(
        metrics.flat_classification_report(Y_test_pair,
                                           y_pred,
                                           labels=labels,
                                           digits=3))

    # test single: rebuild per-date predictions from the overlapping chains
    # (the pop/extend reconstruction below assumes chain_len == 2)
    y_pred_single = y_pred[0].copy()
    y_pred_single.pop(-1)
    y_pred_single.extend([tmp_y[1] for tmp_y in y_pred])
    y_real_single = Y_test.astype('str').tolist()
    prsc = precision_score(y_real_single,
                           y_pred_single,
                           labels=labels,
                           average='micro')
    print('%s to %s micro-averaged precision: %f' %
          (testing_start_date, testing_end_date, prsc))
    print('f1 score: %f, precision: %f' %
          (metrics.flat_f1_score(
              Y_test_pair, y_pred, labels=labels, average='weighted'),
           metrics.flat_precision_score(
               Y_test_pair, y_pred, labels=labels, average='micro')))

    prediction = pd.DataFrame(test_dates)
    prediction.loc[:, 'predict'] = y_pred_single

    return prediction, prsc
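
A hedged usage sketch wiring Example #1's model_training and the model_testing above together; the dates and output path are placeholders, and Y_train / Y_test are assumed to be DataFrames with 'date' and 'Y' columns, as the merges imply:

# hypothetical driver code; all literals below are placeholders
chain_len = 2  # the single-point reconstruction in model_testing assumes pairs
model_training(Y_train, './model/', '2015-01-01', '2017-12-31', chain_len)
prediction, prsc = model_testing(Y_test, './model/',
                                 '2018-01-01', '2018-12-31', chain_len)
print(prediction.head())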
def model_training(y_train, output_path, training_start_date,
                   training_end_date):
    # X_train / Y_train are module-level globals because objective() reads
    # them during cross validation
    global X_train, Y_train
    X_train = loadX(training_start_date, training_end_date)
    X_train = dataFillNA(X_train)  # fill missing values
    Y_train = y_train

    # ==== hyperopt validation
    # params = {
    #     # 'chg_pct': hp.uniform('chg_pct', 0, 0.3),
    #     # 'chg_threshold': hp.uniform('chg_threshold', 0, 0.3),
    #     # 'chain_len': hp.randint('chain_len', 9),
    #     'training_start_date': training_start_date,
    #     'training_end_date': training_end_date
    # }

    # chg_pct = scipy.stats.uniform(scale=0.3)
    # chg_threshold = scipy.stats.uniform(scale=0.3)
    # chain_len = scipy.stats.randint(low=2, high =10)

    # ==== cross validation
    best_cv_score = 0
    best_params = None  # set on the first chain length that improves the score
    for tmp_chain_len in range(2, 11):  # chain lengths 2 through 10
        # tmp_chg_pct = chg_pct.rvs()
        # tmp_chg_threshold = chg_threshold.rvs()
        # tmp_chain_len = chain_len.rvs()
        # params['chain_len'] = tmp_chain_len
        tmp_results = objective(tmp_chain_len)
        if tmp_results['cv_score'] > best_cv_score:
            best_cv_score = tmp_results['cv_score']
            # tmp_sub_params['chg_pct'] = tmp_chg_pct
            # tmp_sub_params['chg_threshold'] = tmp_chg_threshold
            best_params = tmp_results.copy()
            best_params['chain_len'] = tmp_chain_len
    # tmp_trial = Trials()
    # best_params = fmin(objective, space=params, algo=tpe.suggest, max_evals=100, trials=tmp_trial)

    # # get sub-params
    # tmp_idx = np.argmin(np.array(tmp_trial.losses()))
    # best_params['c1'] = tmp_trial.results[tmp_idx]['c1']
    # best_params['c2'] = tmp_trial.results[tmp_idx]['c2']
    # best_params['chain_len'] += 2  # adjust chain len

    print('best cv score:', best_params['cv_score'])
    print('best params:', best_params)

    # ==== train with the best params
    # Y_train = loadY(best_params['chg_pct'], best_params['chg_threshold'], training_start_date, training_end_date)
    # Y_train.loc[:, 'Y'] = Y_train['PeakTrough'].shift(-1)  # predict tomorrow !!!!
    # Y_train = Y_train.loc[~Y_train['Y'].isnull()]  # drop nan

    tmp_columns = X_train.columns.tolist()
    tmp_columns.remove('date')

    all_data = X_train.merge(Y_train, on='date', how='inner')
    chain_X_train = all_data[tmp_columns]
    chain_Y_train = all_data['Y']

    chain_X_train = Xpoint2Set(chain_X_train, best_params['chain_len'])
    chain_Y_train = Ypoint2Set(chain_Y_train, best_params['chain_len'])

    crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                               c1=best_params['c1'],
                               c2=best_params['c2'],
                               max_iterations=100,
                               all_possible_transitions=True)

    crf.fit(chain_X_train, chain_Y_train)

    # tmp_columns = X_train.columns.tolist()
    # tmp_columns.remove('date')
    #
    # all_data = X_train.merge(Y_train, on='date', how='inner')
    # X_train = all_data[tmp_columns]
    # Y_train = all_data['Y']
    # del all_data
    # gc.collect()
    #
    # X_train = Xpoint2Set(X_train, chain_len)
    # Y_train = Ypoint2Set(Y_train, chain_len)
    #
    # # search parameter by cross validation
    # crf = sklearn_crfsuite.CRF(
    #     algorithm='lbfgs',
    #     # c1=0.1,
    #     # c2=0.1,
    #     max_iterations=100,
    #     all_possible_transitions=True
    # )
    #
    # params_space = {
    #     'c1': scipy.stats.expon(scale=0.5),
    #     'c2': scipy.stats.expon(scale=0.05),
    # }
    #
    # labels = ['-1.0', '1.0']
    # # val_scorer = make_scorer(precision_score, average='micro', labels=labels)
    # val_scorer = make_scorer(metrics.flat_precision_score, average='micro', labels=labels)
    #
    # rs_cv = RandomizedSearchCV(crf, params_space, cv=3, verbose=10, n_jobs=-1, n_iter=50, scoring=val_scorer)  # searching
    # rs_cv.fit(X_train, Y_train)
    #
    # crf = rs_cv.best_estimator_
    # # crf.fit(X_train, y_train)

    with open(output_path + 'crf_model.pkl', 'wb') as tmp_fo:  # dump model
        pickle.dump(crf, tmp_fo)

    return best_params
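
objective is not shown; from its use above it takes a chain length, scores it on the global X_train / Y_train, and returns a dict with at least 'cv_score', 'c1' and 'c2' (model_training copies the dict into best_params and later reads c1/c2 from it). A minimal sketch, assuming a randomized search over c1/c2 like the commented-out RandomizedSearchCV block:

import scipy.stats
import sklearn_crfsuite
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV
from sklearn_crfsuite import metrics


def objective(chain_len):
    # hypothetical sketch: cross-validate one chain length and return the
    # best score together with the c1/c2 that achieved it
    tmp_columns = [c for c in X_train.columns if c != 'date']
    all_data = X_train.merge(Y_train, on='date', how='inner')
    chain_X = Xpoint2Set(all_data[tmp_columns], chain_len)
    chain_Y = Ypoint2Set(all_data['Y'], chain_len)

    crf = sklearn_crfsuite.CRF(algorithm='lbfgs', max_iterations=100,
                               all_possible_transitions=True)
    params_space = {'c1': scipy.stats.expon(scale=0.5),
                    'c2': scipy.stats.expon(scale=0.05)}
    val_scorer = make_scorer(metrics.flat_precision_score,
                             average='micro', labels=['-1.0', '1.0'])
    rs_cv = RandomizedSearchCV(crf, params_space, cv=3, n_iter=50,
                               n_jobs=-1, scoring=val_scorer)
    rs_cv.fit(chain_X, chain_Y)
    return {'cv_score': rs_cv.best_score_,
            'c1': rs_cv.best_params_['c1'],
            'c2': rs_cv.best_params_['c2']}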