Пример #1
0
def MKL():
    """Print EasyMKL cross-validation scores over a small (lam, C) grid.

    Loads the experiment settings, asks ``get_s_metric`` for the similarity
    matrices of the ("l1", "l2") metric pair, turns each matrix into a
    normalized kernel, and then cross-validates EasyMKL with an SVC base
    learner for every combination of ``lam`` and ``C``.

    Returns:
        None. Settings, kernels, labels and scores are written to stdout.
    """
    fname, pv, tv, _org_metrics = experiment_setting()
    print(fname, pv, tv)

    for metric_pair in (["l1", "l2"],):
        _X, y, sim_matrices = get_s_metric(fname=fname,
                                           tv=tv,
                                           pv=pv,
                                           metrics=metric_pair)

        # Similarity -> kernel.
        # NOTE(review): this scales exp(s) by 1/0.01; it is not exp(s / 0.01).
        # Confirm which of the two was intended.
        raw_kernels = [np.exp(sim) / 0.01 for sim in sim_matrices]
        KL_norm = list(map(kernel_normalization, raw_kernels))
        print(KL_norm, sim_matrices)

    # Only the kernels/labels of the last metric pair reach the grid search.
    print(y)

    lam_grid = (0, 0.1, 0.2, 1)
    c_grid = (0.01, 1, 100)
    for lam, C in product(lam_grid, c_grid):
        mkl = EasyMKL(lam=lam, learner=SVC(C=C))
        # NOTE(review): 'mae' is requested with a classifier as learner;
        # confirm the installed cross_val_score accepts this scoring name.
        scores = cross_val_score(KL_norm, y, mkl, n_folds=3, scoring='mae')
        print(lam, C, scores)
Пример #2
0
# print(base_learner)
###########################################################################################
# Select the EasyMKL `lam` with the best mean cross-validated accuracy.
# `best_results` stays empty until the first candidate has been scored.
best_results = {}

for lam in [0, 0.0001, 0.0009, 0.001, 0.009, 0.01, 0.09, 0.1, 0.2, 0.9, 1]:
    # A fresh grid-searched SVC is built for every candidate `lam`.
    # NOTE(review): refit='AUC' names a scorer, but no `scoring` argument is
    # passed to GridSearchCV here - confirm this is intended.
    base_learner = GridSearchCV(svm.SVC(probability=True),
                                param_grid=param_grid,
                                cv=cv,
                                refit='AUC',
                                error_score=0,
                                pre_dispatch='1*n_jobs',
                                n_jobs=1)
    # NOTE(review): both `cv` and `n_folds` are passed; which one takes
    # precedence depends on cross_val_score - confirm.
    scores = cross_val_score(k1,
                             y_tr_A,
                             EasyMKL(learner=base_learner, lam=lam),
                             cv=cv,
                             n_folds=5,
                             scoring='accuracy')
    # print(lam, scores)
    acc = np.mean(scores)
    # Strict '<' keeps the earliest `lam` when two candidates tie.
    if not best_results or best_results['score'] < acc:
        best_results = {'lam': lam, 'score': acc}

# EasyMKL-BASED
#############################################################################################
# Final fit with the selected `lam`. `base_learner` is the object created in
# the last loop iteration.
# NOTE(review): `lam` was selected using k1 only, while the final model is
# fitted on k1 + k2 + ... + k6 - confirm this mismatch is intended.
clf = EasyMKL(learner=base_learner,
              lam=best_results['lam']).fit(k1 + k2 + k3 + k4 + k5 + k6, y_tr_A)
print(clf)
#############################################################################################
# evaluate the solution
from sklearn.metrics import accuracy_score, roc_auc_score
Пример #3
0
    def parallelised_function(file):
        """Tune and fit EasyMKL for every feature/label set listed in one select file.

        The pickled object behind ``file`` is walked as a two-level mapping
        (hmm date -> feature/label date -> sequence whose item 0 is a features
        path and item 1 a labels path). For every entry whose features file
        exists, the labels CSV and the pickled features are loaded, market
        features are derived from the labels frame, and an EasyMKL model with an
        SVC base learner is cross-validated over ``C_range`` x ``lam_range`` on
        the kernels returned by ``generators.RBF_generator``. The best
        combination is refitted and its accuracy on the same (training) data is
        printed.

        Args:
            file: Name of the select file inside ``jointFeatureLocation``. Its
                first ``_``-separated token is used as the symbol.

        Returns:
            None. Progress and scores are printed. Writing ``best_results`` to
            disk is currently disabled, so only the output file name is built.
        """
        select_file_path = os.path.join(jointFeatureLocation, file)
        symbol = file.split("_")[0]
        print('Symbol:----->', symbol)

        # NOTE(review): the tokens are taken from the full path, not the file
        # name, so an underscore in jointFeatureLocation shifts the positions.
        path_tokens = select_file_path.split("_")
        select_feature_label_date = path_tokens[6]
        select_label_idx = path_tokens[9]

        unpickled_select_file = open_pickle_filepath(select_file_path)

        for hmm_date_key in sorted(unpickled_select_file.keys()):
            feature_label_sets = unpickled_select_file[hmm_date_key]

            for feature_label_date in sorted(feature_label_sets.keys()):
                set_paths = feature_label_sets[feature_label_date]
                features_file_path = set_paths[0]
                labels_file_path = set_paths[1]

                # Only the features file is checked; the labels file is read
                # unconditionally below.
                if not os.path.isfile(features_file_path):
                    print('PROBLEM----->in one of of your locations')
                    continue

                print('ok----->', feature_label_date)
                labels = pd.read_csv(labels_file_path)
                label_columns = labels.columns[labels.columns.str.contains(
                    pat='label')]
                label_name = str(label_columns.values[0])
                features = open_pickle_filepath(features_file_path)
                hmm_features = nfu.hmm_features_df(features)
                print('loaded features and labels ')

                # Nothing to learn from if every HMM feature is missing.
                if hmm_features.isnull().values.all():
                    continue

                # Each step wraps the previous frame and applies one
                # market-feature transformation.
                market_features_df = CreateMarketFeatures(
                    df=labels).ma_spread_duration()
                market_features_df = CreateMarketFeatures(
                    df=market_features_df).ma_spread()
                market_features_df = CreateMarketFeatures(
                    market_features_df).chaikin_mf()
                market_features_df = CreateMarketFeatures(
                    market_features_df).obv_calc()

                # `sort` must be a real boolean: the string 'False' is truthy
                # and is rejected by newer pandas releases.
                df_concat = pd.DataFrame(
                    pd.concat([hmm_features, market_features_df],
                              axis=1,
                              sort=False).dropna())

                df = df_concat[df_concat[label_name].notna()]
                df_final = df.drop(columns=[
                    'TradedPrice', 'Duration', 'TradedTime',
                    'ReturnTradedPrice', 'Volume', label_name
                ])
                # First column whose name contains 'label' is the target.
                y_train = df[df.columns[df.columns.str.contains(
                    pat='label')]].iloc[:, 0]

                # Fewer than 10 usable rows: skip this feature/label set.
                if df_final.shape[0] < 10:
                    print(
                        ' the ratio of classes is too low. try another label permutation'
                    )
                    continue

                print("starting model fit")
                X = np.asarray(df_final.values)
                Xtr = normalization(rescale_01(torch.Tensor(X)))
                Ytr = torch.Tensor(y_train.values)
                print('-----------------first bit done------------------')
                KLrbf = generators.RBF_generator(Xtr,
                                                 gamma=[.01, .1, .25, .5])
                print('done with kernel')
                best_results = {}

                C_range = [0.1, 1]
                lam_range = [0.2]
                try:
                    for C_choice in C_range:
                        base_learner = SVC(C=C_choice)
                        for lam in lam_range:
                            scores = cross_val_score(
                                KLrbf,
                                Ytr,
                                EasyMKL(learner=base_learner, lam=lam),
                                n_folds=5,
                                scoring='accuracy')
                            acc = np.mean(scores)
                            # Strict '<' keeps the first combination on ties.
                            if not best_results or best_results['score'] < acc:
                                best_results = {
                                    'C': C_choice,
                                    'lam': lam,
                                    'score': acc,
                                    'scores': scores
                                }
                    print('done')

                    # Refit the winning combination on the full training data.
                    best_learner = SVC(C=best_results['C'])
                    clf = EasyMKL(learner=best_learner,
                                  lam=best_results['lam']).fit(KLrbf, Ytr)
                    # The model is scored on the data it was fitted on, so
                    # this is a training accuracy, not a test accuracy.
                    y_pred = clf.predict(KLrbf)
                    accuracy = accuracy_score(Ytr, y_pred)
                    print(
                        'accuracy on the training set: %.3f, with lambda=%.2f'
                        % (accuracy, best_results['lam']))
                    # Report the CV scores of the selected model rather than
                    # those of whichever grid point happened to run last.
                    print(best_results['scores'])

                    # Persisting best_results is currently disabled; only the
                    # target file name is computed.
                    pickle_out_filename = os.path.join(
                        mainPath,
                        "ExperimentCommonLocs/CrossValidationResults",
                        "_".join((symbol, 'feature_label_date',
                                  str(select_feature_label_date),
                                  str(select_label_idx),
                                  'hmm_date:', hmm_date_key, 'RBF',
                                  'MultiKernelSVC.pkl')))
                except ValueError as err:
                    # Best-effort: report the failure and move on to the next
                    # feature/label set instead of dropping it silently.
                    print('skipping', feature_label_date,
                          '- model fit failed:', err)
Пример #4
0
# Hold out 30% of the data for the final evaluation; the split is reproducible
# through the fixed random_state.
KLtr, KLte, Ytr, Yte = train_test_split(KL, Y, test_size=.3, random_state=42)

# MKL algorithms
from MKLpy.algorithms import EasyMKL, KOMD  #KOMD is not a MKL algorithm but a simple kernel machine like the SVM
from MKLpy.model_selection import cross_val_score
from sklearn.svm import SVC
import numpy as np
print('tuning lambda for EasyMKL...', end='')
base_learner = SVC(C=10000)  #"hard"-margin svm
# `best_results` stays empty until the first lambda has been scored.
best_results = {}
for lam in [0, 0.01, 0.1, 0.2, 0.9,
            1]:  # possible lambda values for the EasyMKL algorithm
    # MKLpy.model_selection.cross_val_score performs the cross-validation
    # automatically; it may return accuracy, AUC, or F1 scores.
    scores = cross_val_score(KLtr,
                             Ytr,
                             EasyMKL(learner=base_learner, lam=lam),
                             n_folds=5,
                             scoring='accuracy')
    acc = np.mean(scores)
    # Strict '<' keeps the earliest lambda when two candidates tie.
    if not best_results or best_results['score'] < acc:
        best_results = {'lam': lam, 'score': acc}
# evaluation on the test set
from sklearn.metrics import accuracy_score
print('done')
# Refit on the whole training split with the selected lambda, then score the
# held-out kernels.
clf = EasyMKL(learner=base_learner, lam=best_results['lam']).fit(KLtr, Ytr)
y_pred = clf.predict(KLte)
accuracy = accuracy_score(Yte, y_pred)
print('accuracy on the test set: %.3f, with lambda=%.2f' %
      (accuracy, best_results['lam']))
Пример #5
0
                # Fragment: the enclosing loop/function is not visible here.
                # Item 0 of pkl_file[date] supplies the features and item 1
                # the labels; both are converted to torch tensors.
                Xtr = normalization(rescale_01(torch.Tensor(pkl_file[date][0].values)))
                Ytr = torch.Tensor(pkl_file[date][1].values)
                print('first bit done')
            # NOTE(review): the next statement is dedented relative to its
            # neighbours; as written, the line after it is an unexpected
            # indent. Confirm the indentation against the original file.
            nalsvm.gc.collect()
                KLrbf = generators.RBF_generator(Xtr, gamma=[.001, .01, .1])
                print('done with kernel')
                nalsvm.gc.collect()
                try:
                    # Cross-validate EasyMKL with an SVC learner for every
                    # (lam, C) pair and record the scores per pair.
                    lam_values = [0, 0.1, 0.2, 1]
                    C_values = [0.01, 1, 10, 100]
                    print(C_values)
                    for lam, C in product(lam_values, C_values):
                        print('now here', C, lam)
                        svm = SVC(C=C)
                        mkl = EasyMKL(lam=lam, learner=svm)
                        scores = cross_val_score(KLrbf, Ytr, mkl, n_folds=3, scoring='accuracy')
                        print(str(scores))
                        print(lam, C, scores)
                        print(type(scores))
                        # presumably cv_dict_list is a defaultdict(dict) created
                        # by the caller - verify; a plain dict would raise
                        # KeyError on the first write for a new key.
                        cv_dict_list[(symbol, date, alternate_label)][(lam, C)] = scores
                        nalsvm.logmemoryusage("Before garbage collect")
                        print('---------------> moving on')

                # Any of these errors abandons the current iteration, so the
                # save below is skipped as well.
                except (ValueError, TypeError, EOFError):
                    continue
                # only way that seems to work for this
                # The whole cv_dict_list (not just this date's entry) is
                # converted to a DataFrame and pickled under a per-date name.
                pickle_out_filename = os.path.join(cross_validation_data_location,
                                                   "_".join((symbol, date, 'RBF_CrossValidationResults.pkl')))
                test_df = pd.DataFrame.from_dict(cv_dict_list)
                test_df.to_pickle(pickle_out_filename)
                print('Now saved: ', pickle_out_filename)
Пример #6
0
def fitting_function_mkl(key):
    """Cross-validate EasyMKL on polynomial kernels for the label file ``key``.

    Reads ``<label dir>/<key>.csv``, joins it column-wise with the HMM features
    stored under ``symbol_feature_paths[key]`` and with market features derived
    from the labels frame, and splits the result 80/20. On the training part,
    homogeneous polynomial kernels of degree 1..10 plus an identity kernel are
    built and EasyMKL with an SVC base learner is cross-validated for every
    ``(C, lam)`` pair. After each pair the scores and the best result so far
    are stored in the module-level ``cv_dict_list`` and the whole dictionary is
    pickled again, so partial results survive a later failure.

    Relies on names defined outside this function: ``symbolData``,
    ``label_idx``, ``symbol_feature_paths``, ``symbol``, ``hmm_date``,
    ``cv_dict_list``, ``alternate_labels_nos`` and ``mainPath``.

    Args:
        key: Base name (without ``.csv``) of the labels file; also the key
            into ``symbol_feature_paths``.

    Returns:
        None. Results are written to ``cv_dict_list`` and to a pickle file.
    """
    print('For key: ', key, '############')
    labels_file_path = os.path.join(
        symbolData.symbol_specific_label_path(label_idx), key + ".csv")
    labels_file_exists = os.path.isfile(labels_file_path)
    print(labels_file_exists)
    # Not used in the visible part of this function.
    output_dict = defaultdict(dict)

    if labels_file_exists:
        print(" reading labels")
        labels = pd.read_csv(labels_file_path)
        label_name = str(
            labels.columns[labels.columns.str.contains(pat='label')].values[0])
        logmemoryusage("Before garbage collect")
        hmm_features = nfu.hmm_features_df(
            open_pickle_filepath(symbol_feature_paths[key]))

        if hmm_features.isnull().values.all():
            # Every HMM feature is missing: nothing to train on.
            print('lots of NaNs on features')
        else:
            print("can train")
            # Each step wraps the previous frame and applies one
            # market-feature transformation.
            market_features_df = CreateMarketFeatures(
                df=labels).ma_spread_duration()
            market_features_df = CreateMarketFeatures(
                df=market_features_df).ma_spread()
            market_features_df = CreateMarketFeatures(
                market_features_df).chaikin_mf()
            market_features_df = CreateMarketFeatures(
                market_features_df).obv_calc()

            # `sort` must be a real boolean: the string 'False' is truthy and
            # is rejected by newer pandas releases.
            df_concat = pd.DataFrame(
                pd.concat([hmm_features, market_features_df],
                          axis=1,
                          sort=False).dropna())

            df = df_concat[df_concat[label_name].notna()]
            df_final = df.drop(columns=[
                'TradedPrice', 'Duration', 'TradedTime', 'ReturnTradedPrice',
                'Volume', label_name
            ])

            # All columns whose name contains 'label' (kept as a DataFrame).
            y_train = df.reindex(columns=df.columns[df.columns.str.contains(
                pat='label')])
            print('go to the labels')

            if df_final.shape[0] < 10:
                # Fewer than 10 usable rows: do not attempt a fit.
                print(
                    ' the ratio of classes is too low. try another label permutation'
                )
            else:
                print("starting model fit")

                Xtr, Xte, Ytr, Yte = train_test_split(df_final,
                                                      y_train,
                                                      test_size=.2,
                                                      random_state=42)
                # training
                arrXtr = np.array(Xtr)
                X_tr = normalization(rescale_01(arrXtr))
                Y_tr = torch.Tensor(Ytr.values.ravel())

                # testing
                # NOTE(review): the test split is rescaled independently of
                # the training split, and the test kernels built from it are
                # not used in the visible code - confirm.
                arrXte = np.array(Xte)
                X_te = normalization(rescale_01(arrXte))
                Y_te = torch.Tensor(Yte.values.ravel())

                KLtr = [
                    pairwise.homogeneous_polynomial_kernel(X_tr, degree=d)
                    for d in range(1, 11)
                ] + [identity_kernel(len(Y_tr))]
                KLte = [
                    pairwise.homogeneous_polynomial_kernel(X_te,
                                                           X_tr,
                                                           degree=d)
                    for d in range(1, 11)
                ]
                # Zero block standing in for the identity kernel on test data.
                KLte.append(torch.zeros(KLte[0].size()))
                print('done with kernel')
                try:
                    lam_values = [0.1, 0.2, 1]
                    best_results = {}
                    C_range = [0.1, 1]
                    for C_ch in C_range:
                        print(' fitted the base learner')
                        for lam in lam_values:
                            print('now here', lam)
                            print(' and tuning lambda for EasyMKL...', end='')
                            # One fresh soft-margin SVC per grid point.
                            base_learner = SVC(C=C_ch)
                            scores = cross_val_score(KLtr,
                                                     Y_tr,
                                                     EasyMKL(
                                                         learner=base_learner,
                                                         lam=lam),
                                                     n_folds=5,
                                                     scoring='accuracy')
                            acc = np.mean(scores)
                            # Strict '<' keeps the first combination on ties.
                            # 'C' is recorded so the winner can be rebuilt.
                            if not best_results or best_results['score'] < acc:
                                best_results = {
                                    'lam': lam,
                                    'C': C_ch,
                                    'score': acc
                                }

                            print('done', best_results)
                            cv_dict_list[(symbol, hmm_date,
                                          label_idx)][(lam, C_ch)] = [
                                              scores, best_results
                                          ]
                            print(cv_dict_list)

                            pickle_out_filename = os.path.join(
                                mainPath,
                                "ExperimentCommonLocs/MKLFittedModels",
                                "_".join((symbol, 'model_fit_date', str(key),
                                          str(alternate_labels_nos[label_idx]),
                                          'MultiKernelSVC.pkl')))
                            print(pickle_out_filename)

                            # Re-written after every grid point as a
                            # checkpoint; `with` closes the file even if the
                            # dump fails.
                            with open(pickle_out_filename, 'wb') as pickle_out:
                                pickle.dump(cv_dict_list, pickle_out)

                except (ValueError, TypeError, EOFError) as err:
                    # Best-effort: report the failure instead of hiding it.
                    print('model fitting failed for key', key, ':', err)