示例#1
0
 def test_normalization(self):
     """Check ``preprocessing.normalization`` on tensor and numpy inputs.

     The Gram matrix of the normalised examples must peak at 1 with a unit
     diagonal, the shape must be preserved, and both input kinds must give
     matching ``torch.Tensor`` results.
     """
     normalized = preprocessing.normalization(self.X)
     gram = normalized @ normalized.T
     # Largest entry and smallest diagonal entry are both expected to be 1.
     self.assertAlmostEqual(gram.max().item(), 1., places=6)
     self.assertAlmostEqual(gram.diag().min().item(), 1., places=6)
     self.assertEqual(normalized.shape, (5, 4))

     # Same routine fed with self.X and with self.Xnumpy
     # (presumably the numpy copy of the same fixture -- see the test class).
     from_torch = preprocessing.normalization(self.X)
     from_numpy = preprocessing.normalization(self.Xnumpy)
     self.assertTrue(matNear(from_torch, from_numpy))
     for result in (from_torch, from_numpy):
         self.assertEqual(type(result), torch.Tensor)
示例#2
0
def featureCreation(idxKey, locDict):
    """Build normalised features and labels for one entry of ``locDict``.

    Args:
        idxKey: Position (in ``locDict`` key order) of the entry to load.
        locDict: Mapping whose values are indexable as
            ``(features_pickle_path, labels_csv_path)``.

    Returns:
        Tuple ``(X, y, dataDate)``: the rescaled and normalised feature
        matrix, the label column, and the ``locDict`` key that was selected.
    """
    keys = list(locDict.keys())
    dataDate = keys[idxKey]
    featuresIdxDirFileLoc = locDict[dataDate][0]
    labelsIdxDirFileLoc = locDict[dataDate][1]

    # Read the features file; the context manager closes the handle that the
    # bare open() call used to leak.
    # NOTE(review): pickle can execute arbitrary code while loading -- only
    # point this at trusted files.
    with open(featuresIdxDirFileLoc, "rb") as featuresFile:
        featuresTupleFile = pkl.load(featuresFile, encoding='latin1')

    # The first four members of the pickled object are joined column-wise.
    # sort must be the boolean False: the string 'False' is truthy and is
    # rejected outright by recent pandas releases.
    dfFeatures = pd.concat([featuresTupleFile[i] for i in range(4)],
                           axis=1,
                           sort=False).fillna(0)

    # Read the labels file and pick the label column used for training.
    labelsDf = pd.read_csv(labelsIdxDirFileLoc)
    labels = labelsDf['label_PrMov__window_5__thres_arbitrary__0.1']

    # Features and labels side by side; rows with any missing value are dropped.
    dfXY = pd.concat([dfFeatures, labels], axis=1, sort=False).dropna()
    labelColumns = dfXY.columns[dfXY.columns.str.contains(pat='label')]
    labelName = str(labelColumns.values[0])

    # Drop the label from the features, then rescale and normalise them.
    dfX = dfXY.drop(columns=[labelName])
    X = normalization(rescale_01(np.array(dfX)))
    y = dfXY[labelColumns].iloc[:, 0]
    return X, y, dataDate
示例#3
0
 def setUp(self):
     """Split the digits dataset and precompute polynomial kernel lists."""
     digits = load_digits()
     split = train_test_split(
         digits.data, digits.target, shuffle=True, train_size=.2)
     self.Xtr, self.Xte, self.Ytr, self.Yte = split
     self.Xtr = preprocessing.normalization(self.Xtr)
     self.Xte = preprocessing.normalization(self.Xte)

     # One homogeneous polynomial kernel per degree 1..10, for the training
     # examples alone and for test-vs-training examples.
     degrees = range(1, 11)
     self.KLtr = [
         pairwise.homogeneous_polynomial_kernel(self.Xtr, degree=deg)
         for deg in degrees
     ]
     self.KLte = [
         pairwise.homogeneous_polynomial_kernel(self.Xte, self.Xtr, degree=deg)
         for deg in degrees
     ]
示例#4
0
 def setUp(self):
     """Prepare breast-cancer kernels as explicit lists and as generators."""
     cancer = load_breast_cancer()
     split = train_test_split(
         cancer.data, cancer.target, shuffle=True, train_size=50)
     self.Xtr, self.Xte, self.Ytr, self.Yte = split
     self.Xtr = preprocessing.normalization(self.Xtr)
     self.Xte = preprocessing.normalization(self.Xte)

     # Explicit kernel lists: degrees 1..10.
     list_degrees = range(1, 11)
     self.KLtr = [
         pairwise.homogeneous_polynomial_kernel(self.Xtr, degree=deg)
         for deg in list_degrees
     ]
     self.KLte = [
         pairwise.homogeneous_polynomial_kernel(self.Xte, self.Xtr, degree=deg)
         for deg in list_degrees
     ]

     # Generator-backed kernel lists: degrees 1..5.
     generator_degrees = range(1, 6)
     self.KLtr_g = HPK_generator(self.Xtr, degrees=generator_degrees)
     self.KLte_g = HPK_generator(self.Xte,
                                 self.Xtr,
                                 degrees=generator_degrees)
示例#5
0
 def test_kernel_normalization(self):
     """Check ``preprocessing.kernel_normalization`` on a linear kernel.

     The result must match for tensor and numpy inputs and must agree with
     the linear kernel computed on already-normalised examples.
     """
     gram = self.X @ self.X.T
     normalized_torch = preprocessing.kernel_normalization(gram)
     normalized_numpy = preprocessing.kernel_normalization(gram.numpy())

     # Largest entry and smallest diagonal entry are both expected to be 1.
     self.assertAlmostEqual(normalized_torch.max().item(), 1., places=6)
     self.assertAlmostEqual(normalized_torch.diag().min().item(),
                            1.,
                            places=6)
     self.assertEqual(normalized_torch.shape, (5, 5))

     # Tensor and numpy inputs must agree and both come back as tensors.
     self.assertTrue(matNear(normalized_torch, normalized_numpy))
     for result in (normalized_torch, normalized_numpy):
         self.assertEqual(type(result), torch.Tensor)

     # Alternative route: normalise the examples, then take the linear kernel.
     expected = pairwise_mk.linear_kernel(
         preprocessing.normalization(self.X))
     self.assertTrue(matNear(normalized_torch, expected, eps=1e-7))
示例#6
0
 def setUp(self):
     """Build kernel lists (degrees 5..10 plus identity) and their generators."""
     cancer = load_breast_cancer()
     split = train_test_split(
         cancer.data, cancer.target, shuffle=True, train_size=50)
     self.Xtr, self.Xte, self.Ytr, self.Yte = split
     self.Xtr = preprocessing.normalization(self.Xtr)
     self.Xte = preprocessing.normalization(self.Xte)

     degrees = range(5, 11)

     # Training kernels, followed by an identity kernel over the training set.
     self.KLtr = [
         pairwise_mk.homogeneous_polynomial_kernel(self.Xtr, degree=deg)
         for deg in degrees
     ]
     self.KLtr.append(misc.identity_kernel(len(self.Xtr)))

     # Test-vs-training kernels, followed by an all-zero matrix in the slot
     # that the identity kernel occupies in the training list.
     self.KLte = [
         pairwise_mk.homogeneous_polynomial_kernel(
             self.Xte, self.Xtr, degree=deg) for deg in degrees
     ]
     self.KLte.append(torch.zeros(len(self.Xte), len(self.Xtr)))

     # Generator-backed equivalents of the two lists above.
     self.KLtr_g = HPK_generator(self.Xtr,
                                 degrees=degrees,
                                 include_identity=True)
     self.KLte_g = HPK_generator(self.Xte,
                                 self.Xtr,
                                 degrees=degrees,
                                 include_identity=True)
示例#7
0
def _report_multiclass_metrics(clf, KLte, Yte):
    """Print accuracy, macro recall and confusion matrix of ``clf`` on ``KLte``."""
    from sklearn.metrics import accuracy_score, confusion_matrix, recall_score

    y_pred = clf.predict(KLte)  # predictions
    accuracy = accuracy_score(Yte, y_pred)
    print('Accuracy score: %.4f' % (accuracy))
    recall = recall_score(Yte, y_pred, average='macro')
    print('Recall score: %.4f' % (recall))
    cm = confusion_matrix(Yte, y_pred)
    print('Confusion matrix', cm)


def MultiView_learning():
    """Train and evaluate EasyMKL on hand-crafted multi-view features.

    Loads the feature matrix from a fixed ``.mat`` file, rescales and
    normalises it, splits it 90/10, builds one RBF kernel per feature view
    and fits EasyMKL twice (one-vs-all, then one-vs-one), printing the kernel
    weights, accuracy, macro recall and confusion matrix for each strategy.
    """
    from MKLpy.algorithms import EasyMKL
    from MKLpy.generators import Multiview_generator
    from MKLpy.metrics import pairwise
    from MKLpy.preprocessing import normalization, rescale_01
    from sklearn.model_selection import train_test_split
    from sklearn.svm import SVC

    print('loading dataset...', end='')
    training_data = io.loadmat(
        r"D:\CVProject\CBAM-keras-master\handcraft\features_with_pca_file_0202.mat"
    )
    samples = training_data['array']
    length = len(samples[0])
    # NOTE(review): features stop at column length-3 while the label is the
    # last column, so the second-to-last column is never used -- confirm this
    # is intentional.
    X, Y = samples[:, 0:length - 2], samples[:, -1]
    print('done')

    # preprocess data and split it
    print('preprocessing data...', end='')
    X = rescale_01(X)  # feature scaling in [0,1]
    X = normalization(X)  # ||X_i||_2^2 = 1
    Xtr, Xte, Ytr, Yte = train_test_split(X,
                                          Y,
                                          test_size=.1,
                                          random_state=42,
                                          shuffle=True)
    # Close the progress line before any further output is printed.
    print('done')

    print(numpy.array(Xtr).shape)
    print(numpy.array(Ytr).shape)
    print('Training on {0} samples, Testing on {1} samples'.format(
        len(Xtr), len(Xte)))

    print('computing RBF Kernels...', end='')
    # Column range of every feature view. Train and test views are cut from
    # this single table so the two sets of slices cannot drift apart.
    view_columns = [
        slice(None, 2),  # time
        slice(2, 92),  # color
        slice(92, 124),  # Gabor
        slice(124, 156),  # lbp
        slice(156, 348),  # cloud
        slice(348, 432),  # haze
        slice(432, 603),  # contrast
        slice(603, 606),  # shadow
        slice(606, 608),  # snow
        slice(608, None),  # pca
    ]
    views_tr = [numpy.array(Xtr[:, cols]) for cols in view_columns]
    views_te = [numpy.array(Xte[:, cols]) for cols in view_columns]

    KLtr = Multiview_generator(views_tr, kernel=pairwise.rbf_kernel)
    KLte = Multiview_generator(views_te, views_tr, kernel=pairwise.rbf_kernel)
    print('done')

    base_learner = SVC(C=8)

    print('training EasyMKL with one-vs-all multiclass strategy...', end='')
    clf = EasyMKL(lam=0.1, multiclass_strategy='ova',
                  learner=base_learner).fit(KLtr, Ytr)
    print('done')
    print('the combination weights are:')
    for sol in clf.solution:
        print('(%d vs all): ' % sol, clf.solution[sol].weights)
    _report_multiclass_metrics(clf, KLte, Yte)

    print('training EasyMKL with one-vs-one multiclass strategy...', end='')
    clf = EasyMKL(lam=0.1, multiclass_strategy='ovo',
                  learner=base_learner).fit(KLtr, Ytr)
    print('done')
    print('the combination weights are:')
    for sol in clf.solution:
        print('(%d vs %d): ' % (sol[0], sol[1]), clf.solution[sol].weights)
    _report_multiclass_metrics(clf, KLte, Yte)
示例#8
0
def Learning_curve_using_weather_data():
    """Learning curve of EasyMKL on the weather data (PASS: 2021.02.05).

    Loads the feature matrix from a fixed ``.mat`` file, rescales and
    normalises it, then for growing prefixes of the data (10% .. 100%) builds
    train/test kernels with ``bulid_kernel_transform``, fits a one-vs-all
    EasyMKL model and reports train and test accuracy. Every report line is
    printed and also written to a fixed results text file.
    """
    from MKLpy.algorithms import EasyMKL
    from MKLpy.preprocessing import normalization, rescale_01
    # Imported locally because this function calls accuracy_score without
    # importing it anywhere in its own body.
    from sklearn.metrics import accuracy_score
    from sklearn.svm import SVC

    # load data
    print('loading dataset...', end='')
    training_data = io.loadmat(
        r"D:\CVProject\CBAM-keras-master\handcraft\features_with_pca.mat")
    results_path = (
        r"D:\CVProject\CBAM-keras-master\handcraft\results\learning_curve_results_0202_01.txt"
    )
    samples = training_data['array']
    length = len(samples[0])
    # Every column but the last is a feature; the last column is the label.
    X, Y = samples[:, 0:length - 1], samples[:, -1]
    print('done')

    # preprocess data
    print('preprocessing data...', end='')
    X = rescale_01(X)  # feature scaling in [0,1]
    X = normalization(X)  # ||X_i||_2^2 = 1
    print('done')

    print("Build a base learner")
    base_learner = SVC(C=20)
    print("Build EasyMKL classifier")

    # ====== Learning curve =======
    # The context manager guarantees the results file is flushed and closed
    # even if one of the fits fails.
    with open(results_path, "w") as results_data:
        for elem in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]:
            # Use only the first `elem` fraction of the samples.
            learn_count = int(elem * X.shape[0])
            KLtr, KYtr, KLte, KYte = bulid_kernel_transform(
                X[:learn_count], Y[:learn_count])
            train_count, test_count = len(KYtr), len(KYte)

            clf = EasyMKL(lam=0.1, multiclass_strategy='ova',
                          learner=base_learner).fit(KLtr, KYtr)
            train_set_accuracy = accuracy_score(KYtr, clf.predict(KLtr))
            test_set_accuracy = accuracy_score(KYte, clf.predict(KLte))

            sizes = "Test on {0} train samples and {1} test samples,".format(
                train_count, test_count)
            accuracies = (
                'accuracy on the train set: %.3f and accuracy on the test set : %.3f'
                % (train_set_accuracy, test_set_accuracy))

            # display the results and save the same line in the txt file
            print(sizes + accuracies)
            print(sizes + accuracies, file=results_data)

    print('done')
示例#9
0
        # location of data -->dataDrive, Clean Data Storage, and label.
        pkl_file = load_pickled_in_filename(
            os.path.join(clean_data_location, os.listdir(clean_data_location)[alternate_label_idx]))
        date_keys = list(pkl_file.keys())
        # model_date = '20170704'
        # list of out of sample dates
        for model_date in model_dates:

            start = time.time()
            # forward_dates = nalsvm.forwardDates(date_keys, model_date)

            print('---------------> Doing Model Date:', model_date)

            # put the features in a tensor format
            Xtr = rescale_01(torch.Tensor(pkl_file[model_date][0].values))
            Xtr = normalization(Xtr)  # fitting model
            # put the labels in a tensor format
            Ytr = torch.Tensor(pkl_file[model_date][1].values)
            #
            try:

                KLtr_poly = [pairwise.homogeneous_polynomial_kernel(Xtr, degree=d) for d in range(6)]
                deg = 5
                K = KLtr_poly[deg]  # the HPK with degree 5
                # K is always a squared kernel matrix, i.e. it is not the kernel computed between test and training
                # examples.
                kernel_evaluation_dict = kernel_evaluation(K)
                print('done')
                print('results of the %d-degree HP kernel:' % deg)
                print('margin: %.4f, radius: %.4f, radiu-margin ratio: %.4f,' % (kernel_evaluation_dict['score_margin'], kernel_evaluation_dict['score_radius'], kernel_evaluation_dict['score_ratio']))
                print('trace: %.4f, frobenius norm: %.4f' % (kernel_evaluation_dict['score_trace'], kernel_evaluation_dict['score_froben']))
示例#10
0
from sklearn.datasets import load_breast_cancer
ds = load_breast_cancer()
X, Y = ds.data, ds.target
print('done')
'''
WARNING: be sure that your matrix is not sparse! EXAMPLE:
from sklearn.datasets import load_svmlight_file
X,Y = load_svmlight_file(...)
X = X.toarray()
'''

#preprocess data
print('preprocessing data...', end='')
from MKLpy.preprocessing import normalization, rescale_01
X = rescale_01(X)  #feature scaling in [0,1]
X = normalization(X)  #||X_i||_2^2 = 1

#train/test split
from sklearn.model_selection import train_test_split
Xtr, Xte, Ytr, Yte = train_test_split(X, Y, test_size=.25, random_state=42)
print('done')

#compute homogeneous polynomial kernels with degrees 0,1,2,...,10.
print('computing Homogeneous Polynomial Kernels...', end='')
from MKLpy.metrics import pairwise
KLtr = [
    pairwise.homogeneous_polynomial_kernel(Xtr, degree=d) for d in range(11)
]
KLte = [
    pairwise.homogeneous_polynomial_kernel(Xte, Xtr, degree=d)
    for d in range(11)
示例#11
0
 def test_normalization(self):
     """Check that ``preprocessing.normalization`` yields unit-norm rows.

     The Gram matrix of the normalised examples must peak at 1 with a unit
     diagonal, and the input shape must be preserved.
     """
     normalized = preprocessing.normalization(self.X)
     gram = normalized.dot(normalized.T)
     self.assertAlmostEqual(gram.max(), 1.)
     self.assertAlmostEqual(np.diag(gram).min(), 1.)
     self.assertEqual(normalized.shape, (5, 4))
示例#12
0
 for symbol in ['ECM.L']:
     print(symbol)  # which symbol - unnecessary at this point
     cross_validation_data_location = cross_validation_results_location(symbol)
     clean_data_location = storage_location(symbol)
     for alternate_label_idx in range(0, 4):
         alternate_label = nalsvm.labels_pickle_files[alternate_label_idx]
         print(alternate_label)
         file_to_load = os.path.join(clean_data_location, os.listdir(clean_data_location)[alternate_label_idx])
         pkl_file = load_pickled_in_filename(file_to_load)
         date_keys = list(pkl_file.keys())
         print('--------------->')
         for date in date_keys:  # date is the model fit-date, i.e. the date whose data fits the training model in CV
             print(date)
             start = time.time()
             nalsvm.logmemoryusage("Before garbage collect")
             # Features are rescaled and normalised as a torch tensor;
             # labels are converted to a tensor as well.
             Xtr = normalization(rescale_01(torch.Tensor(pkl_file[date][0].values)))
             Ytr = torch.Tensor(pkl_file[date][1].values)
             print('first bit done')
             # This call sat one level too shallow, which ended the loop body
             # early and made the following lines an "unexpected indent".
             nalsvm.gc.collect()
             KLrbf = generators.RBF_generator(Xtr, gamma=[.001, .01, .1])
             print('done with kernel')
             nalsvm.gc.collect()
             try:
                 lam_values = [0, 0.1, 0.2, 1]
                 C_values = [0.01, 1, 10, 100]
                 print(C_values)
                 for lam, C in product(lam_values, C_values):
                     print('now here', C, lam)
                     svm = SVC(C=C)
                     mkl = EasyMKL(lam=lam, learner=svm)
                     scores = cross_val_score(KLrbf, Ytr, mkl, n_folds=3, scoring='accuracy')
示例#13
0
        model_dates = model_dates_list(return_cross_val_symbol_path(symbol))
        # location of data -->dataDrive, Clean Data Storage, and label.
        pkl_file = load_pickled_in_filename(
            os.path.join(clean_data_location,
                         os.listdir(clean_data_location)[alternate_label_idx]))
        date_keys = list(pkl_file.keys())
        # list of out of sample dates
        model_date = '20170705'  # <-need to replace this with file reading model dates above
        for model_date in model_dates:
            forward_dates = nalsvm.forwardDates(date_keys, model_date)

            print('---------------> Doing Model Date:', model_date)
            try:
                # put the features in a tensor format
                Xtr = normalization(
                    rescale_01(torch.Tensor(
                        pkl_file[model_date][0].values)))  # fitting model
                # put the labels in a tensor format
                Ytr = torch.Tensor(pkl_file[model_date][1].values)
                print('first bit done')
                # force garbage collect
                nalsvm.gc.collect()
                # kernels
                KLrbf = generators.RBF_generator(Xtr, gamma=[.001, .01, .1])
                # dont need the next bit
                print('done with kernel')
                print(forward_dates)
                # base learner- use c =1 or 10
                # the c and lambda values need to be picked up by the cross-val results !
                base_learner = SVC(C=10)
示例#14
0
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

# load the dataset
feature1 = pd.read_csv('..............................')

X1 = feature1.iloc[:, 1:].values
y1 = feature1.iloc[:, 0].values

# Preprocess data
from MKLpy.preprocessing import normalization, rescale_01

X1 = rescale_01(X1)
X1 = normalization(X1)

# # train/test
X_train_A, X_test_A, y_train_A, y_test_A = train_test_split(X1,
                                                            y1,
                                                            test_size=0.3,
                                                            random_state=42)

# Applying Polynomial kernel
from MKLpy.metrics import pairwise

k1 = [
    pairwise.homogeneous_polynomial_kernel(X_train_A, degree=d)
    for d in range(5)
]
k11 = [
示例#15
0
'''

This is a snippet of code showing how to train a MKL algorithm

Author: Ivano Lauriola, [email protected]

'''

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

from MKLpy.metrics import pairwise
from MKLpy.preprocessing import normalization
from MKLpy.utils.matrices import identity_kernel
import numpy as np

# Load the iris dataset and normalise the examples.
ds = load_iris()
X, Y = ds.data, ds.target
X = normalization(X)

# Reproducible 50/50 train/test split.
Xtr, Xte, Ytr, Yte = train_test_split(X, Y, test_size=.5, random_state=42)

# Make 20 homogeneous polynomial kernels (degrees 1..20).
# The author suggests adding the identity kernel so that the initial GRAM
# solution is easily separable; if the initial solution is not separable,
# GRAM may not work well.
KLtr = [
    pairwise.homogeneous_polynomial_kernel(Xtr, degree=deg)
    for deg in range(1, 21)
]
KLtr.append(identity_kernel(len(Ytr)))

# The test list gets an all-zero matrix in the slot of the identity kernel.
KLte = [
    pairwise.homogeneous_polynomial_kernel(Xte, Xtr, degree=deg)
    for deg in range(1, 21)
]
KLte.append(np.zeros(KLte[0].shape))
示例#16
0
    def parallelised_function(file):
        """Cross-validate RBF EasyMKL models for every date stored in ``file``.

        ``file`` is a file name inside ``jointFeatureLocation``. The unpickled
        content is walked as ``{hmm_date: {feature_label_date: (features_path,
        labels_path)}}``. For each pair whose features file exists, HMM and
        market features are joined, RBF kernels are built, and a small grid
        over ``C`` and ``lam`` is scored with 5-fold cross-validation. The
        best setting is then refitted and its training accuracy printed.

        Args:
            file: Name of the pickled index file. The first ``_``-separated
                token of the name is used as the symbol.
        """
        select_file_path = os.path.join(jointFeatureLocation,
                                        file)  # formulate the path
        symbol = file.split("_")[0]
        print('Symbol:----->', symbol)

        # NOTE(review): the tokens come from the *joined* path, so an
        # underscore in jointFeatureLocation shifts these positions.
        path_tokens = select_file_path.split("_")
        select_feature_label_date = path_tokens[6]  # label/feature date
        select_label_idx = path_tokens[9]  # label index

        unpickled_select_file = open_pickle_filepath(select_file_path)

        for hmm_date_key in sorted(unpickled_select_file.keys()):
            dated_locations = unpickled_select_file[hmm_date_key]

            for feature_label_date in sorted(dated_locations.keys()):
                features_file_path = dated_locations[feature_label_date][0]
                labels_file_path = dated_locations[feature_label_date][1]

                # Guard: nothing to train on without a features file.
                if not os.path.isfile(features_file_path):
                    print('PROBLEM----->in one of of your locations')
                    continue

                print('ok----->', feature_label_date)
                labels = pd.read_csv(labels_file_path)
                label_name = str(labels.columns[labels.columns.str.contains(
                    pat='label')].values[0])
                features = open_pickle_filepath(features_file_path)
                hmm_features = nfu.hmm_features_df(features)
                print('loaded features and labels ')

                # Guard: skip dates whose HMM features are entirely null.
                if hmm_features.isnull().values.all():
                    continue

                # Market features are built step by step, each step wrapping
                # the frame returned by the previous one.
                market_features_df = CreateMarketFeatures(
                    df=labels).ma_spread_duration()
                market_features_df = CreateMarketFeatures(
                    df=market_features_df).ma_spread()
                market_features_df = CreateMarketFeatures(
                    market_features_df).chaikin_mf()
                market_features_df = CreateMarketFeatures(
                    market_features_df).obv_calc()

                # sort must be the boolean False: the string 'False' is
                # truthy and is rejected outright by recent pandas releases.
                df_concat = pd.DataFrame(
                    pd.concat([hmm_features, market_features_df],
                              axis=1,
                              sort=False).dropna())

                df = df_concat[df_concat[label_name].notna()]
                df_final = df.drop(columns=[
                    'TradedPrice', 'Duration', 'TradedTime',
                    'ReturnTradedPrice', 'Volume', label_name
                ])
                y_train = df[df.columns[df.columns.str.contains(
                    pat='label')]].iloc[:, 0]  # training labels

                # Guard: fewer than 10 usable rows is not worth fitting.
                if df_final.shape[0] < 10:
                    print(
                        ' the ratio of classes is too low. try another label permutation'
                    )
                    continue

                print("starting model fit")
                # features and labels in tensor format
                Xtr = normalization(
                    rescale_01(torch.Tensor(np.asarray(df_final.values))))
                Ytr = torch.Tensor(y_train.values)
                print('-----------------first bit done------------------')
                KLrbf = generators.RBF_generator(Xtr,
                                                 gamma=[.01, .1, .25, .5])
                print('done with kernel')

                best_results = {}
                C_range = [0.1, 1]
                lam_range = [0.2]
                try:
                    for C_choice in C_range:
                        base_learner = SVC(C=C_choice)
                        for lam in lam_range:
                            # cross_val_score runs the cross validation on
                            # the kernel list and returns per-fold accuracy
                            scores = cross_val_score(
                                KLrbf,
                                Ytr,
                                EasyMKL(learner=base_learner, lam=lam),
                                n_folds=5,
                                scoring='accuracy')
                            acc = np.mean(scores)
                            if not best_results or best_results['score'] < acc:
                                best_results = {
                                    'C': C_choice,
                                    'lam': lam,
                                    'score': acc,
                                    'scores': scores
                                }
                    print('done')

                    # Refit with the best setting and score it on the same
                    # kernels it was trained on.
                    best_learner = SVC(C=best_results['C'])
                    clf = EasyMKL(learner=best_learner,
                                  lam=best_results['lam']).fit(KLrbf, Ytr)
                    y_pred = clf.predict(KLrbf)
                    accuracy = accuracy_score(Ytr, y_pred)
                    # The score is measured on the training data, so the
                    # message says so (it used to claim "test set").
                    print(
                        'accuracy on the training set: %.3f, with lambda=%.2f'
                        % (accuracy, best_results['lam']))
                    print(scores)

                    # TODO: best_results is not persisted yet; only the
                    # target file name is computed here.
                    pickle_out_filename = os.path.join(
                        mainPath,
                        "ExperimentCommonLocs/CrossValidationResults",
                        "_".join((symbol, 'feature_label_date',
                                  str(select_feature_label_date),
                                  str(select_label_idx), 'hmm_date:',
                                  hmm_date_key, 'RBF', 'MultiKernelSVC.pkl')))
                except ValueError as err:
                    # Best effort: report the failure and move on to the
                    # next date instead of dropping it silently.
                    print('skipping', feature_label_date,
                          '- model fit failed:', err)
示例#17
0
from MKLpy.metrics import pairwise
from MKLpy.preprocessing import normalization, rescale_01
from sklearn.model_selection import train_test_split

# Load the breast-cancer dataset: X holds the features, Y the targets.
ds = load_breast_cancer()
X, Y = ds.data, ds.target
print('done')

'''
WARNING: be sure that your matrix is not sparse! EXAMPLE:
from sklearn.datasets import load_svmlight_file
X,Y = load_svmlight_file(...)
X = X.toarray()
'''

# Preprocessing: rescale every feature into [0, 1], then normalise each
# example so that ||X_i||_2^2 = 1.
print('preprocessing data...', end='')
X = normalization(rescale_01(X))

# Hold out 25% of the examples for testing; the seed keeps the split
# reproducible.
Xtr, Xte, Ytr, Yte = train_test_split(X, Y, test_size=0.25, random_state=42)
print('done')

# Build one homogeneous polynomial kernel per degree 0, 1, ..., 10.
# Training kernels compare Xtr with itself; test kernels compare Xte with Xtr.
print('computing Homogeneous Polynomial Kernels...', end='')
degrees = range(11)
KLtr = [
    pairwise.homogeneous_polynomial_kernel(Xtr, degree=d) for d in degrees
]
KLte = [
    pairwise.homogeneous_polynomial_kernel(Xte, Xtr, degree=d)
    for d in degrees
]
print('done')

示例#18
0
# For each remaining feature table, column 0 is the target vector and every
# other column belongs to the design matrix.
X2, y2 = feature2.iloc[:, 1:].values, feature2.iloc[:, 0].values
X3, y3 = feature3.iloc[:, 1:].values, feature3.iloc[:, 0].values
X4, y4 = feature4.iloc[:, 1:].values, feature4.iloc[:, 0].values
X5, y5 = feature5.iloc[:, 1:].values, feature5.iloc[:, 0].values
X6, y6 = feature6.iloc[:, 1:].values, feature6.iloc[:, 0].values

# Preprocess data
from MKLpy.preprocessing import normalization, rescale_01

# Each matrix is handled independently: first feature scaling into [0, 1],
# then per-example normalisation so that ||X_i||_2^2 = 1.
X1 = normalization(rescale_01(X1))
X2 = normalization(rescale_01(X2))
X3 = normalization(rescale_01(X3))
X4 = normalization(rescale_01(X4))
X5 = normalization(rescale_01(X5))
X6 = normalization(rescale_01(X6))

# # train/test
X_tr_A, X_te_A, y_tr_A, y_te_A = train_test_split(X1,
                                                  y1,
                                                  test_size=0.3,
示例#19
0
def fitting_function_mkl(key):
    """Cross-validate EasyMKL on polynomial kernels for one feature/label key.

    Reads the labels CSV named ``<key>.csv`` from the directory returned by
    ``symbolData.symbol_specific_label_path(label_idx)`` and the HMM features
    pickled at ``symbol_feature_paths[key]``, joins them with derived market
    features, and grid-searches the SVC ``C`` and the EasyMKL ``lam`` with
    5-fold cross-validation. After every (lam, C) combination the externally
    defined ``cv_dict_list`` is updated and re-pickled under
    ``ExperimentCommonLocs/MKLFittedModels``.

    The function depends on names defined outside of it (``symbolData``,
    ``label_idx``, ``symbol``, ``hmm_date``, ``mainPath``, ``cv_dict_list``,
    ``alternate_labels_nos``, ``symbol_feature_paths`` and the imported
    helpers).

    Args:
        key: String used both as the stem of the labels CSV file name and as
            the lookup key into ``symbol_feature_paths``.
    """
    print('For key: ', key, '############')
    labels_file_path = os.path.join(
        symbolData.symbol_specific_label_path(label_idx), key + ".csv")
    labels_file_exists = os.path.isfile(labels_file_path)
    print(labels_file_exists)
    # NOTE(review): not read anywhere in this part of the function; kept in
    # case code outside this view relies on it.
    output_dict = defaultdict(dict)

    if labels_file_exists:  # only proceed when the labels file really exists
        print(" reading labels")  # this is the labels path!
        labels = pd.read_csv(labels_file_path)
        # Name of the first column whose header contains "label".
        label_name = str(
            labels.columns[labels.columns.str.contains(pat='label')].values[0])
        logmemoryusage("Before garbage collect")
        hmm_features = nfu.hmm_features_df(
            open_pickle_filepath(symbol_feature_paths[key]))

        if hmm_features.isnull().values.all():
            # Every HMM feature value is null: nothing to train on.
            print('lots of NaNs on features')
        else:  # if features not null then start moving on!
            print("can train")
            # Market features are built in four steps, each wrapping the
            # output of the previous one in a new CreateMarketFeatures.
            market_features_df = CreateMarketFeatures(
                df=labels).ma_spread_duration()
            market_features_df = CreateMarketFeatures(
                df=market_features_df).ma_spread()
            market_features_df = CreateMarketFeatures(
                market_features_df).chaikin_mf()
            market_features_df = CreateMarketFeatures(
                market_features_df).obv_calc()  # market features dataframe

            # `sort` must be the boolean False: the string 'False' used
            # previously is truthy, and recent pandas releases reject
            # non-boolean values for this keyword.
            df_concat = pd.DataFrame(
                pd.concat([hmm_features, market_features_df],
                          axis=1,
                          sort=False).dropna())

            df = df_concat[df_concat[label_name].notna()]
            # Drop the raw market columns and the label itself so that only
            # model inputs remain.
            df_final = df.drop(columns=[
                'TradedPrice', 'Duration', 'TradedTime', 'ReturnTradedPrice',
                'Volume', label_name
            ])

            y_train = df.reindex(columns=df.columns[df.columns.str.contains(
                pat='label')])  # training labels
            print('go to the labels')

            if df_final.shape[0] < 10:
                # Fewer than 10 usable rows: skip fitting for this key.
                print(
                    ' the ratio of classes is too low. try another label permutation'
                )
            else:
                print("starting model fit")

                Xtr, Xte, Ytr, Yte = train_test_split(df_final,
                                                      y_train,
                                                      test_size=.2,
                                                      random_state=42)
                # training
                arrXtr = np.array(Xtr)
                X_tr = normalization(rescale_01(arrXtr))
                Y_tr = torch.Tensor(Ytr.values.ravel())

                # testing
                # NOTE(review): the test matrix is rescaled on its own, not
                # with the training data - confirm this is intended.
                arrXte = np.array(Xte)
                X_te = normalization(rescale_01(arrXte))
                Y_te = torch.Tensor(Yte.values.ravel())

                # Training kernels: homogeneous polynomial kernels of degree
                # 1..10 plus an identity kernel sized to the training set.
                KLtr = [
                    pairwise.homogeneous_polynomial_kernel(X_tr, degree=d)
                    for d in range(1, 11)
                ] + [identity_kernel(len(Y_tr))]
                # Test kernels: the same degrees between test and training
                # examples; an all-zero matrix of matching size stands in for
                # the identity kernel.
                # NOTE(review): KLte and Y_te are not used in this part of
                # the function.
                KLte = [
                    pairwise.homogeneous_polynomial_kernel(X_te,
                                                           X_tr,
                                                           degree=d)
                    for d in range(1, 11)
                ]
                KLte.append(torch.zeros(KLte[0].size()))
                print('done with kernel')
                try:
                    lam_values = [0.1, 0.2, 1]
                    best_results = {}
                    C_range = [0.1, 1]
                    for C_ch in C_range:
                        print(' fitted the base learner')
                        # possible lambda values for the EasyMKL algorithm
                        for lam in lam_values:
                            print('now here', lam)
                            print(' and tuning lambda for EasyMKL...', end='')
                            # A fresh "soft"-margin SVM for every CV run.
                            base_learner = SVC(C=C_ch)
                            # MKLpy.model_selection.cross_val_score performs
                            # the cross validation automatically; it may
                            # return accuracy, auc, or F1 scores.
                            scores = cross_val_score(KLtr,
                                                     Y_tr,
                                                     EasyMKL(
                                                         learner=base_learner,
                                                         lam=lam),
                                                     n_folds=5,
                                                     scoring='accuracy')
                            acc = np.mean(scores)
                            if not best_results or best_results['score'] < acc:
                                # Record C as well, so the best configuration
                                # is fully identified across the whole grid.
                                best_results = {
                                    'lam': lam,
                                    'C': C_ch,
                                    'score': acc
                                }

                            print('done', best_results)
                            cv_dict_list[(symbol, hmm_date,
                                          label_idx)][(lam, C_ch)] = [
                                              scores, best_results
                                          ]
                            print(cv_dict_list)

                            pickle_out_filename = os.path.join(
                                mainPath,
                                "ExperimentCommonLocs/MKLFittedModels",
                                "_".join((symbol, 'model_fit_date', str(key),
                                          str(alternate_labels_nos[label_idx]),
                                          'MultiKernelSVC.pkl')))
                            print(pickle_out_filename)

                            # Re-dump after every combination so partial
                            # results survive a later failure; the context
                            # manager closes the file even if dump raises.
                            with open(pickle_out_filename, 'wb') as pickle_out:
                                pickle.dump(cv_dict_list, pickle_out)

                except (ValueError, TypeError, EOFError) as err:
                    # Best effort: a failed fit must not abort the caller,
                    # but report it instead of dropping it silently.
                    print('MKL cross-validation failed for key: ', key,
                          repr(err))