Exemplo n.º 1
0
    def test_learning_curve(self):
        """Test learning curve from DB."""
        hv = Hierarchy(db_name='vec_store.sqlite', file_name='hierarchy')

        # Toggle: inspect a varying feature count at fixed data size (True)
        # or a varying data size at a fixed feature count (False).
        featselect_featvar = False
        # When data size varies: feature selection for ridge regression
        # (True) or plain vanilla ridge (False).
        featselect_featconst = True

        # Size of the feature subset under inspection.
        subset = 10
        limit = subset + 2
        if featselect_featvar:
            # Interval of feature-set sizes to inspect (varying features).
            select_limit = [0, 20]
        else:
            # Feature set for which the inspection is made (varying data).
            select_limit = [subset - 1, subset + 1]
        while subset < limit:
            set_size, p_error, result, PC = hierarchy(
                hv, 243, 5, 45,
                new_data=True, ridge=True, scale=True, globalscale=True,
                normalization=True,
                featselect_featvar=featselect_featvar,
                featselect_featconst=featselect_featconst,
                select_limit=select_limit,
                feat_sub=subset)
            # Empty set_size/p_error signals that no subset was produced.
            no_subset = (set_size and p_error) == []
            if not no_subset and not featselect_featvar:
                for data in result:
                    print('data size:', data[0], 'prediction error:', data[1],
                          'Omega:', data[5], 'Euclidean length:', data[2],
                          'Pearson correlation:', data[3])
                subset += 1
            elif no_subset and not featselect_featvar:
                print("No subset {}".format(subset))
                subset += 1
                limit += 1
            if featselect_featvar:
                # Skip ahead; four subplots are not wanted when varying
                # the number of features.
                subset += 4
            select_limit = [subset - 1, subset + 1]
Exemplo n.º 2
0
    def test_hierarchy(self):
        """Function to test the hierarchy with ridge regression predictions."""
        # Fetch the train/test data and set up the hierarchy CV handler.
        train_features, train_targets, test_features, test_targets = get_data()

        hv = Hierarchy(db_name='{}/test.sqlite'.format(wkdir),
                       file_name='hierarchy')
        # Store the training data in the simple db format.
        hv.todb(features=train_features, targets=train_targets)

        # Splitting the data is expected to yield six subsets.
        split = hv.split_index(min_split=5, max_split=25)
        self.assertEqual(len(split), 6)

        # The split reloaded from the save file must match in length.
        ind = hv.load_split()
        self.assertEqual(len(ind), len(split))

        # Predict per subset; each half of the result holds 14 entries.
        pred = hv.split_predict(index_split=ind, predict=predict)
        self.assertTrue(len(pred[0]) == 14 and len(pred[1]) == 14)
Exemplo n.º 3
0
    def test_frequency(self):
        """Compare selected-feature frequency histograms from two runs.

        For each subset size, ``feature_frequency`` is called repeatedly
        twice per iteration — once generating new data and once reusing
        stored data (``smallest=True``) — the selected features are
        collected, histogrammed, and the shorter histogram is zero-padded
        so both can be compared bin-for-bin.
        """
        hv = Hierarchy(db_name='vec_store.sqlite', file_name='hierarchy')
        # Produce frequency data between the lower and upper bound.
        for i in range(20, 22):

            # Inspect the feature-set sizes adjacent to i.
            select_limit = [i - 1, i + 1]
            # Seed arrays with one dummy element; it is deleted below
            # before the histograms are built.
            data1 = np.empty(1, )
            data2 = np.empty(1, )
            hit1, hit2 = 0, 0
            for k in range(1, 4):
                # Run on freshly generated data.
                selected_features1 = feature_frequency(
                    hv,
                    243,
                    3,
                    8,
                    new_data=True,
                    ridge=True,
                    scale=True,
                    globalscale=True,
                    normalization=True,
                    featselect_featvar=False,
                    featselect_featconst=True,
                    select_limit=select_limit,
                    feat_sub=i)
                # Run again on the stored data (smallest subset variant).
                selected_features2 = feature_frequency(
                    hv,
                    243,
                    3,
                    8,
                    smallest=True,
                    new_data=False,
                    ridge=True,
                    scale=True,
                    globalscale=True,
                    normalization=True,
                    featselect_featvar=False,
                    featselect_featconst=True,
                    select_limit=select_limit,
                    feat_sub=i)
                # Count how often each run actually selected features.
                if bool(selected_features1):
                    hit1 += 1
                if bool(selected_features2):
                    hit2 += 1
                # Only accumulate when both runs produced a selection.
                # NOTE(review): assumes the dict's first value is indexable
                # as [1][0] — per feature_frequency's return layout; confirm.
                if bool(selected_features1) and bool(selected_features2):
                    data1 = np.concatenate(
                        (data1,
                         (list(selected_features1.items())[0])[1][0][:]),
                        axis=0)
                    data2 = np.concatenate(
                        (data2,
                         (list(selected_features2.items())[0])[1][0][:]),
                        axis=0)
            # Drop the dummy seed element from each accumulator.
            data1 = np.delete(data1, 0)
            data2 = np.delete(data2, 0)

            data_all = np.concatenate((data1, data2), axis=0)
            if len(data_all) > 0:
                # Shared bin edges spanning both datasets, step 0.5.
                bins = np.arange(min(data_all) - 2, max(data_all) + 2, 0.5)
                hist1 = np.histogram(data1, bins=bins)
                hist2 = np.histogram(data2, bins=bins)
                # Remove empty bins, then express counts as percentages.
                r1_hist1 = np.delete(hist1[0], np.where(hist1[0] == 0))
                r1_hist1 = np.divide(r1_hist1.astype('float'),
                                     len(data1)) * 100
                # Matching bin edges (drop the trailing right edge).
                r2_hist1 = np.delete(
                    np.delete(hist1[1], np.where(hist1[0] == 0)), -1)

                r1_hist2 = np.delete(hist2[0], np.where(hist2[0] == 0))
                r1_hist2 = np.divide(r1_hist2.astype('float'),
                                     len(data2)) * 100
                r2_hist2 = np.delete(
                    np.delete(hist2[1], np.where(hist2[0] == 0)), -1)

                # Zero-pad the shorter histogram so the two have equal
                # length and can be compared element-wise.
                if np.shape(r1_hist2)[0] > np.shape(r1_hist1)[0]:
                    dif = np.shape(r1_hist2)[0] - np.shape(r1_hist1)[0]
                    r1_hist1 = np.concatenate((r1_hist1, np.zeros(dif)),
                                              axis=0)
                    r2_hist1 = np.concatenate((r2_hist1, np.zeros(dif)),
                                              axis=0)
                elif np.shape(r1_hist1)[0] > np.shape(r1_hist2)[0]:
                    dif = np.shape(r1_hist1)[0] - np.shape(r1_hist2)[0]
                    r1_hist2 = np.concatenate((r1_hist2, np.zeros(dif)),
                                              axis=0)
                    r2_hist2 = np.concatenate((r2_hist2, np.zeros(dif)),
                                              axis=0)
Exemplo n.º 4
0
                               test_targets)['rmse_average']
    data['size'] = len(train_targets)

    return data


# ## Cross-validation Setup <a name="cross-validation-setup"></a>
# [(Back to top)](#head)
#
# Next, we can run the cross-validation on the generated data. In order to allow for flexible storage of large numbers of data subsets, we convert the feature and target arrays to a simple db format. This is performed with the `todb()` function. After this, we split up the db index to define the subsets of data with the `split_index()` function. In this case, the maximum amount of data considered is 1000 data points and the smallest set of data will contain a minimum of 50 data points.

# In[5]:


# Initialize the hierarchy cross-validation handler; results are saved
# under 'hierarchy' in the local test.sqlite database.
hv = Hierarchy(db_name='test.sqlite', file_name='hierarchy')
# Convert features and targets to simple db format.
hv.todb(features=features, targets=targets)
# Split the db index into subsets of between 50 and 1000 data points;
# `ind` is reused by the prediction cells below.
ind = hv.split_index(min_split=50, max_split=1000)


# ## Prediction Analysis <a name="prediction-analysis"></a>
# [(Back to top)](#head)
#
# The analysis is first performed with ridge regression. Predictions are made for all subsets of data and the averaged errors plotted against the data size. What is typically observed is that as the size of the data subset increases the error decreases.

# In[ ]:


# Make the predictions for each subset.