Пример #1
0
    def test_single_variate_single_dimension(self):
        """Bootstrap a single variable supplied as a 1-d numpy array.

        Every element of the array is one instance, so after the run the
        bootstrap's ``N`` must equal the array length and its ``B`` must
        equal the requested number of replications.
        """
        replications = 2
        samples = np.array([1, 2, 3, 4, 5])
        expected_instances = samples.shape[0]

        bootstrap = Bootstrap.Bootstrap(samples, np.mean, replications)
        bootstrap.run()

        assert bootstrap.N == expected_instances
        assert bootstrap.B == replications
Пример #2
0
    def test_07a(self):
        """Bootstrap ``self.ratio_first_eigenvector_to_sum`` and chart the outcome.

        The test prints the covariance structure of the empirical data,
        runs 200 bootstrap replications with ``self.my_callback`` registered,
        prints the values returned by ``run()`` and hands the replicated
        statistic to the charting helpers for visual inspection.
        There are no assertions; the output is meant to be read by a person.
        """
        data = my_data.get_data()
        statistic = self.ratio_first_eigenvector_to_sum
        replications = 200

        # Explore the empirical data before any resampling happens.
        covariance = np.cov(data, bias=True, rowvar=False)
        eigenvalues, empirical_vectors = LA.eig(covariance)
        # NOTE(review): presumably LA is a numpy/scipy linalg alias whose eig
        # returns eigenvectors as columns; the transpose puts one per row.
        empirical_vectors = np.transpose(empirical_vectors)

        print("--- empirical data - shape ---")
        print(data.shape)
        for heading, value in (
            ("--- empirical data - covariance matrix ---", covariance),
            ("--- empirical data - eigenvalues ---", eigenvalues),
            ("--- empirical data - eigenvectors ---", empirical_vectors),
        ):
            print(heading)
            print(value)

        # Reset the collector to an empty 3-d array (0 x attributes x attributes).
        # NOTE(review): presumably self.my_callback fills it during the run.
        attribute_count = data.shape[1]
        self.eigenvectors = np.empty((0, attribute_count, attribute_count))

        # Run the bootstrap.
        print("--- run the bootstrap ---")
        bootstrap = Bootstrap.Bootstrap(data, statistic, replications)
        bootstrap.add_callback(self.my_callback)
        std, sem = bootstrap.run()

        for heading, value in (
            ("standard deviation:", std),
            ("standard error of the mean", sem),
        ):
            print(heading)
            print(value)

        # Plot the theta-stars (the measure) from the bootstrap replications.
        # The expectation is a roughly gaussian histogram; long tails are
        # not acceptable.
        print("--- results from bootstrap ---")
        my_charts.plot_histogram(
            bootstrap.theta_star,
            "Count of Occurrences",
            "Ratio: eigenV1/sum(eigen)",
            "Histogram - Count of EigenV1/sum",
        )

        # The first two principal component vectors are inspected with a
        # box-and-whisker plot; we are looking for (lack of) variability.
        print("first two principal components")
        print(self.eigenvectors.shape)

        my_charts.plot_box_and_whisker()
Пример #3
0
    def test_treatment(self):
        """Bootstrap ``self.my_s`` over the treatment sample with 100 replications.

        Prints the size, mean and standard error of the original sample
        first, then runs the bootstrap. Nothing is asserted.
        """
        treatment = np.array([94, 197, 16, 38, 99, 141, 23])
        statistic = self.my_s
        replications = 100

        # Look at the original data before resampling.
        sample_size = treatment.shape[0]
        sample_mean = np.mean(treatment)
        sample_sem = stats.sem(treatment)
        print("Treatment sample size: ", sample_size, " mean: ", sample_mean, "sem: ", sample_sem)

        # NOTE: this is known to be incorrect - the bootstrap should really be
        # run on the DIFFERENCE between treatment and control, not on the
        # treatment sample alone.
        bootstrap = Bootstrap.Bootstrap(treatment, statistic, replications)
        std, sem = bootstrap.run()
Пример #4
0
    def test_single_variate(self):
        """Bootstrap a single variable supplied as a 2-d numpy array.

        The lone attribute lives in column 0 and every row is one instance,
        so the bootstrap's ``N`` must equal the row count and its ``B`` the
        requested number of replications.
        """
        shape = (100, 1)  # (instances, attributes)
        replications = 2

        data = np.random.randint(5, size=shape)

        bootstrap = Bootstrap.Bootstrap(data, np.mean, replications)
        bootstrap.run()

        assert bootstrap.N == shape[0]
        assert bootstrap.B == replications
Пример #5
0
    def test_multi_variate(self):
        """Bootstrap three variables supplied as a 2-d numpy array.

        Attributes are columns and instances are rows, so the bootstrap's
        ``N`` must equal the row count and its ``B`` the requested number
        of replications.
        """
        shape = (6, 3)  # (instances, attributes)
        replications = 2

        data = np.random.randint(5, size=shape)

        bootstrap = Bootstrap.Bootstrap(data, np.mean, replications)
        bootstrap.run()

        assert bootstrap.N == shape[0]
        assert bootstrap.B == replications
Пример #6
0
 def generate_bootstraps(self):  # bootstrapping
     """Append ``self.nTree`` freshly generated bootstraps and return the list.

     Each new ``Bootstrap`` is generated from ``self.original_dataset`` and
     appended to ``self.bootstraps``; that same list object is returned.
     """
     for _ in range(self.nTree):
         sample = Bootstrap()
         sample.generate(self.original_dataset)
         self.bootstraps.append(sample)
     return self.bootstraps
Пример #7
0
# --- configuration ---------------------------------------------------------
BOOTSTRAP_N = 20  # number of bootstrap samples (YOU CAN PLAY AROUND WITH THIS)
DATA_START_INDEX = 1  # account for df's named index column 0 (DON'T CHANGE THIS UNLESS YOUR DATASET NEEDS IT)
DO_K_SWEEP = True  # switch to do sweep of K values using K means to find optimal K
OPTIMAL_K = 3  # Iris dataset has 3 clusters (ground truth), change this for different datasets
# NOTE(review): BOOTSTRAP_SIZE and MAX_K are used below but are not defined in
# this excerpt; they must be defined before this point for the script to run.

# import data
iris = datasets.load_iris()
df = pd.DataFrame(data=np.c_[iris['data']], columns=iris['feature_names'])

# prepare data (add index column 'flower')
prep = Prepare('flower', len(df)).names_join(df)
df = prep['df']
labels = prep['labels']

# generate bootstrap samples
bts = Bootstrap(df, BOOTSTRAP_SIZE, BOOTSTRAP_N).get_bootstraps()

# determine optimal clustering K
kmeans = Bootstrap.kmeans_bootstrap(bts, DO_K_SWEEP, BOOTSTRAP_N,
                                    DATA_START_INDEX, OPTIMAL_K, MAX_K)

# Run the three clustering algorithms on the bootstrap samples.
# The cluster count comes from OPTIMAL_K rather than a repeated literal, so
# changing the constant above actually takes effect here.
gmm = RunAlgos(OPTIMAL_K, BOOTSTRAP_N, DATA_START_INDEX, bts, kmeans).run_GMM()
agglomerative = RunAlgos(OPTIMAL_K, BOOTSTRAP_N, DATA_START_INDEX, bts,
                         kmeans).run_Agglomerative()
kmeans_ = RunAlgos(OPTIMAL_K, BOOTSTRAP_N, DATA_START_INDEX, bts,
                   kmeans).run_KMeans()

# consensus clustering
# NOTE(review): the seventh argument was the literal 3 and is assumed to be
# the cluster count as well - confirm against Consensus.__init__.
cc_init = Consensus(kmeans_, gmm, agglomerative, bts, df, DATA_START_INDEX,
                    OPTIMAL_K, labels)
mats = cc_init.combine_results()
Пример #8
0
def main(args):
    """Create the application object, publish it as global ``app`` and run it.

    Args:
        args: Passed unchanged to the ``Bootstrap.Bootstrap`` constructor.

    The ``global`` declaration comes before any binding of ``app``: Python 3
    rejects a function that assigns a name (here via ``import ... as app``)
    before declaring it global. The module is therefore imported under its
    own local name, and only the application instance is stored in ``app``.
    """
    global app
    import Bootstrap as bootstrap_module

    app = bootstrap_module.Bootstrap(args)
    app.exec_()