示例#1
0
def converter_multidimensionalscaling_modular(data_fname):
    """Check that multidimensional scaling preserves pairwise distances.

    Loads real-valued features from the CSV file ``data_fname``, embeds them
    into two dimensions with ``MultidimensionalScaling`` (landmark mode
    disabled), and compares the Euclidean distance matrix of the embedding
    with the distance matrix of the original features.

    Returns:
        The boolean result of testing whether the norm of the difference
        between both distance matrices, divided by the norm of the original
        distance matrix, is below 1e-6. If an ``ImportError`` is raised
        inside the ``try`` block, a message is printed and the function
        falls through, implicitly returning ``None``.
    """
    try:
        import numpy
        from modshogun import RealFeatures, MultidimensionalScaling, EuclideanDistance, CSVFile

        features = RealFeatures(CSVFile(data_fname))

        # Distances between the original feature vectors.
        distance_before = EuclideanDistance()
        distance_before.init(features, features)

        converter = MultidimensionalScaling()
        converter.set_target_dim(2)
        converter.set_landmark(False)
        embedding = converter.apply(features)

        # Distances between the embedded feature vectors.
        distance_after = EuclideanDistance()
        distance_after.init(embedding, embedding)

        distance_matrix_after = distance_after.get_distance_matrix()
        distance_matrix_before = distance_before.get_distance_matrix()

        # Relative difference between the two distance matrices.
        return numpy.linalg.norm(distance_matrix_after -
                                 distance_matrix_before) / numpy.linalg.norm(
                                     distance_matrix_before) < 1e-6
    except ImportError:
        print('No Eigen3 available')
        # NOTE(review): the function below is defined inside this ``except``
        # handler but is never called or returned here. It reads ``self`` and
        # ``options`` (and uses Timer, Log, np, Hierarchical, ...), none of
        # which are defined in the visible code. It looks like a separate
        # snippet that ended up nested here -- confirm before relying on it.
        def RunHierarchicalShogun():
            """Run hierarchical clustering on ``self.dataset`` and time it.

            Reads the required ``"merges"`` entry and the optional
            ``"distance"`` entry from ``options`` (removing both); any entry
            left over afterwards is treated as an error.

            Returns:
                ``[elapsed_time, merge_distances, cluster_pairs]`` on
                success, or ``[-1]`` if any exception is raised.
            """
            totalTimer = Timer()

            try:
                # Load input dataset.
                Log.Info("Loading dataset", self.verbose)
                dataPoints = np.genfromtxt(self.dataset, delimiter=',')
                # The data is transposed before being wrapped as features.
                dataFeat = RealFeatures(dataPoints.T)

                # Gather all the parameters.
                if "merges" in options:
                    merges = int(options.pop("merges"))
                else:
                    Log.Fatal(
                        "Missing parameter: number of merges to be done while clustering bottom up"
                    )
                    raise Exception("missing parameter")

                # if distance metric specified, select it, otherwise Euclidean distance by default
                if "distance" in options:
                    distance = str(options.pop("distance"))
                    distance = distance.lower()
                    if distance not in [
                            "euclidean", "cosine", "manhattan", "chebyshev"
                    ]:
                        Log.Fatal(
                            "Distance option should be one of Euclidean, Manhattan, Cosine or Chebyshev only"
                        )
                        raise Exception("unknown distance metric")
                    # From here on ``distance`` is rebound from the option
                    # string to the matching distance object.
                    if distance == "euclidean":
                        distance = EuclideanDistance(dataFeat, dataFeat)
                    elif distance == "manhattan":
                        distance = ManhattanMetric(dataFeat, dataFeat)
                    elif distance == "cosine":
                        distance = CosineDistance(dataFeat, dataFeat)
                    elif distance == "chebyshev":
                        distance = ChebyshewMetric(dataFeat, dataFeat)
                else:
                    # distance option not specified, default to Euclidean distance
                    distance = EuclideanDistance(dataFeat, dataFeat)

                # Every recognised option was popped above, so anything left
                # is unknown.
                if (len(options) > 0):
                    Log.Fatal("Unknown options: " + str(options))
                    raise Exception("unknown options")

                # Create the Hierarchical object and perform Hierarchical clustering.
                # Only construction and training happen inside the timer.
                with totalTimer:
                    model = Hierarchical(merges, distance)
                    model.train()

                merge_distances = model.get_merge_distances()
                cluster_pairs = model.get_cluster_pairs()

            except Exception as e:
                # Any failure (including the ones raised above) is logged and
                # reported to the caller as [-1].
                Log.Info("Exception: " + str(e))
                return [-1]

            return [totalTimer.ElapsedTime(), merge_distances, cluster_pairs]
示例#3
0
    def predict(self, image):
        """
        Predict the label of the stored face closest to ``image``.

        The image is flattened, projected with ``self.pca`` and compared
        (Euclidean distance) against every entry of ``self._projections``.

        Returns the entry of ``self._labels`` belonging to the closest
        projection, or -1 when there are no stored projections (or none is
        closer than 1e100).
        """
        # Flatten the 2-D image into a single column (one feature vector).
        pixel_count = image.shape[0] * image.shape[1]
        image_as_column = np.asarray(image.reshape(pixel_count, 1), np.float64)

        # Project the image into the PCA subspace.
        projected = self.pca.apply_to_feature_vector(
            RealFeatures(image_as_column).get_feature_vector(0))

        # The query features do not depend on the loop variable, so they are
        # built once instead of once per stored projection.
        query = RealFeatures(np.asmatrix(projected, np.float64).T)

        # Smallest distance seen so far and the label that produced it.
        min_dist = 1e100
        min_class = -1

        # Linear search for the best matching stored face.
        for sample_idx, stored in enumerate(self._projections):
            candidate = RealFeatures(np.asmatrix(stored, np.float64).T)
            dist = EuclideanDistance(query, candidate).distance(0, 0)

            if dist < min_dist:
                min_dist = dist
                min_class = self._labels[sample_idx]

        return min_class
示例#4
0
        def RunAllKnnShogun(q):
            """Run shogun k-nearest-neighbours and report the elapsed time.

            ``self.dataset`` is either a single reference file or a pair
            ``[reference_file, query_file]``. The number of neighbours is
            parsed from ``options`` as ``-k <int>``.

            The elapsed time is put on the queue ``q`` and returned. On any
            failure (missing or invalid ``k``, or an exception) ``-1`` is put
            on the queue and returned instead.
            """
            totalTimer = Timer()

            # Load input dataset.
            # If the dataset contains two files then the second file is the query
            # file.
            try:
                Log.Info("Loading dataset", self.verbose)
                if len(self.dataset) == 2:
                    referenceData = np.genfromtxt(self.dataset[0],
                                                  delimiter=',')
                    queryData = np.genfromtxt(self.dataset[1], delimiter=',')
                    # Build the query features from the loaded query data
                    # (previously this read the not-yet-assigned queryFeat).
                    queryFeat = RealFeatures(queryData.T)
                else:
                    referenceData = np.genfromtxt(self.dataset, delimiter=',')

                # Labels are the last column of the reference data.
                labels = MulticlassLabels(referenceData[:, -1])
                referenceData = referenceData[:, :-1]

                with totalTimer:
                    # Get all the parameters.
                    match = re.search(r"-k (\d+)", options)
                    if not match:
                        Log.Fatal(
                            "Required option: Number of furthest neighbors to find."
                        )
                        q.put(-1)
                        return -1

                    k = int(match.group(1))
                    if k < 1 or k > referenceData.shape[0]:
                        # Use str(k): k is already an int at this point.
                        Log.Fatal("Invalid k: " + str(k) +
                                  "; must be greater than 0" +
                                  " and less or equal than " +
                                  str(referenceData.shape[0]))
                        q.put(-1)
                        return -1

                    referenceFeat = RealFeatures(referenceData.T)
                    distance = EuclideanDistance(referenceFeat, referenceFeat)

                    # Perform All K-Nearest-Neighbors.
                    model = SKNN(k, distance, labels)
                    model.train()

                    if len(self.dataset) == 2:
                        out = model.apply(queryFeat).get_labels()
                    else:
                        out = model.apply(referenceFeat).get_labels()
            except Exception as e:
                # Report the failure instead of discarding it silently.
                Log.Info("Exception: " + str(e), self.verbose)
                q.put(-1)
                return -1

            time = totalTimer.ElapsedTime()
            q.put(time)
            return time
示例#5
0
    def RunKMeansShogun(q):
      """Run shogun K-Means on ``self.dataset`` and report the elapsed time.

      ``self.dataset[0]`` is the data file; when a second entry is present it
      is loaded as the initial centroids. ``options`` is a string parsed for
      ``-c <clusters>``, ``-m <max iterations>`` (default 1000) and
      ``-s <seed>``.

      The elapsed time is put on the queue ``q`` and returned; on invalid
      options or on an exception ``-1`` is put on the queue and returned.
      """
      totalTimer = Timer()

      # Load input dataset.
      # If the dataset contains two files then the second file is the centroids
      # file.
      Log.Info("Loading dataset", self.verbose)
      data = np.genfromtxt(self.dataset[0], delimiter=',')
      if len(self.dataset) == 2:
        centroids = np.genfromtxt(self.dataset[1], delimiter=',')

      # Gather parameters (raw strings keep "\d" a regex escape).
      clusters = re.search(r"-c (\d+)", options)
      maxIterations = re.search(r"-m (\d+)", options)
      seed = re.search(r"-s (\d+)", options)

      # Now do validation of options. The cluster count is only mandatory
      # when no centroids file was given.
      if len(self.dataset) != 2:
        if not clusters:
          Log.Fatal("Required option: Number of clusters or cluster locations.")
          q.put(-1)
          return -1
        if int(clusters.group(1)) < 1:
          Log.Fatal("Invalid number of clusters requested! Must be greater than"
              + " or equal to 1.")
          q.put(-1)
          return -1

      m = 1000 if not maxIterations else int(maxIterations.group(1))

      if seed:
        # The regex group is a string; pass the seed as an integer.
        Math_init_random(int(seed.group(1)))
      try:
        dataFeat = RealFeatures(data.T)
        distance = EuclideanDistance(dataFeat, dataFeat)

        # Create the K-Means object and perform K-Means clustering.
        with totalTimer:
          if len(self.dataset) == 2:
            model = KMeans(int(clusters.group(1)), distance, RealFeatures(centroids))
          else:
            model = KMeans(int(clusters.group(1)), distance)

          model.set_mbKMeans_iter(m)
          model.train()

          labels = model.apply().get_labels()
          centers = model.get_cluster_centers()
      except Exception as e:
        print(e)
        q.put(-1)
        return -1

      time = totalTimer.ElapsedTime()
      q.put(time)
      return time
示例#6
0
        def RunKMeansShogun(q):
            """Run shogun K-Means on ``self.dataset`` and report the elapsed time.

            ``self.dataset[0]`` is the data file; when a second entry is
            present it is loaded as the initial centroids. ``options`` is a
            dict from which ``"clusters"``, ``"max_iterations"`` and
            ``"seed"`` are popped; any key left over is an error.

            The elapsed time is put on the queue ``q`` and returned. ``-1``
            is put on the queue and returned when the cluster count is
            missing or training raises.

            Raises:
                Exception: if ``options`` contains unknown keys.
            """
            totalTimer = Timer()

            # Load input dataset.
            # If the dataset contains two files then the second file is the centroids
            # file.
            Log.Info("Loading dataset", self.verbose)
            data = np.genfromtxt(self.dataset[0], delimiter=',')
            if len(self.dataset) == 2:
                centroids = np.genfromtxt(self.dataset[1], delimiter=',')

            # Gather parameters.
            if "clusters" in options:
                clusters = int(options.pop("clusters"))
            elif len(self.dataset) != 2:
                Log.Fatal(
                    "Required option: Number of clusters or cluster locations."
                )
                q.put(-1)
                return -1

            # Default to 1000 iterations when the option is absent. The
            # original read an undefined name ``m`` below, so every run
            # failed with a NameError.
            maxIterations = 1000
            if "max_iterations" in options:
                maxIterations = int(options.pop("max_iterations"))

            seed = None
            if "seed" in options:
                seed = int(options.pop("seed"))

            if len(options) > 0:
                Log.Fatal("Unknown parameters: " + str(options))
                raise Exception("unknown parameters")

            # ``is not None`` so that an explicit seed of 0 is honoured too.
            if seed is not None:
                Math_init_random(seed)
            try:
                dataFeat = RealFeatures(data.T)
                distance = EuclideanDistance(dataFeat, dataFeat)

                # Create the K-Means object and perform K-Means clustering.
                with totalTimer:
                    if len(self.dataset) == 2:
                        model = KMeans(clusters, distance, centroids.T)
                    else:
                        model = KMeans(clusters, distance)

                    model.set_max_iter(maxIterations)
                    model.train()

                    labels = model.apply().get_labels()
                    centers = model.get_cluster_centers()
            except Exception as e:
                print(e)
                q.put(-1)
                return -1

            time = totalTimer.ElapsedTime()
            q.put(time)
            return time
    def load_train(self):
        """Load images and labels via ``self.load`` and train ``self.knn``.

        The loaded arrays replace ``self.test_images`` / ``self.test_labels``
        and a ``KNN`` with ``self.k`` neighbours and a Euclidean distance is
        trained on them.
        """
        # NOTE(review): despite the method name, this reads and overwrites
        # the ``test_*`` attributes -- confirm that is intended.
        loaded_images, loaded_labels = self.load(self.test_images,
                                                 self.test_labels)
        self.test_images = loaded_images
        self.test_labels = loaded_labels

        knn_labels = MulticlassLabels(self.test_labels)
        knn_features = RealFeatures(self.test_images.T)
        metric = EuclideanDistance()

        self.knn = KNN(self.k, metric, knn_labels)
        self.knn.train(knn_features)
def kernel_cauchy_modular(train_fname=traindat, test_fname=testdat, sigma=1.0):
    """Compute Cauchy kernel matrices for CSV train and test features.

    Returns a tuple ``(km_train, km_test, kernel)``: the kernel matrix of
    the training features against themselves, the kernel matrix of training
    against test features, and the kernel object itself.
    """
    from modshogun import RealFeatures, CauchyKernel, CSVFile, EuclideanDistance

    train_feats = RealFeatures(CSVFile(train_fname))
    test_feats = RealFeatures(CSVFile(test_fname))

    # The kernel is built on top of a Euclidean distance over the train data.
    metric = EuclideanDistance(train_feats, train_feats)
    cauchy = CauchyKernel(train_feats, train_feats, sigma, metric)
    train_matrix = cauchy.get_kernel_matrix()

    # Re-initialise the same kernel for train-vs-test evaluation.
    cauchy.init(train_feats, test_feats)
    test_matrix = cauchy.get_kernel_matrix()

    return train_matrix, test_matrix, cauchy
示例#9
0
def clustering_hierarchical_modular(fm_train=traindat, merges=3):
    """Run hierarchical clustering on features loaded from a CSV file.

    Returns a tuple ``(hierarchical, merge_distances, cluster_pairs)``.
    """
    from modshogun import EuclideanDistance, RealFeatures, Hierarchical, CSVFile

    train_feats = RealFeatures(CSVFile(fm_train))
    metric = EuclideanDistance(train_feats, train_feats)

    clusterer = Hierarchical(merges, metric)
    clusterer.train()

    merge_distances = clusterer.get_merge_distances()
    cluster_pairs = clusterer.get_cluster_pairs()
    return clusterer, merge_distances, cluster_pairs
示例#10
0
def knn(train_features, train_labels, test_features, test_labels, k=1):
    """Train a k-nearest-neighbour classifier and print its error rates.

    A ``KNN`` with a Euclidean distance is trained on ``train_features`` /
    ``train_labels`` and evaluated with ``MulticlassAccuracy`` on both the
    training and the test data. The errors are printed as
    ``(1 - accuracy) * 100``. Nothing is returned.
    """
    from modshogun import KNN, MulticlassAccuracy, EuclideanDistance

    distance = EuclideanDistance(train_features, train_features)
    # Named ``classifier`` so the local does not shadow this function.
    classifier = KNN(k, distance, train_labels)
    classifier.train()

    train_output = classifier.apply()
    test_output = classifier.apply(test_features)

    evaluator = MulticlassAccuracy()
    train_error = (1 - evaluator.evaluate(train_output, train_labels)) * 100
    test_error = (1 - evaluator.evaluate(test_output, test_labels)) * 100

    # print() with a single pre-formatted argument gives the same output on
    # Python 2 and Python 3; the print statement is a syntax error on 3.
    print('KNN training error is %.4f' % train_error)
    print('KNN test error is %.4f' % test_error)
示例#11
0
def run_clustering(data, k):
    """Cluster ``data`` with K-Means into ``k`` clusters.

    Returns the cluster centers reported by the trained ``KMeans`` object.
    """
    from modshogun import KMeans
    from modshogun import EuclideanDistance
    from modshogun import RealFeatures

    features = RealFeatures(data)
    metric = EuclideanDistance(features, features)

    clusterer = KMeans(k, metric)
    clusterer.train()
    centers = clusterer.get_cluster_centers()
    return centers
示例#12
0
def kernel_exponential_modular(train_fname=traindat, test_fname=testdat, tau_coef=1.0):
    """Compute exponential kernel matrices for CSV train and test features.

    Returns a tuple ``(km_train, km_test, kernel)``: the train-vs-train
    kernel matrix, the train-vs-test kernel matrix and the kernel object.
    """
    from modshogun import RealFeatures, ExponentialKernel, EuclideanDistance, CSVFile

    train_feats = RealFeatures(CSVFile(train_fname))
    test_feats = RealFeatures(CSVFile(test_fname))

    metric = EuclideanDistance(train_feats, train_feats)
    # The trailing 10 is passed through unchanged (presumably the kernel
    # cache size -- confirm against the shogun API).
    exp_kernel = ExponentialKernel(train_feats, train_feats, tau_coef,
                                   metric, 10)
    train_matrix = exp_kernel.get_kernel_matrix()

    # Re-initialise the same kernel for train-vs-test evaluation.
    exp_kernel.init(train_feats, test_feats)
    test_matrix = exp_kernel.get_kernel_matrix()

    return train_matrix, test_matrix, exp_kernel
示例#13
0
    def BuildModel(self, data, labels, options):
        """Build and train a KD-tree based ``KNN`` classifier.

        ``options`` is a string; the neighbour count is parsed from
        ``-n <int>`` and defaults to 5. The value is stored in
        ``self.n_neighbors``.

        Returns the trained ``KNN`` object.
        """
        from modshogun import KNN_KDTREE

        # Get all the parameters. A raw string keeps "\d" a regex escape
        # rather than an invalid string escape.
        n = re.search(r"-n (\d+)", options)

        self.n_neighbors = 5 if not n else int(n.group(1))

        distance = EuclideanDistance(data, data)
        knc = KNN(self.n_neighbors, distance, labels, KNN_KDTREE)
        knc.set_leaf_size(30)
        knc.train()

        return knc
示例#14
0
def assign_labels(data, centroids, ncenters):
    """Assign each vector in ``data`` to its nearest centroid.

    A 1-nearest-neighbour classifier is trained on ``centroids`` with the
    labels ``0 .. ncenters - 1`` and then applied to ``data``.

    Returns the result of ``KNN.apply`` on the data features.
    """
    from modshogun import EuclideanDistance
    from modshogun import RealFeatures, MulticlassLabels
    from modshogun import KNN
    from numpy import arange

    centroid_labels = MulticlassLabels(arange(0., ncenters))
    data_feats = RealFeatures(data)
    centroid_feats = RealFeatures(centroids)

    metric = EuclideanDistance(centroid_feats, centroid_feats)
    nearest = KNN(1, metric, centroid_labels)
    nearest.train()

    assignment = nearest.apply(data_feats)
    return assignment
示例#15
0
def clustering_kmeans_modular(fm_train=traindat, k=3):
    """Run K-Means with a fixed random seed on features from a CSV file.

    Returns a tuple ``(cluster_centers, kmeans)``.
    """
    from modshogun import EuclideanDistance, RealFeatures, KMeans, Math_init_random, CSVFile

    # Fixed seed, set before any feature or model object is created.
    Math_init_random(17)

    train_feats = RealFeatures(CSVFile(fm_train))
    metric = EuclideanDistance(train_feats, train_feats)

    clusterer = KMeans(k, metric)
    clusterer.train()

    centers = clusterer.get_cluster_centers()
    # The radiuses are requested but their value is not used.
    clusterer.get_radiuses()

    return centers, clusterer
示例#16
0
def kernel_rationalquadratic_modular(train_fname=traindat, test_fname=testdat, shift_coef=1.0):
    """Compute rational quadratic kernel matrices for CSV train/test features.

    Returns a tuple ``(km_train, km_test, kernel)``: the train-vs-train
    kernel matrix, the train-vs-test kernel matrix and the kernel object.
    """
    from modshogun import RealFeatures, RationalQuadraticKernel, EuclideanDistance, CSVFile

    train_feats = RealFeatures(CSVFile(train_fname))
    test_feats = RealFeatures(CSVFile(test_fname))

    metric = EuclideanDistance(train_feats, train_feats)
    rq_kernel = RationalQuadraticKernel(train_feats, train_feats,
                                        shift_coef, metric)
    train_matrix = rq_kernel.get_kernel_matrix()

    # Re-initialise the same kernel for train-vs-test evaluation.
    rq_kernel.init(train_feats, test_feats)
    test_matrix = rq_kernel.get_kernel_matrix()

    return train_matrix, test_matrix, rq_kernel
def distance_normsquared_modular(train_fname=traindat, test_fname=testdat):
    """Compute Euclidean distance matrices with the square root disabled.

    Returns a tuple ``(distance, dm_train, dm_test)``: the distance object,
    the train-vs-train distance matrix and the train-vs-test matrix.
    """
    from modshogun import RealFeatures, EuclideanDistance, CSVFile

    train_feats = RealFeatures(CSVFile(train_fname))
    test_feats = RealFeatures(CSVFile(test_fname))

    metric = EuclideanDistance(train_feats, train_feats)
    metric.set_disable_sqrt(True)
    train_matrix = metric.get_distance_matrix()

    # Re-initialise the same distance object for train-vs-test evaluation.
    metric.init(train_feats, test_feats)
    test_matrix = metric.get_distance_matrix()

    return metric, train_matrix, test_matrix
示例#18
0
def kernel_power_modular(train_fname=traindat, test_fname=testdat, degree=2.0):
    """Compute power kernel matrices for CSV train and test features.

    Returns a tuple ``(km_train, km_test, kernel)``: the train-vs-train
    kernel matrix, the train-vs-test kernel matrix and the kernel object.
    """
    from modshogun import RealFeatures, PowerKernel, EuclideanDistance, CSVFile

    train_feats = RealFeatures(CSVFile(train_fname))
    test_feats = RealFeatures(CSVFile(test_fname))

    metric = EuclideanDistance(train_feats, train_feats)
    power = PowerKernel(train_feats, train_feats, degree, metric)
    train_matrix = power.get_kernel_matrix()

    # Re-initialise the same kernel for train-vs-test evaluation.
    power.init(train_feats, test_feats)
    test_matrix = power.get_kernel_matrix()

    return train_matrix, test_matrix, power
示例#19
0
def kernel_distance_modular(train_fname=traindat,
                            test_fname=testdat,
                            width=1.7):
    """Compute distance-kernel matrices for CSV train and test features.

    Returns a tuple ``(km_train, km_test, kernel)``: the train-vs-train
    kernel matrix, the train-vs-test kernel matrix and the kernel object.
    """
    from modshogun import RealFeatures, DistanceKernel, EuclideanDistance, CSVFile

    feats_train = RealFeatures(CSVFile(train_fname))
    feats_test = RealFeatures(CSVFile(test_fname))

    distance = EuclideanDistance()
    # Initialise on (train, train) so that km_train really is the training
    # kernel matrix. The kernel was previously built on (train, test), which
    # made km_train identical to km_test.
    kernel = DistanceKernel(feats_train, feats_train, width, distance)
    km_train = kernel.get_kernel_matrix()

    # Re-initialise the same kernel for train-vs-test evaluation.
    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
示例#20
0
def kernel_spherical_modular(fm_train_real=traindat, fm_test_real=testdat, sigma=1.0):
    """Compute kernel matrices for in-memory train and test feature matrices.

    Returns a tuple ``(km_train, km_test, kernel)``: the train-vs-train
    kernel matrix, the train-vs-test kernel matrix and the kernel object.
    """
    from modshogun import RealFeatures
    from modshogun import MultiquadricKernel
    from modshogun import EuclideanDistance

    train_feats = RealFeatures(fm_train_real)
    test_feats = RealFeatures(fm_test_real)

    metric = EuclideanDistance(train_feats, train_feats)

    # NOTE(review): the function is named "spherical" but builds a
    # MultiquadricKernel -- confirm which kernel is intended.
    mq_kernel = MultiquadricKernel(train_feats, train_feats, sigma, metric)
    train_matrix = mq_kernel.get_kernel_matrix()

    # Re-initialise the same kernel for train-vs-test evaluation.
    mq_kernel.init(train_feats, test_feats)
    test_matrix = mq_kernel.get_kernel_matrix()

    return train_matrix, test_matrix, mq_kernel
示例#21
0
    def BuildModel(self, data, labels, options):
        """Build and train a KD-tree based ``KNN`` classifier.

        ``options`` is a dict; the required key ``"k"`` (number of
        neighbours) is popped from it and stored in ``self.n_neighbors``.
        Any key left over afterwards is an error.

        Returns the trained ``KNN`` object.

        Raises:
            Exception: if ``"k"`` is missing or unknown keys remain.
        """
        # Get all the parameters.
        if "k" in options:
            # Store on self: the KNN below reads self.n_neighbors. The value
            # previously went into an unused local, so the requested k was
            # ignored.
            self.n_neighbors = int(options.pop("k"))
        else:
            Log.Fatal("Required parameter 'k' not specified!")
            raise Exception("missing parameter")

        if len(options) > 0:
            Log.Fatal("Unknown parameters: " + str(options))
            raise Exception("unknown parameters")

        distance = EuclideanDistance(data, data)
        knc = KNN(self.n_neighbors, distance, labels, KNN_KDTREE)
        knc.train()

        return knc
示例#22
0
def knn_classify(traindat, testdat, k=3):
    """Train a KNN classifier on ``traindat`` and return its test error.

    Both arguments must expose ``features`` and ``labels`` attributes.
    The returned error is ``1 - accuracy`` as measured by
    ``MulticlassAccuracy`` on ``testdat``.
    """
    from modshogun import KNN, MulticlassAccuracy, EuclideanDistance

    fit_features = traindat.features
    fit_labels = traindat.labels

    metric = EuclideanDistance(fit_features, fit_features)
    classifier = KNN(k, metric, fit_labels)
    classifier.train()

    eval_features = testdat.features
    eval_labels = testdat.labels

    predictions = classifier.apply(eval_features)
    accuracy = MulticlassAccuracy().evaluate(predictions, eval_labels)
    return 1 - accuracy
示例#23
0
def distance_director_euclidean_modular(fm_train_real=traindat,
                                        fm_test_real=testdat,
                                        scale=1.2):
    """Compare the built-in Euclidean distance with a director-based one.

    A Python subclass of ``DirectorDistance`` that computes the norm of the
    difference of two feature vectors is evaluated on the same train/test
    features as ``EuclideanDistance``.

    Returns a tuple ``(dm_train, ddm_train)`` holding both distance
    matrices, or ``None`` (after printing a hint) when ``DirectorDistance``
    cannot be imported. ``scale`` is accepted but not used.
    """
    try:
        from modshogun import DirectorDistance
    except ImportError:
        print("recompile shogun with --enable-swig-directors")
        return None

    from modshogun import EuclideanDistance
    from modshogun import Time

    class DirectorEuclideanDistance(DirectorDistance):
        """Euclidean distance implemented in Python on top of the director."""

        def __init__(self):
            DirectorDistance.__init__(self, True)

        def distance_function(self, idx_a, idx_b):
            """Return the norm of the difference of the two feature vectors."""
            left_vec = self.get_lhs().get_feature_vector(idx_a)
            right_vec = self.get_rhs().get_feature_vector(idx_b)
            return numpy.linalg.norm(left_vec - right_vec)

    train_feats = RealFeatures(fm_train_real)
    # Restrict the training features to a single thread.
    train_feats.parallel.set_num_threads(1)
    test_feats = RealFeatures(fm_test_real)

    builtin_distance = EuclideanDistance()
    builtin_distance.init(train_feats, test_feats)

    director_distance = DirectorEuclideanDistance()
    director_distance.init(train_feats, test_feats)

    # A Time object is created before each matrix computation; the elapsed
    # time itself is not read.
    timer = Time()
    builtin_matrix = builtin_distance.get_distance_matrix()

    timer = Time()
    director_matrix = director_distance.get_distance_matrix()

    return builtin_matrix, director_matrix
示例#24
0
def kernel_wave_modular(fm_train_real=traindat,
                        fm_test_real=testdat,
                        theta=1.0):
    """Compute wave kernel matrices for in-memory train/test feature matrices.

    Returns a tuple ``(km_train, km_test, kernel)``: the train-vs-train
    kernel matrix, the train-vs-test kernel matrix and the kernel object.
    """
    from modshogun import RealFeatures
    from modshogun import WaveKernel
    from modshogun import EuclideanDistance

    train_feats = RealFeatures(fm_train_real)
    test_feats = RealFeatures(fm_test_real)

    metric = EuclideanDistance(train_feats, train_feats)
    wave = WaveKernel(train_feats, train_feats, theta, metric)
    train_matrix = wave.get_kernel_matrix()

    # Re-initialise the same kernel for train-vs-test evaluation.
    wave.init(train_feats, test_feats)
    test_matrix = wave.get_kernel_matrix()

    return train_matrix, test_matrix, wave
示例#25
0
def kernel_tstudent_modular(fm_train_real=traindat,
                            fm_test_real=testdat,
                            degree=2.0):
    """Compute T-Student kernel matrices for in-memory train/test matrices.

    Returns a tuple ``(km_train, km_test, kernel)``: the train-vs-train
    kernel matrix, the train-vs-test kernel matrix and the kernel object.
    """
    from modshogun import RealFeatures
    from modshogun import TStudentKernel
    from modshogun import EuclideanDistance

    train_feats = RealFeatures(fm_train_real)
    test_feats = RealFeatures(fm_test_real)

    metric = EuclideanDistance(train_feats, train_feats)
    tstudent = TStudentKernel(train_feats, train_feats, degree, metric)
    train_matrix = tstudent.get_kernel_matrix()

    # Re-initialise the same kernel for train-vs-test evaluation.
    tstudent.init(train_feats, test_feats)
    test_matrix = tstudent.get_kernel_matrix()

    return train_matrix, test_matrix, tstudent
def classifier_knn_modular(train_fname=traindat,
                           test_fname=testdat,
                           label_train_fname=label_traindat,
                           k=3):
    """Train a KNN classifier from CSV files and classify the test features.

    Returns a tuple ``(knn, knn_train, output, multiple_k)``: the classifier,
    the return value of ``knn.train()``, the predicted test labels and the
    result of ``knn.classify_for_multiple_k()``.
    """
    from modshogun import RealFeatures, MulticlassLabels, KNN, EuclideanDistance, CSVFile

    train_feats = RealFeatures(CSVFile(train_fname))
    test_feats = RealFeatures(CSVFile(test_fname))
    metric = EuclideanDistance(train_feats, train_feats)

    train_labels = MulticlassLabels(CSVFile(label_train_fname))

    classifier = KNN(k, metric, train_labels)
    train_result = classifier.train()

    predicted = classifier.apply(test_feats)
    predicted_labels = predicted.get_labels()
    per_k_result = classifier.classify_for_multiple_k()

    return classifier, train_result, predicted_labels, per_k_result
示例#27
0
    def KNNAccuracy(distance, data, k, flag):
        """Return the k-NN accuracy (in percent) under a linear transform.

        ``data[0]`` is multiplied by ``distance.T`` and a ``KNN`` with
        ``k + 1`` neighbours is trained on the result; ``data[1]`` supplies
        the labels. Each point is then classified by a vote of its
        neighbours: weighted by ``1 / (distance + 1)**2`` in the original
        space when ``flag`` is true, otherwise (almost) uniformly.
        """
        transformedData = np.dot(data[0], distance.T)
        feat = RealFeatures(transformedData.T)
        labels = MulticlassLabels(data[1].astype(np.float64))
        dist = EuclideanDistance(feat, feat)
        knn = KNN(k + 1, dist, labels)
        knn.train(feat)
        # Get nearest neighbors and drop the first row (presumably each
        # point itself, which is why k + 1 neighbours are requested).
        nn = knn.nearest_neighbors()
        nn = np.delete(nn, 0, 0)
        # Compute unique labels.
        uniqueLabels = np.unique(labels)
        # Keep count correct predictions.
        count = 0
        # Normalize labels: replace every label by its index in uniqueLabels.
        for i in range(data[0].shape[0]):
            for j in range(len(uniqueLabels)):
                if (labels[i] == uniqueLabels[j]):
                    labels[i] = j
                    break

        for i in range(nn.shape[1]):
            # Vote weight collected per (normalized) label.
            mapLabels = [0] * len(uniqueLabels)
            for j in range(nn.shape[0]):
                if (flag):
                    distPoints = np.linalg.norm(data[0][nn[j][i], :] -
                                                data[0][i, :])
                    # Add constant factor of 1 in case two points overlap.
                    mapLabels[int(labels[nn[j, i]])] += 1 / (distPoints + 1)**2
                else:
                    # Subtract a variable factor to avoid draw condition without
                    # affecting actual result.
                    mapLabels[int(labels[nn[j, i]])] += 1 - j * 1e-8
            maxInd = np.argmax(mapLabels)
            if (maxInd == labels[i]):
                count += 1
        # float() forces true division, so Python 2 does not truncate the
        # ratio to 0 or 1.
        accuracy = (float(count) / nn.shape[1]) * 100
        return accuracy
示例#28
0
def hsic_graphical():
    """Plot sample data and HSIC null/alternative distributions.

    Generates data with ``DataGenerator.generate_sym_mix_gauss``, samples
    the HSIC statistic under the alternative distribution, samples the null
    distribution by permutation and by a fitted gamma distribution, and
    draws four subplots: the data, the alternative distribution, the
    permutation null distribution and the gamma null distribution, each
    with its 5% test threshold. Nothing is returned.
    """
    # parameters, change to get different results
    m = 250
    difference = 3

    # setting the angle lower makes a harder test
    angle = pi / 30

    # number of samples taken from null and alternative distribution
    num_null_samples = 500

    # use data generator class to produce example data
    data = DataGenerator.generate_sym_mix_gauss(m, difference, angle)

    # create shogun feature representation
    features_x = RealFeatures(array([data[0]]))
    features_y = RealFeatures(array([data[1]]))

    # compute median data distance in order to use for Gaussian kernel width
    # 0.5*median_distance normally (factor two in Gaussian kernel)
    # However, shoguns kernel width is different to usual parametrization
    # Therefore 0.5*2*median_distance^2
    # Use a subset of data for that, only 200 elements. Median is stable
    subset = int32(array(list(range(features_x.get_num_vectors()))))  # numpy
    subset = random.permutation(subset)  # numpy permutation
    subset = subset[0:200]
    features_x.add_subset(subset)
    dist = EuclideanDistance(features_x, features_x)
    distances = dist.get_distance_matrix()
    features_x.remove_subset()
    median_distance = np.median(distances)
    sigma_x = median_distance**2
    features_y.add_subset(subset)
    dist = EuclideanDistance(features_y, features_y)
    distances = dist.get_distance_matrix()
    features_y.remove_subset()
    median_distance = np.median(distances)
    sigma_y = median_distance**2
    # print() with a single pre-formatted argument gives the same output on
    # Python 2 and Python 3; the print statement is a syntax error on 3.
    print("median distance for Gaussian kernel on x: %s" % sigma_x)
    print("median distance for Gaussian kernel on y: %s" % sigma_y)
    kernel_x = GaussianKernel(10, sigma_x)
    kernel_y = GaussianKernel(10, sigma_y)

    # create hsic instance. Note that this is a convenience constructor which
    # copies feature data. features_x and features_y are not those used in
    # hsic. This is only for user-friendliness. Usually, it is ok to do this.
    # Below, the alternative distribution is sampled, which means
    # that new feature objects have to be created in each iteration (slow)
    # However, normally, the alternative distribution is not sampled
    hsic = HSIC(kernel_x, kernel_y, features_x, features_y)

    # sample alternative distribution
    alt_samples = zeros(num_null_samples)
    for i in range(len(alt_samples)):
        data = DataGenerator.generate_sym_mix_gauss(m, difference, angle)
        features_x.set_feature_matrix(array([data[0]]))
        features_y.set_feature_matrix(array([data[1]]))

        # re-create hsic instance every time since feature objects are copied
        # due to usage of the convenience constructor
        hsic = HSIC(kernel_x, kernel_y, features_x, features_y)
        alt_samples[i] = hsic.compute_statistic()

    # sample from null distribution
    # permutation, biased statistic
    hsic.set_null_approximation_method(PERMUTATION)
    hsic.set_num_null_samples(num_null_samples)
    null_samples_boot = hsic.sample_null()

    # fit gamma distribution, biased statistic
    hsic.set_null_approximation_method(HSIC_GAMMA)
    gamma_params = hsic.fit_null_gamma()
    # sample gamma with parameters
    null_samples_gamma = array([
        gamma(gamma_params[0], gamma_params[1])
        for _ in range(num_null_samples)
    ])

    # plot
    figure()

    # plot data x and y
    subplot(2, 2, 1)
    gca().xaxis.set_major_locator(
        MaxNLocator(nbins=4))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(
        MaxNLocator(nbins=4))  # reduce number of y-ticks
    grid(True)
    plot(data[0], data[1], 'o')
    # Raw string keeps "\pi" a LaTeX command, not an invalid string escape.
    title(r'Data, rotation=$\pi$/' + str(1 / angle * pi) + '\nm=' + str(m))
    xlabel('$x$')
    ylabel('$y$')

    # compute threshold for test level
    alpha = 0.05
    null_samples_boot.sort()
    null_samples_gamma.sort()
    # floor() returns a float; convert to int so it is a valid array index.
    thresh_boot = null_samples_boot[int(
        floor(len(null_samples_boot) * (1 - alpha)))]
    thresh_gamma = null_samples_gamma[int(
        floor(len(null_samples_gamma) * (1 - alpha)))]

    # Type I error: fraction of null samples that exceed the threshold of
    # their own approximation method (i.e. false rejections).
    type_one_error_boot = sum(
        null_samples_boot > thresh_boot) / float(num_null_samples)
    type_one_error_gamma = sum(
        null_samples_gamma > thresh_gamma) / float(num_null_samples)

    # plot alternative distribution with threshold
    # NOTE(review): the hist() calls below pass normed=True, which newer
    # matplotlib releases replaced with density=True -- confirm the target
    # matplotlib version.
    subplot(2, 2, 2)
    gca().xaxis.set_major_locator(
        MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(
        MaxNLocator(nbins=3))  # reduce number of y-ticks
    grid(True)
    hist(alt_samples, 20, normed=True)
    axvline(thresh_boot, 0, 1, linewidth=2, color='red')
    # Type II error: fraction of alternative samples below the threshold.
    type_two_error = sum(alt_samples < thresh_boot) / float(num_null_samples)
    title('Alternative Dist.\n' + 'Type II error is ' + str(type_two_error))

    # compute range for all null distribution histograms
    hist_range = [
        min([min(null_samples_boot),
             min(null_samples_gamma)]),
        max([max(null_samples_boot),
             max(null_samples_gamma)])
    ]

    # plot null distribution with threshold
    subplot(2, 2, 3)
    gca().xaxis.set_major_locator(
        MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(
        MaxNLocator(nbins=3))  # reduce number of y-ticks
    grid(True)
    hist(null_samples_boot, 20, range=hist_range, normed=True)
    axvline(thresh_boot, 0, 1, linewidth=2, color='red')
    title('Sampled Null Dist.\n' + 'Type I error is ' +
          str(type_one_error_boot))

    # plot null distribution gamma
    subplot(2, 2, 4)
    gca().xaxis.set_major_locator(
        MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(
        MaxNLocator(nbins=3))  # reduce number of y-ticks
    grid(True)
    hist(null_samples_gamma, 20, range=hist_range, normed=True)
    axvline(thresh_gamma, 0, 1, linewidth=2, color='red')
    title('Null Dist. Gamma\nType I error is ' + str(type_one_error_gamma))
    grid(True)

    # pull plots a bit apart
    subplots_adjust(hspace=0.5)
    subplots_adjust(wspace=0.5)
示例#29
0
def statistics_hsic(n, difference, angle):
    """Run an HSIC independence test on generated data.

    Data is produced with ``DataGenerator.generate_sym_mix_gauss(n,
    difference, angle)``. Gaussian kernel widths are set to the squared
    median pairwise distance of a random subset (at most 200 vectors) of
    each variable. The test is evaluated with the permutation and the gamma
    null approximation at a test level of 0.05.

    Returns a tuple ``(p_value_boot, thresh_boot, p_value_gamma,
    thresh_gamma, statistic, null_samples)``.
    """
    from modshogun import RealFeatures
    from modshogun import DataGenerator
    from modshogun import GaussianKernel
    from modshogun import HSIC
    from modshogun import PERMUTATION, HSIC_GAMMA
    from modshogun import EuclideanDistance
    from modshogun import Statistics, Math

    def _squared_median_distance(feats, indices):
        """Squared median pairwise distance of ``feats`` limited to ``indices``."""
        feats.add_subset(indices)
        metric = EuclideanDistance(feats, feats)
        pairwise = metric.get_distance_matrix()
        feats.remove_subset()
        return np.median(pairwise)**2

    # Seed both generators for reproducible results (the numpy one might not
    # be reproducible across different OS/Python distributions).
    Math.init_random(1)
    np.random.seed(1)

    # HSIC has to store kernel matrices, which upper-bounds the sample size.
    samples = DataGenerator.generate_sym_mix_gauss(n, difference, angle)

    # One shogun feature object per variable.
    features_x = RealFeatures(np.array([samples[0]]))
    features_y = RealFeatures(np.array([samples[1]]))

    # Kernel width: 0.5 * median distance would be the usual choice, but
    # shogun parametrizes the width differently, hence 0.5 * 2 * median^2.
    # A subset of at most 200 elements is enough, the median is stable.
    shuffled = np.random.permutation(features_x.get_num_vectors())
    chosen = shuffled.astype(np.int32)[0:200]
    sigma_x = _squared_median_distance(features_x, chosen)
    sigma_y = _squared_median_distance(features_y, chosen)

    kernel_x = GaussianKernel(10, sigma_x)
    kernel_y = GaussianKernel(10, sigma_y)

    hsic = HSIC(kernel_x, kernel_y, features_x, features_y)

    # Compute the statistic once; p-values and thresholds are then derived
    # with two different null-distribution approximations.
    statistic = hsic.compute_statistic()
    alpha = 0.05

    # Permutation null. Normally at least 250 iterations should be done,
    # but that takes long. Sampling the null allows the unbiased or the
    # biased statistic.
    hsic.set_null_approximation_method(PERMUTATION)
    hsic.set_num_null_samples(100)
    p_value_boot = hsic.compute_p_value(statistic)
    thresh_boot = hsic.compute_threshold(alpha)

    # Gamma approximation of the null distribution.
    hsic.set_null_approximation_method(HSIC_GAMMA)
    p_value_gamma = hsic.compute_p_value(statistic)
    thresh_gamma = hsic.compute_threshold(alpha)

    # Sample from the null distribution (biased statistic). The mean should
    # be close to zero; the variance strongly depends on data and kernel.
    hsic.set_null_approximation_method(PERMUTATION)
    hsic.set_num_null_samples(100)
    null_samples = hsic.sample_null()

    return (p_value_boot, thresh_boot, p_value_gamma, thresh_gamma,
            statistic, null_samples)
示例#30
0
        xs = [x[i, 0], x[nn[1, i], 0]]
        ys = [x[i, 1], x[nn[1, i], 1]]
        axis.plot(xs, ys, COLS[int(y[i])])


# One figure with a 3x1 grid of axes.
figure, axarr = pyplot.subplots(3, 1)
# Sample data and labels come from a helper defined outside this view.
x, y = sandwich_data()

# The data matrix is transposed before being wrapped as shogun features.
features = RealFeatures(x.T)
labels = MulticlassLabels(y)

print('%d vectors with %d features' %
      (features.get_num_vectors(), features.get_num_features()))
# Sanity check: one label per feature vector.
assert (features.get_num_vectors() == labels.get_num_labels())

# k-nearest-neighbour classifier on the plain Euclidean distance.
distance = EuclideanDistance(features, features)
k = 2
knn = KNN(k, distance, labels)

# First axes: the data together with its neighbourhood graph.
plot_data(x, y, axarr[0])
plot_neighborhood_graph(x, knn.nearest_neighbors(), axarr[0])
axarr[0].set_aspect('equal')
axarr[0].set_xlim(-6, 4)
axarr[0].set_ylim(-3, 2)

# Train LMNN with the same k, allowing up to 10000 iterations.
lmnn = LMNN(features, labels, k)
lmnn.set_maxiter(10000)
lmnn.train()
# Keep the learned linear transform and switch the KNN to the LMNN distance.
L = lmnn.get_linear_transform()
knn.set_distance(lmnn.get_distance())