Example #1
        def RunQDAShogun():
            totalTimer = Timer()

            Log.Info("Loading dataset", self.verbose)
            try:
                # Load train and test dataset.
                trainData = np.genfromtxt(self.dataset[0], delimiter=',')
                trainFeat = modshogun.RealFeatures(trainData[:, :-1].T)

                if len(self.dataset) == 2:
                    testData = np.genfromtxt(self.dataset[1], delimiter=',')
                    testFeat = modshogun.RealFeatures(testData.T)

                # 'options' comes from the enclosing scope of this nested function.
                if len(options) > 0:
                    Log.Fatal("Unknown parameters: " + str(options))
                    raise Exception("unknown parameters")

                # Labels are the last row of the training set.
                labels = modshogun.MulticlassLabels(
                    trainData[:, (trainData.shape[1] - 1)])

                with totalTimer:

                    model = modshogun.QDA(trainFeat, labels)
                    model.train()
                    if len(self.dataset) == 2:
                        model.apply_multiclass(testFeat).get_labels()
            except Exception as e:
                return -1

            return totalTimer.ElapsedTime()
Example #2
def get_CosineDistance(xm, ym):
    # CosineDistance by shogun
    xm = np.array(xm).T
    ym = np.array(ym).T
    fxm = modshogun.RealFeatures(xm)
    fym = modshogun.RealFeatures(ym)
    return modshogun.CosineDistance(fxm, fym).get_distance_matrix()
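A minimal usage sketch (hypothetical data; it assumes numpy and modshogun are importable, as in the snippet): each argument is a list of points, one point per row.

import numpy as np
import modshogun

xm = [[1.0, 0.0], [0.0, 1.0]]               # two 2-D points, one per row
ym = [[1.0, 0.0], [1.0, 1.0], [2.0, 0.0]]   # three 2-D points
D = get_CosineDistance(xm, ym)
print(D)   # matrix with one cosine distance per (xm, ym) pair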
Example #3
    def RunMetrics(self, options):
        Log.Info("Perform QDA.", self.verbose)

        results = self.QDAShogun(options)
        if results < 0:
            return results

        metrics = {'Runtime': results}

        if len(self.dataset) >= 3:
            trainData, labels = SplitTrainData(self.dataset)
            testData = LoadDataset(self.dataset[1])
            truelabels = LoadDataset(self.dataset[2])

            model = modshogun.QDA(modshogun.RealFeatures(trainData.T),
                                  modshogun.MulticlassLabels(labels))
            model.train()
            predictions = model.apply(modshogun.RealFeatures(
                testData.T)).get_labels()

            confusionMatrix = Metrics.ConfusionMatrix(truelabels, predictions)
            metrics['ACC'] = Metrics.AverageAccuracy(confusionMatrix)
            metrics['MCC'] = Metrics.MCCMultiClass(confusionMatrix)
            metrics['Precision'] = Metrics.AvgPrecision(confusionMatrix)
            metrics['Recall'] = Metrics.AvgRecall(confusionMatrix)
            metrics['MSE'] = Metrics.SimpleMeanSquaredError(
                truelabels, predictions)

        return metrics
Example #4
  def RunMetrics(self, options):

    if len(self.dataset) >= 3:
      trainData, labels = SplitTrainData(self.dataset)
      testData = LoadDataset(self.dataset[1])
      truelabels = LoadDataset(self.dataset[2])

      model = modshogun.QDA(modshogun.RealFeatures(trainData.T),
                            modshogun.MulticlassLabels(labels))
      model.train()
      predictions = model.apply(modshogun.RealFeatures(testData.T)).get_labels()

      # Datastructure to store the results.
      metrics = {}

      confusionMatrix = Metrics.ConfusionMatrix(truelabels, predictions)
      metrics['ACC'] = Metrics.AverageAccuracy(confusionMatrix)
      metrics['MCC'] = Metrics.MCCMultiClass(confusionMatrix)
      metrics['Precision'] = Metrics.AvgPrecision(confusionMatrix)
      metrics['Recall'] = Metrics.AvgRecall(confusionMatrix)
      metrics['MSE'] = Metrics.SimpleMeanSquaredError(truelabels, predictions)
      return metrics
    else:
      Log.Fatal("This method requires three datasets!")
Example #5
        def RunQDAShogun(q):
            totalTimer = Timer()

            Log.Info("Loading dataset", self.verbose)
            try:
                # Load train and test dataset.
                trainData = np.genfromtxt(self.dataset[0], delimiter=',')
                trainFeat = modshogun.RealFeatures(trainData[:, :-1].T)

                if len(self.dataset) == 2:
                    testData = np.genfromtxt(self.dataset[1], delimiter=',')
                    testFeat = modshogun.RealFeatures(testData.T)

                # Labels are the last row of the training set.
                labels = modshogun.MulticlassLabels(
                    trainData[:, (trainData.shape[1] - 1)])

                with totalTimer:

                    model = modshogun.QDA(trainFeat, labels)
                    model.train()
                    if len(self.dataset) == 2:
                        model.apply(testFeat).get_labels()
            except Exception as e:
                q.put(-1)
                return -1

            time = totalTimer.ElapsedTime()
            q.put(time)
            return time
Example #6
def _read_toy_data(request):
    y_set = []
    x_set = []
    x_set_induc = []
    points = []
    points_induc = []
    model_sel_error = False
    toy_data = json.loads(request.POST['point_set'])
    for pt in toy_data:
        if int(pt['label']) == 1:
            points.append(pt)
        elif int(pt['label']) == -1:
            points_induc.append(pt)

    for pt in points:
        y_set.append(float(pt["y"]))
        x_set.append(float(pt["x"]))

    for pt in points_induc:
        x_set_induc.append(float(pt["x"]))

    noise_level = float(request.POST['noise_level'])
    scale = float(request.POST['scale'])
    inf = request.POST['inf']
    domain = json.loads(request.POST['axis_domain'])

    labels = np.array(y_set, dtype=np.float64)
    num = len(x_set)
    if num == 0:
        raise Http404
    examples = np.zeros((1, num))
    for i in xrange(num):
        examples[0, i] = x_set[i]
    feat_train = sg.RealFeatures(examples)
    labels = sg.RegressionLabels(labels)

    #Get inducing points
    num_induc = len(x_set_induc)

    if num_induc != 0:
        examples_induc = np.zeros((1, num_induc))
        for i in xrange(num_induc):
            examples_induc[0, i] = x_set_induc[i]
        feat_train_induc = sg.RealFeatures(examples_induc)
    else:
        feat_train_induc = None

    kernel = get_kernel(request, feat_train)
    try:
        learn = request.POST["learn"]
    except KeyError:
        raise ValueError("Argument Error")

    if int(feat_train.get_num_vectors()) > 100 and learn == "ML2":
        model_sel_error = True

    return (feat_train, labels, noise_level, scale, kernel, domain, learn,
            feat_train_induc, inf), model_sel_error
Example #7
 def fit(self, x):
     x = np.array(x).T
     features_train = modshogun.RealFeatures(x)
     distance = self.distance(features_train, features_train)
     self.kmeans = modshogun.KMeans(self.k, distance)
     self.kmeans.train()
     self.cluster_centers_ = self.kmeans.get_cluster_centers().T
     kcc = modshogun.RealFeatures(self.cluster_centers_.T)
     discc = self.distance(kcc, features_train).get_distance_matrix()
     self.labels_ = np.copy(discc.argsort(axis=0)[0, :]).T
     return self
Example #8
def linear_mmd_test(X, Y, null_samples=1000):
    mmd = sg.QuadraticTimeMMD()
    mmd.set_p(sg.RealFeatures(X.T.astype(np.float64)))
    mmd.set_q(sg.RealFeatures(Y.T.astype(np.float64)))
    mmd.set_kernel(sg.LinearKernel())

    mmd.set_num_null_samples(null_samples)
    samps = mmd.sample_null()
    stat = mmd.compute_statistic()

    p_val = np.mean(stat <= samps)
    return p_val, stat, samps
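A hedged usage sketch, in the spirit of Example #25 below: draw two row-instance samples and test whether they share a distribution (assumes sg is modshogun, as elsewhere in these examples).

import numpy as np
import modshogun as sg

X = np.random.randn(200, 3)          # 200 samples from P, one row each
Y = np.random.randn(200, 3) + 0.5    # 200 samples from Q, mean-shifted
p_val, stat, samps = linear_mmd_test(X, Y, null_samples=500)
# a small p_val is evidence against the null hypothesis P == Q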
Example #9
def regress_dump(request):
    try:
        data_set = request.POST['data_set']
        feature = request.POST['feature']

        temp_feats = sg.RealFeatures(
            sg.CSVFile(REGRESS_DATA_DIR + REGRESS_DATA_SET[data_set]))
        labels = sg.RegressionLabels(
            sg.CSVFile(REGRESS_DATA_DIR + REGRESS_LABELS[data_set]))
        lab = labels.get_labels()

        #rescale to 0...1
        preproc = sg.RescaleFeatures()
        preproc.init(temp_feats)
        temp_feats.add_preprocessor(preproc)
        temp_feats.apply_preprocessor(True)
        mat = temp_feats.get_feature_matrix()

        if feature == 'CRIM':
            feat = mat[0]
        elif feature == 'DIS':
            feat = mat[7]
        elif feature == 'INDUS':
            feat = mat[2]
        elif feature == 'LSTAT':
            feat = mat[12]
        else:
            raise Http404
    except Exception:
        raise Http404

    toy_data = []
    for i in xrange(len(feat)):
        toy_data.append({'x': feat[i], 'y': lab[i], 'label': float(0)})
    return HttpResponse(json.dumps(toy_data))
Example #10
def get_binary_features(request):
    try:
        point_set_raw = json.loads(request.POST['point_set'])
    except (KeyError, ValueError):
        raise ValueError("cannot read click pts")
    class_a_point_set = []
    class_b_point_set = []
    for point in point_set_raw:
        if point['label'] == 1:
            class_a_point_set.append([point['x'], point['y']])
        else:
            class_b_point_set.append([point['x'], point['y']])
    class_a = np.transpose(np.array(class_a_point_set, dtype=float))
    class_b = np.transpose(np.array(class_b_point_set, dtype=float))

    if class_a.size == 0 or class_b.size == 0:
        raise ValueError("labels not enough")
    else:
        features = np.concatenate((class_a, class_b), axis=1)
        labels = np.concatenate(
            (np.ones(class_a.shape[1]), -np.ones(class_b.shape[1])))

    features = sg.RealFeatures(features)
    labels = sg.BinaryLabels(labels)

    return features, labels
Example #11
def get_multi_features(request):
    try:
        point_set_raw = json.loads(request.POST['point_set'])
    except (KeyError, ValueError):
        raise ValueError("cannot read click pts")

    x = []
    y = []
    labels = []
    for pt in point_set_raw:
        x.append(float(pt['x']))
        y.append(float(pt['y']))
        labels.append(float(pt['label']))

    n = len(set(labels))

    if not n:
        raise ValueError("0-labels")
    elif n == 1:
        raise ValueError("1-class-labels")
    else:
        features = np.array([x, y])

    features = sg.RealFeatures(features)
    labels = sg.MulticlassLabels(np.array(labels))

    return features, labels
Example #12
    def __init__(self,
                 X,
                 y,
                 n_importance,
                 prior_log_pdf,
                 ridge=0.,
                 num_shogun_threads=1):
        self.n_importance = n_importance
        self.prior_log_pdf = prior_log_pdf
        self.ridge = ridge
        self.X = X
        self.y = y

        self.num_shogun_threads = num_shogun_threads

        # limit the number of threads shogun uses
        logger.debug("Using Shogun with %d threads" % self.num_shogun_threads)
        sg.ZeroMean().parallel.set_num_threads(self.num_shogun_threads)

        # shogun representation of data
        self.sg_labels = sg.BinaryLabels(self.y)
        self.sg_feats_train = sg.RealFeatures(self.X.T)

        # ARD: set theta, which is in log-scale, as kernel weights
        self.sg_kernel = sg.GaussianARDKernel(10, 1)

        self.sg_mean = sg.ZeroMean()
        self.sg_likelihood = sg.LogitLikelihood()
Example #13
def classify_perceptron(classifier, features, labels, learn=1, bias=0):
    perceptron = classifier(features, labels)
    perceptron.set_learn_rate(learn)
    perceptron.set_max_iter(100)
    perceptron.set_bias(bias)
    perceptron.train()
    
    size = 100
    x1 = np.linspace(0, 1, size)
    y1 = np.linspace(0, 1, size)
    x, y = np.meshgrid(x1, y1)
    
    test = sg.RealFeatures(np.array((np.ravel(x), np.ravel(y))))
    
    outl = perceptron.apply(test).get_labels()
    outv = perceptron.apply(test).get_values()
    
    # Normalize output
    outv /= np.max(outv)
    
    z_value = outv.reshape((size, size))
    z_value = np.transpose(z_value)
    
    z_label = outl.reshape((size, size))
    z_label = np.transpose(z_label)
    z_label = z_label + np.random.rand(*z_label.shape) * 0.01
    
    return z_value, z_label
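A sketch of driving this helper (assumptions: sg.Perceptron as the classifier, and training points already inside the unit square that the evaluation grid covers):

import numpy as np
import modshogun as sg

pts = np.random.rand(2, 40)               # 40 points in [0, 1]^2
labs = np.where(pts[0] > 0.5, 1.0, -1.0)  # separable on the x-axis
z_value, z_label = classify_perceptron(sg.Perceptron,
                                       sg.RealFeatures(pts),
                                       sg.BinaryLabels(labs))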
Example #14
def regression(request):
    try:
        domain = json.loads(request.POST['axis_domain'])
        X = np.linspace(domain['horizontal'][0], domain['horizontal'][1], 100)
        x = np.array([X])
        feat = sg.RealFeatures(x)

        arguments = _read_data(request)

        tool = request.POST['regression']
        if (tool == 'LeastSquaresRegression'):
            ls = _train_ls(*arguments)
            y = _apply_ls(feat, ls)

        elif (tool == 'LinearRidgeRegression'):
            lrr = _train_lrr(*arguments)
            y = _apply_lrr(feat, lrr)

        elif (tool == 'KernelRidgeRegression'):
            krr, kernel, train = _train_krr(*arguments)
            y = _apply_krr(kernel, train, feat, krr)

        line_dot = []
        for i in xrange(len(X)):
            line_dot.append({'x': X[i], 'y': y[i]})
        return HttpResponse(json.dumps(line_dot))
    except Exception:
        raise Http404
Example #15
def _train_clustering(point_set, distance_name, k):
    labels = np.array([0]*len(point_set))
    features = np.zeros((2, len(point_set)))

    for i in xrange(len(point_set)):
        features[0, i] = point_set[i]['x']
        features[1, i] = point_set[i]['y']
        labels[i] = point_set[i]['label']

    lab = sg.BinaryLabels(labels)  # constructed but unused; KMeans only needs the distance
    train = sg.RealFeatures(features)

    if distance_name == "EuclideanDistance":
        distance = sg.EuclideanDistance(train, train)
    elif distance_name == "ManhattanMetric":
        distance = sg.ManhattanMetric(train, train)
    elif distance_name == "JensenMetric":
        distance = sg.JensenMetric(train, train)
    else:
        raise TypeError

    kmeans = sg.KMeans(k, distance)
    kmeans.train()

    return kmeans
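A quick sketch with hand-made points (hypothetical data; the 'label' field is carried along but does not influence the clustering):

point_set = [{'x': 0.0, 'y': 0.1, 'label': -1},
             {'x': 0.2, 'y': 0.0, 'label': -1},
             {'x': 1.0, 'y': 0.9, 'label': 1},
             {'x': 0.9, 'y': 1.1, 'label': 1}]
kmeans = _train_clustering(point_set, "EuclideanDistance", 2)
centers = kmeans.get_cluster_centers()   # one centroid per column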
Example #16
def shogun_mmd(X, Y, kernel_width, null_samples=1000, median_samples=1000,
               cache_size=32):
    '''
    Run an MMD test using a Gaussian kernel.

    Parameters
    ----------
    X : row-instance feature array

    Y : row-instance feature array

    kernel_width : float
        The bandwidth of the RBF kernel (sigma).

    null_samples : int
        How many times to sample from the null distribution.

    median_samples : int
        Unused here; this variant takes an explicit kernel_width instead of
        estimating one via the median heuristic.

    cache_size : int
        Kernel cache size, in MB.

    Returns
    -------
    p_val : float
        The obtained p value of the test.

    stat : float
        The test statistic.

    null_samples : array of length null_samples
        The samples from the null distribution.
    '''
    import modshogun as sg
    mmd = sg.QuadraticTimeMMD()
    mmd.set_p(sg.RealFeatures(X.T.astype(np.float64)))
    mmd.set_q(sg.RealFeatures(Y.T.astype(np.float64)))
    mmd.set_kernel(sg.GaussianKernel(cache_size, float(kernel_width)))

    mmd.set_num_null_samples(null_samples)
    samps = mmd.sample_null()
    stat = mmd.compute_statistic()

    p_val = np.mean(stat <= samps)
    return p_val, stat, samps
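A short usage sketch (hypothetical data), analogous to Example #27 but with an explicit bandwidth rather than the median heuristic:

import numpy as np

X = np.random.randn(150, 4)
Y = 1.3 * np.random.randn(150, 4)   # same mean, larger spread
p_val, stat, samps = shogun_mmd(X, Y, kernel_width=1.0, null_samples=500)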
Example #17
def support_vector_regression(request):
    try:
        arguments = _read_data(request)
        svm = _train_svr(*arguments)
        domain = json.loads(request.POST['axis_domain'])
        x = np.linspace(domain['horizontal'][0], domain['horizontal'][1], 100)
        y = np.array(svm.apply(sg.RealFeatures(np.array([x]))).get_labels(),
                     dtype=np.float64)
        line_dot = []
        for i in xrange(len(x)):
            line_dot.append({'x': x[i], 'y': y[i]})
        return HttpResponse(json.dumps(line_dot))
    except Exception:
        raise Http404
Example #18
def classify_gp(features,
                labels,
                kernel,
                domain,
                lik,
                learn,
                scale,
                returnValues=True):
    mean = sg.ZeroMean()
    inf = sg.EPInferenceMethod(kernel, features, mean, labels, lik)
    inf.set_scale(scale)
    gp = sg.GaussianProcessBinaryClassification(inf)
    best_width = 0.0
    best_param = 0
    best_degree = 0
    best_scale = 0.0

    if learn == 'ML2':
        inf.set_scale(1)
        if kernel.get_name() == 'GaussianKernel':
            kernel.set_width(1)
        grad = sg.GradientEvaluation(gp, features, labels,
                                     sg.GradientCriterion(), False)
        grad.set_function(inf)
        grad_search = sg.GradientModelSelection(grad)
        best_combination = grad_search.select_model()
        best_combination.apply_to_machine(gp)
        try:
            best_width = sg.GaussianKernel.obtain_from_generic(
                inf.get_kernel()).get_width()
        except Exception:
            pass
        best_scale = inf.get_scale()
    gp.train()

    size = 50
    x1 = np.linspace(domain['horizontal'][0], domain['horizontal'][1], size)
    y1 = np.linspace(domain['vertical'][0], domain['vertical'][1], size)
    x, y = np.meshgrid(x1, y1)

    test = sg.RealFeatures(np.array((np.ravel(x), np.ravel(y))))

    if returnValues:
        out = gp.apply(test).get_values()
    else:
        out = gp.apply(test).get_labels()
    z = out.reshape((size, size))
    z = np.transpose(z)
    return x, y, z, best_width, best_param, best_scale
Example #19
def _predictive_process(feat_train, labels, noise_level, scale, kernel, domain,
                        learn, feat_induc, inf_select):
    variances, means, best_width, best_scale, best_sigma = _process(
        feat_train, labels, noise_level, scale, kernel, domain, learn,
        feat_induc, inf_select, True)
    size = 75
    x_test = np.array(
        [np.linspace(domain['horizontal'][0], domain['horizontal'][1], size)])
    feat_test = sg.RealFeatures(x_test)
    y1 = np.linspace(domain['vertical'][0], domain['vertical'][1], 50)
    D = np.zeros((len(y1), size))

    # evaluate normal distribution at every prediction point (column)
    for j in range(np.shape(D)[1]):
        # create a Gaussian distribution instance; it expects a mean vector and a covariance matrix, hence the reshapes
        gauss = sg.GaussianDistribution(
            np.array(means[j]).reshape(1, ),
            np.array(variances[j]).reshape(1, 1))

        # evaluate predictive distribution for test point, method expects matrix
        D[:, j] = np.exp(gauss.log_pdf_multiple(y1.reshape(1, len(y1))))

    z = np.transpose(D)
    z_max = np.nanmax(z)
    z_min = np.nanmin(z)
    z_delta = 0.1 * (np.nanmax(z) - np.nanmin(z))

    result = []
    for i in xrange(len(feat_test.get_feature_matrix()[0])):
        result.append({
            'x': feat_test.get_feature_matrix()[0][i],
            'y': means[i],
            'range_upper': means[i] + 2 * np.sqrt(variances[i]),
            'range_lower': means[i] - 2 * np.sqrt(variances[i]),
            'best_width': float(best_width),
            'best_scale': float(best_scale),
            'best_sigma': float(best_sigma),
            "status": "ok",
            "domain": [z_min - z_delta, z_max + z_delta],
            "max": z_max + z_delta,
            "min": z_min - z_delta,
            "z": z.tolist()
        })

    return result
Example #20
def _process(x1_set, x2_set, kernel_width, kernel_name, degree):
    num = len(x1_set)
    if num == 0:
        raise Http404
    examples = np.zeros((2, num))
    for i in xrange(num):
        examples[0, i] = x1_set[i]
        examples[1, i] = x2_set[i]
    feat_train = sg.RealFeatures(examples)

    # construct covariance function
    if kernel_name == "LinearKernel":
        kernel = sg.LinearKernel(feat_train, feat_train)
    elif kernel_name == "PolynomialKernel":
        kernel = sg.PolyKernel(feat_train, feat_train, degree, True)
    elif kernel_name == "GaussianKernel":
        kernel = sg.GaussianKernel(feat_train, feat_train, kernel_width)
    kernel_matrix=kernel.get_kernel_matrix()
    return kernel_matrix.tolist()
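A brief sketch (hypothetical coordinates) of pulling out a Gaussian Gram matrix with this helper; the degree argument only matters for the polynomial kernel:

x1 = [0.0, 0.5, 1.0, 1.5, 2.0]
x2 = [1.0, 0.8, 0.6, 0.4, 0.2]
K = _process(x1, x2, 2.0, "GaussianKernel", 0)
# K is a 5x5 list of lists: k((x1[i], x2[i]), (x1[j], x2[j])) for all pairs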
Example #21
def _read_data(request):
    labels = []
    features = []
    data = json.loads(request.POST['point_set'])
    cost = float(request.POST['C'])
    tubeeps = float(request.POST['tube'])
    kernel_name = request.POST['kernel']
    for pt in data:
        labels.append(float(pt["y"]))
        features.append(float(pt["x"]))
    labels = np.array(labels, dtype=np.float64)
    num = len(features)
    if num == 0:
        raise TypeError
    examples = np.zeros((1, num))

    for i in xrange(num):
        examples[0, i] = features[i]

    lab = sg.RegressionLabels(labels)
    train = sg.RealFeatures(examples)
    kernel = get_kernel(request, train)
    return (cost, tubeeps, lab, kernel)
Example #22
def _read_data(request):
    labels = []
    features = []
    data = json.loads(request.POST['point_set'])
    tau = float(request.POST['Tau'])
    for pt in data:
        labels.append(float(pt["y"]))
        features.append(float(pt["x"]))
    labels = np.array(labels, dtype=np.float64)
    num = len(features)
    if num == 0:
        raise TypeError
    examples = np.zeros((1, num))

    for i in xrange(num):
        examples[0, i] = features[i]

    lab = sg.RegressionLabels(labels)
    train = sg.RealFeatures(examples)

    sigma = float(request.POST["sigma"])
    kernel = sg.GaussianKernel(train, train, sigma)

    return (tau, lab, kernel, train)
Example #23
import modshogun as sg
import data
import numpy as np

# load data
feature_matrix = data.swissroll()
# create features instance
features = sg.RealFeatures(feature_matrix)

# create Isomap converter instance
converter = sg.Isomap()

# set target dimensionality
converter.set_target_dim(2)

# compute embedding with Isomap method
embedding = converter.embed(features)

# enable landmark approximation
converter.set_landmark(True)
# set number of landmarks
converter.set_landmark_number(100)
# set number of threads
converter.parallel.set_num_threads(2)
# compute approximate embedding
approx_embedding = converter.embed(features)
# disable landmark approximation
converter.set_landmark(False)

# compute cosine distance matrix 'manually'
N = features.get_num_vectors()
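The snippet is truncated here. As a hedged aside, the built-in CosineDistance used in Example #2 produces the same N x N matrix without a manual loop:

distance = sg.CosineDistance(features, features)
cosine_matrix = distance.get_distance_matrix()   # N x N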
Example #24
def _process(feat_train,
             labels,
             noise_level,
             scale,
             kernel,
             domain,
             learn,
             feat_induc,
             inf_select,
             return_values=False):
    n_dimensions = 1

    likelihood = sg.GaussianLikelihood()
    if learn == 'ML2':
        likelihood.set_sigma(1)
    else:
        likelihood.set_sigma(noise_level)
    covar_parms = np.log([2])
    hyperparams = {'covar': covar_parms, 'lik': np.log([1])}

    # construct covariance function
    SECF = kernel
    covar = SECF
    zmean = sg.ZeroMean()
    if str(inf_select) == 'ExactInferenceMethod':
        inf = sg.ExactInferenceMethod(SECF, feat_train, zmean, labels,
                                      likelihood)
        if learn == 'ML2':
            inf.set_scale(1)
        else:
            inf.set_scale(scale)
    elif str(inf_select) == 'FITCInferenceMethod':
        if feat_induc is not None:
            inf = sg.FITCInferenceMethod(SECF, feat_train, zmean, labels,
                                         likelihood, feat_induc)
            if learn == 'ML2':
                inf.set_scale(1)
            else:
                inf.set_scale(scale)
        else:
            raise ValueError("Argument Error")

    # locations of uniformly spaced predictions
    size = 75
    x_test = np.array(
        [np.linspace(domain['horizontal'][0], domain['horizontal'][1], size)])
    feat_test = sg.RealFeatures(x_test)

    gp = sg.GaussianProcessRegression(inf)

    best_width = 0.0
    best_scale = 0.0
    best_sigma = 0.0

    if learn == 'ML2':
        grad = sg.GradientEvaluation(gp, feat_train, labels,
                                     sg.GradientCriterion(), False)
        grad.set_function(inf)
        grad_search = sg.GradientModelSelection(grad)
        best_combination = grad_search.select_model()
        best_combination.apply_to_machine(gp)
        best_scale = inf.get_scale()
        best_sigma = sg.GaussianLikelihood.obtain_from_generic(
            inf.get_model()).get_sigma()
        if kernel.get_name() == 'GaussianKernel':
            best_width = sg.GaussianKernel.obtain_from_generic(
                inf.get_kernel()).get_width()

    gp.train()

    #    gp.set_return_type(sg.GaussianProcessRegression.GP_RETURN_COV)
    covariance = gp.get_variance_vector(feat_test)
    #    gp.set_return_type(sg.GaussianProcessRegression.GP_RETURN_MEANS)
    predictions = gp.get_mean_vector(feat_test)

    result = []
    for i in xrange(len(feat_test.get_feature_matrix()[0])):
        result.append({
            'x': feat_test.get_feature_matrix()[0][i],
            'y': predictions[i],
            'range_upper': predictions[i] + 2 * np.sqrt(covariance[i]),
            'range_lower': predictions[i] - 2 * np.sqrt(covariance[i]),
            'best_width': float(best_width),
            'best_scale': float(best_scale),
            'best_sigma': float(best_sigma)
        })

    if not return_values:
        return result
    return covariance, predictions, best_width, best_scale, best_sigma
Example #25
import numpy as np
import modshogun as sg

X = np.random.randn(100, 3)
Y = np.random.randn(100, 3) + .5

mmd = sg.QuadraticTimeMMD()
mmd.set_p(sg.RealFeatures(X.T))
mmd.set_q(sg.RealFeatures(Y.T))
mmd.set_kernel(sg.GaussianKernel(32, 1))
mmd.set_num_null_samples(200)
samps = mmd.sample_null()
stat = mmd.compute_statistic()
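As in Examples #8 and #27, a p-value follows as the fraction of null samples at or above the observed statistic:

p_val = np.mean(stat <= samps)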
Example #26
def classify_svm(classifier,
                 features,
                 labels,
                 kernel,
                 domain,
                 learn,
                 value,
                 C=1,
                 returnValues=True):
    if learn == 'GridSearch':
        svm = classifier()
        root = sg.ModelSelectionParameters()
        c1 = sg.ModelSelectionParameters("C1")
        root.append_child(c1)
        c1.build_values(1.0, 10.0, sg.R_LINEAR, 2)

        c2 = sg.ModelSelectionParameters("C2")
        root.append_child(c2)
        c2.build_values(1.0, 10.0, sg.R_LINEAR, 2)

        if kernel.get_name() == 'GaussianKernel':
            param_kernel = sg.ModelSelectionParameters("kernel", kernel)
            width = sg.ModelSelectionParameters("width")
            width.build_values(0.0, 10.0, sg.R_LINEAR, 0.5)
            param_kernel.append_child(width)
            root.append_child(param_kernel)

        elif kernel.get_name() == 'PolyKernel':
            param_kernel = sg.ModelSelectionParameters("kernel", kernel)
            degree = sg.ModelSelectionParameters("degree")
            if value:
                degree.build_values(value[0], value[1], sg.R_LINEAR)
            else:
                degree.build_values(0, 5, sg.R_LINEAR)
            param_kernel.append_child(degree)
            root.append_child(param_kernel)

        elif kernel.get_name() == 'LinearKernel':
            param_kernel = sg.ModelSelectionParameters("kernel", kernel)
            root.append_child(param_kernel)

        pos = 0
        neg = 0
        for i in range(0, labels.get_num_labels()):
            if labels.get_label(i) == 1:
                pos += 1
            else:
                neg += 1
        if pos < 2 or neg < 2:

            class LabelsError(Exception):
                pass

            raise LabelsError('Need at least two labels from one class')
        elif pos < 3 or neg < 3:
            splitting_strategy = sg.StratifiedCrossValidationSplitting(
                labels, 2)
        else:
            splitting_strategy = sg.StratifiedCrossValidationSplitting(
                labels, 3)
        evaluation_criterium = sg.ContingencyTableEvaluation(sg.ACCURACY)
        cross = sg.CrossValidation(svm, features, labels, splitting_strategy,
                                   evaluation_criterium)
        cross.set_num_runs(2)
        grid_search = sg.GridSearchModelSelection(cross, root)
        best_combination = grid_search.select_model()
        best_combination.apply_to_machine(svm)

    else:
        svm = classifier(C, kernel, labels)

    svm.train(features)

    size = 100
    x1 = np.linspace(domain['horizontal'][0], domain['horizontal'][1], size)
    y1 = np.linspace(domain['vertical'][0], domain['vertical'][1], size)
    x, y = np.meshgrid(x1, y1)

    test = sg.RealFeatures(np.array((np.ravel(x), np.ravel(y))))
    kernel.init(features, test)

    if returnValues:
        out = svm.apply(test).get_values()
    else:
        out = svm.apply(test).get_labels()
    z = out.reshape((size, size))
    z = np.transpose(z)
    return x, y, z
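A sketch of the non-grid-search path (hypothetical data; sg.LibSVM is one classifier matching the classifier(C, kernel, labels) call in the else branch):

import numpy as np
import modshogun as sg

feats = sg.RealFeatures(np.random.rand(2, 30))   # 30 points in 2-D
labs = sg.BinaryLabels(np.where(np.random.rand(30) > 0.5, 1.0, -1.0))
kern = sg.GaussianKernel(10, 2.0)
domain = {'horizontal': [0, 1], 'vertical': [0, 1]}
x, y, z = classify_svm(sg.LibSVM, feats, labs, kern, domain,
                       learn='Fixed', value=None, C=1)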
Example #27
def rbf_mmd_test(X,
                 Y,
                 bandwidth='median',
                 null_samples=1000,
                 median_samples=1000,
                 cache_size=32):
    '''
    Run an MMD test using a Gaussian kernel.

    Parameters
    ----------
    X : row-instance feature array

    Y : row-instance feature array

    bandwidth : float or 'median'
        The bandwidth of the RBF kernel (sigma).
        If 'median', estimates the median pairwise distance in the
        aggregate sample and uses that.

    null_samples : int
        How many times to sample from the null distribution.

    median_samples : int
        How many points to use for estimating the bandwidth.

    Returns
    -------
    p_val : float
        The obtained p value of the test.

    stat : float
        The test statistic.

    null_samples : array of length null_samples
        The samples from the null distribution.

    bandwidth : float
        The used kernel bandwidth
    '''

    if bandwidth == 'median':
        from sklearn.metrics.pairwise import euclidean_distances
        sub = lambda feats, n: feats[np.random.choice(
            feats.shape[0], min(feats.shape[0], n), replace=False)]
        Z = np.r_[sub(X, median_samples // 2), sub(Y, median_samples // 2)]
        D2 = euclidean_distances(Z, squared=True)
        upper = D2[np.triu_indices_from(D2, k=1)]
        kernel_width = np.median(upper, overwrite_input=True)
        bandwidth = np.sqrt(kernel_width / 2)
        # sigma = median / sqrt(2); works better, sometimes at least
        del Z, D2, upper
    else:
        kernel_width = 2 * bandwidth**2

    mmd = sg.QuadraticTimeMMD()
    mmd.set_p(sg.RealFeatures(X.T.astype(np.float64)))
    mmd.set_q(sg.RealFeatures(Y.T.astype(np.float64)))
    mmd.set_kernel(sg.GaussianKernel(cache_size, kernel_width))

    mmd.set_num_null_samples(null_samples)
    samps = mmd.sample_null()
    stat = mmd.compute_statistic()

    p_val = np.mean(stat <= samps)
    return p_val, stat, samps, bandwidth
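A usage sketch (hypothetical data) relying on the median heuristic documented above; it assumes sg is modshogun and that scikit-learn is installed for the distance computation:

import numpy as np

X = np.random.randn(300, 5)
Y = np.random.randn(300, 5) + 0.25
p_val, stat, samps, bw = rbf_mmd_test(X, Y, bandwidth='median')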
Example #28
 def fit_predict(self, x):
     # transpose to one point per column, mirroring fit()
     x = np.array(x).T
     features_train = modshogun.RealFeatures(x)
     kcc = modshogun.RealFeatures(self.cluster_centers_.T)
     discc = self.distance(kcc, features_train).get_distance_matrix()
     return np.copy(discc.argsort(axis=0)[0, :]).T
Example #29
def get_estimates(gen, sigmas=None, n_reps=100, n_null_samps=1000,
                  cache_size=64, rep_states=False, name=None,
                  save_samps=False, thresh_levels=(.2, .1, .05, .01)):
    if sigmas is None:
        sigmas = np.logspace(-1.7, 1.7, num=30)
    sigmas = np.asarray(sigmas)

    mmd = sg.QuadraticTimeMMD()
    mmd.set_num_null_samples(n_null_samps)
    mmd_mk = mmd.multikernel()
    for s in sigmas:
        mmd_mk.add_kernel(sg.GaussianKernel(cache_size, 2 * s**2))

    info = OrderedDict()
    for k in 'sigma rep mmd_est var_est p'.split():
        info[k] = []
    thresh_names = []
    for l in thresh_levels:
        s = 'thresh_{}'.format(l)
        thresh_names.append(s)
        info[s] = []
    if save_samps:
        info['samps'] = []

    thresh_prob = 1 - np.asarray(thresh_levels)

    bar = pb.ProgressBar()
    if name is not None:
        bar.start()
        bar.widgets.insert(0, '{} '.format(name))
    for rep in bar(xrange(n_reps)):
        if rep_states:
            rep = np.random.randint(0, 2**32)
            X, Y = gen(rs=rep)
        else:
            X, Y = gen()
        n = X.shape[0]
        assert Y.shape[0] == n
        mmd.set_p(sg.RealFeatures(X.T))
        mmd.set_q(sg.RealFeatures(Y.T))

        info['sigma'].extend(sigmas)
        info['rep'].extend([rep] * len(sigmas))

        stat = mmd_mk.compute_statistic()
        info['mmd_est'].extend(stat / (n / 2))

        samps = mmd_mk.sample_null()
        info['p'].extend(np.mean(samps >= stat, axis=0))
        if save_samps:
            info['samps'].extend(samps.T)

        info['var_est'].extend(mmd_mk.compute_variance_h1())

        threshes = np.asarray(mquantiles(samps, prob=thresh_prob, axis=0))
        for s, t in zip(thresh_names, threshes):
            info[s].extend(t)

    info = pd.DataFrame(info)
    info.set_index(['sigma', 'rep'], inplace=True)
    return info