예제 #1
0
def _csv(outfile, binary, save=True, data=None):
    """Save instances to, or load instances from, a dense CSV file.

    Every row of the file has the layout ``[label, *features]``.

    Args:
        outfile (str): Path of the CSV file to write or read.
        binary (bool): Forwarded to ``csr_mat_to_instances`` when loading;
            not used when saving.
        save (bool, optional): If True, write ``data`` to ``outfile``;
            if False, read ``outfile``.
        data (optional): Instances handed to ``sparsify`` when saving.

    Returns:
        None when saving; otherwise whatever ``csr_mat_to_instances``
        returns for the loaded features and labels.

    """
    if save:
        label, sparse_data = sparsify(data)
        rows = np.concatenate(
            (np.array(label)[:, np.newaxis], sparse_data.toarray()),
            axis=1)
        # The csv module expects files opened with newline='' so that the
        # writer controls line endings itself (otherwise blank rows can
        # appear on platforms that translate '\n').
        with open(outfile, 'w+', newline='') as fileobj:
            csv.writer(fileobj).writerows(rows.tolist())
        return None

    # TODO: throw exception if FileNotFoundError
    # genfromtxt collapses a single-row file into a 1-D array; promote it
    # back to 2-D so the column slicing below works for one instance too.
    # NOTE: a label-only file (one column, several rows) is also 1-D after
    # genfromtxt and would be read as a single row; that layout is not
    # supported here.
    table = np.atleast_2d(np.genfromtxt(outfile, delimiter=','))
    labels = table[:, 0]
    features = csr_matrix(table[:, 1:])
    return csr_mat_to_instances(features, labels, binary=bool(binary))
예제 #2
0
def _pickle(outfile, binary, save=True, data=None):
    """Serialize a dataset to a pickle file, or load one back.

    Args:
        outfile (str): Path of the pickle file.
        binary (bool): Forwarded (as a strict bool) to
            ``csr_mat_to_instances`` when loading; ignored when saving.
        save (boolean, optional): If True, serialize ``data``; if False,
            load from ``outfile``.
        data (optional): Instances handed to ``sparsify`` when saving.

    Returns:
        None when saving; otherwise whatever ``csr_mat_to_instances``
        returns for the stored features and labels.

    """
    if not save:
        # TODO: throw exception if FileNotFoundError
        # NOTE(review): pickle.load can execute arbitrary code embedded in
        # the file -- only load files from a trusted source.
        with open(outfile, 'rb') as fileobj:
            payload = pickle.load(fileobj)
            return csr_mat_to_instances(payload['features'],
                                        payload['labels'],
                                        binary=bool(binary))

    label, sparse_data = sparsify(data)
    payload = {'labels': label, 'features': sparse_data}
    with open(outfile, 'wb+') as fileobj:
        pickle.dump(payload, fileobj, pickle.HIGHEST_PROTOCOL)
예제 #3
0
    def train(self):
        """Fit a linear model by minimizing a hinge-style loss plus an
        infinity-norm penalty on the weights.

        Reads ``self.training_instances`` (either a list of instances or an
        object exposing ``numpy()`` that yields ``(X, y)``),
        ``self.num_features`` and ``self.coef``. The solution is stored in
        ``self.weight_vector`` and ``self.bias``.

        :return: None
        """
        training_data = self.training_instances
        if isinstance(training_data, List):
            label_list, feature_matrix = sparsify(training_data)
            count = len(label_list)
            y = np.array(label_list).reshape((count, 1))
            X = feature_matrix.toarray().reshape((count, self.num_features))
        else:
            X, y = training_data.numpy()
            # labels become a column vector, features are used as-is
            y = y.reshape((len(y), 1))

        # one weight per feature column
        weights = Variable(len(X[0]))
        bias = Variable()
        margins = mul_elemwise(y, X * weights + bias)
        hinge_loss = sum_entries(pos(1 - margins))
        penalty = norm(weights, 'inf')

        # define and solve the optimization problem
        objective = Minimize(hinge_loss + self.coef * penalty)
        problem = Problem(objective)
        problem.solve()

        self.weight_vector = weights.value
        self.bias = bias.value
예제 #4
0
    def attack(self, Instances) -> List[Instance]:
        """Transform every instance whose label is not negative.

        Instances with a negative label are returned unchanged. Every other
        instance is replaced by ``self.gradient_descent(instance, X)``,
        where ``X`` is the dense feature matrix of the negative-label
        instances.

        Args:
            Instances: Sequence of instances to attack. If
                ``self.num_features`` is 0 it is initialised from the
                first instance's feature count.

        Returns:
            A list with one entry per input instance, in input order.
            An empty input yields an empty list.

        """
        if len(Instances) == 0:
            # Nothing to transform; also avoids indexing Instances[0] below.
            return []
        if self.num_features == 0:
            self.num_features = Instances[0].get_feature_count()

        benign_instances = [ins for ins in Instances if ins.label < 0]

        # make negative instances into numpy array for calculate KDE distances
        labels, sparse_features = sparsify(benign_instances)
        X = sparse_features.toarray().reshape(
            (len(labels), self.num_features))

        return [
            instance if instance.label < 0 else self.gradient_descent(
                instance, X)
            for instance in Instances
        ]
예제 #5
0
    def train(self):
        """Solve the asymmetric dual problem and store the optimal w and b.

        Reads ``self.training_instances``, ``self.negative_classification``,
        ``self.positive_classification`` and ``self.c_delta``. The solution
        is written to ``self.weight_vector`` and ``self.bias``; nothing is
        returned.

        Raises:
            ValueError: if ``self.training_instances`` is falsy.
        """
        if not self.training_instances:
            raise ValueError('Must set training instances before training')
        slack_penalty = 10

        if isinstance(self.training_instances, List):
            labels, sparse_features = sparsify(self.training_instances)
            y, X = np.array(labels), sparse_features.toarray()
        else:
            X, y = self.training_instances.numpy()

        # split the feature rows by class label
        i_neg = np.array([
            row for label, row in zip(y, X)
            if label == self.negative_classification
        ])
        i_pos = np.array([
            row for label, row in zip(y, X)
            if label == self.positive_classification
        ])

        # centroid can be computed in multiple ways; here it is the mean
        # over all entries of the negative rows (a scalar)
        n_centroid = np.mean(i_neg)
        offset = n_centroid - i_pos
        scale = (1 - self.c_delta * np.fabs(offset) /
                 (np.fabs(n_centroid) + np.fabs(i_pos)))
        neg_zeros = np.zeros_like(i_neg)
        # positive rows first, then zero padding for the negative rows
        Mk = np.concatenate((scale * (offset**2), neg_zeros))
        TMk = np.concatenate((offset, neg_zeros))

        ones_col = np.ones((i_neg.shape[1], 1))
        pn = np.concatenate((i_pos, i_neg))
        pnl = np.concatenate(
            (np.ones(i_pos.shape[0]), -np.ones(i_neg.shape[0])))
        num_cols = i_neg.shape[1]
        num_rows = i_pos.shape[0] + i_neg.shape[0]

        # define cvxpy variables
        w = Variable(num_cols)
        b = Variable()
        xi0 = Variable(num_rows)
        t = Variable(num_rows)
        u = Variable(num_rows, num_cols)
        v = Variable(num_rows, num_cols)

        constraints = [
            xi0 >= 0,
            xi0 >= 1 - mul(pnl, (pn * w + b)) + t,
            t >= mul(Mk, u) * ones_col,
            mul(TMk, (-u + v)) == 0.5 * (1 + pnl) * w.T,
            u >= 0,
            v >= 0,
        ]

        # objective
        objective = cvx.Minimize(0.5 * (cvx.norm(w)) +
                                 slack_penalty * cvx.sum_entries(xi0))
        problem = cvx.Problem(objective, constraints)

        # use CVXOPT when it is available, otherwise the default solver
        solve_kwargs = {'solver': 'CVXOPT'} if OPT_INSTALLED else {}
        problem.solve(**solve_kwargs)

        self.weight_vector = np.array(w.value).T
        self.bias = b.value
예제 #6
0
    def train(self):
        """Solve the asymmetric dual problem and store the optimal w and b.

        Reads ``self.training_instances``, ``self.negative_classification``,
        ``self.positive_classification``, ``self.xmin``, ``self.xmax`` and
        ``self.c_f``. The solution is written to ``self.weight_vector`` and
        ``self.bias``; nothing is returned.

        Raises:
            ValueError: if ``self.training_instances`` is falsy.
        """
        if not self.training_instances:
            raise ValueError('Must set training instances before training')
        slack_penalty = 10

        if isinstance(self.training_instances, List):
            labels, sparse_features = sparsify(self.training_instances)
            y, X = np.array(labels), sparse_features.toarray()
        else:
            X, y = self.training_instances.numpy()

        # split the feature rows by class label
        i_neg = np.array([
            row for label, row in zip(y, X)
            if label == self.negative_classification
        ])
        i_pos = np.array([
            row for label, row in zip(y, X)
            if label == self.positive_classification
        ])

        ones_col = np.ones((i_neg.shape[1], 1))
        # positive rows first, then negative rows; labels follow that order
        pn = np.concatenate((i_pos, i_neg))
        pnl = np.concatenate(
            (np.ones(i_pos.shape[0]), -np.ones(i_neg.shape[0])))
        # per-entry lower/upper bounds with the same shape as pn
        xj_min = np.full_like(pn, self.xmin)
        xj_max = np.full_like(pn, self.xmax)
        num_cols = i_neg.shape[1]
        num_rows = i_pos.shape[0] + i_neg.shape[0]

        # define cvxpy variables
        w = cvx.Variable(num_cols)
        b = cvx.Variable()
        xi0 = cvx.Variable(num_rows)
        t = cvx.Variable(num_rows)
        u = cvx.Variable(num_rows, num_cols)
        v = cvx.Variable(num_rows, num_cols)

        constraints = [
            xi0 >= 0,
            xi0 >= 1 - mul(pnl, (pn * w + b)) + t,
            t >= mul(self.c_f,
                     (mul(xj_max - pn, v) - mul(xj_min - pn, u)) * ones_col),
            u - v == 0.5 * (1 + pnl) * w.T,
            u >= 0,
            v >= 0,
        ]

        # objective
        objective = cvx.Minimize(0.5 * (cvx.norm(w)) +
                                 slack_penalty * cvx.sum_entries(xi0))
        problem = cvx.Problem(objective, constraints)

        # use CVXOPT when it is available, otherwise the default solver
        solve_kwargs = {'solver': 'CVXOPT'} if OPT_INSTALLED else {}
        problem.solve(**solve_kwargs)

        self.weight_vector = np.array(w.value).T
        self.bias = b.value
예제 #7
0
    def train(self, instances):
        """Fit the wrapped learner on the given training data.

        Args:
            instances: Either a list of instances, which is converted with
                ``sparsify`` and densified before fitting, or an object
                whose ``data[0]`` and ``data[1]`` are passed to the
                learner's ``fit`` as first and second argument.

        """
        if not isinstance(instances, List):
            # dataset-style object: hand its two data members straight over
            self.learner.fit(instances.data[0], instances.data[1])
            return

        labels, sparse_features = sparsify(instances)
        self.learner.fit(sparse_features.toarray(), labels)
예제 #8
0
    def train(self):
        """Optimize the weight vector using the FDROP algorithm,
        i.e. formula (6) described in the Globerson and Roweis paper.

        Reads ``self.training_instances``, ``self.num_features``,
        ``self.hinge_loss_multiplier`` and ``self.max_feature_deletion``,
        prints those settings, and stores the solution in
        ``self.weight_vector`` and ``self.bias``. Nothing is returned.

        """
        training_data = self.training_instances
        if isinstance(training_data, List):
            label_list, feature_matrix = sparsify(training_data)
            num_instances = len(label_list)
            y = np.array(label_list).reshape((num_instances, 1))
            X = feature_matrix.toarray().reshape(
                (num_instances, self.num_features))
        else:
            X, y = training_data.numpy()
            num_instances = len(y)
            # labels become a column vector, features are used as-is
            y = y.reshape((num_instances, 1))

        C = self.hinge_loss_multiplier
        K = self.max_feature_deletion
        print("current C value: {}".format(C))
        print("current K: {}".format(K))
        print(X.shape)

        w = Variable(self.num_features)  # weight vector
        b = Variable()  # bias term
        t = Variable(num_instances)
        z = Variable(num_instances)
        v = Variable(num_instances, self.num_features)

        loss = sum_entries(pos(1 - mul_elemwise(y, X * w + b) + t))

        constraints = [t >= K * z + sum_entries(v, axis=1), v >= 0]
        # one additional constraint per training instance
        for i in range(num_instances):
            constraints.append(
                z[i] + v[i, :] >= y[i] * mul_elemwise(X[i], w).T)

        objective = Minimize(0.5 * (sum_squares(w)) + C * loss)
        problem = Problem(objective, constraints)
        problem.solve(solver=SCS)

        # transposed solution; original note: shape (1, self.num_features)
        self.weight_vector = np.array(w.value).T
        self.bias = b.value
예제 #9
0
    def decision_function_(self, instances):
        """Use the model to determine the decision function for each instance.

        Args:
            instances: A list of instances, a single ``Instance``, or any
                other object exposing a ``features`` attribute.

        Returns:
            The value of the learner's ``decision_function``: the full
            result for a list or a ``features`` object, and its first
            element for a single ``Instance``.

        """
        if isinstance(instances, List):
            _, X = sparsify(instances)
            return self.learner.decision_function(X)
        if type(instances) == Instance:
            matrix = instances.get_feature_vector().get_csr_matrix()
            # a single instance yields a one-element result; unwrap it
            return self.learner.decision_function(matrix)[0]
        # Fallback for dataset-like objects. This branch used to call a
        # misspelled ``dicision_function`` and never assigned the result,
        # so it could only raise.
        return self.learner.decision_function(instances.features)
예제 #10
0
    def predict_log_proba(self, instances):
        """Use the model to determine log probabilities for the instances.

        Args:
            instances: A list of instances, a single ``Instance``, or any
                other object exposing a ``features`` attribute.

        Returns:
            For a list: a Python list holding the first column of the
            learner's ``predict_log_proba`` output. For a single
            ``Instance`` or a ``features`` object: the learner's output
            unchanged.

        """
        if isinstance(instances, List):
            _, X = sparsify(instances)
            # keep only the first entry of each row
            return [row[0] for row in self.learner.predict_log_proba(X)]

        if type(instances) == Instance:
            dense = instances.get_feature_vector().get_csr_matrix().toarray()
            return self.learner.predict_log_proba(dense)

        return self.learner.predict_log_proba(instances.features)
예제 #11
0
    def predict(self, instances):
        """Predict classification labels using the wrapped learner's
        ``predict`` function.

        Args:
            instances: A list of instances, a single ``Instance``, or any
                other object exposing a ``features`` attribute.

        Returns:
            The learner's predictions: the full result for a list or a
            ``features`` object, and its first element for a single
            ``Instance``.

        """
        if isinstance(instances, List):
            _, X = sparsify(instances)
            return self.learner.predict(X.toarray())

        if type(instances) == Instance:
            dense = instances.get_feature_vector().get_csr_matrix().toarray()
            # a single instance yields a one-element result; unwrap it
            return self.learner.predict(dense)[0]

        return self.learner.predict(instances.features)
예제 #12
0
    def train(self):
        """Optimize the weight vector using the FDROP algorithm,
        i.e. formula (6) described in the Globerson and Roweis paper.

        The bias is modelled as an extra all-ones feature column, and its
        weight is left out of the regularization term. Reads
        ``self.training_instances``, ``self.num_features``,
        ``self.hinge_loss_multiplier`` and ``self.max_feature_deletion``.
        Stores ``self.weight_vector``, ``self.bias`` and ``self.t``, and
        prints the settings, the solution and the ten weights with the
        largest absolute value. Nothing is returned.

        """
        training_data = self.training_instances
        if isinstance(training_data, List):
            label_list, feature_matrix = sparsify(training_data)
            num_instances = len(label_list)
            y = np.array(label_list).reshape((num_instances, 1))
            X = feature_matrix.toarray().reshape(
                (num_instances, self.num_features))
        else:
            X, y = training_data.numpy()
            num_instances = len(y)
            # labels become a column vector, features are used as-is
            y = y.reshape((num_instances, 1))

        # append another column at X for bias
        bias_col = np.ones_like(y.T)
        X_prime = np.insert(X, X.shape[1], bias_col, axis=1)

        C = self.hinge_loss_multiplier
        K = self.max_feature_deletion
        print("current C value(hinge loss multipler): {}".format(C))
        print("current K(maximum feature deletion): {}".format(K))

        num_weights = self.num_features + 1
        w = Variable(num_weights)  # weight vector, last entry is the bias
        t = Variable(num_instances)
        z = Variable(num_instances)
        v = Variable(num_instances, num_weights)
        loss_f = Variable(num_instances)

        rows = range(num_instances)
        constraints = [t >= K * z + sum_entries(v, axis=1), v >= 0]
        constraints += [
            z[i] + v[i, :] >= y[i] * mul_elemwise(X_prime[i], w).T
            for i in rows
        ]
        constraints += [loss_f[i] >= 0 for i in rows]
        constraints += [
            loss_f[i] >= (1 - y[i] * (X_prime[i] * w) + t[i]) for i in rows
        ]

        # the bias weight (last entry of w) is excluded from regularization
        objective = Minimize(0.5 * (sum_squares(w[:-1])) +
                             C * sum_entries(loss_f))

        problem = Problem(objective, constraints)
        # switch another server to solve the scalability issue
        problem.solve(solver=SCS)

        # first row of the transposed solution: feature weights, then bias
        solution = np.array(w.value).T[0]
        self.weight_vector = solution[:-1]
        self.bias = solution[-1]
        self.t = t
        print(self.weight_vector)
        print(self.bias)

        print("final weight vector shape: {}".format(self.weight_vector.shape))
        top_idx = list(np.argsort(np.absolute(self.weight_vector))[-10:])
        print("indices with top 10 absolute value:")
        for idx in top_idx:
            print("index No.{} with value {}".format(idx,
                                                     self.weight_vector[idx]))