Example #1
def csr_mat_to_instances(csr_mat, labels, binary=False):
    """
    Convert a scipy CSR matrix and a label array into a list of instances.
    :param csr_mat: sparse feature matrix with one row per instance
    :param labels: label for each row of csr_mat
    :param binary: if True build BinaryFeatureVectors, otherwise RealFeatureVectors
    :return: a list of Instance objects
    """
    data = csr_mat.data
    indices = csr_mat.indices
    indptr = csr_mat.indptr
    instance_len, num_features = csr_mat.shape
    instance_lst = []
    for i in range(instance_len):
        label = labels[i]
        instance_data = data[indptr[i]:indptr[i + 1]]
        instance_indices = list(indices[indptr[i]:indptr[i + 1]])
        if binary:
            instance_lst.append(
                Instance(label,
                         BinaryFeatureVector(num_features, instance_indices)))
        else:
            instance_lst.append(
                Instance(
                    label,
                    RealFeatureVector(num_features, instance_indices,
                                      instance_data)))
    return instance_lst
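
A minimal usage sketch, assuming Instance, BinaryFeatureVector, and RealFeatureVector are importable from the surrounding library; the sparse matrix itself comes from scipy:

import numpy as np
from scipy.sparse import csr_matrix

# two instances, three features; row 0 is labelled +1, row 1 is labelled -1
features = csr_matrix(np.array([[0.0, 2.5, 0.0],
                                [1.0, 0.0, 3.0]]))
labels = [1, -1]

instances = csr_mat_to_instances(features, labels)                   # RealFeatureVector per row
bin_instances = csr_mat_to_instances(features, labels, binary=True)  # BinaryFeatureVector per row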
Example #2
def find_centroid(instances: List[Instance]):
    """Return the centroid (feature-wise mean) of the negative (-1) instances."""
    num_features = instances[0].get_feature_vector().feature_count
    negative_count = sum(1 for instance in instances if instance.label == -1)
    indices = []
    data = []
    for i in range(num_features):
        total = 0
        for instance in instances:
            if instance.label == -1:
                total += instance.get_feature_vector().get_feature(i)
        # average over the negative instances, not over the feature count
        if negative_count > 0:
            total /= negative_count
        if total != 0:
            indices.append(i)
            data.append(total)
    return Instance(-1, RealFeatureVector(num_features, indices, data))
Example #3
def find_min(instances: List[Instance]):
    """Return a RealFeatureVector holding the per-feature minimum over the instances."""
    num_features = instances[0].get_feature_vector().feature_count
    indices = []
    data = []
    for i in range(num_features):
        min_val = float('inf')  # avoid shadowing the built-in min() and a magic sentinel
        for instance in instances:
            value = instance.get_feature_vector().get_feature(i)
            if value <= min_val:
                min_val = value
        if min_val != 0:
            indices.append(i)
            data.append(min_val)
    return RealFeatureVector(num_features, indices, data)
Example #4
def find_max(instances: List[Instance]):
    """Return a RealFeatureVector holding the per-feature maximum over the instances."""
    num_features = instances[0].get_feature_vector().feature_count
    indices = []
    data = []
    for i in range(num_features):
        max_val = 0  # avoid shadowing the built-in max()
        for instance in instances:
            value = instance.get_feature_vector().get_feature(i)
            if value >= max_val:
                max_val = value
        if max_val != 0:
            indices.append(i)
            data.append(max_val)
    return RealFeatureVector(num_features, indices, data)
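
A small sketch of how these helpers might be driven, assuming Instance and RealFeatureVector behave as in the examples above (get_feature_vector(), feature_count, get_feature(i)):

# three features per instance, stored sparsely (index list + value list)
inst_a = Instance(-1, RealFeatureVector(3, [0, 2], [1.0, 4.0]))
inst_b = Instance(-1, RealFeatureVector(3, [0, 1], [3.0, 2.0]))
instances = [inst_a, inst_b]

centroid = find_centroid(instances)   # feature-wise mean of the -1 instances
min_vec = find_min(instances)         # per-feature minimum
max_vec = find_max(instances)         # per-feature maximum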
Example #5
    def _generate_inst(self):
        """
        :return: a properly generated Instance that has feature vector self.x
                 and label self.y
        """

        indices = []
        data = []
        for i, val in enumerate(self.x):
            if val != 0:
                indices.append(i)
                data.append(val)

        # Generate new instance
        fv = RealFeatureVector(len(self.x), indices, data)
        self.inst = Instance(self.y, fv)
Example #6
    def coordinate_greedy(self, instance: Instance) -> Instance:
        """
         Greedily update the feature to incrementally improve the attacker's
         utility. Run CS from L random starting points in the feature space and
         repeat the alternation until the differences between instances are
         small or max_change is reached.

         no_improve_count: number of coordinate updates performed so far
         Q: transform cost (we use the quadratic distance)
         GreedyImprove: uses the coordinate descent algorithm.
        :param instance:
        :return: if the result is still classified as +1, return the original
                 instance; otherwise return the improved instance.
        """
        indices = [i for i in range(0, self.num_features)]
        x = xk = instance.get_csr_matrix().toarray()[0]
        no_improve_count = 0
        shuffle(indices)
        count = 0
        for i in indices:

            xkplus1 = self.minimize_transform(xk, x, i)
            oldQ = self.transform_cost(xk, x)
            newQ = self.transform_cost(xkplus1, x)
            # step_change = np.log(newQ) / np.log(oldQ)
            # using difference instead of log ratio for convergence check

            xk = xkplus1
            no_improve_count += 1
            if newQ - oldQ > 0 and oldQ != 0:
                step_change = np.log(newQ - oldQ)
                if step_change <= self.epsilon:
                    break
            if no_improve_count > self.max_change:
                break
            count += 1
        mat_indices = [x for x in range(0, self.num_features) if xk[x] != 0]
        mat_data = [xk[x] for x in range(0, self.num_features) if xk[x] != 0]
        new_instance = Instance(
            -1, RealFeatureVector(self.num_features, mat_indices, mat_data))

        return new_instance
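
The docstring refers to a quadratic transform cost Q and a per-coordinate minimization, but minimize_transform and transform_cost are not shown here. A hypothetical sketch of what they could look like for a plain squared-distance cost; the names, signatures, and the cost itself are assumptions for illustration, not the library's actual implementation:

import numpy as np

def transform_cost(xk, x):
    # hypothetical quadratic cost: half the squared Euclidean distance to the original point
    return 0.5 * np.sum((xk - x) ** 2)

def minimize_transform(xk, x, i):
    # hypothetical coordinate step: with a pure quadratic cost, the minimizer in
    # coordinate i is simply the original value x[i]; a real attacker objective
    # would also include a term from the classifier's decision function
    xkplus1 = np.copy(xk)
    xkplus1[i] = x[i]
    return xkplus1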
Example #7
def load_dataset(emailData: EmailDataset) -> List[Instance]:
    """
    Convert an EmailDataset object into a list of Instance objects.
    :param emailData: the dataset to convert
    :return: a list of Instance objects
    """

    instances = []
    num_features = emailData.shape[1]
    indptr = emailData.features.indptr
    indices = emailData.features.indices
    data = emailData.features.data
    for i in range(0, emailData.num_instances):
        if emailData.binary:
            tmp_vector = BinaryFeatureVector(num_features, indices[indptr[i]:indptr[i + 1]].tolist())
        else:
            instance_data = data[indptr[i]:indptr[i + 1]].tolist()
            tmp_vector = RealFeatureVector(num_features, indices[indptr[i]:indptr[i + 1]].tolist(),
                                           instance_data)
        instances.append(Instance(emailData.labels[i], tmp_vector))
    return instances
Example #8
def nd_arr_to_instances(nd_arr, labels=None, binary=False):
    """
    Return a list of instances built from a dense numpy array.
    :param nd_arr: 2-D array of features; if labels is None, the first
                   column is treated as the label column
    :param labels: optional label array (one label per row)
    :param binary: if True build BinaryFeatureVectors, otherwise RealFeatureVectors
    :return: a list of Instance objects
    """
    num_instances = nd_arr.shape[0]
    if labels is None:
        labels = nd_arr[:, 0]
        data = nd_arr[:, 1:]
        num_features = nd_arr.shape[1] - 1
    else:
        data = nd_arr
        num_features = nd_arr.shape[1]

    instance_lst = []
    for i in range(num_instances):
        # keep only the non-zero features of row i
        mat_indices = [x for x in range(0, num_features) if data[i][x] != 0]
        if binary:
            instance_lst.append(
                Instance(labels[i],
                         BinaryFeatureVector(num_features, mat_indices)))
        else:
            mat_data = [
                data[i][x] for x in range(0, num_features) if data[i][x] != 0
            ]
            instance_lst.append(
                Instance(
                    labels[i],
                    RealFeatureVector(num_features, mat_indices, mat_data)))
    return instance_lst
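
A minimal usage sketch with a dense numpy array; when labels is omitted, the first column of the array is treated as the label column:

import numpy as np

# first column is the label, remaining columns are features
arr = np.array([[ 1, 0.0, 2.5, 0.0],
                [-1, 1.0, 0.0, 3.0]])

instances = nd_arr_to_instances(arr)                        # labels taken from column 0
labeled = nd_arr_to_instances(arr[:, 1:], labels=[1, -1])   # labels passed explicitly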
Example #9
    def coordinate_greedy(self, instance: Instance):
        """
         Greedily update the feature to incrementally improve the attacker's
         utility. Run CS from L random starting points in the feature space and
         repeat the alternation until the differences between instances are
         small or max_change is reached.

         no_improve_count: number of points
         Q: transform cost (we use the quadratic distance)
         GreedyImprove: uses the coordinate descent algorithm.
        :param instance:
        :return: if the result is still classified as +1, return the original
                 instance; otherwise return the improved instance.
        """
        instance_len = instance.get_feature_count()
        if DEBUG:
            iteration_list = []
            Q_value_list = []

        x = xk = instance.get_csr_matrix().toarray()[0]

        # converge is used for checking the convergence condition:
        # if the last convergence_time iterations all satisfy the <= epsilon
        # condition, the attack has successfully found an optimum
        converge = 0

        for iteration_time in range(self.max_iteration):
            i = randint(0, instance_len - 1)

            # calculate the cost function and greedily improve from a random feature i
            xkplus1 = self.minimize_transform(xk, x, i)
            old_q = self.transform_cost(xk, x)
            new_q = self.transform_cost(xkplus1, x)

            # check whether Q_value actually descends and converges to a minimum
            # plot the iteration and Q_values using matplotlib
            #if DEBUG:
            #    iteration_list.append(iteration_time)
            #    Q_value_list.append(new_q)

            # if new_q < 0:
            #     print("Attack finishes because Q is less than 0")
            #     break

            if new_q - old_q <= 0:
                xk = xkplus1
                step_change = old_q - new_q
                # the np.log() may not converge in special cases
                # makes sure the cost function actually converges
                # alternative implementation?
                #step_change = np.log(new_q) / np.log(old_q)
                #step_change = np.log(old_q - new_q)

                if step_change <= self.epsilon:
                    converge += 1
                    if converge >= self.convergence_time:
                        #print("Attack finishes because of convergence!")
                        break

        #if DEBUG:
        #    plt.plot(iteration_list,Q_value_list)

        mat_indices = [x for x in range(0, self.num_features) if xk[x] != 0]
        mat_data = [xk[x] for x in range(0, self.num_features) if xk[x] != 0]
        new_instance = Instance(
            -1, RealFeatureVector(self.num_features, mat_indices, mat_data))
        return new_instance
Example #10
    def attack(self, instances) -> List[Instance]:
        """
        Performs a data modification attack
        :param instances: the input instances
        :return: the attacked instances
        """

        if len(instances) == 0:
            raise ValueError('Need at least one instance.')

        self.instances = instances
        self.return_instances = deepcopy(self.instances)
        self._calculate_constants()

        fv_dist = 0.0
        theta_dist = np.linalg.norm(self.theta - self.target_theta)
        iteration = 0
        old_update_vector = 0
        while (iteration == 0
               or (fv_dist > self.alpha and iteration < self.max_iter)):

            print('Iteration: ',
                  iteration,
                  ' - FV distance: ',
                  fv_dist,
                  ' - theta distance: ',
                  theta_dist,
                  ' - beta: ',
                  self.beta,
                  sep='')

            begin = time.time()

            self._write_to_file()

            # Gradient descent with momentum
            gradient = self._calc_gradient()

            if self.verbose:
                print('\nGradient:\n', gradient, sep='')

            update_vector = (self.eta * old_update_vector +
                             (1 - self.eta) * gradient)
            self.fvs -= self.beta * update_vector
            self._project_fvs()

            if self.verbose:
                print('\nFeature Vectors:\n', self.fvs, '\n', sep='')

            # Update variables
            self._calc_theta()
            fv_dist = np.linalg.norm(self.fvs - self.old_fvs)
            theta_dist = np.linalg.norm(self.theta - self.target_theta)
            self.old_fvs = deepcopy(self.fvs)
            self.beta *= 1 / (1 + self.decay * iteration)
            old_update_vector = deepcopy(update_vector)

            self._cleanup_files()

            end = time.time()
            print('TIME: ', end - begin, 's', sep='')

            iteration += 1

        print('Iteration: FINAL - FV distance: ',
              fv_dist,
              ' - theta distance: ',
              theta_dist,
              ' - alpha: ',
              self.alpha,
              ' - beta: ',
              self.beta,
              '\n',
              sep='')

        if self.verbose:
            print('\n\nTarget Theta:\n\n', self.target_theta, '\n\nTheta:\n\n',
                  self.theta, '\n')

        # Build appropriate RealFeatureVectors
        feature_count = self.fvs.shape[1]
        for i, fv in enumerate(self.fvs):
            indices = []
            data = []
            for j, val in enumerate(fv):
                if val != 0:
                    indices.append(j)
                    data.append(val)

            self.return_instances[i].feature_vector = RealFeatureVector(
                feature_count, indices, data)

        return self.return_instances
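
The core of the loop above is gradient descent with momentum: the new update direction blends the previous one (weighted by eta) with the current gradient, and the step size beta decays each iteration. A standalone numpy sketch of that update rule; the variable names mirror the attack code, and the constant gradient is a stand-in for self._calc_gradient():

import numpy as np

eta, beta, decay = 0.9, 0.1, 0.01
fvs = np.zeros((2, 3))               # toy feature vectors
old_update = np.zeros_like(fvs)

for iteration in range(5):
    gradient = np.ones_like(fvs)                        # stand-in for self._calc_gradient()
    update = eta * old_update + (1 - eta) * gradient    # momentum blend
    fvs -= beta * update                                # descent step
    old_update = update
    beta *= 1 / (1 + decay * iteration)                 # step-size decay, as in the loop above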
Example #11
    def gradient_descent(self, instance: Instance, neg_instances):
        #store iteration and objective values for plotting....
        #iteration_lst = []
        #objective_lst = []

        # attack_instance -> np array
        attack_instance = instance.get_csr_matrix().toarray()
        root_instance = attack_instance
        obj_function_value_list = []

        # store the modified gradient descent attack instances
        # find a list of potential neg_instances, the closest distance, and updated gradients
        candidate_attack_instances = [attack_instance]
        attacker_score = self.get_score(attack_instance)
        closer_neg_instances, dist, grad_update = self.compute_gradient(
            attack_instance, neg_instances)
        obj_func_value = attacker_score + self.lambda_val * dist
        obj_function_value_list.append(obj_func_value)

        for iter in range(self.max_iter):
            # no d(x, x_prime) bound is set to limit the attack region;
            # compute the obj_func_value of the last accepted instance
            # and append it to the value list
            #iteration_lst.append(iter)
            #objective_lst.append(obj_func_value)

            past_instance = candidate_attack_instances[-1]
            new_instance = self.update_within_boundary(past_instance,
                                                       root_instance,
                                                       grad_update)

            # compute the gradient and objective function value of the new instance
            closer_neg_instances, dist, new_grad_update = \
                self.compute_gradient(new_instance, closer_neg_instances)
            new_attacker_score = self.get_score(new_instance)
            obj_func_value = new_attacker_score + self.lambda_val * dist

            # check convergence information
            # we may reach a local min if the function value does not change
            # if obj_func_value == obj_function_value_list[-1]:
            #    print("Local min is reached. Iteration: %d, Obj value %d" %(iter,obj_func_value))
            #    mat_indices = [x for x in range(0, self.num_features) if new_instance[0][x] != 0]
            #    mat_data = [new_instance[0][x] for x in range(0, self.num_features) if new_instance[0][x] != 0]
            #    return Instance(-1, RealFeatureVector(self.num_features, mat_indices, mat_data))

            # check a small epsilon(difference is a small value after
            # several iterations)
            if self.check_convergence_info(obj_func_value,
                                           obj_function_value_list):
                #print("Goes to Convergence here.... Iteration: %d, Obj value %.4f" % (iter,obj_func_value))
                mat_indices = [
                    x for x in range(0, self.num_features)
                    if new_instance[0][x] != 0
                ]
                mat_data = [
                    new_instance[0][x] for x in range(0, self.num_features)
                    if new_instance[0][x] != 0
                ]

                #plt.plot(iteration_lst,objective_lst)
                return Instance(
                    -1,
                    RealFeatureVector(self.num_features, mat_indices,
                                      mat_data))

            # does not satisfy convergence requirement
            # store onto the list
            elif obj_func_value < obj_function_value_list[-1]:
                obj_function_value_list.append(obj_func_value)

            if not (new_instance == candidate_attack_instances[-1]).all():
                candidate_attack_instances.append(new_instance)

            attacker_score = new_attacker_score
            grad_update = new_grad_update

        #print("Convergence has not been found..")
        #plt.plot(iteration_lst, objective_lst)
        mat_indices = [
            x for x in range(0, self.num_features)
            if candidate_attack_instances[-1][0][x] != 0
        ]
        mat_data = [
            candidate_attack_instances[-1][0][x]
            for x in range(0, self.num_features)
            if candidate_attack_instances[-1][0][x] != 0
        ]

        return Instance(
            -1, RealFeatureVector(self.num_features, mat_indices, mat_data))
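
check_convergence_info is called above but not shown. A hypothetical helper consistent with how it is used, i.e. stop once the objective changes by less than a small epsilon between iterations; in the class above it would be a method using self.epsilon, but the name, signature, and default threshold here are assumptions:

def check_convergence_info(obj_func_value, obj_function_value_list, epsilon=1e-4):
    # hypothetical: converged when the newest objective value differs from the
    # most recent recorded value by less than epsilon
    if not obj_function_value_list:
        return False
    return abs(obj_function_value_list[-1] - obj_func_value) <= epsilon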