def csr_mat_to_instances(csr_mat, labels, binary=False):
    """
    Return a list of instances built from the rows of a CSR matrix.

    :param csr_mat: matrix exposing ``data``, ``indices``, ``indptr`` and
                    ``shape`` (CSR layout); one row per instance
    :param labels: indexable collection of labels, one per matrix row
    :param binary: when True build BinaryFeatureVectors (stored values are
                   ignored), otherwise build RealFeatureVectors
    :return: list of Instance objects in row order
    """
    values = csr_mat.data
    columns = csr_mat.indices
    row_ptr = csr_mat.indptr
    row_count, num_features = csr_mat.shape

    def build_vector(row):
        # row_ptr[row]:row_ptr[row + 1] delimits this row's stored entries.
        start, stop = row_ptr[row], row_ptr[row + 1]
        row_values = values[start:stop]
        active_columns = list(columns[start:stop])
        if binary:
            return BinaryFeatureVector(num_features, active_columns)
        return RealFeatureVector(num_features, active_columns, row_values)

    return [
        Instance(labels[row], build_vector(row))
        for row in range(row_count)
    ]
def find_centroid(instances: List[Instance]):
    """
    Return the centroid (per-feature mean) of the instances labelled -1.

    The feature count is taken from the first instance. Each feature is
    averaged over the instances whose label is -1; features whose mean is
    zero are left out of the sparse result.

    :param instances: non-empty list of instances
    :return: an Instance labelled -1 wrapping a RealFeatureVector that holds
             the centroid; the vector is empty when no instance is labelled -1
    """
    num_features = instances[0].get_feature_vector().feature_count
    negative_vectors = [
        instance.get_feature_vector()
        for instance in instances
        if instance.label == -1
    ]

    indices = []
    data = []
    # Guard against dividing by zero when there are no negative instances.
    if negative_vectors:
        negative_count = len(negative_vectors)
        for i in range(num_features):
            # Average over the negative instances. The previous code divided
            # by the number of features, which does not yield a centroid.
            mean = sum(fv.get_feature(i) for fv in negative_vectors) / negative_count
            if mean != 0:
                indices.append(i)
                data.append(mean)
    return Instance(-1, RealFeatureVector(num_features, indices, data))
def find_min(instances: List[Instance]):
    """
    Return the per-feature minimum over all instances.

    The feature count is taken from the first instance. Features whose
    minimum is zero are left out of the sparse result.

    :param instances: non-empty list of instances
    :return: RealFeatureVector holding the non-zero per-feature minima
    """
    num_features = instances[0].get_feature_vector().feature_count
    vectors = [instance.get_feature_vector() for instance in instances]

    indices = []
    data = []
    for i in range(num_features):
        # Use the true minimum rather than a hard-coded 1000 sentinel, which
        # was wrong whenever every value of a feature exceeded 1000.
        lowest = min(fv.get_feature(i) for fv in vectors)
        if lowest != 0:
            indices.append(i)
            data.append(lowest)
    return RealFeatureVector(num_features, indices, data)
def find_max(instances: List[Instance]):
    """
    Return the per-feature maximum over all instances.

    The feature count is taken from the first instance. Features whose
    maximum is zero are left out of the sparse result.

    :param instances: non-empty list of instances
    :return: RealFeatureVector holding the non-zero per-feature maxima
    """
    num_features = instances[0].get_feature_vector().feature_count
    vectors = [instance.get_feature_vector() for instance in instances]

    indices = []
    data = []
    for i in range(num_features):
        # Use the true maximum rather than starting from 0, which silently
        # dropped features whose values were all negative.
        highest = max(fv.get_feature(i) for fv in vectors)
        if highest != 0:
            indices.append(i)
            data.append(highest)
    return RealFeatureVector(num_features, indices, data)
def _generate_inst(self):
    """
    Rebuild ``self.inst`` from the dense vector ``self.x`` and label ``self.y``.

    Only the non-zero entries of ``self.x`` are kept in the sparse
    RealFeatureVector. The result is stored on ``self.inst``; nothing is
    returned.
    """
    nonzero_entries = [
        (position, value)
        for position, value in enumerate(self.x)
        if value != 0
    ]
    positions = [position for position, _ in nonzero_entries]
    values = [value for _, value in nonzero_entries]

    # Generate new instance
    self.inst = Instance(self.y, RealFeatureVector(len(self.x), positions, values))
def coordinate_greedy(self, instance: Instance) -> Instance:
    """
    Greedily update one feature at a time to incrementally improve the
    attacker's utility (coordinate descent).

    Every feature index is visited at most once, in shuffled order. For each
    index ``self.minimize_transform`` proposes the next point and
    ``self.transform_cost`` (Q; described by the original author as a
    quadratic distance) is evaluated for the current and the proposed point.
    The proposed point is always adopted. The walk ends early when the cost
    increased and the log of that increase is at most ``self.epsilon``, or
    when more than ``self.max_change`` coordinates have been visited.

    :param instance: the instance to transform
    :return: a new Instance labelled -1 holding the non-zero entries of the
             final point. NOTE(review): the previous docstring said the
             original instance is returned when the result is still
             classified as +1; no such check is visible in this method.
    """
    indices = [i for i in range(0, self.num_features)]
    # x is the reference (starting) point and xk the current iterate; both
    # names start out bound to the same array object.
    # NOTE(review): this relies on minimize_transform not mutating xk in
    # place -- confirm.
    x = xk = instance.get_csr_matrix().toarray()[0]
    # Despite its name, this is incremented on every visited coordinate.
    no_improve_count = 0
    shuffle(indices)
    # count is incremented below but never read in this method.
    count = 0
    for i in indices:
        xkplus1 = self.minimize_transform(xk, x, i)
        oldQ = self.transform_cost(xk, x)
        newQ = self.transform_cost(xkplus1, x)
        # Alternative convergence measure that was tried:
        # step_change = np.log(newQ) / np.log(oldQ)
        # The difference is used instead of the log ratio.
        xk = xkplus1
        no_improve_count += 1
        # The log is only taken of a strictly positive difference.
        if newQ - oldQ > 0 and oldQ != 0:
            step_change = np.log(newQ - oldQ)
            if step_change <= self.epsilon:
                break
        # NOTE(review): nesting reconstructed from flattened source; this cap
        # is assumed to sit at loop level -- confirm.
        if no_improve_count > self.max_change:
            break
        count += 1
    # Convert the dense result back to sparse (index, value) form.
    mat_indices = [x for x in range(0, self.num_features) if xk[x] != 0]
    mat_data = [xk[x] for x in range(0, self.num_features) if xk[x] != 0]
    new_instance = Instance(
        -1, RealFeatureVector(self.num_features, mat_indices, mat_data))
    return new_instance
def load_dataset(emailData: EmailDataset) -> List[Instance]:
    """
    Convert a dataset object into a list of instances.

    :param emailData: dataset exposing ``shape``, ``features`` (with
                      ``indptr``, ``indices`` and ``data``), ``labels``,
                      ``num_instances`` and ``binary``
    :return: one Instance per dataset row, carrying a BinaryFeatureVector
             when ``emailData.binary`` is set and a RealFeatureVector otherwise
    """
    feature_total = emailData.shape[1]
    row_ptr = emailData.features.indptr
    columns = emailData.features.indices
    values = emailData.features.data

    def to_vector(row):
        # row_ptr[row]:row_ptr[row + 1] delimits this row's stored entries.
        start, stop = row_ptr[row], row_ptr[row + 1]
        if emailData.binary:
            return BinaryFeatureVector(feature_total,
                                       columns[start:stop].tolist())
        row_values = values[start:stop].tolist()
        return RealFeatureVector(feature_total,
                                 columns[start:stop].tolist(),
                                 row_values)

    converted = []
    for row in range(0, emailData.num_instances):
        vector = to_vector(row)
        converted.append(Instance(emailData.labels[row], vector))
    return converted
def nd_arr_to_instances(nd_arr, labels=None, binary=False):
    """
    Return a list of instances built from the rows of a dense 2-D array.

    :param nd_arr: 2-D array with one row per instance. When ``labels`` is
                   None, the first column holds the labels and the remaining
                   columns hold the features.
    :param labels: optional indexable collection of labels, one per row
    :param binary: when True build BinaryFeatureVectors, otherwise
                   RealFeatureVectors
    :return: list of Instance objects in row order
    """
    num_instances = nd_arr.shape[0]
    if labels is None:
        # The [:, :1] slice keeps the label column two-dimensional, so each
        # labels[i] is a length-1 slice rather than a scalar (unchanged
        # behaviour).
        labels = nd_arr[:, :1]
        data = nd_arr[:, 1:]
        num_features = nd_arr.shape[1] - 1
    else:
        data = nd_arr
        num_features = nd_arr.shape[1]

    instance_lst = []
    for i in range(num_instances):
        row = data[i]
        mat_indices = [x for x in range(num_features) if row[x] != 0]
        # The feature vectors are sized by the feature count; the previous
        # code passed the number of instances here.
        if binary:
            feature_vector = BinaryFeatureVector(num_features, mat_indices)
        else:
            # Take the values from the same row as the indices; the previous
            # code filtered on row 0, misaligning values and indices.
            mat_data = [row[x] for x in mat_indices]
            feature_vector = RealFeatureVector(num_features, mat_indices,
                                               mat_data)
        instance_lst.append(Instance(labels[i], feature_vector))
    return instance_lst
def coordinate_greedy(self, instance: Instance):
    """
    Greedily update the feature to incrementally improve the attacker's
    utility (coordinate descent).

    For up to ``self.max_iteration`` rounds a random coordinate is picked,
    ``self.minimize_transform`` proposes a new point for it, and the proposal
    is kept only when ``self.transform_cost`` (Q; described by the original
    author as a quadratic distance) does not increase. The search stops early
    once ``self.convergence_time`` kept steps have each reduced Q by at most
    ``self.epsilon``.

    :param instance: the instance to transform
    :return: a new Instance labelled -1 holding the non-zero entries of the
             final point. NOTE(review): the previous docstring said the
             original instance is returned when the result is still
             classified as +1; no such check is visible in this method.
    """
    instance_len = instance.get_feature_count()
    if DEBUG:
        # Buffers for plotting Q per iteration; not read anywhere in this
        # method.
        iteration_list = []
        Q_value_list = []
    # x is the reference (starting) point and xk the current iterate; both
    # names start out bound to the same array object.
    # NOTE(review): this relies on minimize_transform not mutating xk in
    # place -- confirm.
    x = xk = instance.get_csr_matrix().toarray()[0]
    # converge counts the kept steps whose cost decrease is <= epsilon. It is
    # never reset, so those steps do not have to be consecutive.
    converge = 0
    for iteration_time in range(self.max_iteration):
        # NOTE(review): the `- 1` suggests an inclusive upper bound
        # (random.randint) -- confirm against the file's imports.
        i = randint(0, instance_len - 1)
        # Calculate the cost function and greedily improve from the random
        # feature i.
        xkplus1 = self.minimize_transform(xk, x, i)
        old_q = self.transform_cost(xk, x)
        new_q = self.transform_cost(xkplus1, x)
        # Only accept the proposal when Q does not increase.
        if new_q - old_q <= 0:
            xk = xkplus1
            step_change = old_q - new_q
            # The plain difference is used because np.log() may not converge
            # in special cases. Alternatives that were tried:
            # step_change = np.log(new_q) / np.log(old_q)
            # step_change = np.log(old_q - new_q)
            if step_change <= self.epsilon:
                converge += 1
                if converge >= self.convergence_time:
                    break
    # Convert the dense result back to sparse (index, value) form.
    # NOTE(review): uses self.num_features here but instance_len above;
    # presumably equal -- confirm.
    mat_indices = [x for x in range(0, self.num_features) if xk[x] != 0]
    mat_data = [xk[x] for x in range(0, self.num_features) if xk[x] != 0]
    new_instance = Instance(
        -1, RealFeatureVector(self.num_features, mat_indices, mat_data))
    return new_instance
def attack(self, instances) -> List[Instance]:
    """
    Performs a data modification attack

    Runs gradient descent with momentum on ``self.fvs``. The loop always
    runs once and then continues while the last step moved ``self.fvs`` by
    more than ``self.alpha`` (measured with ``np.linalg.norm``) and fewer
    than ``self.max_iter`` iterations have run. Progress and timing are
    printed on every iteration.

    :param instances: the input instances
    :return: the attacked instances -- deep copies of the input whose
             ``feature_vector`` is rebuilt from the final rows of ``self.fvs``
    :raises ValueError: if ``instances`` is empty
    """
    if len(instances) == 0:
        raise ValueError('Need at least one instance.')

    self.instances = instances
    self.return_instances = deepcopy(self.instances)
    # NOTE(review): self.fvs, self.old_fvs, self.theta and self.target_theta
    # are read below without being assigned in this method; presumably they
    # are initialised by _calculate_constants -- confirm.
    self._calculate_constants()

    fv_dist = 0.0
    theta_dist = np.linalg.norm(self.theta - self.target_theta)
    iteration = 0
    # No momentum on the first step.
    old_update_vector = 0
    while (iteration == 0 or (fv_dist > self.alpha and
                              iteration < self.max_iter)):
        print('Iteration: ', iteration, ' - FV distance: ', fv_dist,
              ' - theta distance: ', theta_dist, ' - beta: ', self.beta,
              sep='')

        begin = time.time()
        self._write_to_file()

        # Gradient descent with momentum
        gradient = self._calc_gradient()

        if self.verbose:
            print('\nGradient:\n', gradient, sep='')

        # Blend the previous update with the new gradient; eta is the
        # weight given to the previous update.
        update_vector = (self.eta * old_update_vector +
                         (1 - self.eta) * gradient)
        self.fvs -= self.beta * update_vector
        self._project_fvs()

        if self.verbose:
            print('\nFeature Vectors:\n', self.fvs, '\n', sep='')

        # Update variables
        self._calc_theta()
        fv_dist = np.linalg.norm(self.fvs - self.old_fvs)
        theta_dist = np.linalg.norm(self.theta - self.target_theta)
        self.old_fvs = deepcopy(self.fvs)
        # Shrink the step size; the factor is 1 on the first pass because
        # iteration is still 0 here.
        self.beta *= 1 / (1 + self.decay * iteration)
        old_update_vector = deepcopy(update_vector)

        self._cleanup_files()
        end = time.time()
        print('TIME: ', end - begin, 's', sep='')

        iteration += 1

    print('Iteration: FINAL - FV distance: ', fv_dist,
          ' - theta distance: ', theta_dist, ' - alpha: ', self.alpha,
          ' - beta: ', self.beta, '\n', sep='')

    if self.verbose:
        print('\n\nTarget Theta:\n\n', self.target_theta, '\n\nTheta:\n\n',
              self.theta, '\n')

    # Build appropriate RealFeatureVectors: keep only the non-zero entries
    # of each row of self.fvs.
    feature_count = self.fvs.shape[1]
    for i, fv in enumerate(self.fvs):
        indices = []
        data = []
        for j, val in enumerate(fv):
            if val != 0:
                indices.append(j)
                data.append(val)
        self.return_instances[i].feature_vector = RealFeatureVector(
            feature_count, indices, data)

    return self.return_instances
def gradient_descent(self, instance: Instance, neg_instances):
    """
    Iteratively move an instance using the update returned by
    ``self.compute_gradient`` until ``self.check_convergence_info`` reports
    convergence or ``self.max_iter`` iterations have run.

    The objective tracked for a point ``x`` is
    ``self.get_score(x) + self.lambda_val * dist``, where ``dist`` is the
    second value returned by ``self.compute_gradient``.

    :param instance: the instance to attack; converted to a dense array via
                     ``get_csr_matrix().toarray()``
    :param neg_instances: passed to ``self.compute_gradient`` for the
                          starting point (presumably the negative instances
                          to move towards -- confirm)
    :return: an Instance labelled -1 holding the non-zero entries of the
             point at which convergence was detected or, if the loop runs
             out, of the last stored candidate
    """
    # attack_instance -> np array
    attack_instance = instance.get_csr_matrix().toarray()
    # Same array object as attack_instance; handed to update_within_boundary
    # as the reference point on every iteration.
    root_instance = attack_instance
    obj_function_value_list = []

    # store the modified gradient descent attack instances
    # find a list of potential neg_instances, the closest distance, and
    # updated gradients
    candidate_attack_instances = [attack_instance]
    attacker_score = self.get_score(attack_instance)
    closer_neg_instances, dist, grad_update = self.compute_gradient(
        attack_instance, neg_instances)
    obj_func_value = attacker_score + self.lambda_val * dist
    obj_function_value_list.append(obj_func_value)

    for iter in range(self.max_iter):
        # no d(x, x_prime) is set to limit the boundary of attacks
        # Step from the last stored candidate.
        past_instance = candidate_attack_instances[-1]
        new_instance = self.update_within_boundary(past_instance,
                                                   root_instance,
                                                   grad_update)

        # compute the gradient and objective function value of the new
        # instance
        closer_neg_instances, dist, new_grad_update = \
            self.compute_gradient(new_instance, closer_neg_instances)
        new_attacker_score = self.get_score(new_instance)
        obj_func_value = new_attacker_score + self.lambda_val * dist

        # check convergence information
        # (An exact-equality "local min" check against the last stored
        # objective value was tried here and is disabled.)
        # check a small epsilon (difference is a small value after
        # several iterations)
        if self.check_convergence_info(obj_func_value,
                                       obj_function_value_list):
            # Converged: return the new point in sparse form.
            mat_indices = [
                x for x in range(0, self.num_features)
                if new_instance[0][x] != 0
            ]
            mat_data = [
                new_instance[0][x] for x in range(0, self.num_features)
                if new_instance[0][x] != 0
            ]
            return Instance(
                -1,
                RealFeatureVector(self.num_features, mat_indices, mat_data))

        # does not satisfy convergence requirement
        # store onto the list (only when the objective improved)
        elif obj_func_value < obj_function_value_list[-1]:
            obj_function_value_list.append(obj_func_value)

        # NOTE(review): nesting of the statements below was reconstructed
        # from flattened source; confirm against the original layout.
        # Remember the new point unless it is identical to the last one.
        if not (new_instance == candidate_attack_instances[-1]).all():
            candidate_attack_instances.append(new_instance)

        # attacker_score is not read again in this method after this
        # assignment.
        attacker_score = new_attacker_score
        grad_update = new_grad_update

    # Convergence has not been found within max_iter iterations: fall back
    # to the last stored candidate.
    mat_indices = [
        x for x in range(0, self.num_features)
        if candidate_attack_instances[-1][0][x] != 0
    ]
    mat_data = [
        candidate_attack_instances[-1][0][x]
        for x in range(0, self.num_features)
        if candidate_attack_instances[-1][0][x] != 0
    ]

    return Instance(
        -1, RealFeatureVector(self.num_features, mat_indices, mat_data))