def collect_data(ws: np.ndarray) -> Tuple[float, float]:

    ###############
    # val1: use all windows; this provides a control/reference from which to measure difference to val2
    # (should be 0.0 when using toy corpus)
    ###############

    # val1
    x1 = ws[:, -2]  # all words
    y1 = ws[:, -2 + DISTANCE]  # neighbors
    val1i = drv.entropy_conditional(x1, y1).item() / drv.entropy(x1).item()

    ###############
    # val2: use only target windows
    # in theory, this should be invariant to number of target types
    ###############

    # target windows
    row_ids = np.isin(ws[:, -2], target_ids)
    target_windows = ws[row_ids]

    # val2
    x2 = target_windows[:, -2]  # target
    y2 = target_windows[:, -2 + DISTANCE]  # neighbors
    val2i = drv.entropy_conditional(x2, y2).item() / drv.entropy(x2).item()

    print(f'{len(ws):>12,} | val1={val1i:.3f} val2={val2i:.2f}')

    return val1i, val2i
def built_bestSplitPoint(data, attr, Class, gainD=None):
    '''
    :param data: data frame sorted by attr
    :param attr: the data frame column that you need to split
    :param Class: class column
    :return: best split point and the entropy of the (unsplit) data
    '''
    list1 = data[attr].to_list()
    if len(list1) == 1:
        return (list1[0], 0)
    entropyD = drv.entropy(data[Class].to_list())
    bestS = (list1[0] + list1[1]) / 2
    firstS = data.loc[data[attr] <= bestS]
    lastS = data.loc[data[attr] > bestS]
    if gainD is None:
        infoD = ((len(firstS[attr]) / len(list1)) * drv.entropy(firstS[Class].to_list())
                 + (len(lastS[attr]) / len(list1)) * drv.entropy(lastS[Class].to_list()))
        gainD = entropyD - infoD
    for i in range(1, len(list1) - 1):
        best = (list1[i] + list1[i + 1]) / 2
        firstS = data.loc[data[attr] <= best]
        lastS = data.loc[data[attr] > best]
        infoD = ((len(firstS[attr]) / len(list1)) * drv.entropy(firstS[Class].to_list())
                 + (len(lastS[attr]) / len(list1)) * drv.entropy(lastS[Class].to_list()))
        gain = entropyD - infoD
        if gain >= gainD:
            bestS = best
            gainD = gain
    return (bestS, entropyD)
def built_EntropyBased(data, attr, Class, k):
    '''
    :param data: pandas data frame
    :param attr: the attribute to discretize
    :param Class: class attribute in the data frame
    :param k: number of bins
    :return: entropy tree with 2^ceil(log2(k)) leaves, built with the entropy method from the pyitlib library
    '''
    data = data.sort_values(by=attr)
    EntTree = Tree(data)
    depth = log2(k)
    if (int(depth) - depth) != 0:
        depth = int(depth) + 1
    for i in range(int(depth)):
        bins = EntTree.getLeafs()
        for b in bins:
            data = b.getRoot()
            split = built_bestSplitPoint(data, attr, Class)
            b.setSplit(split[0])
            b.setEntropy(split[1])
            b.setLeft(Tree(data.loc[data[attr] <= split[0]]))
            b.setRight(Tree(data.loc[data[attr] > split[0]]))
    leafs = EntTree.getLeafs()
    for leaf in leafs:
        leaf.setEntropy(drv.entropy(leaf.getRoot()[Class].to_list()))
    return EntTree
def compare_with_all_patterns(self, target_histogram):
    if self.does_array_contains_nan(target_histogram):
        return 0
    outcome_max = {}
    outcome_min = {}
    for c in self.__classes_of_obj:
        outcome_max[c] = 0.0
        outcome_min[c] = 8.0
    for p in self.__patterns:
        self.__mean_histogram_from_data = p['histogram']['data']
        if self.does_array_contains_nan(self.__mean_histogram_from_data):
            continue
        mutual_inf = self.calculate_mutual_information(
            target_histogram) / drv.entropy(self.__mean_histogram_from_data, 2)
        for c in outcome_max:
            if c == p['object'] and outcome_max[c] < mutual_inf:
                outcome_max[c] = mutual_inf
            if c == p['object'] and outcome_min[c] > mutual_inf > 0:
                outcome_min[c] = mutual_inf
    return outcome_max, outcome_min
def calc_nmi_score(labels_true, labels_pred):
    """calculate normalized mutual information score

    Parameters
    ----------
    labels_true: labels from ground truth
    labels_pred: labels from clustering

    Returns
    -------
    nmi: normalized mutual information score
    """
    H_true = drv.entropy(labels_true, base=2)
    H_pred = drv.entropy(labels_pred, base=2)
    H_joint = drv.entropy_joint([labels_true, labels_pred], base=2)
    mi = H_true + H_pred - H_joint
    nmi = mi / max(H_true, H_pred)
    return nmi
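# A minimal usage sketch for calc_nmi_score, assuming numpy and pyitlib are
# available; the toy label arrays below are hypothetical. Two partitions that
# agree up to a relabelling should score 1.0.
import numpy as np
from pyitlib import discrete_random_variable as drv

labels_true = np.array([0, 0, 1, 1, 2, 2])
labels_pred = np.array([1, 1, 0, 0, 2, 2])  # same grouping, different label names
print(calc_nmi_score(labels_true, labels_pred))  # -> 1.0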
def SU(numero, feature, solution):
    # Symmetric uncertainty: SU(X, Y) = 2 * I(X;Y) / (H(X) + H(Y)),
    # computed after discretizing the feature into 30 equal-width bins.
    #print numero
    #Hfr = drv.entropy(feature, fill_value=None)
    #print "Entropy FR:{0:.3f}".format(Hfr)
    #Hfr_sol = drv.entropy_conditional(feature, solution, fill_value=None)
    #print "Entropy FR|I:{0:.3f}".format(Hfr_sol)
    #HI = drv.entropy(solution, fill_value=None)
    #print "Entropy I:{0:.3f}".format(HI)
    featureDisc = pd.cut(feature, 30, labels=False)
    IG = drv.information_mutual(featureDisc, solution)
    #print "IG:{0:.3f}".format(IG)
    #IG = Hfr - Hfr_sol
    #print IG
    den = drv.entropy(featureDisc) + drv.entropy(solution)
    #print "Den:{0:.3f}".format(den)
    result = 2 * (IG / den)
    #print "Result:{0:.3f}".format(result)
    return result
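# A minimal usage sketch for SU on hypothetical toy data, assuming numpy and
# pandas are imported as np/pd; `numero` only appears in a commented-out print,
# so 0 is passed here. A class derived from the feature scores well above an
# independent one.
import numpy as np

rng = np.random.default_rng(0)
feature = rng.normal(size=200)               # continuous feature, discretized inside SU
solution_dep = (feature > 0).astype(int)     # depends on the feature
solution_ind = rng.integers(0, 2, size=200)  # independent of the feature

print(SU(0, feature, solution_dep))  # substantially larger than the value below
print(SU(0, feature, solution_ind))  # close to 0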
def compute_discrete_Lmeasure(self):
    """Compute the un-normalized L-measure between all the discrete feature
    pairs. The value for all the possible pairs is stored in the L_measures
    dict. Auxiliary values like the mutual information (I_mutinfo) are also
    stored in their respective dicts for all the possible pairs.

    This method sets the `L_measure_dict` class attribute.

    Args:
        None

    Returns:
        None
    """
    # NOTE: the function expects the array to be in transposed form
    indi_entropies = drv.entropy(self.data_arr.T, estimator=self.ent_estimator)
    # indi_entropies = drv.entropy(self.data_arr.T)
    num_rand = self.data_arr.shape[1]  # number of random variables (feature columns)
    assert num_rand == len(indi_entropies)

    L_measures = {}  # dictionary storing the pairwise L-measures
    I_mutinfo = {}   # dictionary storing the pairwise mutual information
    # mu_vals = {}   # dictionary storing the pairwise MU values

    for i in range(num_rand):
        for j in range(i + 1, num_rand):
            key = (i, j)  # since 0-indexed
            h_i = indi_entropies[i]
            h_j = indi_entropies[j]

            # mu_ij = self.get_discrete_mu(i, j)

            # Potential error: I_ij may come out negative depending on the estimator
            I_ij = drv.information_mutual(self.data_arr.T[i],
                                          self.data_arr.T[j],
                                          estimator=self.ent_estimator)
            W_ij = min(h_i, h_j)

            num = (-2.0 * I_ij * W_ij)
            den = (W_ij - I_ij)
            eps = 1e-9  # epsilon value for the denominator
            inner_exp_term = num / (den + eps)

            # remove numerical errors by upper-bounding the exponent by 0
            inner_exp_term = min(0, inner_exp_term)
            L_measures[key] = np.sqrt(1 - np.exp(inner_exp_term))
            I_mutinfo[key] = I_ij

            # print(I_ij, W_ij, num, den)
            # print(key, L_measures[key], inner_exp_term)

    self.L_measure_dict = L_measures
    return
def calculate_weights(self, discretized_data: pd.DataFrame):
    """
    Calculate link strength according to the mutual information between a node
    and the values of its parent(s).
    """
    import bamt.utils.GraphUtils as gru
    if not all([
            i in ['disc', 'disc_num']
            for i in gru.nodes_types(discretized_data).values()
    ]):
        logger_network.error(
            f"calculate_weights() method deals only with discrete data. Continuous data: " +
            f"{[col for col, type in gru.nodes_types(discretized_data).items() if type not in ['disc', 'disc_num']]}"
        )
    if not self.edges:
        logger_network.error(
            "Bayesian Network hasn't been fitted yet. Please add edges with the add_edges() method"
        )
    if not self.nodes:
        logger_network.error(
            "Bayesian Network hasn't been fitted yet. Please add nodes with the add_nodes() method"
        )
    weights = dict()
    for node in self.nodes:
        parents = node.cont_parents + node.disc_parents
        if parents is None:
            continue
        y = discretized_data[node.name].values
        if len(parents) == 1:
            x = discretized_data[parents[0]].values
            LS_true = drv.information_mutual(X=y, Y=x)
            entropy = drv.entropy(X=y)
            weight = LS_true / entropy
            weights[(parents[0], node.name)] = weight
        else:
            for parent_node in parents:
                x = discretized_data[parent_node].values
                other_parents = [tmp for tmp in parents if tmp != parent_node]
                z = list()
                for other_parent in other_parents:
                    z.append(list(discretized_data[other_parent].values))
                LS_true = np.average(
                    drv.information_mutual_conditional(X=y, Y=x, Z=z, cartesian_product=True))
                entropy = np.average(
                    drv.entropy_conditional(X=y, Y=z, cartesian_product=True)) + 1e-8
                weight = LS_true / entropy
                weights[(parent_node, node.name)] = weight
    self.weights = weights
def theils_u(self, x, y):
    # Theil's U (uncertainty coefficient): U(x|y) = (H(x) - H(x|y)) / H(x).
    # Note: pyitlib's drv.entropy expects realizations, not a probability
    # distribution, so H(x) is estimated directly from the samples rather
    # than from a Counter-derived p(x); this also keeps both terms in bits.
    s_xy = drv.entropy_conditional(x, y)
    s_x = drv.entropy(x)
    if s_x == 0:
        return 1
    else:
        return (s_x - s_xy) / s_x
def H(self):
    """The entropies of all variables: a pandas Series if df was a pandas
    dataframe, else a 1D numpy array."""
    if self._H is None:
        # Using pyitlib to compute H (hopefully efficiently)
        # Unfortunately this does not work with numpy arrays, convert to pandas TODO report
        # note: we convert to string type to avoid a bug with ints. TODO...
        self._H = drv.entropy(self.dataset_df.T.astype(str))
        if not self.is_nparray:
            self._H = pd.Series(self._H, index=self.varnames)

        # basic sanity check: should all be positive
        assert np.all(self._H >= 0)

    return self._H
def bestIGattr(data, attributes, toSplit=False):
    """
    :param data:
    :param attributes:
    :param toSplit:
    :return: best choice by gain
    """
    classEntropy = drv.entropy(data['class']).item(0)
    attrsIG = {}
    for attr in attributes:
        attrsIG[attr] = find_entropy(data) - find_entropy_attribute(data, attr)
    maxGain = max(attrsIG.values())
    for attr in attrsIG:
        if attrsIG[attr] == maxGain:
            return attr
def get_entropy_d(x):
    '''Returns the entropy of X.

    Parameters
    ----------
    X : array-like, shape (n_samples)
        The data the entropy of which is computed

    Notes
    -----
    Kozachenko, L. F. & Leonenko, N. N. 1987 Sample estimate of entropy
    of a random vector. Probl. Inf. Transm. 23, 95-101.
    See also: Evans, D. 2008 A computationally efficient estimator for
    mutual information, Proc. R. Soc. A 464 (2093), 1203-1215.
    and:
    Kraskov A, Stogbauer H, Grassberger P. (2004). Estimating mutual
    information. Phys Rev E 69(6 Pt 2):066138.
    '''
    return drv.entropy(x)
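# A quick check of get_entropy_d with hypothetical data: drv.entropy treats the
# input as observed realizations of a discrete variable, so a uniform sample
# over four symbols gives exactly 2 bits under the default ML estimator.
import numpy as np

x = np.array([0, 1, 2, 3] * 25)
print(get_entropy_d(x))  # -> 2.0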
def compute_gains():
    with open('train.json', 'rb') as f:
        data = f.readlines()
    data = [json.loads(line) for line in data]  # convert string to dict format
    df = pd.DataFrame(data)  # load into dataframe
    services_df = df.services.apply(pd.Series)
    flattened_services = services_df.merge(
        df, left_index=True, right_index=True).drop(
            ['services'], axis=1).melt(
                id_vars=['device_class', 'device_id'],
                value_name="services").drop(['variable'], axis=1).dropna()
    flattened_services['services'] = flattened_services.apply(
        lambda x: extract_port(x['services']), axis=1)
    flattened_services = flattened_services.dropna()
    h_x = drv.entropy(df['device_class'])
    information_gain = []
    services_top_freq = flattened_services.groupby(
        ['services'])['device_id'].agg({
            "service_count": len
        }).sort_values("service_count", ascending=False).head(4000).reset_index()
    # services = flattened_services.groupby('services').agg('count').nlargest(500, columns=['device_id'])
    services_top_freq.apply(
        lambda x: information_gain.append(
            tuple((x['services'],
                   calculate_gain(x['services'], flattened_services, h_x)))),
        axis=1)
    information_gain.sort(key=lambda p: p[1], reverse=True)
    with open('port_gains.json', 'w') as f:
        print(information_gain)
        json.dump(information_gain, f)  # '[1, 2, [3, 4]]'
def run(traces: np.ndarray, plains: np.ndarray, keys: np.ndarray, attack_traces: np.ndarray,
        subkey: int, debug_mode_enabled: bool = False) -> List[int]:
    """
    The run method of pia

    :param traces: the traces to use
    :param plains: the plaintexts to use
    :param keys: the keys to use
    :param attack_traces: the traces to use for attacking
    :param subkey: the subkey index to analyze. Must be in the range [0-15].
    :param debug_mode_enabled: whether to enable debug mode
    :return: the calculated subkey corresponding to the subkey index specified
    """

    print("Executing Perceived Information Analysis")

    bar = progressbar.ProgressBar(
        max_value=len(attack_traces[0]) * (16 if subkey == 16 else 1),
        widgets=progress_bar_util.get_widgets(debug_mode_enabled))
    warnings.simplefilter("ignore", category=RuntimeWarning)

    perceived_information = [0] * len(attack_traces[0])
    max_pia = -float("inf")

    bar.start()

    indices = [subkey]
    if subkey == 16:
        indices = range(subkey)

    for i in indices:
        subkeys = [0] * len(keys)
        for j in range(len(keys)):
            subkeys[j] = int(keys[j][i])

        dummy_interp1d = interp1d(range(2), range(2))

        leakage_per_byte_value_matrix = [list() for _ in range(Pia.KEY_SIZE)]
        model_sampled_pdf_per_byte_value_array = np.array(
            [dummy_interp1d for _ in range(Pia.KEY_SIZE)])

        for j in range(len(traces[0])):
            for k in range(len(traces)):
                key = subkeys[k]
                plain = plains[k][i]
                byte = key ^ plain
                leakage_per_byte_value_matrix[byte].append(traces[k][j])

        for j in range(len(leakage_per_byte_value_matrix)):
            model_mu, model_std = norm.fit(leakage_per_byte_value_matrix[j])
            model_sampled_pdf_per_byte_value_array[j] = Pia.sample_pdf(model_mu, model_std, 10)

        scaling_factor = 1.0 / (len(traces) * len(attack_traces))

        for j in range(len(attack_traces[0])):
            column = attack_traces[:, j]
            chip_mu, chip_std = norm.fit(column)
            chip_sampled_pdf = Pia.sample_pdf(chip_mu, chip_std, 10)

            pia = 0
            for cell in column:
                for k in range(Pia.KEY_SIZE):
                    model_sampled_pdf = model_sampled_pdf_per_byte_value_array[k]
                    if not np.isclose(chip_std, 0.0):
                        model_probability = model_sampled_pdf(cell)
                        chip_probability = chip_sampled_pdf(cell)

                        # The sampling sometimes returns a negative probability. Correct this.
                        if model_probability <= 0.0:
                            model_probability = 0.000001
                        if chip_probability <= 0.0:
                            chip_probability = 0.0

                        pia += chip_probability * math.log2(model_probability)

            pia *= scaling_factor

            _, bin_edges = np.histogram(column, bins='auto')
            bin_values = np.digitize(column, bin_edges)
            o = np.array(bin_values, dtype=int)
            shannon_entropy = drv.entropy(o)

            pia = shannon_entropy - pia
            perceived_information[j] += pia

            if bar.value < bar.max_value:
                bar.update(bar.value + 1)

            if pia > max_pia:
                max_pia = pia

    for i in range(len(perceived_information)):
        perceived_information[i] /= max_pia * (16 if subkey == 16 else 1)

    bar.finish()

    plt.plot(perceived_information)
    plt.show()

    warnings.simplefilter("default")
    print("Done!")
    return perceived_information
def dcimig(factors, codes, continuous_factors=True, nb_bins=10):
    '''DCIMIG metric from A. Sepliarskaia, J. Kiseleva, and M. de Rijke,
       “Evaluating disentangled representations,” arXiv:1910.05587, 2020.

    :param factors:             dataset of factors;
                                each column is a factor and each line is a data point
    :param codes:               latent codes associated to the dataset of factors;
                                each column is a latent code and each line is a data point
    :param continuous_factors:  True:  factors are described as continuous variables
                                False: factors are described as discrete variables
    :param nb_bins:             number of bins to use for discretization
    '''
    # count the number of factors and latent codes
    nb_factors = factors.shape[1]
    nb_codes = codes.shape[1]

    # quantize factors if they are continuous
    if continuous_factors:
        factors = minmax_scale(factors)  # normalize all columns in [0, 1]
        factors = get_bin_index(factors, nb_bins)  # quantize values and get indexes

    # quantize latent codes
    codes = minmax_scale(codes)  # normalize all columns in [0, 1]
    codes = get_bin_index(codes, nb_bins)  # quantize values and get indexes

    # compute mutual information matrix
    mi_matrix = np.zeros((nb_factors, nb_codes))
    for f in range(nb_factors):
        for c in range(nb_codes):
            mi_matrix[f, c] = get_mutual_information(factors[:, f], codes[:, c], normalize=False)

    # compute the gap for all codes
    for c in range(nb_codes):
        mi_c = np.sort(mi_matrix[:, c])
        max_idx = np.argmax(mi_matrix[:, c])

        # gap between the highest and second highest terms
        gap = mi_c[-1] - mi_c[-2]

        # replace the best by the gap and the rest by 0
        mi_matrix[:, c] = mi_matrix[:, c] * 0
        mi_matrix[max_idx, c] = gap

    # find the best gap for each factor
    gap_sum = 0
    for f in range(nb_factors):
        gap_sum += np.max(mi_matrix[f, :])

    # sum the entropy of each factor
    factor_entropy = 0
    for f in range(nb_factors):
        factor_entropy += drv.entropy(factors[:, f])

    # normalize the summed gaps by the total factor entropy
    dcimig_score = gap_sum / factor_entropy

    return dcimig_score
# pos = nx.spring_layout(G)
# nx.draw_networkx(G, pos, node_size=5, alpha=0.5, with_labels=False)
# plt.show()
partition = community.best_partition(G)
# print(partition)
part = collections.OrderedDict(sorted(partition.items(), key=lambda x: int(x[0])))
rever = part
# print(rever)
lista = [v for v in rever.values()]
# print(lista)
communes = []
for i in range(16):
    for j in range(40):
        communes.append(i)
# print(communes)
c = drv.entropy(lista)
eta = drv.information_mutual(lista, communes)
eta = eta / c
print(p, eta)
# nmi = normalized_mutual_info_score(lista, communes, average_method='arithmetic')
# print(p, nmi)
def info_gain(feature_vals: np.ndarray, y_vals: np.ndarray) -> float:
    # Information gain IG(Y; X) = H(Y) - H(Y | X).
    h_y = drv.entropy(y_vals)
    h_y_given_x = drv.entropy_conditional(y_vals, feature_vals)
    return h_y - h_y_given_x
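# A minimal usage sketch for info_gain with hypothetical toy data: a feature
# that mirrors the label recovers the full H(Y) = 1 bit, while an independent
# feature yields (approximately) zero gain.
import numpy as np

y = np.array([0, 0, 1, 1])
x_informative = np.array([0, 0, 1, 1])    # determines the label
x_uninformative = np.array([0, 1, 0, 1])  # independent of the label

print(info_gain(x_informative, y))    # -> 1.0
print(info_gain(x_uninformative, y))  # -> 0.0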
def get_entropy(dataset, sensitive_attr, top_n=5):
    sensitive_index = dataset.feature_names.index(sensitive_attr)
    res = []

    # Independent entropy
    res.append(drv.entropy(dataset.features[:, sensitive_index]))
    res.append(drv.entropy(dataset.labels[:, 0]))
    entropy_feats = []
    for i in range(0, dataset.features.shape[1]):
        if i == sensitive_index:
            continue
        entropy_feats.append(drv.entropy(dataset.features[:, i]))
    entropy_feats.sort(reverse=True)
    res += entropy_feats[:top_n]
    res += entropy_feats[-top_n:]
    # Independent entropy

    # Conditional (cross) entropy
    res.append(
        drv.entropy_conditional(dataset.features[:, sensitive_index],
                                dataset.labels[:, 0]))
    res.append(
        drv.entropy_conditional(dataset.labels[:, 0],
                                dataset.features[:, sensitive_index]))

    cross_entropy_A = []
    cross_entropy_B = []
    for i in range(0, dataset.features.shape[1]):
        if i == sensitive_index:
            continue
        cross_entropy_A.append(
            drv.entropy_conditional(dataset.features[:, sensitive_index],
                                    dataset.features[:, i]))
        cross_entropy_B.append(
            drv.entropy_conditional(dataset.features[:, i],
                                    dataset.features[:, sensitive_index]))
    cross_entropy_A.sort(reverse=True)
    cross_entropy_B.sort(reverse=True)
    res += cross_entropy_A[:top_n]
    res += cross_entropy_A[-top_n:]
    res += cross_entropy_B[:top_n]
    res += cross_entropy_B[-top_n:]

    cross_entropy_A = []
    cross_entropy_B = []
    for i in range(0, dataset.features.shape[1]):
        if i == sensitive_index:
            continue
        cross_entropy_A.append(
            drv.entropy_conditional(dataset.labels[:, 0],
                                    dataset.features[:, i]))
        cross_entropy_B.append(
            drv.entropy_conditional(dataset.features[:, i],
                                    dataset.labels[:, 0]))
    cross_entropy_A.sort(reverse=True)
    cross_entropy_B.sort(reverse=True)
    res += cross_entropy_A[:top_n]
    res += cross_entropy_A[-top_n:]
    res += cross_entropy_B[:top_n]
    res += cross_entropy_B[-top_n:]
    # Conditional (cross) entropy

    for i in range(0, len(res)):
        res[i] = float(res[i])
    return res
cov_enc_ver = cov_enc_ver / ((rows - 1) * (cols) * 1.0000)
coef_ver = cov_ver / ((math.pow(Dx_ver, 0.5) * math.pow(Dy_ver, 0.5)) * 1.0000)
coef_enc_ver = cov_enc_ver / ((math.pow(Dx_enc_ver, 0.5) * math.pow(Dy_enc_ver, 0.5)) * 1.0000)

print("\nii) Vertical")
print("The correlation coefficient of original image is:", coef_ver)
print("The correlation coefficient of encrypted image is:", coef_enc_ver)

print("ENTROPY")

img_ent = img.flatten()  # convert to 1-D vector
msg_ent = msg.flatten()

from pyitlib import discrete_random_variable as drv

entropy = drv.entropy(img_ent)
entropy_enc = drv.entropy(msg_ent)

print("The entropy value of original image is:", entropy)
print("The entropy value of encrypted image is:", entropy_enc)

# print("ANALYSIS AGAINST ATTACKS")
# print("1] Additive noise")
# pad = 240
# def to_std_float(img):
#     img.astype(np.float16, copy=False)
#     img = np.multiply(img, (1 / 255))
#     return img
# def to_std_uint8(img):
def _jmim(selected_feature, feature_set, num_to_select, labels, score_list):
    ###
    # I(x,y;c) = H(x|c) - [ H(x,c,y) - H(c,y) ] + I(y;c)
    #
    ####
    start = datetime.datetime.now()
    col = list(feature_set)
    pool = []
    for i in col:
        candidate_f = feature_set[i]
        candidate_f = np.reshape(candidate_f.values, (1, -1))
        min_jmi = 1000000000
        min_feature = []
        index = 0
        I_xy_c = 0
        for sf_package in selected_feature:
            # print('round start at ' + str(datetime.datetime.now()))
            sf = sf_package[1]
            sf_idx = sf_package[0]
            I_yc = score_list.iloc[sf_idx, 1]
            sf = np.reshape(sf, (1, -1))
            labels = np.reshape(labels, (1, -1))

            H_c = drv.entropy(labels)
            H_x_c = drv.entropy_conditional(candidate_f, labels)
            xcy = np.append([candidate_f, labels], [sf], axis=0)
            H_xcy = drv.entropy_joint(xcy)
            cy = np.append([labels], [sf], axis=0)
            H_cy = drv.entropy_joint(cy)
            H_y_c = drv.entropy_conditional(sf, labels)
            H_cy2 = H_y_c + H_c

            I_xy_c = H_x_c - (H_xcy - H_cy) + I_yc
            labels = np.reshape(labels, (-1, 1))

            if I_xy_c < min_jmi:
                min_jmi = I_xy_c
                min_feature = candidate_f
                index = int(i)
            # print(I_xy_c)
            if I_xy_c < 0:
                print()
        pool.append([index, min_feature, min_jmi])
        # print('round end at ' + str(datetime.datetime.now()))

    max_candidate_score = 0
    max_candidate_idx = 0
    max_candidate = []
    for candidate in pool:
        if float(candidate[2]) > max_candidate_score:
            max_candidate = candidate[1]
            max_candidate_idx = candidate[0]
            max_candidate_score = float(candidate[2])

    selected_feature.append([max_candidate_idx, max_candidate, max_candidate_score])
    feature_set.drop(columns=[str(max_candidate_idx)], inplace=True)
    print(str(len(selected_feature)) + ' ' + str(max_candidate_idx) + ' ' +
          str(max_candidate_score) + ' at ' + str(datetime.datetime.now() - start))
    return selected_feature, feature_set