def mutual_information(self, X, Y, title=None, nbins_X=50, nbins_Y=50, noise_sigma='all'):
    # Drop positions where either series is NaN so all estimators see the same sample.
    no_nans_idx = np.logical_not(np.logical_or(np.isnan(X), np.isnan(Y)))
    Xq, _, _ = pyentropy.quantise(X[no_nans_idx], nbins_X)
    Yq, _, _ = pyentropy.quantise(Y[no_nans_idx], nbins_Y)
    s = pyentropy.DiscreteSystem(Yq, (1, nbins_Y), Xq, (1, nbins_X))
    s.calculate_entropies()

    # MINE; use the NaN-filtered values so the MIC is computed on the same
    # sample as the discrete mutual information and the regression below.
    mine = MINE()
    mine.compute_score(X[no_nans_idx].flatten(), Y[no_nans_idx].flatten())

    # Linear regression
    slope, intercept, r, p, stderr = \
        scipy.stats.linregress(X[no_nans_idx], Y[no_nans_idx])

    if title is not None:
        print(title)
    print("  MIC/MI/r^2/p/slope for %s:\t%.3f\t%.3f\t%s\t%s\t%s" %
          (noise_sigma, mine.mic(), s.I(), r**2, p, slope))
def mic_method(data_x, data_y, feat_labels, mic_threshold, is_split=1):
    # Fill missing values (mean imputation is an alternative):
    # data_x = data_x.fillna(data_x.mean())
    data_x = data_x.fillna(0)
    data_x = data_x.values
    # Min-max normalisation; the input must contain no NaNs, and the result is an ndarray.
    scaler = MinMaxScaler()
    data_x = scaler.fit_transform(data_x)
    # Convert the label DataFrame to an unlabelled ndarray so it can be fed to the model.
    data_y = data_y.values
    if is_split == 1:
        # Set the one-hot columns aside first.
        # onehot_data_x_left = data_x[:, :30]
        data_x_mid = data_x[:, 30:454]
        # onehot_data_x_right = data_x[:, 454:]
    else:
        data_x_mid = data_x
    # Maximal information coefficient; note this variant is intended for classification.
    # MINE.compute_score expects 1-D arrays, so score each column against the labels.
    m = MINE()
    for i in range(data_x_mid.shape[1]):
        m.compute_score(data_x_mid[:, i], data_y)
        print(m.mic())
    # Select the K best features and return the reduced data. Despite the function
    # name, the selection below uses the chi2 statistic, not MIC; the MIC scores
    # above are printed for inspection only.
    selector = SelectKBest(chi2, k=mic_threshold).fit(data_x_mid, data_y)
    selected_data_x = selector.transform(data_x_mid)
    return selected_data_x, data_y
def _evaluate_single(data, target_feature):
    mine = MINE(alpha=0.4, c=15)
    MICs = list()
    for i in range(data.shape[1]):
        mine.compute_score(target_feature, data[:, i])
        MICs.append(mine.mic())
    return MICs
def find_best_n_features_mic(n=8, out_path=''):
    # Compute MIC of every feature column against the label (x and y are module-level arrays).
    mine = MINE(alpha=0.6, c=15, est="mic_approx")
    mic_all = []
    for i in range(x.shape[1]):
        xi = x[:, i]
        mine.compute_score(xi, y)
        mic_all.append(mine.mic())
    # Pick out the n largest scores.
    best_n = []
    best_n_mic = []
    for i in range(n):
        best_position = np.nanargmax(mic_all)
        best_n.append(best_position)
        best_n_mic.append(copy.deepcopy(mic_all[best_position]))
        mic_all[best_position] = np.nan
    print('Found', n, 'features with largest MIC, whose positions are:')
    print(best_n)
    print()
    print('The MIC of these features are:')
    print(best_n_mic)
    print()
    best_features = x[:, best_n]
    print('Shape of features selected:', best_features.shape)
    best_features_with_label = pd.DataFrame(
        np.concatenate([best_features, y.reshape(len(y), 1)], axis=1))
    out_path = out_path + 'mic_best_' + str(n) + '.csv'
    best_features_with_label.to_csv(out_path, header=None, index=None)
def chooseIndependantInputVariables(inArr):
    selected_input_indexes = []
    for i in range(inArr.shape[1]):
        doSelect = True
        for j in range(i):
            # Subrata: for now choosing all inputs! Uncomment "break" to skip the
            # independence check and select everything.
            # break
            x = inArr[:, i]
            y = inArr[:, j]
            inputFeatureName1 = getInputParameterNameFromFeatureIndex(i)
            inputFeatureName2 = getInputParameterNameFromFeatureIndex(j)
            x_scaled = preprocessing.scale(x)
            y_scaled = preprocessing.scale(y)
            mine = MINE(alpha=0.6, c=15)
            mine.compute_score(x_scaled, y_scaled)
            print("Correlation between", inputFeatureName1, inputFeatureName2, "is", mine.mic())
            if float(mine.mic()) >= 0.99:
                doSelect = False
                print("\n ***** ==> will NOT select", inputFeatureName1,
                      "as it correlates with", inputFeatureName2, "\n")
        if doSelect:
            selected_input_indexes.append(i)
    return selected_input_indexes
def get_correlation(dataset, target, features=set([])):
    if target is None:
        raise ValueError('corr() needs a target value')
    if not isinstance(dataset, pd.DataFrame):
        dataset = pd.DataFrame(dataset)
    if not features:
        features = set(dataset.columns)
    numerical = {}
    text = {}
    num_types = (np.dtype('float64'), np.dtype('int64'), np.dtype('bool'))
    target = dataset[target]
    mine = MINE()
    for col in features:
        if dataset.dtypes[col] in num_types:
            # Compare dtypes with ==; identity checks on dtype objects are unreliable.
            if dataset.dtypes[col] == np.dtype('bool'):
                dataset[col] = dataset[col].astype(int, copy=False)
            mine.compute_score(dataset[col], target)
            numerical[col] = mine.mic()
        else:
            text[col] = np.nan
    return {
        'numerical': dict(sorted(numerical.items(), key=lambda d: d[1], reverse=True)),
        'object': dict(sorted(text.items(), key=lambda d: d[1], reverse=True))
    }
def get_mic(x, y): #get maximum information coefficient and pearson r value r = np.corrcoef(x, y)[0, 1] mine = MINE(alpha=0.4, c=15, est='mic_e') mine.compute_score(x, y) mic = mine.mic() return mic, r
def calculate_mic(df, y):
    max_info = MINE()
    mics = {}
    for column in df.columns:
        max_info.compute_score(df.loc[:, column], y.values)
        mics[column] = max_info.mic()
    return pd.Series(mics)
def _evaluate_single(data, target_feature):
    mine = MINE(alpha=0.3, c=15)
    MICs = list()
    for i in range(data.shape[1]):
        mine.compute_score(target_feature, data[:, i])
        MICs.append(mine.mic())
    return MICs
class MaximalInformationCorrelator(Correlator, ABC): """ Implements lag derivation for metric time series based on maximal information-based nonparametric exploration. Reference: https://minepy.readthedocs.io/en/latest/python.html """ @abstractmethod def _compute_correlation_internal(self): pass def __init__(self, config: dict): super().__init__(config) alpha = ErrorChecker.key_check_and_load('alpha', config, default=0.6) if alpha <= 0 or (alpha > 1 and alpha < 4): raise ValueError('Alpha should be in the range (0, 1] or [4, inf)') c = ErrorChecker.key_check_and_load('c', config, default=15) if c <= 0: raise ValueError('c has to be greater than 0') est = ErrorChecker.key_check_and_load('est', config, default='mic_approx') self.estimator = MINE(alpha=alpha, c=c, est=est) def _compute_correlation(self, metrics_vals_1: pd.Series, metrics_vals_2: pd.Series, lag: int): self.estimator.compute_score(metrics_vals_1, metrics_vals_2.shift(lag).fillna(0)) return self._compute_correlation_internal()
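# A minimal concrete subclass, as an illustrative sketch only: the parent class
# above leaves _compute_correlation_internal abstract, and a natural choice is to
# report the MIC of the fitted estimator. The name MicLagCorrelator is hypothetical
# and not part of the original code.
class MicLagCorrelator(MaximalInformationCorrelator):

    def _compute_correlation_internal(self):
        # compute_score has already been called by _compute_correlation,
        # so the estimator just reports the MIC of the lagged pair.
        return self.estimator.mic()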
def MIC_plot(self, x, y, numRows, numCols, plotNum, x_name, y_name, filename):
    # Build the MIC and correlation plot from the covariance matrix using a vectorized
    # implementation. To be used when categorical features are part of the model
    # (otherwise Pearson, Kendall and Spearman can be used).
    print("Pearson product-moment correlation coefficients np.corrcoef(x=", x_name,
          ", y=", y_name, "): ", np.corrcoef(x, y))
    r = np.around(np.corrcoef(x, y)[0, 1], 1)  # Pearson product-moment correlation coefficient
    # TODO: compute cov matrix for each one-hot encoding variable of the categorical
    # feature with MINE's Mutual Information coefficient
    fig = plt.figure(figsize=(33, 5), frameon=True)
    mine = MINE(alpha=0.6, c=15, est="mic_approx")
    mine.compute_score(x, y)
    mic = np.around(mine.mic(), 1)
    ax = plt.subplot(numRows, numCols, plotNum)
    # Pad the limits outward slightly so no data points are clipped.
    ax.set_xlim(xmin=min(x) - 1, xmax=max(x) + 1)
    ax.set_ylim(ymin=min(y) - 1, ymax=max(y) + 1)
    ax.set_title('Pearson r=%.1f\nMIC=%.1f Features %s and %s in %s'
                 % (r, mic, x_name, y_name, filename), fontsize=10)
    ax.set_frame_on(False)
    ax.axes.get_xaxis().set_visible(True)
    ax.axes.get_yaxis().set_visible(True)
    ax.plot(x, y, '*')
    plt.xlabel('X')
    plt.ylabel('Y')
    return ax
def mic(dataset: pd.DataFrame, labels: np.array) -> dict: score = {feature: None for feature in dataset} for feature, x in dataset.items(): mine = MINE() mine.compute_score(x.values.ravel(), labels) score[feature] = mine.mic() return score
def mic(points):
    # points: array of (x, y) pairs; transpose to get the two coordinate vectors.
    points = np.transpose(points)
    mine = MINE()
    mine.compute_score(points[0], points[1])
    return mine.mic()
def mutual_infomation_rank(col_names, X, y, topK=10):
    '''
    Mutual-information feature importance test.
    :param col_names: feature names, list
    :param X: feature matrix, numpy 2D array
    :param y: label vector, numpy array
    :param topK: number of top-ranked variables to output
    :return: sorted feature dataframe with weights
    '''
    # Mutual information is slow to compute, so subsample before scoring.
    original_size = len(y)
    sampling_size = 2000 if original_size > 2000 else original_size
    X, y = resample(X, y, random_state=0, n_samples=sampling_size)
    mine = MINE(alpha=0.6, c=15, est="mic_approx")
    scores = []
    for i in range(0, len(col_names)):
        mine.compute_score(X[:, i], y)
        scores.append(mine.mic())
    result_df = pd.DataFrame({'name': col_names, 'mutual_information': scores})
    result_df = result_df[['name', 'mutual_information'
                           ]].sort_values('mutual_information', ascending=False)
    print("size={m} sampling={s} features={n} top{k} rank for MINE testing:"
          .format(m=original_size, s=sampling_size, n=len(col_names), k=topK))
    print(result_df.head(topK))
    return result_df
def calculateCorrelationBetweenVectors(x, y):
    # The Pearson correlation coefficient measures the linear relationship between two
    # datasets and varies between -1 and +1, with 0 implying no correlation; strictly
    # speaking it requires each dataset to be normally distributed. MIC is used here
    # instead so that nonlinear relationships are also captured.
    # Note: both vectors are sorted independently before scoring, so this compares
    # the sorted marginal distributions rather than paired observations.
    commonSize = min(len(x), len(y))
    x_sorted = np.sort(x)[:commonSize]
    y_sorted = np.sort(y)[:commonSize]
    x_scaled = preprocessing.scale(x_sorted)
    y_scaled = preprocessing.scale(y_sorted)
    mine = MINE(alpha=0.6, c=15)
    mine.compute_score(x_scaled, y_scaled)
    corr = float(mine.mic())
    return corr
def McOne(data, label, r):
    # MIC-based filter (McOne): keep features whose MIC with the label is at least r,
    # then drop any feature that a stronger feature predicts better than the label does.
    # Features are assumed to be the rows of `data`.
    print("McOne start...")
    classLabel = label
    dataMat = data.values
    n = data.shape[0]
    micFC = [0] * n
    Subset = [-1] * n
    numSubset = 0
    # Relevance pass: MIC of every feature against the class label.
    for i in range(n):
        m = MINE()
        m.compute_score(dataMat[i], classLabel)
        micFC[i] = m.mic()
        if micFC[i] >= r:
            Subset[numSubset] = i
            numSubset += 1
    Subset = Subset[:numSubset]
    Subset.sort(key=lambda x: micFC[x], reverse=True)
    # Redundancy pass: remove feature q when a stronger feature e explains it
    # better than the label does.
    e = 0
    while e <= numSubset - 1:
        q = e + 1
        while q <= numSubset - 1:
            m = MINE()
            m.compute_score(dataMat[Subset[e]], dataMat[Subset[q]])
            if m.mic() >= micFC[Subset[q]]:
                for i in range(q, numSubset - 1):
                    Subset[i] = Subset[i + 1]
                numSubset -= 1
            else:
                q += 1
        e += 1
    return data.iloc[Subset[:numSubset]]
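# A minimal usage sketch for McOne, under the assumption that rows are features
# (e.g. genes) and columns are samples, with minepy's MINE imported as in the
# function above. The DataFrame and labels below are synthetic.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
expr = pd.DataFrame(rng.randn(30, 100))       # 30 candidate features, 100 samples
labels = rng.randint(0, 2, 100)               # binary class label per sample
expr.iloc[0] = labels + 0.1 * rng.randn(100)  # plant one informative feature
kept = McOne(expr, labels, 0.2)               # features with MIC >= 0.2, de-redundified
print(kept.shape)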
def calMIC(data): for i in range(5): mine = MINE(alpha=0.6, c=15) miles = data[data.veh == (i + 2)].iloc[:, 1] weight = data[data.veh == (i + 2)].iloc[:, 2] mine.compute_score(miles, weight) print("Without noise:", "MIC", mine.mic())
def calculateMIC(dataFileArray, data_mark=None, neadNorm=False):
    mic_map = {}
    for dataFileName in dataFileArray:
        if data_mark is None:
            data_mark = DATA_MARK
        _fileName = os.path.join(data_mark, dataFileName)
        student_data, headerArray = load_data_from_file(_fileName)
        _score_map = get_final_score_map()
        _score_array = []
        for _student_record in student_data:
            _score_array.append(_score_map[_student_record[0]])
        featureCount = len(headerArray) - 1
        if neadNorm:
            _score_array = normizeDataSet(_score_array)
        # Compute the MIC of each feature against the final score.
        m = MINE()
        for index in range(1, featureCount + 1):
            dataArray = getOneColumn(student_data, index)
            if neadNorm:
                dataArray = normizeDataSet(dataArray)
            m.compute_score(dataArray, _score_array)
            mic_map[headerArray[index]] = m.mic()
    sorted_list = sorted(mic_map.items(), key=lambda i: i[1], reverse=True)
    threshold = np.mean(list(mic_map.values()))
    for header, value in sorted_list:
        if value > threshold:
            print(header, value)
def micfliter(data, rate):
    """
    MIC feature selection: flag highly inter-correlated columns.
    Arguments:
        data: pandas DataFrame of features
        rate: float in range (0, 1); columns whose MIC with another column
              exceeds this threshold are flagged
    Return:
        list of column names to drop
    """
    m = MINE()
    micmatrix = []
    for colx in data.columns:
        micResult = []
        for coly in data.columns:
            m.compute_score(np.array(data[coly]), np.array(data[colx]))
            micResult.append(m.mic())
        micmatrix.append(micResult)
    micmatrix = pd.DataFrame(micmatrix, columns=data.columns)
    # Keep only the upper triangle so each pair is considered once.
    upper = micmatrix.where(np.triu(np.ones(micmatrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > rate)]
    return to_drop
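# A usage sketch for micfliter on synthetic data: one column is a deterministic
# transform of another and should be flagged for dropping.
import numpy as np
import pandas as pd

rng = np.random.RandomState(1)
df = pd.DataFrame({'a': rng.randn(300)})
df['b'] = df['a'] ** 2        # strongly related to 'a', so MIC is high
df['c'] = rng.randn(300)      # independent column
print(micfliter(df, 0.8))     # expect ['b']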
def MicEvaluate(dataX, dataY, name, pre_indices):
    '''
    Compute the maximal information coefficient between each conditional
    attribute and the decision attribute.
    :param dataX:
    :param dataY:
    :param name:
    :return:
    '''
    dataY = dataY.reshape(1, -1)[0]
    nFeatures = len(dataX[0])
    print("Number of input features:", nFeatures)
    coorArray = []
    mine = MINE(alpha=0.6, c=15)
    for i in range(0, nFeatures):
        l = [x[i] for x in dataX]
        mine.compute_score(l, dataY)
        temp = mine.mic()
        coorArray.append(abs(temp))
    print("MIC of each feature kept from the previous layer:", coorArray)
    coorIndex = np.argsort(coorArray)
    coorIndex_ = []  # map back to the original feature indices
    for i in coorIndex:
        coorIndex_.append(pre_indices[i])
    coorArray = np.array(coorArray)
    print("MIC correlation coefficients:")
    print("Features:", dict(zip(name[coorIndex_], coorArray[coorIndex])))
    name_coorArray = dict(zip(name[coorIndex_], coorArray[coorIndex]))
    return coorIndex_, coorArray, name_coorArray
def compute_MIC(x, y, alpha=0.6, c=15, all_metrics=False): from minepy import MINE mine = MINE(alpha, c) mine.compute_score(x, y) if all_metrics: return mine.mic(), mine else: return mine.mic()
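# A short usage sketch for compute_MIC; the arrays are synthetic. With
# all_metrics=True the fitted MINE object is returned as well, giving access to
# the other MINE statistics (mas, mev, tic, ...).
import numpy as np

x = np.linspace(0, 1, 1000)
y = np.sin(8 * np.pi * x)
print(compute_MIC(x, y))              # MIC only
score, mine = compute_MIC(x, y, all_metrics=True)
print(score, mine.mas(), mine.mev())  # extra MINE statistics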
def MIC(): train_x, train_y = MLdata('new_dataset.csv') mic = MINE() l = train_x.shape[1] print(l) for i in range(l): mic.compute_score(train_x[:,i], train_y) print(i, mic.mic())
def mine():
    # Iterate over a copy: removing from the list being iterated would skip columns.
    for column in tqdm(list(uncorrelated), desc="Running MINE test",
                       dynamic_ncols=True, leave=False):
        mine = MINE()
        mine.compute_score(epigenomes[column].values.ravel(), labels.values.ravel())
        score = mine.mic()
        if score >= correlation_threshold:
            uncorrelated.remove(column)
def MIC(self,x,y): mine = MINE(alpha=0.6,c=15,est="mic_approx") mine.compute_score(x,y) return mine.mic()
def feature_scoring(X, Y):
    names = ["x%s" % i for i in range(1, 37)]
    ranks = {}
    X = X.values[:, :]
    lr = LinearRegression(normalize=True)
    lr.fit(X, Y)
    ranks["Linear reg"] = rank_to_dict(np.abs(lr.coef_), names)
    ridge = Ridge(alpha=7)
    ridge.fit(X, Y)
    ranks["Ridge"] = rank_to_dict(np.abs(ridge.coef_), names)
    lasso = Lasso(alpha=.05)
    lasso.fit(X, Y)
    ranks["Lasso"] = rank_to_dict(np.abs(lasso.coef_), names)
    rlasso = RandomizedLasso(alpha=0.04)
    rlasso.fit(X, Y)
    ranks["Stability"] = rank_to_dict(np.abs(rlasso.scores_), names)
    # stop the search when 5 features are left (they will get equal scores)
    rfe = RFE(lr, n_features_to_select=5)
    rfe.fit(X, Y)
    # materialise the map before ranking (Python 3 map is lazy)
    ranks["RFE"] = rank_to_dict(list(map(float, rfe.ranking_)), names, order=-1)
    rf = RandomForestRegressor()
    rf.fit(X, Y)
    ranks["RF"] = rank_to_dict(rf.feature_importances_, names)
    f, pval = f_regression(X, Y, center=True)
    ranks["Corr."] = rank_to_dict(f, names)
    print('start MIC')
    mine = MINE()
    mic_scores = []
    for i in range(X.shape[1]):
        mine.compute_score(X[:, i], Y)
        mic_scores.append(mine.mic())
        print(i)
    ranks["MIC"] = rank_to_dict(mic_scores, names)
    print('finish MIC')
    r = {}
    for name in names:
        r[name] = round(
            np.mean([ranks[method][name] for method in ranks.keys()]), 2)
    methods = sorted(ranks.keys())
    ranks["Mean"] = r
    methods.append("Mean")
    print("\t%s" % "\t".join(methods))
    for name in names:
        print("%s\t%s" % (name, "\t".join(
            map(str, [ranks[method][name] for method in methods]))))
def mic(x, y):
    """
    MIC score function for sklearn feature selectors: returns the MIC together
    with a dummy p-value of 0.5, since selectors such as SelectKBest expect a
    (score, p-value) pair.
    """
    m = MINE()
    m.compute_score(x, y)
    return (m.mic(), 0.5)
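# A usage sketch showing how the (score, p-value) convention above plugs into
# sklearn's SelectKBest, which calls its score_func on the whole matrix; the
# wrapper below maps mic over the columns. Data is synthetic, and MINE is
# assumed imported as in the snippet above.
import numpy as np
from sklearn.feature_selection import SelectKBest

X = np.random.uniform(-1, 1, (500, 5))
y = X[:, 0] ** 2  # nonlinear dependence on the first column

def mic_scorer(X, y):
    # SelectKBest accepts a score_func returning (scores, pvalues) arrays.
    scores, pvalues = zip(*(mic(X[:, i], y) for i in range(X.shape[1])))
    return np.array(scores), np.array(pvalues)

X_best = SelectKBest(mic_scorer, k=2).fit_transform(X, y)
print(X_best.shape)  # (500, 2)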
def mic_hq(X, y, cut=0.2):
    # Flag features whose MIC with the target falls below the cutoff
    # (True = low-MIC feature, i.e. a candidate to discard).
    from minepy import MINE
    m = MINE()
    nf = X.shape[1]
    subs = np.array([False] * nf)
    for i in range(nf):
        m.compute_score(X[:, i], y)
        subs[i] = (m.mic() < cut)
    return subs
def _entropy_select(self):
    """Mutual-information (MIC) filter."""
    m = MINE()
    # One score per feature (column of self.x), not per sample.
    mic_array = np.zeros(self.x.shape[1])
    for i, x in enumerate(self.x.T):
        m.compute_score(x, self.y)
        mic_array[i] = m.mic()
    self._get_top_k_ids(mic_array)
def MIC(X, y):
    mics = []
    for i in range(X.shape[1]):
        m = MINE()
        m.compute_score(X[:, i], y)
        mics.append(m.mic())
    return mics
def toolkit_mic(arr0, arr1, alpha=0.6, c=15):
    """MIC"""
    np_temp0 = np.array(arr0)
    np_temp1 = np.array(arr1)
    # Pass the function arguments through instead of hard-coding them.
    mine = MINE(alpha=alpha, c=c, est="mic_approx")
    mine.compute_score(np_temp0, np_temp1)
    return mine.mic()
def main():
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
    root = 0

    # Collect column data from all runs, keeping only the 10000 < t <= 50000 window.
    collist = []
    for colname in glob.glob("./run*/COL*"):
        for line in open(colname):
            if "#" in line:
                continue
            line = line.split()
            t = float(line[0])
            if t < 10000.0:
                continue
            elif 50000.0 < t:
                break
            collist.append(line)

    # For every pair of columns, estimate MIC and TIC on 10 random subsamples.
    for i in range(len(collist[0]) - 1):
        for j in range(i + 1, len(collist[0]) - 1):
            miclist = []
            ticlist = []
            for _ in range(10):
                colpart = random.sample(collist, 10000)
                x = np.array([a[i + 1] for a in colpart], dtype=float)
                y = np.array([a[j + 1] for a in colpart], dtype=float)
                mine = MINE(est="mic_e")
                mine.compute_score(x, y)
                miclist.append(mine.mic())
                ticlist.append(mine.tic())
            if rank == root:
                print("%s,%s, %s, %s" % (i, j, np.mean(miclist), np.mean(ticlist)), flush=True)
                with open("./minedata.csv", "a") as wf:
                    wf.write("%s, %s, %s, %s\n" % (i, j, np.mean(miclist), np.mean(ticlist)))
def MIC(features, labels): mine = MINE() mic_scores = [] labels = labels.flatten() for i in range(features.shape[1]): mine.compute_score(features[:, i], labels) m = mine.mic() mic_scores.append(m) return mic_scores
def execute(self, symbol):
    """
    :param symbol: the symbol in which we are looking for correlations
    :type symbol: :class:`netzob.Common.Models.Vocabulary.AbstractField.AbstractField`
    """
    (attributeValues_headers, attributeValues) = self._generateAttributeValuesForSymbol(symbol)
    symbolResults = []
    # MINE computation of each field's combination
    for i, values_x in enumerate(attributeValues[:-1]):
        for j, values_y in enumerate(attributeValues[i + 1:]):
            mine = MINE(alpha=0.6, c=15)
            mine.compute_score(numpy.array(values_x), numpy.array(values_y))
            mic = round(mine.mic(), 2)
            if mic > float(self.minMic):
                # We add the relation to the results
                (x_fields, x_attribute) = attributeValues_headers[i]
                # j enumerates the slice starting at i + 1, so shift it back
                # to index the full headers list.
                (y_fields, y_attribute) = attributeValues_headers[i + 1 + j]
                # The relation should not apply on the same field
                if len(x_fields) == 1 and len(y_fields) == 1 and x_fields[0].id == y_fields[0].id:
                    continue
                pearson = numpy.corrcoef(values_x, values_y)[0, 1]
                if not numpy.isnan(pearson):
                    pearson = round(pearson, 2)
                relation_type = self._findRelationType(x_attribute, y_attribute)
                self._debug_mine_stats(mine)
                self._logger.debug("Correlation found between '"
                                   + str(x_fields) + ":" + x_attribute + "' and '"
                                   + str(y_fields) + ":" + y_attribute + "'")
                self._logger.debug("  MIC score: " + str(mic))
                self._logger.debug("  Pearson score: " + str(pearson))
                id_relation = str(uuid.uuid4())
                symbolResults.append({
                    "id": id_relation,
                    "relation_type": relation_type,
                    "x_fields": x_fields,
                    "x_attribute": x_attribute,
                    "y_fields": y_fields,
                    "y_attribute": y_attribute,
                    "mic": mic,
                    "pearson": pearson,
                })
    return symbolResults
def mic(X, Y):
    new_X, new_Y = remove_pairs_with_a_missing(X, Y)
    try:
        import minepy
        from minepy import MINE
    except ImportError:
        sys.exit("CRITICAL ERROR:2 Unable to import minepy package." +
                 " Please check your install.")
    mine = MINE(alpha=0.6, c=15)
    mine.compute_score(new_X, new_Y)
    return mine.mic(), None
def mine_features(data, features):
    print('...')
    # Walk over ordered pairs without mutating the list being iterated,
    # so each pair is scored exactly once.
    for idx, X_hat_idx in enumerate(features):
        for xi_idx in features[idx + 1:]:
            m = MINE()
            X_hat = data[X_hat_idx].values
            xi = data[xi_idx].values
            m.compute_score(X_hat, xi)
            I_X_hat_xi = m.mic()
            if I_X_hat_xi > 0.10:
                print('I({X_hat_idx},{xi_idx}): {I_X_hat_xi}'.format(
                    X_hat_idx=X_hat_idx, xi_idx=xi_idx, I_X_hat_xi=I_X_hat_xi))
def calcMICReg(df, target, col):
    """
    MIC of a single column against the regression target; categorical columns
    are mean-encoded against '_target_variable_' first.
    """
    m = MINE()
    if df[col].dtype.name == "category":
        g = df.groupby(by=[col])['_target_variable_'].mean()
        g = g.to_dict()
        X = df[col].values
        X = [g[x] for x in X]
    else:
        X = df[col].values
    m.compute_score(X, target)
    return {col: m.mic()}
def mysubplot(x, y, numRows, numCols, plotNum, xlim=(-4, 4), ylim=(-4, 4)): r = np.around(np.corrcoef(x, y)[0, 1], 1) mine = MINE(alpha=0.6, c=15) mine.compute_score(x, y) mic = np.around(mine.mic(), 1) ax = plt.subplot(numRows, numCols, plotNum, xlim=xlim, ylim=ylim) ax.set_title('Pearson r=%.1f\nMIC=%.1f' % (r, mic),fontsize=10) ax.set_frame_on(False) ax.axes.get_xaxis().set_visible(False) ax.axes.get_yaxis().set_visible(False) ax.plot(x, y, ',') ax.set_xticks([]) ax.set_yticks([]) return ax
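# A sketch of how mysubplot can be tiled into the classic MINE demo grid; the
# relationships below are illustrative, and np/plt/MINE are assumed imported as
# in the function above.
import numpy as np
import matplotlib.pyplot as plt

n = 1000
x = np.random.uniform(-4, 4, n)
mysubplot(x, x + np.random.normal(0, 1, n), 1, 3, 1)                    # linear + noise
mysubplot(x, x**2 + np.random.normal(0, 2, n), 1, 3, 2, ylim=(-4, 20))  # quadratic
mysubplot(x, np.random.uniform(-4, 4, n), 1, 3, 3)                      # independent
plt.show()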
def perform_mic_1p(p_sequences, p, cutoff=0.5, out_folder=''):
    p_sequences_t = transpose(array([list(z) for z in p_sequences])).tolist()
    mic_scores = []
    for counter1 in range(0, len(p_sequences_t) - 1):
        for counter2 in range(counter1 + 1, len(p_sequences_t)):
            mine = MINE(alpha=0.6, c=15)
            mine.compute_score(p_sequences_t[counter1], p_sequences_t[counter2])
            score = mine.mic()  # compute once, reuse below
            if score > float(cutoff):
                mic_score = {}
                mic_score['x'] = p + '_' + str(counter1 + 1)
                mic_score['y'] = p + '_' + str(counter2 + 1)
                mic_score['p1'] = p
                mic_score['p2'] = p
                mic_score['weight'] = format(score, '.3f')
                mic_scores.append(mic_score)
    write_mics_to_csv(mics=mic_scores, p1=p, p2=p, cutoff=cutoff, out_folder=out_folder)
    return mic_scores
def select_feature(self, data, label, threshold=0.7):
    """
    Perform feature selection by maximal information coefficient, which can
    capture both linear and non-linear relationships.
    """
    selected = []
    from minepy import MINE
    mine = MINE()
    for i, col in enumerate(data):
        print('feature selection: %d/%d %s' % (i, data.shape[1], col))
        mine.compute_score(data[col], label)
        if mine.mic() > threshold:
            selected.append(col)
    print('%d out of %d features were selected' % (len(selected), data.shape[1]))
    return selected
def get_corrcoef(X):
    # Subsample 5% of the rows to keep the MIC computation tractable.
    div = ShuffleSplit(X.shape[0], n_iter=1, test_size=0.05, random_state=0)
    for train, test in div:
        X = X[np.array(test)]
        break
    X = X.transpose()
    # Despite the name, the matrix holds MIC scores, not Pearson coefficients.
    pcc = np.ones((X.shape[0], X.shape[0]))
    m = MINE()
    t = time()
    for i in range(0, 1):
        for j in range(1, 20):
            m.compute_score(X[i], X[j])
            pcc[i, j] = pcc[j, i] = m.mic()  # np.corrcoef(X[i], X[j])[0, 1]
            print(i, j, pcc[i, j], time() - t)
    np.savetxt(os.path.join(CODE_PATH, 'feat_sim_pcc_2.csv'), pcc,
               fmt='%.3f', delimiter=',')
    print('Done with computing PCC,', 'using', time() - t, 's')
def perform_mic_2p(p1_sequences, p2_sequences, p1, p2, cutoff=0.5):
    mic_scores = []
    p1_sequences_t = transpose(array([list(z) for z in p1_sequences])).tolist()
    p2_sequences_t = transpose(array([list(z) for z in p2_sequences])).tolist()
    for idx1, record1 in enumerate(p1_sequences_t):
        for idx2, record2 in enumerate(p2_sequences_t):
            mine = MINE(alpha=0.6, c=15)
            mine.compute_score(record1, record2)
            score = mine.mic()  # compute once, reuse below
            if score > float(cutoff):
                mic_score = {}
                mic_score['x'] = p1 + '_' + str(idx1 + 1)
                mic_score['y'] = p2 + '_' + str(idx2 + 1)
                mic_score['p1'] = p1
                mic_score['p2'] = p2
                mic_score['weight'] = score
                mic_scores.append(mic_score)
    return mic_scores
def perform_mic_2p(p1_sequences, p2_sequences, p1, p2, cutoff=0.5):
    mic_scores = []
    p1_sequences_t = transpose(array([list(z) for z in p1_sequences])).tolist()
    p2_sequences_t = transpose(array([list(z) for z in p2_sequences])).tolist()
    for idx1, record1 in enumerate(p1_sequences_t):
        for idx2, record2 in enumerate(p2_sequences_t):
            mine = MINE(alpha=0.6, c=15)
            mine.compute_score(record1, record2)
            score = mine.mic()  # compute once, reuse below
            if score > float(cutoff):
                mic_score = {}
                mic_score['x'] = p1 + '_' + str(idx1 + 1)
                mic_score['y'] = p2 + '_' + str(idx2 + 1)
                mic_score['p1'] = p1
                mic_score['p2'] = p2
                mic_score['weight'] = format(score, '.3f')
                mic_scores.append(mic_score)
    write_mics_to_csv(mics=mic_scores, p1=p1, p2=p2, cutoff=cutoff)
    return mic_scores
def fit(self, X, y):
    # Initialize phi and the feature set. If the number of features to select
    # is not set, half of the features will be selected.
    n = self.n
    beta = self.beta
    verbose = self.verbose
    if n is None:
        n = int(X.shape[1] / 2)  # half of the features (columns), not samples
    features = np.arange(X.shape[1]).tolist()

    # Seed phi with the single feature maximising I(xi, y); best_mi must be
    # updated inside the loop, otherwise the last feature always wins.
    best_mi = -np.inf
    X_hat = 0
    for xi in features:
        m = MINE()
        m.compute_score(X[:, xi], y)
        mi_xi_y = m.mic()
        if best_mi < mi_xi_y:
            best_mi = mi_xi_y
            X_hat = xi
    phi = [X_hat]
    features.remove(X_hat)

    # Greedily add the feature maximising relevance minus beta * redundancy
    # (mRMR-style) until n features are selected.
    while len(phi) < n:
        mi_scores = np.zeros(len(features))
        for xi_idx, xi in enumerate(features):
            m = MINE()
            m.compute_score(X[:, xi], y)  # relevance I(xi, y)
            mi_xi_y = m.mic()
            sum_mi_xi_xj = 0
            for xj in phi:
                # redundancy I(xi, xj) against already selected features
                m = MINE()
                m.compute_score(X[:, xi], X[:, xj])
                sum_mi_xi_xj += m.mic()
            mi_scores[xi_idx] = mi_xi_y - beta * sum_mi_xi_xj
            if verbose >= 2:
                print("mi_score for xi:{xi} is {score}".format(xi=xi, score=mi_scores[xi_idx]))
        X_hat = np.argmax(mi_scores)
        if verbose == 1:
            print("X_hat is {X_hat}".format(X_hat=X_hat))
        X_hat = features[X_hat]
        phi.append(X_hat)
        features.remove(X_hat)
    self.phi = phi
    self.features = features
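# A minimal usage sketch, assuming fit() above belongs to a small selector class
# (here called MineMRMR, a hypothetical name) exposing n, beta and verbose.
import numpy as np

class MineMRMR(object):
    def __init__(self, n=None, beta=1.0, verbose=0):
        self.n, self.beta, self.verbose = n, beta, verbose
    fit = fit  # reuse the module-level method defined above

X = np.random.uniform(-1, 1, (300, 6))
y = X[:, 0] ** 2 + X[:, 3]
selector = MineMRMR(n=3)
selector.fit(X, y)
print(selector.phi)  # indices of the 3 selected features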
def get_mic(self): m = MINE() m.compute_score(self.x, self.y) return m.mic()
ranks["RFE"] = rank_to_dict(map(float, rfe.ranking_), names, order=-1) #RandomForestRegressor rf = RandomForestRegressor() rf.fit(X,Y) ranks["RF"] = rank_to_dict(rf.feature_importances_, names) #f_regression f, pval = f_regression(X, Y, center=True) ranks["Corr."] = rank_to_dict(f, names) #MINE mine = MINE() mic_scores = [] for i in range(X.shape[1]): mine.compute_score(X[:,i], Y) m = mine.mic() mic_scores.append(m) ranks["MIC"] = rank_to_dict(mic_scores, names) #----statistics--out--------- r = {} for name in names: r[name] = round(np.mean([ranks[method][name] for method in ranks.keys()]), 2) methods = sorted(ranks.keys()) ranks["Mean"] = r methods.append("Mean")
def train_and_analyse(_X, _y, features): X = _X Y = _y cv_l = cross_validation.KFold(X.shape[0], n_folds=10, shuffle=True, random_state=1) ranks = {} lr = LinearRegression(normalize=True) lr.fit(X, Y) ranks["Linear reg"] = rank_to_dict(np.abs(lr.coef_), features) ridge = RidgeCV(cv=cv_l) ridge.fit(X, Y) ranks["Ridge"] = rank_to_dict(np.abs(ridge.coef_), features) # Run the RandomizedLasso: we use a paths going down to .1*alpha_max # to avoid exploring the regime in which very noisy variables enter # the model lasso = LassoCV(cv=cv_l, n_jobs=2, normalize=True, tol=0.0001, max_iter=170000) lasso.fit(X, Y) ranks["Lasso"] = rank_to_dict(np.abs(lasso.coef_), features) rlasso = RandomizedLasso(alpha=lasso.alpha_, random_state=42) rlasso.fit(X, Y) ranks["Stability"] = rank_to_dict(np.abs(rlasso.scores_), features) rfe = RFE(lr, n_features_to_select=1) rfe.fit(X,Y) ranks["RFE"] = rank_to_dict(np.array(rfe.ranking_).astype(float), features, order=-1) rf = RandomForestRegressor(n_estimators=500) rf.fit(X,Y) ranks["RF"] = rank_to_dict(rf.feature_importances_, features) f, pval = f_regression(X, Y, center=True) ranks["Corr."] = rank_to_dict(np.nan_to_num(f), features) mine = MINE() mic_scores = [] for i in range(X.shape[1]): mine.compute_score(X[:,i], Y) m = mine.mic() mic_scores.append(m) ranks["MIC"] = rank_to_dict(mic_scores, features) r = {} for name in features: r[name] = round(np.mean([ranks[method][name] for method in ranks.keys()]), 2) methods = sorted(ranks.keys()) ranks["Mean"] = r methods.append("Mean") ranks = pd.DataFrame(ranks) selection_feature = ranks[ranks.Mean > 0.12].index.values return ranks, selection_feature
f2 = f_input2.T
Mdim = len(f1)
Mat = np.zeros((Mdim, Mdim))
# =============================================================================
# Compute MIC, PCC, KTau, NMIS Algorithm
# & Generate Correlation Matrix
# =============================================================================
# `mine` is a minepy MINE instance created earlier in the script.
print('Computing mutual information indices and generating correlation matrix...')
for i in range(Mdim):
    for j in range(Mdim):
        if mla == 'MIC':
            mine.compute_score(f1[i], f2[j])
            Mat[i][j] = mine.mic()
        elif mla == 'PCC':
            Mat[i][j] = pearsonr(f1[i], f2[j])[0]
        elif mla == 'KTau':
            Mat[i][j] = kendalltau(f1[i], f2[j])[0]
        elif mla == 'NMIS':
            Mat[i][j] = normalized_mutual_info_score(f1[i], f2[j])
        sys.stdout.write(".")
        g = open(output_dir + '/' + 'CorrMatrix_' + mla + '_' + str(Mdim) + '_'
                 + str(GPS) + '_' + nfilename + '.txt', 'a')
        if j == Mdim - 1:
            g.write(str(Mat[i][j]))
            g.write('\n')
        else:
            g.write(str(Mat[i][j]))
            g.write(' ')
class TestFunctions(unittest.TestCase):

    def setUp(self):
        self.mine = MINE(alpha=0.6, c=15)

    def build_const(self, n):
        x = np.linspace(0, 1, n)
        y = np.zeros(n)
        return x, y

    def build_linear(self, n):
        x = np.linspace(0, 1, n)
        return x, x

    def build_sine(self, n):
        x = np.linspace(0, 1, n)
        return x, np.sin(8*np.pi*x)

    def build_exp(self, n):
        x = np.linspace(0, 10, n)
        return x, 2**x

    def test_const(self):
        x, y = self.build_const(1000)
        self.mine.compute_score(x, y)
        assert_almost_equal(self.mine.mic(), 0., 4)
        assert_almost_equal(self.mine.mas(), 0., 4)
        assert_almost_equal(self.mine.mev(), 0., 4)
        assert_almost_equal(self.mine.mcn(), 2., 4)
        assert_almost_equal(self.mine.mcn_general(), 2., 4)

    def test_linear(self):
        x, y = self.build_linear(1000)
        self.mine.compute_score(x, y)
        assert_almost_equal(self.mine.mic(), 1., 4)
        assert_almost_equal(self.mine.mas(), 0., 4)
        assert_almost_equal(self.mine.mev(), 1., 4)
        assert_almost_equal(self.mine.mcn(), 2., 4)
        assert_almost_equal(self.mine.mcn_general(), 2., 4)

    def test_sine(self):
        x, y = self.build_sine(1000)
        self.mine.compute_score(x, y)
        assert_almost_equal(self.mine.mic(), 1., 4)
        assert_almost_equal(self.mine.mas(), 0.875, 3)
        assert_almost_equal(self.mine.mev(), 1., 4)
        assert_almost_equal(self.mine.mcn(), 4., 4)
        assert_almost_equal(self.mine.mcn_general(), 4., 4)

    def test_exp(self):
        x, y = self.build_exp(1000)
        self.mine.compute_score(x, y)
        assert_almost_equal(self.mine.mic(), 1., 4)
        assert_almost_equal(self.mine.mas(), 0., 4)
        assert_almost_equal(self.mine.mev(), 1., 4)
        assert_almost_equal(self.mine.mcn(), 2., 4)
        assert_almost_equal(self.mine.mcn_general(), 2., 4)
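# The suite above can be run directly with the standard unittest runner:
if __name__ == '__main__':
    unittest.main()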
for j, f in enumerate(ff):
    print("Noise: %d, function: %d" % (i, j))
    mic_approx_null, mic_e_null, tic_e_null, r2_null = [], [], [], []
    mic_approx_alt, mic_e_alt, tic_e_alt, r2_alt = [], [], [], []

    # null hypothesis
    for k in range(1, n_null + 1):
        x = np.random.rand(n)
        r = np.random.randn(n)
        y = f()

        # resimulate x for the null scenario
        x = np.random.rand(n)
        mine_approx.compute_score(x, y)
        mine_e.compute_score(x, y)
        mic_approx_null.append(mine_approx.mic())
        mic_e_null.append(mine_e.mic())
        tic_e_null.append(mine_e.tic())
        r2_null.append(np.corrcoef(x, y)[0][1]**2)

    # alternative hypothesis
    for k in range(1, n_alt + 1):
        x = np.random.rand(n)
        r = np.random.randn(n)
        y = f()

        mine_approx.compute_score(x, y)
        mine_e.compute_score(x, y)
def train_and_analyse(_X, _y, sno, ino):
    X = _X.copy()
    Y = _y
    features = X.columns.values
    cv_l = cross_validation.KFold(X.shape[0], n_folds=5,
                                  shuffle=True, random_state=1)
    ranks = {}
    selection_feature = []

    time_feature_1 = ['date2j']
    time_feature_2 = ['day', 'month', 'year']
    time_feature_3 = ['is_2012', 'is_2013', 'is_2014',
                      'fall', 'winter', 'spring', 'summer']
    time_feature_4 = ['weekday', 'is_weekend', 'is_holiday',
                      'is_holiday_weekday', 'is_holiday_weekend']
    time_feature_5 = ['MemorialDay', 'MothersDay', 'BlackFridayM3', 'BlackFriday1',
                      'NewYearsDay', 'IndependenceDay', 'VeteransDay', 'BlackFriday2',
                      'NewYearsEve', 'BlackFriday3', 'ChristmasDay', 'BlackFridayM2',
                      'ThanksgivingDay', 'Halloween', 'EasterSunday', 'ChristmasEve',
                      'ValentinesDay', 'PresidentsDay', 'ColumbusDay',
                      'MartinLutherKingDay', 'LaborDay', 'FathersDay', 'BlackFriday']
    weather_feature = ['high_precip', 'preciptotal', 'snowfall', 'high_snow',
                       'avgspeed', 'windy', 'temp_missing', 'tavg', 'hot', 'cold',
                       'frigid', 'thunder', 'snowcode', 'raincode']

    temp = time_feature_1 + time_feature_2 + time_feature_3 + time_feature_4 + time_feature_5
    X_f1 = X[temp].values

    # Univariate linear dependence.
    f, pval = f_regression(ut.get_processed_X_A(X_f1), Y, center=True)
    ranks["F_regr"] = pd.Series(rank_to_dict(np.nan_to_num(f), temp))

    # MIC of each feature against the target.
    mine = MINE()
    mic_scores = []
    for i in range(ut.get_processed_X_A(X_f1).shape[1]):
        mine.compute_score(ut.get_processed_X_A(X_f1)[:, i], Y)
        mic_scores.append(mine.mic())
    ranks["MIC"] = pd.Series(rank_to_dict(mic_scores, temp))

    # Per-feature random-forest score via cross-validation.
    rf = ut.get_regression_model('RandomForest', 0)
    scores = []
    for i in range(X_f1.shape[1]):
        score = cross_val_score(rf, X_f1[:, i:i + 1].astype(float), Y,
                                scoring="r2", cv=ShuffleSplit(len(X_f1), 3, .3),
                                n_jobs=2)
        scores.append(round(np.mean(score), 3))
    ranks['RF'] = pd.Series(rank_to_dict(np.abs(scores), temp))

    ranks = pd.DataFrame(ranks)
    print(ranks)
    # Keep any feature that clears 0.1 under at least one ranking.
    selection_feature.extend(ranks[ranks.RF > 0.1].index.values.tolist())
    selection_feature.extend(ranks[ranks.MIC >= 0.1].index.values.tolist())
    selection_feature.extend(ranks[ranks.F_regr >= 0.1].index.values.tolist())
    selection_feature = list(set(selection_feature))
    print(selection_feature)

    path = 'Analyse/store_{}/'.format(sno)
    mkdir_p(path)
    path += 'item_{}_(pair_analyse)'.format(ino)
    ranks.to_pickle(path)
    path += '.png'
    p.clf()
    p.cla()
    plt.figure(figsize=(16, 26))
    ranks.plot.barh(stacked=True)
    p.savefig(path, bbox_inches='tight', dpi=300)
    plt.close()
    return ranks, selection_feature
def mic(x, y): m = MINE() m.compute_score(x, y) return (m.mic(), 0.5)
# MIC of every standardized feature column against the target column (index 48).
for i in range(48):
    mine.compute_score(X_ALL_Standard_T[i], X_ALL_Standard_T[48])
    mics.append(mine.mic())

names = []
for c in allDF.columns.values:
    names.append(c)

# Map feature name -> MIC, then sort by score (ascending).
score_map = {}
for i in range(48):
    score_map[names[i]] = mics[i]

import operator
sorted_tuple = sorted(score_map.items(), key=operator.itemgetter(1))
vs = []
ks = []
for k, v in sorted_tuple:
    ks.append(k)
    vs.append(v)
# Pairwise MIC between the score columns of eight submission files.
res = []
res.append(pd.read_csv("./avg_xgbs_discret_feature_5.csv").score.values)
res.append(pd.read_csv("./R_7199.csv").score.values)
res.append(pd.read_csv("./rank_feature_xgb_ensemble.csv").score.values)
res.append(pd.read_csv("./avg_xgbs_discret_feature_10.csv").score.values)
res.append(pd.read_csv("./based_on_select_rank_feature.csv").score.values)
res.append(pd.read_csv("./xgb717.csv").score.values)
res.append(pd.read_csv("./725.csv").score.values)
res.append(pd.read_csv("./svm6938.csv").score.values)

cm = []
for i in range(8):
    tmp = []
    for j in range(8):
        m = MINE()
        m.compute_score(res[i], res[j])
        tmp.append(m.mic())
    cm.append(tmp)

import numpy as np
import matplotlib.pyplot as plt

def plot_confusion_matrix(cm, title, cmap=plt.cm.Blues):
    # fs is assumed elsewhere in the script to hold the eight file labels.
    plt.imshow(cm, interpolation="nearest", cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(8)
    plt.xticks(tick_marks, fs, rotation=45)
    plt.yticks(tick_marks, fs)
def mic(x, y):
    m = MINE()
    print(x)
    print(y)
    m.compute_score(x, y)
    return (m.mic(), 0.5)
for i in range(1, n_noise + 1):
    for j, f in enumerate(ff):
        mic_null, gmic_null, r2_null = [], [], []
        mic_alt, gmic_alt, r2_alt = [], [], []

        # null hypothesis
        for k in range(1, n_null + 1):
            print(i, j, k)
            x = np.random.rand(n)
            r = np.random.randn(n)
            y = f()

            # resimulate x for the null scenario
            x = np.random.rand(n)
            mine.compute_score(x, y)
            mic_null.append(mine.mic())
            gmic_null.append(mine.gmic(p=-1))
            r2_null.append(np.corrcoef(x, y)[0][1]**2)

        # alternative hypothesis
        for k in range(1, n_alt + 1):
            x = np.random.rand(n)
            r = np.random.randn(n)
            y = f()

            mine.compute_score(x, y)
            mic_alt.append(mine.mic())
            gmic_alt.append(mine.gmic(p=-1))
            r2_alt.append(np.corrcoef(x, y)[0][1]**2)
print "Lower noise", pearsonr(x, x + np.random.normal(0, 1, size)) print "Higher noise", pearsonr(x, x + np.random.normal(0, 10, size)) #明显缺陷:作为特征排序机制,他只对线性关系敏感.即便两个变量具有一一对应的关系,Pearson相关性也可能会接近0 a = np.random.uniform(-1, 1, 100000) #uniform(low,high,size) 随机数 print pearsonr(a, a**2)[0] #1.2 互信息和最大信息系数 (Mutual information and maximal information),[0,1] #互信息直接用于特征选择不太方便,最大信息系数首先寻找一种最优的离散化方式, #然后把互信息取值转换成一种度量方式,取值区间在[0,1]。minepy提供了MIC功能。 from minepy import MINE # m = MINE() x = np.random.uniform(-1, 1, 10000) m.compute_score(x, x**2) print m.mic() #1.3 距离相关系数 (Distance correlation),[0,1] #距离相关系数是为了克服Pearson相关系数的弱点而生的。在x和x^2这个例子中,即便Pearson相关系数是0, #我们也不能断定这两个变量是独立的(有可能是非线性相关);但如果距离相关系数是0,那么我们就可以说这两个变量是独立的。 import numpy as np def dist(x, y): #1d only return np.abs(x[:, None] - y) def d_n(x): d = dist(x, x)
def doMICAnalysisOfInputVariables(inArr, targetArr, targetName, mic_score_threshold,
                                  input_indexes_uncorrelated_features, targetQualityMap=None):
    goodTargetMap = getGlobalObject("goodTargetMap")
    selected_inArr = []
    selected_inArr_indexes = []
    selected_originalColumn_indexes = []
    inColMap = getGlobalObject("inputColumnIndexToNameMapFromFile")  # keys are col indexes, values are names

    numOfFeatures = 0
    try:
        numOfFeatures = inArr.shape[1]
    except:
        print("ERROR: \n", inArr)
        exit(0)

    k = 0
    for featureIndex in range(numOfFeatures):
        # Only consider features that survived the correlation filter.
        if featureIndex not in input_indexes_uncorrelated_features:
            continue
        x = inArr[:, featureIndex]
        x_scaled = preprocessing.scale(x)
        mine = MINE(alpha=0.6, c=15)
        mine.compute_score(x_scaled, targetArr)
        inputFeatureName = getInputParameterNameFromFeatureIndex(featureIndex)
        print_stats(mine, inputFeatureName, targetName, mic_score_threshold)
        if targetQualityMap is not None:
            targetQualityMap.append(float(mine.mic()))
        if float(mine.mic()) >= mic_score_threshold:
            selected_inArr.append(x)          # keep the input data column
            selected_inArr_indexes.append(k)  # keep the index corresponding to that column
            colIdx = getColumnIndexFromFeatureIndex(featureIndex)
            selected_originalColumn_indexes.append(colIdx)  # keep the original column index
            # Add the target itself to goodTargetMap; anomaly detection will
            # only use these targets.
            goodTargetMap[targetName] = True
            print("----------------- selected: ", inputFeatureName, colIdx, k)
        k = k + 1
    selected_inArr = np.array(selected_inArr).transpose()
    return selected_inArr, selected_inArr_indexes, selected_originalColumn_indexes
def interactionV(self, data):
    from minepy import MINE
    m = MINE()
    # MIC of the data against its own square; assumes `data` is a 1-D array.
    m.compute_score(data, data**2)
    print(m.mic())
def performMIC(transposed_list):
    mic_scores = []
    for counter1 in range(0, len(transposed_list) - 1):
        for counter2 in range(counter1 + 1, len(transposed_list)):
            mine = MINE(alpha=0.6, c=15)
            mine.compute_score(transposed_list[counter1], transposed_list[counter2])
            score = mine.mic()  # compute once, reuse below
            if score > 0.6:
                mic_score = {}
                mic_score['x'] = counter1
                mic_score['y'] = counter2
                mic_score['mic'] = score
                mic_scores.append(mic_score)
    return mic_scores
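# A short usage sketch for performMIC: rows of the input are variables, columns
# are observations; the data below is synthetic.
import numpy as np

obs = np.random.uniform(-1, 1, 200)
variables = [obs.tolist(),
             (obs**2).tolist(),                      # strongly related to row 0
             np.random.uniform(-1, 1, 200).tolist()] # independent row
print(performMIC(variables))  # reports pairs with MIC > 0.6 (here: rows 0 and 1)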