def chooseIndependantInputVariables(inArr):
    selected_input_indexes = []
    for i in range(inArr.shape[1]):
        doSelect = True
        for j in range(i):
            #Subrata: for now choosing all inputs! Comment out "break" later when you need it.
            #break # comment out this to select only independant inputs
            if(i == j):
                continue  # unreachable since j < i; the original's bare "return" here would have aborted the whole scan
            x = inArr[:,i]
            y = inArr[:,j]
            inputFeatureName1 = getInputParameterNameFromFeatureIndex(i)
            inputFeatureName2 = getInputParameterNameFromFeatureIndex(j)
            x_scaled = preprocessing.scale(x)
            y_scaled = preprocessing.scale(y)
            mine = MINE(alpha=0.6, c=15)
            mine.compute_score(x_scaled, y_scaled)
            print "Correlation between ", inputFeatureName1, inputFeatureName2, " is ", mine.mic()
            if(float(mine.mic()) >= 0.99):
                doSelect = False
                print "\n ***** ==> will NOT select ", inputFeatureName1, " as it correlates with ", inputFeatureName2, "\n"
        #end for
        if(doSelect):
            selected_input_indexes.append(i)
    return selected_input_indexes
def calculateCorrelationBetweenVectors(x, y):
    # The Pearson correlation coefficient measures the linear relationship between two datasets.
    # Strictly speaking, Pearson correlation requires that each dataset be normally distributed.
    # Like other correlation coefficients, it varies between -1 and +1, with 0 implying no correlation.
    # Correlations of -1 or +1 imply an exact linear relationship.
    # The p-value roughly indicates the probability of an uncorrelated system producing datasets
    # that have a Pearson correlation at least as extreme as the one computed from these datasets.
    # The p-values are not entirely reliable but are probably reasonable for datasets larger than 500 or so.
    #corr, p_value = pearsonr(x, y)
    commonSize = min(len(x), len(y))
    # Note: sorting decouples the x/y pairing, so what is compared here is the shape
    # of the two sorted (marginal) distributions, not the paired samples.
    x_sorted = np.sort(x)[:commonSize]  # the original sliced to commonSize - 1 and dropped one element
    y_sorted = np.sort(y)[:commonSize]
    x_scaled = preprocessing.scale(x_sorted)
    y_scaled = preprocessing.scale(y_sorted)
    mine = MINE(alpha=0.6, c=15)
    mine.compute_score(x_scaled, y_scaled)
    corr = float(mine.mic())
    return corr
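# A minimal, hypothetical sketch contrasting the Pearson correlation described in the
# comments above with the MIC that calculateCorrelationBetweenVectors returns: Pearson
# only detects the linear part of a relationship, while MIC also picks up the quadratic
# one. Assumes numpy, scipy, and minepy are installed; the data is made up.
import numpy as np
from scipy.stats import pearsonr
from minepy import MINE

np.random.seed(0)
x_demo = np.random.uniform(-1, 1, 1000)
y_demo = x_demo ** 2

print(pearsonr(x_demo, y_demo)[0])  # near 0: no linear relationship
demo_mine = MINE(alpha=0.6, c=15)
demo_mine.compute_score(x_demo, y_demo)
print(demo_mine.mic())              # near 1: strong functional relationship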
def _evaluate_single(data, target_feature):
    mine = MINE(alpha=0.4, c=15)
    MICs = list()
    for i in range(data.shape[1]):
        mine.compute_score(target_feature, data[:, i])
        MICs.append(mine.mic())
    return MICs
def execute(self, symbol):
    """
    :param symbol: the symbol in which we are looking for correlations
    :type symbol: :class:`netzob.Common.Models.Vocabulary.AbstractField.AbstractField`
    """
    (attributeValues_headers, attributeValues) = self._generateAttributeValuesForSymbol(symbol)
    symbolResults = []

    # MINE computation of each field's combination
    for i, values_x in enumerate(attributeValues[:-1]):
        for j, values_y in enumerate(attributeValues[i + 1:]):
            mine = MINE(alpha=0.6, c=15)
            mine.compute_score(numpy.array(values_x), numpy.array(values_y))
            mic = round(mine.mic(), 2)
            if mic > float(self.minMic):
                # We add the relation to the results
                (x_fields, x_attribute) = attributeValues_headers[i]
                # j enumerates attributeValues[i + 1:], so the matching header sits at
                # i + 1 + j; the original indexed attributeValues_headers[j]
                (y_fields, y_attribute) = attributeValues_headers[i + 1 + j]

                # The relation should not apply on the same field
                if len(x_fields) == 1 and len(y_fields) == 1 and x_fields[0].id == y_fields[0].id:
                    continue

                pearson = numpy.corrcoef(values_x, values_y)[0, 1]
                if not numpy.isnan(pearson):
                    pearson = round(pearson, 2)
                relation_type = self._findRelationType(x_attribute, y_attribute)
                self._debug_mine_stats(mine)
                self._logger.debug("Correlation found between '" + str(x_fields) + ":"
                                   + x_attribute + "' and '" + str(y_fields) + ":"
                                   + y_attribute + "'")
                self._logger.debug("  MIC score: " + str(mic))
                self._logger.debug("  Pearson score: " + str(pearson))
                id_relation = str(uuid.uuid4())
                symbolResults.append({
                    "id": id_relation,
                    "relation_type": relation_type,
                    "x_fields": x_fields,
                    "x_attribute": x_attribute,
                    "y_fields": y_fields,
                    "y_attribute": y_attribute,
                    "mic": mic,
                    "pearson": pearson,
                })
    return symbolResults
def mine_features(data, features):
    print '...'
    features = list(features)
    while features:
        # the original removed X_hat_idx from `features` while iterating over it,
        # which silently skips every other feature; popping from a copy keeps all pairs
        X_hat_idx = features.pop(0)
        for xi_idx in features:
            m = MINE()
            X_hat = data[X_hat_idx].values
            xi = data[xi_idx].values
            m.compute_score(X_hat, xi)
            I_X_hat_xi = m.mic()
            if I_X_hat_xi > 0.10:
                print 'I({X_hat_idx},{xi_idx}): {I_X_hat_xi}'.format(
                    X_hat_idx=X_hat_idx, xi_idx=xi_idx, I_X_hat_xi=I_X_hat_xi)
def calcMICReg(df, target, col):
    """Return the MIC between df[col] and target; categorical columns are
    mean-target encoded (category -> mean of '_target_variable_') first."""
    m = MINE()
    if df[col].dtype.name == "category":
        g = df.groupby(by=[col])['_target_variable_'].mean()
        g = g.to_dict()
        X = df[col].values
        X = [g[x] for x in X]
    else:
        X = df[col].values
    m.compute_score(X, target)
    return {col: m.mic()}
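# A small illustration, on made-up data, of the mean-target encoding that calcMICReg
# applies to categorical columns before computing the MIC: each category is replaced
# by the mean of '_target_variable_' within that category.
import pandas as pd

df_demo = pd.DataFrame({
    'city': pd.Categorical(['a', 'b', 'a', 'b']),
    '_target_variable_': [1.0, 3.0, 2.0, 5.0],
})
g_demo = df_demo.groupby(by=['city'])['_target_variable_'].mean().to_dict()
print([g_demo[v] for v in df_demo['city']])  # [1.5, 4.0, 1.5, 4.0]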
def perform_mic_1p(p_sequences, p, cutoff=0.5, out_folder=''):
    p_sequences_t = transpose(array([list(z) for z in p_sequences])).tolist()
    mic_scores = []
    for counter1 in range(0, len(p_sequences_t) - 1):
        for counter2 in range(counter1 + 1, len(p_sequences_t)):
            mine = MINE(alpha=0.6, c=15)
            mine.compute_score(p_sequences_t[counter1], p_sequences_t[counter2])
            if (mine.mic() > float(cutoff)):
                mic_score = {}
                mic_score['x'] = p+'_'+str(counter1+1)
                mic_score['y'] = p+'_'+str(counter2+1)
                mic_score['p1'] = p
                mic_score['p2'] = p
                mic_score['weight'] = format(mine.mic(), '.3f')
                mic_scores.append(mic_score)
    write_mics_to_csv(mics=mic_scores, p1=p, p2=p, cutoff=cutoff, out_folder=out_folder)
    return mic_scores
def mysubplot(x, y, numRows, numCols, plotNum, xlim=(-4, 4), ylim=(-4, 4)):
    r = np.around(np.corrcoef(x, y)[0, 1], 1)
    mine = MINE(alpha=0.6, c=15)
    mine.compute_score(x, y)
    mic = np.around(mine.mic(), 1)
    ax = plt.subplot(numRows, numCols, plotNum, xlim=xlim, ylim=ylim)
    ax.set_title('Pearson r=%.1f\nMIC=%.1f' % (r, mic), fontsize=10)
    ax.set_frame_on(False)
    ax.axes.get_xaxis().set_visible(False)
    ax.axes.get_yaxis().set_visible(False)
    ax.plot(x, y, ',')
    ax.set_xticks([])
    ax.set_yticks([])
    return ax
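# A hedged usage sketch for mysubplot above: draw two of the classic Pearson-vs-MIC
# comparison panels (a noisy linear and a noisy quadratic relationship) on one figure.
# The data here is made up purely for illustration.
import numpy as np
import matplotlib.pyplot as plt

np.random.seed(0)
x_demo = np.random.uniform(-1, 1, 1000)
mysubplot(x_demo, x_demo + np.random.normal(0, 0.3, 1000), 1, 2, 1)
mysubplot(x_demo, x_demo ** 2 + np.random.normal(0, 0.1, 1000), 1, 2, 2)
plt.show()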
def select_feature(self, data, label, threshold=0.7):
    """
    Perform feature selection by the maximal information coefficient,
    which can capture both linear and non-linear relationships.
    """
    selected = []
    from minepy import MINE
    mine = MINE()
    for i, col in enumerate(data):
        print 'feature selection: %d/%d %s' % (i, data.shape[1], col)
        mine.compute_score(data[col], label)
        if mine.mic() > threshold:
            selected.append(col)
    print '%d out of %d features were selected' % (len(selected), data.shape[1])
    return selected
def get_corrcoef(X):
    div = ShuffleSplit(X.shape[0], n_iter=1, test_size=0.05, random_state=0)
    for train, test in div:
        X = X[np.array(test)]
        break
    X = X.transpose()
    pcc = np.ones((X.shape[0], X.shape[0]))
    m = MINE()
    # feat_groups = [[0], [1, 2, 3], [4, 5, 7, 8, 9, 10], [6],
    #                list(range(11, 24)), list(range(24, 29)), list(range(29, 34))]
    t = time()
    for i in range(0, 1):
        for j in range(1, 20):
            m.compute_score(X[i], X[j])
            pcc[i, j] = pcc[j, i] = m.mic()  # np.corrcoef(X[i], X[j])[0, 1]
            print(i, j, pcc[i, j], time()-t)
    np.savetxt(os.path.join(CODE_PATH, 'feat_sim_pcc_2.csv'), pcc,
               fmt='%.3f', delimiter=',')
    print('Done with computing PCC,', 'using', time()-t, 's')
def perform_mic_2p(p1_sequences, p2_sequences, p1, p2, cutoff=0.5):
    mic_scores = []
    p1_sequences_t = transpose(array([list(z) for z in p1_sequences])).tolist()
    p2_sequences_t = transpose(array([list(z) for z in p2_sequences])).tolist()
    for idx1, record1 in enumerate(p1_sequences_t):
        for idx2, record2 in enumerate(p2_sequences_t):
            mine = MINE(alpha=0.6, c=15)
            mine.compute_score(record1, record2)
            if (mine.mic() > float(cutoff)):
                mic_score = {}
                mic_score['x'] = p1+'_'+str(idx1+1)
                mic_score['y'] = p2+'_'+str(idx2+1)
                mic_score['p1'] = p1
                mic_score['p2'] = p2
                mic_score['weight'] = mine.mic()
                mic_scores.append(mic_score)
    #print('computed ', len(mic_scores), ' mics for ', p1, p2, 'for cutoff ', cutoff)
    return mic_scores
def perform_mic_2p(p1_sequences, p2_sequences, p1, p2, cutoff=0.5):
    mic_scores = []
    p1_sequences_t = transpose(array([list(z) for z in p1_sequences])).tolist()
    p2_sequences_t = transpose(array([list(z) for z in p2_sequences])).tolist()
    for idx1, record1 in enumerate(p1_sequences_t):
        for idx2, record2 in enumerate(p2_sequences_t):
            mine = MINE(alpha=0.6, c=15)
            mine.compute_score(record1, record2)
            if (mine.mic() > float(cutoff)):
                mic_score = {}
                mic_score['x'] = p1+'_'+str(idx1+1)
                mic_score['y'] = p2+'_'+str(idx2+1)
                mic_score['p1'] = p1
                mic_score['p2'] = p2
                mic_score['weight'] = format(mine.mic(), '.3f')
                mic_scores.append(mic_score)
    write_mics_to_csv(mics=mic_scores, p1=p1, p2=p2, cutoff=cutoff)
    return mic_scores
def mutual_information(self, X, Y, title=None, nbins_X=50, nbins_Y=50,
                       noise_sigma='all'):
    no_nans_idx = np.logical_not(np.logical_or(np.isnan(X), np.isnan(Y)))
    Xq, _, _ = pyentropy.quantise(X[no_nans_idx], nbins_X)
    Yq, _, _ = pyentropy.quantise(Y[no_nans_idx], nbins_Y)
    s = pyentropy.DiscreteSystem(Yq, (1, nbins_Y), Xq, (1, nbins_X))
    s.calculate_entropies()

    # MINE
    mine = MINE()
    mine.compute_score(X.flatten(), Y.flatten())

    # Linear regression
    slope, intercept, r, p, stderr = \
        scipy.stats.linregress(X[no_nans_idx], Y[no_nans_idx])

    if title is not None:
        print(title)
    print("  MIC/MI/r^2/p/slope for %s:\t%.3f\t%.3f\t%s\t%s\t%s" % (noise_sigma,
          mine.mic(), s.I(), r**2, p, slope))
def run_feature_selection():
    X, Y = get_dataset()
    X = np.array(X)
    Y = np.array(Y)
    # print len(X[0])

    # names = ["x%s" % i for i in range(1, 8)]
    names = [
        'Age', 'Sex', 'Sleep quality', 'Sleep latency', 'Sleep time',
        'Sleep efficiency', 'Sleep disorder', 'Hypnagogue', 'Daytime dyfunction'
    ]
    # names = ['Sex', 'Sleep quality', 'Sleep latency', 'Sleep time',
    #          'Sleep efficiency', 'Sleep disorder', 'Hypnagogue', 'Daytime dyfunction']

    ranks = {}  # the original referenced `ranks` without initializing it in this scope

    lr = LinearRegression(normalize=True)
    lr.fit(X, Y)
    ranks["Linear reg"] = rank_to_dict(np.abs(lr.coef_), names)

    ridge = Ridge(alpha=7)
    ridge.fit(X, Y)
    ranks["Ridge"] = rank_to_dict(np.abs(ridge.coef_), names)

    lasso = Lasso(alpha=.05)
    lasso.fit(X, Y)
    ranks["Lasso"] = rank_to_dict(np.abs(lasso.coef_), names)

    rlasso = RandomizedLasso(alpha=0.04)
    rlasso.fit(X, Y)
    ranks["Stability"] = rank_to_dict(np.abs(rlasso.scores_), names)

    # stop the search when 5 features are left (they will get equal scores)
    rfe = RFE(ridge, n_features_to_select=5)
    rfe.fit(X, Y)
    # list(...) so the map result survives NumPy conversion under Python 3
    ranks["RFE"] = rank_to_dict(list(map(float, rfe.ranking_)), names, order=-1)

    # rf = RandomForestRegressor()
    # rf.fit(X, Y)
    # ranks["RF"] = rank_to_dict(rf.feature_importances_, names)

    # f, pval = f_regression(X, Y, center=True)
    # ranks["Corr."] = rank_to_dict(f, names)

    mine = MINE()
    mic_scores = []
    for i in range(X.shape[1]):
        mine.compute_score(X[:, i], Y)
        mic_scores.append(mine.mic())
    ranks["MIC"] = rank_to_dict(mic_scores, names)

    r = {}
    for name in names:
        r[name] = round(np.mean([ranks[method][name]
                                 for method in ranks.keys()]), 2)

    methods = sorted(ranks.keys())
    ranks["Mean"] = r
    methods.append("Mean")

    print("\t%s" % "\t".join(methods))
    for name in names:
        print("%s\t%s" % (name, "\t".join(
            map(str, [ranks[method][name] for method in methods]))))
    return ranks
def MIC(data):
    # alpha limits the grid resolution: m*n < B, with B = n^alpha
    mine = MINE(alpha=0.6, c=15)
    data_mic = MIC_matirx(data, mine)
    print(data_mic)
    return data_mic
def mic(x, y):
    m = MINE()
    print x
    print y
    m.compute_score(x, y)
    # 0.5 is a dummy p-value so the return value matches the (score, p-value)
    # pairs that sklearn's univariate selectors expect
    return (m.mic(), 0.5)
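# A minimal sketch of how a (score, p-value) pair like the one returned above plugs
# into scikit-learn's SelectKBest, whose score_func may return an array of scores plus
# an array of p-values. The iris dataset is used purely as a stand-in.
import numpy as np
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest

def mic_score_func(X, y):
    # apply mic() column by column, then unzip into score and p-value arrays
    scores, pvalues = zip(*(mic(X[:, i], y) for i in range(X.shape[1])))
    return np.array(scores), np.array(pvalues)

iris = load_iris()
X_new = SelectKBest(mic_score_func, k=2).fit_transform(iris.data, iris.target)
print(X_new.shape)  # (150, 2)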
def interactionV(self, data):
    from minepy import MINE
    m = MINE()
    # the original referenced an undefined `x`; `data` is assumed to be the
    # intended 1-D array, scored against its own square
    m.compute_score(data, data ** 2)
    print(m.mic())
from minepy import MINE
import numpy as np

m = MINE()
x = np.random.uniform(-1, 1, 10000)
m.compute_score(x, x ** 2)
print(m.mic())  # 1.0000000000000009

from scipy.stats import pearsonr

np.random.seed(0)
size = 300
x = np.random.normal(0, 1, size)
print("Lower noise", pearsonr(x, x + np.random.normal(0, 1, size)))
print("Higher noise", pearsonr(x, x + np.random.normal(0, 10, size)))
'''
Lower noise (0.7182483686213841, 7.32401731299835e-49)
Higher noise (0.057964292079338155, 0.3170099388532475)
'''
def get_mic(self):
    m = MINE()
    m.compute_score(self.x, self.y)
    return m.mic()
def micCompute(x, y):
    m = MINE()
    m.compute_score(x, y)
    return m.mic()
def calculate_mic(self, x, y):
    mine = MINE()
    mine.compute_score(x, y)
    # normalize the MIC by the cardinality of x so that high-cardinality
    # features are not automatically favoured
    score = mine.mic() / len(np.unique(x))
    return score
#!/usr/bin/env python
from datetime import datetime
from datetime import timedelta

import numpy as np
from minepy import MINE

from base import N_SERIES_DAYS, DECAY_RATE, TIME_FORMAT
from database import connect
from extract import getBorder, get_n_days, get_n_artists, get_n_series, getSeries, getSeriesRange

_mine = MINE()


def genFeatureDefination(name):
    db = connect()
    cursor = db.cursor()
    sql = 'alter table mars_tianchi_features drop column %s' % name
    try:
        cursor.execute(sql)
    except Exception, e:
        print 'ignore drop column error !!!'
    sql = 'alter table mars_tianchi_features add column (%s float)' % name
    cursor.execute(sql)


beginXTrain = getBorder(isBegin=True, isX=True, isTrain=True)
endXTrain = getBorder(isBegin=False, isX=True, isTrain=True)
beginXTest = getBorder(isBegin=True, isX=True, isTrain=False)
endXTest = getBorder(isBegin=False, isX=True, isTrain=False)
beginYTrain = getBorder(isBegin=True, isX=False, isTrain=True)
endYTrain = getBorder(isBegin=False, isX=False, isTrain=True)
n_X_days = get_n_days(isX=True, isTrain=True)
# The first line is a description and should be deleted.
data = np.delete(data, 0, axis=0)
# total_heat = data[:, 908]
total_elec = data[:, 907]
# heat_areas = data[:, 829]

# consumption per area as output Y
# Y = np.divide(total_heat, heat_areas)
Y = total_elec
# print min(Y)
# print min(heat_areas)
# print data[0, 908], data[2, 829]

index = []
fo = open('MIC_results.txt', 'w')
for i in range(1, len(data[0])):
    X = data[:, i]
    mine = MINE(alpha=0.6, c=15)
    mine.compute_score(X, Y)
    # index.append(mine.mic())
    fo.write(str(mine.mic()) + '\n')  # the original wrote the literal '/n' instead of a newline
    print (i+1, mine.mic())
fo.close()
# -*- coding: utf-8 -*-
"""
Created on Thu Apr  4 18:17:29 2019

@author: Chiaki
"""
import sys
sys.path.append("D:\Python\current")
import numpy as np
import pandas as pd
import copy as npcopy
from minepy import MINE
mine = MINE(alpha=0.6, c=15, est='mic_approx')
import time


class MyPSO(object):
    """
    pop_size: swarm size
    factor_size: dimensionality of a solution
    wmax, wmin: range of the inertia weight w
    c1, c2: learning factors
    iter: maximum number of iterations
    """
    def __init__(self, pop_size, factor_size, wmax, wmin, c1, c2, iter, data):
        self.pop_size = pop_size
        self.factor_size = factor_size
        self.wmax = wmax
        self.wmin = wmin
        self.c1 = c1
# Financial indicators and time series
# ==============================================================
import talib
import arch
import statsmodels
import patsy

# Machine learning and deep learning
# ==============================================================
# pip install minepy
from minepy import MINE  # correlation measure
import numpy as np
import numexpr

m = MINE()
x = np.random.uniform(-1, 1, 10000)
m.compute_score(x, x**2)
#print m.mic()

import pybrain
import sklearn
# pip install heamy
import heamy  # model ensembling (used together with sklearn)
# pip install lightgbm
import lightgbm
import xgboost
#stop the search when 5 features are left (they will get equal scores)
rfe = RFE(lr, n_features_to_select=5)
rfe.fit(X, Y)
ranks["RFE"] = rank_to_dict(map(float, rfe.ranking_), names, order=-1)

#RandomForestRegressor
rf = RandomForestRegressor()
rf.fit(X, Y)
ranks["RF"] = rank_to_dict(rf.feature_importances_, names)

#f_regression
f, pval = f_regression(X, Y, center=True)
ranks["Corr."] = rank_to_dict(f, names)

#MINE
mine = MINE()
mic_scores = []
for i in range(X.shape[1]):
    mine.compute_score(X[:, i], Y)
    m = mine.mic()
    mic_scores.append(m)
ranks["MIC"] = rank_to_dict(mic_scores, names)

#----statistics--out---------
r = {}
for name in names:
    r[name] = round(np.mean([ranks[method][name]
                             for method in ranks.keys()]), 2)

methods = sorted(ranks.keys())
xgb_7844 = pd.read_csv('xgb_7844.csv')
svm_771 = pd.read_csv('svm_771.csv')
xgb_787 = pd.read_csv('xgb_787.csv')

fs = ['xgb_7844', 'svm_771', 'xgb_787']
res = []
res.append(pd.read_csv('xgb_7844.csv').score.values)
res.append(pd.read_csv('svm_771.csv').score.values)
res.append(pd.read_csv('xgb_787.csv').score.values)

cm = []
for i in range(3):
    tmp = []
    for j in range(3):
        m = MINE()
        m.compute_score(res[i], res[j])
        tmp.append(m.mic())
    cm.append(tmp)

import numpy as np
import matplotlib.pyplot as plt


def plot_confusion_matrix(cm, title, cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(3)
    plt.xticks(tick_marks, fs, rotation=45)
    plt.yticks(tick_marks, fs)
def train_and_analyse(_X, _y, features):
    X = _X
    Y = _y
    cv_l = cross_validation.KFold(X.shape[0], n_folds=10,
                                  shuffle=True, random_state=1)
    ranks = {}

    lr = LinearRegression(normalize=True)
    lr.fit(X, Y)
    ranks["Linear reg"] = rank_to_dict(np.abs(lr.coef_), features)

    ridge = RidgeCV(cv=cv_l)
    ridge.fit(X, Y)
    ranks["Ridge"] = rank_to_dict(np.abs(ridge.coef_), features)

    # Run the RandomizedLasso: we use a path going down to .1*alpha_max
    # to avoid exploring the regime in which very noisy variables enter
    # the model
    lasso = LassoCV(cv=cv_l, n_jobs=2, normalize=True, tol=0.0001,
                    max_iter=170000)
    lasso.fit(X, Y)
    ranks["Lasso"] = rank_to_dict(np.abs(lasso.coef_), features)

    rlasso = RandomizedLasso(alpha=lasso.alpha_, random_state=42)
    rlasso.fit(X, Y)
    ranks["Stability"] = rank_to_dict(np.abs(rlasso.scores_), features)

    rfe = RFE(lr, n_features_to_select=1)
    rfe.fit(X, Y)
    ranks["RFE"] = rank_to_dict(np.array(rfe.ranking_).astype(float),
                                features, order=-1)

    rf = RandomForestRegressor(n_estimators=500)
    rf.fit(X, Y)
    ranks["RF"] = rank_to_dict(rf.feature_importances_, features)

    f, pval = f_regression(X, Y, center=True)
    ranks["Corr."] = rank_to_dict(np.nan_to_num(f), features)

    mine = MINE()
    mic_scores = []
    for i in range(X.shape[1]):
        mine.compute_score(X[:, i], Y)
        mic_scores.append(mine.mic())
    ranks["MIC"] = rank_to_dict(mic_scores, features)

    r = {}
    for name in features:
        r[name] = round(np.mean([ranks[method][name]
                                 for method in ranks.keys()]), 2)

    methods = sorted(ranks.keys())
    ranks["Mean"] = r
    methods.append("Mean")

    ranks = pd.DataFrame(ranks)
    selection_feature = ranks[ranks.Mean > 0.12].index.values
    return ranks, selection_feature
def MIC(a, b):
    # return the MIC between a and b
    mine = MINE()
    mine.compute_score(a, b)
    MIC = mine.mic()
    # print('MIC=', MIC)
    return MIC
def train_and_analyse(_X, _y, sno, ino):
    X = _X.copy()
    Y = _y
    features = X.columns.values
    cv_l = cross_validation.KFold(X.shape[0], n_folds=5,
                                  shuffle=True, random_state=1)
    ranks_linear = {}
    ranks_nonlinear = {}
    ranks_path = {}
    ranks = {}
    selection_feature = []

    time_feature_1 = ['date2j']
    time_feature_2 = ['day', 'month', 'year']
    time_feature_3 = ['is_2012', 'is_2013', 'is_2014',
                      'fall', 'winter', 'spring', 'summer']
    time_feature_4 = ['weekday', 'is_weekend', 'is_holiday',
                      'is_holiday_weekday', 'is_holiday_weekend']
    time_feature_5 = [
        'MemorialDay', 'MothersDay', 'BlackFridayM3', 'BlackFriday1',
        'NewYearsDay', 'IndependenceDay', 'VeteransDay', 'BlackFriday2',
        'NewYearsEve', 'BlackFriday3', 'ChristmasDay', 'BlackFridayM2',
        'ThanksgivingDay', 'Halloween', 'EasterSunday', 'ChristmasEve',
        'ValentinesDay', 'PresidentsDay', 'ColumbusDay',
        'MartinLutherKingDay', 'LaborDay', 'FathersDay', 'BlackFriday'
    ]
    weather_feature = [
        'high_precip', 'preciptotal', 'snowfall', 'high_snow', 'avgspeed',
        'windy', 'temp_missing', 'tavg', 'hot', 'cold', 'frigid',
        'thunder', 'snowcode', 'raincode'
    ]

    temp = time_feature_1 + time_feature_2 + time_feature_3 + time_feature_4 + time_feature_5
    X_f1 = X[temp].values

    # lr = LinearRegression(normalize=True)
    # lr.fit(X, Y)
    # ranks["Linear reg"] = rank_to_dict(np.abs(lr.coef_), features)

    f, pval = f_regression(ut.get_processed_X_A(X_f1), Y, center=True)
    ranks["F_regr"] = pd.Series(rank_to_dict(np.nan_to_num(f), temp))

    # mi = mutual_info_regression(ut.get_processed_X_A(X_f1), Y)
    # mi /= np.max(mi)

    mine = MINE()
    mic_scores = []
    for i in range(ut.get_processed_X_A(X_f1).shape[1]):
        mine.compute_score(ut.get_processed_X_A(X_f1)[:, i], Y)
        mic_scores.append(mine.mic())
    ranks["MIC"] = pd.Series(rank_to_dict(mic_scores, temp))

    # Run the RandomizedLasso: we use a path going down to .1*alpha_max
    # to avoid exploring the regime in which very noisy variables enter
    # the model
    # rlasso = RandomizedLasso(alpha='bic', normalize=True)
    # rlasso.fit(X_f1, Y)
    # ranks_linear["Stability"] = pd.Series(rlasso.scores_)
    # alpha_grid, scores_path = lasso_stability_path(X_f1, Y, random_state=42,
    #                                                eps=0.00005, n_grid=500)
    # for alpha, score in zip(alpha_grid, scores_path.T):
    #     ranks_path[alpha] = score
    # ranks_path = pd.DataFrame(ranks_path).transpose()
    # ranks_path.columns = temp

    # rf = RandomForestRegressor(n_estimators=150, max_depth=4, n_jobs=4, random_state=1)
    rf = ut.get_regression_model('RandomForest', 0)
    scores = []
    for i in range(X_f1.shape[1]):
        score = cross_val_score(rf, X_f1[:, i:i+1].astype(float), Y,
                                scoring="r2",
                                cv=ShuffleSplit(len(X_f1), 3, .3), n_jobs=2)
        scores.append(round(np.mean(score), 3))
    ranks['RF'] = pd.Series(rank_to_dict(np.abs(scores), temp))

    ranks = pd.DataFrame(ranks)
    print(ranks)
    selection_feature.extend(ranks[ranks.RF > 0.1].index.values.tolist())
    selection_feature.extend(ranks[ranks.MIC >= 0.1].index.values.tolist())
    selection_feature.extend(ranks[ranks.F_regr >= 0.1].index.values.tolist())

    selection_feature = list(set(selection_feature))
    print(selection_feature)

    # ridge = RidgeCV(cv=cv_l)
    # rfe = RFE(ridge, n_features_to_select=1)
    # rfe.fit(X[selection_feature], Y)
    # ranks["RFE"] = pd.Series(rank_to_dict(
    #     np.array(rfe.ranking_).astype(float), selection_feature, order=1))
    # r = {}
    # for name in features:
    #     r[name] = round(np.mean([ranks[method][name]
    #                              for method in ranks.keys()]), 2)
    # methods = sorted(ranks.keys())
    # ranks["Mean"] = r
    # methods.append("Mean")

    path = 'Analyse/store_{}/'.format(sno)
    mkdir_p(path)
    path += 'item_{}_(pair_analyse)'.format(ino)
    ranks.to_pickle(path)
    path += '.png'
    p.clf()
    p.cla()
    plt.figure(figsize=(16, 26))
    ranks.plot.barh(stacked=True)
    p.savefig(path, bbox_inches='tight', dpi=300)
    plt.close()
    return ranks, selection_feature
# and all POS samples are placed together before the NEG ones
X = np.array(df.values).T
# y, len(classes) bits,
# with 1 representing 'POS' and 0 representing 'NEG'
y = np.array([1] * pos_num + [0] * neg_num)

# data pre-processing
X = preprocessing.normalize(X)

# check availability of the output path
if not os.path.exists(output_path):
    os.mkdir(output_path)

# MIC calculation
if ft_num_limit > 500:
    # cut down the feature number to below 500 by MIC calculation
    mine = MINE()
    mic_scores = []
    for i in range(ft_num_limit):
        mine.compute_score(X[:, i], y)
        mic_scores.append(mine.mic())
    top_fts_mic = sorted(list(zip(range(ft_num_limit), mic_scores)),
                         key=operator.itemgetter(1), reverse=True)
    top_mic_pos = [x[0] for x in top_fts_mic[0:initial_ft_num]]
else:
    top_mic_pos = list(range(initial_ft_num))

# preprocessing end, record the time cost
preprocess_time = time.time()
# =========================================
import pandas as pd
import numpy as np
from minepy import MINE

df1 = pd.read_csv("/home/kei/document/experiments/ICTH2019/SY/UniSY2.csv")
df2 = pd.read_csv("/home/kei/document/experiments/ICTH2019/SK/UniSK1.csv")

#columns = ["LSholderX","LSholderY","LSholderZ"]
columns = ["LSholderX"]
# .ravel() gives the flat 1-D arrays MINE expects; the original's .tolist()
# produced nested (n, 1) lists with no .shape attribute
X = df1[columns].values.ravel()
Y = df2[columns].values.ravel()
mine = MINE()
print(X.shape)
mine.compute_score(X, Y)
print(mine.mic())
def demo8():
    np.random.seed(0)
    size = 750
    X = np.random.uniform(0, 1, (size, 14))
    Y = (10 * np.sin(np.pi * X[:, 0] * X[:, 1]) + 20 * (X[:, 2] - .5) ** 2 +
         10 * X[:, 3] + 5 * X[:, 4] + np.random.normal(0, 1))
    X[:, 10:] = X[:, :4] + np.random.normal(0, .025, (size, 4))
    names = ["x%s" % i for i in range(1, 15)]
    ranks = {}

    def rank_to_dict(ranks, names, order=1):
        minmax = MinMaxScaler()
        ranks = minmax.fit_transform(order * np.array([ranks]).T).T[0]
        ranks = map(lambda x: round(x, 2), ranks)
        return dict(zip(names, ranks))

    lr = LinearRegression(normalize=True)
    lr.fit(X, Y)
    ranks["Linear reg"] = rank_to_dict(np.abs(lr.coef_), names)

    ridge = Ridge(alpha=7)
    ridge.fit(X, Y)
    ranks["Ridge"] = rank_to_dict(np.abs(ridge.coef_), names)

    lasso = Lasso(alpha=.05)
    lasso.fit(X, Y)
    ranks["Lasso"] = rank_to_dict(np.abs(lasso.coef_), names)

    # rlasso = RandomizedLasso(alpha=0.04)
    # rlasso.fit(X, Y)
    # ranks["Stability"] = rank_to_dict(np.abs(rlasso.scores_), names)

    # stop the search when 5 features are left (they will get equal scores)
    rfe = RFE(lr, n_features_to_select=5)
    rfe.fit(X, Y)
    # list(...) so the map result survives NumPy conversion under Python 3
    ranks["RFE"] = rank_to_dict(list(map(float, rfe.ranking_)), names, order=-1)

    rf = RandomForestRegressor()
    rf.fit(X, Y)
    ranks["RF"] = rank_to_dict(rf.feature_importances_, names)

    f, pval = f_regression(X, Y, center=True)
    ranks["Corr."] = rank_to_dict(f, names)

    mine = MINE()
    mic_scores = []
    for i in range(X.shape[1]):
        mine.compute_score(X[:, i], Y)
        mic_scores.append(mine.mic())
    ranks["MIC"] = rank_to_dict(mic_scores, names)

    r = {}
    for name in names:
        r[name] = round(np.mean([ranks[method][name]
                                 for method in ranks.keys()]), 2)

    methods = sorted(ranks.keys())
    ranks["Mean"] = r
    methods.append("Mean")

    print("\t%s" % "\t".join(methods))
    for name in names:
        print("%s\t%s" % (name, "\t".join(
            map(str, [ranks[method][name] for method in methods]))))
def utilize_selection_method(self, options):
    self.parse_options(options)
    normalize_feature = self.normalize_feature(self.data_feature)
    feature_amount = len(self.data_feature[0])
    selection_result = {}
    logging.info(' Supervised Feature Selection : Start')

    if self.options['p'] == 1:
        widget = ['Calculating Pearson Correlation : ', pb.Percentage(), ' ',
                  pb.Bar(marker=pb.RotatingMarker()), ' ', pb.ETA()]
        timer = pb.ProgressBar(widgets=widget, maxval=feature_amount).start()
        pearson_corr = []
        for n in range(0, feature_amount):
            tmp_pearson = pearsonr(normalize_feature[:, n], self.data_label)
            pearson_corr.append([abs(tmp_pearson[0]), n+1])
            timer.update(n)
        timer.finish()
        selection_result['pearson-correlation'] = sorted(pearson_corr, reverse=True)

    if self.options['r'] == 1:
        widget = ['Calculating Random Forest : ', pb.Percentage(), ' ',
                  pb.Bar(marker=pb.RotatingMarker()), ' ', pb.ETA()]
        timer = pb.ProgressBar(widgets=widget, maxval=feature_amount).start()
        rf = RandomForestRegressor(n_estimators=20, max_depth=4)
        #rf.fit(normalize_feature, self.data_label)
        #rf_feature_score = rf.feature_importances_
        random_forest = []
        for n in range(0, feature_amount):
            score = cross_val_score(rf, normalize_feature[:, n:n+1],
                                    self.data_label, scoring="r2",
                                    cv=ShuffleSplit(len(normalize_feature), 3, .3))
            random_forest.append([round(np.mean(score), 3), n+1])
            #random_forest.append([rf_feature_score[n], n+1])
            timer.update(n)
        timer.finish()
        selection_result['random-forest'] = sorted(random_forest, reverse=True)

    if self.options['m'] == 1:
        widget = ['Calculating Mutual Information : ', pb.Percentage(), ' ',
                  pb.Bar(marker=pb.RotatingMarker()), ' ', pb.ETA()]
        timer = pb.ProgressBar(widgets=widget, maxval=feature_amount).start()
        mutual_information = []
        mine = MINE()
        for n in range(0, feature_amount):
            mine.compute_score(normalize_feature[:, n], self.data_label)
            mutual_information.append([mine.mic(), n+1])
            timer.update(n)
        timer.finish()
        selection_result['mutual-information'] = sorted(mutual_information, reverse=True)

    if self.options['c'] == 1:
        widget = ['Calculating Chi Squire : ', pb.Percentage(), ' ',
                  pb.Bar(marker=pb.RotatingMarker()), ' ', pb.ETA()]
        timer = pb.ProgressBar(widgets=widget, maxval=feature_amount).start()
        chi_squire = []
        compute_chi2 = chi2(normalize_feature, self.data_label)[0]
        for n in range(0, feature_amount):
            chi_squire.append([compute_chi2[n], n+1])
            timer.update(n)
        timer.finish()
        selection_result['chi-squire'] = sorted(chi_squire, reverse=False)

    if self.options['k'] == 1:
        widget = ['Calculating Kendall Correlation : ', pb.Percentage(), ' ',
                  pb.Bar(marker=pb.RotatingMarker()), ' ', pb.ETA()]
        timer = pb.ProgressBar(widgets=widget, maxval=feature_amount).start()
        kendall_correlation = []
        for n in range(0, feature_amount):
            tmp_kendall = kendalltau(normalize_feature[:, n], self.data_label)
            kendall_correlation.append([tmp_kendall[0], n+1])
            timer.update(n)
        timer.finish()
        selection_result['kendall-correlation'] = sorted(kendall_correlation, reverse=True)

    if self.options['s'] == 1:
        widget = ['Calculating Spearman Correlation : ', pb.Percentage(), ' ',
                  pb.Bar(marker=pb.RotatingMarker()), ' ', pb.ETA()]
        timer = pb.ProgressBar(widgets=widget, maxval=feature_amount).start()
        spearman_corr = []
        for n in range(0, feature_amount):
            tmp_spearman = spearmanr(normalize_feature[:, n], self.data_label)
            spearman_corr.append([abs(tmp_spearman[0]), n+1])
            timer.update(n)
        timer.finish()
        selection_result['spearman-correlation'] = sorted(spearman_corr, reverse=True)

    if self.options['f'] == 1:
        logging.info(' -----Calculating Fisher score----')
        f_score = fisher_score.fisher_score(normalize_feature, self.data_label)
        fisher = []
        for n in range(0, feature_amount):
            fisher.append([f_score[n], n+1])
        selection_result['fisher-score'] = sorted(fisher, reverse=True)
        logging.info(' -----Calculating Fisher score---- ==> Done')

    return selection_result
def maximal_information_coeffcient():
    m = MINE()
    x = np.random.uniform(-1, 1, 1000)
    m.compute_score(x, x**2 + 2)
    print m.mic()
rlasso.fit(X, Y)
ranks["Stability"] = rank_to_dict(np.abs(rlasso.scores_), names)

#stop the search when 5 features are left (they will get equal scores)
rfe = RFE(lr, n_features_to_select=5)
rfe.fit(X, Y)
ranks["RFE"] = rank_to_dict(map(float, rfe.ranking_), names, order=-1)

rf = RandomForestRegressor()
rf.fit(X, Y)
ranks["RF"] = rank_to_dict(rf.feature_importances_, names)

f, pval = f_regression(X, Y, center=True)
ranks["Corr."] = rank_to_dict(f, names)

mine = MINE()
mic_scores = []
for i in range(X.shape[1]):
    mine.compute_score(X[:, i], Y)
    m = mine.mic()
    mic_scores.append(m)
ranks["MIC"] = rank_to_dict(mic_scores, names)

r = {}
for name in names:
    r[name] = round(np.mean([ranks[method][name]
                             for method in ranks.keys()]), 2)

methods = sorted(ranks.keys())
    '-s1', '--serie1', nargs='+',
    help='<Required> first serie of numbers (usage: -s1 1 43 25 0)',
    required=True)
parser.add_argument(
    '-s2', '--serie2', nargs='+',
    help='<Required> second serie of numbers (usage: -s2 1 43 25 0)',
    required=True)
parser.add_argument(
    "-a", "--alpha",
    help="float (0,1.0] the exponent in B(n) n^alpha (default 0.6)",
    default="0.6")
parser.add_argument(
    "-c", "--clumps",
    help="float (> 0) determines how many more clumps there will be than "
         "columns in every partition. Default value is 15, meaning that when "
         "trying to draw x grid lines on the x-axis, the algorithm will start "
         "with at most 15*x clumps (default 15)",
    default="15")

args = parser.parse_args()
# argparse yields lists of strings; MINE needs numeric arrays
x = [float(v) for v in args.serie1]
y = [float(v) for v in args.serie2]
mine = MINE(alpha=float(args.alpha), c=float(args.clumps))
mine.compute_score(x, y)
print_stats(mine)
def doMICAnalysisOfInputVariables(inArr, targetArr, targetName, mic_score_threshold,
                                  input_indexes_uncorrelated_features, targetQualityMap=None):
    goodTargetMap = getGlobalObject("goodTargetMap")
    selected_inArr = []
    selected_inArr_indexes = []
    selected_originalColumn_indexes = []
    inColMap = getGlobalObject("inputColumnIndexToNameMapFromFile") #keys are col indexes, vals are names

    numOfFeatures = 0
    try:
        numOfFeatures = inArr.shape[1]
    except:
        print "ERROR: \n", inArr
        exit(0)

    k = 0
    for featureIndex in range(numOfFeatures):
        # we will choose only uncorrelated features as input
        if(featureIndex not in input_indexes_uncorrelated_features):
            continue
        x = inArr[:, featureIndex]
        x_scaled = preprocessing.scale(x)
        mine = MINE(alpha=0.6, c=15)
        mine.compute_score(x_scaled, targetArr)
        inputFeatureName = getInputParameterNameFromFeatureIndex(featureIndex)
        print_stats(mine, inputFeatureName, targetName, mic_score_threshold)
        if(targetQualityMap != None):
            targetQualityMap.append(float(mine.mic()))
        if(float(mine.mic()) >= mic_score_threshold):
            selected_inArr.append(x) #keep the input data column
            selected_inArr_indexes.append(k) #keep the index corresponding to that column
            colIdx = getColumnIndexFromFeatureIndex(featureIndex)
            selected_originalColumn_indexes.append(colIdx) #keep the original column index for that column
            #now add the target itself to goodTargetMap; for anomaly detection we will only use these targets
            goodTargetMap[targetName] = True
            print "----------------- selected: ", inputFeatureName, colIdx, k
        k = k + 1

    selected_inArr = np.array(selected_inArr).transpose()
    return selected_inArr, selected_inArr_indexes, selected_originalColumn_indexes
def score_calculate(flag):
    # rows: feature-selection algorithms; columns: feature names
    algorithm = {}
    if flag == 'whole':
        tmp_sta, tmp_rf, tmp_gbdt, tmp_extra = {}, {}, {}, {}
        for n in range(10):
            #stability
            rlasso = RandomizedLasso(random_state=n)
            rlasso.fit(data, mark)
            tmp_sta = add(tmp_sta, rank_to_dict(np.abs(rlasso.scores_), names, cv=True))
            #rf
            rf = RandomForestClassifier(random_state=n)
            rf.fit(data, mark)
            tmp_rf = add(tmp_rf, rank_to_dict(rf.feature_importances_, names, cv=True))
            #GBDT
            gbdt = GradientBoostingClassifier(random_state=n)
            gbdt.fit(data, mark)
            tmp_gbdt = add(tmp_gbdt, rank_to_dict(gbdt.feature_importances_, names, cv=True))
            #Extra
            model = ExtraTreesClassifier(random_state=n)
            model.fit(data, mark)
            tmp_extra = add(tmp_extra, rank_to_dict(model.feature_importances_, names, cv=True))
        algorithm["stability"], algorithm["RF"], algorithm["GBDT"], algorithm["Extra"] \
            = tmp_sta, tmp_rf, tmp_gbdt, tmp_extra

        #MIC
        mine = MINE()
        mic_scores = []
        res = []
        for i in range(len(data[0])):
            for num in data:
                res.append(num[i])
            mine.compute_score(res, mark)
            m = mine.mic()
            mic_scores.append(m)
            res = []
        algorithm["MIC"] = rank_to_dict(mic_scores, names)

        #Linear regression
        lr = LinearRegression(normalize=True)
        lr.fit(data, mark)
        algorithm["Linear"] = rank_to_dict(np.abs(lr.coef_), names)

        #ridge
        ridgecv = RidgeCV()
        ridgecv.fit(data, mark)
        #print(ridgecv.alpha_)
        ridge = Ridge(alpha=ridgecv.alpha_)
        ridge.fit(data, mark)
        algorithm["Ridge"] = rank_to_dict(np.abs(ridge.coef_), names)

        #lasso
        lassocv = LassoCV()
        lassocv.fit(data, mark)
        #print(lassocv.alpha_)
        lasso = Lasso(alpha=lassocv.alpha_)
        lasso.fit(data, mark)
        algorithm["Lasso"] = rank_to_dict(np.abs(lasso.coef_), names)

        #rfe
        log = LogisticRegression()
        rfe = RFE(log, n_features_to_select=10)
        rfe.fit(data, mark)
        algorithm["RFE"] = rank_to_dict(list(map(float, rfe.ranking_)), names, order=-1)

        '''
        #F-test
        f, pval = f_classif(data, mark)
        algorithm["Corr"] = rank_to_dict(f, names)
        '''
    elif flag == 'extra':
        model = ExtraTreesClassifier()
        model.fit(data, mark)
        algorithm["Extra"] = rank_to_dict(model.feature_importances_, names)
    elif flag == 'gbdt':
        gbdt = GradientBoostingClassifier()
        gbdt.fit(data, mark)
        algorithm["GBDT"] = rank_to_dict(gbdt.feature_importances_, names)
    elif flag == 'rf':
        rf = RandomForestClassifier()
        rf.fit(data, mark)
        algorithm["RF"] = rank_to_dict(rf.feature_importances_, names)

    r = {}
    for name in names:
        r[name] = round(np.mean([algorithm[method][name]
                                 for method in algorithm.keys()]), 4)
    methods = sorted(algorithm.keys())
    algorithm["Mean"] = r
    methods.append("Mean")

    content = []
    for name in names:
        content.append([algorithm[method][name] for method in methods])
    fea_matrix = pd.DataFrame(content, index=names)
    fea_matrix.to_csv('/Users/hhy/Desktop/fea_importance_'+flag+'.csv',
                      encoding='utf-8-sig', header=methods)
    return algorithm
def selectInputFeatures(configs, inputGenerator, igparams, tunedFeatures,
                        error_function, runner, num_runs=5):
    permutation = [1, 2, 3, 4, 5]
    print('Starting the input feature selection process')

    # Build a set of configs that only change input parameters.
    # For the remaining parameters choose at random.
    newConfigs = {}
    for key in configs.keys():
        if key in tunedFeatures:
            newConfigs[key] = configs[key]
        else:
            newConfigs[key] = [random.choice(configs[key])]
    configList = extractAllConfigs(newConfigs)

    tot_runs = num_runs * len(permutation) + num_runs * len(configList)
    print('Requires {} executions'.format(tot_runs))

    # Does permuting the data affect accuracy?
    result_set = []
    perm_feat = False
    tmp_config = random.choice(configList)._asdict()
    for perm in permutation:
        configIGParams = igparams(tmp_config, 0)
        inputData = inputGenerator(*configIGParams)
        new_inputs = inputData.copy()
        random.shuffle(new_inputs)
        writeDataToFile(new_inputs, "_axprof_temp_input")
        # Averaging over a set of runs.
        error_tot = 0
        for run in range(num_runs):
            results = runner("_axprof_temp_input", tmp_config)
            error_tot += error_function(new_inputs, results['acc'])
            sys.stdout.write('.')
            sys.stdout.flush()
        result_set.append(error_tot / num_runs)

    mine = MINE()
    mine.compute_score(permutation, result_set)
    perm_mic = mine.mic()
    if perm_mic > 0.9:
        perm_feat = True

    # Testing the other features
    result_set = {}
    for config in configList:
        # Setting the number to a low value for now
        for input_num in range(5):
            inpAggregate = None
            configIGParams = igparams(config._asdict(), input_num)
            inputData = inputGenerator(*configIGParams)
            # the original reused the stale `new_inputs`/`tmp_config` from the
            # permutation test here; the freshly generated input and the current
            # config appear to be what was intended
            writeDataToFile(inputData, "_axprof_temp_input")
            error_tot = 0
            for run in range(num_runs):
                sys.stdout.write('.')
                sys.stdout.flush()
                results = runner("_axprof_temp_input", config._asdict())
                error_tot += error_function(inputData, results['acc'])
            result_set[config] = error_tot / num_runs
    sys.stdout.write('\n')
    sys.stdout.flush()

    mics = {}
    for key in tunedFeatures:
        agg_y = {}
        for config in result_set:
            config_dict = config._asdict()
            if config_dict[key] in agg_y:
                agg_y[config_dict[key]].append(result_set[config])
            else:
                agg_y[config_dict[key]] = [result_set[config]]
        unique_x = list(agg_y.keys())
        y = []
        for x in unique_x:
            y.append(np.mean(agg_y[x]))
        mine = MINE()
        mine.compute_score(unique_x, y)
        mics[key] = mine.mic()

    # Removing the variations in features that are not important
    for key in tunedFeatures:
        if mics[key] < 0.9:
            current = configs[key]
            configs[key] = [random.choice(current)]

    # Printing the report
    print('----------------------------------------')
    print('The results of input feature selection: ')
    print("Permuting the input: (MIC: {})".format(perm_mic))
    for key in tunedFeatures:
        print("{}: (MIC: {})".format(key, mics[key]))
    print("Updated config list: ", configs)
    print('----------------------------------------')
    return configs
# feature reduction
# when I dummied the dataframe it exploded into over 16,000 features; in a data set this small there was no value
# I thought about the simplest way to reduce the features and keep what I needed, and below is what I came up with
dummyfraud_df.drop(list(dummyfraud_df.filter(regex='nameOrig')), axis=1, inplace=True)
dummyfraud_df.drop(list(dummyfraud_df.filter(regex='nameDest')), axis=1, inplace=True)

# feature selection for building the model
print("######################## Below is my Pearson Cor ########################")
print(dummyfraud_df.corr(method='pearson')["isFraud"].sort_values())

# As I raise the sample size, what is becoming clear is that the 'amount' feature has a high
# positive cor to the isFraud feature. The challenge is that the other features do not seem to be
# highly cor. I want to try to use the MIC process like I used on the hotel process.
print("######################## Below is my MIC Analysis ########################")
mine = MINE(alpha=0.6, c=15)
fraud_columns = list(dummyfraud_df)
print(fraud_columns)
for features in fraud_columns:
    mine.compute_score(dummyfraud_df[features], dummyfraud_df["isFraud"])
    print("The MIC for feature " + str(features) + " is " + str(mine.mic()))

# Step 21. To split the dataset into features and target variables, first create a variable for the feature columns
feature_cols = ['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest',
                'newbalanceDest', 'type_DEBIT', 'type_PAYMENT', 'type_TRANSFER']
# Set X equal to the feature columns
X = dummyfraud_df[feature_cols]
# Set y equal to the target variable
y = dummyfraud_df.isFraud
# Using the train_test_split() function, split the data into test and train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
# number of samples in each set
dfi = readWSDIndexFile(baseDir, stockCodes[i], startYear, number)
dfi['R'] = R
print np.shape(df), np.shape(dfi)

allDF = pd.concat([df, dfi], axis=1)

scaler = preprocessing.MinMaxScaler()
X_Standard = scaler.fit_transform(df)
X_Standard_T = np.transpose(X_Standard)
Xi_Standard = scaler.fit_transform(dfi)
Xi_Standard_T = np.transpose(Xi_Standard)
X_ALL_Standard = scaler.fit_transform(allDF)
X_ALL_Standard_T = np.transpose(X_ALL_Standard)
print np.shape(X_ALL_Standard_T)

mine = MINE(alpha=0.6, c=15, est="mic_approx")
mics = []

# mine.compute_score(df['Close'].values, df['R'].values); print mine.mic()
# # for i in range(0,10):
# #     mine.compute_score(X_Standard_T[i], X_Standard_T[10])
# #     mics.append(mine.mic())
# #     print i, mine.mic()
# for i in [7,9]:
#     mine.compute_score(X_Standard_T[i], X_Standard_T[10])
#     mics.append(mine.mic())
#     print i, mine.mic()
# # for i in range(0,38):
# #     mine.compute_score(Xi_Standard_T[i], Xi_Standard_T[38])
# #     mics.append(mine.mic())
# #     print i, mine.mic()
# for i in range(0,7):
    n_selected = n
    x_selected = x[:, J >= J_sort[n_features - n_selected]]
    return x_selected, J >= J_sort[n_features - n_selected]


n_selected = 20  # adjustable: 1, 5, 10, 20, 50, 100
x_fisher_selected, fisher_boollist = fisher(x_train, y_train, n_selected)
logreg = linear_model.LogisticRegression(solver='lbfgs', max_iter=3000)
logreg.fit(x_fisher_selected, y_train)
x_fisher_test = x_test[:, fisher_boollist]
print("Logistic regression accuracy after between/within-class distance feature selection:",
      logreg.score(x_fisher_test, y_test))
logreg.fit(x_train, y_train)
print("Logistic regression accuracy on the raw features:", logreg.score(x_test, y_test))

m = MINE()
mic = []
n_features = x_raw.shape[1]
for i in range(x_raw.shape[1]):
    x = x_train[:, i]
    m.compute_score(x, y_train)
    mic.append(m.mic())
mic_sorted = sorted(mic)
mic = np.array(mic)
x_mic_selected = x_train[:, mic >= mic_sorted[n_features - n_selected]]
mic_boollist = mic >= mic_sorted[n_features - n_selected]
x_mic_test = x_test[:, mic_boollist]
logreg.fit(x_mic_selected, y_train)
print("Logistic regression accuracy after MIC feature selection:",
      logreg.score(x_mic_test, y_test))
num = 0
def fit(self, X, y):
    # initialize phi and the feature set;
    # if the number of features to keep is not set, half of the features will be selected
    n = self.n
    beta = self.beta
    verbose = self.verbose
    if n is None:
        n = int(X.shape[1] / 2)  # the original used X.shape[0] (samples) rather than features

    features = np.arange(X.shape[1]).tolist()
    best_mi = -np.inf
    X_hat = 0
    for xi in features:
        m = MINE()
        m.compute_score(X[:, xi], y)  # compute I(xi, y) and keep the best xi
        mi_xi_y = m.mic()
        if best_mi < mi_xi_y:
            best_mi = mi_xi_y  # the original never updated best_mi, so the last feature always won
            X_hat = xi
    phi = [X_hat]
    features.remove(X_hat)

    # greedily grow phi: relevance I(xi, y) minus a beta-weighted redundancy penalty
    while len(phi) < n:
        mi_scores = np.zeros(len(features))
        for xi_idx, xi in enumerate(features):
            m = MINE()
            m.compute_score(X[:, xi], y)  # compute I(xi, y)
            mi_xi_y = m.mic()
            sum_mi_xi_xj = 0
            for xj in phi:
                # compute I(xi, xj) and accumulate the redundancy penalty
                m = MINE()
                m.compute_score(X[:, xi], X[:, xj])
                sum_mi_xi_xj += m.mic()
            mi_scores[xi_idx] = mi_xi_y - beta * sum_mi_xi_xj
            if verbose >= 2:
                print "mi_scores for xi:{xi} is {mi_scores}".format(xi=xi, mi_scores=mi_scores[xi_idx])
        X_hat = np.argmax(mi_scores)
        if verbose == 1:
            print "X_hat is {X_hat}".format(X_hat=X_hat)
        X_hat = features[X_hat]
        phi.append(X_hat)
        features.remove(X_hat)

    self.phi = phi
    self.features = features
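# A short usage sketch for the fit() method above, assuming it belongs to a small
# selector class -- here given the hypothetical name MRMRSelector, with attributes
# n, beta and verbose matching the ones fit() reads. Only the first two synthetic
# columns actually drive y, so they should end up in phi.
import numpy as np

np.random.seed(0)
X_demo = np.random.rand(300, 8)
y_demo = X_demo[:, 0] + X_demo[:, 1] ** 2

selector = MRMRSelector(n=3, beta=1.0, verbose=0)  # hypothetical constructor
selector.fit(X_demo, y_demo)
print(selector.phi)  # indices of the selected features, e.g. starting with 0 and 1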
rf = RandomForestClassifier()
rf.fit(data, mark)
algorithm["RF"] = rank_to_dict(rf.feature_importances_, names)

#GBDT
gbdt = GradientBoostingClassifier()
gbdt.fit(data, mark)
algorithm["GBDT"] = rank_to_dict(gbdt.feature_importances_, names)

#Extra
model = ExtraTreesClassifier()
model.fit(data, mark)
algorithm["Extra"] = rank_to_dict(model.feature_importances_, names)

#MIC
mine = MINE()
mic_scores = []
res = []
for i in range(len(data[0])):
    for num in data:
        res.append(num[i])
    mine.compute_score(res, mark)
    m = mine.mic()
    mic_scores.append(m)
    res = []
algorithm["MIC"] = rank_to_dict(mic_scores, names)

#Linear regression
lr = LinearRegression(normalize=True)
lr.fit(data, mark)
algorithm["Linear"] = rank_to_dict(np.abs(lr.coef_), names)
from minepy import MINE
from scipy.stats import pearsonr, spearmanr
import nlcor
import numpy as np
import matplotlib.pyplot as plt
import time

mine = MINE()
n = 10000
m = 100  # 1% noise

start = time.time()  # record the start time

# x and y are 1-D float arrays
"""
Generate random data, then check the correlations.
Reference - https://datascienceschool.net/view-notebook/ff367da95afc43ed8ae6ec30efc0fb9f/
"""
plt.figure(figsize=(8, 6))

#plt.subplot(231)
x1 = np.random.uniform(-50, 50, n)
#x1 = np.random.uniform(-50, 50, n)
y1 = 2 * x1**2 + np.random.uniform(-50, 50, n)
#plt.scatter(x1, y1)
#mine.compute_score(x1, y1)
#print("random - x1, y1", mine.mic())
#plt.title("MIC={0:0.3f}".format(mine.mic()))

#plt.subplot(232)
x2 = np.random.uniform(-50, 50, n)
res = []
res.append(pd.read_csv("./avg_xgbs_discret_feature_5.csv").score.values)
res.append(pd.read_csv("./R_7199.csv").score.values)
res.append(pd.read_csv("./rank_feature_xgb_ensemble.csv").score.values)
res.append(pd.read_csv("./avg_xgbs_discret_feature_10.csv").score.values)
res.append(pd.read_csv("./based_on_select_rank_feature.csv").score.values)
res.append(pd.read_csv("./xgb717.csv").score.values)
res.append(pd.read_csv("./725.csv").score.values)
res.append(pd.read_csv("./svm6938.csv").score.values)

cm = []
for i in range(8):
    tmp = []
    for j in range(8):
        m = MINE()
        m.compute_score(res[i], res[j])
        tmp.append(m.mic())
    cm.append(tmp)

import numpy as np
import matplotlib.pyplot as plt


def plot_confusion_matrix(cm, title, cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation="nearest", cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(8)
    plt.xticks(tick_marks, fs, rotation=45)
# the enclosing directory check was lost in this fragment; it is assumed to
# have been an os.path.isdir(log_dir) test
if os.path.isdir(log_dir):
    logfiles = glob.glob(log_dir + '/*')  # the original globbed the literal string 'log_dir/*'
    print "Directory exists:", log_dir
    print "Removing all log files..."
    for f in logfiles:
        os.remove(f)
else:
    print "Creating directory:", log_dir
    makedirs(log_dir)

sys.stdout = Log(sys.stdout, output_dir+'/'+mla+'_Report_'+str(GPS)+'_'+nfilename+'.log')
profile_filename = log_dir+'/'+'Profiling_'+mla+'_'+str(GPS)+'_'+nfilename+'.result'
prof = hotshot.Profile(profile_filename)
prof.start()

mine = MINE(alpha=0.6, c=15)

f_input1 = np.loadtxt(input_file1+'.txt')
f_input2 = np.loadtxt(input_file2+'.txt')

if len(f_input1) < len(f_input1.T):
    f1 = f_input1
else:
    f1 = f_input1.T
if len(f_input2) < len(f_input2.T):
    f2 = f_input2
else:
    f2 = f_input2.T

Mdim = len(f1)
Mat = np.zeros((Mdim, Mdim))
def MIC(x, y):
    mine = MINE(alpha=0.6, c=5)
    mine.compute_score(x, y)
    return mine.mic()
def f5():
    return np.sin(16*np.pi*x) + noise * (i/n_noise) * r

def f6():
    return x**(1/4) + noise * (i/n_noise) * r

def f7():
    return (2*np.random.binomial(1, 0.5, n)-1) * (np.sqrt(1-(2*x-1)**2)) \
        + (noise/4) * (i/n_noise) * r

def f8():
    return (x > 0.5) + noise * 5 * (i/n_noise) * r

ff = [f1, f2, f3, f4, f5, f6, f7, f8]
mine = MINE(alpha=mine_alpha, c=mine_c)
mic_power = np.empty((len(ff), n_noise))
gmic_power = np.empty((len(ff), n_noise))
r2_power = np.empty((len(ff), n_noise))

np.random.seed(0)
for i in range(1, n_noise+1):
    for j, f in enumerate(ff):
        mic_null, gmic_null, r2_null = [], [], []
        mic_alt, gmic_alt, r2_alt = [], [], []

        # null hypothesis
        for k in range(1, n_null+1):
            print i, j, k
            x = np.random.rand(n)
            r = np.random.randn(n)
            y = f()
    x='y', data=df,
    order=df.sort_values('y', ascending=False).x.to_list(),
    color='blue')

for p in splot.patches:
    splot.annotate(format(p.get_width(), '.3f'),
                   (p.get_width(), p.get_y() + p.get_height() / 2.),
                   ha='center', va='center',
                   xytext=(15, 0),
                   textcoords='offset points')

# mic tic dcor rdc
from minepy import MINE

m = MINE()

def mic(x, y):
    m.compute_score(x, y)
    return m.mic()

def tic(x, y):
    m.compute_score(x, y)
    return m.tic(norm=True)

from scipy.spatial.distance import pdist, squareform
def mic(x, y):
    m = MINE()
    m.compute_score(x, y)
    return (m.mic(), 0.5)
import numpy as np
from minepy import MINE


def print_stats(mine):
    print "MIC", mine.mic()
    print "MAS", mine.mas()
    print "MEV", mine.mev()
    print "MCN (eps=0)", mine.mcn(0)
    print "MCN (eps=1-MIC)", mine.mcn_general()


x = np.linspace(0, 1, 10)
y = np.sin(2*x) + x
print(x)
print(y)

mine = MINE(alpha=0.6, c=15)
mine.compute_score(x, y)

print "Without noise:"
print_stats(mine)
print

np.random.seed(0)
y += np.random.uniform(-1, 1, x.shape[0])  # add some noise
mine.compute_score(x, y)

print "With noise:"
print_stats(mine)
def f5():
    return np.sin(16*np.pi*x) + noise * (i/n_noise) * r

def f6():
    return x**(1/4) + noise * (i/n_noise) * r

def f7():
    return (2*np.random.binomial(1, 0.5, n)-1) * (np.sqrt(1-(2*x-1)**2)) \
        + (noise/4) * (i/n_noise) * r

def f8():
    return (x > 0.5) + noise * 5 * (i/n_noise) * r

ff = [f1, f2, f3, f4, f5, f6, f7, f8]
mine_approx = MINE(alpha=mine_alpha, c=mine_c, est="mic_approx")
mine_e = MINE(alpha=mine_alpha, c=mine_c, est="mic_e")
mic_approx_power = np.empty((len(ff), n_noise))
mic_e_power = np.empty((len(ff), n_noise))
tic_e_power = np.empty((len(ff), n_noise))
r2_power = np.empty((len(ff), n_noise))

np.random.seed(0)
for i in range(1, n_noise+1):
    for j, f in enumerate(ff):
        print "Noise: %d, function: %d" % (i, j)
        mic_approx_null, mic_e_null, tic_e_null, r2_null = [], [], [], []
        mic_approx_alt, mic_e_alt, tic_e_alt, r2_alt = [], [], [], []
size = 300
x = np.random.normal(0, 1, size)  # normal(mean, stdev, size): Gaussian samples
print "Lower noise", pearsonr(x, x + np.random.normal(0, 1, size))
print "Higher noise", pearsonr(x, x + np.random.normal(0, 10, size))

# Obvious drawback: as a feature-ranking mechanism, Pearson correlation is only
# sensitive to linear relationships; even if two variables are related one-to-one,
# the Pearson correlation can still be close to 0.
a = np.random.uniform(-1, 1, 100000)  # uniform(low, high, size): uniform random samples
print pearsonr(a, a**2)[0]

# 1.2 Mutual information and maximal information coefficient, range [0, 1].
# Mutual information is awkward to use directly for feature selection; the maximal
# information coefficient first searches for an optimal discretization and then turns
# the mutual information value into a measure on [0, 1]. minepy provides MIC.
from minepy import MINE

m = MINE()
x = np.random.uniform(-1, 1, 10000)
m.compute_score(x, x**2)
print m.mic()

# 1.3 Distance correlation, range [0, 1].
# Distance correlation was designed to overcome the weaknesses of Pearson correlation.
# In the x vs x^2 example, even a Pearson correlation of 0 does not let us conclude that
# the two variables are independent (they may be nonlinearly related); but if the
# distance correlation is 0, we can say the two variables are independent.
import numpy as np

def dist(x, y):
    #1d only
    return np.abs(x[:, None] - y)
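# The dist() helper above is only the first piece of a distance-correlation
# computation. A minimal sketch of the remaining steps (double-centred distance
# matrices, then the normalized distance covariance), still for the 1-D case;
# this completion is an assumption, not part of the original source.
def d_n(x):
    d = dist(x, x)
    dn = d - d.mean(0) - d.mean(1)[:, None] + d.mean()
    return dn

def dcov_all(x, y):
    # distance covariance and the derived distance correlation
    dnx = d_n(x)
    dny = d_n(y)
    denom = float(dnx.shape[0] * dnx.shape[1])
    dc = (dnx * dny).sum() / denom
    dvx = (dnx**2).sum() / denom
    dvy = (dny**2).sum() / denom
    dr = dc / (np.sqrt(dvx) * np.sqrt(dvy))
    return np.sqrt(dr)

a_small = a[:2000]  # the full 100000-sample array would need a 100000x100000 distance matrix
print(dcov_all(a_small, a_small**2))  # clearly nonzero, unlike the Pearson correlation above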
def maximal_information_coefficient(self):
    mine = MINE()
    mine.compute_score(self.var1, self.var2)
    m = mine.mic()
    return m
def MICvalue():
    choice_user = request.get_json()  # data selected by the user on the front end
    flag = True
    choice0 = {}
    choice1 = {}
    choice0['db'] = choice_user[0][0]
    choice0['col'] = choice_user[0][1]
    choice0['field'] = choice_user[0][2]
    choice1['db'] = choice_user[1][0]
    choice1['col'] = choice_user[1][1]
    choice1['field'] = choice_user[1][2]
    print("choice0", choice0)
    print("choice1", choice1)

    # fetch the data
    # client = MongoClient("10.72.100.5",8027,username='******',password='******')
    client = MongoClient("10.72.100.5", 8027)
    db = client.admin
    db.authenticate("double", "double")
    conn = MongoClient(host='mongodb://10.72.100.5:8027/' + 'admin',
                       username='******', password='******')
    database = conn[choice0['db']]
    # in the projection, 1 shows a field and 0 hides it; _id is shown by default
    collection0 = database[choice0['col']]
    results0 = collection0.find({}, {
        choice0['field']: 1,
        "_id": 0
    }).sort("_id", pymongo.ASCENDING)  # sort by _id
    collection1 = database[choice1['col']]
    results1 = collection1.find({}, {
        choice1['field']: 1,
        "_id": 0
    }).sort("_id", pymongo.ASCENDING)  # sort by _id

    rawdata0 = []
    rawdata1 = []
    for result in results0:
        rawdata0.append(result[choice0['field']])
    for result in results1:
        rawdata1.append(result[choice1['field']])

    # clean the data (assumes rawdata0 and rawdata1 have the same length)
    for i in range(len(rawdata0) - 1, -1, -1):
        if rawdata0[i] and rawdata1[i]:
            try:
                # convert numeric strings to floats
                rawdata0[i] = float(rawdata0[i])
                rawdata1[i] = float(rawdata1[i])
            except ValueError:
                flag = False  # a non-numeric field exists
        else:
            del rawdata0[i]
            del rawdata1[i]
    print("rawdata0", rawdata0)
    print("rawdata1", rawdata1)

    # compute the MIC
    m = MINE()
    if rawdata0:  # when rawdata0 and rawdata1 are non-empty
        if flag:
            # scale the data to the [0, 1] interval
            min_max_scaler = MinMaxScaler()
            data1_std = min_max_scaler.fit_transform(
                np.array(rawdata0).reshape(-1, 1))
            data2_std = min_max_scaler.fit_transform(
                np.array(rawdata1).reshape(-1, 1))
            data1 = data1_std.reshape(1, -1)[0]
            data2 = data2_std.reshape(1, -1)[0]
            m.compute_score(data1, data2)
            return json.dumps(m.mic())
        else:
            return "Please select numeric fields"
    else:
        return "The two selected fields have no matching data"
# %%
D = []
for theorem, strategy_results in strategy_evaluation_training_data.items():
    scores = [(1.0/strategy_results[str(i)][1]) if strategy_results[str(i)][0]
              else MINIMUM_SCORE for i in range(NUM_STRATEGIES)]
    k = tuple(theorem.split('/'))
    try:
        D.append((problem_features.loc[[k]].iloc[0].tolist(), scores))
    except Exception as e:
        print(e, type(e))
        pass

# %%
DFX = np.array([d[0] for d in D])
DFY = [np.array([d[1][i] for d in D]) for i in range(5)]

mine = MINE()
mics = []
for j in range(NUM_STRATEGIES):
    for i in range(DFX.shape[1]):
        mine.compute_score(DFX[:, i], DFY[j])
        mics.append((features[i], j, mine.mic()))
        #mics.append((features[i], mine.mic(), abs(pearsonr(DFX[:, i], DFY[0])[0])))

# %%
import csv

with open('data/scores.csv', mode='w') as f:
    cw = csv.writer(f)
    cw.writerow(['x', 'y', 'score'])
    for m in mics:
        cw.writerow([str(v) for v in m])
def performMIC(transposed_list):
    mic_scores = []
    for counter1 in range(0, len(transposed_list)-1):
        for counter2 in range(counter1+1, len(transposed_list)):
            mine = MINE(alpha=0.6, c=15)
            mine.compute_score(transposed_list[counter1], transposed_list[counter2])
            if (mine.mic() > 0.6):
                mic_score = {}
                mic_score['x'] = counter1
                mic_score['y'] = counter2
                mic_score['mic'] = mine.mic()
                mic_scores.append(mic_score)
    return mic_scores
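# A tiny usage sketch for performMIC above: after transposition each row of
# transposed_list is one variable, so pass a list of variables, each holding its
# observations. The data here is made up for illustration.
import numpy as np

series = np.random.rand(5, 100).tolist()  # 5 variables, 100 observations each
edges = performMIC(series)
print(edges)  # the (x, y) pairs whose MIC exceeds the 0.6 threshold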