def __init__(self, personLoader, person):
    print("loading person ", person)
    AModel.__init__(self, personLoader)

    classificatorName = str(self.personLoader.classificator.name)

    # load all features & keep them in memory
    self.y = load('global_y_per_person' + classificatorName + '_p' + str(person))
    if self.y is None:
        print('[Warn] Rebuilding cache')
        self.X, self.y = self.personLoader.load(person)
        dump(self.X, 'global_X_per_person_p' + str(person))
        dump(self.y, 'global_y_per_person' + classificatorName + '_p' + str(person))
    else:
        self.X = load('global_X_per_person_p' + str(person))

    self.X = np.array(self.X)
    self.y = np.array(self.y)

    for index, val in enumerate(np.std(self.X, axis=0)):
        if val == 0:
            print('warning: zero std for feature index ', index,
                  ' (', personLoader.featureExtractor.getFeatureNames()[index], ')')

    # manual feature standardization
    self.X = self.X - np.average(self.X, axis=0)
    self.X = np.true_divide(self.X, np.std(self.X, axis=0))
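
# The `load`/`dump` cache helpers used throughout these files are defined
# elsewhere. Below is a minimal sketch that matches how they are called above
# (a cache miss returns None so callers can rebuild, and both accept an
# optional `path` keyword); the pickle backend, the `.pkl` suffix and the
# default cache directory are assumptions, not the original code.
import os
import pickle

CACHE_DIR = 'cache'  # assumed default location

def dump(obj, name, path=CACHE_DIR):
    os.makedirs(path, exist_ok=True)
    with open(os.path.join(path, name + '.pkl'), 'wb') as f:
        pickle.dump(obj, f)

def load(name, path=CACHE_DIR):
    fname = os.path.join(path, name + '.pkl')
    if not os.path.exists(fname):
        return None  # cache miss => caller rebuilds
    with open(fname, 'rb') as f:
        return pickle.load(f)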
def getAnalytics():
    t0 = time.time()

    feat_corr = load("valence_feat_pval")
    if feat_corr is None:
        feat_corr = valenceCorrelationWorker()
        dump(feat_corr, "valence_feat_pval")

    # feat_corr holds the correlation of each feature for each person
    # => dimensionality reduction & 3D plot
    X_3D = PCA(n_components=3).fit_transform(feat_corr)
    X_2D = PCA(n_components=2).fit_transform(feat_corr)

    fig = plt.figure(1, figsize=(4, 3))
    plt.clf()
    ax = Axes3D(fig, rect=[0, 0, 0.95, 1], elev=48, azim=134)
    ax.scatter(X_3D[:, 0], X_3D[:, 1], X_3D[:, 2])
    plt.title("valence correlations for different persons 3D")
    plt.show()

    plt.clf()
    plt.scatter(X_2D[:, 0], X_2D[:, 1])
    plt.title("valence correlations for different persons 2D")
    plt.show()

    t1 = time.time()
    print("valence complete, time spent: " + str(t1 - t0))

    feat_corr = load("arousal_feat_pval")
    if feat_corr is None:
        feat_corr = arousalCorrelationWorker()
        dump(feat_corr, "arousal_feat_pval")

    # same treatment for arousal
    X_3D = PCA(n_components=3).fit_transform(feat_corr)
    X_2D = PCA(n_components=2).fit_transform(feat_corr)

    fig = plt.figure(1, figsize=(4, 3))
    plt.clf()
    ax = Axes3D(fig, rect=[0, 0, 0.95, 1], elev=48, azim=134)
    ax.scatter(X_3D[:, 0], X_3D[:, 1], X_3D[:, 2])
    plt.title("arousal correlations for different persons 3D")
    plt.show()

    plt.clf()
    plt.scatter(X_2D[:, 0], X_2D[:, 1])
    plt.title("arousal correlations for different persons 2D")
    plt.show()

    t2 = time.time()
    print("arousal complete, time spent: " + str(t2 - t1))
    print("total time spent: " + str(t2 - t0))
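
# valenceCorrelationWorker / arousalCorrelationWorker are defined elsewhere.
# A minimal sketch of the shape getAnalytics() expects from them: one row of
# per-feature Pearson correlations per person, so PCA can project persons
# into 2D/3D. The cached-array loading is an assumption for illustration.
def valenceCorrelationWorker_sketch(stop_person=33):
    feat_corr = []
    for person in range(1, stop_person):
        X = np.array(load('X_p' + str(person)))
        y = np.array(load('cont_y_p' + str(person)))
        feat_corr.append([pearsonr(X[:, i], y)[0] for i in range(X.shape[1])])
    return np.array(feat_corr)  # shape: (persons, features)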
def probVsDim(self):
    data = [[[] for i in range(len(self.accs[0][0]))] for j in range(len(self.accs[0]))]

    for person_index, person in enumerate(self.accs):
        for model_index, model in enumerate(person):
            for metric_index, metric in enumerate(model):
                # [best_feat, best_featNames, best_score, best_std, all_scores, all_stds,
                #  indices, test_acc, test_pred, test_prob, y_test, y_test_cont]
                for (pred_lbl, probs, test_lbl, value) in zip(metric[-4], metric[-3], metric[-2], metric[-1]):
                    if pred_lbl == test_lbl:
                        data[model_index][metric_index].append((
                            probs[pred_lbl],
                            np.abs(value - 5)
                        ))

    dump(data, "corrData", path=self.ddpad)

    f = open(self.rpad + "corrs.csv", 'w')

    # header line
    f.write('model;')
    for name in self.metricnames:
        f.write(name + ';')
    f.write('\n')

    # much efficiency wow </sarcasm>
    modelNames = ['SVM', 'RF']
    for model_index, model in enumerate(data):
        f.write(modelNames[model_index] + ';')
        for metric in model:
            probs = []
            dists = []
            for tup in metric:
                probs.append(tup[0])
                dists.append(tup[1])
            corr = pearsonr(dists, probs)[0]
            f.write(str(corr) + ';')
        f.write('\n')
    f.close()
def run(self, person, criterion):
    # http://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html
    classificatorName = str(self.personLoader.classificator.name)

    # load all features & keep them in memory
    y = load('global_y_per_person' + classificatorName + '_p' + str(person))
    if y is None:
        print('[Warn] Rebuilding cache')
        X, y = self.personLoader.load(person)
        dump(X, 'global_X_per_person_p' + str(person))
        dump(y, 'global_y_per_person' + classificatorName + '_p' + str(person))
    else:
        X = load('global_X_per_person_p' + str(person))

    # grow forest
    forest = RandomForestClassifier(
        n_estimators=5000,
        max_features='auto',
        criterion=criterion,
        n_jobs=-1,
        random_state=0
    )

    normalize(X, copy=False)  # in-place sample-wise normalization (sklearn)

    # fit forest
    forest.fit(X, y)

    # get importances
    importances = forest.feature_importances_
    std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)
    indices = np.argsort(importances)[::-1]

    featNames = self.personLoader.featureExtractor.getFeatureNames()

    return {
        'classificatorName': classificatorName,
        'featNames': featNames,
        'importances': importances,
        'std': std,
        'indices': indices,  # [index of first, index of second, ...]
        'criterion': criterion
    }
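
# Illustrative only: a small consumer of the dict returned by run() above,
# printing the top-k features by importance. Not part of the original code.
def print_top_features(result, k=10):
    for rank, idx in enumerate(result['indices'][:k]):
        print('%2d. %s  importance = %.4f (+/- %.4f)'
              % (rank + 1, result['featNames'][idx],
                 result['importances'][idx], result['std'][idx]))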
def probVsDim(self):
    data = [[] for j in range(len(self.accs))]

    for metric_index, metric in enumerate(self.accs):
        # [best_feat, best_featNames, best_score, best_std, all_scores, all_stds,
        #  indices, test_acc, test_pred, test_prob, y_test, y_test_cont]
        for (pred_lbl, probs, test_lbl, value) in zip(metric[-4], metric[-3], metric[-2], metric[-1]):
            if pred_lbl == test_lbl:
                data[metric_index].append([
                    probs[pred_lbl],
                    np.abs(value - 5)
                ])

    dump(data, "corrData", path=self.ddpad)

    f = open(self.rpad + "corrs.csv", 'w')

    # header line
    f.write('metric;')
    for name in self.metricnames:
        f.write(name + ';')
    f.write('\n')

    # much efficiency wow </sarcasm>
    f.write('SVM' + ';')
    for metric in data:
        metric = np.array(metric)
        probs = metric[:, 0]
        dists = metric[:, 1]
        corr = pearsonr(dists, probs)[0]
        f.write(str(corr) + ';')
    f.write('\n')
    f.close()
def run(self, criterion):
    # http://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html
    classificatorName = str(self.personLoader.classificator.name)
    featNames = np.array(self.personLoader.featureExtractor.getFeatureNames())

    # load all features & keep them in memory
    y = load('global_y_allpersons' + classificatorName)
    if y is None:
        print('[Warn] Rebuilding cache')
        X, y = self.personLoader.load()
        dump(X, 'global_X_allpersons')
        dump(y, 'global_y_allpersons' + classificatorName)
    else:
        X = load('global_X_allpersons')

    self.getIntermediateResult(criterion, X, y, 'all features')

    # step 1: rank features by importance
    importances, std, indices = self.step1(criterion, X, y)

    # step 2: filter the ranked features
    indices = self.step2(indices)
    self.getIntermediateResult(criterion, X[:, indices], y, '80%')

    # step 3: add features one by one
    acc_list, best_count, best_metric = self.step3(criterion, indices, X, y, 10, featNames)

    return {
        'classificatorName': classificatorName,
        'featNames': self.personLoader.featureExtractor.getFeatureNames(),
        'global_importances': importances,
        'global_std': std,
        'global_indices': indices,  # [index of first, index of second, ...]
        'criterion': criterion
    }
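
# step1/step2/step3 are methods defined elsewhere in this class. Given the
# '80%' label passed to getIntermediateResult above, step2 presumably keeps
# the top 80% of the ranked indices; a minimal sketch of that reading (the
# 0.8 ratio is an assumption, not confirmed by this file):
def step2_sketch(indices, keep_ratio=0.8):
    return indices[:int(len(indices) * keep_ratio)]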
def run(self):
    for person in range(1, self.stopperson + 1):
        # load person data
        y_cont = load('cont_y_p' + str(person), path=self.ddpad)
        if y_cont is None:
            print('[Warn] Rebuilding cache - person ' + str(person))
            personLdr = personLoader.NoTestsetLoader(self.classifier, self.featExtr)
            X, y_cont = personLdr.load(person)
            dump(X, 'X_p' + str(person), path=self.ddpad)
            dump(y_cont, 'cont_y_p' + str(person), path=self.ddpad)
        else:
            X = load('X_p' + str(person), path=self.ddpad)

        # manual feature standardization
        X = X - np.average(X, axis=0)
        X = np.true_divide(X, np.std(X, axis=0))

        self.X[person - 1] = np.array(X)
        self.y_cont[person - 1] = np.array(y_cont)

        # discretize: ratings <= 5 => low (0), > 5 => high (1)
        y_disc = np.array(y_cont)
        y_disc[y_disc <= 5] = 0
        y_disc[y_disc > 5] = 1
        self.y_disc[person - 1] = y_disc

    self.results = self.getMetrics()
    self.genReport()

    self.accs = self.getAccs()
    self.probVsDim()
    self.genAccReport()
def run(self):
    # create 32 classifiers
    print('initialising the 32 classifiers ...')
    stop_person = 33
    if stop_person < 33:
        print("[warn] not using all persons")

    pool = Pool(processes=POOL_SIZE)
    self.classifiers = pool.map(self.initClassifiers, range(1, stop_person))
    pool.close()
    pool.join()

    to_keep = {
        'all_orig_featureNames': self.classifiers[0].personLoader.featureExtractor.getFeatureNames(),
        'criterion': self.classifiers[0].criterion,
        'classificatorName': self.classifiers[0].personLoader.classificator.name,
        'stop_person': stop_person,
        'threshold': self.threshold
    }

    # step 1: variable ranking: get all importances of the different features
    print('step 1: getting importances')
    importances = load('importances_once')
    if importances is None:
        print("[warn] rebuilding importances")
        pool = Pool(processes=POOL_SIZE)
        importances = np.array(pool.map(self.getImportance, range(1, stop_person)))
        pool.close()
        pool.join()
        dump(importances, 'importances_once')
    to_keep['all_importances'] = importances

    avg_importances = np.average(importances[:, :, 1], axis=0)
    std_importances = [np.std(importances[:, i, 1]) for i in range(len(importances[0]))]
    to_keep['avg_importances'] = avg_importances
    to_keep['std_importances'] = std_importances

    # step 2: elimination: remove everything below the threshold,
    # i.e. keep everything above the threshold
    print('step 2: filtering features - threshold')
    indexes_to_keep = [i for i, val in enumerate(avg_importances) if val > self.threshold]
    for c in self.classifiers:
        c.filterFeatures(indexes_to_keep)
    avg_importances = avg_importances[indexes_to_keep]  # clean up memory

    to_keep['step1_featureNames'] = self.classifiers[0].personLoader.featureExtractor.getFeatureNames()
    to_keep['step1_avg_importances'] = avg_importances

    # sort features
    indices = np.array(np.argsort(avg_importances)[::-1])
    for c in self.classifiers:
        c.filterFeatures(indices)

    # add features one by one and select the smallest model with the best OOB score
    print('building tree')
    highest_oob_score = 0
    highest_index = 1
    oob_scores = []
    for i in range(1, len(indices) + 1):
        self.count = i
        pool = Pool(processes=POOL_SIZE)
        oob_errors = pool.map(self.getOOBErrors, range(1, stop_person))
        pool.close()
        pool.join()

        avg_oob = np.average(oob_errors, axis=0)
        print("feat count: " + str(i) + " - error: " + str(avg_oob))
        oob_scores.append(avg_oob)

        if avg_oob > highest_oob_score:  # TODO std
            highest_oob_score = avg_oob
            highest_index = i
    to_keep['step2_oob_scores'] = oob_scores

    # select the smallest tree with the lowest error => lowest index & lowest error
    for c in self.classifiers:
        c.filterFeatures(list(range(highest_index)))
    to_keep['step3_size'] = highest_index
    to_keep['step3_oob'] = highest_oob_score
    print("selected tree size: " + str(highest_index) + ' - oob: ' + str(highest_oob_score))

    print('final building phase')
    # restart with an empty feature set and add features one by one;
    # only keep features that improve the score by at least the threshold
    prev_oob = 0
    used_features = []
    used_accs = []
    for i in range(highest_index):
        self.count = i + 1
        pool = Pool(processes=POOL_SIZE)
        oob_errors = pool.map(self.getOOBErrors, range(1, stop_person))
        pool.close()
        pool.join()

        avg_oob = np.average(oob_errors, axis=0)
        print("feat count: " + str(i) + " - error: " + str(avg_oob))

        if avg_oob - self.threshold > prev_oob or prev_oob == 0:
            prev_oob = avg_oob
            used_features.append(i)
            used_accs = oob_errors
    to_keep['step4_used_accs'] = used_accs
    to_keep['step4_features'] = used_features

    for c in self.classifiers:
        c.filterFeatures(list(range(highest_index)))
    to_keep['step4_featureNames'] = self.classifiers[0].personLoader.featureExtractor.getFeatureNames()

    dump(to_keep, 'to_keep')
    return to_keep
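
# getOOBErrors is defined elsewhere; run() maps it over all persons with the
# current feature count in self.count. A minimal sketch of one plausible
# implementation using sklearn's built-in out-of-bag estimate; the tree count
# and the per-person data access via self.classifiers are assumptions:
def getOOBErrors_sketch(self, person):
    c = self.classifiers[person - 1]
    forest = RandomForestClassifier(
        n_estimators=1000,
        max_features='auto',
        oob_score=True,
        n_jobs=-1
    )
    forest.fit(c.X[:, :self.count], c.y)  # only the first `count` ranked features
    return forest.oob_score_  # fraction of correct out-of-bag predictions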
def getAccs(self, person):
    print('getting accs for person ' + str(person))

    to_ret = load('accs_p' + str(person), path=self.ddpad)
    if to_ret is None:
        # load person data: load all features & keep them in memory
        y = load('cont_y_p' + str(person), path=self.ddpad)
        if y is None:
            print('[Warn] Rebuilding cache - person ' + str(person))
            personLdr = personLoader.NoTestsetLoader(self.classifier, self.featExtr)
            X, y = personLdr.load(person)
            dump(X, 'X_p' + str(person), path=self.ddpad)
            dump(y, 'cont_y_p' + str(person), path=self.ddpad)
        else:
            X = load('X_p' + str(person), path=self.ddpad)

        y = np.array(y)

        # manual feature standardization
        X = X - np.average(X, axis=0)
        X = np.true_divide(X, np.std(X, axis=0))

        # train / test split
        X, X_test, y, y_test_cont = train_test_split(X, y, test_size=10, random_state=17)
        y[y <= 5] = 0
        y[y > 5] = 1
        y_test = np.array(y_test_cont)
        y_test[y_test <= 5] = 0
        y_test[y_test > 5] = 1

        to_ret = []
        for model_index, model in enumerate([
            # SVC(kernel='linear'),
            SVC(kernel='rbf')
            # KNN(n_neighbors=3),
            # KNN(n_neighbors=5),
            # KNN(n_neighbors=7),
            # lda needs two features
            # RandomForestClassifier(
            #     n_estimators=1000,
            #     max_features='auto',
            #     criterion='gini',
            #     n_jobs=-1,
            # )
        ]):
            model_to_ret = []
            for metric in self.results[person - 1]:
                featNames = np.array(self.featExtr.getFeatureNames())  # take a clean copy

                # sort features
                indices = np.array(np.argsort(metric)[::-1])
                # take top threshold
                indices = indices[:self.threshold]

                # apply
                X_model = np.array(X[:, indices])
                X_model_test = np.array(X_test[:, indices])
                featNames = featNames[indices]

                # greedy forward selection over the ranked features
                best_feat, best_featNames = [], []
                all_scores, all_stds = [], []
                best_score, best_std = 0, 0
                for i in range(self.threshold):
                    to_keep = best_feat[:]
                    to_keep.append(i)
                    X_temp = np.array(X_model[:, to_keep])

                    # get scores
                    run_scores = []
                    for tr, te in KFold(n=len(X_temp), n_folds=5, shuffle=True, random_state=17):
                        model.fit(X_temp[tr], y[tr])
                        run_scores.append(self.accuracy(model.predict(X_temp[te]), y[te]))

                    new_score = np.average(run_scores)
                    new_std = np.std(run_scores)
                    all_scores.append(new_score)
                    all_stds.append(new_std)

                    # better?
                    if new_score - new_std > best_score - best_std:
                        best_score = new_score
                        best_std = new_std
                        best_feat = to_keep
                        best_featNames.append(featNames[i])

                # get test score
                if model_index == 0:
                    test_model = SVC(kernel='rbf', probability=True)
                else:
                    test_model = RandomForestClassifier(
                        n_estimators=2000,
                        max_features='auto',
                        criterion='gini',
                        n_jobs=-1
                    )
                test_model.fit(X_model[:, best_feat], y)
                X_model_test = np.array(X_model_test[:, best_feat])
                test_pred = test_model.predict(X_model_test)
                test_prob = test_model.predict_proba(X_model_test)
                test_acc = self.accuracy(test_pred, y_test)

                model_to_ret.append([best_feat, best_featNames, best_score, best_std,
                                     all_scores, all_stds, indices, test_acc,
                                     test_pred, test_prob, y_test, y_test_cont])
            to_ret.append(model_to_ret)

        dump(to_ret, 'accs_p' + str(person), path=self.ddpad)
    return to_ret
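
# accuracy / self.accuracy is defined elsewhere; given how it is called
# throughout (predictions first, ground truth second, returning a fraction
# that is averaged over folds), a minimal sketch:
def accuracy_sketch(predictions, truth):
    predictions, truth = np.array(predictions), np.array(truth)
    return np.sum(predictions == truth) / float(len(truth))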
def runPers(self, person):
    # load person data
    pers_results = load('pers_res_p' + str(person), path=self.ddpad)
    if pers_results is None:
        pers_results = []

        # load all features & keep them in memory
        y_cont = load('cont_y_p' + str(person), path=self.ddpad)
        if y_cont is None:
            print('[Warn] Rebuilding cache - person ' + str(person))
            personLdr = personLoader.NoTestsetLoader(self.classifier, self.featExtr)
            X, y_cont = personLdr.load(person)
            dump(X, 'X_p' + str(person), path=self.ddpad)
            dump(y_cont, 'cont_y_p' + str(person), path=self.ddpad)
        else:
            X = load('X_p' + str(person), path=self.ddpad)

        y_disc = np.array(y_cont)
        y_disc[y_disc <= 5] = 0
        y_disc[y_disc > 5] = 1

        # manual feature standardization
        X = X - np.average(X, axis=0)
        X = np.true_divide(X, np.std(X, axis=0))

        # pearson
        corr = []
        for index in range(len(X[0])):
            corr.append(pearsonr(X[:, index], y_cont)[0])
        pers_results.append(corr)

        # mutual information & distance correlation
        mi = []
        dcorr = []
        for feature in np.transpose(X):
            # normalized mutual information; the marginal entropies are taken from
            # the contingency table (the original passed the same two arguments to
            # entropy() for both entX and entY, a copy-paste bug)
            c_xy = np.histogram2d(feature, y_cont, 2)[0]
            entX = entropy(np.sum(c_xy, axis=1))
            entY = entropy(np.sum(c_xy, axis=0))
            nMutInf = mutual_info_score(None, None, contingency=c_xy) / float(np.sqrt(entX * entY))
            mi.append(nMutInf)

            # distance correlation
            dc, dr, dvx, dvy = self.dcov_all(feature, y_cont)
            dcorr.append(dr)
        pers_results.append(mi)
        pers_results.append(dcorr)

        # linear regression
        lr = LinearRegression(n_jobs=-1)
        lr.fit(X, y_cont)
        pers_results.append(lr.coef_)

        # lasso regression: pick alpha by 5-fold CV
        alphas = [0.03, 0.1, 0.3, 1, 3, 10]
        best_alpha = 0.01
        best_acc = 0
        for train_index, test_index in KFold(len(y_cont), n_folds=5):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y_cont[train_index], y_cont[test_index]
            lasso = Lasso(alpha=best_alpha)
            lasso.fit(X_train, y_train)
            pred = lasso.predict(X_test)
            best_acc += self.accuracy(pred, y_test)  # was accuracy(pred, y_cont): wrong target
        best_acc /= float(5)
        for alpha in alphas:
            acc = 0
            for train_index, test_index in KFold(len(y_cont), n_folds=5):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y_cont[train_index], y_cont[test_index]
                lasso = Lasso(alpha=alpha)
                lasso.fit(X_train, y_train)
                pred = lasso.predict(X_test)
                acc += self.accuracy(pred, y_test)
            acc /= float(5)
            if acc > best_acc:
                best_acc = acc
                best_alpha = alpha
        lasso = Lasso(alpha=best_alpha)
        lasso.fit(X, y_cont)
        pers_results.append(lasso.coef_)

        # ridge regression: same alpha search
        alphas = [0.03, 0.1, 0.3, 1, 3, 10]
        best_alpha = 0.01
        best_acc = 0
        for train_index, test_index in KFold(len(y_cont), n_folds=5):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y_cont[train_index], y_cont[test_index]
            ridge = Ridge(alpha=best_alpha)
            ridge.fit(X_train, y_train)
            pred = ridge.predict(X_test)
            best_acc += self.accuracy(pred, y_test)
        best_acc /= float(5)
        for alpha in alphas:
            acc = 0
            for train_index, test_index in KFold(len(y_cont), n_folds=5):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y_cont[train_index], y_cont[test_index]
                ridge = Ridge(alpha=alpha)
                ridge.fit(X_train, y_train)
                pred = ridge.predict(X_test)  # was lasso.predict: copy-paste bug
                acc += self.accuracy(pred, y_test)
            acc /= float(5)
            if acc > best_acc:
                best_acc = acc
                best_alpha = alpha
        ridge = Ridge(alpha=best_alpha)
        ridge.fit(X, y_cont)
        pers_results.append(ridge.coef_)

        # SVM
        clf = SVC(kernel='linear')
        clf.fit(X, y_disc)
        svm_weights = (clf.coef_ ** 2).sum(axis=0)
        svm_weights /= float(svm_weights.max())
        pers_results.append(svm_weights)

        # random forest importances, averaged over several runs
        importances = []
        for run in range(self.runs):
            forest = RandomForestClassifier(
                n_estimators=2000,
                max_features='auto',
                criterion='gini',
                n_jobs=-1,
            )
            forest.fit(X, y_disc)
            importances.append(forest.feature_importances_)
            # stds.append(np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0))
        pers_results.append(np.average(importances, axis=0))
        pers_results.append(np.std(importances, axis=0))

        # ANOVA
        anova = SelectKBest(f_regression, k=self.threshold)
        anova.fit(X, y_disc)
        selected_features = anova.get_support()
        pers_results.append(selected_features)

        # linear discriminant analysis
        lda = LinearDiscriminantAnalysis(n_components=1)
        lda.fit(X, y_disc)
        pers_results.append(lda.coef_[0])

        # principal component analysis
        pca = PCA(n_components=1)
        pca.fit(X)
        pers_results.append(pca.components_[0])

        # absolute values
        pers_results = np.absolute(np.array(pers_results))
        dump(pers_results, 'pers_res_p' + str(person), path=self.ddpad)
    return np.array(pers_results)
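
# self.dcov_all is defined elsewhere. Given the (dc, dr, dvx, dvy) return
# signature used above, a minimal sketch of Szekely-style distance
# covariance / correlation between two 1-D samples; treating this as the
# original implementation is an assumption.
def dcov_all_sketch(x, y):
    x = np.asarray(x, dtype=float)[:, None]
    y = np.asarray(y, dtype=float)[:, None]
    n = len(x)

    def centered_dist(a):
        d = np.abs(a - a.T)  # pairwise distance matrix
        return d - d.mean(axis=0) - d.mean(axis=1)[:, None] + d.mean()

    A, B = centered_dist(x), centered_dist(y)
    dcov2 = (A * B).sum() / float(n * n)    # squared distance covariance
    dvarx2 = (A * A).sum() / float(n * n)   # squared distance variances
    dvary2 = (B * B).sum() / float(n * n)

    dc = np.sqrt(dcov2)
    dr = dc / np.sqrt(np.sqrt(dvarx2 * dvary2))  # distance correlation
    return dc, dr, np.sqrt(dvarx2), np.sqrt(dvary2)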
if __name__ == '__main__':
    stop_person = 33
    if stop_person < 33:
        print('[warn] not using all persons!')

    # lda
    lda_results = load('results_valence_lda')
    if lda_results is None:
        print('[warn] rebuilding valence lda results cache')
        pool = Pool(processes=POOL_SIZE)
        lda_results = pool.map(ldaRankings, range(1, stop_person))
        pool.close()
        pool.join()
        dump(lda_results, 'results_valence_lda')

    lda_results = np.array(lda_results)
    train_accs = np.array(lda_results[:, 0])
    test_accs = np.array(lda_results[:, 1])
    genLDAReport(train_accs, test_accs)

    results = load('results_valence')
    if results is None:
        print('[warn] rebuilding valence results cache')
        pool = Pool(processes=POOL_SIZE)
        results = pool.map(getPersonRankings, range(1, stop_person))
        # results = pool.map(ldaRankings, range(1, stop_person))
        pool.close()
        pool.join()
        dump(results, 'results_valence')
if __name__ == "__main__":
    pool = Pool(processes=POOL_SIZE)
    results = pool.map(RFPerson, range(1, STOPPERSON + 1))
    pool.close()
    pool.join()

    dump(results, "RF_pers_specific")
    pprint(results)
def getAccs(self):
    to_ret = load('accs_all', path=self.ddpad)
    if to_ret is None:
        # train / test split
        X, X_test, y, y_test_cont = train_test_split(self.X, self.y_cont, test_size=8, random_state=17)
        X = np.array(X)
        X_test = np.array(X_test)
        y = np.array(y)
        y_test_cont = np.array(y_test_cont)

        y[y <= 5] = 0
        y[y > 5] = 1
        y_test = np.array(y_test_cont)
        y_test[y_test <= 5] = 0
        y_test[y_test > 5] = 1

        to_ret = []
        model = SVC(kernel='rbf', probability=True)
        for mindex, metric in enumerate(self.results):
            print('model' + str(0) + ' - metric' + str(mindex))

            featNames = np.array(self.featExtr.getFeatureNames())  # take a clean copy

            # sort features
            indices = np.array(np.argsort(metric)[::-1])
            # take top threshold
            indices = indices[:self.threshold]

            # old struct
            if mindex == 0:
                X, y = self.fixStructure(X, y)
                junk, y_test_cont = self.fixStructure(np.array(X_test), y_test_cont)
                X_test, y_test = self.fixStructure(X_test, y_test)

            # filter features
            X_model = np.array(X[:, indices])
            featNames = featNames[indices]

            best_feat, best_featNames = [], []
            all_scores, all_stds = [], []
            best_score, best_std = 0, 0
            for i in range(self.threshold):
                to_keep = best_feat[:]
                to_keep.append(i)
                X_temp = np.array(X_model[:, to_keep])

                # get scores; the folds run on the per-person structure
                run_scores = []
                X_temp, y = self.reverseFixStructure(X_temp, y)
                for tr, te in KFold(n=len(X_temp), n_folds=5, shuffle=True, random_state=17):
                    X_t, y_t = self.fixStructure(X_temp[tr], y[tr])
                    X_te, y_te = self.fixStructure(X_temp[te], y[te])
                    model.fit(X_t, y_t)
                    run_scores.append(self.accuracy(model.predict(X_te), y_te))
                X_temp, y = self.fixStructure(X_temp, y)

                new_score = np.average(run_scores)
                new_std = np.std(run_scores)
                all_scores.append(new_score)
                all_stds.append(new_std)

                # better?
                if new_score - new_std > best_score - best_std:
                    best_score = new_score
                    best_std = new_std
                    best_feat = to_keep
                    best_featNames.append(featNames[i])

            # get test score => old struct :D
            model.fit(X_model[:, best_feat], y)
            # match the ranked-feature space used for training
            # (the original indexed X_test with best_feat directly, which picks
            # positions in the unranked feature order)
            X_model_test = np.array(X_test[:, indices][:, best_feat])
            test_pred = model.predict(X_model_test)
            test_prob = model.predict_proba(X_model_test)
            test_acc = self.accuracy(test_pred, y_test)

            to_ret.append([best_feat, best_featNames, best_score, best_std,
                           all_scores, all_stds, indices, test_acc,
                           test_pred, test_prob, y_test, y_test_cont])

        X, y = self.reverseFixStructure(X, y)
        X_test, y_test = self.reverseFixStructure(X_test, y_test)
        dump(to_ret, 'accs_all', path=self.ddpad)
    return to_ret
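
# fixStructure / reverseFixStructure are defined elsewhere. From their use
# above and in getMetrics below, fixStructure appears to flatten per-person
# arrays (persons x videos x features) into one flat sample matrix, and
# reverseFixStructure to regroup it per person. A minimal sketch under that
# assumption; the equal group size per person is also assumed:
def fixStructure_sketch(X_per_person, y_per_person):
    X_flat = np.concatenate(X_per_person, axis=0)
    y_flat = np.concatenate(y_per_person, axis=0)
    return X_flat, y_flat

def reverseFixStructure_sketch(X_flat, y_flat, n_persons=32):
    X_groups = np.array(np.split(X_flat, n_persons))
    y_groups = np.array(np.split(y_flat, n_persons))
    return X_groups, y_groups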
def getMetrics(self):
    metrics = load("all_metrics", path=self.ddpad)
    if metrics is None:
        X, y_cont = self.fixStructure(self.X, self.y_cont)
        y_disc = np.array(y_cont)
        y_disc[y_disc <= 5] = 0
        y_disc[y_disc > 5] = 1

        metrics = []

        # pearson
        corr = []
        for index in range(len(X[0])):
            corr.append(pearsonr(X[:, index], y_cont)[0])
        metrics.append(corr)

        # mutual information & distance correlation
        mi = []
        dcorr = []
        for feature in np.transpose(X):
            # normalized mutual information; the marginal entropies are taken from
            # the contingency table (the original passed the same two arguments to
            # entropy() for both entX and entY, a copy-paste bug)
            c_xy = np.histogram2d(feature, y_cont, 2)[0]
            entX = entropy(np.sum(c_xy, axis=1))
            entY = entropy(np.sum(c_xy, axis=0))
            nMutInf = mutual_info_score(None, None, contingency=c_xy) / float(np.sqrt(entX * entY))
            mi.append(nMutInf)

            # distance correlation
            dc, dr, dvx, dvy = self.dcov_all(feature, y_cont)
            dcorr.append(dr)
        metrics.append(mi)
        metrics.append(dcorr)

        # linear regression
        lr = LinearRegression(n_jobs=-1)
        lr.fit(X, y_cont)
        metrics.append(lr.coef_)

        # lasso regression: pick alpha by 5-fold CV
        alphas = [0.03, 0.1, 0.3, 1, 3, 10]
        best_alpha = 0.01
        best_acc = 0
        for train_index, test_index in KFold(len(y_cont), n_folds=5):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y_cont[train_index], y_cont[test_index]
            lasso = Lasso(alpha=best_alpha)
            lasso.fit(X_train, y_train)
            pred = lasso.predict(X_test)
            best_acc += self.accuracy(pred, y_test)  # was accuracy(pred, y_cont): wrong target
        best_acc /= float(5)
        for alpha in alphas:
            acc = 0
            for train_index, test_index in KFold(len(y_cont), n_folds=5):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y_cont[train_index], y_cont[test_index]
                lasso = Lasso(alpha=alpha)
                lasso.fit(X_train, y_train)
                pred = lasso.predict(X_test)
                acc += self.accuracy(pred, y_test)
            acc /= float(5)
            if acc > best_acc:
                best_acc = acc
                best_alpha = alpha
        lasso = Lasso(alpha=best_alpha)
        lasso.fit(X, y_cont)
        metrics.append(lasso.coef_)

        # ridge regression: same alpha search
        alphas = [0.03, 0.1, 0.3, 1, 3, 10]
        best_alpha = 0.01
        best_acc = 0
        for train_index, test_index in KFold(len(y_cont), n_folds=5):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y_cont[train_index], y_cont[test_index]
            ridge = Ridge(alpha=best_alpha)
            ridge.fit(X_train, y_train)
            pred = ridge.predict(X_test)
            best_acc += self.accuracy(pred, y_test)
        best_acc /= float(5)
        for alpha in alphas:
            acc = 0
            for train_index, test_index in KFold(len(y_cont), n_folds=5):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y_cont[train_index], y_cont[test_index]
                ridge = Ridge(alpha=alpha)
                ridge.fit(X_train, y_train)
                pred = ridge.predict(X_test)  # was lasso.predict: copy-paste bug
                acc += self.accuracy(pred, y_test)
            acc /= float(5)
            if acc > best_acc:
                best_acc = acc
                best_alpha = alpha
        ridge = Ridge(alpha=best_alpha)
        ridge.fit(X, y_cont)
        metrics.append(ridge.coef_)

        # SVM
        clf = SVC(kernel='linear')
        clf.fit(X, y_disc)
        svm_weights = (clf.coef_ ** 2).sum(axis=0)
        svm_weights /= float(svm_weights.max())
        metrics.append(svm_weights)

        # random forest importances, averaged over several runs
        importances = []
        for run in range(self.runs):
            forest = RandomForestClassifier(
                n_estimators=2000,
                max_features='auto',
                criterion='gini',
                n_jobs=-1,
            )
            forest.fit(X, y_disc)
            importances.append(forest.feature_importances_)
            # stds.append(np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0))
        metrics.append(np.average(importances, axis=0))
        metrics.append(np.std(importances, axis=0))

        # a second estimate from a single fit: mean and per-tree std of the
        # importances (the original called a non-existent forest.getImportance()
        # on data regrouped with reverseFixStructure, which cannot run)
        forest.fit(X, y_disc)
        importances = forest.feature_importances_
        stds = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)
        metrics.append(importances)
        metrics.append(stds)

        # ANOVA
        anova = SelectKBest(f_regression, k=self.threshold)
        anova.fit(X, y_disc)
        selected_features = anova.get_support()
        metrics.append(selected_features)

        # linear discriminant analysis
        lda = LinearDiscriminantAnalysis(n_components=1)
        lda.fit(X, y_disc)
        metrics.append(lda.coef_[0])

        # principal component analysis
        pca = PCA(n_components=1)
        pca.fit(X)
        metrics.append(pca.components_[0])

        # absolute values
        metrics = np.absolute(np.array(metrics))
        dump(metrics, 'all_metrics', path=self.ddpad)
    return np.array(metrics)
def ldaRankings(person):
    FOLDS = 5

    # load all features & keep them in memory
    y_cont = load('cont_y_p' + str(person))
    if y_cont is None:
        print('[Warn] Rebuilding cache - person ' + str(person))
        classificator = Classificators.ContValenceClassificator()
        featExtr = getFeatures()
        personLdr = personLoader.NoTestsetLoader(classificator, featExtr)
        X, y_cont = personLdr.load(person)
        dump(X, 'X_p' + str(person))
        dump(y_cont, 'cont_y_p' + str(person))
    else:
        X = load('X_p' + str(person))

    X = np.array(X)
    y_cont = np.array(y_cont)
    y_disc = np.array(y_cont)  # copy; the original aliased y_cont and mutated it in place
    y_disc[y_disc <= 5] = 0
    y_disc[y_disc > 5] = 1

    for index, val in enumerate(np.std(X, axis=0)):
        if val == 0:
            print('warning: zero std for feature index ', index,
                  ' (', personLoader.featureExtractor.getFeatureNames()[index], ')')

    # manual feature standardization
    X = X - np.average(X, axis=0)
    X = np.true_divide(X, np.std(X, axis=0))

    # despite the names, these accumulate fold-averaged *accuracies*
    feat_test_error = []
    feat_train_error = []

    # baseline: physiological features only
    X_temp = X[:, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]]
    feat_test_error.append(0)
    feat_train_error.append(0)
    for train_index, test_index in KFold(len(y_disc), n_folds=FOLDS, random_state=17, shuffle=True):
        X_train, X_test = X_temp[train_index], X_temp[test_index]
        y_train, y_test = y_disc[train_index], y_disc[test_index]

        clf = LDA(shrinkage='auto', solver='lsqr')
        clf.fit(X_train, y_train)

        feat_train_error[0] += accuracy(clf.predict(X_train), y_train) / float(FOLDS)
        feat_test_error[0] += accuracy(clf.predict(X_test), y_test) / float(FOLDS)
    print("train: " + str(feat_train_error[0]) + " test: " + str(feat_test_error[0]))

    # add each remaining feature to the baseline, one at a time
    for feat_index in range(10, 30):  # len(X[0, :])
        index = feat_index - 9
        X_temp = X[:, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, feat_index]]
        feat_test_error.append(0)
        feat_train_error.append(0)
        for train_index, test_index in KFold(len(y_disc), n_folds=FOLDS, random_state=17, shuffle=True):
            X_train, X_test = X_temp[train_index], X_temp[test_index]
            y_train, y_test = y_disc[train_index], y_disc[test_index]

            clf = LDA(shrinkage='auto', solver='lsqr')
            clf.fit(X_train, y_train)

            feat_train_error[index] += accuracy(clf.predict(X_train), y_train) / float(FOLDS)
            feat_test_error[index] += accuracy(clf.predict(X_test), y_test) / float(FOLDS)
        print("train: " + str(feat_train_error[index]) + " test: " + str(feat_test_error[index]))

    return [feat_test_error, feat_train_error]
def getPersonRankings(person):
    # load all features & keep them in memory
    y_cont = load('cont_y_p' + str(person))
    if y_cont is None:
        print('[Warn] Rebuilding cache - person ' + str(person))
        classificator = Classificators.ContValenceClassificator()
        featExtr = getFeatures()
        personLdr = personLoader.NoTestsetLoader(classificator, featExtr)
        X, y_cont = personLdr.load(person)
        dump(X, 'X_p' + str(person))
        dump(y_cont, 'cont_y_p' + str(person))
    else:
        X = load('X_p' + str(person))

    X = np.array(X)
    y_cont = np.array(y_cont)
    y_disc = np.array(y_cont)  # copy; the original aliased y_cont and mutated it in place
    y_disc[y_disc <= 5] = 0
    y_disc[y_disc > 5] = 1

    for index, val in enumerate(np.std(X, axis=0)):
        if val == 0:
            print('warning: zero std for feature index ', index,
                  ' (', personLoader.featureExtractor.getFeatureNames()[index], ')')

    # manual feature standardization
    X = X - np.average(X, axis=0)
    X = np.true_divide(X, np.std(X, axis=0))

    # statistical tests: pearson
    corr = []
    for index in range(len(X[0])):
        corr.append(pearsonr(X[:, index], y_cont))

    # model based: normal regression
    lr = LinearRegression(n_jobs=-1)
    lr.fit(X, y_cont)
    lr_scores = lr.coef_

    # l1 regression: pick alpha by 5-fold CV
    alphas = [0.03, 0.1, 0.3, 1, 3, 10]
    best_alpha = 0.01
    best_acc = 0
    for train_index, test_index in KFold(len(y_cont), n_folds=5):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y_cont[train_index], y_cont[test_index]
        lasso = Lasso(alpha=best_alpha)
        lasso.fit(X_train, y_train)
        pred = lasso.predict(X_test)
        best_acc += accuracy(pred, y_test)  # was accuracy(pred, y_cont): wrong target
    best_acc /= float(5)
    for alpha in alphas:
        acc = 0
        for train_index, test_index in KFold(len(y_cont), n_folds=5):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y_cont[train_index], y_cont[test_index]
            lasso = Lasso(alpha=alpha)
            lasso.fit(X_train, y_train)
            pred = lasso.predict(X_test)
            acc += accuracy(pred, y_test)
        acc /= float(5)
        if acc > best_acc:
            best_acc = acc
            best_alpha = alpha
    lasso = Lasso(alpha=best_alpha)
    lasso.fit(X, y_cont)
    l1_scores = lasso.coef_

    # l2 regression: same alpha search
    alphas = [0.03, 0.1, 0.3, 1, 3, 10]
    best_alpha = 0.01
    best_acc = 0
    for train_index, test_index in KFold(len(y_cont), n_folds=5):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y_cont[train_index], y_cont[test_index]
        ridge = Ridge(alpha=best_alpha)
        ridge.fit(X_train, y_train)
        pred = ridge.predict(X_test)
        best_acc += accuracy(pred, y_test)
    best_acc /= float(5)
    for alpha in alphas:
        acc = 0
        for train_index, test_index in KFold(len(y_cont), n_folds=5):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y_cont[train_index], y_cont[test_index]
            ridge = Ridge(alpha=alpha)
            ridge.fit(X_train, y_train)
            pred = ridge.predict(X_test)  # was lasso.predict: copy-paste bug
            acc += accuracy(pred, y_test)
        acc /= float(5)
        if acc > best_acc:
            best_acc = acc
            best_alpha = alpha
    ridge = Ridge(alpha=best_alpha)
    ridge.fit(X, y_cont)
    l2_scores = ridge.coef_

    # svm coefficients
    clf = svm.SVC(kernel='linear')
    clf.fit(X, y_disc)
    svm_weights = (clf.coef_ ** 2).sum(axis=0)
    svm_weights /= float(svm_weights.max())

    # rf importances: grow forest
    forest = RandomForestClassifier(
        n_estimators=3000,
        max_features='auto',
        criterion='gini',
        n_jobs=-1,
    )
    forest.fit(X, y_disc)
    importances = forest.feature_importances_
    # std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)

    # pca coefficients
    pca = PCA(n_components=1)
    pca.fit(X)
    pca_coef = pca.components_[0]

    # [pearson_r, mutual inf, max inf, dist, l1 coef, l2 coef, svm coef, rf importances, coord search]
    # featExtr = getFeatures()
    # featnames = featExtr.getFeatureNames()
    pers_results = []
    for corr_i, lr_i, l1_i, l2_i, svm_i, imp_i, pca_i in zip(
            corr, lr_scores, l1_scores, l2_scores, svm_weights, importances, pca_coef):
        pers_results.append([np.abs(corr_i[0]), np.abs(lr_i), np.abs(l1_i),
                             np.abs(l2_i), np.abs(svm_i), np.abs(imp_i), np.abs(pca_i)])

    return pers_results  # , featExtr.featureExtrs
    # run model
    results = model.run()
    return results


def arousalWorker(criterion, treecount, threshold):
    featExtr = getFeatures()

    # create classificator
    classificator = Classificators.ArousalClassificator()

    # create personloader
    personLdr = personLoader.NoTestsetLoader(classificator, featExtr)

    # run model
    model = models.RFModel(personLoader=personLdr, criterion=criterion,
                           treeCount=treecount, threshold=threshold)
    results = model.run()  # was missing: `results` was returned but never assigned
    return results


if __name__ == '__main__':
    treeCount = 2000
    threshold = 0.002

    reporter = reporters.HTMLRFModelReporter()

    results = load('to_keep')
    if results is None:
        print("[warn] rebuilding cache")
        results = valenceWorker('gini', treeCount, threshold)
        dump(results, 'to_keep')

    reporter.genReport(results)
def RFPerson(person):
    print("person: " + str(person))

    # load X, y: load all features & keep them in memory
    featExtr = getFeatures()
    featureNames = np.array(featExtr.getFeatureNames())

    y_cont = load("cont_y_p" + str(person))
    if y_cont is None:
        print("[Warn] Rebuilding cache - person " + str(person))
        X, y_cont = personLoader.NoTestsetLoader(
            classificator=Classificators.ContValenceClassificator(),
            featExtractor=featExtr
        ).load(person)
        dump(X, "X_p" + str(person))
        dump(y_cont, "cont_y_p" + str(person))
    else:
        X = load("X_p" + str(person))

    y_disc = np.array(y_cont)
    y_disc[y_disc <= 5] = 0
    y_disc[y_disc > 5] = 1

    # manual feature standardization
    X = X - np.average(X, axis=0)
    X = np.true_divide(X, np.std(X, axis=0))

    # step 1: determine importances using an RF forest
    indices_step1, featureNames_step1 = step1(X, y_disc, featureNames)
    featureNames = np.array(featureNames_step1)
    indices = np.array(indices_step1)

    # filter features (X) based on the results from step 1
    X = X[:, indices]

    # step 2 - interpretation
    featCount_inter, score_inter, std_inter = step2_interpretation(X, y_disc, featureNames)
    indices_inter = indices[:featCount_inter]
    featureNames_inter = featureNames[indices_inter]

    # step 2 - prediction
    indices_pred, score_pred, std_pred = step2_prediction(X, y_disc, featureNames)
    featureNames_pred = featureNames[indices_pred]

    print(
        "[" + str(person) + "] interpretation - score: " + str(score_inter) +
        " (" + str(std_inter) + ") prediction - score: " + str(score_pred) +
        " - " + str(std_pred)
    )

    to_ret = [
        [featCount_inter, score_inter, std_inter, featureNames_inter],
        [len(indices_pred), score_pred, std_pred, featureNames_pred],
    ]
    dump(to_ret, "rf_P" + str(person))

    return to_ret
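
# step1, step2_interpretation and step2_prediction are defined elsewhere.
# A minimal sketch of step1 consistent with its use above (rank features by
# random-forest importance and return the ranked order plus the reordered
# names); the tree count is an assumption:
def step1_sketch(X, y_disc, featureNames, n_estimators=1000):
    forest = RandomForestClassifier(
        n_estimators=n_estimators,
        max_features='auto',
        criterion='gini',
        n_jobs=-1
    )
    forest.fit(X, y_disc)
    indices = np.argsort(forest.feature_importances_)[::-1]  # best first
    return indices, list(np.array(featureNames)[indices])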