Example #1
    def __init__(self, personLoader,person):
        print("loading person ",person)
        AModel.__init__(self,personLoader)

        classificatorName = str(self.personLoader.classificator.name)

        #load all features & keep them in memory
        self.y = load('global_y_per_person' + classificatorName +'_p' + str(person))
        if self.y is None:
            print('[Warn] Rebuilding cache')
            self.X, self.y = self.personLoader.load(person)
            dump(self.X,'global_X_per_person_p' + str(person))
            dump(self.y,'global_y_per_person' + classificatorName + '_p' + str(person))
        else:
            self.X = load('global_X_per_person_p' +str(person))

        self.X = np.array(self.X)
        self.y = np.array(self.y)

        for index,val in enumerate(np.std(self.X,axis=0)):
            if val == 0:
                print('warning: zero std for feature index', index, '(' + personLoader.featureExtractor.getFeatureNames()[index] + ')')

        #manual Feature standardization
        self.X = self.X - np.average(self.X,axis=0)
        self.X = np.true_divide( self.X, np.std(self.X,axis=0) )
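
Every snippet on this page leans on load/dump cache helpers that are not shown. A minimal sketch of what they plausibly look like, assuming pickle files in a cache directory (the file naming and the path= keyword are inferred from the call sites):

import os
import pickle

CACHE_DIR = 'cache'  # assumed default; several snippets pass an explicit path=

def dump(obj, name, path=CACHE_DIR):
    # serialize obj to <path>/<name>.pkl
    os.makedirs(path, exist_ok=True)
    with open(os.path.join(path, name + '.pkl'), 'wb') as f:
        pickle.dump(obj, f)

def load(name, path=CACHE_DIR):
    # return the cached object, or None when the cache entry is missing
    fname = os.path.join(path, name + '.pkl')
    if not os.path.exists(fname):
        return None
    with open(fname, 'rb') as f:
        return pickle.load(f)

Returning None on a cache miss is what drives the rebuild branches above; note that once the cached value is a NumPy array, "value is None" is the correct test, since comparing an array with == None broadcasts elementwise.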
Example #2
def getAnalytics():
    t0 = time.time()

    feat_corr = load("valence_feat_pval")
    if feat_corr is None:
        feat_corr = valenceCorrelationWorker()
        dump(feat_corr, "valence_feat_pval")

    # here feat_corr has a list of correlations of each feature for each person => dim reduc & 3D plot
    X_3D = PCA(n_components=3).fit_transform(feat_corr)
    X_2D = PCA(n_components=2).fit_transform(feat_corr)

    fig = plt.figure(1, figsize=(4, 3))
    plt.clf()
    ax = Axes3D(fig, rect=[0, 0, 0.95, 1], elev=48, azim=134)
    ax.scatter(X_3D[:, 0], X_3D[:, 1], X_3D[:, 2])
    plt.title("valence correlations for different persons 3D")
    plt.show()

    plt.clf()
    plt.scatter(X_2D[:, 0], X_2D[:, 1])
    plt.title("valence correlations for different persons 2D")
    plt.show()

    t1 = time.time()
    print("valence complete, time spend: " + str(t1 - t0))

    feat_corr = load("arousal_feat_pval")
    if feat_corr is None:
        feat_corr = arousalCorrelationWorker()
        dump(feat_corr, "arousal_feat_pval")

    # here feat_corr has a list of correlations of each feature for each person => dim reduc & 3D plot
    X_3D = PCA(n_components=3).fit_transform(feat_corr)
    X_2D = PCA(n_components=2).fit_transform(feat_corr)

    fig = plt.figure(1, figsize=(4, 3))
    plt.clf()
    ax = Axes3D(fig, rect=[0, 0, 0.95, 1], elev=48, azim=134)
    ax.scatter(X_3D[:, 0], X_3D[:, 1], X_3D[:, 2])
    plt.title("arousal correlations for different persons 3D")
    plt.show()

    plt.clf()
    plt.scatter(X_2D[:, 0], X_2D[:, 1])
    plt.title("arousal correlations for different persons 2D")
    plt.show()

    t2 = time.time()
    print("arousal complete, time spend: " + str(t2 - t1))

    print("total time spend: " + str(t2 - t0))
Example #3
    def probVsDim(self):


        data = [
            [
                [] for i in range(len(self.accs[0][0]))
            ] for j in range(len(self.accs[0]))
        ]

        for person_index, person in enumerate(self.accs):
            for model_index, model in enumerate(person):
                for metric_index, metric in enumerate(model):
                    #[best_feat, best_featNames, best_score, best_std, all_scores, all_stds, indices, test_acc, test_pred, test_prob, y_test, y_test_cont]
                    for (pred_lbl, probs, test_lbl, value) in zip(metric[-4], metric[-3], metric[-2], metric[-1]):
                        if pred_lbl == test_lbl:
                            data[model_index][metric_index].append((
                                probs[pred_lbl],
                                np.abs(value - 5)
                                ))

        dump(data, "corrData", path=self.ddpad)

        f = open(self.rpad + "corrs.csv", 'w')

        # header line
        f.write('model;')
        for name in self.metricnames:
            f.write(name + ';')
        f.write('\n')

        #much efficiency wow </sarcasm>
        modelNames = ['SVM','RF']
        for model_index, model in enumerate(data):
            f.write(modelNames[model_index] + ';')

            for metric in model:
                probs = []
                dists = []
                for tup in metric:
                    probs.append(tup[0])
                    dists.append(tup[1])

                corr = pearsonr(dists, probs)[0]
                f.write(str(corr) + ';')

            f.write('\n')
        f.close()
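
The hand-rolled f.write(...';') table is fragile (a forgotten separator shifts every later column); the standard csv module produces the same semicolon-separated file. A sketch assuming the same data layout as probVsDim above:

import csv
import numpy as np
from scipy.stats import pearsonr

def write_corrs(path, data, metricnames, modelNames=('SVM', 'RF')):
    # data[model][metric] is a list of (probability, |value - 5|) tuples
    with open(path, 'w', newline='') as f:
        w = csv.writer(f, delimiter=';')
        w.writerow(['model'] + list(metricnames))
        for name, model in zip(modelNames, data):
            row = [name]
            for metric in model:
                probs, dists = np.array(metric).T
                row.append(pearsonr(dists, probs)[0])
            w.writerow(row)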
Example #4
    def run(self,person, criterion):
        #http://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html
        classificatorName = str(self.personLoader.classificator.name)

        #load all features & keep them in memory
        y = load('global_y_per_person' + classificatorName +'_p' + str(person))
        if y is None:
            print('[Warn] Rebuilding cache')
            X, y = self.personLoader.load(person)
            dump(X,'global_X_per_person_p' + str(person))
            dump(y,'global_y_per_person' + classificatorName + '_p' + str(person))
        else:
            X = load('global_X_per_person_p' +str(person))

        #grow forest
        forest = RandomForestClassifier(
            n_estimators=5000,
            max_features='auto',
            criterion=criterion,
            n_jobs=-1,
            random_state=0
        )

        normalize(X, copy=False)  # note: sklearn's normalize scales samples (rows) to unit norm, not features

        #fit forest
        forest.fit(X,y)

        #get importances
        importances = forest.feature_importances_
        std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis = 0)

        indices = np.argsort(importances)[::-1]
        featNames = self.personLoader.featureExtractor.getFeatureNames()

        return {
                'classificatorName'  : classificatorName,
                'featNames'          : featNames,
                'importances'        : importances,
                'std'                : std,
                'indices'            : indices, #[ index of first, index of second] ...
                'criterion'          : criterion
                }
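
The importance/std/indices triple above is the standard scikit-learn pattern; it runs end to end on synthetic data like this (a minimal sketch, not the author's loader):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=200, n_features=10, n_informative=3, random_state=0)

forest = RandomForestClassifier(n_estimators=500, criterion='gini', n_jobs=-1, random_state=0)
forest.fit(X, y)

importances = forest.feature_importances_
# per-feature spread of the importance across the individual trees
std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)
indices = np.argsort(importances)[::-1]  # feature indices, most important first

for i in indices[:3]:
    print('feature %d: %.3f (+/- %.3f)' % (i, importances[i], std[i]))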
Example #5
    def probVsDim(self):

        data = [ [] for j in range(len(self.accs)) ]

        for metric_index, metric in enumerate(self.accs):
            # [best_feat, best_featNames, best_score, best_std, all_scores, all_stds, indices, test_acc, test_pred, test_prob, y_test, y_test_cont]
            for (pred_lbl, probs, test_lbl, value) in zip(metric[-4], metric[-3], metric[-2], metric[-1]):
                if pred_lbl == test_lbl:
                    data[metric_index].append([
                            probs[pred_lbl],
                            np.abs(value - 5)
                        ])

        dump(data, "corrData", path=self.ddpad)

        f = open(self.rpad + "corrs.csv", 'w')

        # header line
        f.write('metric;')
        for name in self.metricnames:
            f.write(name + ';')
        f.write('\n')

        # much efficiency wow </sarcasm>
        f.write('SVM' + ';')

        for metric in data:
            metric = np.array(metric)

            probs = metric[:,0]
            dists = metric[:,1]

            corr = pearsonr(dists, probs)[0]
            f.write(str(corr) + ';')

        f.write('\n')
        f.close()
Example #6
    def run(self,criterion):
        #http://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html
        classificatorName = str(self.personLoader.classificator.name)
        featNames = np.array(self.personLoader.featureExtractor.getFeatureNames())

        #load all features & keep them in memory
        y = load('global_y_allpersons' + classificatorName)
        if y is None:
            print('[Warn] Rebuilding cache')
            X, y = self.personLoader.load()
            dump(X,'global_X_allpersons')
            dump(y,'global_y_allpersons' + classificatorName)
        else:
            X = load('global_X_allpersons')

        self.getIntermediateResult(criterion,X,y,'all features')

        #step1
        importances, std, indices = self.step1(criterion,X,y)
        #step2
        indices = self.step2(indices)
        self.getIntermediateResult(criterion,X[:,indices],y,'80%')

        #step 3 add features one by one
        acc_list, best_count, best_metric = self.step3(criterion,indices,X,y,10,featNames)



        return {
                'classificatorName'  : classificatorName,
                'featNames'          : self.personLoader.featureExtractor.getFeatureNames(),
                'global_importances' : importances,
                'global_std'         : std,
                'global_indices'     : indices, #[ index of first, index of second] ...
                'criterion'          : criterion
                }
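
step1/step2/step3 are defined elsewhere; judging from the '80%' label passed to getIntermediateResult, step2 presumably truncates the ranked feature list once the cumulative importance reaches 80% of the total. A sketch of that reading (an assumption about the author's step2, not their code):

import numpy as np

def step2(importances, indices, coverage=0.80):
    # keep the highest-ranked features whose cumulative importance
    # reaches `coverage` of the total importance mass
    ordered = importances[indices]
    cum = np.cumsum(ordered) / np.sum(ordered)
    keep = np.searchsorted(cum, coverage) + 1
    return indices[:keep]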
Example #7
    def run(self):

        for person in range(1, self.stopperson + 1):
            # load person data
            y_cont = load('cont_y_p' + str(person), path=self.ddpad)
            if y_cont is None:
                print('[Warn] Rebuilding cache -  person ' + str(person))
                personLdr = personLoader.NoTestsetLoader(self.classifier, self.featExtr)

                X, y_cont = personLdr.load(person)

                dump(X, 'X_p' + str(person), path=self.ddpad)
                dump(y_cont, 'cont_y_p' + str(person), path=self.ddpad)
            else:
                X = load('X_p' + str(person), path=self.ddpad)

            # manual Feature standardization
            X = X - np.average(X, axis=0)
            X = np.true_divide(X, np.std(X, axis=0))

            self.X[person - 1] = np.array(X)
            self.y_cont[person - 1] = np.array(y_cont)


            y_disc = np.array(y_cont)
            y_disc[y_disc <= 5] = 0
            y_disc[y_disc > 5 ] = 1
            self.y_disc[person - 1] = y_disc

        self.results = self.getMetrics()
        self.genReport()

        self.accs = self.getAccs()

        self.probVsDim()
        self.genAccReport()
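
The pair of in-place assignments that discretizes the ratings is equivalent to a one-liner, which avoids mutating the source array at all:

import numpy as np

y_cont = np.array([1.0, 4.9, 5.0, 5.1, 9.0])
y_disc = (y_cont > 5).astype(int)  # ratings above the midpoint become class 1
print(y_disc)  # [0 0 0 1 1]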
Example #8
    def run(self):
        #create 32 classifiers
        print('initialising the 32 classifiers ...')
        stop_person = 33

        if stop_person < 33:
            print("[warn] not using all persons")

        pool = Pool(processes=POOL_SIZE)
        self.classifiers = pool.map( self.initClassifiers, range(1,stop_person) )
        pool.close()
        pool.join()

        to_keep = {'all_orig_featureNames' : self.classifiers[0].personLoader.featureExtractor.getFeatureNames(),
            'criterion': self.classifiers[0].criterion,
            'classificatorName' : self.classifiers[0].personLoader.classificator.name,
            'stop_person' : stop_person,
            'threshold'   : self.threshold
        }

        #step 1: variable ranking: get all importances of the different features
        print('step 1: getting importances')
        importances = load('importances_once')
        if importances is None:
            print("[warn] rebuilding importances")
            pool = Pool(processes=POOL_SIZE)
            importances = np.array( pool.map( self.getImportance, range(1,stop_person) ))
            pool.close()
            pool.join()
            dump(importances, 'importances_once')
        to_keep['all_importances'] = importances


        avg_importances = np.average(importances[:,:,1], axis=0)
        std_importances = [ np.std(importances[:,i,1]) for i in range(len(importances[0]))]
        to_keep['avg_importances'] = avg_importances
        to_keep['std_importances'] = std_importances

        #step 2: elimination: keep only the features whose average importance exceeds the threshold
        print('step 2: filtering features -  threshold')
        indexes_to_keep = [i for i, val in enumerate(avg_importances) if val > self.threshold]

        for c in self.classifiers: c.filterFeatures(indexes_to_keep)
        avg_importances = avg_importances[indexes_to_keep] #cleanup memory

        to_keep['step1_featureNames'] = self.classifiers[0].personLoader.featureExtractor.getFeatureNames()
        to_keep['step1_avg_importances'] = avg_importances

        #sort features
        indices = np.array(np.argsort(avg_importances)[::-1])
        for c in self.classifiers: c.filterFeatures(indices)

        #add features one by one and select smallest, lowest oob model
        print('building tree')
        highest_oob_score = 0
        highest_index = 1
        oob_scores = []
        for i in range(1,len(indices)+1):
            self.count = i

            pool = Pool(processes=POOL_SIZE)
            oob_errors = pool.map( self.getOOBErrors, range(1,stop_person) )
            pool.close()
            pool.join()
            avg_oob = np.average(oob_errors, axis=0)

            print("feat Count: " + str(i) + " - error: " + str(avg_oob))
            oob_scores.append(avg_oob)

            if avg_oob > highest_oob_score:
                #TODO std
                highest_oob_score = avg_oob
                highest_index = i

        to_keep['step2_oob_scores'] = oob_scores

        #select the smallest feature set with the best oob value
        for c in self.classifiers: c.filterFeatures(list(range(highest_index)))
        to_keep['step3_size'] = highest_index
        to_keep['step3_oob']  = highest_oob_score

        print("selected tree size:" + str(highest_index) + ' - oob: ' + str(highest_oob_score))

        print('final building phase')
        #restart with an empty tree and add features one by one, only keep features that decrease the error with a certain threshold
        prev_oob = 0
        used_features = []
        used_accs = []
        for i in range(highest_index):
            self.count = i + 1

            pool = Pool(processes=POOL_SIZE)
            oob_errors = pool.map( self.getOOBErrors, range(1,stop_person) )
            pool.close()
            pool.join()

            avg_oob = np.average(oob_errors, axis=0)
            print("feat Count: " + str(i) + " - error: " + str(avg_oob))

            if avg_oob - self.threshold > prev_oob or prev_oob == 0:
                prev_oob = avg_oob
                used_features.append(i)
                used_accs = oob_errors

        to_keep['step4_used_accs'] = used_accs
        to_keep['step4_features']  = used_features
        for c in self.classifiers: c.filterFeatures(used_features)  #keep only the features retained in the final pass
        to_keep['step4_featureNames'] = self.classifiers[0].personLoader.featureExtractor.getFeatureNames()

        dump(to_keep,'to_keep')
        return to_keep
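
getOOBErrors is not shown; scikit-learn forests can report an out-of-bag estimate directly via oob_score=True, which is presumably what it wraps. A minimal sketch:

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=300, n_features=20, random_state=0)

forest = RandomForestClassifier(n_estimators=500, oob_score=True, n_jobs=-1, random_state=0)
forest.fit(X, y)

print('OOB accuracy:', forest.oob_score_)        # accuracy on out-of-bag samples
print('OOB error   :', 1.0 - forest.oob_score_)

Note that the selection loop above keeps the highest avg_oob, so despite the "error" wording in the prints these values behave as scores.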
Example #9
    def getAccs(self, person):
        print('getting accs for person ' + str(person))

        to_ret = load('accs_p' + str(person), path = self.ddpad)
        if to_ret is None:

            # load person data
            # load all features & keep them in memory
            y = load('cont_y_p' + str(person), path=self.ddpad)
            if y is None:
                print('[Warn] Rebuilding cache -  person ' + str(person))
                personLdr = personLoader.NoTestsetLoader(self.classifier, self.featExtr)

                X, y = personLdr.load(person)

                dump(X, 'X_p' + str(person), path=self.ddpad)
                dump(y, 'cont_y_p' + str(person), path=self.ddpad)
            else:
                X = load('X_p' + str(person), path=self.ddpad)

            y = np.array(y)


            # manual Feature standardization
            X = X - np.average(X, axis=0)
            X = np.true_divide(X, np.std(X, axis=0))

            # train / test split
            X, X_test, y, y_test_cont = train_test_split(X,y,test_size=10, random_state=17)

            y[y <= 5] = 0
            y[y >  5] = 1

            y_test = np.array(y_test_cont)
            y_test[y_test <= 5] = 0
            y_test[y_test > 5] = 1

            to_ret = []

            for model_index, model in enumerate([
                #SVC(kernel='linear'),
                SVC(kernel='rbf')

                #KNN(n_neighbors=3),
                #KNN(n_neighbors=5),
                #KNN(n_neighbors=7),

                #lda needs two features

                #RandomForestClassifier(
                #    n_estimators=1000,
                #    max_features='auto',
                #    criterion='gini',
                #    n_jobs=-1,
                #)
            ]):
                model_to_ret = []
                for metric in self.results[person-1]:
                    featNames = np.array(self.featExtr.getFeatureNames()) #take clean copy

                    #sort features
                    indices = np.array(np.argsort(metric)[::-1])
                    #take top threshold
                    indices = indices[:self.threshold]

                    #apply
                    X_model = np.array(X[:,indices])
                    X_model_test = np.array(X_test[:,indices])
                    featNames = featNames[indices]

                    best_feat, best_featNames = [], []
                    all_scores, all_stds = [],[]
                    best_score, best_std = 0, 0
                    for i in range(self.threshold):
                        to_keep = best_feat[:]
                        to_keep.append(i)

                        X_temp = np.array(X_model[:,to_keep])

                        # get scores
                        run_scores = []
                        for tr, te in KFold(n=len(X_temp), n_folds=5, shuffle=True, random_state=17):
                            model.fit(X_temp[tr], y[tr])
                            run_scores.append(self.accuracy(model.predict(X_temp[te]), y[te]))

                        new_score = np.average(run_scores)
                        new_std = np.std(run_scores)

                        all_scores.append(new_score)
                        all_stds.append(new_std)

                        # better?
                        if new_score - new_std > best_score - best_std:
                            best_score = new_score
                            best_std = new_std
                            best_feat = to_keep
                            best_featNames.append(featNames[i])

                    #get test score
                    if model_index == 0:
                        test_model = SVC(kernel='rbf', probability=True)
                        test_model.fit(X_model[:,best_feat], y)

                        X_model_test = np.array(X_model_test[:,best_feat])
                        test_pred = test_model.predict(X_model_test)
                        test_prob = test_model.predict_proba(X_model_test)

                        test_acc = self.accuracy(test_model.predict(X_model_test), y_test)
                    else:
                        test_model = RandomForestClassifier(
                            n_estimators=2000,
                            max_features='auto',
                            criterion='gini',
                            n_jobs=-1
                        )
                        test_model.fit(X_model[:, best_feat], y)

                        X_model_test = np.array(X_model_test[:, best_feat])
                        test_pred = test_model.predict(X_model_test)
                        test_prob = test_model.predict_proba(X_model_test)

                        test_acc = self.accuracy(test_model.predict(X_model_test), y_test)

                    model_to_ret.append([best_feat, best_featNames, best_score, best_std, all_scores, all_stds, indices, test_acc, test_pred, test_prob, y_test, y_test_cont])
                to_ret.append(model_to_ret)

            dump(to_ret, 'accs_p' + str(person), path = self.ddpad)

        return to_ret
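
One detail worth flagging in the test block: predict_proba only exists on an SVC constructed with probability=True (it fits an extra Platt-scaling step), which is why a fresh test_model is built instead of reusing the CV model:

from sklearn.datasets import make_classification
from sklearn.svm import SVC

X, y = make_classification(n_samples=100, n_features=5, random_state=0)

clf = SVC(kernel='rbf', probability=True)  # without probability=True, predict_proba raises
clf.fit(X, y)
print(clf.predict_proba(X[:3]))  # one row per sample: [P(class 0), P(class 1)]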
Example #10
    def runPers(self,person):
        #load person data
        pers_results = load('pers_res_p' + str(person), path=self.ddpad)
        if pers_results is None:
            pers_results = []

            # load all features & keep them in memory
            y_cont = load('cont_y_p' + str(person), path=self.ddpad)
            if y_cont is None:
                print('[Warn] Rebuilding cache -  person ' + str(person))
                personLdr = personLoader.NoTestsetLoader(self.classifier, self.featExtr)

                X, y_cont = personLdr.load(person)

                dump(X, 'X_p' + str(person), path=self.ddpad)
                dump(y_cont, 'cont_y_p' + str(person), path=self.ddpad)
            else:
                X = load('X_p' + str(person), path=self.ddpad)

            y_disc = np.array(y_cont)
            y_disc[y_disc <= 5] = 0
            y_disc[y_disc > 5] = 1

            # manual Feature standardization
            X = X - np.average(X, axis=0)
            X = np.true_divide(X, np.std(X, axis=0))

            #pearson
            corr = []
            for index in range(len(X[0])):
                corr.append( pearsonr(X[:, index], y_cont)[0] )
            pers_results.append(corr)

            #Mut inf
            #dcorr
            mi = []
            dcorr = []
            for feature in np.transpose(X):
                # normalized mutual information: MI over the joint histogram,
                # normalized by the marginal entropies of that same histogram
                c_xy = np.histogram2d(feature, y_cont, 2)[0]
                entX = entropy(c_xy.sum(axis=1))
                entY = entropy(c_xy.sum(axis=0))
                nMutInf = mutual_info_score(None, None, contingency=c_xy) / float(np.sqrt(entX * entY))
                mi.append(nMutInf)

                # Distance Correlation
                dc, dr, dvx, dvy = self.dcov_all(feature, y_cont)
                dcorr.append(dr)

            pers_results.append(mi)
            pers_results.append(dcorr)

            #Linear Regression
            lr = LinearRegression(n_jobs=-1)
            lr.fit(X, y_cont)
            pers_results.append(lr.coef_)

            #Lasso Regression
            alphas = [0.03, 0.1, 0.3, 1, 3, 10]
            best_alpha = 0.01
            best_acc = 0
            for train_index, test_index in KFold(len(y_cont), n_folds=5):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y_cont[train_index], y_cont[test_index]

                lasso = Lasso(alpha=best_alpha)
                lasso.fit(X_train, y_train)
                pred = lasso.predict(X_test)
                best_acc += self.accuracy(pred, y_test)
            best_acc /= float(5)

            for alpha in alphas:
                acc = 0
                for train_index, test_index in KFold(len(y_cont), n_folds=5):
                    X_train, X_test = X[train_index], X[test_index]
                    y_train, y_test = y_cont[train_index], y_cont[test_index]

                    lasso = Lasso(alpha=alpha)
                    lasso.fit(X_train, y_train)
                    pred = lasso.predict(X_test)
                    acc += self.accuracy(pred, y_test)

                acc /= float(5)
                if acc > best_acc:
                    best_acc = acc
                    best_alpha = alpha

            lasso = Lasso(alpha=best_alpha)
            lasso.fit(X, y_cont)
            pers_results.append(lasso.coef_)

            #Ridge Regression
            alphas = [0.03, 0.1, 0.3, 1, 3, 10]
            best_alpha = 0.01
            best_acc = 0
            for train_index, test_index in KFold(len(y_cont), n_folds=5):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y_cont[train_index], y_cont[test_index]

                ridge = Ridge(alpha=best_alpha)
                ridge.fit(X_train, y_train)
                pred = ridge.predict(X_test)
                best_acc += self.accuracy(pred, y_test)
            best_acc /= float(5)

            for alpha in alphas:
                acc = 0
                for train_index, test_index in KFold(len(y_cont), n_folds=5):
                    X_train, X_test = X[train_index], X[test_index]
                    y_train, y_test = y_cont[train_index], y_cont[test_index]

                    ridge = Ridge(alpha=alpha)
                    ridge.fit(X_train, y_train)
                    pred = ridge.predict(X_test)
                    acc += self.accuracy(pred, y_test)

                acc /= float(5)
                if acc > best_acc:
                    best_acc = acc
                    best_alpha = alpha

            ridge = Ridge(alpha=best_alpha)
            ridge.fit(X, y_cont)
            pers_results.append(ridge.coef_)

            #SVM
            clf = SVC(kernel='linear')
            clf.fit(X, y_disc)
            svm_weights = (clf.coef_ ** 2).sum(axis=0)
            svm_weights /= float(svm_weights.max())
            pers_results.append(svm_weights)

            #Random Forests
            #rf importances
            #grow forest

            importances = []
            for run in range(self.runs):

                forest = RandomForestClassifier(
                    n_estimators=2000,
                    max_features='auto',
                    criterion='gini',
                    n_jobs=-1,
                )
                forest.fit(X,y_disc)
                importances.append(forest.feature_importances_)
                #stds.append( np.std([tree.feature_importances_ for tree in forest.estimators_], axis = 0) )


            pers_results.append(np.average(importances, axis=0))
            pers_results.append(np.std(importances, axis=0))

            #ANOVA
            anova = SelectKBest(f_regression, k=self.threshold)
            anova.fit(X,y_disc)
            selected_features = anova.get_support()
            pers_results.append(selected_features)

            #Linear Discriminant Analysis
            lda = LinearDiscriminantAnalysis(n_components=1)
            lda.fit(X,y_disc)
            pers_results.append(lda.coef_[0])

            #Principal Component Analysis
            pca = PCA(n_components=1)
            pca.fit(X)
            pers_results.append(pca.components_[0])

            #absolute values
            pers_results = np.absolute(np.array(pers_results))

            dump(pers_results, 'pers_res_p' + str(person), path=self.ddpad)

        return np.array(pers_results)
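
The normalized-mutual-information block (with the fix above, taking both marginal entropies from the same 2-D histogram rather than the copy-pasted entropy(feature, y_cont) pair) can be checked in isolation:

import numpy as np
from scipy.stats import entropy
from sklearn.metrics import mutual_info_score

rng = np.random.RandomState(0)
feature = rng.rand(200)
y_cont = feature + 0.1 * rng.rand(200)  # strongly related target

c_xy = np.histogram2d(feature, y_cont, 2)[0]   # 2x2 joint histogram
ent_x = entropy(c_xy.sum(axis=1))              # marginal entropy of the feature bins
ent_y = entropy(c_xy.sum(axis=0))              # marginal entropy of the target bins
nmi = mutual_info_score(None, None, contingency=c_xy) / np.sqrt(ent_x * ent_y)
print(nmi)  # near 1 for strongly dependent pairs, near 0 for independent ones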
Example #11
if __name__ == '__main__':

    stop_person = 33
    if stop_person < 33:
        print('[warn] not using all persons!')

    # lda
    lda_results = load('results_valence_lda')
    if lda_results is None:
        print('[warn] rebuilding valence lda results cache')
        pool = Pool(processes=POOL_SIZE)
        lda_results = pool.map(ldaRankings, range(1, stop_person))
        pool.close()
        pool.join()
        dump(lda_results, 'results_valence_lda')

    lda_results = np.array(lda_results)
    test_accs = np.array(lda_results[:, 0])   # ldaRankings returns [feat_test_acc, feat_train_acc]
    train_accs = np.array(lda_results[:, 1])
    genLDAReport(train_accs, test_accs)

    results = load('results_valence')
    if results is None:
        print('[warn] rebuilding valence results cache')
        pool = Pool(processes=POOL_SIZE)
        results = pool.map(getPersonRankings, range(1, stop_person))
#        results = pool.map(ldaRankings, range(1, stop_person))
        pool.close()
        pool.join()
        dump(results,'results_valence')

Example #12

    print(
        "["
        + str(person)
        + "] interpretation - score: "
        + str(score_inter)
        + " ("
        + str(std_inter)
        + ") prediction - score: "
        + str(score_pred)
        + " - "
        + str(std_pred)
    )

    to_ret = [
        [featCount_inter, score_inter, std_inter, featureNames_inter],
        [len(indices_pred), score_pred, std_pred, featureNames_pred],
    ]

    dump(to_ret, "rf_P" + str(person))

    return to_ret


if __name__ == "__main__":
    pool = Pool(processes=POOL_SIZE)
    results = pool.map(RFPerson, range(1, STOPPERSON + 1))
    pool.close()
    pool.join()

    dump(results, "RF_pers_specific")

    pprint(results)
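
The per-person fan-out used here and in the other scripts is the plain multiprocessing pattern; a self-contained sketch (POOL_SIZE and STOPPERSON are module-level constants in the original):

from multiprocessing import Pool

POOL_SIZE = 4
STOPPERSON = 32

def work(person):
    return person * person  # stand-in for RFPerson / getPersonRankings

if __name__ == '__main__':
    pool = Pool(processes=POOL_SIZE)
    results = pool.map(work, range(1, STOPPERSON + 1))
    pool.close()
    pool.join()
    print(results)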
Example #13
    def getAccs(self):
        to_ret = load('accs_all', path=self.ddpad)
        if to_ret is None:
            # Train / testset
            X, X_test, y, y_test_cont = train_test_split(self.X, self.y_cont,test_size=8, random_state=17)
            X = np.array(X)
            X_test = np.array(X_test)
            y = np.array(y)
            y_test_cont = np.array(y_test_cont)

            y[y <= 5] = 0
            y[y >  5] = 1

            y_test = np.array(y_test_cont)
            y_test[y_test <= 5] = 0
            y_test[y_test > 5] = 1

            to_ret = []
            model = SVC(kernel='rbf', probability=True)
            for mindex, metric in enumerate(self.results):
                print('model 0 - metric ' + str(mindex))
                featNames = np.array(self.featExtr.getFeatureNames()) #take clean copy

                #sort features
                indices = np.array(np.argsort(metric)[::-1])
                #take top threshold
                indices = indices[:self.threshold]

                #old struct
                if mindex == 0:
                    X, y = self.fixStructure(X, y)
                    junk, y_test_cont = self.fixStructure(np.array(X_test), y_test_cont)
                    X_test, y_test = self.fixStructure(X_test, y_test)


                #Filter features
                X_model = np.array(X[:,indices])
                featNames = featNames[indices]

                best_feat, best_featNames = [], []
                all_scores, all_stds = [],[]
                best_score, best_std = 0, 0
                for i in range(self.threshold):
                    to_keep = best_feat[:]
                    to_keep.append(i)

                    X_temp = np.array(X_model[:,to_keep])

                    # get scores
                    run_scores = []

                    X_temp, y = self.reverseFixStructure(X_temp, y)
                    for tr, te in KFold(n=len(X_temp), n_folds=5, shuffle=True, random_state=17):
                        X_t,  y_t  = self.fixStructure(X_temp[tr], y[tr])
                        X_te, y_te = self.fixStructure(X_temp[te], y[te])
                        model.fit(X_t, y_t)
                        run_scores.append(self.accuracy(model.predict(X_te), y_te))

                    X_temp, y = self.fixStructure(X_temp, y)

                    new_score = np.average(run_scores)
                    new_std = np.std(run_scores)

                    all_scores.append(new_score)
                    all_stds.append(new_std)

                    # better?
                    if new_score - new_std > best_score - best_std:
                        best_score = new_score
                        best_std = new_std
                        best_feat = to_keep
                        best_featNames.append(featNames[i])

                #get test score => old struct :D
                model.fit(X_model[:,best_feat], y)

                X_model_test = np.array(X_test[:, best_feat])
                test_pred = model.predict(X_model_test)
                test_prob = model.predict_proba(X_model_test)

                test_acc = self.accuracy(test_pred, y_test)

                to_ret.append([best_feat, best_featNames, best_score, best_std, all_scores, all_stds, indices, test_acc, test_pred, test_prob, y_test,
                     y_test_cont])

            X, y = self.reverseFixStructure(X, y)
            X_test, y_test = self.reverseFixStructure(X_test, y_test)

            dump(to_ret, 'accs_all', path = self.ddpad)

        return to_ret
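
fixStructure and reverseFixStructure are never shown. From their use, flattening per-person blocks into one sample axis before fitting and regrouping them for the person-wise KFold, they are presumably a reshape pair along these lines (an assumption; the block size of 40 videos per person is a guess based on DEAP):

import numpy as np

def fixStructure(X, y):
    # (persons, videos, feats) -> (persons*videos, feats), labels flattened to match
    X, y = np.asarray(X), np.asarray(y)
    return X.reshape(-1, X.shape[-1]), y.reshape(-1)

def reverseFixStructure(X, y, n_videos=40):
    # inverse: regroup the flat sample axis into per-person blocks
    X, y = np.asarray(X), np.asarray(y)
    return X.reshape(-1, n_videos, X.shape[-1]), y.reshape(-1, n_videos)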
Example #14
    def getMetrics(self):

        metrics = load("all_metrics", path=self.ddpad)
        if metrics is None:
            X, y_cont = self.fixStructure(self.X, self.y_cont)
            y_disc = np.array(y_cont)
            y_disc[y_disc <= 5] = 0
            y_disc[y_disc > 5] = 1

            metrics = []

            #pearson
            corr = []
            for index in range(len(X[0])):
                corr.append( pearsonr(X[:, index], y_cont)[0] )
            metrics.append(corr)

            #Mut inf
            #dcorr
            mi = []
            dcorr = []
            for feature in np.transpose(X):
                # normalized mutual information: MI over the joint histogram,
                # normalized by the marginal entropies of that same histogram
                c_xy = np.histogram2d(feature, y_cont, 2)[0]
                entX = entropy(c_xy.sum(axis=1))
                entY = entropy(c_xy.sum(axis=0))
                nMutInf = mutual_info_score(None, None, contingency=c_xy) / float(np.sqrt(entX * entY))
                mi.append(nMutInf)

                # Distance Correlation
                dc, dr, dvx, dvy = self.dcov_all(feature, y_cont)
                dcorr.append(dr)

            metrics.append(mi)
            metrics.append(dcorr)

            #Linear Regression
            lr = LinearRegression(n_jobs=-1)
            lr.fit(X, y_cont)
            metrics.append(lr.coef_)

            #Lasso Regression
            alphas = [0.03, 0.1, 0.3, 1, 3, 10]
            best_alpha = 0.01
            best_acc = 0
            for train_index, test_index in KFold(len(y_cont), n_folds=5):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y_cont[train_index], y_cont[test_index]

                lasso = Lasso(alpha=best_alpha)
                lasso.fit(X_train, y_train)
                pred = lasso.predict(X_test)
                best_acc += self.accuracy(pred, y_test)
            best_acc /= float(5)

            for alpha in alphas:
                acc = 0
                for train_index, test_index in KFold(len(y_cont), n_folds=5):
                    X_train, X_test = X[train_index], X[test_index]
                    y_train, y_test = y_cont[train_index], y_cont[test_index]

                    lasso = Lasso(alpha=alpha)
                    lasso.fit(X_train, y_train)
                    pred = lasso.predict(X_test)
                    acc += self.accuracy(pred, y_test)

                acc /= float(5)
                if acc > best_acc:
                    best_acc = acc
                    best_alpha = alpha

            lasso = Lasso(alpha=best_alpha)
            lasso.fit(X, y_cont)
            metrics.append(lasso.coef_)

            #Ridge Regression
            alphas = [0.03, 0.1, 0.3, 1, 3, 10]
            best_alpha = 0.01
            best_acc = 0
            for train_index, test_index in KFold(len(y_cont), n_folds=5):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y_cont[train_index], y_cont[test_index]

                ridge = Ridge(alpha=best_alpha)
                ridge.fit(X_train, y_train)
                pred = ridge.predict(X_test)
                best_acc += self.accuracy(pred, y_test)
            best_acc /= float(5)

            for alpha in alphas:
                acc = 0
                for train_index, test_index in KFold(len(y_cont), n_folds=5):
                    X_train, X_test = X[train_index], X[test_index]
                    y_train, y_test = y_cont[train_index], y_cont[test_index]

                    ridge = Ridge(alpha=alpha)
                    ridge.fit(X_train, y_train)
                    pred = ridge.predict(X_test)
                    acc += self.accuracy(pred, y_test)

                acc /= float(5)
                if acc > best_acc:
                    best_acc = acc
                    best_alpha = alpha

            ridge = Ridge(alpha=best_alpha)
            ridge.fit(X, y_cont)
            metrics.append(ridge.coef_)

            #SVM
            clf = SVC(kernel='linear')
            clf.fit(X, y_disc)
            svm_weights = (clf.coef_ ** 2).sum(axis=0)
            svm_weights /= float(svm_weights.max())
            metrics.append(svm_weights)

            #Random Forests
            #rf importances
            #grow forest

            importances = []
            for run in range(self.runs):
                forest = RandomForestClassifier(
                    n_estimators=2000,
                    max_features='auto',
                    criterion='gini',
                    n_jobs=-1,
                )
                forest.fit(X, y_disc)
                importances.append(forest.feature_importances_)
                # stds.append( np.std([tree.feature_importances_ for tree in forest.estimators_], axis = 0) )

            metrics.append(np.average(importances, axis=0))
            metrics.append(np.std(importances, axis=0))

            # refit once more and record the importances with their per-tree spread;
            # sklearn's RandomForestClassifier has no getImportance(), the public API
            # is feature_importances_ plus the spread across estimators_
            forest.fit(X, y_disc)
            metrics.append(forest.feature_importances_)
            metrics.append(np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0))

            #ANOVA
            anova = SelectKBest(f_regression, k=self.threshold)
            anova.fit(X,y_disc)
            selected_features = anova.get_support()
            metrics.append(selected_features)

            #Linear Discriminant Analysis
            lda = LinearDiscriminantAnalysis(n_components=1)
            lda.fit(X,y_disc)
            metrics.append(lda.coef_[0])

            #Principal Component Analysis
            pca = PCA(n_components=1)
            pca.fit(X)
            metrics.append(pca.components_[0])

            #absolute values
            metrics = np.absolute(np.array(metrics))

            dump(metrics, 'all_metrics', path=self.ddpad)

        return np.array(metrics)
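
dcov_all is referenced in the distance-correlation block but never defined on this page; a textbook implementation of distance covariance/correlation (Székely's dCov) that returns the four values the caller unpacks would look roughly like this (a sketch, not necessarily the author's version):

import numpy as np

def dcov_all(x, y):
    # distance covariance/correlation of two 1-D samples
    x = np.asarray(x, dtype=float)[:, None]
    y = np.asarray(y, dtype=float)[:, None]

    def centered(z):
        d = np.abs(z - z.T)  # pairwise distance matrix
        return d - d.mean(axis=0) - d.mean(axis=1)[:, None] + d.mean()

    A, B = centered(x), centered(y)
    dcov = np.sqrt(max((A * B).mean(), 0.0))
    dvarx = np.sqrt((A * A).mean())
    dvary = np.sqrt((B * B).mean())
    dcor = dcov / np.sqrt(dvarx * dvary) if dvarx * dvary > 0 else 0.0
    return dcov, dcor, dvarx, dvary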
Example #15
def ldaRankings(person):
    FOLDS = 5

    # load all features & keep them in memory
    featExtr = getFeatures()
    y_cont = load('cont_y_p' + str(person))
    if y_cont is None:
        print('[Warn] Rebuilding cache -  person ' + str(person))
        classificator = Classificators.ContValenceClassificator()
        personLdr = personLoader.NoTestsetLoader(classificator, featExtr)

        X, y_cont = personLdr.load(person)

        dump(X, 'X_p' + str(person))
        dump(y_cont, 'cont_y_p' + str(person))
    else:
        X = load('X_p' + str(person))

    X = np.array(X)
    y_cont = np.array(y_cont)
    y_disc = np.array(y_cont)  # copy so the thresholding below does not clobber y_cont
    y_disc[y_disc <= 5] = 0
    y_disc[y_disc > 5] = 1

    for index, val in enumerate(np.std(X, axis=0)):
        if val == 0:
            print('warning: zero std for feature index', index, '(' + featExtr.getFeatureNames()[index] + ')')

    # manual Feature standardization
    X = X - np.average(X, axis=0)
    X = np.true_divide(X, np.std(X, axis=0))

    feat_test_acc = []
    feat_train_acc = []
    X_temp = X[:, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]]  # physiological features only
    feat_test_acc.append(0)
    feat_train_acc.append(0)
    for train_index, test_index in KFold(len(y_disc), n_folds=FOLDS, random_state=17, shuffle=True):
        X_train, X_test = X_temp[train_index], X_temp[test_index]
        y_train, y_test = y_disc[train_index], y_disc[test_index]

        clf = LDA(shrinkage='auto', solver='lsqr')
        clf.fit(X_train, y_train)

        feat_train_acc[0] += accuracy(clf.predict(X_train), y_train) / float(FOLDS)
        feat_test_acc[0]  += accuracy(clf.predict(X_test),  y_test)  / float(FOLDS)

    print("train: " + str(feat_train_error[0]) + " test: " + str(feat_test_error[0]))

    for feat_index in range(10, 30):  # use range(10, len(X[0, :])) to scan every feature
        index = feat_index - 9
        X_temp = X[:, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, feat_index]]

        feat_test_acc.append(0)
        feat_train_acc.append(0)
        for train_index, test_index in KFold(len(y_disc), n_folds=FOLDS, random_state=17, shuffle=True):
            X_train, X_test = X_temp[train_index], X_temp[test_index]
            y_train, y_test = y_disc[train_index], y_disc[test_index]

            clf = LDA(shrinkage='auto',solver='lsqr')
            clf.fit(X_train, y_train)

            feat_train_acc[index] += accuracy(clf.predict(X_train), y_train) / float(FOLDS)
            feat_test_acc[index]  += accuracy(clf.predict(X_test),  y_test)  / float(FOLDS)

        print("train: " + str(feat_train_error[index]) + " test: " + str(feat_test_error[index]))

    return [feat_test_acc, feat_train_acc]
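
shrinkage='auto' (Ledoit-Wolf covariance shrinkage) is only supported by the lsqr and eigen solvers, not the default svd one, hence the explicit solver above. Minimal usage:

from sklearn.datasets import make_classification
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

X, y = make_classification(n_samples=50, n_features=30, random_state=17)

clf = LDA(solver='lsqr', shrinkage='auto')  # shrinkage regularizes the covariance estimate
clf.fit(X, y)
print(clf.score(X, y))  # mean training accuracy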
Example #16
def getPersonRankings(person):
    #load all features & keep them in memory
    featExtr = getFeatures()
    y_cont = load('cont_y_p' + str(person))
    if y_cont is None:
        print('[Warn] Rebuilding cache -  person ' + str(person))
        classificator = Classificators.ContValenceClassificator()
        personLdr = personLoader.NoTestsetLoader(classificator, featExtr)

        X, y_cont = personLdr.load(person)

        dump(X,'X_p' + str(person))
        dump(y_cont,'cont_y_p' + str(person))
    else:
        X = load('X_p' +str(person))

    X = np.array(X)
    y_cont = np.array(y_cont)
    y_disc = np.array(y_cont)  # copy: the in-place thresholding must not clobber the continuous labels used below
    y_disc[ y_disc <= 5 ] = 0
    y_disc[ y_disc >  5 ] = 1

    for index,val in enumerate(np.std(X,axis=0)):
        if val == 0:
            print('warning: zero std for feature index', index, '(' + featExtr.getFeatureNames()[index] + ')')

    #manual Feature standardization
    X = X - np.average(X,axis=0)
    X = np.true_divide(X, np.std(X,axis=0) )

    #statistical tests
    #get pearson
    corr = []
    for index in range(len(X[0])):
        corr.append( pearsonr(X[:, index], y_cont) )

    #model based:
    #normal regression
    lr = LinearRegression(n_jobs=-1)
    lr.fit(X, y_cont)
    lr_scores = lr.coef_

    #l1 regression
    alphas = [0.03,0.1,0.3,1,3,10]
    best_alpha = 0.01
    best_acc = 0
    for train_index, test_index in KFold(len(y_cont), n_folds=5):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y_cont[train_index], y_cont[test_index]

        lasso = Lasso(alpha=best_alpha)
        lasso.fit(X_train,y_train)
        pred = lasso.predict(X_test)
        best_acc += accuracy(pred,y_test)
    best_acc /= float(5)

    for alpha in alphas:
        acc = 0
        for train_index, test_index in KFold(len(y_cont), n_folds=5):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y_cont[train_index], y_cont[test_index]

            lasso = Lasso(alpha=alpha)
            lasso.fit(X_train,y_train)
            pred = lasso.predict(X_test)
            acc += accuracy(pred,y_test)

        acc /= float(5)
        if acc > best_acc:
            best_acc = acc
            best_alpha = alpha

    lasso = Lasso(alpha=best_alpha)
    lasso.fit(X, y_cont)
    l1_scores = lasso.coef_

    #l2 regression
    alphas = [0.03,0.1,0.3,1,3,10]
    best_alpha = 0.01
    best_acc = 0
    for train_index, test_index in KFold(len(y_cont), n_folds=5):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y_cont[train_index], y_cont[test_index]

        ridge = Ridge(alpha=best_alpha)
        ridge.fit(X_train,y_train)
        pred = ridge.predict(X_test)
        best_acc += accuracy(pred,y_test)
    best_acc /= float(5)

    for alpha in alphas:
        acc = 0
        for train_index, test_index in KFold(len(y_cont), n_folds=5):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y_cont[train_index], y_cont[test_index]

            ridge = Ridge(alpha=alpha)
            ridge.fit(X_train,y_train)
            pred = ridge.predict(X_test)
            acc += accuracy(pred,y_test)

        acc /= float(5)
        if acc > best_acc:
            best_acc = acc
            best_alpha = alpha

    ridge = Ridge(alpha=best_alpha)
    ridge.fit(X, y_cont)
    l2_scores = ridge.coef_

    #svm coefficients
    clf = svm.SVC(kernel='linear')
    clf.fit(X, y_disc)
    svm_weights = (clf.coef_ ** 2).sum(axis=0)
    svm_weights /= float(svm_weights.max())

    #rf importances
    #grow forest
    forest = RandomForestClassifier(
        n_estimators=3000,
        max_features='auto',
        criterion='gini',
        n_jobs=-1,
    )
    forest.fit(X,y_disc)
    #get importances
    importances = forest.feature_importances_
    #std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis = 0)

    # pca coef
    pca = PCA(n_components=1)
    pca.fit(X)
    pca_coef = pca.components_[0]


    # [pearson_r, mutual inf, max inf, dist, l1 coef, l2 coef, svm coef, rf importances, coord search]
    #featExtr = getFeatures()
    #featnames = featExtr.getFeatureNames()
    pers_results = []
    for c, lr_s, l1_s, l2_s, svm_w, imp, pca_c in zip(corr, lr_scores, l1_scores, l2_scores, svm_weights, importances, pca_coef):
        pers_results.append([np.abs(c[0]), np.abs(lr_s), np.abs(l1_s), np.abs(l2_s), np.abs(svm_w), np.abs(imp), np.abs(pca_c)])

    return pers_results #, featExtr.featureExtrs
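
The manual alpha search for Lasso and Ridge (which also carried the copy-paste bugs fixed above) is what GridSearchCV does out of the box in newer scikit-learn; a sketch of the equivalent on synthetic data:

import numpy as np
from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import GridSearchCV

rng = np.random.RandomState(0)
X = rng.rand(40, 10)
y_cont = X[:, 0] * 3 + rng.rand(40)

grid = {'alpha': [0.01, 0.03, 0.1, 0.3, 1, 3, 10]}
for est in (Lasso(), Ridge()):
    search = GridSearchCV(est, grid, cv=5)  # refits the best alpha on all data
    search.fit(X, y_cont)
    print(type(est).__name__, search.best_params_, search.best_estimator_.coef_[:3])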
Example #17
    # run model
    results = model.run()

    return results
def arousalWorker(criterion,treecount,threshold):
    featExtr = getFeatures()

    # create classificator
    classificator = Classificators.ArousalClassificator()

    # create personloader
    personLdr = personLoader.NoTestsetLoader(classificator, featExtr)

    # run model
    results = model.run()

    return results

if __name__ == '__main__':
    treeCount = 2000
    threshold = 0.002

    reporter = reporters.HTMLRFModelReporter()

    results = load('to_keep')
    if results is None:
        print("[warn] rebuilding cache")
        results = valenceWorker('gini',treeCount,threshold)
        dump(results,'to_keep')

    reporter.genReport(results)
def RFPerson(person):
    print("person: " + str(person))

    # load X , y
    # load all features & keep them in memory
    featExtr = getFeatures()
    featureNames = np.array(featExtr.getFeatureNames())

    y_cont = load("cont_y_p" + str(person))
    if y_cont is None:
        print("[Warn] Rebuilding cache -  person " + str(person))
        X, y_cont = personLoader.NoTestsetLoader(
            classificator=Classificators.ContValenceClassificator(), featExtractor=featExtr
        ).load(person)

        dump(X, "X_p" + str(person))
        dump(y_cont, "cont_y_p" + str(person))
    else:
        X = load("X_p" + str(person))

    y_disc = np.array(y_cont)
    y_disc[y_disc <= 5] = 0
    y_disc[y_disc > 5] = 1

    # manual Feature standardization
    X = X - np.average(X, axis=0)
    X = np.true_divide(X, np.std(X, axis=0))

    # step 1 determine importances using RF forest
    indices_step1, featureNames_step1 = step1(X, y_disc, featureNames)
    featureNames = np.array(featureNames_step1)
    indices = np.array(indices_step1)

    # filter features (X) based on the results from step 1
    X = X[:, indices]

    # step 2 - interpretation
    featCount_inter, score_inter, std_inter = step2_interpretation(X, y_disc, featureNames)
    indices_inter = indices[:featCount_inter]
    featureNames_inter = featureNames[indices_inter]

    # step 2 - prediction
    indices_pred, score_pred, std_pred = step2_prediction(X, y_disc, featureNames)
    featureNames_pred = featureNames[indices_pred]

    print(
        "["
        + str(person)
        + "] interpretation - score: "
        + str(score_inter)
        + " ("
        + str(std_inter)
        + ") prediction - score: "
        + str(score_pred)
        + " - "
        + str(std_pred)
    )

    to_ret = [
        [featCount_inter, score_inter, std_inter, featureNames_inter],
        [len(indices_pred), score_pred, std_pred, featureNames_pred],
    ]

    dump(to_ret, "rf_P" + str(person))

    return to_ret