예제 #1
0
 def get_prediction_res(self, model_path, kernel):
     """Fit a random forest on saved tree features and return dev predictions.

     The feature matrices ``X_train_tree_<kernel>.npy`` / ``X_dev_tree_<kernel>.npy``
     and the label arrays ``label_train.npy`` / ``label_dev.npy`` are read from
     ``model_path``.

     Args:
         model_path: directory holding the ``.npy`` files.
         kernel: kernel identifier; converted with ``int()`` to build the
             feature file names.

     Returns:
         The second element returned by ``RandomForest.evaluate()``.
     """
     kernel_id = int(kernel)

     def _load(filename):
         # All arrays live side by side in the model directory.
         return np.load(os.path.join(model_path, filename))

     train_features = _load('X_train_tree_%d.npy' % kernel_id)
     dev_features = _load('X_dev_tree_%d.npy' % kernel_id)
     train_labels = _load('label_train.npy')
     dev_labels = _load('label_dev.npy')

     forest = RandomForest('permutation_test', train_features, train_labels,
                           dev_features, dev_labels, test=True)
     forest.run()
     return forest.evaluate()[1]
예제 #2
0
    def FV_RF(self):
        """Run a random forest on each Fisher-vector model listed in ``model_list.txt``.

        Every line of ``<ae.save_dir>/model_list.txt`` is treated as a model
        directory. Directories that lack either tree-feature file for
        ``self.kernel`` are skipped; for the others a ``RandomForest`` is
        trained and its train/dev predictions are passed to ``get_UAR``.
        """
        print("\nrunning Random Forest on Fisher Vectors")
        ae = AutoEncoder('fv_gmm', 0)

        list_file = os.path.join(ae.save_dir, 'model_list.txt')
        with smart_open(list_file, 'rb', encoding='utf-8') as model_path:
            for line_no, raw_line in enumerate(model_path):
                model_dir = str(raw_line).replace('\n', '')
                # The first 65 characters of each entry are dropped to get the
                # display name (presumably a fixed directory prefix -- TODO confirm).
                short_name = model_dir[65:]
                print(line_no, '\t', short_name)
                feature_name = short_name + '_%d' % self.kernel

                train_file = os.path.join(
                    model_dir, 'X_train_tree_%d.npy' % self.kernel)
                dev_file = os.path.join(
                    model_dir, 'X_dev_tree_%d.npy' % self.kernel)

                # Skip models whose tree features were not generated.
                if not os.path.isfile(train_file):
                    continue
                if not os.path.isfile(dev_file):
                    continue

                X_train = np.load(train_file)
                X_dev = np.load(dev_file)
                y_train = np.load(os.path.join(model_dir, 'label_train.npy'))
                y_dev = np.load(os.path.join(model_dir, 'label_dev.npy'))

                print(X_train.shape, X_dev.shape)

                forest = RandomForest(feature_name, X_train, y_train,
                                      X_dev, y_dev, test=False)
                forest.run()
                pred_train, pred_dev = forest.evaluate()

                no_instances = np.array([])
                get_UAR(pred_train, y_train, no_instances, 'RF', feature_name,
                        'single', train_set=True, test=False)
                get_UAR(pred_dev, y_dev, np.array([]), 'RF', feature_name,
                        'single', test=False)
예제 #3
0
    def run_MFCC(self):
        """Run a classifier on MFCC features (single modality).

        Behaviour depends on ``self.model_name``:

        * ``'RF_cv'``: train and dev data are concatenated, a ``RandomForest``
          is trained on every split produced by ``k_fold_cv`` and the per-fold
          results are stored with ``save_cv_results``. The method then returns.
        * ``'SVM'`` / ``'RF'``: the training data is upsampled, both feature
          matrices are converted to CSR sparse matrices, the model is trained
          and train/dev predictions are scored with ``get_UAR``. When
          ``self.test`` is false ``get_post_probability`` is also called on the
          dev predictions.

        Raises:
            ValueError: if ``self.model_name`` is not one of the names above.
        """
        print("\nbuilding a classifier on MFCC features (both frame-level and session-level)")
        X_train, y_train, train_inst, X_dev, y_dev, dev_inst = load_proc_baseline_feature('MFCC', verbose=True)

        if self.model_name == 'RF_cv':
            y_train, y_dev = np.ravel(y_train), np.ravel(y_dev)
            train_inst, dev_inst = np.ravel(train_inst), np.ravel(dev_inst)

            X = np.vstack((X_train, X_dev))
            y = np.hstack((y_train, y_dev))
            inst = np.hstack((train_inst, dev_inst))
            assert len(X) == len(y) == len(inst)

            cv_res = []
            for ids_train, ids_dev in k_fold_cv(len(X)):
                # Fold-local names: the full train/dev arrays must not be
                # overwritten by the data of the last fold.
                fold_y_dev = y[ids_dev]
                fold_rf = RandomForest(self.feature_name, X[ids_train], y[ids_train], X[ids_dev], fold_y_dev, baseline=True, test=self.test)
                fold_rf.run()
                _, fold_pred_dev = fold_rf.evaluate()
                _, session_res = get_UAR(fold_pred_dev, fold_y_dev, inst[ids_dev], self.model_name, self.feature_name, 'baseline', baseline=True, test=True)
                cv_res.append(session_res)
            save_cv_results(cv_res, self.model_name, self.feature_name, 'baseline')
            # Cross-validation is a complete run on its own. Falling through
            # would score stale last-fold predictions against re-upsampled
            # labels, because no model is trained for 'RF_cv' below.
            return

        print("\nupsampling training data to address class imbalance")
        X_train, y_train, train_inst = upsample(X_train, y_train, train_inst)
        print("\nobtaining sparse matrix for better classification")
        X_train, X_dev = sp.csr_matrix(X_train), sp.csr_matrix(X_dev)

        if self.model_name == 'SVM':
            SVM_MFCC = LinearSVM(self.feature_name, X_train, y_train, X_dev, y_dev, baseline=True, test=self.test)
            SVM_MFCC.run()
            y_pred_train, y_pred_dev = SVM_MFCC.evaluate()
        elif self.model_name == 'RF':
            RF_MFCC = RandomForest(self.feature_name, X_train, y_train, X_dev, y_dev, baseline=True, test=self.test)
            RF_MFCC.run()
            y_pred_train, y_pred_dev = RF_MFCC.evaluate()
        else:
            # Without this the code below fails with an unbound-variable error.
            raise ValueError("unsupported model_name for MFCC features: %r" % (self.model_name,))

        get_UAR(y_pred_train, y_train, train_inst, self.model_name, self.feature_name, 'baseline', baseline=True, train_set=True, test=self.test)
        get_UAR(y_pred_dev, y_dev, dev_inst, self.model_name, self.feature_name, 'baseline', baseline=True, test=self.test)
        if not self.test:
            get_post_probability(y_pred_dev, y_dev, dev_inst, np.array([]), self.model_name, self.feature_name)
예제 #4
0
    def RF(self):
        """Run a random forest on every fused feature set, three times over.

        Feature names are read, one per line, from
        ``./pre-trained/fusion/feature_list.txt``. For each name the matrices
        ``pre-trained/fusion/<feature>/X_train.npy`` and ``X_dev.npy`` are
        loaded, a ``RandomForest`` is trained against the labels returned by
        ``load_label()`` and the train/dev predictions are scored with
        ``get_UAR``.
        """
        print(
            "\nrunning RF on features selected with RF with doc2vec embeddings"
        )

        # The context manager closes the list file as soon as it is read.
        with smart_open('./pre-trained/fusion/feature_list.txt',
                        'rb',
                        encoding='utf-8') as feature_path:
            feature_list = [
                str(line).replace('\n', '') for line in feature_path
            ]

        # The whole sweep is repeated three times (presumably to average out
        # the randomness of the forest -- TODO confirm).
        for _ in range(3):
            for feature in feature_list:
                _, _, y_dev, y_train = load_label()
                y_train = y_train.astype('int')
                y_dev = y_dev.astype('int')

                X_train = np.load(
                    os.path.join('pre-trained', 'fusion', feature,
                                 'X_train.npy'))
                X_dev = np.load(
                    os.path.join('pre-trained', 'fusion', feature,
                                 'X_dev.npy'))

                random_forest = RandomForest(feature,
                                             X_train,
                                             y_train,
                                             X_dev,
                                             y_dev,
                                             test=False)
                random_forest.run()
                y_pred_train, y_pred_dev = random_forest.evaluate()
                get_UAR(y_pred_train,
                        y_train,
                        np.array([]),
                        'RF',
                        feature,
                        'multiple',
                        train_set=True,
                        test=False)
                get_UAR(y_pred_dev,
                        y_dev,
                        np.array([]),
                        'RF',
                        feature,
                        'multiple',
                        test=False)
예제 #5
0
    def test_random_forest(self):
        """Smoke-test ``RandomForest.run`` on a random split of the iris data."""
        import numpy as np
        from sklearn import datasets

        iris = datasets.load_iris()

        # Only the first two features are kept so the data suits a 2-d plot.
        features = iris.data[:, :2]
        labels = iris.target

        # Shuffle once, then hold out the last 15 shuffled samples for testing.
        holdout = 15
        shuffled = np.random.permutation(len(features))
        train_idx = shuffled[:-holdout]
        test_idx = shuffled[-holdout:]

        forest = RandomForest('IRIS',
                              features[train_idx],
                              labels[train_idx],
                              features[test_idx],
                              labels[test_idx],
                              test=True)
        forest.run()
예제 #6
0
    def TEXT_RF(self):
        """Run a random forest on each document-embedding model in ``model_list.txt``.

        The list file lives in the doc2vec ``save_dir`` taken from the
        ``Text2Vec`` model configuration. Each line is a model directory
        holding ``vectors_{train,dev}.npy`` and ``labels_{train,dev}.npy``.
        A ``RandomForest`` is trained per directory and its train/dev
        predictions are scored with ``get_UAR``.
        """
        print("\nrunning Random Forest on document embeddings")

        text2vec = Text2Vec()
        list_file = os.path.join(
            text2vec.model_config['doc2vec']['save_dir'], 'model_list.txt')

        with smart_open(list_file, 'rb', encoding='utf-8') as model_path:
            for line_no, raw_line in enumerate(model_path):
                model_dir = str(raw_line).replace('\n', '')
                # The first 68 characters of each entry are dropped to get the
                # model name (presumably a fixed directory prefix -- TODO confirm).
                model_name = model_dir[68:]
                print(line_no, '\t', model_name)

                X_train = np.load(os.path.join(model_dir, 'vectors_train.npy'))
                X_dev = np.load(os.path.join(model_dir, 'vectors_dev.npy'))
                train_labels = np.load(
                    os.path.join(model_dir, 'labels_train.npy'))
                dev_labels = np.load(os.path.join(model_dir, 'labels_dev.npy'))
                # Flatten the label arrays to 1-d before training.
                y_train = np.ravel(train_labels)
                y_dev = np.ravel(dev_labels)

                forest = RandomForest(model_name, X_train, y_train,
                                      X_dev, y_dev, baseline=False)
                forest.run()
                pred_train, pred_dev = forest.evaluate()

                get_UAR(pred_train, y_train, np.array([]), 'RF', model_name,
                        'single', baseline=False, train_set=True)
                get_UAR(pred_dev, y_dev, np.array([]), 'RF', model_name,
                        'single', baseline=False)
예제 #7
0
 def test_text2vec(self):
     """Exercise the full ``Text2Vec`` pipeline and a random forest on its output.

     Builds and trains the model, infers embeddings for the train and dev
     partitions, reloads the model, trains and evaluates a ``RandomForest``
     on the loaded embeddings, and finally evaluates the embedding model.
     """
     text2vec = Text2Vec(build_on_corpus=False)
     text2vec.build_model()
     text2vec.train_model()
     for partition in ('train', 'dev'):
         text2vec.infer_embedding(partition)
     text2vec.load_model()

     train_vectors, train_labels = text2vec.load_embedding('train')
     dev_vectors, dev_labels = text2vec.load_embedding('dev')

     forest = RandomForest('text', train_vectors, train_labels,
                           dev_vectors, dev_labels, test=True)
     forest.run()
     forest.evaluate()

     text2vec.evaluate_model()
예제 #8
0
    def run_AU(self):
        """Run a classifier on AU features (single modality).

        Behaviour depends on ``self.model_name``:

        * ``'RF_cv'``: train and dev data are concatenated, a ``RandomForest``
          is trained on every split produced by ``k_fold_cv`` and the per-fold
          results are stored with ``save_cv_results``. The method then returns.
        * ``'SVM'`` / ``'RF'``: the training data is upsampled, both feature
          matrices are converted to CSR sparse matrices, the model is trained,
          ``get_session_probability()`` is invoked on it and train/dev
          predictions are scored with ``get_UAR``.

        Raises:
            ValueError: if ``self.model_name`` is not one of the names above.
        """
        print("\nbuilding a classifier on AU features (already session-level)")
        X_train, y_train, _, X_dev, y_dev, _ = load_proc_baseline_feature('AU', verbose=True)

        if self.model_name == 'RF_cv':
            X = np.vstack((X_train, X_dev))
            y = np.hstack((y_train, y_dev))
            assert len(X) == len(y)

            cv_res = []
            for ids_train, ids_dev in k_fold_cv(len(X)):
                # Fold-local names: the full train/dev arrays must not be
                # overwritten by the data of the last fold.
                fold_y_dev = y[ids_dev]
                fold_rf = RandomForest(self.feature_name, X[ids_train], y[ids_train], X[ids_dev], fold_y_dev, baseline=True, test=self.test)
                fold_rf.run()
                _, fold_pred_dev = fold_rf.evaluate()
                _, session_res = get_UAR(fold_pred_dev, fold_y_dev, np.array([]), self.model_name, self.feature_name, 'baseline', baseline=True, test=True)
                cv_res.append(session_res)
            save_cv_results(cv_res, self.model_name, self.feature_name, 'baseline')
            # Cross-validation is a complete run on its own. Falling through
            # would score stale last-fold predictions against re-upsampled
            # labels, because no model is trained for 'RF_cv' below.
            return

        print("\nupsampling training data to address class imbalance")
        X_train, y_train, _ = upsample(X_train, y_train, np.array([]))
        print("\nobtaining sparse matrix for better classification")
        X_train, X_dev = sp.csr_matrix(X_train), sp.csr_matrix(X_dev)

        if self.model_name == 'SVM':
            SVM_AU = LinearSVM(self.feature_name, X_train, y_train, X_dev, y_dev, baseline=True, test=self.test)
            SVM_AU.run()
            y_pred_train, y_pred_dev = SVM_AU.evaluate()
            # Result is unused here; the call is kept in case it has side
            # effects (e.g. persisting the probabilities) -- TODO confirm.
            SVM_AU.get_session_probability()
        elif self.model_name == 'RF':
            RF_AU = RandomForest(self.feature_name, X_train, y_train, X_dev, y_dev, baseline=True, test=self.test)
            RF_AU.run()
            y_pred_train, y_pred_dev = RF_AU.evaluate()
            # Result is unused here; see the note in the SVM branch.
            RF_AU.get_session_probability()
        else:
            # Without this the code below fails with an unbound-variable error.
            raise ValueError("unsupported model_name for AU features: %r" % (self.model_name,))

        get_UAR(y_pred_train, y_train, np.array([]), self.model_name, self.feature_name, 'baseline', baseline=True, train_set=True, test=self.test)
        get_UAR(y_pred_dev, y_dev, np.array([]), self.model_name, self.feature_name, 'baseline', baseline=True, test=self.test)
예제 #9
0
    def RF_CV(self):
        """Cross-validate a random forest on every fused feature set.

        Feature names are read, one per line, from
        ``./pre-trained/fusion/feature_list.txt``. For each name the train and
        dev matrices and labels are concatenated and re-split with
        ``k_fold_cv``. A ``RandomForest`` is trained per fold and the
        macro-averaged recall and precision of its predictions on the held-out
        part are collected. Both result dictionaries (feature name -> list of
        per-fold scores) are appended to ``results/cross-validation.json``.
        """
        print(
            "\nrunning RF on features selected with RF with doc2vec embeddings"
        )

        # The context manager closes the list file as soon as it is read.
        with smart_open('./pre-trained/fusion/feature_list.txt',
                        'rb',
                        encoding='utf-8') as feature_path:
            feature_list = [
                str(line).replace('\n', '') for line in feature_path
            ]

        from sklearn.metrics import precision_recall_fscore_support

        cv_results_UAR = dict()
        cv_results_UAP = dict()

        for feature in feature_list:
            cv_results_UAR[feature] = []
            cv_results_UAP[feature] = []

            _, _, y_dev, y_train = load_label()
            y_train = y_train.astype('int')
            y_dev = y_dev.astype('int')

            X_train = np.load(
                os.path.join('pre-trained', 'fusion', feature, 'X_train.npy'))
            X_dev = np.load(
                os.path.join('pre-trained', 'fusion', feature, 'X_dev.npy'))

            # Pool the original split; the folds below replace it.
            X = np.vstack((X_train, X_dev))
            y = np.hstack((y_train, y_dev))

            cv_ids = k_fold_cv(len(X))

            for cv_id in cv_ids:
                X_train = X[cv_id[0]]
                y_train = y[cv_id[0]]
                X_dev = X[cv_id[1]]
                y_dev = y[cv_id[1]]

                print('train on %d test on %d' % (len(y_train), len(y_dev)))

                random_forest = RandomForest(feature,
                                             X_train,
                                             y_train,
                                             X_dev,
                                             y_dev,
                                             test=False)
                random_forest.run()
                _, y_pred = random_forest.evaluate()
                precision, recall, _, _ = precision_recall_fscore_support(
                    y_dev, y_pred, average='macro')
                cv_results_UAR[feature].append(recall)
                cv_results_UAP[feature].append(precision)

            # NOTE(review): hard-codes the expectation that k_fold_cv yields
            # exactly 10 folds.
            assert len(cv_results_UAR[feature]) == len(
                cv_results_UAP[feature]) == 10

        # NOTE(review): two JSON documents are written back to back in append
        # mode, so the file as a whole is not a single valid JSON document.
        # The format is left unchanged in case existing readers depend on it.
        with open(os.path.join('results', 'cross-validation.json'),
                  'a+',
                  encoding='utf-8') as outfile:
            json.dump(cv_results_UAR, outfile)
            json.dump(cv_results_UAP, outfile)
예제 #10
0
async def predict(input: PredictRequest,
                  clf: RandomForest = Depends(get_model)):
    """Run the injected classifier on the request data and wrap the result.

    ``input.data`` is converted to a NumPy array, passed to ``clf.predict``
    and the predictions are returned as a list inside a ``PredictResponse``.
    """
    features = np.array(input.data)
    predictions = clf.predict(features).tolist()
    return PredictResponse(data=predictions)
예제 #11
0
def get_model():
    """Create the ``rf_201209`` random forest, call ``load()`` on it and return it."""
    model = RandomForest(model_name="rf_201209")
    model.load()
    return model