Example #1
    def run_cross_validation(train_test_model_class,
                             model_param,
                             results_or_df,
                             train_indices,
                             test_indices):
        """
        Simple cross validation.
        :param train_test_model_class:
        :param model_param:
        :param results_or_df: list of BasicResult, or pandas.DataFrame
        :param train_indices:
        :param test_indices:
        :return:
        """
        xys_train = TrainTestModel.get_xys_from_results(results_or_df, train_indices)
        xs_test = TrainTestModel.get_xs_from_results(results_or_df, test_indices)
        ys_test = TrainTestModel.get_ys_from_results(results_or_df, test_indices)

        model = train_test_model_class(model_param, None)
        model.train(xys_train)
        stats = model.evaluate(xs_test, ys_test)

        output = {}
        output['stats'] = stats
        output['model'] = model
        output['contentids'] = ys_test['content_id']  # for plotting purposes

        return output
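
A minimal usage sketch (hypothetical: it assumes results is a list of BasicResult as in the other examples here, and that this method lives on ModelCrossValidation, as the later examples suggest):

# Hypothetical single-split run of run_cross_validation; the index ranges and
# the RandomForestTrainTestModel parameters are assumptions for illustration.
train_indices = range(0, 200)   # rows used for training
test_indices = range(200, 250)  # held-out rows
output = ModelCrossValidation.run_cross_validation(
    RandomForestTrainTestModel,
    {'norm_type': 'normalize', 'random_state': 0},
    results,
    train_indices,
    test_indices,
)
print output['stats']   # evaluation stats on the held-out rows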
Example #2
    def test_train_predict_libsvmnusvr(self):

        print "test libsvmnusvr train and predict..."

        # libsvmnusvr is bit-exact to nusvr

        xys = TrainTestModel.get_xys_from_dataframe(self.feature_df.iloc[:-50])
        xs = TrainTestModel.get_xs_from_dataframe(self.feature_df.iloc[-50:])
        ys = TrainTestModel.get_ys_from_dataframe(self.feature_df.iloc[-50:])

        model = LibsvmnusvrTrainTestModel({'norm_type': 'normalize'}, None)
        model.train(xys)
        result = model.evaluate(xs, ys)
        self.assertAlmostEquals(result['RMSE'], 0.30977055639849227, places=4)

        model = LibsvmnusvrTrainTestModel({'norm_type': 'clip_0to1'}, None)
        model.train(xys)
        result = model.evaluate(xs, ys)
        self.assertAlmostEquals(result['RMSE'], 0.28066350351974495, places=4)

        model = LibsvmnusvrTrainTestModel({'norm_type': 'clip_minus1to1'},
                                          None)
        model.train(xys)
        result = model.evaluate(xs, ys)
        self.assertAlmostEquals(result['RMSE'], 0.28651275022085743, places=4)

        model = LibsvmnusvrTrainTestModel({'norm_type': 'none'}, None)
        model.train(xys)
        result = model.evaluate(xs, ys)
        self.assertAlmostEquals(result['RMSE'], 0.64219197018248542, places=4)
Example #3
    def test_train_save_load_predict_libsvmnusvr(self):

        print "test libsvmnusvr train, save, load and predict..."

        xys = TrainTestModel.get_xys_from_dataframe(self.feature_df.iloc[:-50])
        xs = TrainTestModel.get_xs_from_dataframe(self.feature_df.iloc[-50:])
        ys = TrainTestModel.get_ys_from_dataframe(self.feature_df.iloc[-50:])

        model = LibsvmnusvrTrainTestModel({'norm_type': 'normalize'}, None)
        model.train(xys)

        model.to_file(self.model_filename)
        self.assertTrue(os.path.exists(self.model_filename))
        self.assertTrue(os.path.exists(self.model_filename + '.model'))

        loaded_model = LibsvmnusvrTrainTestModel.from_file(
            self.model_filename, None)

        result = model.evaluate(xs, ys)
        self.assertAlmostEquals(result['RMSE'], 0.30977055639849227, places=4)

        # loaded model generates slight numerical difference
        result = loaded_model.evaluate(xs, ys)
        self.assertAlmostEquals(result['RMSE'], 0.30977055639849227, places=4)

        model.delete(self.model_filename)
Example #4
    def test_train_predict_libsvmnusvr(self):

        print "test libsvmnusvr train and predict..."

        # libsvmnusvr is bit-exact to nusvr

        xys = TrainTestModel.get_xys_from_dataframe(self.feature_df.iloc[:-50])
        xs = TrainTestModel.get_xs_from_dataframe(self.feature_df.iloc[-50:])
        ys = TrainTestModel.get_ys_from_dataframe(self.feature_df.iloc[-50:])

        model = LibsvmnusvrTrainTestModel(
            {'norm_type':'normalize'}, None)
        model.train(xys)
        result = model.evaluate(xs, ys)
        self.assertAlmostEquals(result['RMSE'], 0.30977055639849227)

        model = LibsvmnusvrTrainTestModel(
            {'norm_type':'clip_0to1'}, None)
        model.train(xys)
        result = model.evaluate(xs, ys)
        self.assertAlmostEquals(result['RMSE'], 0.28066350351974495)

        model = LibsvmnusvrTrainTestModel(
            {'norm_type':'clip_minus1to1'}, None)
        model.train(xys)
        result = model.evaluate(xs, ys)
        self.assertAlmostEquals(result['RMSE'], 0.28651275022085743)

        model = LibsvmnusvrTrainTestModel(
            {'norm_type':'none'}, None)
        model.train(xys)
        result = model.evaluate(xs, ys)
        self.assertAlmostEquals(result['RMSE'], 0.64219197018248542)
Example #5
    def test_train_predict_randomforest(self):

        print "test random forest train and predict..."

        # random forest doesn't need proper data normalization

        xys = TrainTestModel.get_xys_from_dataframe(self.feature_df.iloc[:-50])
        xs = TrainTestModel.get_xs_from_dataframe(self.feature_df.iloc[-50:])
        ys = TrainTestModel.get_ys_from_dataframe(self.feature_df.iloc[-50:])

        model = RandomForestTrainTestModel(
            {'norm_type': 'normalize', 'random_state': 0}, None)
        model.train(xys)
        result = model.evaluate(xs, ys)
        self.assertAlmostEquals(result['RMSE'], 0.32357946626958406)

        model = RandomForestTrainTestModel(
            {'norm_type': 'clip_0to1', 'random_state': 0}, None)
        model.train(xys)
        result = model.evaluate(xs, ys)
        self.assertAlmostEquals(result['RMSE'], 0.33807954580885896)

        model = RandomForestTrainTestModel(
            {'norm_type': 'clip_minus1to1', 'random_state': 0}, None)
        model.train(xys)
        result = model.evaluate(xs, ys)
        self.assertAlmostEquals(result['RMSE'], 0.31798315556627982)

        model = RandomForestTrainTestModel({'norm_type':'none', 'random_state': 0}, None)
        model.train(xys)
        result = model.evaluate(xs, ys)
        self.assertAlmostEquals(result['RMSE'], 0.33660273277405978)
Example #6
    def test_get_xs_ys(self):
        xs = TrainTestModel.get_xs_from_results(self.features, [0, 1, 2])

        self.assertEquals(len(xs['Moment_noref_feature_1st_score']), 3)
        self.assertAlmostEquals(np.mean(xs['Moment_noref_feature_1st_score']),
                                128.26146851380497,
                                places=4)
        self.assertEquals(len(xs['Moment_noref_feature_var_score']), 3)
        self.assertAlmostEquals(np.mean(xs['Moment_noref_feature_var_score']),
                                1569.2395085695462,
                                places=4)

        xs = TrainTestModel.get_xs_from_results(self.features)
        self.assertEquals(len(xs['Moment_noref_feature_1st_score']), 9)
        self.assertAlmostEquals(np.mean(xs['Moment_noref_feature_1st_score']),
                                111.59099599173773,
                                places=4)
        self.assertEquals(len(xs['Moment_noref_feature_var_score']), 9)
        self.assertAlmostEquals(np.mean(xs['Moment_noref_feature_var_score']),
                                1806.8620377229011,
                                places=4)

        ys = TrainTestModel.get_ys_from_results(self.features, [0, 1, 2])
        expected_ys = {
            'label': np.array([2.5, 3.9, 5.0]),
            'content_id': np.array([0, 1, 2])
        }
        self.assertTrue(all(ys['label'] == expected_ys['label']))
        self.assertTrue(all(ys['content_id'] == expected_ys['content_id']))
Example #7
    def test_train_save_load_predict(self):

        print "test train, save, load and predict..."

        xys = TrainTestModel.get_xys_from_dataframe(self.feature_df.iloc[:-50])
        xs = TrainTestModel.get_xs_from_dataframe(self.feature_df.iloc[-50:])
        ys = TrainTestModel.get_ys_from_dataframe(self.feature_df.iloc[-50:])

        model = RandomForestTrainTestModel(
            {
                'norm_type': 'normalize',
                'random_state': 0
            }, None)
        model.train(xys)

        model.to_file(self.model_filename)
        self.assertTrue(os.path.exists(self.model_filename))

        loaded_model = RandomForestTrainTestModel.from_file(
            self.model_filename, None)

        result = loaded_model.evaluate(xs, ys)
        self.assertAlmostEquals(result['RMSE'], 0.32357946626958406, places=4)

        model.delete(self.model_filename)
Example #8
    def test_train_save_load_predict_libsvmnusvr(self):

        print "test libsvmnusvr train, save, load and predict..."

        xys = TrainTestModel.get_xys_from_dataframe(self.feature_df.iloc[:-50])
        xs = TrainTestModel.get_xs_from_dataframe(self.feature_df.iloc[-50:])
        ys = TrainTestModel.get_ys_from_dataframe(self.feature_df.iloc[-50:])

        model = LibsvmnusvrTrainTestModel({'norm_type':'normalize'}, None)
        model.train(xys)

        model.to_file(self.model_filename)
        self.assertTrue(os.path.exists(self.model_filename))
        self.assertTrue(os.path.exists(self.model_filename + '.model'))

        loaded_model = LibsvmnusvrTrainTestModel.from_file(self.model_filename, None)

        result = model.evaluate(xs, ys)
        self.assertAlmostEquals(result['RMSE'], 0.30977055639849227)

        # loaded model generates slight numerical difference
        result = loaded_model.evaluate(xs, ys)
        self.assertAlmostEquals(result['RMSE'], 0.30977055639849227)

        model.delete(self.model_filename)
Example #9
def cv_on_dataset(dataset,
                  feature_param,
                  model_param,
                  ax,
                  result_store,
                  contentid_groups,
                  logger=None,
                  aggregate_method=np.mean):

    assets = read_dataset(dataset)
    kfold = construct_kfold_list(assets, contentid_groups)

    fassembler = FeatureAssembler(
        feature_dict=feature_param.feature_dict,
        feature_option_dict=None,
        assets=assets,
        logger=logger,
        delete_workdir=True,
        result_store=result_store,
        parallelize=True,
        fifo_mode=True,
        # parallelize=False, fifo_mode=False, # VQM
    )
    fassembler.run()
    results = fassembler.results

    for result in results:
        result.set_aggregate_method(aggregate_method)

    # run k-fold cross validation
    cv_output = ModelCrossValidation.run_kfold_cross_validation(
        TrainTestModel.find_subclass(model_param.model_type),
        model_param.model_param_dict,
        results,
        kfold,
        logger=logger,
    )

    print 'Feature parameters: {}'.format(feature_param.feature_dict)
    print 'Model type: {}'.format(model_param.model_type)
    print 'Model parameters: {}'.format(model_param.model_param_dict)
    print 'Stats: {}'.format(
        TrainTestModel.format_stats(cv_output['aggr_stats']))

    if ax is not None:
        TrainTestModel.plot_scatter(ax, cv_output['aggr_stats'],
                                    cv_output['contentids'])
        ax.set_xlabel('DMOS')
        ax.set_ylabel("Predicted Score")
        ax.grid()
        ax.set_title("Dataset: {dataset}, Model: {model},\n{stats}".format(
            dataset=dataset.dataset_name,
            model=model_param.model_type,
            stats=TrainTestModel.format_stats(cv_output['aggr_stats'])))

    return assets, cv_output
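
A sketch of how cv_on_dataset might be driven; the contentid_groups value and the dataset/feature_param/model_param objects are assumptions (in practice they come from config modules):

# Hypothetical driver for cv_on_dataset. Each inner list of contentid_groups
# names the content_ids that make up one cross-validation fold.
contentid_groups = [[0, 5], [1, 6], [2, 7], [3, 8], [4, 9]]
assets, cv_output = cv_on_dataset(
    dataset, feature_param, model_param,
    ax=None,            # pass a matplotlib axis to also get the scatter plot
    result_store=None,  # or a result store to cache feature extraction
    contentid_groups=contentid_groups,
)
print cv_output['aggr_stats']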
Example #10
    def test_read_xs_ys_from_dataframe(self):

        try:
            import pandas as pd
        except ImportError:
            print 'Warning: failed to import pandas. Skipping test.'
            return

        try:
            import numpy as np
        except ImportError:
            print 'Warning: failed to import numpy. Skipping test.'
            return

        feature_df_file = config.ROOT + "/python/test/resource/sample_feature_extraction_results.json"
        feature_df = pd.DataFrame.from_dict(eval(open(feature_df_file, "r").read()))

        xs = TrainTestModel.get_xs_from_dataframe(feature_df, [0, 1, 2])
        expected_xs = { 'ansnr_feat': np.array([46.364271863296779,
                                                42.517841772700201,
                                                35.967123359308225]),
                        'dlm_feat': np.array([ 1.,  1.,  1.]),
                        'ti_feat': np.array([12.632675462694392,
                                             3.7917434352421662,
                                             2.0189066771371684]),
                        'vif_feat': np.array([0.99999999995691546,
                                              0.99999999994743127,
                                              0.9999999999735345])}
        for key in xs: self.assertTrue(all(xs[key] == expected_xs[key]))

        xs = TrainTestModel.get_xs_from_dataframe(feature_df)
        for key in xs: self.assertEquals(len(xs[key]), 300)

        ys = TrainTestModel.get_ys_from_dataframe(feature_df, [0, 1, 2])
        expected_ys = {'label': np.array([4.5333333333333332,
                                          4.7000000000000002,
                                          4.4000000000000004]),
                       'content_id': np.array([0, 1, 10])}
        self.assertTrue(all(ys['label'] == expected_ys['label']))

        xys = TrainTestModel.get_xys_from_dataframe(feature_df, [0, 1, 2])
        expected_xys = { 'ansnr_feat': np.array([46.364271863296779,
                                                 42.517841772700201,
                                                 35.967123359308225]),
                         'dlm_feat': np.array([ 1.,  1.,  1.]),
                         'ti_feat': np.array([12.632675462694392,
                                             3.7917434352421662,
                                             2.0189066771371684]),
                         'vif_feat': np.array([0.99999999995691546,
                                              0.99999999994743127,
                                              0.9999999999735345]),
                         'label': np.array([4.5333333333333332,
                                            4.7000000000000002,
                                            4.4000000000000004]),
                         'content_id': np.array([0, 1, 10])}
        for key in xys: self.assertTrue(all(xys[key] == expected_xys[key]))
Example #11
    def test_get_xs_ys(self):
        xs = TrainTestModel.get_xs_from_dataframe(self.feature_df, [0, 1, 2])
        expected_xs = {
            'ansnr_feat':
            np.array(
                [46.364271863296779, 42.517841772700201, 35.967123359308225]),
            'dlm_feat':
            np.array([1., 1., 1.]),
            'ti_feat':
            np.array(
                [12.632675462694392, 3.7917434352421662, 2.0189066771371684]),
            'vif_feat':
            np.array(
                [0.99999999995691546, 0.99999999994743127, 0.9999999999735345])
        }
        for key in xs:
            self.assertTrue(all(xs[key] == expected_xs[key]))

        xs = TrainTestModel.get_xs_from_dataframe(self.feature_df)
        for key in xs:
            self.assertEquals(len(xs[key]), 300)

        ys = TrainTestModel.get_ys_from_dataframe(self.feature_df, [0, 1, 2])
        expected_ys = {
            'label':
            np.array(
                [4.5333333333333332, 4.7000000000000002, 4.4000000000000004]),
            'content_id':
            np.array([0, 1, 10])
        }
        self.assertTrue(all(ys['label'] == expected_ys['label']))

        xys = TrainTestModel.get_xys_from_dataframe(self.feature_df, [0, 1, 2])
        expected_xys = {
            'ansnr_feat':
            np.array(
                [46.364271863296779, 42.517841772700201, 35.967123359308225]),
            'dlm_feat':
            np.array([1., 1., 1.]),
            'ti_feat':
            np.array(
                [12.632675462694392, 3.7917434352421662, 2.0189066771371684]),
            'vif_feat':
            np.array(
                [0.99999999995691546, 0.99999999994743127,
                 0.9999999999735345]),
            'label':
            np.array(
                [4.5333333333333332, 4.7000000000000002, 4.4000000000000004]),
            'content_id':
            np.array([0, 1, 10])
        }
        for key in xys:
            self.assertTrue(all(xys[key] == expected_xys[key]))
Example #12
def cv_on_dataset(dataset, feature_param, model_param, ax, result_store,
                  contentid_groups, logger=None, aggregate_method=np.mean):

    assets = read_dataset(dataset)
    kfold = construct_kfold_list(assets, contentid_groups)

    fassembler = FeatureAssembler(
        feature_dict=feature_param.feature_dict,
        feature_option_dict=None,
        assets=assets,
        logger=logger,
        fifo_mode=True,
        delete_workdir=True,
        result_store=result_store,
        parallelize=False
    )
    fassembler.run()
    results = fassembler.results

    for result in results:
        result.set_aggregate_method(aggregate_method)

    # run k-fold cross validation
    cv_output = ModelCrossValidation.run_kfold_cross_validation(
        TrainTestModel.find_subclass(model_param.model_type),
        model_param.model_param_dict,
        results,
        kfold,
        logger=logger,
    )

    print 'Feature parameters: {}'.format(feature_param.feature_dict)
    print 'Model type: {}'.format(model_param.model_type)
    print 'Model parameters: {}'.format(model_param.model_param_dict)
    print 'Stats: {}'.format(TrainTestModel.format_stats(cv_output['aggr_stats']))

    if ax is not None:
        TrainTestModel.plot_scatter(ax, cv_output['aggr_stats'], cv_output['contentids'])
        ax.set_xlabel('DMOS')
        ax.set_ylabel("Predicted Score")
        ax.grid()
        ax.set_title( "Dataset: {dataset}, Model: {model},\n{stats}".format(
            dataset=dataset.dataset_name,
            model=model_param.model_type,
            stats=TrainTestModel.format_stats(cv_output['aggr_stats'])
        ))

    return assets, cv_output
Example #13
    def test_train_save_load_predict(self):

        xs = MomentRandomForestTrainTestModel.get_xs_from_results(
            self.features)
        ys = MomentRandomForestTrainTestModel.get_ys_from_results(
            self.features)
        xys = MomentRandomForestTrainTestModel.get_xys_from_results(
            self.features)

        # using dis_y only
        del xs['dis_u']
        del xs['dis_v']
        del xys['dis_u']
        del xys['dis_v']

        model = MomentRandomForestTrainTestModel({
            'norm_type': 'normalize',
            'random_state': 0
        })
        model.train(xys)

        model.to_file(self.model_filename)
        self.assertTrue(os.path.exists(self.model_filename))

        loaded_model = TrainTestModel.from_file(self.model_filename)

        result = loaded_model.evaluate(xs, ys)
        self.assertAlmostEquals(result['RMSE'], 0.17634739353518517, places=4)
Example #14
    def _run_on_asset(self, asset):
        # Override Executor._run_on_asset(self, asset): run a
        # FeatureAssembler, collect a feature vector, run
        # TrainTestModel.predict() on it, and return a Result object.
        # (In this case, both Executor._run_on_asset(self, asset) and
        # QualityRunner._read_result(self, asset) get bypassed.)

        vmaf_fassembler = self._get_vmaf_feature_assembler_instance(asset)
        vmaf_fassembler.run()
        feature_result = vmaf_fassembler.results[0]

        xs = TrainTestModel.get_perframe_xs_from_result(feature_result)

        model = self._load_model()

        ys_pred = model.predict(xs)

        # 'score_clip'
        ys_pred = self.clip_score(model, ys_pred)

        result_dict = {}
        # add all feature result
        result_dict.update(feature_result.result_dict)
        # add quality score
        result_dict[self.get_scores_key()] = ys_pred

        return Result(asset, self.executor_id, result_dict)
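
For orientation, a sketch of how the Result produced by this override is typically consumed; the runner construction arguments and names are assumptions following the other examples on this page:

# Hypothetical: run a QualityRunner subclass and read back the per-frame
# predicted scores stored under get_scores_key() by _run_on_asset above.
runner = VmafQualityRunner([asset], None, result_store=None)
runner.run()
result = runner.results[0]
print result[VmafQualityRunner.get_scores_key()]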
Example #15
    def test_train_predict_randomforest(self):

        print "test random forest train and predict..."

        # random forest doesn't need proper data normalization

        xys = TrainTestModel.get_xys_from_dataframe(self.feature_df.iloc[:-50])
        xs = TrainTestModel.get_xs_from_dataframe(self.feature_df.iloc[-50:])
        ys = TrainTestModel.get_ys_from_dataframe(self.feature_df.iloc[-50:])

        model = RandomForestTrainTestModel(
            {
                'norm_type': 'normalize',
                'random_state': 0
            }, None)
        model.train(xys)
        result = model.evaluate(xs, ys)
        self.assertAlmostEquals(result['RMSE'], 0.32357946626958406, places=4)

        model = RandomForestTrainTestModel(
            {
                'norm_type': 'clip_0to1',
                'random_state': 0
            }, None)
        model.train(xys)
        result = model.evaluate(xs, ys)
        self.assertAlmostEquals(result['RMSE'], 0.33807954580885896, places=4)

        model = RandomForestTrainTestModel(
            {
                'norm_type': 'clip_minus1to1',
                'random_state': 0
            }, None)
        model.train(xys)
        result = model.evaluate(xs, ys)
        self.assertAlmostEquals(result['RMSE'], 0.31798315556627982, places=4)

        model = RandomForestTrainTestModel(
            {
                'norm_type': 'none',
                'random_state': 0
            }, None)
        model.train(xys)
        result = model.evaluate(xs, ys)
        self.assertAlmostEquals(result['RMSE'], 0.33660273277405978, places=4)
Example #16
    def _load_model(self, asset):
        if self.optional_dict is not None \
                and 'model_filepath' in self.optional_dict \
                and self.optional_dict['model_filepath'] is not None:
            model_filepath = self.optional_dict['model_filepath']
        else:
            model_filepath = self.DEFAULT_MODEL_FILEPATH
        model = TrainTestModel.from_file(model_filepath, self.logger)
        return model
Example #17
    def _load_model(self):
        model_filepath = self.optional_dict['model_filepath'] \
            if (self.optional_dict is not None
                and 'model_filepath' in self.optional_dict
                and self.optional_dict['model_filepath'] is not None
                ) \
            else self.DEFAULT_MODEL_FILEPATH
        model = TrainTestModel.from_file(model_filepath, self.logger)
        return model
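
A sketch of the caller's side: pointing a runner at a non-default model file through optional_dict, which is exactly the key this method checks before falling back to DEFAULT_MODEL_FILEPATH (the path and constructor arguments are hypothetical):

# Hypothetical: override the default model for one run via optional_dict.
runner = VmafQualityRunner(
    assets, None,
    result_store=None,
    optional_dict={'model_filepath': '/path/to/custom_model.pkl'},
)
runner.run()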
Example #18
    def test_get_xs_ys(self):
        xs = TrainTestModel.get_xs_from_results(self.features, [0, 1, 2])

        self.assertEquals(len(xs['Moment_noref_feature_1st_score']), 3)
        self.assertAlmostEquals(np.mean(xs['Moment_noref_feature_1st_score']), 128.26146851380497, places=4)
        self.assertEquals(len(xs['Moment_noref_feature_var_score']), 3)
        self.assertAlmostEquals(np.mean(xs['Moment_noref_feature_var_score']), 1569.2395085695462, places=4)

        xs = TrainTestModel.get_xs_from_results(self.features)
        self.assertEquals(len(xs['Moment_noref_feature_1st_score']), 9)
        self.assertAlmostEquals(np.mean(xs['Moment_noref_feature_1st_score']), 111.59099599173773, places=4)
        self.assertEquals(len(xs['Moment_noref_feature_var_score']), 9)
        self.assertAlmostEquals(np.mean(xs['Moment_noref_feature_var_score']), 1806.8620377229011, places=4)

        ys = TrainTestModel.get_ys_from_results(self.features, [0, 1, 2])
        expected_ys = {'label': np.array([2.5, 3.9, 5.0]),
                       'content_id': np.array([0, 1, 2])}
        self.assertTrue(all(ys['label'] == expected_ys['label']))
        self.assertTrue(all(ys['content_id'] == expected_ys['content_id']))
Example #19
    def test_train_save_load_predict(self):

        print "test train, save, load and predict..."

        xys = TrainTestModel.get_xys_from_dataframe(self.feature_df.iloc[:-50])
        xs = TrainTestModel.get_xs_from_dataframe(self.feature_df.iloc[-50:])
        ys = TrainTestModel.get_ys_from_dataframe(self.feature_df.iloc[-50:])

        model = RandomForestTrainTestModel({'norm_type':'normalize', 'random_state':0}, None)
        model.train(xys)

        model.to_file(self.model_filename)
        self.assertTrue(os.path.exists(self.model_filename))

        loaded_model = RandomForestTrainTestModel.from_file(self.model_filename, None)

        result = loaded_model.evaluate(xs, ys)
        self.assertAlmostEquals(result['RMSE'], 0.32357946626958406)

        model.delete(self.model_filename)
Example #20
    def test_get_xs_ys(self):
        xs = TrainTestModel.get_xs_from_dataframe(self.feature_df, [0, 1, 2])
        expected_xs = { 'ansnr_feat': np.array([46.364271863296779,
                                                42.517841772700201,
                                                35.967123359308225]),
                        'dlm_feat': np.array([ 1.,  1.,  1.]),
                        'ti_feat': np.array([12.632675462694392,
                                             3.7917434352421662,
                                             2.0189066771371684]),
                        'vif_feat': np.array([0.99999999995691546,
                                              0.99999999994743127,
                                              0.9999999999735345])}
        for key in xs: self.assertTrue(all(xs[key] == expected_xs[key]))

        xs = TrainTestModel.get_xs_from_dataframe(self.feature_df)
        for key in xs: self.assertEquals(len(xs[key]), 300)

        ys = TrainTestModel.get_ys_from_dataframe(self.feature_df, [0, 1, 2])
        expected_ys = {'label': np.array([4.5333333333333332,
                                          4.7000000000000002,
                                          4.4000000000000004]),
                       'content_id': np.array([0, 1, 10])}
        self.assertTrue(all(ys['label'] == expected_ys['label']))

        xys = TrainTestModel.get_xys_from_dataframe(self.feature_df, [0, 1, 2])
        expected_xys = { 'ansnr_feat': np.array([46.364271863296779,
                                                 42.517841772700201,
                                                 35.967123359308225]),
                         'dlm_feat': np.array([ 1.,  1.,  1.]),
                         'ti_feat': np.array([12.632675462694392,
                                             3.7917434352421662,
                                             2.0189066771371684]),
                         'vif_feat': np.array([0.99999999995691546,
                                              0.99999999994743127,
                                              0.9999999999735345]),
                         'label': np.array([4.5333333333333332,
                                            4.7000000000000002,
                                            4.4000000000000004]),
                         'content_id': np.array([0, 1, 10])}
        for key in xys: self.assertTrue(all(xys[key] == expected_xys[key]))
Example #21
    def test_train_save_load_predict(self):

        xs = MomentRandomForestTrainTestModel.get_xs_from_results(self.features)
        ys = MomentRandomForestTrainTestModel.get_ys_from_results(self.features)
        xys = MomentRandomForestTrainTestModel.get_xys_from_results(self.features)

        # using dis_y only
        del xs['dis_u']
        del xs['dis_v']
        del xys['dis_u']
        del xys['dis_v']

        model = MomentRandomForestTrainTestModel({'norm_type':'normalize', 'random_state':0})
        model.train(xys)

        model.to_file(self.model_filename)
        self.assertTrue(os.path.exists(self.model_filename))

        loaded_model = TrainTestModel.from_file(self.model_filename)

        result = loaded_model.evaluate(xs, ys)
        self.assertAlmostEquals(result['RMSE'], 0.17634739353518517, places=4)
Example #22
def test_on_dataset(test_dataset, runner_class, ax,
                    result_store, model_filepath,
                    parallelize=True, fifo_mode=True,
                    aggregate_method=np.mean):

    test_assets = read_dataset(test_dataset)

    optional_dict = {
        'model_filepath':model_filepath
    }

    # construct a quality runner object to validate the assets only
    runner = runner_class(test_assets,
                          None,
                          fifo_mode=fifo_mode,
                          delete_workdir=True,
                          result_store=result_store,
                          optional_dict=optional_dict,
                          )
    results = None  # ensure 'results' is defined if the run below raises
    try:
        # run
        _, results = run_executors_in_parallel(
            runner_class,
            test_assets,
            fifo_mode=fifo_mode,
            delete_workdir=True,
            parallelize=parallelize,
            result_store=result_store,
            optional_dict=optional_dict,
        )

        for result in results:
            result.set_aggregate_method(aggregate_method)

        # plot
        groundtruths = map(lambda asset: asset.groundtruth, test_assets)
        predictions = map(lambda result: result[runner_class.get_score_key()], results)
        stats = TrainTestModel.get_stats(groundtruths, predictions)

        print 'Stats on testing data: {}'.format(TrainTestModel.format_stats(stats))

        if ax is not None:
            content_ids = map(lambda asset: asset.content_id, test_assets)
            TrainTestModel.plot_scatter(ax, stats, content_ids)
            ax.set_xlabel('DMOS')
            ax.set_ylabel("Predicted Score")
            ax.grid()
            # ax.set_title( "Dataset: {dataset}, Runner: {runner}\n{stats}".format(
            ax.set_title( "{runner}\n{stats}".format(
                dataset=test_assets[0].dataset,
                # runner=results[0].executor_id,
                runner=runner_class.TYPE,
                stats=TrainTestModel.format_stats(stats),
                # stats="",
                # stats=TrainTestModel.format_stats3(stats),
            ))

    except Exception as e:
        print "Error: " + str(e)

    return test_assets, results
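
A hedged usage sketch for test_on_dataset; the test_dataset module is an assumption, and model_filepath=None falls back to the runner's default model, per the _load_model examples above:

# Hypothetical call of test_on_dataset on a dataset module.
test_assets, results = test_on_dataset(
    test_dataset, VmafQualityRunner,
    ax=None, result_store=None,
    model_filepath=None,   # None -> runner falls back to its default model
    parallelize=True)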
Example #23
def train_test_vmaf_on_dataset(train_dataset, test_dataset,
                               feature_param, model_param,
                               train_ax, test_ax, result_store,
                               parallelize=True, logger=None, fifo_mode=True,
                               output_model_filepath=None,
                               aggregate_method=np.mean,
                               **kwargs):

    train_assets = read_dataset(train_dataset, **kwargs)
    train_raw_assets = None
    try:
        for train_asset in train_assets:
            assert train_asset.groundtruth is not None
    except AssertionError:
        # no groundtruth; try to do subjective modeling
        subj_model_class = kwargs['subj_model_class'] \
            if 'subj_model_class' in kwargs and kwargs['subj_model_class'] is not None \
            else DmosModel
        subjective_model = subj_model_class(RawDatasetReader(train_dataset))
        subjective_model.run_modeling(**kwargs)
        train_dataset_aggregate = subjective_model.to_aggregated_dataset(**kwargs)
        train_raw_assets = train_assets
        train_assets = read_dataset(train_dataset_aggregate, **kwargs)

    train_fassembler = FeatureAssembler(
        feature_dict=feature_param.feature_dict,
        feature_option_dict=None,
        assets=train_assets,
        logger=logger,
        fifo_mode=fifo_mode,
        delete_workdir=True,
        result_store=result_store,
        optional_dict=None,
        optional_dict2=None,
        parallelize=parallelize,
    )
    train_fassembler.run()
    train_features = train_fassembler.results

    for result in train_features:
        result.set_score_aggregate_method(aggregate_method)

    model_type = model_param.model_type
    model_param_dict = model_param.model_param_dict

    model_class = TrainTestModel.find_subclass(model_type)

    train_xys = model_class.get_xys_from_results(train_features)
    train_xs = model_class.get_xs_from_results(train_features)
    train_ys = model_class.get_ys_from_results(train_features)

    model = model_class(model_param_dict, logger)

    model.train(train_xys)

    # append additional information to model before saving, so that
    # VmafQualityRunner can read and process
    model.append_info('feature_dict', feature_param.feature_dict)
    if 'score_clip' in model_param_dict:
        VmafQualityRunner.set_clip_score(model, model_param_dict['score_clip'])
    if 'score_transform' in model_param_dict:
        VmafQualityRunner.set_transform_score(model, model_param_dict['score_transform'])

    train_ys_pred = VmafQualityRunner.predict_with_model(model, train_xs, **kwargs)

    raw_groundtruths = None if train_raw_assets is None else \
        map(lambda asset: asset.raw_groundtruth, train_raw_assets)

    train_stats = model.get_stats(train_ys['label'], train_ys_pred,
                                  ys_label_raw=raw_groundtruths)

    log = 'Stats on training data: {}'.format(model.format_stats(train_stats))
    if logger:
        logger.info(log)
    else:
        print log

    # save model
    if output_model_filepath is not None:
        model.to_file(output_model_filepath)

    if train_ax is not None:
        train_content_ids = map(lambda asset: asset.content_id, train_assets)
        model_class.plot_scatter(train_ax, train_stats, train_content_ids)
        train_ax.set_xlabel('True Score')
        train_ax.set_ylabel("Predicted Score")
        train_ax.grid()
        train_ax.set_title( "Dataset: {dataset}, Model: {model}\n{stats}".format(
            dataset=train_dataset.dataset_name,
            model=model.model_id,
            stats=model_class.format_stats(train_stats)
        ))

    # === test model on test dataset ===

    if test_dataset is None:
        test_assets = None
        test_stats = None
        test_fassembler = None
    else:
        test_assets = read_dataset(test_dataset, **kwargs)
        test_raw_assets = None
        try:
            for test_asset in test_assets:
                assert test_asset.groundtruth is not None
        except AssertionError:
            # no groundtruth; try to do subjective modeling
            subj_model_class = kwargs['subj_model_class'] \
                if 'subj_model_class' in kwargs and kwargs['subj_model_class'] is not None \
                else DmosModel
            subjective_model = subj_model_class(RawDatasetReader(test_dataset))
            subjective_model.run_modeling(**kwargs)
            test_dataset_aggregate = subjective_model.to_aggregated_dataset(**kwargs)
            test_raw_assets = test_assets
            test_assets = read_dataset(test_dataset_aggregate, **kwargs)

        test_fassembler = FeatureAssembler(
            feature_dict=feature_param.feature_dict,
            feature_option_dict=None,
            assets=test_assets,
            logger=logger,
            fifo_mode=fifo_mode,
            delete_workdir=True,
            result_store=result_store,
            optional_dict=None,
            optional_dict2=None,
            parallelize=True,
        )
        test_fassembler.run()
        test_features = test_fassembler.results

        for result in test_features:
            result.set_score_aggregate_method(aggregate_method)

        test_xs = model_class.get_xs_from_results(test_features)
        test_ys = model_class.get_ys_from_results(test_features)

        test_ys_pred = VmafQualityRunner.predict_with_model(model, test_xs, **kwargs)

        raw_groundtruths = None if test_raw_assets is None else \
            map(lambda asset: asset.raw_groundtruth, test_raw_assets)

        test_stats = model_class.get_stats(test_ys['label'], test_ys_pred,
                                           ys_label_raw=raw_groundtruths)

        log = 'Stats on testing data: {}'.format(model_class.format_stats(test_stats))
        if logger:
            logger.info(log)
        else:
            print log

        if test_ax is not None:
            test_content_ids = map(lambda asset: asset.content_id, test_assets)
            model_class.plot_scatter(test_ax, test_stats, test_content_ids)
            test_ax.set_xlabel('True Score')
            test_ax.set_ylabel("Predicted Score")
            test_ax.grid()
            test_ax.set_title( "Dataset: {dataset}, Model: {model}\n{stats}".format(
                dataset=test_dataset.dataset_name,
                model=model.model_id,
                stats=model_class.format_stats(test_stats)
            ))

    return train_fassembler, train_assets, train_stats, \
           test_fassembler, test_assets, test_stats, model
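
A sketch of an end-to-end call of train_test_vmaf_on_dataset; the dataset modules, parameter objects, and output path are assumptions for illustration:

# Hypothetical end-to-end train/test run, mirroring the signature above.
(train_fassembler, train_assets, train_stats,
 test_fassembler, test_assets, test_stats, model) = train_test_vmaf_on_dataset(
    train_dataset, test_dataset, feature_param, model_param,
    train_ax=None, test_ax=None, result_store=None,
    parallelize=True,
    output_model_filepath='workspace/model/my_vmaf.pkl',  # hypothetical path
)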
Example #24
def train_test_on_dataset(train_dataset, test_dataset,
                          feature_param, model_param,
                          train_ax, test_ax, result_store,
                          parallelize=True, logger=None, fifo_mode=True,
                          output_model_filepath=None):

    train_assets = read_dataset(train_dataset)
    train_fassembler = FeatureAssembler(
        feature_dict=feature_param.feature_dict,
        feature_option_dict=None,
        assets=train_assets,
        logger=logger,
        fifo_mode=fifo_mode,
        delete_workdir=True,
        result_store=result_store,
        parallelize=parallelize,
    )
    train_fassembler.run()
    train_features = train_fassembler.results

    train_xys = TrainTestModel.get_xys_from_results(train_features)
    train_xs = TrainTestModel.get_xs_from_results(train_features)
    train_ys = TrainTestModel.get_ys_from_results(train_features)

    model_type = model_param.model_type
    model_param_dict = model_param.model_param_dict

    model_class = TrainTestModel.find_subclass(model_type)
    model = model_class(model_param_dict, logger)

    model.train(train_xys)

    # append additional information to model before saving, so that
    # VmafQualityRunner can read and process
    model.append_info('feature_dict', feature_param.feature_dict)
    if 'score_clip' in model_param_dict:
        VmafQualityRunner.set_clip_score(model, model_param_dict['score_clip'])

    train_ys_pred = model.predict(train_xs)

    # apply instructions indicated in the appended info
    train_ys_pred = VmafQualityRunner.clip_score(model, train_ys_pred)

    train_stats = TrainTestModel.get_stats(train_ys['label'], train_ys_pred)

    if logger:
        logger.info('Stats on training data: {}'.format(
            TrainTestModel.format_stats(train_stats)))

    # save model
    if output_model_filepath is not None:
        model.to_file(output_model_filepath)

    if train_ax is not None:
        train_content_ids = map(lambda asset: asset.content_id, train_assets)
        TrainTestModel.plot_scatter(train_ax, train_stats, train_content_ids)
        train_ax.set_xlabel('DMOS')
        train_ax.set_ylabel("Predicted Score")
        train_ax.grid()
        train_ax.set_title( "Dataset: {dataset}, Model: {model}\n{stats}".format(
            dataset=train_dataset.dataset_name,
            model=model.model_id,
            stats=TrainTestModel.format_stats(train_stats)
        ))

    # === test model on test dataset ===

    if test_dataset is None:
        test_assets = None
        test_stats = None
        test_fassembler = None
    else:
        test_assets = read_dataset(test_dataset)
        test_fassembler = FeatureAssembler(
            feature_dict=feature_param.feature_dict,
            feature_option_dict=None,
            assets=test_assets,
            logger=logger,
            fifo_mode=fifo_mode,
            delete_workdir=True,
            result_store=result_store,
            parallelize=True,
        )
        test_fassembler.run()
        test_features = test_fassembler.results

        test_xs = TrainTestModel.get_xs_from_results(test_features)
        test_ys = TrainTestModel.get_ys_from_results(test_features)

        test_ys_pred = model.predict(test_xs)

        # apply instructions indicated in the appended info
        test_ys_pred = VmafQualityRunner.clip_score(model, test_ys_pred)

        test_stats = TrainTestModel.get_stats(test_ys['label'], test_ys_pred)

        if logger:
            logger.info('Stats on testing data: {}'.format(
                TrainTestModel.format_stats(test_stats)))

        if test_ax is not None:
            test_content_ids = map(lambda asset: asset.content_id, test_assets)
            TrainTestModel.plot_scatter(test_ax, test_stats, test_content_ids)
            test_ax.set_xlabel('DMOS')
            test_ax.set_ylabel("Predicted Score")
            test_ax.grid()
            test_ax.set_title( "Dataset: {dataset}, Model: {model}\n{stats}".format(
                dataset=test_dataset.dataset_name,
                model=model.model_id,
                stats=TrainTestModel.format_stats(test_stats)
            ))

    return train_fassembler, train_assets, train_stats, \
           test_fassembler, test_assets, test_stats
Example #25
def train_test_vmaf_on_dataset(
    train_dataset,
    test_dataset,
    feature_param,
    model_param,
    train_ax,
    test_ax,
    result_store,
    parallelize=True,
    logger=None,
    fifo_mode=True,
    output_model_filepath=None,
    aggregate_method=np.mean,
    **kwargs
):

    train_assets = read_dataset(train_dataset, **kwargs)
    train_raw_assets = None
    try:
        for train_asset in train_assets:
            assert train_asset.groundtruth is not None
    except AssertionError:
        # no groundtruth; try to do subjective modeling
        subj_model_class = (
            kwargs["subj_model_class"]
            if "subj_model_class" in kwargs and kwargs["subj_model_class"] is not None
            else DmosModel
        )
        subjective_model = subj_model_class(RawDatasetReader(train_dataset))
        subjective_model.run_modeling(**kwargs)
        train_dataset_aggregate = subjective_model.to_aggregated_dataset(**kwargs)
        train_raw_assets = train_assets
        train_assets = read_dataset(train_dataset_aggregate, **kwargs)

    train_fassembler = FeatureAssembler(
        feature_dict=feature_param.feature_dict,
        feature_option_dict=None,
        assets=train_assets,
        logger=logger,
        fifo_mode=fifo_mode,
        delete_workdir=True,
        result_store=result_store,
        optional_dict=None,
        optional_dict2=None,
        parallelize=parallelize,
    )
    train_fassembler.run()
    train_features = train_fassembler.results

    for result in train_features:
        result.set_score_aggregate_method(aggregate_method)

    model_type = model_param.model_type
    model_param_dict = model_param.model_param_dict

    model_class = TrainTestModel.find_subclass(model_type)

    train_xys = model_class.get_xys_from_results(train_features)
    train_xs = model_class.get_xs_from_results(train_features)
    train_ys = model_class.get_ys_from_results(train_features)

    model = model_class(model_param_dict, logger)

    model.train(train_xys)

    # append additional information to model before saving, so that
    # VmafQualityRunner can read and process
    model.append_info("feature_dict", feature_param.feature_dict)
    if "score_clip" in model_param_dict:
        VmafQualityRunner.set_clip_score(model, model_param_dict["score_clip"])

    train_ys_pred = VmafQualityRunner.predict_with_model(model, train_xs, **kwargs)

    raw_groundtruths = None if train_raw_assets is None else map(lambda asset: asset.raw_groundtruth, train_raw_assets)

    train_stats = model.get_stats(train_ys["label"], train_ys_pred, ys_label_raw=raw_groundtruths)

    log = "Stats on training data: {}".format(model.format_stats(train_stats))
    if logger:
        logger.info(log)
    else:
        print log

    # save model
    if output_model_filepath is not None:
        model.to_file(output_model_filepath)

    if train_ax is not None:
        train_content_ids = map(lambda asset: asset.content_id, train_assets)
        model_class.plot_scatter(train_ax, train_stats, train_content_ids)
        train_ax.set_xlabel("True Score")
        train_ax.set_ylabel("Predicted Score")
        train_ax.grid()
        train_ax.set_title(
            "Dataset: {dataset}, Model: {model}\n{stats}".format(
                dataset=train_dataset.dataset_name, model=model.model_id, stats=model_class.format_stats(train_stats)
            )
        )

    # === test model on test dataset ===

    if test_dataset is None:
        test_assets = None
        test_stats = None
        test_fassembler = None
    else:
        test_assets = read_dataset(test_dataset, **kwargs)
        test_raw_assets = None
        try:
            for test_asset in test_assets:
                assert test_asset.groundtruth is not None
        except AssertionError:
            # no groundtruth; try to do subjective modeling
            subj_model_class = (
                kwargs["subj_model_class"]
                if "subj_model_class" in kwargs and kwargs["subj_model_class"] is not None
                else DmosModel
            )
            subjective_model = subj_model_class(RawDatasetReader(test_dataset))
            subjective_model.run_modeling(**kwargs)
            test_dataset_aggregate = subjective_model.to_aggregated_dataset(**kwargs)
            test_raw_assets = test_assets
            test_assets = read_dataset(test_dataset_aggregate, **kwargs)

        test_fassembler = FeatureAssembler(
            feature_dict=feature_param.feature_dict,
            feature_option_dict=None,
            assets=test_assets,
            logger=logger,
            fifo_mode=fifo_mode,
            delete_workdir=True,
            result_store=result_store,
            optional_dict=None,
            optional_dict2=None,
            parallelize=True,
        )
        test_fassembler.run()
        test_features = test_fassembler.results

        for result in test_features:
            result.set_score_aggregate_method(aggregate_method)

        test_xs = model_class.get_xs_from_results(test_features)
        test_ys = model_class.get_ys_from_results(test_features)

        test_ys_pred = VmafQualityRunner.predict_with_model(model, test_xs, **kwargs)

        raw_groundtruths = (
            None if test_raw_assets is None else map(lambda asset: asset.raw_groundtruth, test_raw_assets)
        )

        test_stats = model_class.get_stats(test_ys["label"], test_ys_pred, ys_label_raw=raw_groundtruths)

        log = "Stats on testing data: {}".format(model_class.format_stats(test_stats))
        if logger:
            logger.info(log)
        else:
            print log

        if test_ax is not None:
            test_content_ids = map(lambda asset: asset.content_id, test_assets)
            model_class.plot_scatter(test_ax, test_stats, test_content_ids)
            test_ax.set_xlabel("True Score")
            test_ax.set_ylabel("Predicted Score")
            test_ax.grid()
            test_ax.set_title(
                "Dataset: {dataset}, Model: {model}\n{stats}".format(
                    dataset=test_dataset.dataset_name, model=model.model_id, stats=model_class.format_stats(test_stats)
                )
            )

    return train_fassembler, train_assets, train_stats, test_fassembler, test_assets, test_stats, model
Example #26
    def run_nested_kfold_cross_validation(cls,
                                          train_test_model_class,
                                          model_param_search_range,
                                          results_or_df,
                                          kfold,
                                          search_strategy='grid',
                                          random_search_times=100,
                                          logger=None):
        """
        Nested k-fold cross validation, given hyper-parameter search range. The
        search range is specified in the format of, e.g.:
        {'norm_type':['normalize', 'clip_0to1', 'clip_minus1to1'],
         'n_estimators':[10, 50],
         'random_state': [0]}
        :param train_test_model_class: subclass of TrainTestModel to train
        :param model_param_search_range: dict of hyper-parameter search lists,
        in the format shown above
        :param results_or_df: list of BasicResult, or pandas.DataFrame
        :param kfold: if it is an integer, it is the number of folds; if it is
        a list of index lists, each inner list contains the row indices of the
        dataframe selected as one fold
        :param search_strategy: either 'grid' or 'random'
        :return: output dict with aggregate stats, the most frequently chosen
        model parameters, and per-fold stats
        """

        if isinstance(kfold, (int, long)):
            kfold_type = 'int'
        elif isinstance(kfold, (list, tuple)):
            kfold_type = 'list'
        else:
            assert False, 'kfold must be either a list of lists or an integer.'

        # if input is integer (e.g. 4), reconstruct kfold in list of indices
        # format
        if kfold_type == 'int':
            num_fold = kfold
            dataframe_size = len(results_or_df)
            fold_size = int(floor(dataframe_size / num_fold))
            kfold = []
            for fold in range(num_fold):
                index_start = fold * fold_size
                index_end = min((fold+1)*fold_size, dataframe_size)
                kfold.append(range(index_start, index_end))

        assert len(kfold) >= 3, 'kfold list must have length >= 3 for nested ' \
                                'k-fold cross validation.'

        if search_strategy == 'grid':
            cls._assert_grid_search(model_param_search_range)
            list_model_param = cls._unroll_dict_of_lists(
                model_param_search_range)
        elif search_strategy == 'random':
            cls._assert_random_search(model_param_search_range)
            list_model_param = cls._sample_model_param_list(
                model_param_search_range, random_search_times)
        else:
            assert False, "Unknown search_strategy: {}".format(search_strategy)

        statss = []
        model_params = []
        contentids = []

        for fold in range(len(kfold)):

            if logger: logger.info("Fold {}...".format(fold))

            test_index_range = kfold[fold]
            train_index_range = []
            # train_index_range_in_list_of_indices is a list of lists
            train_index_range_in_list_of_indices = []
            for train_fold in range(len(kfold)):
                if train_fold != fold:
                    train_index_range += kfold[train_fold]
                    train_index_range_in_list_of_indices.append(kfold[train_fold])

            # iterate through all possible combinations of model_params
            best_model_param = None
            best_stats = None
            for model_param in list_model_param:

                if logger: logger.info("\tModel parameter: {}".format(model_param))

                output = \
                    cls.run_kfold_cross_validation(train_test_model_class,
                                                   model_param,
                                                   results_or_df,
                                                   train_index_range_in_list_of_indices)
                stats = output['aggr_stats']

                if (best_stats is None) or (
                    TrainTestModel.get_objective_score(stats, type='SRCC')
                    >
                    TrainTestModel.get_objective_score(best_stats, type='SRCC')
                ):
                    best_stats = stats
                    best_model_param = model_param

            # run cross validation based on best model parameters
            output_ = cls.run_cross_validation(train_test_model_class,
                                               best_model_param,
                                               results_or_df,
                                               train_index_range,
                                               test_index_range)
            stats_ = output_['stats']

            statss.append(stats_)
            model_params.append(best_model_param)

            contentids += list(output_['contentids'])

        aggr_stats = TrainTestModel.aggregate_stats_list(statss)
        top_model_param, count = cls._find_most_frequent_dict(model_params)

        assert contentids is not None
        output__ = {
            'aggr_stats': aggr_stats,
            'top_model_param': top_model_param,
            'top_ratio': float(count) / len(model_params),
            'statss': statss,
            'model_params': model_params,
            'contentids': contentids,
        }

        return output__
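
A sketch of a nested k-fold run with a grid search range, following the docstring's format; the model class and the results variable are assumptions following the other examples:

# Hypothetical: grid-search hyper-parameters with nested k-fold CV.
model_param_search_range = {
    'norm_type': ['normalize', 'clip_0to1', 'clip_minus1to1'],
    'n_estimators': [10, 50],
    'random_state': [0],
}
output = ModelCrossValidation.run_nested_kfold_cross_validation(
    RandomForestTrainTestModel,
    model_param_search_range,
    results,  # list of BasicResult, or pandas.DataFrame
    kfold=3,  # or an explicit list of per-fold row-index lists
)
print output['top_model_param'], output['top_ratio']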
Example #27
    def run_kfold_cross_validation(cls,
                                   train_test_model_class,
                                   model_param,
                                   results_or_df,
                                   kfold,
                                   logger=None):
        """
        Standard k-fold cross validation, given hyper-parameter set model_param
        :param train_test_model_class: subclass of TrainTestModel to train
        :param model_param: dict of model hyper-parameters
        :param results_or_df: list of BasicResult, or pandas.DataFrame
        :param kfold: if it is an integer, it is the number of folds; if it is
        a list of index lists, each inner list contains the row indices of the
        dataframe selected as one fold
        :return: output dict with aggregate stats, per-fold stats and models,
        and content ids
        """

        if isinstance(kfold, (int, long)):
            kfold_type = 'int'
        elif isinstance(kfold, (list, tuple)):
            kfold_type = 'list'
        else:
            assert False, 'kfold must be either a list of lists or an integer.'

        # if input is integer (e.g. 4), reconstruct kfold in list of indices
        # format
        if kfold_type == 'int':
            num_fold = kfold
            dataframe_size = len(results_or_df)
            fold_size = int(floor(dataframe_size / num_fold))
            kfold = []
            for fold in range(num_fold):
                index_start = fold * fold_size
                index_end = min((fold+1)*fold_size, dataframe_size)
                kfold.append(range(index_start, index_end))

        assert len(kfold) >= 2, 'kfold list must have length >= 2 for k-fold ' \
                                'cross validation.'

        statss = []
        models = []
        contentids = []

        for fold in range(len(kfold)):

            if logger: logger.info("Fold {}...".format(fold))

            test_index_range = kfold[fold]
            train_index_range = []
            for train_fold in range(len(kfold)):
                if train_fold != fold:
                    train_index_range += kfold[train_fold]

            output = cls.run_cross_validation(train_test_model_class,
                                              model_param,
                                              results_or_df,
                                              train_index_range,
                                              test_index_range)

            stats = output['stats']
            model = output['model']

            statss.append(stats)
            models.append(model)

            contentids += list(output['contentids'])

        aggr_stats = TrainTestModel.aggregate_stats_list(statss)

        output = {}
        output['aggr_stats'] = aggr_stats
        output['statss'] = statss
        output['models'] = models

        assert contentids is not None
        output['contentids'] = contentids

        return output
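
Both accepted forms of kfold can be exercised as in the sketch below. This is a hypothetical usage example: ModelCrossValidation stands in for the class that owns this classmethod, and feature_df is assumed to be a pandas.DataFrame of extracted features like the one used in the tests.

    # Form 1: kfold as an integer -- rows are split into 4 contiguous folds.
    output = ModelCrossValidation.run_kfold_cross_validation(
        LibsvmnusvrTrainTestModel, {'norm_type': 'normalize'}, feature_df, 4)

    # Form 2: kfold as an explicit list of lists -- each inner list holds the
    # row indices of one fold, e.g. to keep rows of the same content together.
    folds = [range(0, 100), range(100, 200), range(200, 300)]
    output = ModelCrossValidation.run_kfold_cross_validation(
        LibsvmnusvrTrainTestModel, {'norm_type': 'normalize'}, feature_df,
        folds)

    print output['aggr_stats']
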
Example #31
    def test_read_xs_ys_from_dataframe(self):

        try:
            import pandas as pd
        except ImportError:
            print 'Warning: failed to import pandas. Skipping test.'
            return

        try:
            import numpy as np
        except ImportError:
            print 'Warning: failed to import numpy. Skipping test.'
            return

        feature_df_file = config.ROOT + "/python/test/resource/sample_feature_extraction_results.json"
        with open(feature_df_file, "r") as f:  # close the file deterministically
            feature_df = pd.DataFrame.from_dict(eval(f.read()))

        xs = TrainTestModel.get_xs_from_dataframe(feature_df, [0, 1, 2])
        expected_xs = {
            'ansnr_feat':
            np.array(
                [46.364271863296779, 42.517841772700201, 35.967123359308225]),
            'dlm_feat':
            np.array([1., 1., 1.]),
            'ti_feat':
            np.array(
                [12.632675462694392, 3.7917434352421662, 2.0189066771371684]),
            'vif_feat':
            np.array(
                [0.99999999995691546, 0.99999999994743127, 0.9999999999735345])
        }
        for key in xs:
            self.assertTrue(all(xs[key] == expected_xs[key]))

        xs = TrainTestModel.get_xs_from_dataframe(feature_df)
        for key in xs:
            self.assertEquals(len(xs[key]), 300)

        ys = TrainTestModel.get_ys_from_dataframe(feature_df, [0, 1, 2])
        expected_ys = {
            'label':
            np.array(
                [4.5333333333333332, 4.7000000000000002, 4.4000000000000004]),
            'content_id':
            np.array([0, 1, 10])
        }
        self.assertTrue(all(ys['label'] == expected_ys['label']))

        xys = TrainTestModel.get_xys_from_dataframe(feature_df, [0, 1, 2])
        expected_xys = {
            'ansnr_feat':
            np.array(
                [46.364271863296779, 42.517841772700201, 35.967123359308225]),
            'dlm_feat':
            np.array([1., 1., 1.]),
            'ti_feat':
            np.array(
                [12.632675462694392, 3.7917434352421662, 2.0189066771371684]),
            'vif_feat':
            np.array(
                [0.99999999995691546, 0.99999999994743127,
                 0.9999999999735345]),
            'label':
            np.array(
                [4.5333333333333332, 4.7000000000000002, 4.4000000000000004]),
            'content_id':
            np.array([0, 1, 10])
        }
        for key in xys:
            self.assertTrue(all(xys[key] == expected_xys[key]))
Example #32
def train_test_on_dataset(train_dataset,
                          test_dataset,
                          feature_param,
                          model_param,
                          train_ax,
                          test_ax,
                          result_store,
                          parallelize=True,
                          logger=None,
                          fifo_mode=True,
                          output_model_filepath=None):

    train_assets = read_dataset(train_dataset)
    train_fassembler = FeatureAssembler(
        feature_dict=feature_param.feature_dict,
        feature_option_dict=None,
        assets=train_assets,
        logger=logger,
        fifo_mode=fifo_mode,
        delete_workdir=True,
        result_store=result_store,
        parallelize=parallelize,
    )
    train_fassembler.run()
    train_features = train_fassembler.results

    train_xys = TrainTestModel.get_xys_from_results(train_features)
    train_xs = TrainTestModel.get_xs_from_results(train_features)
    train_ys = TrainTestModel.get_ys_from_results(train_features)

    model_type = model_param.model_type
    model_param_dict = model_param.model_param_dict

    model_class = TrainTestModel.find_subclass(model_type)
    model = model_class(model_param_dict, logger)

    model.train(train_xys)

    # append additional information to the model before saving, so that
    # VmafQualityRunner can read and process it
    model.append_info('feature_dict', feature_param.feature_dict)
    if 'score_clip' in model_param_dict:
        VmafQualityRunner.set_clip_score(model, model_param_dict['score_clip'])

    train_ys_pred = model.predict(train_xs)

    # apply instructions indicated in the appended info
    train_ys_pred = VmafQualityRunner.clip_score(model, train_ys_pred)

    train_stats = TrainTestModel.get_stats(train_ys['label'], train_ys_pred)

    if logger:
        logger.info('Stats on training data: {}'.format(
            TrainTestModel.format_stats(train_stats)))

    # save model
    if output_model_filepath is not None:
        model.to_file(output_model_filepath)

    if train_ax is not None:
        train_content_ids = map(lambda asset: asset.content_id, train_assets)
        TrainTestModel.plot_scatter(train_ax, train_stats, train_content_ids)
        train_ax.set_xlabel('DMOS')
        train_ax.set_ylabel("Predicted Score")
        train_ax.grid()
        train_ax.set_title(
            "Dataset: {dataset}, Model: {model}\n{stats}".format(
                dataset=train_dataset.dataset_name,
                model=model.model_id,
                stats=TrainTestModel.format_stats(train_stats)))

    # === test model on test dataset ===

    if test_dataset is None:
        test_assets = None
        test_stats = None
        test_fassembler = None
    else:
        test_assets = read_dataset(test_dataset)
        test_fassembler = FeatureAssembler(
            feature_dict=feature_param.feature_dict,
            feature_option_dict=None,
            assets=test_assets,
            logger=logger,
            fifo_mode=fifo_mode,
            delete_workdir=True,
            result_store=result_store,
            parallelize=parallelize,  # was hardcoded to True; honor the caller's setting
        )
        test_fassembler.run()
        test_features = test_fassembler.results

        test_xs = TrainTestModel.get_xs_from_results(test_features)
        test_ys = TrainTestModel.get_ys_from_results(test_features)

        test_ys_pred = model.predict(test_xs)

        # apply instructions indicated in the appended info
        test_ys_pred = VmafQualityRunner.clip_score(model, test_ys_pred)

        test_stats = TrainTestModel.get_stats(test_ys['label'], test_ys_pred)

        if logger:
            logger.info('Stats on testing data: {}'.format(
                TrainTestModel.format_stats(test_stats)))

        if test_ax is not None:
            test_content_ids = map(lambda asset: asset.content_id, test_assets)
            TrainTestModel.plot_scatter(test_ax, test_stats, test_content_ids)
            test_ax.set_xlabel('DMOS')
            test_ax.set_ylabel("Predicted Score")
            test_ax.grid()
            test_ax.set_title(
                "Dataset: {dataset}, Model: {model}\n{stats}".format(
                    dataset=test_dataset.dataset_name,
                    model=model.model_id,
                    stats=TrainTestModel.format_stats(test_stats)))

    return train_fassembler, train_assets, train_stats, \
           test_fassembler, test_assets, test_stats
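
A hypothetical end-to-end call could look like the following sketch. The train_dataset and test_dataset modules, feature_param, and model_param are assumed to be loaded elsewhere (the function only requires that they expose the attributes read above), and the output path is a placeholder.

    import matplotlib.pyplot as plt

    fig, (train_ax, test_ax) = plt.subplots(1, 2, figsize=(12, 6))
    train_test_on_dataset(train_dataset, test_dataset,
                          feature_param, model_param,
                          train_ax, test_ax,
                          result_store=None,
                          parallelize=True,
                          output_model_filepath='workspace/model/my_model.pkl')
    plt.show()
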
Example #33
def test_on_dataset(test_dataset,
                    runner_class,
                    ax,
                    result_store,
                    model_filepath,
                    parallelize=True,
                    fifo_mode=True,
                    aggregate_method=np.mean):

    test_assets = read_dataset(test_dataset)

    optional_dict = {'model_filepath': model_filepath}

    # construct a quality runner object, only to assert that the assets are valid
    runner = runner_class(
        test_assets,
        None,
        fifo_mode=fifo_mode,
        delete_workdir=True,
        result_store=result_store,
        optional_dict=optional_dict,
    )
    results = None  # ensure the final return is valid even if the run fails
    try:
        # run
        _, results = run_executors_in_parallel(
            runner_class,
            test_assets,
            fifo_mode=fifo_mode,
            delete_workdir=True,
            parallelize=parallelize,
            result_store=result_store,
            optional_dict=optional_dict,
        )

        for result in results:
            result.set_aggregate_method(aggregate_method)

        # plot
        groundtruths = map(lambda asset: asset.groundtruth, test_assets)
        predictions = map(lambda result: result[runner_class.get_score_key()],
                          results)
        stats = TrainTestModel.get_stats(groundtruths, predictions)

        print 'Stats on testing data: {}'.format(
            TrainTestModel.format_stats(stats))

        if ax is not None:
            content_ids = map(lambda asset: asset.content_id, test_assets)
            TrainTestModel.plot_scatter(ax, stats, content_ids)
            ax.set_xlabel('DMOS')
            ax.set_ylabel("Predicted Score")
            ax.grid()
            ax.set_title("{runner}\n{stats}".format(
                runner=runner_class.TYPE,
                stats=TrainTestModel.format_stats(stats),
            ))

    except Exception as e:
        print "Error: " + str(e)

    return test_assets, results
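
A hypothetical invocation, assuming VmafQualityRunner is importable in this environment and a trained model exists at the placeholder path:

    import matplotlib.pyplot as plt

    fig, ax = plt.subplots(figsize=(6, 6))
    test_assets, results = test_on_dataset(
        test_dataset, VmafQualityRunner, ax,
        result_store=None,
        model_filepath='workspace/model/my_model.pkl',
        parallelize=True)
    plt.show()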