Пример #1
0
def test_cv():
    """Cross validate PLS with 1-3 components on the stratified test data.

    Checks the first row of predictions, the per-model RMSEC values, the
    shape of the results table, and the naming of model / prediction keys.
    """
    spectra = pd.read_csv(get_path('test_data.csv'), header=[0, 1])
    spectra = stratified_folds(spectra, nfolds=3, sortby=('comp', 'SiO2'))

    grid = list(ParameterGrid({'n_components': [1, 2, 3], 'scale': [False]}))

    validator = cv.cv(grid)
    cv_return = validator.do_cv(spectra,
                                xcols='wvl',
                                ycol=[('comp', 'SiO2')],
                                method='PLS',
                                yrange=[0, 100],
                                calc_path=False,
                                alphas=None)
    df_out, output, models, modelkeys, predictkeys = cv_return

    # Reference values for the first spectrum and for each parameter set.
    want_first_row = [
        56.55707481, 57.93716105, 59.34785052, 60.59708391, 55.83934129,
        56.7456989
    ]
    want_rmsec = [18.6509206, 14.64015186, 13.80182457]

    got_first_row = np.asarray(df_out['predict'].iloc[0, :])
    np.testing.assert_array_almost_equal(want_first_row, got_first_row)

    got_rmsec = np.asarray(output[('cv', 'RMSEC')])
    np.testing.assert_array_almost_equal(want_rmsec, got_rmsec)

    assert output.shape == (3, 8)

    # One model (and model key) per parameter set, two prediction columns
    # per parameter set.
    assert len(models) == 3
    assert len(modelkeys) == 3
    first_model_key = 'PLS - SiO2 - (0, 100) {\'n_components\': 1, \'scale\': False}'
    assert modelkeys[0] == first_model_key

    assert len(predictkeys) == 6
    first_predict_key = '"PLS- CV -{\'n_components\': 1, \'scale\': False}"'
    assert predictkeys[0] == first_predict_key
Пример #2
0
def test_cv_calc_path():
    """Cross validate LASSO along a path of ten alphas (calc_path=True).

    Four parameter combinations times ten alphas give forty models; the test
    checks a slice of the predictions and RMSEC values plus the key naming.
    """
    spectra = pd.read_csv(get_path('test_data.csv'), header=[0, 1])
    spectra = stratified_folds(spectra, nfolds=3, sortby=('comp', 'SiO2'))

    lasso_options = {
        'fit_intercept': [True, False],
        'max_iter': [1000],
        'tol': [1e-3],
        'precompute': [True],
        'copy_X': [True],
        'positive': [True, False],
        'selection': ['random'],
        'random_state': [1]
    }
    grid = list(ParameterGrid(lasso_options))
    alphas = np.logspace(np.log10(0.0000001), np.log10(0.01), num=10)

    validator = cv.cv(grid)
    cv_return = validator.do_cv(spectra,
                                xcols='wvl',
                                ycol=[('comp', 'SiO2')],
                                method='LASSO',
                                yrange=[0, 100],
                                calc_path=True,
                                alphas=alphas)
    df_out, output, models, modelkeys, predictkeys = cv_return

    # Reference values for columns / rows 5..14 only.
    want_predicts = [
        57.87064, 57.868983, 57.868983, 57.868983, 57.868983, 59.315111,
        59.315113, 59.315114, 59.315114, 59.315114
    ]
    want_rmsec = [
        18.490365, 18.490365, 18.490365, 18.490365, 18.490365, 7.042796,
        6.986007, 6.967643, 6.959045, 6.953588
    ]

    got_predicts = np.asarray(df_out['predict'].iloc[0, 5:15])
    np.testing.assert_array_almost_equal(want_predicts, got_predicts)

    got_rmsec = np.asarray(output[('cv', 'RMSEC')].iloc[5:15])
    np.testing.assert_array_almost_equal(want_rmsec, got_rmsec)

    assert output.shape == (40, 15)
    assert len(models) == 40
    assert len(modelkeys) == 40
    first_model_key = 'LASSO - SiO2 - (0, 100) Alpha: 0.01, {\'copy_X\': True, \'fit_intercept\': True, \'max_iter\': 1000, \'positive\': True, \'precompute\': True, \'random_state\': 1, \'selection\': \'random\', \'tol\': 0.001}'
    assert modelkeys[0] == first_model_key

    assert len(predictkeys) == 80
    first_predict_key = '"LASSO - SiO2 - CV - Alpha:0.01 - {\'copy_X\': True, \'fit_intercept\': True, \'max_iter\': 1000, \'positive\': True, \'precompute\': True, \'random_state\': 1, \'selection\': \'random\', \'tol\': 0.001}"'
    assert predictkeys[0] == first_predict_key
Пример #3
0
def test_cv_nofolds():
    """do_cv is expected to return 0 when stratified_folds was never applied."""
    # Deliberately skip stratified_folds so the data is passed in as loaded.
    spectra = pd.read_csv(get_path('test_data.csv'), header=[0, 1])

    grid = list(ParameterGrid({'n_components': [1, 2, 3], 'scale': [False]}))
    validator = cv.cv(grid)

    cv_kwargs = dict(xcols='wvl',
                     ycol=[('comp', 'SiO2')],
                     method='PLS',
                     yrange=[0, 100],
                     calc_path=False,
                     alphas=None)
    results = validator.do_cv(spectra, **cv_kwargs)

    print(results)
    assert results == 0
Пример #4
0
def test_cv_local_regression():
    """Cross validate 'Local Regression' for two neighbor counts.

    Uses only the first 20 rows of the test data and checks one row of
    predictions, the RMSEC values, the results shape and the key naming.
    """
    spectra = pd.read_csv(get_path('test_data.csv'), header=[0, 1])
    # Keep the data set small so this test runs faster.
    spectra = spectra.iloc[0:20, :]
    spectra = stratified_folds(spectra, nfolds=3, sortby=('comp', 'SiO2'))

    local_options = {
        'n_neighbors': [5, 6],
        'fit_intercept': [True],
        'positive': [False],
        'random_state': [1],
        'tol': [1e-2]
    }
    grid = list(ParameterGrid(local_options))

    validator = cv.cv(grid)
    cv_return = validator.do_cv(spectra,
                                xcols='wvl',
                                ycol=[('comp', 'SiO2')],
                                method='Local Regression',
                                yrange=[0, 100],
                                calc_path=False,
                                alphas=None)
    df_out, output, models, modelkeys, predictkeys = cv_return

    want_row_5 = [51.30212, 54.25293063, 48.54834655, 54.18676067]
    want_rmsec = [10.32151211, 10.89018268]

    got_row_5 = np.asarray(df_out['predict'].iloc[5, :])
    np.testing.assert_array_almost_equal(want_row_5, got_row_5)

    got_rmsec = np.asarray(output[('cv', 'RMSEC')])
    np.testing.assert_array_almost_equal(want_rmsec, got_rmsec)

    assert output.shape == (2, 11)
    assert len(models) == 2
    assert len(modelkeys) == 2
    first_model_key = 'Local Regression - SiO2 - (0, 100) {\'fit_intercept\': True, \'positive\': False, \'random_state\': 1, \'tol\': 0.01} n_neighbors: 5'
    assert modelkeys[0] == first_model_key

    assert len(predictkeys) == 4
    first_predict_key = '"Local Regression- CV -{\'fit_intercept\': True, \'positive\': False, \'random_state\': 1, \'tol\': 0.01} n_neighbors: 5"'
    assert predictkeys[0] == first_predict_key
    def run(self):
        """Cross validate the selected algorithm and register the results.

        Reads the algorithm, data set, x/y variables and y range from the
        widgets, keeps only the rows whose y value lies strictly inside the y
        range, and runs ``cv.cv(paramgrid).do_cv`` with a leave-one-group-out
        iterator built from the ``('meta', 'Folds')`` column.

        On success the data frame returned by ``do_cv`` replaces
        ``self.data[datakey].df``, the CV results are stored in
        ``self.cv_results`` and under a unique 'CV Results ...' data key, and
        the models, their keys and their coefficients are registered on
        ``self``.

        If the fold iterator cannot be built, a message is printed and the
        method returns without running cross validation.
        """
        method = self.chooseAlgorithmComboBox.currentText()
        datakey = self.chooseDataComboBox.currentText()
        xvars = [str(x.text()) for x in self.xVariableList.selectedItems()]
        yvars = [('comp', str(y.text()))
                 for y in self.yVariableList.selectedItems()]
        yrange = [
            self.yMinDoubleSpinBox.value(),
            self.yMaxDoubleSpinBox.value()
        ]
        # Warning: Params passing through cv.cv(params) needs to be in lists
        # Example: {'n_components': [4], 'scale': [False]}
        params, modelkey = self.alg[method].run()

        # If the method supports it, separate out alpha from the other
        # parameters and prepare for calculating the regularization path.
        path_methods = ['Elastic Net', 'LASSO']  #, 'Ridge']
        calc_path = method in path_methods
        alphas = params.pop('alpha') if calc_path else None

        # Keep only rows with y strictly inside (ymin, ymax).
        y = np.array(self.data[datakey].df[yvars])
        match = np.squeeze((y > yrange[0]) & (y < yrange[1]))
        # .loc replaces the removed DataFrame.ix indexer for boolean masks.
        data_for_cv = spectral_data(self.data[datakey].df.loc[match])

        # Create a grid of parameter permutations.
        paramgrid = list(ParameterGrid(params))
        cv_obj = cv.cv(paramgrid)

        # Create an iterator for cross validation based on the predefined
        # folds.
        try:
            folds = data_for_cv.df[('meta', 'Folds')]
            splitter = LeaveOneGroupOut()
            cv_iterator = splitter.split(data_for_cv.df[xvars],
                                         data_for_cv.df[yvars], folds)
            n_folds = splitter.get_n_splits(groups=folds)
        except (KeyError, ValueError):
            print(
                '***No folds found! Did you remember to define folds before running cross validation?***'
            )
            # Without folds there is nothing to iterate over; continuing
            # would only fail on the undefined iterator below.
            return

        df_out, cv_results, cvmodels, cvmodelkeys, cvpredictkeys = cv_obj.do_cv(
            data_for_cv.df,
            cv_iterator,
            xcols=xvars,
            ycol=yvars,
            yrange=yrange,
            method=method,
            alphas=alphas,
            calc_path=calc_path,
            n_folds=n_folds)
        self.data[datakey].df = df_out
        self.cv_results = cv_results

        for predictkey in cvpredictkeys:
            self.list_amend(self.predictkeys, len(self.predictkeys),
                            predictkey)

        for n, key in enumerate(cvmodelkeys):
            self.list_amend(self.modelkeys, len(self.modelkeys), key)
            # NOTE(review): list_amend followed by append looks like it may
            # register the key twice - confirm against list_amend.
            self.modelkeys.append(key)
            self.models[key] = cvmodels[n]
            self.model_xvars[key] = xvars
            self.model_yvars[key] = yvars
            if method != 'GP':
                # Build a one-row frame of coefficients labelled by x column.
                coef = np.squeeze(cvmodels[n].model.coef_)
                coef = pd.DataFrame(coef)
                coef.index = pd.MultiIndex.from_tuples(
                    self.data[datakey].df[xvars].columns.values)
                coef = coef.T
                coef[('meta', 'Model')] = key
                try:
                    coef[('meta', 'Intercept')] = cvmodels[n].model.intercept_
                except Exception:
                    # Best effort: the intercept column is simply omitted.
                    pass
                try:
                    self.data['Model Coefficients'] = spectral_data(
                        pd.concat([self.data['Model Coefficients'].df, coef]))
                except Exception:
                    # No usable coefficient table yet: start a new one.
                    self.data['Model Coefficients'] = spectral_data(coef)
                    self.datakeys.append('Model Coefficients')

        # Pick a data key that is not in use yet by appending a counter.
        base_id = str('CV Results ' + modelkey + ' - ' + yvars[0][1])
        cvid = base_id
        number = 1
        while cvid in self.datakeys:
            number += 1
            cvid = base_id + ' - ' + str(number)

        self.datakeys.append(cvid)
        self.data[cvid] = self.cv_results
    def setup(self):
        """Register placeholder keys and dummy values ahead of a CV run.

        Builds the prediction and model key names from the current widget
        settings, adds a dummy 'predict' column (9999) per prediction key to
        the selected data frame, appends dummy coefficient rows to
        'Model Coefficients', and registers a 'CV Results ...' data key.

        Any error raised while doing so is swallowed, leaving whatever was
        registered up to that point.
        """
        try:
            method = self.chooseAlgorithmComboBox.currentText()
            datakey = self.chooseDataComboBox.currentText()
            xvars = [str(x.text()) for x in self.xVariableList.selectedItems()]
            yvars = [('comp', str(y.text()))
                     for y in self.yVariableList.selectedItems()]
            yrange = [
                self.yMinDoubleSpinBox.value(),
                self.yMaxDoubleSpinBox.value()
            ]
            # Warning: Params passing through cv.cv(params) needs to be in lists
            # Example: {'n_components': [4], 'scale': [False]}
            params, modelkey = self.alg[method].run()

            # If the method supports it, separate out alpha from the other
            # parameters.
            path_methods = ['Elastic Net', 'LASSO']  #, 'Ridge']
            if method in path_methods:
                alphas = params.pop('alpha')
            else:
                alphas = None

            # Create a grid of parameter permutations.
            paramgrid = list(ParameterGrid(params))
            cv_obj = cv.cv(paramgrid)  # result unused here; kept as in the original flow

            yname = yvars[0][-1]
            cvpredictkeys = []
            cvmodelkeys = []
            for grid_params in paramgrid:
                if alphas is not None:
                    for alpha in alphas:
                        # One cross-validation and one calibration column
                        # per alpha / parameter combination.
                        for stage in ('CV', 'Cal'):
                            cvpredictkeys.append(
                                '"' + method + ' - ' + yname + ' - ' + stage +
                                ' - Alpha:' + str(alpha) + ' - ' +
                                str(grid_params) + '"')
                        cvmodelkeys.append(
                            "{} - {} - ({}, {}) Alpha: {}, {}".format(
                                method, yname, yrange[0], yrange[1], alpha,
                                grid_params))
                else:
                    # The first key must be the '- CV -' one; emitting
                    # '- Cal -' twice produced a duplicate placeholder and
                    # no cross-validation column.
                    for stage in ('CV', 'Cal'):
                        cvpredictkeys.append('"' + method + '- ' + stage +
                                             ' -' + str(grid_params) + '"')
                    cvmodelkeys.append("{} - {} - ({}, {}) {}".format(
                        method, yname, yrange[0], yrange[1], grid_params))

            for key in cvpredictkeys:
                self.list_amend(self.predictkeys, len(self.predictkeys), key)
                # Need to fill the data frame with dummy values until CV is
                # actually run.
                self.data[datakey].df[('predict', key)] = 9999

            for key in cvmodelkeys:
                self.list_amend(self.modelkeys, len(self.modelkeys), key)
                # NOTE(review): list_amend followed by append looks like it
                # may register the key twice - confirm against list_amend.
                self.modelkeys.append(key)
                self.model_xvars[key] = xvars
                self.model_yvars[key] = yvars
                if method != 'GP':
                    # Fill with dummy coeffs before the model is run.
                    coef = self.data[datakey].df[
                        xvars[0]].columns.values * 0.0 + 9999
                    coef = pd.DataFrame(coef)
                    coef.index = pd.MultiIndex.from_tuples(
                        self.data[datakey].df[xvars].columns.values)
                    coef = coef.T
                    coef[('meta', 'Model')] = key
                    try:
                        # Fill intercept with zeros prior to model run.
                        coef[('meta', 'Intercept')] = 0
                    except Exception:
                        pass
                    try:
                        self.data['Model Coefficients'] = spectral_data(
                            pd.concat(
                                [self.data['Model Coefficients'].df, coef]))
                    except Exception:
                        # No usable coefficient table yet: start a new one.
                        self.data['Model Coefficients'] = spectral_data(coef)
                        self.datakeys.append('Model Coefficients')

            self.list_amend(self.datakeys, len(self.datakeys),
                            'CV Results ' + modelkey)
        except Exception:
            # Best effort, as in the original: setup failures are ignored.
            pass
Пример #7
0
    def run(self):
        """Cross validate every checked regression method on the selected data.

        For each checked method a parameter grid is built from its widget and
        passed to ``cv.cv(...).do_cv``.  The data frame returned by one
        method is fed to the next, so predictions accumulate.  Results are
        concatenated into ``self.cv_results_combined`` and stored under a
        unique 'CV Results - <y>' data key; the final data frame with all
        predictions is stored under '<datakey>-...-CV Predictions'.

        If no method is checked, a message is printed and the method returns
        before any cross validation is attempted.
        """
        self.cv_results_combined = None  # clear previous results in case of re-run

        if 'Model Coefficients' not in self.datakeys:
            Modules.data_count += 1
            self.coef_index = Modules.data_count
            self.list_amend(self.datakeys, self.coef_index, 'Model Coefficients')

        Modules.data_count += 1
        self.results_index = Modules.data_count

        # (checkbox, method name, alpha returned separately from the other
        # parameters).  Order matters: it is the order methods are run in.
        # GP - Gaussian Processes is intentionally not offered here.
        choices = [
            (self.ARDcheckbox, 'ARD', False),
            (self.BRRcheckbox, 'BRR', False),
            (self.ENetcheckbox, 'Elastic Net', True),
            (self.LARScheckbox, 'LARS', False),
            (self.LASSOcheckBox, 'LASSO', True),
            (self.OLScheckBox, 'OLS', False),
            (self.OMPcheckBox, 'OMP', False),
            (self.PLScheckBox, 'PLS', False),
            (self.RidgecheckBox, 'Ridge', False),
            (self.SVRcheckBox, 'SVR', False),
            (self.LocalcheckBox, 'Local Regression', False),
            (self.GBRcheckBox, 'GBR', False),
            (self.RFcheckBox, 'RF', False),
        ]

        paramgrids = {}
        for checkbox, name, alpha_separate in choices:
            if not checkbox.isChecked():
                continue
            settings = self.alg[name][0].run()
            if alpha_separate:
                # These widgets return (params, alphas); fold alpha back
                # into the grid as an ordinary parameter.
                params = settings[0]
                params['alpha'] = settings[1]
            else:
                params = settings
            paramgrids[name] = list(ParameterGrid(params))

        if not paramgrids:
            # Nothing to cross validate; the code below would otherwise fail
            # on results that were never produced.
            print('***No regression methods selected for cross validation!***')
            return

        datakey = self.chooseDataComboBox.currentText()
        xvars = [str(x.text()) for x in self.xVariableList.selectedItems()]
        yvars = [('comp', str(y.text())) for y in self.yVariableList.selectedItems()]
        yrange = [self.yMinDoubleSpinBox.value(), self.yMaxDoubleSpinBox.value()]

        # Keep only rows with y strictly inside (ymin, ymax).
        y = np.array(self.data[datakey].df[yvars])
        match = np.squeeze((y > yrange[0]) & (y < yrange[1]))
        data_for_cv = spectral_data(self.data[datakey].df.loc[match])

        for method, paramgrid in paramgrids.items():
            print('===== Cross validating '+method+' =====')

            cv_obj = cv.cv(paramgrid)

            data_for_cv_out, cv_results, cvmodels, cvmodelkeys, cvpredictkeys = cv_obj.do_cv(
                data_for_cv.df, xcols=xvars, ycol=yvars, yrange=yrange,
                method=method)

            try:
                cv_results[('cv','Data_file')] = self.datafiles[datakey]
            except Exception:
                # Best effort: the data file column is simply omitted.
                pass
            cv_results[('cv','ymin')] = yrange[0]
            cv_results[('cv','ymax')] = yrange[1]
            cv_results[('cv','ycol')] = yvars[0][1]

            # Feed this method's output into the next method's run.
            data_for_cv = spectral_data(data_for_cv_out)

            # pd.concat drops the initial None placeholder.
            self.cv_results_combined = pd.concat((self.cv_results_combined,cv_results))

            for predictkey in cvpredictkeys:
                self.list_amend(self.predictkeys, len(self.predictkeys), predictkey)

            for n, modelkey in enumerate(cvmodelkeys):
                Modules.model_count += 1
                self.list_amend(self.modelkeys, Modules.model_count, modelkey)
                self.models[modelkey] = cvmodels[n]
                self.model_xvars[modelkey] = xvars
                self.model_yvars[modelkey] = yvars
                if method != 'GP':
                    # Best effort: any failure while building the coefficient
                    # row (for example a model without coef_) skips the row.
                    try:
                        coef = np.squeeze(cvmodels[n].model.coef_)
                        coef = pd.DataFrame(coef)
                        coef.index = pd.MultiIndex.from_tuples(self.data[datakey].df[xvars].columns.values)
                        coef = coef.T
                        coef[('meta', 'Model')] = modelkey
                        try:
                            coef[('meta', 'Intercept')] = cvmodels[n].model.intercept_
                        except Exception:
                            pass
                        try:
                            self.data['Model Coefficients'] = spectral_data(
                                pd.concat([self.data['Model Coefficients'].df, coef]))
                        except Exception:
                            # No usable coefficient table yet: start a new one.
                            self.data['Model Coefficients'] = spectral_data(coef)
                    except Exception:
                        pass

        # Pick a data key that is not in use yet by appending a counter.
        base_id = str('CV Results - ' + yvars[0][1])
        cvid = base_id
        number = 1
        while cvid in self.datakeys:
            number += 1
            cvid = base_id + ' - ' + str(number)

        self.list_amend(self.datakeys,self.results_index,cvid)
        self.data[cvid] = spectral_data(self.cv_results_combined)

        Modules.data_count += 1
        new_datakey = datakey + '-' +str(yvars)+' '+ str(yrange)+'-CV Predictions'
        self.list_amend(self.datakeys, Modules.data_count, new_datakey)
        self.data[new_datakey] = spectral_data(data_for_cv_out)
Пример #8
0
    def run(self):
        """Cross validate every checked regression method on the selected data.

        Builds a parameter grid per checked method (Elastic Net and LASSO
        keep their alphas separate so a regularization path is calculated),
        runs ``cv.cv(...).do_cv`` for each, and registers the resulting
        models, keys and coefficients on ``self``.  The combined results are
        stored under a unique 'CV Results - <y>' data key.

        ``self.data[datakey].df`` is overwritten with the frame returned by
        ``do_cv`` on every iteration.
        """
        # (checkbox, method name, alphas kept separate for path calculation).
        # Order matters: it is the order methods are run in.
        # GP - Gaussian Processes is intentionally not offered here.
        choices = [
            (self.ARDcheckbox, 'ARD', False),
            (self.BRRcheckbox, 'BRR', False),
            (self.ENetcheckbox, 'Elastic Net', True),
            (self.LARScheckbox, 'LARS', False),
            (self.LASSOcheckBox, 'LASSO', True),
            (self.OLScheckBox, 'OLS', False),
            (self.OMPcheckBox, 'OMP', False),
            (self.PLScheckBox, 'PLS', False),
            (self.RidgecheckBox, 'Ridge', False),
            (self.SVRcheckBox, 'SVR', False),
            (self.LocalcheckBox, 'Local Regression', False),
        ]

        paramgrids = {}
        for checkbox, name, has_path in choices:
            if not checkbox.isChecked():
                continue
            settings = self.alg[name][0].run()
            if has_path:
                # These widgets return (params, alphas).
                paramgrids[name] = {'alphas': settings[1],
                                    'params': list(ParameterGrid(settings[0]))}
            else:
                paramgrids[name] = list(ParameterGrid(settings))

        datakey = self.chooseDataComboBox.currentText()
        xvars = [str(x.text()) for x in self.xVariableList.selectedItems()]
        yvars = [('comp', str(y.text())) for y in self.yVariableList.selectedItems()]
        yrange = [self.yMinDoubleSpinBox.value(), self.yMaxDoubleSpinBox.value()]

        # Keep only rows with y strictly inside (ymin, ymax).
        y = np.array(self.data[datakey].df[yvars])
        match = np.squeeze((y > yrange[0]) & (y < yrange[1]))
        # .loc replaces the removed DataFrame.ix indexer for boolean masks.
        data_for_cv = spectral_data(self.data[datakey].df.loc[match])

        # Methods for which alpha is separated out and a path is calculated.
        path_methods = ['Elastic Net', 'LASSO']  #, 'Ridge']

        for method, grid_entry in paramgrids.items():
            print('===== Cross validating '+method+' =====')
            if method in path_methods:
                calc_path = True
                alphas = grid_entry['alphas']
                paramgrid = grid_entry['params']
            else:
                alphas = None
                calc_path = False
                paramgrid = grid_entry
            progbar = QtWidgets.QProgressBar()
            cv_obj = cv.cv(paramgrid, progressbar=progbar)

            # NOTE(review): data_for_cv is not refreshed between methods, so
            # each iteration starts from the same input frame - confirm that
            # predictions from earlier methods are not lost here.
            self.data[datakey].df, cv_results, cvmodels, cvmodelkeys, cvpredictkeys = cv_obj.do_cv(
                data_for_cv.df, xcols=xvars, ycol=yvars, yrange=yrange,
                method=method, alphas=alphas, calc_path=calc_path)
            try:
                self.cv_results_combined = pd.concat((self.cv_results_combined,cv_results))
            except Exception:
                # No usable combined results yet: start from this method's.
                self.cv_results_combined = cv_results

            for predictkey in cvpredictkeys:
                self.list_amend(self.predictkeys, len(self.predictkeys), predictkey)

            for n, modelkey in enumerate(cvmodelkeys):
                self.list_amend(self.modelkeys, len(self.modelkeys), modelkey)
                self.models[modelkey] = cvmodels[n]
                self.model_xvars[modelkey] = xvars
                self.model_yvars[modelkey] = yvars
                if method != 'GP':
                    try:
                        coef = np.squeeze(cvmodels[n].model.coef_)
                    except AttributeError:
                        # Model exposes no coef_ attribute, so there is no
                        # coefficient row to record for it.
                        continue
                    coef = pd.DataFrame(coef)
                    coef.index = pd.MultiIndex.from_tuples(self.data[datakey].df[xvars].columns.values)
                    coef = coef.T
                    coef[('meta', 'Model')] = modelkey
                    try:
                        coef[('meta', 'Intercept')] = cvmodels[n].model.intercept_
                    except Exception:
                        # Best effort: the intercept column is simply omitted.
                        pass
                    try:
                        self.data['Model Coefficients'] = spectral_data(
                            pd.concat([self.data['Model Coefficients'].df, coef]))
                    except Exception:
                        # No usable coefficient table yet: start a new one.
                        self.data['Model Coefficients'] = spectral_data(coef)
                        self.datakeys.append('Model Coefficients')

        # Pick a data key that is not in use yet by appending a counter.
        base_id = str('CV Results - ' + yvars[0][1])
        cvid = base_id
        number = 1
        while cvid in self.datakeys:
            number += 1
            cvid = base_id + ' - ' + str(number)

        self.datakeys.append(cvid)
        self.data[cvid] = spectral_data(self.cv_results_combined)