def make_latex_table(config, data, qoi_type=None, param_type=None):
    """Build a dataframe of normalized errors for LaTeX table output.

    Args:
        config: path to a configuration file or a PyposmatConfigurationFile.
        data: path to a data file or a PyposmatDataFile.
        qoi_type: normalization scheme selector; only 'by_qoi_target' is
            handled in the visible code.
        param_type: unused in the visible code (see NOTE below).

    NOTE(review): this block appears truncated -- `df` is computed but never
    returned, and only the 'by_qoi_target' branch is visible.
    """
    # NOTE(review): `qoi_types` is assigned but never used in the visible code.
    qoi_types = ['by_qoi_target']
    # NOTE(review): this clobbers the `param_type` parameter with an empty
    # list -- confirm whether the parameter was meant to be honored.
    param_type = []

    assert isinstance(config,str) \
           or isinstance(config,PyposmatConfigurationFile)
    assert isinstance(data,str) \
            or isinstance(data,PyposmatDataFile)

    # Accept either a path (read the file) or an already-loaded configuration.
    if isinstance(config, str):
        o_config = PyposmatConfigurationFile()
        o_config.read(filename=config)
    elif isinstance(config, PyposmatConfigurationFile):
        o_config = config
    else:
        raise TypeError()

    # Accept either a path (read the file) or an already-loaded data file.
    if isinstance(data, str):
        o_data = PyposmatDataFile()
        o_data.read(filename=data)
    elif isinstance(data, PyposmatDataFile):
        o_data = data
    else:
        raise TypeError()

    if qoi_type == 'by_qoi_target':
        # Normalize each error column by its QOI target value.
        o_data.create_normalized_errors(normalize_type='by_qoi_target',
                                        qoi_targets=o_config.qoi_targets)
        df = o_data.df[o_data.normalized_error_names]
示例#2
0
def gmm_analysis(config_fn,
                 data_fn,
                 names,
                 output_directory='gmm_analysis',
                 max_components=20):
    """Fit Gaussian mixture models with 1..max_components-1 components and
    select the best model by AIC and BIC, writing diagnostic plots.

    Args:
        config_fn (str): path to a pyposmat configuration file.
        data_fn (str): path to a pyposmat data file.
        names (list): column names of the data file to cluster on.
        output_directory (str): directory for generated plots; created if
            it does not exist.
        max_components (int): exclusive upper bound on the number of
            mixture components to try.
    """
    assert isinstance(config_fn, str)
    assert isinstance(data_fn, str)
    assert os.path.isfile(config_fn)
    assert os.path.isfile(data_fn)

    if not os.path.isdir(output_directory):
        os.mkdir(output_directory)

    o_config = PyposmatConfigurationFile()
    o_config.read(filename=config_fn)

    o_data = PyposmatDataFile()
    o_data.read(filename=data_fn)
    o_data.create_normalized_errors(normalize_type='by_qoi_target',
                                    qoi_targets=o_config.qoi_targets)
    # Aggregate score: sum of absolute normalized errors per candidate.
    o_data.df['score'] = o_data.df[o_config.normalized_error_names].abs().sum(
        axis=1)

    data = o_data.df[names]

    n_components = np.arange(1, max_components)
    models = [
        GaussianMixture(n_components=n, covariance_type='full',
                        random_state=0).fit(data) for n in n_components
    ]

    # Compute each information criterion once, then locate its minimum
    # (previously both lists were computed twice).
    aic_criteria = [m.aic(data) for m in models]
    aic_idx = int(np.argmin(aic_criteria))
    aic_n_components = int(n_components[aic_idx])

    bic_criteria = [m.bic(data) for m in models]
    bic_idx = int(np.argmin(bic_criteria))
    bic_n_components = int(n_components[bic_idx])

    # plot the criteria
    print('bic_n_components:{}'.format(bic_n_components))
    print('aic_n_components:{}'.format(aic_n_components))
    plot_fn = os.path.join(output_directory, 'aic_bic_plot.jpg')
    plot_gmm_aic_bic(filename=plot_fn,
                     n_components=n_components,
                     aic_criteria=aic_criteria,
                     bic_criteria=bic_criteria,
                     aic_n_components=aic_n_components,
                     bic_n_components=bic_n_components)

    # BUGFIX: write into output_directory (was hard-coded 'gmm_analysis'),
    # and index by list position -- models[k] has k+1 components, so the
    # original models[bic_n_components] selected one component too many.
    filename = os.path.join(output_directory, 'gmm_analysis.jpg')
    plot_gmm(models[bic_idx], data, filename=filename)
示例#3
0
class BaseAnalysis(object):
    """Common setup for analysis classes: load a configuration, load a data
    file (adding normalized error columns), and prepare an output directory.
    """

    def __init__(self, configuration, data, output_path=None):
        # Attributes are populated by the _initialize_* helpers below.
        self.configuration = None
        self.data = None
        self.output_path = None

        self._initialize_configuration(configuration=configuration)
        self._initialize_data(data=data)
        self._initialize_output_path(path=output_path)

    def _initialize_configuration(self, configuration):
        """Accept a path (read from disk) or an existing configuration object."""
        if isinstance(configuration, PyposmatConfigurationFile):
            self.configuration = configuration
            return
        if isinstance(configuration, str):
            assert os.path.isfile(configuration)
            obj = PyposmatConfigurationFile()
            obj.read(filename=configuration)
            self.configuration = obj
            return
        raise TypeError(f'configuration cannot be type:{type(configuration)}')

    def _initialize_data(self, data):
        """Accept a path (read from disk) or an existing PyposmatDataFile.

        A provided object is deep-copied so the normalization below does not
        mutate the caller's data; the normalized error columns are then added.
        """
        if isinstance(data, PyposmatDataFile):
            self.data = deepcopy(data)
        elif isinstance(data, str):
            assert os.path.isfile(data)
            self.data = PyposmatDataFile()
            self.data.read(filename=data)
        else:
            raise TypeError(f'data cannot be type:{type(data)}')

        self.data.create_normalized_errors(
            normalize_type='by_qoi_target',
            qoi_targets=self.configuration.qoi_targets)

    def _initialize_output_path(self, path):
        """Create a fresh output directory, replacing any existing one."""
        if path is None:
            self.output_path = None
            return
        if not isinstance(path, str):
            raise TypeError
        if os.path.isdir(path):
            # Start from a clean directory on every run.
            shutil.rmtree(path)
        os.mkdir(path)
        self.output_path = path
示例#4
0
    'pyposmat.kde.19.out')

if __name__ == "__main__":
    # initialization
    o_plot = PyposmatParallelCoordinatesPlot()

    # add data to plot
    # NOTE(review): parallel_plot_config is expected to be defined earlier in
    # this file; each non-'args' value maps a label to config/data paths plus
    # styling -- confirm against the definition.
    for k, v in parallel_plot_config.items():
        print(k, v)
        if k == 'args':
            # 'args' is not a data series; skip it.
            pass
        else:
            o_config = PyposmatConfigurationFile()
            o_config.read(filename=v['config_fn'])

            o_data = PyposmatDataFile()
            o_data.read(filename=v['data_fn'])
            # NOTE(review): unlike other call sites in this file, no
            # normalize_type is passed -- confirm the default is
            # 'by_qoi_target'.
            o_data.create_normalized_errors(qoi_targets=o_config.qoi_targets)

            # Deep-copy the dataframe so later reads do not mutate the series.
            o_plot.add_dataframe(color=v['color'],
                                 label=v['label'],
                                 obj=copy.deepcopy(o_data.df),
                                 names=o_data.normalized_names)

    # NOTE(review): xlabels uses normalized_error_names while the series were
    # added with normalized_names -- verify these two name lists agree.
    o_plot.make_plot(filename="parallel_plot.png",
                     xlabels=o_data.normalized_error_names,
                     ylabel="% error",
                     title="Si sw",
                     ylim=(-175, 25),
                     legend_loc="lower right")
示例#5
0
                             'pareto_optimization_unconstrained',
                             'pyposmat.config.in')
    data_fn = os.path.join(pypospack_root_dir, 'data', 'Si__sw__data',
                           'pareto_optimization_unconstrained',
                           'pyposmat.kde.20.out')
    ref_config_fn = os.path.join(pypospack_root_dir, 'data', 'Si__sw__data',
                                 'reference_potentials', 'pyposmat.config.in')
    ref_data_fn = os.path.join(pypospack_root_dir, 'data', 'Si__sw__data',
                               'reference_potentials', 'pyposmat.kde.1.out')

    o_config = PyposmatConfigurationFile()
    o_config.read(filename=config_fn)

    o_data = PyposmatDataFile()
    o_data.read(filename=data_fn)
    o_data.create_normalized_errors(normalize_type='by_qoi_target',
                                    qoi_targets=o_config.qoi_targets)
    print(o_config.normalized_error_names)
    print(o_data.df.columns)
    o_data.df['score'] = o_data.df[o_config.normalized_error_names].abs().sum(
        axis=1)

    # do AIC and BIC analysis
    if True:
        name_1 = o_config.qoi_names[0]
        name_2 = o_config.qoi_names[1]
        data = o_data.df[[name_1, name_2]]
        max_components = 21
        gmm_analysis(config_fn=config_fn,
                     data_fn=data_fn,
                     names=[name_1, name_2],
                     output_directory='gmm_analysis',
示例#6
0
class GmmAnalysis(object):
    """Gaussian-mixture-model (GMM) analysis of a pyposmat data set.

    Fits GMMs over a range of component counts and provides AIC/BIC model
    selection, cluster assignment, and static plotting helpers.
    """

    def __init__(self,
                 configuration,
                 data,
                 names=None,
                 output_path='gmm_analysis',
                 max_components=20):
        """
        Args:
            configuration: path to a configuration file or an existing
                PyposmatConfigurationFile instance.
            data: path to a data file or an existing PyposmatDataFile
                instance (deep-copied, then normalized errors are added).
            names (list): column names of self.data.df to analyze.
                NOTE(review): the default of None always raises TypeError in
                _initialize_names -- confirm the intended default.
            output_path (str): directory name for generated artifacts.
            max_components (int): exclusive upper bound on the number of
                mixture components tried by make_gmm_models().
        """
        self._initialize_configuration(configuration=configuration)
        self._initialize_data(data=data)
        self._initialize_names(names=names)
        # BUGFIX: removed a redundant `self.names = deepcopy(names)` that
        # immediately overwrote the value just set by _initialize_names().
        self.output_path = output_path

        assert isinstance(max_components, int)
        self.max_components = max_components

        self.models = None          # dict: n_components -> {'obj': model, ...}
        self.aic_criteria = None    # dict: 'min_components'/'min_value'
        self.bic_criteria = None    # dict: 'min_components'/'min_value'
        self.cluster_ids = None     # sorted list of predicted cluster ids

    def _initialize_configuration(self, configuration):
        """Accept a path (read it) or a PyposmatConfigurationFile."""
        if isinstance(configuration, str):
            assert os.path.isfile(configuration)
            self.configuration = PyposmatConfigurationFile()
            self.configuration.read(filename=configuration)
        elif isinstance(configuration, PyposmatConfigurationFile):
            self.configuration = configuration
        else:
            raise TypeError('configuration cannot be type:{}'.format(str(type(configuration))))

    def _initialize_data(self, data):
        """Accept a path (read it) or a PyposmatDataFile (deep-copied), then
        add the normalized error columns."""
        if isinstance(data, str):
            assert os.path.isfile(data)
            self.data = PyposmatDataFile()
            self.data.read(filename=data)
        elif isinstance(data, PyposmatDataFile):
            self.data = deepcopy(data)
        else:
            raise TypeError('data cannot be type:{}'.format(str(type(data))))

        self.data.create_normalized_errors(
                normalize_type='by_qoi_target',
                qoi_targets=self.configuration.qoi_targets)

    def _initialize_names(self, names):
        """Store a copy of the analysis column names; list input only."""
        if isinstance(names, list):
            self.names = list(names)
        else:
            raise TypeError

    def make_gmm_models(self, max_components=None):
        """Fit one GaussianMixture per component count in
        [1, self.max_components) and store them in self.models.

        Args:
            max_components (int, optional): overrides and replaces
                self.max_components when provided.
        """
        data_ = self.data.df[self.names]

        if max_components is not None:
            assert isinstance(max_components, int)
            self.max_components = max_components

        self.models = {}
        for n in range(1, self.max_components):
            self.models[n] = {
                'obj': GaussianMixture(n_components=n,
                                       covariance_type='full',
                                       random_state=0).fit(data_)
            }

    def do_aic_analysis(self):
        """Compute the AIC for each fitted model and record the minimizing
        component count in self.aic_criteria."""
        data_ = self.data.df[self.names]

        for k in self.models:
            self.models[k]['aic'] = self.models[k]['obj'].aic(data_)
        aic, aic_n_components = min(
            (v['aic'], k) for k, v in self.models.items())

        self.aic_criteria = {
            'min_components': int(aic_n_components),
            'min_value': float(aic)
        }

    def do_bic_analysis(self):
        """Compute the BIC for each fitted model and record the minimizing
        component count in self.bic_criteria."""
        data_ = self.data.df[self.names]

        for k in self.models:
            self.models[k]['bic'] = self.models[k]['obj'].bic(data_)
        bic, bic_n_components = min(
            (v['bic'], k) for k, v in self.models.items())

        self.bic_criteria = {
            'min_components': int(bic_n_components),
            'min_value': float(bic)
        }

    def do_cluster_analysis(self, n_components):
        """Fit a GMM with n_components on self.names columns, assign a
        cluster_id column on self.data.df, and count members per cluster."""
        data_ = self.data.df[self.names]
        gmm = GaussianMixture(
                n_components=n_components,
                covariance_type='full',
                random_state=0).fit(data_)
        self.data.df['cluster_id'] = gmm.predict(data_)
        self.cluster_ids = sorted(set(self.data.df['cluster_id']))

        self.clusters = {}
        for cluster_id in self.cluster_ids:
            self.clusters[cluster_id] = {
                'cluster_id': cluster_id,
                'N': self.data.df.loc[self.data.df['cluster_id'] == cluster_id].shape[0]
            }

    @staticmethod
    def plot_ellipse(position, covariance, ax=None, **kwargs):
        """Draw 1-, 2-, and 3-sigma ellipses for a 2D Gaussian component."""
        # BUGFIX: the module is matplotlib.patches (was matplotlib.matches,
        # which raised ImportError).
        from matplotlib.patches import Ellipse

        if ax is None:
            fig, ax = plt.subplots(1, 1)

        if covariance.shape == (2, 2):
            # Full covariance: principal axes and rotation from the SVD.
            U, s, Vt = np.linalg.svd(covariance)
            angle = np.degrees(np.arctan2(U[1, 0], U[0, 0]))
            width, height = 2 * np.sqrt(s)
        else:
            # Diagonal/spherical covariance: axis-aligned ellipse.
            angle = 0
            width, height = 2 * np.sqrt(covariance)

        # draw ellipse
        for nsig in range(1, 4):
            ax.add_patch(
                Ellipse(position, nsig * width, nsig * height, angle,
                        **kwargs))

    @staticmethod
    def plot(gmm_obj, X, labels=None, ax=None, dpi=1200, filename=None,
             xlims=None, ylims=None):
        """Scatter-plot X colored by GMM cluster, overlaying component
        ellipses.  Shows the figure, or saves it when filename is given."""
        # BUGFIX: was `gmmi_obj`, an undefined name.
        cluster_id = gmm_obj.fit(X).predict(X)
        plt.close('all')

        if ax is None:
            fig, ax = plt.subplots(1, 1)

        if isinstance(X, np.ndarray):
            x = X[:, 0]
            y = X[:, 1]
        elif isinstance(X, pd.DataFrame):
            x = X[X.columns[0]]
            y = X[X.columns[1]]

        ax.scatter(x, y, c=cluster_id, s=1, cmap='viridis', zorder=2,
                   label=[k + 1 for k in cluster_id])

        # Ellipse opacity scales with component weight.
        # BUGFIX: was `gmm` and an unqualified `plot_ellipse`, both undefined
        # names inside this staticmethod; also removed leftover debug prints.
        w_factor = 0.2 / gmm_obj.weights_.max()
        for pos, covar, w in zip(gmm_obj.means_, gmm_obj.covariances_,
                                 gmm_obj.weights_):
            GmmAnalysis.plot_ellipse(pos, covar, alpha=w * w_factor, ax=ax)

        if labels is None:
            ax.set_xlabel(X.columns[0])
            ax.set_ylabel(X.columns[1])
        else:
            ax.set_xlabel(labels[0])
            ax.set_ylabel(labels[1])

        if xlims is not None:
            ax.set_xlim(xlims)
        if ylims is not None:
            ax.set_ylim(ylims)

        ax.legend()
        #ax.set(adjustable='box', aspect='equal')
        # NOTE(review): `fig` is only bound when ax was None; saving with a
        # caller-supplied ax would raise NameError -- confirm intended usage.
        if filename is None:
            plt.show()
        else:
            fig.set_size_inches(5, 5)
            fig.tight_layout()
            fig.savefig(filename, dpi=dpi)
示例#7
0
class PcaAnalysis(object):
    """Principal-component analysis (PCA) of a pyposmat data set, with
    optional clustering (DBSCAN/OPTICS in PCA space, or a GMM on the raw
    analysis columns)."""

    def __init__(self,
                 configuration,
                 data,
                 n_components=2,
                 names=None,
                 output_path='pca_analysis'):
        """
        Args:
            configuration: path to a configuration file or an existing
                PyposmatConfigurationFile instance.
            data: path to a data file or an existing PyposmatDataFile
                instance (deep-copied, then normalized errors are added).
            n_components (int): number of principal components to keep.
            names: list of column names, or one of the keyword strings
                'qois', 'parameters', 'all'.
            output_path (str): directory name for generated artifacts.
        """
        self._initialize_configuration(configuration=configuration)
        self._initialize_data(data=data)
        self._initialize_names(names=names)
        self.output_path = output_path

        assert isinstance(n_components, int)
        self.n_components = n_components

        self.scaler = None       # StandardScaler fit by make_pca_analysis()
        self.pca = None          # PCA fit by make_pca_analysis()
        self.cluster_ids = None  # labels found by the cluster methods

    def _initialize_configuration(self, configuration):
        """Accept a path (read it) or a PyposmatConfigurationFile."""
        if isinstance(configuration, str):
            assert os.path.isfile(configuration)
            self.configuration = PyposmatConfigurationFile()
            self.configuration.read(filename=configuration)
        elif isinstance(configuration, PyposmatConfigurationFile):
            self.configuration = configuration
        else:
            raise TypeError('configuration cannot be type:{}'.format(
                str(type(configuration))))

    def _initialize_data(self, data):
        """Accept a path (read it) or a PyposmatDataFile (deep-copied), then
        add the normalized error columns."""
        if isinstance(data, str):
            assert os.path.isfile(data)
            self.data = PyposmatDataFile()
            self.data.read(filename=data)
        elif isinstance(data, PyposmatDataFile):
            self.data = deepcopy(data)
        else:
            raise TypeError('data cannot be type:{}'.format(str(type(data))))

        self.data.create_normalized_errors(
            normalize_type='by_qoi_target',
            qoi_targets=self.configuration.qoi_targets)

    def _initialize_names(self, names):
        """Resolve the analysis columns from a list or a keyword string
        ('qois', 'parameters', 'all')."""
        if isinstance(names, list):
            self.names = list(names)
        elif isinstance(names, str):
            if names == 'qois':
                self.names = self.configuration.qoi_names
            elif names == 'parameters':
                self.names = self.configuration.parameter_names
            elif names == 'all':
                self.names = self.configuration.parameter_names + self.configuration.qoi_names
            else:
                raise TypeError
        else:
            raise TypeError

    def make_pca_analysis(self):
        """Standardize the self.names columns, fit a PCA, and store the
        projected coordinates as pca_1..pca_n columns on self.data.df."""
        data_ = self.data.df[self.names]

        self.scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
        self.scaled_values = self.scaler.fit_transform(data_)

        self.pca = PCA(n_components=self.n_components)
        pca_components = self.pca.fit_transform(self.scaled_values)

        for i in range(self.n_components):
            self.data.df['pca_{}'.format(i + 1)] = pca_components[:, i]

    def _add_pca_projection_columns(self):
        """Add 2-component PCA projections of the parameters, the qois, and
        both combined, as columns on self.data.df.

        (Extracted from plot_pca_analysis/plot_cluster_analysis, which
        previously duplicated this code verbatim.)
        """
        parameters_df = self.data.df[self.configuration.parameter_names]
        scaled_parameters = StandardScaler().fit_transform(parameters_df)
        parameters_pca = PCA(n_components=2).fit_transform(scaled_parameters)
        self.data.df['pca_param_1'] = parameters_pca[:, 0]
        self.data.df['pca_param_2'] = parameters_pca[:, 1]

        qoi_df = self.data.df[self.configuration.qoi_names]
        scaled_qois = StandardScaler().fit_transform(qoi_df)
        qoi_pca = PCA(n_components=2).fit_transform(scaled_qois)
        self.data.df['pca_qoi_1'] = qoi_pca[:, 0]
        self.data.df['pca_qoi_2'] = qoi_pca[:, 1]

        all_names = self.configuration.parameter_names + self.configuration.qoi_names
        all_df = self.data.df[all_names]
        all_scaled = StandardScaler().fit_transform(all_df)
        all_pca = PCA(n_components=2).fit_transform(all_scaled)
        self.data.df['pca_1'] = all_pca[:, 0]
        self.data.df['pca_2'] = all_pca[:, 1]

    def plot_pca_analysis(self):
        """Show side-by-side scatter plots of the parameter, qoi, and
        combined 2D PCA projections."""
        fig, ax = plt.subplots(1, 3)

        self._add_pca_projection_columns()

        ax[0].scatter(self.data.df['pca_param_1'],
                      self.data.df['pca_param_2'],
                      s=1.)
        ax[0].set_xlabel('pca_param_1')
        ax[0].set_ylabel('pca_param_2')

        ax[1].scatter(self.data.df['pca_qoi_1'],
                      self.data.df['pca_qoi_2'],
                      s=1.)
        ax[1].set_xlabel('pca_qoi_1')
        ax[1].set_ylabel('pca_qoi_2')

        ax[2].scatter(self.data.df['pca_1'], self.data.df['pca_2'], s=1.)
        ax[2].set_xlabel('pca_1')
        ax[2].set_ylabel('pca_2')

        fig.tight_layout()
        plt.show()

    def plot_cluster_analysis(self, cluster_type='dbscan'):
        """Cluster the combined 2D PCA projection and show the three
        projections colored by cluster.

        Args:
            cluster_type (str): 'dbscan' or 'optics'.

        Raises:
            ValueError: for an unknown cluster_type (previously this fell
                through and crashed later with a NameError).
        """
        fig, ax = plt.subplots(1, 3)

        self._add_pca_projection_columns()

        if cluster_type == 'dbscan':
            dbscan_args = {'eps': 0.25}
            cluster = DBSCAN(**dbscan_args)
        elif cluster_type == 'optics':
            optics_args = {
                'min_samples': 20,
                'xi': 0.1,
                'min_cluster_size': 0.1
            }
            # BUGFIX: optics_args was built but never passed to OPTICS().
            cluster = OPTICS(**optics_args)
        else:
            raise ValueError('unknown cluster_type:{}'.format(cluster_type))

        self.data.df['cluster_id'] = cluster.fit_predict(
            self.data.df[['pca_1', 'pca_2']])
        self.cluster_ids = list(set(self.data.df['cluster_id'].values))

        # One scatter series per cluster so each gets its own color.
        for i in self.cluster_ids:
            mask = self.data.df['cluster_id'] == i
            ax[0].scatter(self.data.df['pca_param_1'].loc[mask],
                          self.data.df['pca_param_2'].loc[mask],
                          s=1.)
            ax[1].scatter(self.data.df['pca_qoi_1'].loc[mask],
                          self.data.df['pca_qoi_2'].loc[mask],
                          s=1.)
            ax[2].scatter(self.data.df['pca_1'].loc[mask],
                          self.data.df['pca_2'].loc[mask],
                          s=1.)

        ax[0].set_xlabel('pca_param_1')
        ax[0].set_ylabel('pca_param_2')
        ax[1].set_xlabel('pca_qoi_1')
        ax[1].set_ylabel('pca_qoi_2')
        ax[2].set_xlabel('pca_1')
        ax[2].set_ylabel('pca_2')

        fig.tight_layout()
        plt.show()

    def do_cluster_analysis(self, n_components):
        """Fit a GMM with n_components on the self.names columns, assign a
        cluster_id column, and summarize each cluster (size, GMM weight,
        mean, covariance, per-cluster parameter/qoi statistics)."""
        data_ = self.data.df[self.names]
        gmm = GaussianMixture(n_components=n_components,
                              covariance_type='full',
                              random_state=0).fit(data_)
        self.data.df['cluster_id'] = gmm.predict(data_)
        self.cluster_ids = sorted(set(self.data.df['cluster_id']))

        self.clusters = OrderedDict()
        for cluster_id in self.cluster_ids:
            self.clusters[cluster_id] = OrderedDict([
                ('cluster_id', cluster_id),
                ('N', self.data.df.loc[self.data.df['cluster_id'] ==
                                       cluster_id].shape[0])
            ])

        # NOTE(review): the loops below assume every label 0..n-1 appears in
        # the predictions; an empty mixture component would raise KeyError.
        n_clusters = len(self.cluster_ids)
        for i in range(n_clusters):
            self.clusters[i]['weight'] = gmm.weights_[i]
            self.clusters[i]['mean'] = gmm.means_[i, :]
            self.clusters[i]['covariance'] = gmm.covariances_[i, :]

        for i in range(n_clusters):
            self.clusters[i][
                'parameters'] = self._do_parameter_cluster_analysis(i)
            self.clusters[i]['qois'] = self._do_qoi_cluster_analysis(i)

    def _do_parameter_cluster_analysis(self, cluster_id):
        """Return mean/std of the parameter columns within one cluster."""
        assert isinstance(cluster_id, int)
        assert cluster_id in self.cluster_ids

        data_ = self.data.df.loc[self.data.df['cluster_id'] == cluster_id]
        analysis_dict = OrderedDict()
        analysis_dict['mean'] = data_[
            self.configuration.parameter_names].mean()
        analysis_dict['std'] = data_[self.configuration.parameter_names].std()
        return analysis_dict

    def _do_qoi_cluster_analysis(self, cluster_id):
        """Return mean/std of the qoi columns within one cluster."""
        assert isinstance(cluster_id, int)
        assert cluster_id in self.cluster_ids

        data_ = self.data.df.loc[self.data.df['cluster_id'] == cluster_id]
        analysis_dict = OrderedDict()
        analysis_dict['mean'] = data_[self.configuration.qoi_names].mean()
        analysis_dict['std'] = data_[self.configuration.qoi_names].std()
        return analysis_dict

    @staticmethod
    def plot_ellipse(position, covariance, ax=None, **kwargs):
        """Draw 1-, 2-, and 3-sigma ellipses for a 2D Gaussian component."""
        # BUGFIX: the module is matplotlib.patches (was matplotlib.matches,
        # which raised ImportError).
        from matplotlib.patches import Ellipse

        if ax is None:
            fig, ax = plt.subplots(1, 1)

        if covariance.shape == (2, 2):
            # Full covariance: principal axes and rotation from the SVD.
            U, s, Vt = np.linalg.svd(covariance)
            angle = np.degrees(np.arctan2(U[1, 0], U[0, 0]))
            width, height = 2 * np.sqrt(s)
        else:
            # Diagonal/spherical covariance: axis-aligned ellipse.
            angle = 0
            width, height = 2 * np.sqrt(covariance)

        # draw ellipse
        for nsig in range(1, 4):
            ax.add_patch(
                Ellipse(position, nsig * width, nsig * height, angle,
                        **kwargs))

    @staticmethod
    def plot(gmm_obj,
             X,
             labels=None,
             ax=None,
             dpi=1200,
             filename=None,
             xlims=None,
             ylims=None):
        """Scatter-plot X colored by GMM cluster, overlaying component
        ellipses.  Shows the figure, or saves it when filename is given."""
        # BUGFIX: was `gmmi_obj`, an undefined name.
        cluster_id = gmm_obj.fit(X).predict(X)
        plt.close('all')

        if ax is None:
            fig, ax = plt.subplots(1, 1)

        if isinstance(X, np.ndarray):
            x = X[:, 0]
            y = X[:, 1]
        elif isinstance(X, pd.DataFrame):
            x = X[X.columns[0]]
            y = X[X.columns[1]]

        ax.scatter(x,
                   y,
                   c=cluster_id,
                   s=1,
                   cmap='viridis',
                   zorder=2,
                   label=[k + 1 for k in cluster_id])

        # Ellipse opacity scales with component weight.
        # BUGFIX: was `gmm` and an unqualified `plot_ellipse`, both undefined
        # names inside this staticmethod.
        w_factor = 0.2 / gmm_obj.weights_.max()
        for pos, covar, w in zip(gmm_obj.means_, gmm_obj.covariances_,
                                 gmm_obj.weights_):
            PcaAnalysis.plot_ellipse(pos, covar, alpha=w * w_factor, ax=ax)

        if labels is None:
            ax.set_xlabel(X.columns[0])
            ax.set_ylabel(X.columns[1])
        else:
            ax.set_xlabel(labels[0])
            ax.set_ylabel(labels[1])

        if xlims is not None:
            ax.set_xlim(xlims)
        if ylims is not None:
            ax.set_ylim(ylims)

        ax.legend()
        #ax.set(adjustable='box', aspect='equal')
        # NOTE(review): `fig` is only bound when ax was None; saving with a
        # caller-supplied ax would raise NameError -- confirm intended usage.
        if filename is None:
            plt.show()
        else:
            fig.set_size_inches(5, 5)
            fig.tight_layout()
            fig.savefig(filename, dpi=dpi)
import os
import pypospack.utils
from pypospack.pyposmat.data import PyposmatDataFile, PyposmatConfigurationFile
from pypospack.pyposmat.visualization.parallel_plot_new import PyposmatParallelCoordinatesPlot


if __name__ == "__main__":
    # Build a parallel-coordinates plot comparing two KDE iterations of the
    # Si Stillinger-Weber pareto optimization.
    ppcp = PyposmatParallelCoordinatesPlot()
    configfile = PyposmatConfigurationFile()
    # sorry about the absolute paths
    configfile.read("/home/seaton/python-repos/pypospack/data/Si__sw__data/pareto_optimization_p_3.5_q_0.5/pyposmat.config.in")
    datafile = PyposmatDataFile()
    datafile.read("/home/seaton/python-repos/pypospack/data/Si__sw__data/pareto_optimization_p_3.5_q_0.5/pyposmat.kde.5.out")
    datafile.create_normalized_errors(qoi_targets=configfile.qoi_targets)
    df = datafile.df
    ppcp.add_dataframe("blue", "kde5", df, names=datafile.normalized_names)
    # NOTE(review): `datafile` is re-read in place below; if add_dataframe
    # stored `df` by reference rather than copying, the first series may now
    # reflect kde.15 data -- verify add_dataframe copies its input.
    datafile.read("/home/seaton/python-repos/pypospack/data/Si__sw__data/pareto_optimization_p_3.5_q_0.5/pyposmat.kde.15.out")    
    datafile.create_normalized_errors(qoi_targets=configfile.qoi_targets)
    # NOTE(review): add_datafile here vs add_dataframe above -- confirm both
    # methods exist on PyposmatParallelCoordinatesPlot.
    ppcp.add_datafile("orange", "kde15", datafile, names=datafile.normalized_names)
    ppcp.make_plot(filename="parallel_plot.png", xlabels=datafile.normalized_names, 
                   ylabel="% error", title="Si sw", ylim=(-175, 25), legend_loc="lower right")