def change_cwd_data(self):
    """Switch the process working directory to this instance's data folder.

    The target folder is built from ``DATA_PATH`` using the instance's
    dimension and reference distribution. The directory that was current
    before the switch is returned so the caller can restore it later.
    """
    previous_dir = os.getcwd()
    target = DATA_PATH.format(dim=self.__dim,
                              reference=self.__reference_distribution)
    os.chdir(target)
    return previous_dir
# ===== 示例 #2 (Example #2) =====
from Constants.Constants import SIZE_SET, DIMENSION_SET, SEED
from Constants.Expanded_Constants import REFERENCE_LIST
from Constants.Storage_Constants import DATA_PATH

# Sweep PCA over every (dimension, sample size, reference distribution)
# combination, fitting at several explained-variance thresholds.
# NOTE(review): this fragment relies on `os`, `glob`, `pd` (pandas) and
# `PCA` (sklearn.decomposition) which are not imported in the visible
# import block above — presumably lost in extraction; verify at the top
# of the original file.
n_components = []
explained_variance_ratio = []
for dim in DIMENSION_SET:
    print('Dimension:', dim)
    size_n_components = []
    size_explained_var = []
    for size in SIZE_SET:
        dist_n_components = []
        dist_explained_var = []
        print('\t' + str(size))
        for reference in REFERENCE_LIST:
            # Move into the data folder for this (dim, reference) pair so
            # the glob below matches the parquet files stored there.
            os.chdir(DATA_PATH.format(dim=dim, reference=reference))
            size_files = glob.glob('*Data Set ' + str(size) + '.parquet_' +
                                   reference + '_gz')
            # Feature columns U0..U<size>, V0..V<size> (size + 1 of each).
            feature_columns = ['U' + str(i) for i in range(size + 1)
                               ] + ['V' + str(i) for i in range(size + 1)]
            # Concatenate every matching parquet file into one frame,
            # keeping only the U/V feature columns.
            size_df = pd.concat(map(lambda x: pd.read_parquet(x), size_files),
                                ignore_index=True)[feature_columns]
            for percent in [0.85, 0.90, 0.95, 0.99, 0.999]:
                print('\t\t' + reference + '\t' + str(percent), end=' \t')
                # 0 < n_components < 1 makes sklearn pick the smallest
                # component count explaining `percent` of the variance.
                pca = PCA(n_components=percent, random_state=SEED)
                pca.fit(size_df)
                np_values = pca.transform(size_df)
                print(str(pca.n_components_), flush=True)
                # NOTE(review): `pca.n_components` is the constructor
                # argument (the variance fraction, e.g. 0.85), not the
                # fitted component count. The print above uses the fitted
                # attribute `pca.n_components_`; these appends likely
                # should too — confirm intent before changing.
                n_components.append(pca.n_components)
                dist_n_components.append(pca.n_components)
                # NOTE(review): fragment is truncated here in this view —
                # the print call below is cut off mid-statement.
                print('\t\t\tExplained Variance:',
# ===== 示例 #3 (Example #3) =====
import glob
import os

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from Constants.Constants import SIZE_SET
from Constants.Expanded_Constants import REFERENCE_LIST
from Constants.Storage_Constants import DATA_PATH

# For each sample size and reference distribution, scatter-plot the (U, V)
# columns of every matching dim-1 parquet file on a single labelled figure.
# NOTE(review): this fragment appears truncated in this view — the figure
# is titled but never saved or shown; presumably a savefig/show follows.
for size in SIZE_SET:
    print(1, size, 'Complete')
    for reference in REFERENCE_LIST:
        print('\t', reference, sep='')
        # Move into the dim-1 data folder for this reference so the glob
        # below matches the parquet files stored there.
        os.chdir(DATA_PATH.format(dim=1, reference=reference))
        cluster = glob.glob('*Set ' + str(size) + '.parquet_' + reference +
                            '_gz')
        # Start from a clean figure for each (size, reference) pair.
        plt.close('all')
        for file in cluster:
            # Legend label = first space-separated token of the filename
            # (presumably the distribution name).
            # NOTE(review): `[:1]` keeps only the first word, making the
            # join a no-op — if a multi-word name was intended, this may
            # have meant `[:-1]`; confirm against the filename scheme.
            dist = file.split(' ')[:1]
            dist_name = ' '.join(dist)
            # float32 keeps memory down for large sample files.
            file_df = pd.read_parquet(file).astype(dtype=np.float32)
            print('\t\t', file, sep='')
            u = file_df['U'].to_numpy()
            v = file_df['V'].to_numpy()
            plt.plot(u, v, '.', label=dist_name)
        plt.legend()
        plt.xlabel('U')
        plt.ylabel('V')
        plt.title('Dim 1 Size ' + str(size) + ' ' + reference)