Example #1
    def entropy_cleaning(self, matrix, targ_limit=150):
        """
		Entropy-cleaning of lightcurve matrix using the SVD U-matrix.

		Parameters:
			matrix (:class:`numpy.ndarray`): Matrix of light curves to be cleaned.
			targ_limit (int, optional): Maximum number of targets to remove during cleaning.

		.. codeauthor:: Mikkel N. Lund <*****@*****.**>
		"""
        logger = logging.getLogger(__name__)

        # Calculate the principal components:
        pca = PCA(self.ncomponents, random_state=self.random_state)
        U, _, _ = pca._fit(matrix)

        ent = compute_entropy(U)
        logger.info('Entropy start: %s', ent)

        targets_removed = 0
        components = np.arange(self.ncomponents)

        with np.errstate(invalid='ignore'):
            while np.any(ent < self.threshold_entropy):
                com = components[ent < self.threshold_entropy][0]

                # Remove highest relative weight target
                m = nanmedian(U[:, com])
                s = mad_to_sigma * nanmedian(np.abs(U[:, com] - m))
                dev = np.abs(U[:, com] - m) / s

                idx0 = np.argmax(dev)

                # Remove the star from the lightcurve matrix:
                star_no = np.ones(U.shape[0], dtype=bool)
                star_no[idx0] = False
                matrix = matrix[star_no, :]

                targets_removed += 1
                if targets_removed >= targ_limit:
                    break

                U, _, _ = pca._fit(matrix)
                ent = compute_entropy(U)

        logger.info('Entropy end: %s', ent)
        logger.info('Targets removed: %d', targets_removed)
        return matrix
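A minimal sketch of the robust scatter estimate used above, assuming mad_to_sigma is the usual MAD-to-standard-deviation factor 1/Phi^-1(3/4) ~ 1.4826 (Example #5 below hard-codes it as 1.46); the variable names here are illustrative only:

import numpy as np
from scipy.stats import norm

mad_to_sigma = 1.0 / norm.ppf(0.75)  # ~1.4826

rng = np.random.default_rng(0)
col = rng.normal(loc=0.0, scale=2.0, size=10000)
m = np.nanmedian(col)
sigma = mad_to_sigma * np.nanmedian(np.abs(col - m))
print(sigma)  # should be close to the true scale of 2.0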
Example #2
def PCA_on_training_model():
    file_list = interface.get_available_sha256()
    ex_list = np.array([
        pefeatures.PEFeatureExtractor().extract(interface.fetch_file(b))
        for b in file_list
    ])
    print("all_samples: ", ex_list.shape)
    # nor_list = normalize(ex_list, axis=0)
    # nor_list = MinMaxScaler().fit_transform(ex_list)

    nor_list, data_min, data_max, scale_, min_ = MinMaxImp(ex_list)

    pca = PCA(n_components=0.99)  # keep enough components to explain 99% of the variance
    U, S, V = pca._fit(nor_list)  # _fit performs the fit and returns the SVD factors
    # dic_elements = {"n_component":pca.n_components_, "scale_":scale_, "min_":min_}
    dic_elements = {"n_component": pca.n_components_}
    np.save("pca_models/features.npy", ex_list)
    np.save("pca_models/nor_features.npy", nor_list)
    np.save("pca_models/U.npy", U)
    np.save("pca_models/S.npy", S)
    np.save("pca_models/V.npy", V)
    np.save("pca_models/scale.npy", scale_)
    np.save("pca_models/min.npy", min_)
    createDictCSV("pca_models/dic_elements.csv", dic_elements)
    print("reduced dimension: ", pca.n_components_)
    return ex_list, nor_list, U, S, V
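A hedged sketch of re-using the saved scaling parameters on a new feature vector; this assumes MinMaxImp returns scikit-learn-style MinMaxScaler parameters, whose transform is x * scale_ + min_. Note the PCA mean is not saved above, so a full projection onto the components would need more than the stored arrays:

import numpy as np

scale_ = np.load("pca_models/scale.npy")
min_ = np.load("pca_models/min.npy")

def scale_new_sample(raw_features):
    # Same affine map that MinMaxScaler.transform applies per feature
    return raw_features * scale_ + min_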
Example #3
def pca(X, rank=None):
    """
    Computes the PCA of X, where the observations are on the rows. Suppose X is (n x d) (n observations) and r = min(n, d, rank); then

    U (n x r): the scores
    D (r x r): the singular values
    V (d x r): the loadings

    Parameters
    ----------
    X (numpy matrix/array): the data matrix

    rank (None, int): the number of PCs to compute. If None, will compute
    the full PCA

    Output
    ------
    U, D, V

    """
    # m = np.asarray(X.mean(axis=0)).reshape(-1)
    # m = X.mean(axis=0)
    # X_cent = X - np.outer(np.ones((X.shape[0],)), m)
    pca = PCA(n_components=rank, random_state=42, svd_solver='randomized')
    return pca._fit(X)
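For comparison, a sketch that stays on the public scikit-learn API instead of the private _fit call; fit_transform returns the scores (U scaled by the singular values) and the loadings are exposed as components_:

import numpy as np
from sklearn.decomposition import PCA

X = np.random.RandomState(0).randn(100, 20)
est = PCA(n_components=5, svd_solver='randomized', random_state=42)
scores = est.fit_transform(X)   # U times the singular values
D = est.singular_values_        # singular values
V = est.components_.T           # (d x r) loadings, matching the docstring above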
Example #4
def learn_PCA_matrix_for_spocs_with_sklearn(spocs, desired_dimension):
    print('spocs in learn PCA ', spocs.shape)
    pca = PCA(n_components=desired_dimension)
    U, S, V = pca._fit(torch.t(spocs).cpu().numpy())
    print('U ', U.shape)
    print('S ', S.shape)
    print('V ', V.shape)
    print('pca.components_.shape', pca.components_.shape)
    return U[:, :desired_dimension], S[:desired_dimension]
Example #5
def clean_cbv(Matrix, n_components, ent_limit=-1.5, targ_limit=50):
    logger = logging.getLogger(__name__)

    # Calculate the principal components:
    logger.info("Doing Principal Component Analysis...")
    pca = PCA(n_components)
    U, _, _ = pca._fit(Matrix)

    Ent = compute_entopy(U)
    logger.info('Entropy start: ' + str(Ent))

    targets_removed = 0
    components = np.arange(n_components)

    with np.errstate(invalid='ignore'):
        while np.any(Ent < ent_limit):
            com = components[(Ent < ent_limit)][0]

            # Remove highest relative weight target
            m = nanmedian(U[:, com])
            s = 1.46 * nanmedian(np.abs(U[:, com] - m))
            dev = np.abs(U[:, com] - m) / s

            idx0 = np.argmax(dev)

            star_no = np.ones(U.shape[0], dtype=bool)
            star_no[idx0] = False

            Matrix = Matrix[star_no, :]
            U, _, _ = pca._fit(Matrix)

            targets_removed += 1

            if targets_removed > targ_limit:
                break

            Ent = compute_entopy(U)

    logger.info('Entropy end: ' + str(Ent))
    logger.info('Targets removed: ' + str(int(targets_removed)))
    return Matrix
Example #6
    def compute_cbvs(self, targ_limit=150):
        """
		Main function for computing CBVs.

		The steps taken in the function are:

		#. Run :meth:`lightcurve_matrix` to obtain matrix with gap-filled,
		   nan-removed light curves for the most correlated stars in a given cbv-area.

		#. Compute principal components.

		#. Run :meth:`entropy_cleaning` to remove significant single-star
		   contributors based on entropy.

		#. Rerun SNR test on CBVs, and only retain CBVs that pass the test.

		#. Recalculate principal components using cleaned star list.

		#. Save CBVs and make diagnostics plots.

		Parameters:
			targ_limit (int, optional): Maximum number of targets to remove during entropy-cleaning.

		.. codeauthor:: Mikkel N. Lund <*****@*****.**>
		.. codeauthor:: Rasmus Handberg <*****@*****.**>
		"""

        logger = logging.getLogger(__name__)
        logger.info('running CBV')
        logger.info('------------------------------------')

        if 'cbv-ini' in self.hdf:
            logger.info(
                'CBV for SECTOR=%d, CADENCE=%d, AREA=%d already calculated.',
                self.sector, self.cadence, self.cbv_area)
            return
        logger.info('Computing CBV for SECTOR=%d, CADENCE=%d, AREA=%d...',
                    self.sector, self.cadence, self.cbv_area)

        # Extract or compute cleaned and gapfilled light curve matrix
        mat, indx_nancol, Ntimes = self.lightcurve_matrix()

        # Calculate initial CBVs
        logger.info('Computing %d CBVs', self.ncomponents)
        pca = PCA(self.ncomponents, random_state=self.random_state)
        U0, _, _ = pca._fit(mat)

        cbv0 = np.full((Ntimes, self.ncomponents), np.nan, dtype='float64')
        cbv0[~indx_nancol, :] = np.transpose(pca.components_)

        # Clean away targets that contribute significantly
        # as a single star to a given CBV (based on entropy)
        logger.info('Doing Entropy Cleaning...')
        mat = self.entropy_cleaning(mat, targ_limit=targ_limit)

        # Calculate the principal components of cleaned matrix
        logger.info("Doing Principal Component Analysis...")
        U, _, _ = pca._fit(mat)

        cbv = np.full((Ntimes, self.ncomponents), np.nan, dtype='float64')
        cbv[~indx_nancol, :] = np.transpose(pca.components_)

        # Signal-to-Noise test (here only for plotting)
        #indx_lowsnr = cbv_snr_test(cbv, self.threshold_snrtest)

        # Save the CBV to file:
        self.hdf.create_dataset('cbv-ini', data=cbv)

        #------------------------ PLOTS ---------------------------
        # Plot the "effectiveness" of each CBV:
        max_components = 20
        n_cbv_components = np.arange(max_components, dtype=int)
        pca_scores = compute_scores(mat, n_cbv_components)

        fig0 = plt.figure(figsize=(12, 8))
        ax0 = fig0.add_subplot(121)
        ax0.plot(n_cbv_components, pca_scores, 'b', label='PCA scores')
        ax0.set_xlabel('nb of components')
        ax0.set_ylabel('CV scores')
        ax0.legend(loc='lower right')
        ax02 = fig0.add_subplot(122)
        ax02.plot(np.arange(1, cbv0.shape[1] + 1),
                  pca.explained_variance_ratio_, '.-')
        ax02.axvline(x=cbv.shape[1] + 0.5, ls='--', color='k')
        ax02.set_xlabel('CBV number')
        ax02.set_ylabel('Variance explained ratio')
        fig0.savefig(
            os.path.join(
                self.cbv_plot_folder,
                f'cbv-perf-s{self.sector:04d}-c{self.cadence:04d}-a{self.cbv_area:d}.png'
            ))
        plt.close(fig0)

        # Plot all the CBVs:
        fig, axes = plt.subplots(int(np.ceil(self.ncomponents / 2)),
                                 2,
                                 figsize=(12, 16))
        fig2, axes2 = plt.subplots(int(np.ceil(self.ncomponents / 2)),
                                   2,
                                   figsize=(12, 16))
        fig.subplots_adjust(wspace=0.23,
                            hspace=0.46,
                            left=0.08,
                            right=0.96,
                            top=0.94,
                            bottom=0.055)
        fig2.subplots_adjust(wspace=0.23,
                             hspace=0.46,
                             left=0.08,
                             right=0.96,
                             top=0.94,
                             bottom=0.055)

        for k, ax in enumerate(axes.flatten()):
            if k < cbv0.shape[1]:
                #if indx_lowsnr is not None and indx_lowsnr[k]:
                #	col = 'c'
                #else:
                #	col = 'k'

                ax.plot(cbv0[:, k] + 0.1, 'r-')
                ax.plot(cbv[:, k], ls='-', color='k')
                ax.set_title(f'Basis Vector {k+1:d}')

        for k, ax in enumerate(axes2.flatten()):
            if k < U0.shape[1]:
                ax.plot(-np.abs(U0[:, k]), 'r-')
                ax.plot(np.abs(U[:, k]), 'k-')
                ax.set_title(f'Basis Vector {k+1:d}')

        fig.savefig(
            os.path.join(
                self.cbv_plot_folder,
                f'cbvs_ini-s{self.sector:04d}-c{self.cadence:04d}-a{self.cbv_area:d}.png'
            ))
        fig2.savefig(
            os.path.join(
                self.cbv_plot_folder,
                f'U_cbvs-s{self.sector:04d}-c{self.cadence:04d}-a{self.cbv_area:d}.png'
            ))
        plt.close(fig)
        plt.close(fig2)
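A plausible sketch of the compute_scores helper used in the plotting section above; the assumption is that it returns cross-validated log-likelihood scores of a PCA model for each candidate number of components, as in scikit-learn's model-selection example:

import numpy as np
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score

def compute_scores(X, n_components):
    pca = PCA()
    scores = []
    for n in n_components:
        pca.n_components = n
        # Mean cross-validated log-likelihood of the fitted PCA model
        scores.append(np.mean(cross_val_score(pca, X)))
    return scores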
Example #7

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

#matplotlib inline

df = pd.read_csv("epilepsy.csv")
df['y'].value_counts()

del df['Unnamed: 0']

y=df['y']
x=df.drop(columns=["y"])

model_lr = LogisticRegression()
model_lr.fit(x,y)
model_lr.score(np.array(x),np.array(y))

from sklearn.decomposition import PCA
pcs = PCA(n_components=33)  # Keep adjusting n_components until the cumulative explained variance ratio (EVR) is close to 0.90
pcs._fit(x)
pcs.explained_variance_
evr=pcs.explained_variance_ratio_
np.sum(evr)
pincomp=pcs.components_
pincomp.shape
scoring_matrix = pcs.transform(x)
scoring_matrix

model_lr_1 = LogisticRegression()
model_lr_1.fit(scoring_matrix,y)
model_lr_1.score(scoring_matrix,y)
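A hedged alternative to manually tuning n_components (continuing from the variables above): passing a float in (0, 1) makes scikit-learn keep just enough components to reach that share of explained variance, which automates the ~90% target the comment above aims for:

from sklearn.decomposition import PCA

pcs_auto = PCA(n_components=0.90, svd_solver='full')
scoring_matrix_auto = pcs_auto.fit_transform(x)
print(pcs_auto.n_components_, pcs_auto.explained_variance_ratio_.sum())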
Example #8
def learn_pca_matrix_for_spocs_with_sklearn(spocs, desired_dimension):
    pca = PCA(n_components=desired_dimension)
    u, s, v = pca._fit(torch.t(spocs).cpu().numpy())
    return u[:, :desired_dimension], s[:desired_dimension]
Example #9
def do_pca(X):
    pca = PCA()
    U, S, V = pca._fit(X)
    X_transformed = np.dot(X - pca.mean_, pca.components_.T)
    return pca, X_transformed
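A quick usage sketch (not part of the original): the manual projection in do_pca mirrors what the public PCA.transform computes, so the two results should agree up to floating point:

import numpy as np

X = np.random.RandomState(0).randn(50, 8)
pca_model, X_manual = do_pca(X)
assert np.allclose(X_manual, pca_model.transform(X))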
Example #10
    def compute_cbvs(self, cbv_area, ent_limit=-1.5, targ_limit=150):
        """
		Main function for computing CBVs.

		The steps taken in the function are:
			1: run :py:func:`CBVCorrector.lc_matrix_clean` to obtain matrix with gap-filled, nan-removed light curves
			for the most correlated stars in a given cbv-area
			2: compute principal components and remove significant single-star contributors based on entropy
			3: rerun SNR test on CBVs, and only retain CBVs that pass the test
			4: save CBVs and make diagnostics plots

		Parameters:
			*self*: all parameters defined in class init

		Returns:
			Saves CBVs per cbv-area in ".npy" files

		.. codeauthor:: Mikkel N. Lund <*****@*****.**>
		"""

        logger = logging.getLogger(__name__)
        logger.info('running CBV')
        logger.info('------------------------------------')

        if os.path.exists(
                os.path.join(self.data_folder, 'cbv_ini-%s-%d.npy' %
                             (self.datasource, cbv_area))):
            logger.info('CBV for area %d already calculated' % cbv_area)
            return

        else:
            logger.info('Computing CBV for %s area %d' %
                        (self.datasource, cbv_area))

            # Extract or compute cleaned and gapfilled light curve matrix
            mat0, _, indx_nancol, Ntimes = self.lc_matrix_clean(cbv_area)

            # Calculate initial CBVs
            logger.info('Computing %d CBVs' % self.ncomponents)
            pca0 = PCA(self.ncomponents)
            U0, _, _ = pca0._fit(mat0)

            cbv0 = np.empty((Ntimes, self.ncomponents), dtype='float64')
            cbv0.fill(np.nan)
            cbv0[~indx_nancol, :] = np.transpose(pca0.components_)

            logger.info(
                'Cleaning matrix for CBV - remove single dominant contributions'
            )

            # Clean away targets that contribute significantly as a single star to a given CBV (based on entropy)
            mat = clean_cbv(mat0, self.ncomponents, ent_limit, targ_limit)

            # Calculate the principal components of cleaned matrix
            logger.info("Doing Principal Component Analysis...")
            pca = PCA(self.ncomponents)
            U, _, _ = pca._fit(mat)

            cbv = np.empty((Ntimes, self.ncomponents), dtype='float64')
            cbv.fill(np.nan)
            cbv[~indx_nancol, :] = np.transpose(pca.components_)

            #			# Signal-to-Noise test (here only for plotting)
            #			indx_lowsnr = cbv_snr_test(cbv, self.threshold_snrtest)

            # Save the CBV to file:
            np.save(
                os.path.join(self.data_folder, 'cbv_ini-%s-%d.npy' %
                             (self.datasource, cbv_area)), cbv)

            ####################### PLOTS #################################
            # Plot the "effectiveness" of each CBV:
            max_components = 20
            n_cbv_components = np.arange(max_components, dtype=int)
            pca_scores = compute_scores(mat, n_cbv_components)

            fig0 = plt.figure(figsize=(12, 8))
            ax0 = fig0.add_subplot(121)
            ax0.plot(n_cbv_components, pca_scores, 'b', label='PCA scores')
            ax0.set_xlabel('nb of components')
            ax0.set_ylabel('CV scores')
            ax0.legend(loc='lower right')
            ax02 = fig0.add_subplot(122)
            ax02.plot(np.arange(1, cbv0.shape[1] + 1),
                      pca.explained_variance_ratio_, '.-')
            ax02.axvline(x=cbv.shape[1] + 0.5, ls='--', color='k')
            ax02.set_xlabel('CBV number')
            ax02.set_ylabel('Variance explained ratio')
            fig0.savefig(
                os.path.join(
                    self.data_folder,
                    'cbv-perf-%s-area%d.png' % (self.datasource, cbv_area)))
            plt.close(fig0)

            # Plot all the CBVs:
            fig, axes = plt.subplots(int(np.ceil(self.ncomponents / 2)),
                                     2,
                                     figsize=(12, 16))
            fig2, axes2 = plt.subplots(int(np.ceil(self.ncomponents / 2)),
                                       2,
                                       figsize=(12, 16))
            fig.subplots_adjust(wspace=0.23,
                                hspace=0.46,
                                left=0.08,
                                right=0.96,
                                top=0.94,
                                bottom=0.055)
            fig2.subplots_adjust(wspace=0.23,
                                 hspace=0.46,
                                 left=0.08,
                                 right=0.96,
                                 top=0.94,
                                 bottom=0.055)

            for k, ax in enumerate(axes.flatten()):
                try:
                    ax.plot(cbv0[:, k] + 0.1, 'r-')
                    #					if not indx_lowsnr is None:
                    #						if indx_lowsnr[k]:
                    #							col = 'c'
                    #						else:
                    #							col = 'k'
                    #					else:
                    #						col = 'k'
                    ax.plot(cbv[:, k], ls='-', color='k')
                    ax.set_title('Basis Vector %d' % (k + 1))
                except IndexError:
                    # fewer CBVs than subplot panels
                    pass

            for k, ax in enumerate(axes2.flatten()):
                try:
                    ax.plot(-np.abs(U0[:, k]), 'r-')
                    ax.plot(np.abs(U[:, k]), 'k-')
                    ax.set_title('Basis Vector %d' % (k + 1))
                except IndexError:
                    # fewer CBVs than subplot panels
                    pass
            fig.savefig(
                os.path.join(
                    self.data_folder,
                    'cbvs_ini-%s-area%d.png' % (self.datasource, cbv_area)))
            fig2.savefig(
                os.path.join(
                    self.data_folder,
                    'U_cbvs-%s-area%d.png' % (self.datasource, cbv_area)))
            plt.close(fig)
            plt.close(fig2)
Example #11
def do_pca(X):
    pca = PCA()
    U, S, V = pca._fit(X)
    X_transformed = np.dot(X - pca.mean_, pca.components_.T)
    return pca, X_transformed
Example #12

def DPCA_cal(x, h):
    # x_normed = (x - x.min(0)) / x.ptp(0) #peak to peak normalization
    # Standardize the dataset: center each feature to zero mean and scale to unit variance
    x_normed = preprocessing.scale(x)
    x = x_normed
    x = Utility.Augmentation(x, 1, h)
    pca = PCA()
    U, S, V = pca._fit(x)  # _fit performs the full fit and returns the SVD factors
    p_component_threshold = 0.9
    ratio_sum = 0
    for i in range(len(pca.explained_variance_ratio_)):
        if ratio_sum > p_component_threshold:
            break
        else:
            ratio_sum = ratio_sum + pca.explained_variance_ratio_[i]

    # p_component_num = sum(pca.explained_variance_ratio_>p_component_threshold)
    p_component_num = i
    p_hat = pca.components_[0:p_component_num, :].T  # (d x p) loadings; sklearn stores directions as rows
    pi_hat = np.matmul(p_hat, np.transpose(p_hat))
    x_hat = np.matmul(x, pi_hat)

    p_til = pca.components_[p_component_num:, :].T  # residual-subspace directions
    pi_til = np.matmul(p_til, np.transpose(p_til))
    x_til = np.matmul(x, pi_til)

    x_reconstructed = x_hat + x_til

    x_pc = np.matmul(x, p_hat)
    x_res = np.matmul(x, p_til)

    x_hat_reconst = Utility.AugmentReverse(x_hat, 1, h)
    x_til_reconst = Utility.AugmentReverse(x_til, 1, h)
    x_augment_reconst = x_hat_reconst + x_til_reconst

    plt.figure(1)
    plt.subplot(2, 2, 1)
    plt.plot(x)
    plt.title('original x')
    plt.subplot(2, 2, 2)
    plt.plot(x_hat)
    plt.title('x_hat')
    plt.subplot(2, 2, 3)
    plt.plot(x_til)
    plt.title('x_til')
    plt.subplot(2, 2, 4)
    plt.plot(x_reconstructed)
    plt.title('x_reconstructed')

    plt.figure(2)
    plt.subplot(2, 2, 1)
    plt.plot(x)
    plt.title('original x')
    plt.subplot(2, 2, 2)
    plt.plot(x_hat_reconst)
    plt.title('x_hat_reconst')
    plt.subplot(2, 2, 3)
    plt.plot(x_til_reconst)
    plt.title('x_til_reconst')
    plt.subplot(2, 2, 4)
    plt.plot(x_augment_reconst)
    plt.title('x_augment_reconst')

    plt.figure(3)
    plt.subplot(2, 1, 1)
    plt.plot(x_pc)
    plt.title('data in principal subspaces')
    plt.subplot(2, 1, 2)
    plt.plot(x_res)
    plt.title('data in residual subspaces')

    plt.show()
    return x_hat_reconst
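A small sanity-check sketch for the decomposition above, on hypothetical data: with a full PCA the loading matrix is orthonormal, so the principal- and residual-subspace projectors sum to the identity and x_hat + x_til reproduces x:

import numpy as np
from sklearn.decomposition import PCA

rng = np.random.default_rng(0)
x_demo = rng.normal(size=(200, 6))
V = PCA().fit(x_demo).components_.T   # columns are orthonormal principal directions
p = 3                                 # hypothetical split between the subspaces
pi_hat = V[:, :p] @ V[:, :p].T
pi_til = V[:, p:] @ V[:, p:].T
assert np.allclose(pi_hat + pi_til, np.eye(x_demo.shape[1]))
assert np.allclose(x_demo @ pi_hat + x_demo @ pi_til, x_demo)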
Example #13
def main():
    logger.debug('App started')

    parser = argparse.ArgumentParser(description='Key processing tool')
    parser.add_argument('-t',
                        '--threads',
                        dest='threads',
                        type=int,
                        default=None,
                        help='Number of threads to use for cert download')
    parser.add_argument('--debug',
                        dest='debug',
                        action='store_const',
                        const=True,
                        help='enables debug mode')
    parser.add_argument('--verbose',
                        dest='verbose',
                        action='store_const',
                        const=True,
                        help='enables verbose mode')

    parser.add_argument('--dump-json',
                        dest='dump_json',
                        action='store_const',
                        const=True,
                        help='dumps JSON of the filtered certificates')
    parser.add_argument('--dump-cert',
                        dest='dump_cert',
                        action='store_const',
                        const=True,
                        help='dumps PEM of the filtered certificates')

    parser.add_argument(
        '-f',
        '--filter-org',
        dest='filter_org',
        help='Filter out certificates issued with given organization - regex')
    parser.add_argument(
        '--filter-domain',
        dest='filter_domain',
        help='Filter out certificates issued for the given domain - regex')

    parser.add_argument('--pubs',
                        dest='pubs',
                        nargs=argparse.ZERO_OR_MORE,
                        help='File with public keys (PEM)')

    parser.add_argument('--certs',
                        dest='certs',
                        nargs=argparse.ZERO_OR_MORE,
                        help='File with certificates (PEM)')

    parser.add_argument('--ossl',
                        dest='ossl',
                        type=int,
                        default=None,
                        help='OpenSSL generator')

    parser.add_argument('--per-key-stat',
                        dest='per_key_stat',
                        action='store_const',
                        const=True,
                        help='Print prob matching for each key')

    parser.add_argument('--subs',
                        dest='subs',
                        action='store_const',
                        const=True,
                        help='Plot random subgroups charts')
    parser.add_argument('--subs-k',
                        dest='subs_k',
                        type=int,
                        default=5,
                        help='Size of the subset')
    parser.add_argument('--subs-n',
                        dest='subs_n',
                        type=int,
                        default=1000,
                        help='Number of subsets to sample')

    parser.add_argument('--pca-src',
                        dest='pca_src',
                        action='store_const',
                        const=True,
                        help='Plot PCA sampled distribution vs collected one')
    parser.add_argument(
        '--pca-src-n',
        dest='pca_src_n',
        type=int,
        default=10000,
        help='Number of subsets to sample from source distributions')
    parser.add_argument('--pca-src-k',
                        dest='pca_src_k',
                        type=int,
                        default=3,
                        help='Size of the subset from the source distribution')

    parser.add_argument('--pca-grp',
                        dest='pca_grp',
                        action='store_const',
                        const=True,
                        help='Plot PCA on the input keys (groups)')

    parser.add_argument('--mixture',
                        dest='mixture',
                        action='store_const',
                        const=True,
                        help='Mixture distribution on masks - sources')

    parser.add_argument('--distrib',
                        dest='distrib',
                        action='store_const',
                        const=True,
                        help='Plot distributions - to the PDF')

    parser.add_argument('--distrib-mix',
                        dest='distribmix',
                        action='store_const',
                        const=True,
                        help='Plot distributions groups mixed with sources')

    parser.add_argument('--key-dist',
                        dest='plot_key_dist',
                        action='store_const',
                        const=True,
                        help='Plots key mask distribution')

    parser.add_argument('files',
                        nargs=argparse.ZERO_OR_MORE,
                        default=[],
                        help='file with ssl-dump json output')

    args = parser.parse_args()

    last_src_id = 0
    src_names = []
    masks_db = []
    masks_src = []
    cert_db = []
    keys_db = []

    # Input = ssl-dump output
    if len(args.files) > 0:
        # Cert Organization Filtering
        re_org = None if args.filter_org is None else re.compile(
            args.filter_org, re.IGNORECASE)
        # Domain filtering
        re_dom = None if args.filter_domain is None else re.compile(
            args.filter_domain, re.IGNORECASE)

        # Process files
        for fl in args.files:
            with open(fl, mode='r') as fh:
                data = fh.read()

                # Parse json out
                if '-----BEGIN JSON-----' in data:
                    if '-----END JSON-----' not in data:
                        raise ValueError('BEGIN JSON present but END JSON not')
                    match = re.search(
                        r'-----BEGIN JSON-----(.+?)-----END JSON-----', data,
                        re.MULTILINE | re.DOTALL)
                    if match is None:
                        raise ValueError('Could not extract JSON')
                    data = match.group(1)

                json_data = json.loads(data)
                for cert in json_data:
                    org = cert['org']
                    if org is None:
                        org = ''
                    if re_org is not None and re_org.match(org) is None:
                        if args.verbose:
                            print('Organization filtered out %s' % org)
                        continue
                    if re_dom is not None:
                        dom_match = re_dom.match(cert['cn']) is not None
                        for alt in cert['alts']:
                            dom_match |= re_dom.match(alt) is not None
                        if not dom_match:
                            if args.verbose:
                                print('Domain filtered out %s' % cert['cn'])
                            continue

                    cert_db.append(cert)
                    masks_db.append(cert['pubkey']['mask'])
                    masks_src.append(last_src_id)
            src_names.append(fl)
            last_src_id += 1

        if args.verbose:
            print('Certificate database size %d' % len(cert_db))

        if args.dump_json:
            print(json.dumps(cert_db))

        if args.dump_cert:
            for cert in cert_db:
                print(cert['cert'])

    # public key list processing
    if args.pubs is not None:
        for pubf in args.pubs:
            with open(pubf, mode='r') as fh:
                data = fh.read()
                keys = []
                for match in re.finditer(
                        r'-----BEGIN PUBLIC KEY-----(.+?)-----END PUBLIC KEY-----',
                        data, re.MULTILINE | re.DOTALL):
                    key = match.group(0)
                    keys.append(key)
                print('File %s keys num: %d' % (pubf, len(keys)))

                # pubkey -> mask
                for key in keys:
                    pub = serialization.load_pem_public_key(
                        key, utils.get_backend())
                    mask = keys_basic.compute_key_mask(pub.public_numbers().n)
                    keys_db.append(pub)
                    masks_db.append(mask)
                    masks_src.append(last_src_id)
            src_names.append(pubf)
            last_src_id += 1

    # extract public key from certificate
    if args.certs is not None:
        for certf in args.certs:
            with open(certf, mode='r') as fh:
                data = fh.read()
                certs = []
                for match in re.finditer(
                        r'-----BEGIN CERTIFICATE-----(.+?)-----END CERTIFICATE-----',
                        data, re.MULTILINE | re.DOTALL):
                    cert = match.group(0)
                    certs.append(cert)

                # cert -> mask
                for cert in certs:
                    x509 = utils.load_x509(str(cert))
                    pub = x509.public_key()
                    mask = keys_basic.compute_key_mask(pub.public_numbers().n)
                    keys_db.append(pub)
                    masks_db.append(mask)
                    masks_src.append(last_src_id)
            src_names.append(certf)
            last_src_id += 1

    # generate openssl keys on the fly
    if args.ossl is not None:
        for i in range(0, args.ossl):
            print('Generating RSA1024 key %03d' % i)
            key = OpenSSL.crypto.PKey()
            key.generate_key(OpenSSL.crypto.TYPE_RSA, 1024)
            key_pem = OpenSSL.crypto.dump_privatekey(
                OpenSSL.crypto.FILETYPE_PEM, key)

            priv = serialization.load_pem_private_key(key_pem, None,
                                                      utils.get_backend())
            mask = keys_basic.compute_key_mask(
                priv.public_key().public_numbers().n)
            keys_db.append(priv.public_key())
            masks_db.append(mask)
            masks_src.append(last_src_id)
        src_names.append('ossl-%d' % args.ossl)
        last_src_id += 1

    # Load statistics
    st = key_stats.KeyStats()
    st.load_tables()
    if args.verbose:
        print('Source stats: ')
        for src in st.sources_cn:
            print(' %30s: %08d' % (src, st.sources_cn[src]))
        print('Group stats:')
        for grp in st.groups:
            print(' %30s: %02d' % (grp, st.get_group_size(grp)))

    # mask indices
    mask_map, mask_max, mask_map_x, mask_map_y, mask_map_last_x, mask_map_last_y = keys_basic.generate_pubkey_mask_indices(
    )
    print('Max mask 1D config: [%d]' % mask_max)
    print('Max mask 2D config: [%d, %d]' % (mask_map_last_x, mask_map_last_y))

    # masks processing part
    if len(masks_db) == 0:
        return

    # Simple match
    if args.per_key_stat:
        print('Per-key matching: ')
        for idx, mask in enumerate(masks_db):
            print('Key %02d, mask: %s' % (idx, mask))

            res = []
            for src in st.table_prob:
                val = st.table_prob[src][mask]
                res.append((src, val if val is not None else 0))
            print_res(res, st)

    # Total key matching
    use_loglikelihood = True
    print('Fit for all keys in one distribution:')
    total_weights = src_total_match = comp_total_match_dict(
        masks_db, st, loglikelihood=use_loglikelihood)
    res = key_val_to_list(src_total_match)
    print_res(res, st, loglikelihood=use_loglikelihood)
    res = st.res_src_to_group(res)
    # bar_chart(res=res, title='Fit for all keys')

    # Avg + mean
    print('Avg + mean:')
    src_total_match = {}  # source -> [p1, p2, p3, p4, ..., p_keynum]
    for src in st.table_prob:
        src_total_match[src] = []
        for idx, mask in enumerate(masks_db):
            val = keys_basic.aggregate_mask(st.sources_masks_prob[src], mask)
            if use_loglikelihood:
                if total_weights[src] is not None:
                    src_total_match[src].append(val + total_weights[src])
                else:
                    src_total_match[src].append(-9999.9)
            else:
                src_total_match[src].append(val * total_weights[src])
            pass
        pass
    res = []
    devs = []
    for src in st.sources:
        m = np.mean(src_total_match[src])
        s = np.std(src_total_match[src])
        res.append((src, m))
        devs.append(s)

    # Total output
    print_res(res, st, error=devs, loglikelihood=use_loglikelihood)
    # bar_chart(res=res, error=devs, title='Avg for all keys + error')

    # PCA on the keys - groups
    keys_grp_vec = []
    for idx, mask in enumerate(masks_db):
        keys_grp_vec.append([])
        for src in st.groups:
            keys_grp_vec[idx].append(0)
        for idxs, src in enumerate(st.sources):
            grp = st.src_to_group(src)
            prob = st.table_prob[src][mask]
            keys_grp_vec[idx][st.get_group_idx(grp)] += prob

    if args.pca_grp:
        X = np.array(keys_grp_vec)
        pca = PCA(n_components=2)
        pca.fit(X)
        X_transformed = pca.transform(X)
        print('PCA mean: %s, components: ' % pca.mean_)
        print(pca.components_)

        masks_src_np = np.array(masks_src)
        plt.rcdefaults()
        colors = matplotlib.cm.rainbow(np.linspace(0, 1, last_src_id))
        for src_id in range(0, last_src_id):
            plt.scatter(X_transformed[masks_src_np == src_id, 0],
                        X_transformed[masks_src_np == src_id, 1],
                        label=src_names[src_id],
                        color=colors[src_id],
                        alpha=0.25,
                        marker=',')
        plt.legend(loc="best", shadow=False, scatterpoints=1)
        plt.show()

    # Random subset
    if args.subs:
        masks_db_tup = []
        for idx, mask in enumerate(masks_db):
            masks_db_tup.append((idx, mask, masks_src[idx]))

        # Many random subsets, top groups
        subs_size = args.subs_k
        subs_count = args.subs_n
        groups_cnt = {}
        subs_data = []
        subs_data_mark = []
        dsrc_num = last_src_id + 1

        # Take subs_count samples from the input masks_db, evaluate it, prepare for PCA
        for i in range(0, subs_count):
            masks = random_subset(masks_db_tup, subs_size)
            src_total_match = comp_total_match_dict([x[1] for x in masks], st)
            res = key_val_to_list(src_total_match)

            total = 0.0
            for tup in res:
                total += tup[1]

            # data vectors for PCA
            tmp_data = []
            for idx, tmp_src in enumerate(st.sources):
                val = src_total_match[tmp_src]
                val = long(math.floor(val * (1000.0 / total)))
                tmp_data.append(val)

            # PCA on groups.
            # if want PCA on sources, use subs_data.append(tmp_data)
            subs_data.append(tmp_data)
            # res_grp_val = st.res_src_to_group(zip(st.sources, tmp_data))
            # subs_data.append([x[1] for x in res_grp_val])

            subs_dsources = {}
            max_dsrc = (0, 0)
            for dsrc in [x[2] for x in masks]:
                if dsrc not in subs_dsources:
                    subs_dsources[dsrc] = 0
                subs_dsources[dsrc] += 1

            for dsrc in subs_dsources:
                if subs_dsources[dsrc] > max_dsrc[1]:
                    max_dsrc = (dsrc, subs_dsources[dsrc])
            tmp_mark = max_dsrc[0]

            if max_dsrc[1] == subs_size:
                tmp_mark = max_dsrc[0]
            else:
                tmp_mark = last_src_id

            subs_data_mark.append(tmp_mark)

            for tup in res:
                src = tup[0]
                score = long(math.floor(tup[1] * (1000.0 / total)))
                if score == 0:
                    continue

                grp = st.src_to_group(src)
                if grp not in groups_cnt:
                    groups_cnt[grp] = score
                else:
                    groups_cnt[grp] += score

                if src not in groups_cnt:
                    groups_cnt[src] = score
                else:
                    groups_cnt[src] += score

            # Equalize group sizes
            for grp in st.groups:
                grp = grp.lower()
                if grp in groups_cnt:
                    groups_cnt[grp] /= float(st.get_group_size(grp))

            # best group only
            # best_src = res[0][0]
            # best_grp = st.src_to_group(best_src)
            # if best_grp not in groups_cnt:
            #     groups_cnt[best_grp] = 1
            # else:
            #     groups_cnt[best_grp] += 1

        print('Combinations: (N, k)=(%d, %d) = %d' %
              (subs_count, subs_size, scipy.misc.comb(subs_count, subs_size)))

        sources = st.groups
        values = []
        for source in sources:
            val = groups_cnt[source] if source in groups_cnt else 0
            values.append(val)
        bar_chart(sources,
                  values,
                  xlabel='# of occurrences as top group (best fit)',
                  title='Groups vs. %d random %d-subsets' %
                  (subs_count, subs_size))

        # PCA stuff
        X = np.array(subs_data)
        pca = PCA(n_components=2)
        pU, pS, pV = pca._fit(X)
        X_transformed = pca.transform(X)
        subs_data_mark_pca = np.array(subs_data_mark)

        print('Sources: ')
        print(st.sources)

        print('PCA input data shape %d x %d' %
              (len(subs_data), len(subs_data[0])))
        print('PCA mean: \n%s \nPCA components: \n' % pca.mean_)
        print(pca.components_)

        print('PCA components x: ')
        for x in pca.components_[0]:
            print(x)
        print('\nPCA components y: ')
        for y in pca.components_[1]:
            print(y)

        # print('\nPCA U,S,V')
        # print(pU)
        # print(pS)
        # print(pV)

        colors = ['blue', 'red', 'green', 'gray', 'yellow']

        plt.rcdefaults()
        for src_id in range(0, dsrc_num):
            plt.scatter(X_transformed[subs_data_mark_pca == src_id, 0],
                        X_transformed[subs_data_mark_pca == src_id, 1],
                        color=colors[src_id],
                        alpha=0.5 if src_id < dsrc_num - 1 else 0.2)
        plt.legend(loc="best", shadow=False, scatterpoints=1)

        # plt.scatter([x[0] for x in X_transformed],
        #             [x[1] for x in X_transformed],
        #             alpha=0.5)

        plt.show()

        # PCA against defined sources with known distributions?
        # Creates "background distribution" we want to match to
        if args.pca_src:
            # Four axes, returned as a 2-d array
            plt.rcdefaults()
            #f, axarr = plt.subplots(len(st.sources), 1)
            src_k = args.pca_src_k
            src_n = args.pca_src_n

            # prepare PDF
            ppdf = PdfPages('test.pdf')  # TODO: filename from set
            sources_to_test = st.sources[20:25] + [
                x for x in st.sources if 'micro' in x.lower()
            ]

            # compute for each source
            src_mark_idx = len(subs_data_mark)
            subs_data_src = subs_data
            subs_data_mark_src = subs_data_mark
            for src_idx, source in enumerate(sources_to_test):
                # cur_plot = axarr[src_idx]
                cur_plot = plt

                print('Plotting PCA source %s %d/%d' %
                      (source, src_idx + 1, len(sources_to_test)))

                # Extend subs_data_src with draws from the source distribution
                for i in range(0, src_n):
                    masks = []
                    for tmpk in range(0, src_k):
                        masks.append(st.sample_source_distrib(source))
                    src_total_match = comp_total_match_dict(masks, st)
                    res = key_val_to_list(src_total_match)

                    total = 0.0
                    for tup in res:
                        total += tup[1]

                    # data vectors for PCA
                    tmp_data = []
                    for idx, tmp_src in enumerate(st.sources):
                        val = src_total_match[tmp_src]
                        val = long(math.floor(val * (1000.0 / total)))
                        tmp_data.append(val)

                    # PCA on groups.
                    # if want PCA on sources, use subs_data.append(tmp_data)
                    subs_data_src.append(tmp_data)
                    subs_data_mark_src.append(src_mark_idx)

                # PCA stuff
                X = np.array(subs_data_src)
                pca = PCA(n_components=2)
                pU, pS, pV = pca._fit(X)
                X_transformed = pca.transform(X)
                subs_data_mark_pca = np.array(subs_data_mark_src)

                colors = ['blue', 'red', 'green', 'gray', 'yellow']

                # plot input sources
                for src_id in range(0, dsrc_num):
                    cur_plot.scatter(
                        X_transformed[subs_data_mark_pca == src_id, 0],
                        X_transformed[subs_data_mark_pca == src_id, 1],
                        color=colors[src_id],
                        alpha=0.5 if src_id < dsrc_num - 1 else 0.2)

                # plot the source stuff
                cur_plot.scatter(
                    X_transformed[subs_data_mark_pca == src_mark_idx, 0],
                    X_transformed[subs_data_mark_pca == src_mark_idx, 1],
                    color='gray',
                    marker='+',
                    alpha=0.05)

                cur_plot.legend(loc="best", shadow=False, scatterpoints=1)
                cur_plot.title('Src [%s] input: %s' % (source,
                                                       (', '.join(src_names))))

                cur_plot.savefig(ppdf, format='pdf')
                cur_plot.clf()

            print('Finalizing PDF...')
            # plt.savefig(ppdf, format='pdf')
            ppdf.close()
            pass

    if args.distrib:
        # Plotting distributions for groups, to the PDF
        plt.rcdefaults()
        ppdf = PdfPages('groups_distrib.pdf')

        # Compute for each source
        range_ = st.masks
        range_idx = np.arange(len(st.masks))
        for grp_idx, grp in enumerate(st.groups):
            cur_data = st.groups_masks_prob[grp]
            raw_data = [cur_data[x] for x in st.masks]
            cur_plot = plt

            logger.debug('Plotting distribution %02d/%02d : %s ' %
                         (grp_idx + 1, len(st.groups), grp))
            axes = cur_plot.gca()
            axes.set_xlim([0, len(st.masks)])
            cur_plot.bar(range_idx, raw_data, linewidth=0, width=0.4)
            cur_plot.title('%s (%s)' % (grp, get_group_desc(grp, st)))
            cur_plot.savefig(ppdf, format='pdf')
            cur_plot.clf()

        # Print input data - per source
        max_src = max(masks_src)
        bars = []
        for src_id in range(max_src + 1):
            axes = plt.gca()
            axes.set_xlim([0, len(st.masks)])

            map_data = {}
            for mask in st.masks:
                map_data[mask] = 0.0
            for mask_idx, mask in enumerate(masks_db):
                if masks_src[mask_idx] == src_id:
                    map_data[mask] += 1

            raw_data = []
            for mask in st.masks:
                raw_data.append(map_data[mask])

            b1 = plt.bar(range_idx, raw_data, linewidth=0, width=0.4)
            bars.append(b1)

            plt.title('Source %d' % src_id)
            plt.savefig(ppdf, format='pdf')
            plt.clf()

        # Group distribution + source:
        if args.distribmix:
            width = 0.25
            range_idx = np.arange(len(st.masks))

            # One source to the graph
            max_src = max(masks_src)
            cur_plot = plt
            for src_id in range(max_src + 1):

                bars = []
                logger.debug('Plotting mix distribution src %d ' % src_id)

                map_data = {}
                for mask in st.masks:
                    map_data[mask] = 0.0
                for mask_idx, mask in enumerate(masks_db):
                    if masks_src[mask_idx] == src_id:
                        map_data[mask] += 1

                raw_data = []
                for mask in st.masks:
                    raw_data.append(map_data[mask])
                raw_data = np.array(raw_data)
                raw_data /= float(sum(raw_data))

                for grp_idx, grp in enumerate(st.groups):
                    logger.debug(
                        ' - Plotting mix distribution %02d/%02d : %s ' %
                        (grp_idx + 1, len(st.groups), grp))

                    # Source
                    fig, ax = plt.subplots()
                    b1 = ax.bar(range_idx + width,
                                raw_data,
                                linewidth=0,
                                width=width,
                                color='r')
                    bars.append(b1)

                    # Group
                    cur_data2 = st.groups_masks_prob[grp]
                    raw_data2 = [cur_data2[x] for x in st.masks]

                    bar1 = ax.bar(range_idx,
                                  raw_data2,
                                  linewidth=0,
                                  width=width,
                                  color='b')
                    bars.append(bar1)

                    ax.legend(tuple([x[0] for x in bars]),
                              tuple(['Src %d' % src_id, grp]))
                    ax.set_xlim([0, len(st.masks)])

                    cur_plot.title('%s + source %d' % (grp, src_id))
                    cur_plot.savefig(ppdf, format='pdf')
                    cur_plot.clf()

        logger.info('Finishing PDF')
        ppdf.close()
        pass

    if args.mixture:
        # http://www.pymix.org/pymix/index.php?n=PyMix.Tutorial#bayesmix
        # 1. Create mixture model = add discrete distributions to the package
        dists = []
        alphabet = mixture.Alphabet(st.masks)
        taken_src = []

        for src in st.sources:
            if 'openssl 1.0.2g' == src or 'microsoft .net' == src:
                pass
            else:
                continue
            print(' - Source: %s' % src)

            taken_src.append(src)
            probs = []
            for m in st.masks:
                probs.append(st.sources_masks_prob[src][m])

            d = mixture.DiscreteDistribution(len(alphabet),
                                             probs,
                                             alphabet=alphabet)
            dists.append(d)

        # 2. Create the model, for now, with even distribution among components.
        comp_weights = [1.0 / len(dists)] * len(dists)
        mmodel = mixture.MixtureModel(len(dists), comp_weights, dists)
        print('-' * 80)
        print(mmodel)
        print('-' * 80)

        # dump mixtures to the file
        mixture.writeMixture(mmodel, 'src.mix')

        # 3. Input data - array of input masks
        masks_data = [[x] for x in masks_db]
        data = mixture.DataSet()
        data.fromList(masks_data)
        data.internalInit(mmodel)

        print(masks_data)
        print(data)
        print('---------')

        # 4. Compute EM
        # if there is a distribution in the input data which has zero matching inputs,
        # an exception will be thrown. Later - discard such source from the input...
        print(mmodel.modelInitialization(data, 1))
        print('EM start: ')

        ress = []
        for r in range(10):
            mmodel.modelInitialization(data, 1)
            emres = mmodel.EM(data, 1000, 0.00000000000000001)
            ress.append(emres)
        emres = max(ress, key=lambda x: x[1])

        # print mmodel.randMaxEM(data, 10, 40, 0.1)
        print(emres)

        # Plot
        plt.rcdefaults()
        # plt.plot(range(0, len(emres[0][3])), [2.71828**x for x in emres[0][3]], 'o')
        # plt.plot(range(0, len(emres[0][3])), emres[0][3], 'k')
        # plt.show()

        for i in range(0, 5):
            print('-------')
            for idx, src in enumerate(emres[0]):
                print('- i:%02d src: %02d, val: %s' % (i, idx, src[i]))

        colors = matplotlib.cm.rainbow(np.linspace(0, 1, len(taken_src)))
        range_ = range(0, len(emres[0][0]))
        bars = []
        for idx, src in enumerate(emres[0]):
            b1 = plt.bar(range_, [2.71828**x for x in src], color=colors[idx])
            bars.append(b1)

        plt.legend(tuple(bars), tuple(taken_src))
        plt.grid(True)
        plt.show()

        # for src in emres[0]:
        #     plt.plot(range(0, len(src)), [2.71828**x for x in src], 'o')
        #     # plt.grid(True)
        #     # plt.show()
        #
        # # plt.scatter(mask_map_last_x, mask_map_last_y, c='red', s=scale, alpha=0.3)
        # # plt.legend()
        # plt.grid(True)
        # plt.show()

    # Chisquare
    for source in st.sources_masks:
        cn = st.sources_cn[source]
        # chi = chisquare()
        # gen = keys_basic.generate_pubkey_mask()

    # 2D Key plot
    if args.plot_key_dist:
        plot_key_mask_dist(masks_db, st)