Example #1
    def do_bgm(self, n_components=6, seed=42):
        """Bayesian Gaussian Mixture.

        Infer the effective number of components in a Gaussian Mixture Model via variational Bayesian estimation.

        n_effective_components < n_components if the model sets some weights close to 0.

        Args:
            n_components (int): Number of components in GMM.
            seed (int): Random seed.

        Returns:
            bgm_output (dict): Labels and probabilities.

        """

        np.random.seed(seed)
        bgm = BayesianGaussianMixture(n_components=n_components,
                                      covariance_type='full',
                                      weight_concentration_prior=1e-2,
                                      weight_concentration_prior_type='dirichlet_process',
                                      mean_precision_prior=1e-2,
                                      init_params='random',
                                      max_iter=100,
                                      random_state=seed)

        bgm.fit(self.X)
        bgm_labels = bgm.predict(self.X)
        bgm_prob = bgm.predict_proba(self.X)[:,0]

        bgm_output = {'bgm_labels': bgm_labels, 'bgm_prob': bgm_prob}

        return bgm_output
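The docstring above notes that the effective number of components is smaller than n_components when the variational fit drives some mixture weights toward zero. A minimal, self-contained sketch of how to count the retained components (the toy data and the 0.01 cutoff are illustrative, not part of the original example):

import numpy as np
from sklearn.mixture import BayesianGaussianMixture

X_demo = np.random.RandomState(0).randn(500, 2)  # toy data with 2 features
bgm_demo = BayesianGaussianMixture(n_components=6,
                                   weight_concentration_prior=1e-2,
                                   random_state=0).fit(X_demo)
# Components whose mixing weight was not shrunk to (near) zero
n_effective = int(np.sum(bgm_demo.weights_ > 0.01))
print(n_effective, "effective components out of", bgm_demo.n_components)

Example #2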
def test_bayesian_mixture_predict_predict_proba():
    # this is the same test as test_gaussian_mixture_predict_predict_proba()
    rng = np.random.RandomState(0)
    rand_data = RandomData(rng)
    for prior_type in PRIOR_TYPE:
        for covar_type in COVARIANCE_TYPE:
            X = rand_data.X[covar_type]
            Y = rand_data.Y
            bgmm = BayesianGaussianMixture(
                n_components=rand_data.n_components,
                random_state=rng,
                weight_concentration_prior_type=prior_type,
                covariance_type=covar_type)

            # Check that a NotFittedError is raised if predict is called before fit
            assert_raise_message(NotFittedError,
                                 "This BayesianGaussianMixture instance"
                                 " is not fitted yet. Call 'fit' with "
                                 "appropriate arguments before using "
                                 "this method.", bgmm.predict, X)

            bgmm.fit(X)
            Y_pred = bgmm.predict(X)
            Y_pred_proba = bgmm.predict_proba(X).argmax(axis=1)
            assert_array_equal(Y_pred, Y_pred_proba)
            assert_greater_equal(adjusted_rand_score(Y, Y_pred), .95)
Example #3
 def partition_data(self,args):
     method, j = args
     if method== "vi":
         dp = BayesianGaussianMixture(n_components = self.K,weight_concentration_prior = self.alpha, max_iter=1,init_params='kmeans',weight_concentration_prior_type='dirichlet_process')
         dp.fit(self.X[self.U[j]])
         Z = dp.predict(self.X[self.U[j]]).astype(int)
         Z_star = dp.predict(self.X_star).astype(int)
     if method=="gmm":
         Z,Z_star= self.uncollapsed_dp_partition_alt(j)
     elif method=="kmean":
         km = KMeans(n_clusters=self.K)
         Z = km.fit_predict(self.X[self.U[j]]).astype(int)
         Z_star = km.predict(self.X_star[self.U[j]]).astype(int)
     else:
         Z = np.random.choice(self.K,size = self.N_minibatch,replace=True)
         Z_star = np.random.choice(np.unique(Z),size = self.N_star,replace=True)
     le = LE()
     le.fit(np.hstack((Z,Z_star)))
     Z = le.transform(Z)
     Z_star = le.transform(Z_star)
     if (method=="vi"): #& (self.vi_partition):
         Z_diff = np.setdiff1d(Z_star,Z)
         if Z_diff.size > 0:
             idx = np.hstack([np.where(Z_star == k)[0] for k in Z_diff])
             unique_Z = np.unique(Z)
             post_Z = dp.predict_proba(self.X_star[idx])[:,unique_Z]
             Z_star[idx] = [np.random.choice(unique_Z,p = post_Z_i / post_Z_i.sum() ) for post_Z_i in post_Z]
             assert(np.setdiff1d(Z_star,Z).size == 0)
     return(Z,Z_star)
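The snippet above uses several names that are not defined in this excerpt; a plausible set of imports for it (an assumption, since the original module header is not shown) would be:

import numpy as np
from sklearn.cluster import KMeans
from sklearn.mixture import BayesianGaussianMixture
from sklearn.preprocessing import LabelEncoder as LE  # LE() above is presumably a LabelEncoder alias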
Example #4
    def load(self, phipsis):
        self.length = len(phipsis)

        num_component = min(10, self.length)
        gm_ = GM(n_components=num_component)
        gm_.fit(X=phipsis)
        weights = gm_.weights_
        to_keep = weights > 0.05
        num_component = sum(to_keep)

        gm = GM(n_components=num_component)
        gm.fit(X=phipsis)
        precisions = gm.precisions_cholesky_

        # self.means = gm.means_
        self.phipsis = phipsis
        weight = np.mean(precisions[:, 0, 0]) \
                 + np.mean(precisions[:, 1, 1])
        weight = weight * self.weight_scaling_factor  # for matcher weight
        self.weight = min(weight, 1)
        self.weight *= self.weight_accom_factor
        covs = gm.covariances_
        cov_invs = np.array([np.linalg.inv(cov) for cov in covs])
        cluster_dist = gm.predict_proba(phipsis)
        self.cov_dist = np.einsum("ijk, li->ljk", cov_invs, cluster_dist)
        self.gm = gm  # for matcher weight
        # matcher_weight should be a product of the precision/clustering
        # behaviour of the distribution, and the posterior probability of the
        #  queried point. So, higher clustering but point does not belong in
        # distribution => other pressures acting on queried point => should
        # assign lower weight. Lower clustering and point belong => low
        # clustering means low pressure on point, so it shouldn't matter that
        #  much.
        return
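Example #5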
def test_bayesian_mixture_predict_predict_proba():
    # this is the same test as test_gaussian_mixture_predict_predict_proba()
    rng = np.random.RandomState(0)
    rand_data = RandomData(rng)
    for prior_type in PRIOR_TYPE:
        for covar_type in COVARIANCE_TYPE:
            X = rand_data.X[covar_type]
            Y = rand_data.Y
            bgmm = BayesianGaussianMixture(
                n_components=rand_data.n_components,
                random_state=rng,
                weight_concentration_prior_type=prior_type,
                covariance_type=covar_type)

            # Check that a NotFittedError is raised if predict is called before fit
            assert_raise_message(
                NotFittedError, "This BayesianGaussianMixture instance"
                " is not fitted yet. Call 'fit' with "
                "appropriate arguments before using "
                "this estimator.", bgmm.predict, X)

            bgmm.fit(X)
            Y_pred = bgmm.predict(X)
            Y_pred_proba = bgmm.predict_proba(X).argmax(axis=1)
            assert_array_equal(Y_pred, Y_pred_proba)
            assert adjusted_rand_score(Y, Y_pred) >= .95
Example #6
def detectDoublet(args):
    counts_matrix = readMatrix(args.input, binary=False)
    scrub = scr.Scrublet(counts_matrix,
                         expected_doublet_rate=0.06,
                         sim_doublet_ratio=3,
                         n_neighbors=25)
    doublet_scores, _ = scrub.scrub_doublets(
        min_counts=1,
        min_cells=3,
        min_gene_variability_pctl=85,
        mean_center=True,
        normalize_variance=True,
        n_prin_comps=min(30,
                         counts_matrix.get_shape()[0] // 10))

    # Fit a Gaussian mixture model
    X = scrub.doublet_scores_sim_
    X = np.array([X]).T
    gmm = BayesianGaussianMixture(n_components=2,
                                  max_iter=1000,
                                  random_state=2394).fit(X)
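    # gmm.means_ holds one mean per component; the component with the larger
    # mean presumably corresponds to the doublet population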
    i = np.argmax(gmm.means_)

    probs_sim = gmm.predict_proba(X)[:, i]
    vals = X[np.argwhere(probs_sim > 0.5)].flatten()
    if vals.size == 0:
        threshold = np.amax(X.flatten())
    else:
        threshold = min(vals)

    X = np.array([doublet_scores]).T
    probs = gmm.predict_proba(X)[:, i].tolist()

    with open(args.output, 'w') as fl:
        fl.write('\t'.join(map(str, probs)))
        fl.write("\n")

        fl.write(str(threshold))
        fl.write("\n")
        fl.write('\t'.join(map(str, (doublet_scores.tolist()))))
        fl.write("\n")
        fl.write('\t'.join(map(str, scrub.doublet_scores_sim_)))
Example #7
def create_dpgmm_proba(
    train_features,
    test_features,
    columns,
    path=None,
    config={},
    kind="g",
    is_concat=False,
):
    from sklearn.mixture import BayesianGaussianMixture

    if is_concat:
        print("Caution: You use test data to make dpgmm feature.")
        data = pd.concat([train_features[columns], test_features[columns]], axis=0)
        if path is None:
            dpgmm = BayesianGaussianMixture(**config)
            dpgmm.fit(data)
        else:
            with open(path, "rb") as f:
                dpgmm = joblib.load(f)
        proba = dpgmm.predict_proba(data)
        train2 = proba[: train_features.shape[0]]
        test2 = proba[-test_features.shape[0] :]
    else:
        if path is None:
            dpgmm = BayesianGaussianMixture(**config)
            dpgmm.fit(train_features[columns])
        else:
            with open(path, "rb") as f:
                dpgmm = joblib.load(f)
        train2 = dpgmm.predict_proba(train_features[columns])
        test2 = dpgmm.predict_proba(test_features[columns])
    n_cluster = train2.shape[1]
    train2 = pd.DataFrame(
        train2, columns=[f"dpgmm_{kind}-{i}" for i in range(n_cluster)]
    )
    test2 = pd.DataFrame(test2, columns=[f"dpgmm_{kind}-{i}" for i in range(n_cluster)])
    train_features = pd.concat((train_features, train2), axis=1)
    test_features = pd.concat((test_features, test2), axis=1)
    return train_features, test_features
Example #8
    def load(self, phipsis):
        self.length = len(phipsis)
        if np.allclose(phipsis, np.full(phipsis.shape, 360)):
            self.to_skip = True
            return
        i_to_ignore = np.array(phipsis == np.array([360., 360.]))[:, 0]
        self.ignored_i = i_to_ignore
        phipsis = phipsis[~i_to_ignore]

        phipsi_median = np.median(phipsis, axis=0)
        phipsis = phipsis - phipsi_median
        phipsis[phipsis > 180] -= 360.
        phipsis[phipsis < -180] += 360.

        gm_ = GM(n_components=30)
        gm_.fit(X=phipsis)
        weights = gm_.weights_
        to_keep = weights > 0.05
        num_component = sum(to_keep)

        gm = GM(n_components=num_component)
        gm.fit(X=phipsis)
        precisions = gm.precisions_cholesky_

        # self.means = gm.means_
        self.phipsis = phipsis
        self.medians = phipsi_median
        weight = np.mean(precisions[:, 0, 0]) \
                 + np.mean(precisions[:, 1, 1])
        weight = weight * self.weight_scaling_factor  # for matcher weight
        self.weight = float(min(weight, 1.))

        covs = gm.covariances_
        cov_invs = np.array([np.linalg.inv(cov) for cov in covs])
        cluster_dist = gm.predict_proba(phipsis)
        self.cov_dist = np.einsum("ijk, li->ljk", cov_invs, cluster_dist)
        self.gm = gm  # for matcher weight
        # matcher_weight should be a product of the precision/clustering
        # behaviour of the distribution, and the posterior probability of the
        #  queried point. So, higher clustering but point does not belong in
        # distribution => other pressures acting on queried point => should
        # assign lower weight. Lower clustering and point belong => low
        # clustering means low pressure on point, so it shouldn't matter that
        #  much.
        return
Example #9
class GMMTask(BaseInternalTask):
    """"""
    def __init__(self,
                 n_components,
                 db_fn,
                 n_iter,
                 tids=None,
                 split=None,
                 alg='em'):
        """"""
        super(GMMTask, self).__init__(n_components, db_fn, n_iter, tids, split,
                                      alg)
        self.A, self.doc_hash = self.read(db_fn)

        if alg == 'em':
            self.gmm = GaussianMixture(self.k, max_iter=n_iter)
        elif alg == 'variational':
            self.gmm = BayesianGaussianMixture(self.k, max_iter=n_iter)

    def process(self):
        """"""
        if self.tids is not None:
            keep_doc = OrderedDict(
                filter(lambda x: x[0] in self.tids, self.doc_hash.items()))
            self.A = self.A[list(keep_doc.values())]
            self.doc_hash = OrderedDict(
                zip(keep_doc.keys(), range(len(keep_doc))))

        self.gmm.fit(self.A)
        self.U = self.gmm.predict_proba(self.A)

    @property
    def data(self):
        """"""
        return {
            'item_factors': self.U,
            'term_factors': self.V,
            'tids': self.doc_hash.keys(),
            'uids': self.term_hash.keys(),
            'factor_labels': self.gmm.means_
        }
Example #10
def main():
    infile = sys.argv[1]
    outfile = sys.argv[2]
    k = int(sys.argv[3])

    data = np.genfromtxt(infile, delimiter=',')

    print(
        "Received {} points, clustering with {} mixture components and 2 inits"
        .format(data.shape[0], k))

    if data.size > 0:
        scaler = StandardScaler()
        clusterer = BayesianGaussianMixture(
            k,
            n_init=2,
        )

        data = scaler.fit_transform(data)

        converged = False
        while not converged:
            try:
                clusterer.fit(data)
                converged = True
            except ValueError:
                clusterer.n_components -= 1
                print(f"Retrying with {clusterer.n_components} components.")

        labels = clusterer.predict_proba(data)
        labels = prune(clusterer, labels, 0.001)
        print("Finished clustering")
    else:
        labels = []
        print("Insufficient data to cluster")

    with open(outfile, 'w') as f:
        for sample in labels:
            f.write(", ".join(map(str, sample)))
            f.write("\n")
Example #11
            plot_segmentation(Y[30:-30, 30:-30], savefn=fn_segs)
        '''Scikit's VB GMM'''

        # Initialize model
        model = BayesianGaussianMixture(n_components=K,
                                        max_iter=max_iter,
                                        verbose=3)

        # Fit model
        model.fit(X.reshape((-1, 1)))

        # Segment image
        Y_h = model.predict(X.reshape((-1, 1))).reshape((H, W))

        # Obtain posteriors
        post = model.predict_proba(X.reshape((-1, 1))).reshape((H, W, K))

        # Set cluster assignments to correct tissue labels
        Y_h = set_classes(Y_h, z)

        # Compute error
        err[0, n, r] = np.mean(Y_h[M] != Y[M])
        dcc[0, n, r] = dice(Y_h[M], Y[M])

        if vis:

            fn_segs = fn + 'SCK_segs' + str(n + 1) + '_r' + str(r + 1) + '.png'
            plot_segmentation(Y_h[30:-30, 30:-30], savefn=fn_segs)

            fn_segs = fn + 'SCK_segl' + str(n + 1) + '_r' + str(r + 1) + '.png'
            plot_clustering(X[30:-30, 30:-30, 0],
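Example #12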
X2, y2 = make_blobs(n_samples=250, centers=1, random_state=42)
X2 = X2 + [6, -8]
X = np.r_[X1, X2]
y = np.r_[y1, y2]

# Train model
# Setting n_components higher than needed
# BGM weights zero for unnecessary clusters
bgm = BayesianGaussianMixture(n_components=10, n_init=10, random_state=42)
bgm.fit(X)
print("EM Estimates", bgm.weights_)
print("EM Means", bgm.means_[:4])
print("EM Covariances", bgm.covariances_[:3])
print("Convergance, and iterations", bgm.converged_, bgm.n_iter_)
print("Hard clustering predictions", bgm.predict(X))
print("Hard clustering probabilities", bgm.predict_proba(X)[:1])

# Train two more models with different weight concentration priors
# (weight_concentration_prior = 0.01 vs 10000); the prior influences how many
# clusters keep non-negligible weight (see the fitting sketch after this block)
bgm_low = BayesianGaussianMixture(n_components=10,
                                  max_iter=1000,
                                  n_init=1,
                                  weight_concentration_prior=0.01,
                                  random_state=42)
bgm_high = BayesianGaussianMixture(n_components=10,
                                   max_iter=1000,
                                   n_init=1,
                                   weight_concentration_prior=10000,
                                   random_state=42)
nn = 73
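bgm_low and bgm_high above are only constructed, never fitted in this excerpt. A minimal continuation (assuming the X built from make_blobs earlier in this example) that shows how the concentration prior changes the number of clusters with non-negligible weight:

bgm_low.fit(X)
bgm_high.fit(X)
# A low prior pushes most mixing weights toward zero, while a high prior
# spreads the weight across more of the 10 components.
print("low prior weights: ", bgm_low.weights_.round(2))
print("high prior weights:", bgm_high.weights_.round(2))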
Example #13
random_seed = 27132
n_components = 10
rng = np.random.RandomState(seed=random_seed)  # fix a seed

DProc = BayesianGaussianMixture(
    n_components=n_components,
    weight_concentration_prior_type="dirichlet_process",
    weight_concentration_prior=1e-1,
    n_init=7,
    init_params='kmeans',  # default 'kmeans'
    random_state=random_seed  # if int, then taken as random seed
).fit(Y)  # random_state=random_state

results = DProc.predict(Y)
probs = DProc.predict_proba(Y)
res_prob = np.column_stack((probs, results))
# res_prob = np.around(res_prob, decimals = 3)  # around() for arrays

# context manager controls precision within the next block of print commands
with printoptions(precision=3, suppress=True):
    print(results)
    print("\nposterior prob:\n", probs)
    print("\nmean:\n", DProc.means_)
    print("\ncovariances\n", DProc.covariances_)
    print("\nweights", DProc.weights_)
    print("\nCount the clusters\n")
    print(Counter(results).keys())  # unique cluster labels
    print(Counter(results).values())  # frequency of each label

np.savetxt('BGM_hypo4_Out.csv', results, fmt='%1.1f', delimiter=',')
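The printoptions context manager used above is not defined in this excerpt. On NumPy >= 1.15 numpy.printoptions can be used directly; a hand-rolled equivalent (a sketch, in case an older NumPy or a custom helper was assumed) looks like this:

import numpy as np
from contextlib import contextmanager

@contextmanager
def printoptions(*args, **kwargs):
    """Temporarily override numpy print options inside a with-block."""
    original = np.get_printoptions()
    np.set_printoptions(*args, **kwargs)
    try:
        yield
    finally:
        np.set_printoptions(**original)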
Example #14
def cluster_bayesian_gmm(onehot_input):
    bgmm = BayesianGaussianMixture(n_components=10).fit(onehot_input)
    input = bgmm.predict_proba(onehot_input)
    input = np.array(input)

    return input
Example #15
    def fit(self, X, Y, epochs, batch_size):

        EPOCHS = epochs
        BATCH_SIZE = batch_size
        n = len(X)
        XY = np.concatenate((X, Y), axis=1)
        #df = n - 1

        self._X = X.copy()
        hidden_neurons = self.hidden_neurons

        if self.n_mixtures == -1:
            lowest_bic = np.inf
            bic = []
            n_components_range = range(1, 7)
            cv_types = ['spherical', 'tied', 'diag', 'full']
            for cv_type in cv_types:
                for n_components in n_components_range:
                    # Fit a Gaussian mixture with EM
                    gmm = GaussianMixture(n_components=n_components,
                                          covariance_type=cv_type,
                                          max_iter=10000)
                    gmm.fit(XY)
                    bic.append(gmm.bic(XY))
                    if bic[-1] < lowest_bic:
                        lowest_bic = bic[-1]
                        best_gmm = gmm
                        self.n_mixtures = n_components

            clusterer = HDBSCAN()
            clusterer.fit(XY)
            clusterer.labels_

            if len(np.unique(clusterer.labels_)) < self.n_mixtures:
                self.n_mixtures = len(np.unique(clusterer.labels_))
            else:
                pass

            if self.gmm_boost == True:
                if len(np.unique(clusterer.labels_)) < self.n_mixtures:
                    clusterer = HDBSCAN()
                    clusterer.fit(X)
                    clusters = clusterer.labels_
                else:
                    clusterer = best_gmm
                    clusterer.fit(X)
                    clusters = clusterer.predict_proba(X)

                self._clusterer = clusterer

                X = np.concatenate((X, clusters), axis=1)

            else:
                pass

        elif self.gmm_boost == True:

            clusterer1 = BayesianGaussianMixture(n_components=self.n_mixtures,
                                                 covariance_type='full',
                                                 max_iter=10000)
            clusterer1.fit(X)
            clusters = clusterer1.predict_proba(X)
            self._clusterer = clusterer1

            clusterer2 = HDBSCAN()
            clusterer2.fit(X)

            if len(np.unique(clusterer2.labels_)) < self.n_mixtures:
                clusters = clusterer2.labels_
                self._clusterer = clusterer2
            else:
                pass

            X = np.concatenate((X, clusters), axis=1)

        else:
            pass

        self._y = Y.copy()

        dataset = tf.compat.v1.data.Dataset \
         .from_tensor_slices((X, Y)) \
         .repeat(EPOCHS).shuffle(len(X)).batch(BATCH_SIZE)
        iter_ = tf.compat.v1.data.make_one_shot_iterator(dataset)

        x, y = iter_.get_next()

        K = self.n_mixtures

        self.K = K
        self.x = x

        input_activation = self.input_activation
        hidden_activation = self.hidden_activation

        if input_activation.lower() == 'crelu':
            input_actv = tf.nn.crelu
        elif input_activation.lower() == 'relu6':
            input_actv = tf.nn.relu6
        elif input_activation.lower() == 'elu':
            input_actv = tf.nn.elu
        elif input_activation.lower() == 'selu':
            input_actv = tf.nn.selu
        elif input_activation.lower() == 'leaky_relu':
            input_actv = tf.nn.leaky_relu
        elif input_activation.lower() == 'relu':
            input_actv = tf.nn.relu
        elif input_activation.lower() == 'swish':
            input_actv = tf.nn.swish
        elif input_activation.lower() == 'tanh':
            input_actv = tf.nn.tanh
        elif input_activation.lower() == 'linear':
            input_actv = None
        elif input_activation.lower() == 'softplus':
            input_actv = tf.nn.softplus
        elif input_activation.lower() == 'sigmoid':
            input_actv = tf.nn.sigmoid
        elif input_activation.lower() == 'softmax':
            input_actv = tf.nn.softmax
        else:
            input_actv = tf.nn.relu

        if hidden_activation.lower() == 'crelu':
            h_actv = tf.nn.crelu
        elif hidden_activation.lower() == 'relu6':
            h_actv = tf.nn.relu6
        elif hidden_activation.lower() == 'elu':
            h_actv = tf.nn.elu
        elif hidden_activation.lower() == 'selu':
            h_actv = tf.nn.selu
        elif hidden_activation.lower() == 'leaky_relu':
            h_actv = tf.nn.leaky_relu
        elif hidden_activation.lower() == 'relu':
            h_actv = tf.nn.relu
        elif hidden_activation.lower() == 'swish':
            h_actv = tf.nn.swish
        elif hidden_activation.lower() == 'tanh':
            h_actv = tf.nn.tanh
        elif hidden_activation.lower() == 'linear':
            h_actv = None
        elif hidden_activation.lower() == 'softplus':
            h_actv = tf.nn.softplus
        elif hidden_activation.lower() == 'sigmoid':
            h_actv = tf.nn.sigmoid
        elif hidden_activation.lower() == 'softmax':
            h_actv = tf.nn.softmax
        else:
            h_actv = tf.nn.relu

        n_layer = len(hidden_neurons)

        if n_layer < 1:
            self.layer_last = tf.layers.dense(x,
                                              units=self.input_neurons,
                                              activation=input_actv)
            self.mu = tf.layers.dense(self.layer_last,
                                      units=K,
                                      activation=None,
                                      name="mu")
            self.var = tf.exp(
                tf.layers.dense(self.layer_last,
                                units=K,
                                activation=None,
                                name="sigma"))
            self.pi = tf.layers.dense(self.layer_last,
                                      units=K,
                                      activation=tf.nn.softmax,
                                      name="mixing")

        else:
            self.layer_1 = tf.layers.dense(x,
                                           units=self.input_neurons,
                                           activation=input_actv)
            for i in range(2, n_layer + 2):

                n_neurons = hidden_neurons[i - 2]

                if i == n_layer + 1:
                    print('last', i)
                    string_var = 'self.layer_last = tf.layers.dense(self.layer_' + str(
                        i - 1) + ', units=n_neurons, activation=h_actv)'
                else:
                    print(i)
                    string_var = 'self.layer_' + str(
                        i) + ' = tf.layers.dense(self.layer_' + str(
                            i - 1) + ', units=n_neurons, activation=h_actv)'

                exec(string_var)

            self.mu = tf.layers.dense(self.layer_last,
                                      units=K,
                                      activation=None,
                                      name="mu")
            self.var = tf.exp(
                tf.layers.dense(self.layer_last,
                                units=K,
                                activation=None,
                                name="sigma"))
            self.pi = tf.layers.dense(self.layer_last,
                                      units=K,
                                      activation=tf.nn.softmax,
                                      name="mixing")

        if self.tf_mixture_family == False:
            #---------------- Not using TF Mixture Family ------------------------
            if self.dist.lower() == 'normal':
                self.likelihood = tfp.distributions.Normal(loc=self.mu,
                                                           scale=self.var)
            elif (self.dist.lower() == 'laplacian'
                  or self.dist.lower() == 'laplace') == True:
                self.likelihood = tfp.distributions.Laplace(loc=self.mu,
                                                            scale=self.var)
            elif self.dist.lower() == 'lognormal':
                self.likelihood = tfp.distributions.LogNormal(loc=self.mu,
                                                              scale=self.var)
            elif self.dist.lower() == 'gamma':
                alpha = (self.mu**2) / self.var
                beta = self.var / self.mu
                self.likelihood = tfp.distributions.Gamma(concentration=alpha,
                                                          rate=beta)
            else:
                self.likelihood = tfp.distributions.Normal(loc=self.mu,
                                                           scale=self.var)

            self.out = self.likelihood.prob(y)
            self.out = tf.multiply(self.out, self.pi)
            self.out = tf.reduce_sum(self.out, 1, keepdims=True)
            self.out = -tf.log(self.out + 1e-10)
            self.mean_loss = tf.reduce_mean(self.out)

        else:
            # -------------------- Using TF Mixture Family ------------------------
            self.mixture_distribution = tfp.distributions.Categorical(
                probs=self.pi)

            if self.dist.lower() == 'normal':
                self.distribution = tfp.distributions.Normal(loc=self.mu,
                                                             scale=self.var)
            elif (self.dist.lower() == 'laplacian'
                  or self.dist.lower() == 'laplace') == True:
                self.distribution = tfp.distributions.Laplace(loc=self.mu,
                                                              scale=self.var)
            elif self.dist.lower() == 'lognormal':
                #self.distribution = tfp.edward2.LogNormal(loc=self.mu, scale=self.var)
                self.distribution = tfp.distributions.LogNormal(loc=self.mu,
                                                                scale=self.var)
            elif self.dist.lower() == 'gamma':
                alpha = (self.mu**2) / self.var
                beta = self.var / self.mu
                self.distribution = tfp.distributions.Gamma(
                    concentration=alpha, rate=beta)
            else:
                self.distribution = tfp.distributions.Normal(loc=self.mu,
                                                             scale=self.var)

            self.likelihood = tfp.distributions.MixtureSameFamily(
                mixture_distribution=self.mixture_distribution,
                components_distribution=self.distribution)
            self.log_likelihood = -self.likelihood.log_prob(tf.transpose(y))

            self.mean_loss = tf.reduce_mean(self.log_likelihood)

# ----------------------------------------------------------------------

        self.global_step = tf.Variable(0, trainable=False)

        if self.optimizer.lower() == 'adam':
            self.train_op = tf.compat.v1.train.AdamOptimizer(
                learning_rate=self.learning_rate).minimize(self.mean_loss)
        elif self.optimizer.lower() == 'adadelta':
            self.train_op = tf.compat.v1.train.AdadeltaOptimizer(
                learning_rate=self.learning_rate).minimize(self.mean_loss)
        elif self.optimizer.lower() == 'adagradda':
            self.train_op = tf.compat.v1.train.AdagradDAOptimizer(
                learning_rate=self.learning_rate).minimize(self.mean_loss)
        elif self.optimizer.lower() == 'adagrad':
            self.train_op = tf.compat.v1.train.AdagradOptimizer(
                learning_rate=self.learning_rate).minimize(self.mean_loss)
        elif self.optimizer.lower() == 'ftrl':
            self.train_op = tf.compat.v1.train.FtrlOptimizer(
                learning_rate=self.learning_rate).minimize(self.mean_loss)
        elif self.optimizer.lower() == 'gradientdescent':
            self.train_op = tf.compat.v1.train.GradientDescentOptimizer(
                learning_rate=self.learning_rate).minimize(self.mean_loss)
        elif self.optimizer.lower() == 'momentum':
            self.train_op = tf.compat.v1.train.MomentumOptimizer(
                learning_rate=self.learning_rate).minimize(self.mean_loss)

        elif self.optimizer.lower() == 'proximaladagrad':
            self.train_op = tf.compat.v1.train.ProximalAdagradOptimizer(
                learning_rate=self.learning_rate).minimize(self.mean_loss)
        elif self.optimizer.lower() == 'proximalgradientdescent':
            self.train_op = tf.compat.v1.train.ProximalGradientDescentOptimizer(
                learning_rate=self.learning_rate).minimize(self.mean_loss)
        elif self.optimizer.lower() == 'rmsprop':
            self.train_op = tf.compat.v1.train.RMSPropOptimizer(
                learning_rate=self.learning_rate).minimize(self.mean_loss)

        else:
            self.train_op = tf.compat.v1.train.AdamOptimizer(
                learning_rate=self.learning_rate).minimize(self.mean_loss)

        self.init = tf.compat.v1.global_variables_initializer()

        # Initialize coefficients
        self.sess = tf.compat.v1.Session()
        self.sess.run(self.init)

        best_loss = 1e+10
        self.stopping_step = 0
        for i in range(EPOCHS * (n // BATCH_SIZE)):
            _, loss, mu, var, pi, x__ = self.sess.run([
                self.train_op, self.mean_loss, self.mu, self.var, self.pi,
                self.x
            ])

            if loss < best_loss:
                self.stopping_step = 0
                self.best_loss = loss

                best_mu = mu
                best_var = var
                best_pi = pi
                best_mean_y = mu[:, 0]
                best_x = x__
                best_loss = loss
                print("Epoch: {} Loss: {:3.3f}".format(i, loss))
            else:
                self.stopping_step += 1

            if self.stopping_step >= self.early_stopping:
                self.should_stop = True
                print("Early stopping is trigger at step: {} loss:{}".format(
                    i, loss))
                return
            else:
                pass

            self._mean_y_train = mu[:, 0]
            self._dist_mu_train = mu
            self._dist_var_train = var
            self._dist_pi_train = pi
            self._x_data_train = x__
Example #16
print(input4bgmm.shape)

#clustering
grouper = BGM(n_components=nCluster)
grouper.fit(input4bgmm)
if tosavemodel:
    # save the fitted model to disk
    pickle.dump(grouper, open(savename, 'wb'))

Tprocess1 = time.time()
print('\n', '## CLUSTERING RUNTIME:', Tprocess1 - Tprocess0)  #Timer end

#brief examination
y_pred = grouper.predict(input4bgmm)
y_max = np.max(y_pred)
y_proba = grouper.predict_proba(
    input4bgmm)  #probability of being a certain group

#group = [(number of group members): images, group label, probability for each group]
group = [[] for _ in range(y_max + 1)]
id_group = [[] for _ in range(y_max + 1)]
group_noise = []  #not in any group
for ix in range(len(y_pred)):
    for ig in range(len(group)):
        if y_pred[ix] == ig:
            tmp = [
                X_train[ix].reshape(imagesize[0], imagesize[1]), y_proba[ix]
            ]
            group[ig].append(tmp)
            id_group[ig].append(id_train[ix])
        elif y_pred[ix] == -1:
            tmp = [
Example #17
class Pyxelate:

    CONVOLUTIONS = np.array(
        [[[2, 2], [2, 2]], [[11, -1], [-1, -1]], [[-1, 11], [-1, -1]],
         [[-1, -1], [11, -1]], [[-1, -1], [-1, 11]], [[5, 5], [-1, -1]],
         [[-1, -1], [5, 5]], [[5, -1], [5, -1]], [[-1, 5], [-1, 5]],
         [[5, -1], [-1, 5]], [[-1, 5], [5, -1]], [[-1, 3], [3, 3]],
         [[3, -1], [3, 3]], [[3, 3], [-1, 3]], [[3, 3], [3, -1]]],
        dtype="int")

    SOLUTIONS = np.array([
        [[1, 1], [1, 1]],
        [[0, 1], [1, 1]],
        [[1, 0], [1, 1]],
        [[1, 1], [0, 1]],
        [[1, 1], [1, 0]],
        [[1, 1], [0, 0]],
        [[0, 0], [1, 1]],
        [[1, 0], [1, 0]],
        [[0, 1], [0, 1]],
        [[1, 0], [1, 0]],
        [[0, 1], [0, 1]],
        [[1, 0], [0, 0]],
        [[0, 1], [0, 0]],
        [[0, 0], [1, 0]],
        [[0, 0], [0, 1]],
    ],
                         dtype="bool")

    ITER = 2

    def __init__(self,
                 height,
                 width,
                 color=8,
                 dither=True,
                 regenerate_palette=True,
                 random_state=0):
        """Create instance for generating similar pixel arts."""

        self.height = int(height)
        self.width = int(width)
        if self.width < 1 or self.height < 1:
            raise ValueError("Result can not be smaller than 1x1 pixels.")
        self.color = int(color)
        if self.color < 2:
            raise ValueError("The minimum number of colors is 2.")
        if dither:
            self.dither = 1 / (self.color + 1)
        else:
            self.dither = 0.
        self.regenerate_palette = bool(regenerate_palette)
        self.is_fitted = False
        self.random_state = int(random_state)
        self.model = BayesianGaussianMixture(
            n_components=self.color,
            max_iter=256,
            covariance_type="tied",
            weight_concentration_prior_type="dirichlet_distribution",
            mean_precision_prior=1. / 256.,
            warm_start=False,
            random_state=self.random_state)

    def convert(self, image):
        """Generate pixel art from image"""
        # apply adaptive contrast
        image = equalize_adapthist(image) * 255 * 1.14
        image[image <= 8.] = 0.

        # create sample for finding palette
        if self.regenerate_palette or not self.is_fitted:
            examples = resize(image, (32, 32),
                              anti_aliasing=False).reshape(-1, 3).astype("int")
            self.model.fit(examples)
            self.is_fitted = True

        # resize image to 4 times the desired width and height
        image = resize(
            image, (self.height * self.ITER * 2, self.width * self.ITER * 2),
            anti_aliasing=True)
        # generate pixelated image with desired width / height
        image = self._reduce(image)

        # apply palette
        height, width, depth = image.shape
        reshaped = np.reshape(image, (height * width, depth))
        probs = self.model.predict_proba(reshaped)
        y = np.argmax(probs, axis=1)

        # increase hue and snap color values to multiples of 8
        palette = rgb2hsv(self.model.means_.reshape(-1, 1, 3))
        palette[:, :, 1] *= 1.14
        palette = hsv2rgb(palette).reshape(self.color, 3) // 8 * 8
        # generate recolored image
        image = palette[y]

        # apply dither over threshold if it's not zero
        if self.dither:
            # get second best probability by removing the best one
            probs[np.arange(len(y)), y] = 0
            # get new best and values
            v = np.max(probs, axis=1)
            y = np.argmax(probs, axis=1)

            # replace every second pixel with second best color
            pad = not bool(width % 2)
            for i in range(0, len(image), 2):
                if pad:
                    i += (i // width) % 2
                if v[i] > self.dither:
                    image[i] = palette[y[i]]

        image = np.reshape(image, (height, width, depth))
        return np.clip(image.astype("int"), 0, 255)

    def _reduce(self, image):
        """Apply convolutions on image ITER times and generate a smaller image
		based on the highest magnitude of gradients"""

        # self is visible to decorated function
        @adapt_rgb(each_channel)
        def _wrapper(dim):
            # apply median filter for noise reduction
            dim = median(dim, square(4))
            for i in range(self.ITER):
                h, w = dim.shape
                h, w = h // 2, w // 2
                new_image = np.zeros((h * w)).astype("int")
                view = view_as_blocks(dim, (2, 2))
                flatten = view.reshape(-1, 2, 2)
                for i, f in enumerate(flatten):
                    conv = np.abs(
                        np.sum(np.multiply(self.CONVOLUTIONS,
                                           f.reshape(-1, 2, 2)).reshape(-1, 4),
                               axis=1))
                    new_image[i] = np.mean(f[self.SOLUTIONS[np.argmax(conv)]])
                new_image = new_image.reshape((h, w))
                dim = new_image.copy()
            return new_image

        return _wrapper(image)
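Example #18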
def main():
    parser = argparse.ArgumentParser(
        description=
        "Train Parallel WaveGAN (See detail in parallel_wavegan/bin/train.py)."
    )
    parser.add_argument("--outdir",
                        type=str,
                        required=True,
                        help="Path of output directory.")
    parser.add_argument("--config",
                        type=str,
                        required=True,
                        help="Path of config file.")
    parser.add_argument("--verbose", type=int, default=1, help="verbose")
    args = parser.parse_args()
    logging.info("get args")
    # set logger
    if args.verbose > 1:
        logging.basicConfig(
            level=logging.DEBUG,
            stream=sys.stdout,
            format=
            "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    elif args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            stream=sys.stdout,
            format=
            "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    else:
        logging.basicConfig(
            level=logging.WARN,
            stream=sys.stdout,
            format=
            "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
        logging.warning("Skip DEBUG/INFO messages")
    # load and save config
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    config.update(vars(args))
    for key, value in config.items():
        logging.info(f"{key} = {value}")
    train_features = pd.read_csv("../input/lish-moa/train_features.csv")
    test_features = pd.read_csv("../input/lish-moa/test_features.csv")
    GENES = [col for col in train_features.columns if col.startswith("g-")]
    CELLS = [col for col in train_features.columns if col.startswith("c-")]
    logging.info("load data.")

    if config["norm_type"] == "RankGauss":
        train_features, test_features = apply_rank_gauss(
            train_features,
            test_features,
            columns=GENES + CELLS,
            config=config["QuantileTransformer"],
        )
        logging.info("Normalize by RankGauss.")
    elif config["norm_type"] == "zscore":
        train_features, test_features = apply_zscore(train_features,
                                                     test_features,
                                                     columns=GENES + CELLS)
        logging.info("Normalize by zscore.")

    dpgmm = BayesianGaussianMixture(**config["BayesianGaussianMixture_g"])
    dpgmm.fit(train_features[GENES])
    with open(os.path.join(args.outdir, f"dpgmm_{config['norm_type']}_g.job"),
              "wb") as f:
        joblib.dump(dpgmm, f)
    proba = dpgmm.predict_proba(train_features[GENES])
    plt.figure()
    plt.imshow(proba, aspect="auto")
    plt.title("train_dpgmm_g")
    plt.colorbar()
    plt.savefig(os.path.join(args.outdir, "train_dpgmm_g.png"))
    plt.close()
    proba = dpgmm.predict_proba(test_features[GENES])
    plt.figure()
    plt.imshow(proba, aspect="auto")
    plt.title("test_dpgmm_g")
    plt.colorbar()
    plt.savefig(os.path.join(args.outdir, "test_dpgmm_g.png"))
    plt.close()
    logging.info("finish g.")
    dpgmm = BayesianGaussianMixture(**config["BayesianGaussianMixture_c"])
    dpgmm.fit(train_features[CELLS])
    with open(os.path.join(args.outdir, f"dpgmm_{config['norm_type']}_c.job"),
              "wb") as f:
        joblib.dump(dpgmm, f)
    proba = dpgmm.predict_proba(train_features[CELLS])
    plt.figure()
    plt.imshow(proba, aspect="auto")
    plt.title("train_dpgmm_c")
    plt.colorbar()
    plt.savefig(os.path.join(args.outdir, "train_dpgmm_c.png"))
    plt.close()
    proba = dpgmm.predict_proba(test_features[CELLS])
    plt.figure()
    plt.imshow(proba, aspect="auto")
    plt.title("test_dpgmm_c")
    plt.colorbar()
    plt.savefig(os.path.join(args.outdir, "test_dpgmm_c.png"))
    plt.close()
    logging.info("finish c.")
Example #19
class Pyxelate:

    CONVOLUTIONS = np.array(
        [[[2, 2], [2, 2]], [[11, -1], [-1, -1]], [[-1, 11], [-1, -1]],
         [[-1, -1], [11, -1]], [[-1, -1], [-1, 11]], [[5, 5], [-1, -1]],
         [[-1, -1], [5, 5]], [[5, -1], [5, -1]], [[-1, 5], [-1, 5]],
         [[5, -1], [-1, 5]], [[-1, 5], [5, -1]], [[-1, 3], [3, 3]],
         [[3, -1], [3, 3]], [[3, 3], [-1, 3]], [[3, 3], [3, -1]]],
        dtype="int")

    SOLUTIONS = np.array([
        [[1, 1], [1, 1]],
        [[0, 1], [1, 1]],
        [[1, 0], [1, 1]],
        [[1, 1], [0, 1]],
        [[1, 1], [1, 0]],
        [[1, 1], [0, 0]],
        [[0, 0], [1, 1]],
        [[1, 0], [1, 0]],
        [[0, 1], [0, 1]],
        [[1, 0], [1, 0]],
        [[0, 1], [0, 1]],
        [[1, 0], [0, 0]],
        [[0, 1], [0, 0]],
        [[0, 0], [1, 0]],
        [[0, 0], [0, 1]],
    ],
                         dtype="bool")

    ITER = 2

    def __init__(self,
                 height,
                 width,
                 color=8,
                 dither=True,
                 alpha=.6,
                 regenerate_palette=True,
                 keyframe=.6,
                 sensitivity=.07,
                 random_state=0):
        """Create instance for generating similar pixel arts."""
        self.height = int(height)
        self.width = int(width)
        if self.width < 1 or self.height < 1:
            raise ValueError("Result can not be smaller than 1x1 pixels.")
        self.color = int(color)
        if self.color < 2:
            raise ValueError("The minimum number of colors is 2.")
        elif self.color > 32:
            raise ValueError("The maximum number of colors is 32.")
        if dither:
            self.dither = 1 / (self.color + 1)
        else:
            self.dither = 0.
        self.alpha = float(alpha)  # threshold for opacity
        self.regenerate_palette = bool(regenerate_palette)
        self.keyframe = keyframe  # threshold for differences between keyframes
        self.sensitivity = sensitivity  # threshold for differences between parts of keyframes

        # BGM
        self.is_fitted = False
        self.random_state = int(random_state)
        self.model = BayesianGaussianMixture(
            n_components=self.color,
            max_iter=256,
            covariance_type="tied",
            weight_concentration_prior_type="dirichlet_distribution",
            mean_precision_prior=1. / 256.,
            warm_start=False,
            random_state=self.random_state)

    def convert(self, image):
        """Generate pixel art from image"""
        return self._convert(image, False, False)

    def _convert(self, image, override_adapthist=False, override_dither=False):
        """Generate pixel art from image or sequence of images"""
        # does the image have alpha channel?
        if self._is_transparent(image):
            # remove artifacts from transparent edges
            image = self._dilate(image)
            # create alpha mask
            mask = resize(image[:, :, 3], (self.height, self.width),
                          anti_aliasing=True)
            # mask for colors
            color_mask = resize(image[:, :, 3], (32, 32),
                                anti_aliasing=False).ravel()
        else:
            mask = None
            color_mask = None

        # apply adaptive contrast
        if not override_adapthist:
            image = self._fix_hist(image)

        # create sample for finding palette
        if self.regenerate_palette or not self.is_fitted:
            examples = resize(image[:, :, :3], (32, 32),
                              anti_aliasing=False).reshape(-1, 3).astype("int")
            if color_mask is not None:
                # transparent colors should be ignored
                examples = examples[color_mask >= self.alpha]
            self._fit_model(examples)

        # resize image to 4 times the desired width and height
        image = resize(
            image[:, :, :3],
            (self.height * self.ITER * 2, self.width * self.ITER * 2),
            anti_aliasing=True)
        # generate pixelated image with desired width / height
        image = self._reduce(image)

        # apply palette
        height, width, depth = image.shape
        reshaped = np.reshape(image, (height * width, depth))
        probs = self.model.predict_proba(reshaped)
        y = np.argmax(probs, axis=1)

        # increase hue and snap color values to multiples of 8
        palette = rgb2hsv(self.model.means_.reshape(-1, 1, 3))
        palette[:, :, 1] *= 1.14  # empirical magic number
        palette = hsv2rgb(palette).reshape(self.color, 3) // 8 * 8
        palette[palette ==
                248] = 255  # clamping // 8 * 8 would rarely allow 255 values

        # generate recolored image
        image = palette[y]

        # apply dither over threshold if it's not zero
        if not override_dither and self.dither:
            # get second best probability by removing the best one
            probs[np.arange(len(y)), y] = 0
            # get new best and values
            v = np.max(probs, axis=1) > self.dither
            y = np.argmax(probs, axis=1)

            # replace every second pixel with second best color
            pad = not bool(width % 2)
            if pad:
                # make sure to alternate between starting positions
                # bottleneck
                for i in range(0, len(image), 2):
                    i += (i // width) % 2
                    if v[i]:
                        image[i] = palette[y[i]]
            else:
                i = np.argwhere(v[::2]) * 2
                image[i] = palette[y[i]]

        image = np.reshape(image, (height, width, depth))
        if mask is not None:
            # use transparency from original image, but make it either 0 or 255
            mask[mask >= self.alpha] = 255
            mask[mask < self.alpha] = 0
            image = np.dstack(
                (image, mask))  # result has lost its alpha channel

        return np.clip(image.astype("int"), 0, 255).astype("uint8")

    def convert_sequence(self, images):
        """Generates sequence of pixel arts from a list of images"""
        try:
            _ = np.array(images, dtype=float)
        except ValueError:
            # image sizes are different == setting an array element with a sequence
            raise ValueError("Shape of images in list are different.")

        # apply adaptive histogram on each
        images = [self._fix_hist(image) for image in images]

        transparent = self._is_transparent(images[0])
        keyframe_limit = self.keyframe * np.prod(images[0].shape) * 255.
        sensitivity_limit = self.sensitivity * 255.
        diff_images, key_frames = [], []

        # create new images that are just the differences between sequences
        for image in images:
            # add first image
            if diff_images:
                diff = np.abs(image[:, :, :3] - diff_images[-1][:, :, :3])
                # image is not too different, from previous one, create mask
                if np.sum(diff) < keyframe_limit:
                    diff = resize(np.mean(diff, axis=2),
                                  (self.height, self.width),
                                  anti_aliasing=True)
                    over, under = diff > sensitivity_limit, diff <= sensitivity_limit
                    diff[over], diff[under] = 255, 0.
                    diff = resize(diff, (image.shape[0], image.shape[1]),
                                  anti_aliasing=False)
                    # was the image already transparent?
                    if transparent:
                        image[:, :, 3] = diff
                    else:
                        image = np.dstack((image, diff))
                    key_frames.append(False)
                else:
                    key_frames.append(True)
            else:
                key_frames.append(True)
            # add transparency layer for keyframes also, for easier broadcasting
            if not self._is_transparent(image):
                image = np.dstack(
                    (image, np.ones((image.shape[0], image.shape[1]))))
            diff_images.append(image)

        # create a palette from all images if possible
        if self.regenerate_palette:
            warnings.warn(
                "using regenerate_palette=True will result in flickering, as the palette will be regenerated for each image!",
                Warning)
        else:
            self._palette_from_list(diff_images)

        # merge keyframes and differences
        last = None
        for image, key in zip(diff_images, key_frames):
            current = self._convert(image, True,
                                    ~key)  # pyxelate keyframe / change
            if last is None:
                last = current
            else:
                # merge differences to previous images
                mask = ~np.logical_xor(last[:, :, 3], current[:, :, 3])
                last[mask] = current[mask]
            # generator
            yield last.copy()

    def _palette_from_list(self, images):
        """Fit model to find palette using all images in list at once"""
        transparency = self._is_transparent(images[0])
        examples = []
        color_masks = []

        # sample from all images
        for image in images:
            examples.append(
                resize(image[:, :, :3], (16, 16),
                       anti_aliasing=False).reshape(-1, 3).astype("int"))
            if transparency:
                color_masks.append(
                    resize(images[0][:, :, 3], (16, 16), anti_aliasing=False))

        # concatenate to a single matrix
        examples = np.concatenate(examples)
        if transparency:
            # transparent colors should be ignored
            color_masks = np.concatenate(color_masks).ravel()
            examples = examples[color_masks >= self.alpha]
        self._fit_model(examples)

    def _fit_model(self, X):
        """Fit model while suppressing warnings from sklearn"""
        converge = True
        with warnings.catch_warnings(record=True) as w:
            # fit model
            self.model.fit(X)
            if w and w[-1].category == ConvergenceWarning:
                warnings.filterwarnings('ignore', category=ConvergenceWarning)
                converge = False
        if not converge:
            warnings.warn(
                "the model has failed to converge, try a different number of colors for better results!",
                Warning)
        self.is_fitted = True

    def _reduce(self, image):
        """Apply convolutions on image ITER times and generate a smaller image
		based on the highest magnitude of gradients"""

        # self is visible to decorated function
        @adapt_rgb(each_channel)
        def _wrapper(dim):
            # apply median filter for noise reduction
            dim = median(dim, square(4))
            for n in range(self.ITER):
                h, w = dim.shape
                h, w = h // 2, w // 2
                flatten = view_as_blocks(dim, (2, 2)).reshape(-1, 2, 2)
                # bottleneck
                new_image = np.fromiter(
                    (self._reduce_conv(f) for f in flatten),
                    flatten.dtype).reshape((h, w))
                if n < self.ITER - 1:
                    dim = new_image.copy()
            return new_image

        return _wrapper(image)

    def _reduce_conv(self, f):
        """The actual function that selects the right pixels based on the gradients  2x2 square"""
        return np.mean(f[self.SOLUTIONS[np.argmax(
            np.sum(np.multiply(self.CONVOLUTIONS, f.reshape(-1, 2,
                                                            2)).reshape(-1, 4),
                   axis=1))]])

    def _dilate(self, image):
        """Dilate semi-transparent edges to remove artifacts
		(unwanted edges, caused by transparent pixels having different colors)"""
        @adapt_rgb(each_channel)
        def _wrapper(dim):
            return dilation(dim, selem=square(4))

        # use dilated pixels for semi-transparent ones
        mask = image[:, :, 3]
        alter = _wrapper(image[:, :, :3])
        image[:, :, :3][mask < self.alpha] = alter[mask < self.alpha]
        return image

    @staticmethod
    def _fix_hist(image):
        """Apply adaptive histogram"""
        image = equalize_adapthist(
            image) * 255 * 1.14  # empirical magic number
        image[image <= 8.] = 0.
        return image

    @staticmethod
    def _is_transparent(image):
        """Returns True if there is an additional dimension for transparency"""
        return bool(image.shape[2] == 4)
Example #20
class FisherVectorGMM:
    """
    Fisher Vector derived from GMM
    ---
    Attributes
    -----------
    n_kernels: int
        number of kernels in GMM
    convars_type: str
        covariance type for GMM
    use_bayesian: bool
        whether or not to use Bayesian GMM
    gmm: GaussianMixture() or BayesianGaussianMixture()
        GMM instance in sklearn
    means: np.array()
        means learned in GMM
    covars: np.array()
        covariance learned in GMM
    weights: np.array()
        weights learned in GMM
    ---------------------------------------
    Functions
    -----------
    fit(): public
        fit raw data into GMM
    predict(): public
        predict FV for one video (variable frames)
    predict_alternative(): public
        predict FV for one video (variable frames) alternative
        not validated
    save(): public
        save GMM model into external file
    load(): public
        load GMM model from external file
    """
    def __init__(self, n_kernels=1, convars_type='diag', use_bayesian=False):
        # para n_kernels: number of kernels (mixture components) in the GMM
        # para convars_type: covariance type for the GMM ('diag' or 'full')
        # para use_bayesian: whether to use BayesianGaussianMixture instead of GaussianMixture
        assert convars_type in ['diag', 'full']
        assert n_kernels >= 0
        # == 0 dummy instance

        self.name = 'kernels%d_convars%s_bayes%d' % (n_kernels, convars_type,
                                                     use_bayesian)
        self.n_kernels = n_kernels
        self.convars_type = convars_type
        self.use_bayesian = use_bayesian
        self.fitted = False
        self.config = json.load(open('./config/model.json',
                                     'r'))['fisher_vector']
        self.save_dir = self.config['save_dir']
        self.data_dir = self.config['data_dir']
        self.means = None
        self.covars = None
        self.weights = None

        if not self.use_bayesian:
            self.gmm = GaussianMixture(n_components=self.n_kernels,
                                       covariance_type=self.convars_type,
                                       max_iter=1000,
                                       verbose=2)
        else:
            self.gmm = BayesianGaussianMixture(
                n_components=self.n_kernels,
                covariance_type=self.convars_type,
                max_iter=1000,
                verbose=2)

    def fit(self, X):
        # para X: shape [n_frames, n_features, n_feature_dim]
        # if os.path.isfile(os.path.join(self.save_dir, self.name, 'gmm.model')):
        #     print("\nmodel already trained ---", self.name)
        #     self.load()
        #     return
        # elif not os.path.isdir(os.path.join(self.save_dir, self.name)):
        #     os.mkdir(os.path.join(self.save_dir, self.name))

        self.feature_dim = X.shape[-1]
        # X = X.reshape(-1, X.shape[-1])
        print("\nfitting data into GMM with %d kernels" % self.n_kernels)

        self.gmm.fit(X)
        self.means = self.gmm.means_
        self.covars = self.gmm.covariances_
        self.weights = self.gmm.weights_
        print("\nfitting completed")

        # if cov_type is diagonal - make sure that covars holds a diagonal matrix
        if self.convars_type == 'diag':
            cov_matrices = np.empty(shape=(self.n_kernels,
                                           self.covars.shape[1],
                                           self.covars.shape[1]))
            for i in range(self.n_kernels):
                cov_matrices[i, :, :] = np.diag(self.covars[i, :])
            self.covars = cov_matrices

        assert self.covars.ndim == 3
        print("\nmodel trained ---", self.name)
        # self.save()

    def score(self, X):
        return self.gmm.score(X.reshape(-1, X.shape[-1]))

    def predict(self, X, normalized=True):
        # para X: shape [n_frames, n_feature_dim]
        assert X.ndim == 2
        assert X.shape[
            0] >= self.n_kernels, 'n_frames should be at least n_kernels'

        print("\ninferring fisher vectors with given GMM ...")

        X_matrix = X.reshape(-1, X.shape[-1])  # [n_frames, n_feature_dim]

        # set equal weights to predict likelihood ratio
        self.gmm.weights_ = np.ones(self.n_kernels) / self.n_kernels
        likelihood_ratio = self.gmm.predict_proba(X_matrix).reshape(
            X.shape[0], self.n_kernels)  # [n_frames, n_kernels]

        var = np.diagonal(self.covars, axis1=1,
                          axis2=2)  # [n_kernels, n_feature_dim]

        # decrease the memory use
        norm_dev_from_modes = np.tile(X[:, None, :], (1, self.n_kernels, 1))
        np.subtract(norm_dev_from_modes,
                    self.means[None, :],
                    out=norm_dev_from_modes)
        np.divide(norm_dev_from_modes, var[None, :], out=norm_dev_from_modes)
        """
        norm_dev_from_modes:
            (X - mean) / var
            [n_frames, n_kernels, n_feature_dim]
        """

        # mean deviation
        mean_dev = np.multiply(likelihood_ratio[:, :, None],
                               norm_dev_from_modes).mean(
                                   axis=0)  # [n_kernels, n_feature_dim]
        mean_dev = np.multiply(1 / np.sqrt(self.weights[:, None]),
                               mean_dev)  # [n_kernels, n_feature_dim]

        # covariance deviation
        cov_dev = np.multiply(likelihood_ratio[:, :, None],
                              norm_dev_from_modes**2 - 1).mean(
                                  axis=0)  # [n_kernels, n_feature_dim]
        cov_dev = np.multiply(1 / np.sqrt(2 * self.weights[:, None]),
                              cov_dev)  # [n_kernels, n_feature_dim]

        # stack vectors of mean and covariance
        fisher_vector = np.concatenate([mean_dev, cov_dev], axis=1)

        if normalized:
            fisher_vector = np.sqrt(np.abs(fisher_vector)) * np.sign(
                fisher_vector)  # power normalization
            fisher_vector = fisher_vector / np.linalg.norm(
                fisher_vector, axis=0)  # L2 normalization

        # fisher_vector[fisher_vector < 10**-4] = 0 # threshold
        print("\ninferring completed.")

        assert fisher_vector.ndim == 2
        return fisher_vector

    def predict_alternative(self, X, normalized=True):
        X = np.atleast_2d(X)
        N = X.shape[0]

        # Compute posterior probabilities.
        Q = self.gmm.predict_proba(X)  # NxK

        # Compute the sufficient statistics of descriptors.
        Q_sum = np.sum(Q, 0)[:, np.newaxis] / N
        Q_X = np.dot(Q.T, X) / N
        Q_XX_2 = np.dot(Q.T, X**2) / N

        # compute derivatives with respect to mixing weights, means and variances.
        d_pi = Q_sum.squeeze() - self.gmm.weights_
        d_mu = Q_X - Q_sum * self.gmm.means_
        d_sigma = (-Q_XX_2 - Q_sum * self.gmm.means_**2 +
                   Q_sum * self.gmm.covariances_ + 2 * Q_X * self.gmm.means_)

        # merge derivatives into a vector.
        fisher_vector = np.hstack((d_pi, d_mu.flatten(), d_sigma.flatten()))

        if normalized:
            fisher_vector = np.sqrt(np.abs(fisher_vector)) * np.sign(
                fisher_vector)  # power normalization
            fisher_vector = fisher_vector / np.linalg.norm(fisher_vector,
                                                           axis=0)  # L2 norm

        return fisher_vector

    def save(self):
        with open(os.path.join(self.save_dir, self.name, 'gmm.model'),
                  'wb') as out_gmm:
            pickle.dump(self.gmm, out_gmm, protocol=3)
        with open(os.path.join(self.save_dir, self.name, 'covars.data'),
                  'wb') as out_covars:
            pickle.dump(self.covars, out_covars, protocol=3)
        print("\nmodel saved. --- ", self.name)

    def load(self):
        with open(os.path.join(self.save_dir, self.name, 'gmm.model'),
                  'rb') as in_gmm:
            self.gmm = pickle.load(in_gmm)
        with open(os.path.join(self.save_dir, self.name, 'covars.data'),
                  'rb') as in_covars:
            self.covars = pickle.load(in_covars)
        if not self.use_bayesian:
            assert isinstance(self.gmm, GaussianMixture)
        else:
            assert isinstance(self.gmm, BayesianGaussianMixture)
        self.means = self.gmm.means_
        self.weights = self.gmm.weights_
        print("\nmodel loaded. --- ", self.name)

    def save_vector(self,
                    fisher_vector,
                    partition,
                    dynamics=False,
                    label=False):
        if not label:
            filename = 'vector_%s_%d' % (
                partition,
                self.n_kernels) if dynamics else 'fisher_vector_%s_%d' % (
                    partition, self.n_kernels)
            np.save(os.path.join(self.data_dir, filename), fisher_vector)
        else:
            filename = 'label_%s' % partition
            np.save(os.path.join(self.data_dir, filename), fisher_vector)

    def load_vector(self, partition, dynamics=False, label=False, bic=False):
        if not label:
            if not bic:
                filename = 'vector_%s_%d.npy' % (
                    partition, self.n_kernels
                ) if dynamics else 'fisher_vector_%s_%d.npy' % (partition,
                                                                self.n_kernels)
            else:
                filename = 'vector_%s_0.npy' % partition if dynamics else 'fisher_vector_%s_0.npy' % partition
            fisher_vector = np.load(os.path.join(self.data_dir, filename),
                                    allow_pickle=True)
            return fisher_vector
        else:
            filename = 'label_%s.npy' % partition
            label = np.load(os.path.join(self.data_dir, filename))
            return label
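
# A self-contained, hedged sketch of the Fisher-vector computation that
# FisherVectorGMM.predict() performs above, written against a plain sklearn
# GaussianMixture so it runs without the ./config/model.json file the class
# expects. The data shape and kernel count are arbitrary assumptions, and the
# class's extra trick of resetting the weights to uniform before calling
# predict_proba is omitted here.
import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.RandomState(0)
frames = rng.randn(200, 16)                              # [n_frames, n_feature_dim]
n_kernels = 4

gmm = GaussianMixture(n_components=n_kernels, covariance_type='diag',
                      random_state=0).fit(frames)

q = gmm.predict_proba(frames)                            # [n_frames, n_kernels]
var = gmm.covariances_                                   # [n_kernels, n_feature_dim]
dev = (frames[:, None, :] - gmm.means_[None, :, :]) / var[None, :, :]

# first- and second-order deviations, weighted by the posteriors
mean_dev = (q[:, :, None] * dev).mean(axis=0) / np.sqrt(gmm.weights_)[:, None]
cov_dev = (q[:, :, None] * (dev ** 2 - 1)).mean(axis=0) / np.sqrt(2 * gmm.weights_)[:, None]

fv = np.concatenate([mean_dev, cov_dev], axis=1)         # [n_kernels, 2 * n_feature_dim]
fv = np.sign(fv) * np.sqrt(np.abs(fv))                   # power normalization
fv = fv / np.linalg.norm(fv, axis=0)                     # column-wise L2 norm, as in predict()
print(fv.shape)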
Example #21
0
df[(df['Class'] == 3) & (df['Similarity'] >= 0.99)]['OBO'].mean()

df[(df['pct_1'] >= 0.95) & (df['pct_2'] >= 0.95)]['OBO'].mean()
df[(df['pct'] >= 0.95)]['OBO'].mean()

#================================================
ddgmm = BayesianGaussianMixture(
    n_components=5,
    covariance_type='full',
    weight_concentration_prior=100,
    weight_concentration_prior_type="dirichlet_distribution",
    max_iter=100,
    random_state=1337).fit(X)
pred = ddgmm.predict(X)
df_train['Class'] = pred
df_train['Class'].value_counts()

dpgmm = BayesianGaussianMixture(
    n_components=5,
    covariance_type='full',
    weight_concentration_prior=1,
    weight_concentration_prior_type='dirichlet_process',
    max_iter=100,
    random_state=1337).fit(X)
pred = dpgmm.predict(X)
df_train['Class'] = pred
df_train['Class'].value_counts()

dpgmm.predict(X_test)
dpgmm.predict_proba(X_test)
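
# A hedged aside on the two priors compared above: with the dirichlet_process
# prior the surplus mixture weights are typically driven toward zero, so the
# effective number of components can be read off weights_ after fitting. The
# synthetic blobs and the 0.01 cutoff below are assumptions for illustration.
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.mixture import BayesianGaussianMixture

X_demo, _ = make_blobs(n_samples=500, centers=3, random_state=1337)

dp_demo = BayesianGaussianMixture(
    n_components=10,
    covariance_type='full',
    weight_concentration_prior=1e-2,
    weight_concentration_prior_type='dirichlet_process',
    max_iter=500,
    random_state=1337).fit(X_demo)

print(np.round(dp_demo.weights_, 3))
print('effective components:', int(np.sum(dp_demo.weights_ > 0.01)))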
Example #22
0
import pandas as pd
import sqlite3

from sklearn.mixture import BayesianGaussianMixture
from sklearn.model_selection import train_test_split

conn = sqlite3.connect('../data_collection/binary.db')

query = (
    "select s._id, s.Title, s.Artist, s.Album, s.Acousticness, s.Danceability, s.Energy, s.Instrumentalness, "
    "s.MusicalKey, s.Liveness, s.Tempo, s.Valence, t.Name as Tag from Relationship as r\n"
    "join Song as s on s._id = r.Song_id\n"
    "join Tag as t on t._id = r.Tag_id;")

df = pd.read_sql_query(query, conn)

X = df.drop(
    labels=['_id', 'Title', 'Artist', 'Album', 'MusicalKey', 'Tempo', 'Tag'],
    axis=1)
y = df['Tag']

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.5,
                                                    random_state=42)
k_value = 2
bgmm = BayesianGaussianMixture(n_components=k_value)
bgmm.fit(X_train)

probs = bgmm.predict_proba(X_test)
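
# predict_proba above returns soft responsibilities over the two components; a
# hedged sketch of turning them into hard cluster labels and checking how well
# they line up with held-out labels. adjusted_rand_score is invariant to label
# permutation, which matters because the mixture never sees y. Synthetic data
# stands in for the SQLite features, which are not available here.
from sklearn.datasets import make_classification
from sklearn.metrics import adjusted_rand_score
from sklearn.mixture import BayesianGaussianMixture
from sklearn.model_selection import train_test_split

X_syn, y_syn = make_classification(n_samples=400, n_features=6, n_informative=4,
                                   random_state=42)
X_tr, X_te, y_tr, y_te = train_test_split(X_syn, y_syn, test_size=0.5,
                                          random_state=42)

bgmm_demo = BayesianGaussianMixture(n_components=2, random_state=42).fit(X_tr)
hard_labels = bgmm_demo.predict_proba(X_te).argmax(axis=1)
print('ARI vs. held-out labels:', adjusted_rand_score(y_te, hard_labels))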
Example #23
0
def main():
    """
    Get data from db and save it as csv
    """

    bq = BQHandler()
    io = IO(gs_bucket=options.gs_bucket)
    viz = Viz(io)

    starttime, endtime = io.get_dates(options)
    logging.info('Using dataset {} and time range {} - {}'.format(
        options.feature_dataset, starttime.strftime('%Y-%m-%d'),
        endtime.strftime('%Y-%m-%d')))

    all_param_names = options.label_params + options.feature_params + options.meta_params
    aggs = io.get_aggs_from_param_names(options.feature_params)

    if options.model == 'bgm':
        model = BayesianGaussianMixture(
            weight_concentration_prior_type="dirichlet_process",
            n_components=options.n_components)
    elif options.model == 'gaussiannb':
        model = GaussianNB()
    elif options.model == 'rfc':
        model = RandomForestClassifier(n_jobs=-1)
    elif options.model == 'svc':
        params = {'kernel': 'rbf', 'gamma': 0.5, 'C': 1, 'probability': True}
        model = SVC(**params)
    else:
        raise ValueError(
            'Model not specified or wrong. Add for example "model: bgm" to config file.'
        )

    if options.pca:
        ipca = IncrementalPCA(n_components=options.pca_components,
                              whiten=options.whiten,
                              copy=False)

    sum_columns = ['delay']
    if options.reason_code_table is not None:
        sum_columns = ['count']

    logging.info('Reading data...')
    data = bq.get_rows(starttime,
                       endtime,
                       loc_col='trainstation',
                       project=options.project,
                       dataset=options.feature_dataset,
                       table=options.feature_table,
                       parameters=all_param_names,
                       reason_code_table=options.reason_code_table,
                       only_winters=options.only_winters)

    data = io.filter_train_type(labels_df=data,
                                train_types=options.train_types,
                                sum_types=True,
                                train_type_column='train_type',
                                location_column='trainstation',
                                time_column='time',
                                sum_columns=sum_columns,
                                aggs=aggs)

    # Sorting is actually not necessary. It's been useful for debugging.
    data.sort_values(by=['time', 'trainstation'], inplace=True)
    data.set_index('time', inplace=True)

    logging.info('Data contain {} rows...'.format(len(data)))

    logging.info('Adding binary class to the dataset with limit {}...'.format(
        options.delay_limit))
    #logging.info('Adding binary class to the dataset with limit {}...'.format(limit))
    #data['class'] = data['count'].map(lambda x: 1 if x > options.delay_count_limit else -1)
    data['class'] = data['delay'].map(lambda x: 1
                                      if x > options.delay_limit else -1)
    io.log_class_dist(data.loc[:, 'class'].values, labels=[-1, 1])

    if options.balance:
        logging.info('Balancing dataset...')
        count = data.groupby('class').size().min()
        data = pd.concat([
            data.loc[data['class'] == -1].sample(n=count),
            data.loc[data['class'] == 1].sample(n=count)
        ])
        io.log_class_dist(data.loc[:, 'class'].values, labels=[-1, 1])

    if options.month:
        logging.info('Adding month to the dataset...')
        data['month'] = data.index.map(lambda x: x.month)
        options.feature_params.append('month')

    target = data.loc[:, 'class'].astype(np.int32).values.ravel()
    features = data.loc[:, options.feature_params].astype(np.float32).values

    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        target,
                                                        test_size=0.3)

    if options.normalize:
        logging.info('Normalizing data...')
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    logging.debug('Features shape after pre-processing: {}'.format(
        X_train.shape))

    if options.cv:
        logging.info('Doing random search for hyper parameters...')
        if options.model == 'bgm':
            param_grid = {
                "n_components": [1, 2, 4, 8, 16],
                "covariance_type": ['full', 'tied', 'diag', 'spherical'],
                "init_params": ['kmeans', 'random']
            }
        elif options.model == 'rfc':
            raise ("Not implemented. Get back to work!")
        elif options.model == 'svc':

            features_compinations = [
                [
                    'lat', 'lon', 'pressure', 'max_temperature',
                    'min_temperature', 'mean_temperature', 'mean_dewpoint',
                    'mean_humidity', 'mean_winddirection', 'mean_windspeedms',
                    'max_windgust', 'max_precipitation1h', 'max_snowdepth',
                    'max_n', 'min_vis', 'min_clhb', 'max_precipitation3h'
                ],
                [
                    'pressure', 'max_temperature', 'min_temperature',
                    'mean_temperature', 'mean_dewpoint', 'mean_humidity',
                    'mean_winddirection', 'mean_windspeedms', 'max_windgust',
                    'max_precipitation1h', 'max_snowdepth', 'max_n', 'min_vis',
                    'min_clhb', 'max_precipitation3h'
                ],
                [
                    'pressure', 'min_temperature', 'mean_dewpoint',
                    'mean_winddirection', 'mean_windspeedms', 'max_windgust',
                    'max_precipitation1h', 'max_snowdepth', 'max_n', 'min_vis',
                    'min_clhb', 'max_precipitation3h'
                ],
                [
                    'pressure', 'min_temperature', 'mean_dewpoint',
                    'mean_winddirection', 'mean_windspeedms', 'max_snowdepth',
                    'max_n', 'min_vis', 'min_clhb', 'max_precipitation3h'
                ],
                [
                    'pressure', 'min_temperature', 'mean_dewpoint',
                    'mean_winddirection', 'mean_windspeedms', 'max_snowdepth',
                    'max_n', 'min_vis', 'min_clhb', 'max_precipitation1h'
                ],
                [
                    'pressure', 'min_temperature', 'mean_dewpoint',
                    'mean_winddirection', 'mean_windspeedms', 'max_snowdepth',
                    'min_vis', 'max_precipitation1h'
                ],
                [
                    'pressure', 'min_temperature', 'mean_winddirection',
                    'mean_windspeedms', 'max_snowdepth', 'max_precipitation1h'
                ]
            ]

            param_grid = {
                "C": [0.0001, 0.001, 0.01, 0.1, 1],
                "kernel": ['rbf', 'poly'],
                "degree": [2, 3],
                "gamma": [0.5],
                "coef0": [0.1],
                "probability": [True],
                "features": features_compinations
            }

            from lib.svc import SVCF
            model = SVCF(all_features=options.feature_params)
        else:
            raise ("No param_grid set for given model ({})".format(
                options.model))

        print(model.get_params().keys())

        ftwo_scorer = make_scorer(fbeta_score, beta=2)
        scoring = {
            'accuracy': 'accuracy',
            'precision': 'precision',
            'recall': 'recall',
            'f1': 'f1',
            'f2': ftwo_scorer
        }

        random_search = RandomizedSearchCV(model,
                                           param_distributions=param_grid,
                                           n_iter=int(options.n_iter_search),
                                           verbose=1,
                                           scoring=scoring,
                                           refit='recall',
                                           n_jobs=-1)

        random_search.fit(X_train, y_train)

        logging.info("RandomizedSearchCV done.")
        scores = ['accuracy', 'precision', 'recall', 'f1', 'f2']
        fname = options.output_path + '/random_search_cv_results.txt'
        io.report_cv_results(random_search.cv_results_,
                             scores=scores,
                             filename=fname,
                             ext_filename=fname)
        model = random_search.best_estimator_

        io.save_scikit_model(model,
                             filename=options.save_file,
                             ext_filename=options.save_file)
        if options.normalize:
            fname = options.save_path + '/xscaler.pkl'
            io.save_scikit_model(scaler, filename=fname, ext_filename=fname)

    else:
        logging.info('Training...')
        model.fit(X_train, y_train)

        # Save model and xscaler (no reason to save xscaler before the model has fitted as well)
        io.save_scikit_model(model,
                             filename=options.save_file,
                             ext_filename=options.save_file)
        if options.normalize:
            fname = options.save_path + '/xscaler.pkl'
            io.save_scikit_model(scaler, filename=fname, ext_filename=fname)

    # Metrics
    y_pred_proba = model.predict_proba(X_test)
    y_pred = np.argmax(y_pred_proba, axis=1)
    # We want [-1,1] classes as y values are
    y_pred[y_pred == 0] = -1

    acc = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='binary')
    recall = recall_score(y_test, y_pred, average='binary')
    f1 = f1_score(y_test, y_pred, average='binary')

    logging.info('Accuracy: {}'.format(acc))
    logging.info('Precision: {}'.format(precision))
    logging.info('Recall: {}'.format(recall))
    logging.info('F1 score: {}'.format(f1))
    io.log_class_dist(y_pred, labels=[-1, 1])

    error_data = {
        'acc': [acc],
        'precision': [precision],
        'recall': [recall],
        'f1': [f1]
    }
    fname = '{}/training_time_validation_errors.csv'.format(
        options.output_path)
    io.write_csv(error_data, filename=fname, ext_filename=fname)

    # Confusion matrices
    fname = '{}/confusion_matrix_validation.png'.format(options.output_path)
    viz.plot_confusion_matrix(y_test, y_pred, np.arange(2), filename=fname)

    fname = '{}/confusion_matrix_validation_normalised.png'.format(
        options.output_path)
    viz.plot_confusion_matrix(y_test,
                              y_pred,
                              np.arange(2),
                              True,
                              filename=fname)

    # Precision-recall curve
    fname = '{}/precision-recall-curve.png'.format(options.output_path)
    viz.prec_rec_curve(y_test, y_pred_proba, filename=fname)

    # ROC
    fname = '{}/roc.png'.format(options.output_path)
    viz.plot_binary_roc(y_test, y_pred_proba, filename=fname)

    ############################################################################
    # EVALUATE
    ############################################################################
    if options.evaluate:
        logging.info('Loading test data...')
        test_data = bq.get_rows(dt.datetime.strptime('2010-01-01', "%Y-%m-%d"),
                                dt.datetime.strptime('2019-01-01', "%Y-%m-%d"),
                                loc_col='trainstation',
                                project=options.project,
                                dataset=options.feature_dataset,
                                table=options.test_table,
                                parameters=all_param_names)

        test_data = io.filter_train_type(labels_df=test_data,
                                         train_types=['K', 'L'],
                                         sum_types=True,
                                         train_type_column='train_type',
                                         location_column='trainstation',
                                         time_column='time',
                                         sum_columns=['delay'],
                                         aggs=aggs)

        # Sorting is actually not necessary. It's been useful for debugging.
        test_data.sort_values(by=['time', 'trainstation'], inplace=True)
        test_data.set_index('time', inplace=True)
        logging.info('Test data contain {} rows...'.format(len(test_data)))

        logging.info(
            'Adding binary class to the test dataset with limit {}...'.format(
                options.delay_limit))
        #logging.info('Adding binary class to the dataset with limit {}...'.format(limit))
        #data['class'] = data['count'].map(lambda x: 1 if x > options.delay_count_limit else -1)
        test_data['class'] = test_data['delay'].map(
            lambda x: 1 if x > options.delay_limit else -1)
        io.log_class_dist(test_data.loc[:, 'class'].values, labels=[-1, 1])

        if options.month:
            logging.info('Adding month to the test dataset...')
            test_data['month'] = test_data.index.map(lambda x: x.month)

        times = [('2011-02-01', '2011-03-01'), ('2016-06-01', '2016-07-01'),
                 ('2017-02-01', '2017-03-01'), ('2011-02-01', '2017-03-01')]
        for start, end in times:
            try:
                y_pred_proba, y_pred, y = predict_timerange(
                    test_data, options.feature_params, model, scaler, start,
                    end)
                perf_metrics(y_pred_proba, y_pred, y, start, end, viz, io)
            except EmptyDataError:
                logging.info('No data for {} - {}'.format(start, end))
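
# A hedged aside on the y_pred mapping used after training above: for the
# supervised estimators (GaussianNB, SVC, RandomForestClassifier) the columns
# of predict_proba follow model.classes_, so the argmax index can be mapped
# back through classes_ instead of hard-coding index 0 -> -1. The tiny
# synthetic problem below is only an illustration, not the project's data.
import numpy as np
from sklearn.naive_bayes import GaussianNB

rng = np.random.RandomState(0)
X_demo = rng.randn(100, 3)
y_demo = np.where(X_demo[:, 0] > 0, 1, -1)

clf = GaussianNB().fit(X_demo, y_demo)
proba = clf.predict_proba(X_demo)
y_hat = clf.classes_[np.argmax(proba, axis=1)]   # classes_ is sorted: [-1, 1]
print(np.array_equal(y_hat, clf.predict(X_demo)))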
Example #24
0
scaler = StandardScaler()
X = scaler.fit_transform(X)

# 1 corresponds to data_thr.rate and 4=5-1 to data_thr.rateC
w = w / np.sqrt(scaler.var_[1:])
# w = np.exp(-np.exp(3 * w.mean(axis=1)))
w = 1. / w.mean(axis=1) ** 2

Html_file = open("gmm_sklearn_files/gmm3_sklearn.html", "w")

# alpha_prior / beta_prior are not keyword arguments in the released sklearn
# API; the closest released parameters are weight_concentration_prior and
# mean_precision_prior.
gmm = BayesianGaussianMixture(n_components=3, weight_concentration_prior=0.1,
                              mean_precision_prior=1, n_init=5)
gmm.fit(X)  # , weights=w) not implemented in sklearn yet
preds = gmm.predict(X)
probs = gmm.predict_proba(X)

data_thr['preds'] = pd.Series(preds).astype("category")

color_key = ["red", "blue", "yellow", "grey", "black", "purple", "pink",
             "brown", "green", "orange"]  # Spectral9
color_key = color_key[:len(set(preds))+1]

covs = gmm.covariances_
means = gmm.means_

# transform cov for non-standardizeed data:
covs = np.array([np.dot(np.diag(np.sqrt(scaler.var_)),
                        np.dot(covs[j], np.diag(np.sqrt(scaler.var_))))
                 for j in range(covs.shape[0])])
means = np.array([scaler.inverse_transform(means[j].reshape(1, -1)).T
                  for j in range(means.shape[0])])
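
# A hedged numerical check of the back-transform used above: if D is
# diag(sqrt(scaler.var_)), then the covariance of the original data equals
# D @ cov(standardized data) @ D, which is exactly the identity applied to
# gmm.covariances_. Synthetic data only; it does not touch the variables above.
import numpy as np
from sklearn.preprocessing import StandardScaler

rng = np.random.RandomState(0)
A = rng.randn(500, 3) * [1.0, 5.0, 0.2] + [3.0, -1.0, 0.5]

sc = StandardScaler()
A_std = sc.fit_transform(A)

D = np.diag(np.sqrt(sc.var_))
cov_back = D @ np.cov(A_std, rowvar=False) @ D
print(np.allclose(cov_back, np.cov(A, rowvar=False)))   # True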
Example #25
0
    def _st_smooth(self,
                   var_idx,
                   x_v,
                   y_v=None,
                   n_component=1,
                   thresh_hold=0.3,
                   dp=False):
        mixture_dist = []
        for task_idx in range(self.num_task):
            if y_v is not None:
                mean = self.params_mean[task_idx][var_idx][x_v][y_v]
                var = self.transform_var(
                    self.params_var[task_idx][var_idx][x_v][y_v])
            else:
                mean = self.params_mean[task_idx][var_idx][x_v]
                var = self.transform_var(
                    self.params_var[task_idx][var_idx][x_v])
            mixture_dist.append({'kwargs': {'loc': mean, 'scale': var}})

        alpha = 0.3
        alpha_list = [(1 - alpha) / (self.num_task - 1)] * (self.num_task - 1)
        alpha_list.append(alpha)
        sample = create_mixture(mixture_dist, alpha_list=alpha_list)
        if dp:
            gmm = DPGMM(max_iter=1000,
                        n_components=n_component,
                        covariance_type='spherical')
        else:
            gmm = GMM(max_iter=500,
                      n_components=n_component,
                      covariance_type='spherical')
        gmm.fit(sample)

        new_idx_list = []
        for task_idx in range(self.num_task):
            if y_v is not None:
                predict_probability = gmm.predict_proba(
                    np.array(
                        self.params_mean[task_idx][var_idx][x_v][y_v]).reshape(
                            -1, 1))
            else:
                predict_probability = gmm.predict_proba(
                    np.array(self.params_mean[task_idx][var_idx][x_v]).reshape(
                        -1, 1))
            f_ = True
            while f_:
                if gmm.weights_[np.argmax(predict_probability)] > thresh_hold:
                    new_idx = np.argmax(predict_probability)
                    f_ = False
                else:
                    predict_probability[0][np.argmax(
                        predict_probability)] = 0.0
                    #self.num_merged_params += 1
            if new_idx in new_idx_list:
                self.num_merged_params += 1
            new_idx_list.append(new_idx)
            if y_v is not None:
                self.params_mean[task_idx][var_idx][x_v][y_v] = gmm.means_[
                    new_idx]
                self.params_var[task_idx][var_idx][x_v][
                    y_v] = self.retransform_var(gmm.covariances_[new_idx])
            else:
                self.params_mean[task_idx][var_idx][x_v] = gmm.means_[new_idx]
                self.params_var[task_idx][var_idx][x_v] = self.retransform_var(
                    gmm.covariances_[new_idx])
        """
Example #26
0
class Pyxelate:

    CONVOLUTIONS = np.array(
        [[[2, 2], [2, 2]], [[11, -1], [-1, -1]], [[-1, 11], [-1, -1]],
         [[-1, -1], [11, -1]], [[-1, -1], [-1, 11]], [[5, 5], [-1, -1]],
         [[-1, -1], [5, 5]], [[5, -1], [5, -1]], [[-1, 5], [-1, 5]],
         [[5, -1], [-1, 5]], [[-1, 5], [5, -1]], [[-1, 3], [3, 3]],
         [[3, -1], [3, 3]], [[3, 3], [-1, 3]], [[3, 3], [3, -1]]],
        dtype="int")

    SOLUTIONS = np.array([
        [[1, 1], [1, 1]],
        [[0, 1], [1, 1]],
        [[1, 0], [1, 1]],
        [[1, 1], [0, 1]],
        [[1, 1], [1, 0]],
        [[1, 1], [0, 0]],
        [[0, 0], [1, 1]],
        [[1, 0], [1, 0]],
        [[0, 1], [0, 1]],
        [[1, 0], [0, 1]],
        [[0, 1], [1, 0]],
        [[1, 0], [0, 0]],
        [[0, 1], [0, 0]],
        [[0, 0], [1, 0]],
        [[0, 0], [0, 1]],
    ],
                         dtype="bool")

    ITER = 2

    def __init__(self,
                 height,
                 width,
                 color=8,
                 dither=True,
                 alpha=.6,
                 regenerate_palette=True,
                 random_state=0):
        """Create instance for generating similar pixel arts."""

        self.height = int(height)
        self.width = int(width)
        if self.width < 1 or self.height < 1:
            raise ValueError("Result can not be smaller than 1x1 pixels.")
        self.color = int(color)
        if self.color < 2:
            raise ValueError("The minimum number of colors is 2.")
        elif self.color > 32:
            raise ValueError("The maximum number of colors is 32.")
        if dither:
            self.dither = 1 / (self.color + 1)
        else:
            self.dither = 0.
        self.alpha = float(alpha)
        self.regenerate_palette = bool(regenerate_palette)

        # BGM
        self.is_fitted = False
        self.random_state = int(random_state)
        self.model = BayesianGaussianMixture(
            n_components=self.color,
            max_iter=256,
            covariance_type="tied",
            weight_concentration_prior_type="dirichlet_distribution",
            mean_precision_prior=1. / 256.,
            warm_start=False,
            random_state=self.random_state)

    def convert(self, image):
        """Generate pixel art from image"""
        # does the image have alpha channel?
        if image.shape[2] == 4:
            # remove artifacts from transparent edges
            image = self._dilate(image)
            # create alpha mask
            mask = resize(image[:, :, 3], (self.height, self.width),
                          anti_aliasing=True)
            # mask for colors
            color_mask = resize(image[:, :, 3], (32, 32),
                                anti_aliasing=False).ravel()
        else:
            mask = None
            color_mask = None

        # apply adaptive contrast
        image = equalize_adapthist(
            image) * 255 * 1.14  # empirical magic number
        image[image <= 8.] = 0.

        # create sample for finding palette
        if self.regenerate_palette or not self.is_fitted:
            examples = resize(image, (32, 32),
                              anti_aliasing=False).reshape(-1, 3).astype("int")
            if color_mask is not None:
                # transparent colors should be ignored
                examples = examples[color_mask >= self.alpha]
            self._fit_model(examples)

        # resize image to 4 times the desired width and height
        image = resize(
            image, (self.height * self.ITER * 2, self.width * self.ITER * 2),
            anti_aliasing=True)
        # generate pixelated image with desired width / height
        image = self._reduce(image)

        # apply palette
        height, width, depth = image.shape
        reshaped = np.reshape(image, (height * width, depth))
        probs = self.model.predict_proba(reshaped)
        y = np.argmax(probs, axis=1)

        # increase hue and snap color values to multiples of 8
        palette = rgb2hsv(self.model.means_.reshape(-1, 1, 3))
        palette[:, :, 1] *= 1.14  # empirical magic number
        palette = hsv2rgb(palette).reshape(self.color, 3) // 8 * 8
        palette[palette ==
                248] = 255  # clamping // 8 * 8 would rarely allow 255 values

        # generate recolored image
        image = palette[y]

        # apply dither over threshold if it's not zero
        if self.dither:
            # get second best probability by removing the best one
            probs[np.arange(len(y)), y] = 0
            # get new best and values
            v = np.max(probs, axis=1)
            y = np.argmax(probs, axis=1)

            # replace every second pixel with second best color
            pad = not bool(width % 2)
            for i in range(0, len(image), 2):
                if pad:
                    # make sure to alternate between starting positions
                    i += (i // width) % 2
                if v[i] > self.dither:
                    image[i] = palette[y[i]]

        image = np.reshape(image, (height, width, depth))
        if mask is not None:
            # use transparency from original image, but make it either 0 or 255
            mask[mask >= self.alpha] = 255
            mask[mask < self.alpha] = 0
            image = np.dstack(
                (image, mask))  # result has lost its alpha channel

        return np.clip(image.astype("int"), 0, 255).astype("uint8")

    def palette_from_list(self, images):
        """Fit model to find palette using all images in list at once"""
        if self.regenerate_palette:
            warnings.warn(
                "Warning, regenerate_palette=True will cause the generated palette to be lost while converting images!",
                Warning)
        examples = []
        color_masks = []
        transparency = bool(images[0].shape[2] == 4)
        # sample from all images
        for image in images:
            image = equalize_adapthist(
                image) * 255 * 1.14  # empirical magic number
            image[image <= 8.] = 0.
            examples.append(
                resize(image, (16, 16),
                       anti_aliasing=False).reshape(-1, 3).astype("int"))
            if transparency:
                color_masks.append(
                    resize(image[:, :, 3], (16, 16), anti_aliasing=False))
        # concatenate to a single matrix
        examples = np.concatenate(examples)
        if transparency:
            # transparent colors should be ignored
            color_masks = np.concatenate(color_masks).ravel()
            examples = examples[color_masks >= self.alpha]
        self._fit_model(examples)

    def _fit_model(self, X):
        """Fit model while suppressing warnings from sklearn"""
        converge = True
        with warnings.catch_warnings(record=True) as w:
            # fit model
            self.model.fit(X)
            if w and w[-1].category == ConvergenceWarning:
                warnings.filterwarnings('ignore', category=ConvergenceWarning)
                converge = False
        if not converge:
            warnings.warn(
                "The model has failed to converge, try a different number of colors for better results!",
                Warning)
        self.is_fitted = True

    def _reduce(self, image):
        """Apply convolutions on image ITER times and generate a smaller image
		based on the highest magnitude of gradients"""

        # self is visible to decorated function
        @adapt_rgb(each_channel)
        def _wrapper(dim):
            # apply median filter for noise reduction
            dim = median(dim, square(4))
            for i in range(self.ITER):
                h, w = dim.shape
                h, w = h // 2, w // 2
                new_image = np.zeros((h * w)).astype("int")
                view = view_as_blocks(dim, (2, 2))
                flatten = view.reshape(-1, 2, 2)
                for i, f in enumerate(flatten):
                    conv = np.abs(
                        np.sum(np.multiply(self.CONVOLUTIONS,
                                           f.reshape(-1, 2, 2)).reshape(-1, 4),
                               axis=1))
                    new_image[i] = np.mean(f[self.SOLUTIONS[np.argmax(conv)]])
                new_image = new_image.reshape((h, w))
                dim = new_image.copy()
            return new_image

        return _wrapper(image)

    def _dilate(self, image):
        """Dilate semi-transparent edges to remove artifacts
		(unwanted edges, caused by transparent pixels having different colors)"""
        @adapt_rgb(each_channel)
        def _wrapper(dim):
            return dilation(dim, selem=square(4))

        # use dilated pixels for semi-transparent ones
        mask = image[:, :, 3]
        alter = _wrapper(image[:, :, :3])
        image[:, :, :3][mask < self.alpha] = alter[mask < self.alpha]
        return image
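
# A hedged usage sketch for the Pyxelate class above, assuming the imports the
# class relies on (numpy, BayesianGaussianMixture and skimage's resize, median,
# dilation, square, view_as_blocks, adapt_rgb helpers) were already executed,
# and using a scikit-image sample picture in place of real artwork.
from skimage import data

art = data.astronaut()                          # (512, 512, 3) uint8, no alpha channel
pyx = Pyxelate(height=64, width=64, color=6, dither=True, random_state=42)
pixel_art = pyx.convert(art)                    # downsampled, palette-reduced image
print(pixel_art.shape, pixel_art.dtype)         # (64, 64, 3) uint8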
Example #27
0
# In[4]:

train_dataset = train.values
X = train_dataset[:, 2:]
y = train_dataset[:, 1]
y = y.astype('int')
test_dataset = test.values
X_test = test_dataset[:, 2:]
print(type(X_test))
print('X.shape, y.shape, X_test.shape', X.shape, y.shape, X_test.shape)

# In[5]:
df = pd.DataFrame({"SK_ID_CURR": df['SK_ID_CURR']})

print('BayesianGaussianMixture begins****************')
bgm = BayesianGaussianMixture(n_components=2)
print('fitting****************')
bgm_train = bgm.fit(X, y)
print('predicting****************')
bgm_X_prediction = bgm.predict_proba(X)[:, 1]
bgm_X_test_prediction = bgm.predict_proba(X_test)[:, 1]
tr_te_concatenated = np.concatenate([bgm_X_prediction, bgm_X_test_prediction])
df['bayesian_gaussian_mixture'] = tr_te_concatenated

print('final tr_te shape', df.shape)
print(df.head())

df.to_csv('bayesian_gaussian_mixture_tr_te.csv', index=False)

print(df.head())
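
# A hedged caveat on the [:, 1] column taken above: BayesianGaussianMixture is
# unsupervised, so component 1 is not tied to the positive class and may swap
# between runs. One illustrative workaround (an assumption, not part of the
# original script) is to reorder the probability columns by the learned
# mixture weights so the chosen column is at least reproducible.
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.mixture import BayesianGaussianMixture

X_demo, _ = make_blobs(n_samples=300, centers=2, random_state=0)
bgm_demo = BayesianGaussianMixture(n_components=2, random_state=0).fit(X_demo)

order = np.argsort(bgm_demo.weights_)[::-1]              # heaviest component first
stable_col = bgm_demo.predict_proba(X_demo)[:, order][:, 1]
print(order, stable_col[:5])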
Example #28
0
    def em_stereo(self,n_component=1,dp=True,thresh_hold=0.4):
        self.num_params = 0
        #The range of len(params)
        _step = 0
        for var_idx in tqdm(range(len(self.merge_var[0]))):

            for x_v in range(len(self.merge_var[0][var_idx])):
                print('Step %d'%_step,end='\r')
                _step += 1
                try:
                    
                    for y_v in range(len(self.merge_var[0][var_idx][x_v])):
                        #print('cluster weights ....%d'%var_idx)
                        dist = []
                        for task_idx in range(len(self.merge_var)):
                            nor = np.random.normal(self.merge_var[task_idx][var_idx][x_v][y_v],np.log(1.0+np.exp(self.merge_uncertainty[task_idx][var_idx][x_v][y_v])),200)
                            dist.append(nor)
                        
                        dist = np.array(np.asmatrix(np.concatenate(dist)).T)
                        if dp:
                            print('Initializing DPGMM%d ... '%_step,end='\r')
                            gmm = DPGMM( max_iter=1000,  n_components=n_component, covariance_type='spherical')
                        else:
                            gmm = GMM( max_iter=200,  n_components=n_component, covariance_type='spherical')
                        gmm.fit(dist)
                        new_idx_list = []
                        for task_idx in range(len(self.merge_var)):
                            #if dp:
                            #Strategy 1. Set threshold
                            predict_probability = gmm.predict_proba(np.array(self.merge_var[task_idx][var_idx][x_v][y_v]).reshape(-1,1))
                            f_ = True
                            
                            while f_:
                                #if gmm.weights_[np.argmax(predict_probability)] > ( 1 / len(self.merge_var)):
                                if gmm.weights_[np.argmax(predict_probability)] > thresh_hold:
                                    new_idx = np.argmax(predict_probability)
                                    f_ = False
                                else:
                                    predict_probability[0][np.argmax(predict_probability)] = 0.0
                                    self.num_params += 1
                            

                        #else:
                        #    new_idx = gmm.predict(np.array(self.merge_var[task_idx][var_idx][x_v][y_v]).reshape(-1,1))
                        #    if new_idx in new_idx_list:
                        #        self.num_params += 1
                            new_idx_list.append(new_idx)
                            self.merge_var[task_idx][var_idx][x_v][y_v] = gmm.means_[new_idx]
                            self.merge_uncertainty[task_idx][var_idx][x_v][y_v] = np.log(np.exp(gmm.covariances_[new_idx]) - 1.0)


                except TypeError:
                    dist = []
                    
                    
                    for task_idx in range(len(self.merge_var)):
                        nor = np.random.normal(self.merge_var[task_idx][var_idx][x_v],np.log(1.0+np.exp(self.merge_uncertainty[task_idx][var_idx][x_v])),200)
                        dist.append(nor)
                    dist = np.array(np.asmatrix(np.concatenate(dist)).T)
                    if dp:
                        print('Initializing DPGMM%d ... '%_step,end='\r')
                        gmm = DPGMM( max_iter=200,  n_components=n_component, covariance_type='spherical')
                    else:
                        gmm = GMM( max_iter=200,  n_components=n_component, covariance_type='spherical')
                    gmm.fit(dist)
                    new_idx_list = []
                    for task_idx in range(len(self.merge_var)):
                        #if dp:
                        #Strategy 1. Set threshold
                        predict_probability = gmm.predict_proba(np.array(self.merge_var[task_idx][var_idx][x_v]).reshape(-1,1))
                        f_ = True
                        while f_:
                            #if gmm.weights_[np.argmax(predict_probability)] > ( 1 / len(self.merge_var)):
                            if gmm.weights_[np.argmax(predict_probability)] > thresh_hold:
                                new_idx = np.argmax(predict_probability)
                                f_ = False
                            else:
                                predict_probability[0][np.argmax(predict_probability)] = 0.0
                                self.num_params += 1

                    #else:
                    #    new_idx = gmm.predict(np.array(self.merge_var[task_idx][var_idx][x_v]).reshape(-1,1))
                    #    if new_idx in new_idx_list:
                    #        self.num_params += 1
                        new_idx_list.append(new_idx)
                        self.merge_var[task_idx][var_idx][x_v] = gmm.means_[new_idx]
                        self.merge_uncertainty[task_idx][var_idx][x_v] = np.log(np.exp(gmm.covariances_[new_idx]) - 1.0)