def do_bgm(self, n_components=6, seed=42):
    """Bayesian Gaussian Mixture.

    Infer the effective number of components in a Gaussian Mixture Model
    via variational Bayesian estimation.
    n_effective_components < n_components if the model sets some weights
    close to 0.

    Args:
        n_components (int): Number of components in GMM.
        seed (int): Random seed.

    Returns:
        bgm_output (dict): Labels and probabilities.
    """
    np.random.seed(seed)
    bgm = BayesianGaussianMixture(n_components=n_components,
                                  covariance_type='full',
                                  weight_concentration_prior=1e-2,
                                  weight_concentration_prior_type='dirichlet_process',
                                  mean_precision_prior=1e-2,
                                  init_params='random',
                                  max_iter=100,
                                  random_state=seed)
    bgm.fit(self.X)
    bgm_labels = bgm.predict(self.X)
    bgm_prob = bgm.predict_proba(self.X)[:, 0]
    bgm_output = {'bgm_labels': bgm_labels, 'bgm_prob': bgm_prob}
    return bgm_output
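The effective number of components mentioned in the docstring can be read off the fitted weights. A minimal sketch, assuming a bgm fitted as above; the 1e-2 cutoff is an illustrative choice, not part of the original:

import numpy as np

def effective_components(bgm, tol=1e-2):
    # Components whose inferred mixing weight is above tol are "active".
    return int(np.sum(bgm.weights_ > tol))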
def test_bayesian_mixture_predict_predict_proba():
    # this is the same test as test_gaussian_mixture_predict_predict_proba()
    rng = np.random.RandomState(0)
    rand_data = RandomData(rng)
    for prior_type in PRIOR_TYPE:
        for covar_type in COVARIANCE_TYPE:
            X = rand_data.X[covar_type]
            Y = rand_data.Y
            bgmm = BayesianGaussianMixture(
                n_components=rand_data.n_components,
                random_state=rng,
                weight_concentration_prior_type=prior_type,
                covariance_type=covar_type)

            # Check that an error is raised if we don't fit first
            assert_raise_message(NotFittedError,
                                 "This BayesianGaussianMixture instance"
                                 " is not fitted yet. Call 'fit' with "
                                 "appropriate arguments before using "
                                 "this method.", bgmm.predict, X)

            bgmm.fit(X)
            Y_pred = bgmm.predict(X)
            Y_pred_proba = bgmm.predict_proba(X).argmax(axis=1)
            assert_array_equal(Y_pred, Y_pred_proba)
            assert_greater_equal(adjusted_rand_score(Y, Y_pred), .95)
def partition_data(self, args):
    method, j = args
    if method == "vi":
        dp = BayesianGaussianMixture(
            n_components=self.K,
            weight_concentration_prior=self.alpha,
            max_iter=1,
            init_params='kmeans',
            weight_concentration_prior_type='dirichlet_process')
        dp.fit(self.X[self.U[j]])
        Z = dp.predict(self.X[self.U[j]]).astype(int)
        Z_star = dp.predict(self.X_star).astype(int)
    elif method == "gmm":
        Z, Z_star = self.uncollapsed_dp_partition_alt(j)
    elif method == "kmean":
        km = KMeans(n_clusters=self.K)
        Z = km.fit_predict(self.X[self.U[j]]).astype(int)
        Z_star = km.predict(self.X_star[self.U[j]]).astype(int)
    else:
        Z = np.random.choice(self.K, size=self.N_minibatch, replace=True)
        Z_star = np.random.choice(np.unique(Z), size=self.N_star, replace=True)
    le = LE()
    le.fit(np.hstack((Z, Z_star)))
    Z = le.transform(Z)
    Z_star = le.transform(Z_star)
    if method == "vi":  # & (self.vi_partition):
        Z_diff = np.setdiff1d(Z_star, Z)
        if Z_diff.size > 0:
            idx = np.hstack([np.where(Z_star == k) for k in Z_diff]).flatten()
            unique_Z = np.unique(Z)
            post_Z = dp.predict_proba(self.X_star[idx])[:, unique_Z]
            Z_star[idx] = [np.random.choice(unique_Z, p=post_Z_i / post_Z_i.sum())
                           for post_Z_i in post_Z]
            assert np.setdiff1d(Z_star, Z).size == 0
    return (Z, Z_star)
def load(self, phipsis):
    self.length = len(phipsis)
    num_component = min(10, self.length)
    gm_ = GM(n_components=num_component)
    gm_.fit(X=phipsis)
    weights = gm_.weights_
    to_keep = weights > 0.05
    num_component = sum(to_keep)
    gm = GM(n_components=num_component)
    gm.fit(X=phipsis)
    precisions = gm.precisions_cholesky_
    # self.means = gm.means_
    self.phipsis = phipsis
    weight = np.mean(precisions[:, 0, 0]) + np.mean(precisions[:, 1, 1])
    weight = weight * self.weight_scaling_factor  # for matcher weight
    self.weight = min(weight, 1)
    self.weight *= self.weight_accom_factor
    covs = gm.covariances_
    cov_invs = np.array([np.linalg.inv(cov) for cov in covs])
    cluster_dist = gm.predict_proba(phipsis)
    self.cov_dist = np.einsum("ijk, li->ljk", cov_invs, cluster_dist)
    self.gm = gm  # for matcher weight
    # matcher_weight should be a product of the precision/clustering
    # behaviour of the distribution, and the posterior probability of the
    # queried point. So, higher clustering but point does not belong in
    # distribution => other pressures acting on queried point => should
    # assign lower weight. Lower clustering and point belong => low
    # clustering means low pressure on point, so it shouldn't matter that
    # much.
    return
def test_bayesian_mixture_predict_predict_proba():
    # this is the same test as test_gaussian_mixture_predict_predict_proba()
    rng = np.random.RandomState(0)
    rand_data = RandomData(rng)
    for prior_type in PRIOR_TYPE:
        for covar_type in COVARIANCE_TYPE:
            X = rand_data.X[covar_type]
            Y = rand_data.Y
            bgmm = BayesianGaussianMixture(
                n_components=rand_data.n_components,
                random_state=rng,
                weight_concentration_prior_type=prior_type,
                covariance_type=covar_type)

            # Check that an error is raised if we don't fit first
            assert_raise_message(
                NotFittedError,
                "This BayesianGaussianMixture instance"
                " is not fitted yet. Call 'fit' with "
                "appropriate arguments before using "
                "this estimator.", bgmm.predict, X)

            bgmm.fit(X)
            Y_pred = bgmm.predict(X)
            Y_pred_proba = bgmm.predict_proba(X).argmax(axis=1)
            assert_array_equal(Y_pred, Y_pred_proba)
            assert adjusted_rand_score(Y, Y_pred) >= .95
def detectDoublet(args):
    counts_matrix = readMatrix(args.input, binary=False)
    scrub = scr.Scrublet(counts_matrix,
                         expected_doublet_rate=0.06,
                         sim_doublet_ratio=3,
                         n_neighbors=25)
    doublet_scores, _ = scrub.scrub_doublets(
        min_counts=1,
        min_cells=3,
        min_gene_variability_pctl=85,
        mean_center=True,
        normalize_variance=True,
        n_prin_comps=min(30, counts_matrix.get_shape()[0] // 10))

    # Fit a Gaussian mixture model
    X = scrub.doublet_scores_sim_
    X = np.array([X]).T
    gmm = BayesianGaussianMixture(n_components=2,
                                  max_iter=1000,
                                  random_state=2394).fit(X)
    i = np.argmax(gmm.means_)
    probs_sim = gmm.predict_proba(X)[:, i]
    vals = X[np.argwhere(probs_sim > 0.5)].flatten()
    if vals.size == 0:
        threshold = np.amax(X.flatten())
    else:
        threshold = min(vals)

    X = np.array([doublet_scores]).T
    probs = gmm.predict_proba(X)[:, i].tolist()
    with open(args.output, 'w') as fl:
        fl.write('\t'.join(map(str, probs)))
        fl.write("\n")
        fl.write(str(threshold))
        fl.write("\n")
        fl.write('\t'.join(map(str, (doublet_scores.tolist()))))
        fl.write("\n")
        fl.write('\t'.join(map(str, scrub.doublet_scores_sim_)))
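readMatrix is assumed above but not shown. A minimal stand-in under the assumption that the input is a Matrix Market file of cell-by-gene counts; the scipy calls are illustrative, not the original helper:

import scipy.io
import scipy.sparse

def readMatrix(path, binary=False):
    # Hypothetical stand-in: load a sparse count matrix from a .mtx file.
    mat = scipy.sparse.csc_matrix(scipy.io.mmread(path))
    if binary:
        mat.data[:] = 1  # keep only presence/absence
    return mat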
def create_dpgmm_proba(
    train_features,
    test_features,
    columns,
    path=None,
    config={},
    kind="g",
    is_concat=False,
):
    from sklearn.mixture import BayesianGaussianMixture

    if is_concat:
        print("Caution: You use test data to make dpgmm feature.")
        data = pd.concat([train_features[columns], test_features[columns]],
                         axis=0)
        if path is None:
            dpgmm = BayesianGaussianMixture(**config)
            dpgmm.fit(data)
        else:
            with open(path, "rb") as f:
                dpgmm = joblib.load(f)
        proba = dpgmm.predict_proba(data)
        train2 = proba[: train_features.shape[0]]
        test2 = proba[-test_features.shape[0]:]
    else:
        if path is None:
            dpgmm = BayesianGaussianMixture(**config)
            dpgmm.fit(train_features[columns])
        else:
            with open(path, "rb") as f:
                dpgmm = joblib.load(f)
        train2 = dpgmm.predict_proba(train_features[columns])
        test2 = dpgmm.predict_proba(test_features[columns])

    n_cluster = train2.shape[1]
    train2 = pd.DataFrame(
        train2, columns=[f"dpgmm_{kind}-{i}" for i in range(n_cluster)])
    test2 = pd.DataFrame(
        test2, columns=[f"dpgmm_{kind}-{i}" for i in range(n_cluster)])
    train_features = pd.concat((train_features, train2), axis=1)
    test_features = pd.concat((test_features, test2), axis=1)
    return train_features, test_features
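A hypothetical usage sketch for the helper above on synthetic data; the column names and mixture config are illustrative assumptions:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
cols = [f"g-{i}" for i in range(4)]
train_df = pd.DataFrame(rng.normal(size=(100, 4)), columns=cols)
test_df = pd.DataFrame(rng.normal(size=(50, 4)), columns=cols)
train_df, test_df = create_dpgmm_proba(
    train_df, test_df, columns=cols, kind="g",
    config={"n_components": 3, "random_state": 42})
print([c for c in train_df.columns if c.startswith("dpgmm_")])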
def load(self, phipsis):
    self.length = len(phipsis)
    if np.allclose(phipsis, np.full(phipsis.shape, 360)):
        self.to_skip = True
        return
    i_to_ignore = np.array(phipsis == np.array([360., 360.]))[:, 0]
    self.ignored_i = i_to_ignore
    phipsis = phipsis[~i_to_ignore]
    phipsi_median = np.median(phipsis, axis=0)
    phipsis = phipsis - phipsi_median
    phipsis[phipsis > 180] -= 360.
    phipsis[phipsis < -180] += 360.
    gm_ = GM(n_components=30)
    gm_.fit(X=phipsis)
    weights = gm_.weights_
    to_keep = weights > 0.05
    num_component = sum(to_keep)
    gm = GM(n_components=num_component)
    gm.fit(X=phipsis)
    precisions = gm.precisions_cholesky_
    # self.means = gm.means_
    self.phipsis = phipsis
    self.medians = phipsi_median
    weight = np.mean(precisions[:, 0, 0]) + np.mean(precisions[:, 1, 1])
    weight = weight * self.weight_scaling_factor  # for matcher weight
    self.weight = float(min(weight, 1.))
    covs = gm.covariances_
    cov_invs = np.array([np.linalg.inv(cov) for cov in covs])
    cluster_dist = gm.predict_proba(phipsis)
    self.cov_dist = np.einsum("ijk, li->ljk", cov_invs, cluster_dist)
    self.gm = gm  # for matcher weight
    # matcher_weight should be a product of the precision/clustering
    # behaviour of the distribution, and the posterior probability of the
    # queried point. So, higher clustering but point does not belong in
    # distribution => other pressures acting on queried point => should
    # assign lower weight. Lower clustering and point belong => low
    # clustering means low pressure on point, so it shouldn't matter that
    # much.
    return
class GMMTask(BaseInternalTask):
    """"""
    def __init__(self, n_components, db_fn, n_iter,
                 tids=None, split=None, alg='em'):
        """"""
        super(GMMTask, self).__init__(n_components, db_fn, n_iter,
                                      tids, split, alg)
        self.A, self.doc_hash = self.read(db_fn)
        if alg == 'em':
            self.gmm = GaussianMixture(self.k, max_iter=n_iter)
        elif alg == 'variational':
            self.gmm = BayesianGaussianMixture(self.k, max_iter=n_iter)

    def process(self):
        """"""
        if self.tids is not None:
            keep_doc = OrderedDict(
                filter(lambda x: x[0] in self.tids, self.doc_hash.items()))
            self.A = self.A[keep_doc.values()]
            self.doc_hash = OrderedDict(
                zip(keep_doc.keys(), range(len(keep_doc))))
        self.gmm.fit(self.A)
        self.U = self.gmm.predict_proba(self.A)

    @property
    def data(self):
        """"""
        return {
            'item_factors': self.U,
            'term_factors': self.V,
            'tids': self.doc_hash.keys(),
            'uids': self.term_hash.keys(),
            'factor_labels': self.gmm.means_
        }
def main():
    infile = sys.argv[1]
    outfile = sys.argv[2]
    k = int(sys.argv[3])
    data = np.genfromtxt(infile, delimiter=',')
    print(
        "Received {} points, clustering with {} mixture components and 2 inits"
        .format(data.shape[0], k))
    if data.size > 0:
        scaler = StandardScaler()
        clusterer = BayesianGaussianMixture(k, n_init=2)
        data = scaler.fit_transform(data)
        converged = False
        while not converged:
            try:
                clusterer.fit(data)
                converged = True
            except ValueError:
                clusterer.n_components -= 1
                print(f"Retrying with {clusterer.n_components} components.")
        labels = clusterer.predict_proba(data)
        labels = prune(clusterer, labels, 0.001)
        print("Finished clustering")
    else:
        labels = []
        print("Insufficient data to cluster")
    with open(outfile, 'w') as f:
        for sample in labels:
            f.write(", ".join(map(str, sample)))
            f.write("\n")
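prune is called above but not defined in this excerpt. A minimal sketch under the assumption that it drops the probability columns of components whose mixing weight falls below the cutoff and renormalizes; the original helper may differ:

import numpy as np

def prune(clusterer, labels, min_weight):
    # Assumed behaviour: discard responsibilities of near-empty components.
    keep = clusterer.weights_ >= min_weight
    pruned = labels[:, keep]
    return pruned / pruned.sum(axis=1, keepdims=True)  # renormalize rows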
plot_segmentation(Y[30:-30, 30:-30], savefn=fn_segs)

'''Scikit's VB GMM'''

# Initialize model
model = BayesianGaussianMixture(n_components=K, max_iter=max_iter, verbose=3)

# Fit model
model.fit(X.reshape((-1, 1)))

# Segment image
Y_h = model.predict(X.reshape((-1, 1))).reshape((H, W))

# Obtain posteriors
post = model.predict_proba(X.reshape((-1, 1))).reshape((H, W, K))

# Set cluster assignments to correct tissue labels
Y_h = set_classes(Y_h, z)

# Compute error
err[0, n, r] = np.mean(Y_h[M] != Y[M])
dcc[0, n, r] = dice(Y_h[M], Y[M])

if vis:
    fn_segs = fn + 'SCK_segs' + str(n + 1) + '_r' + str(r + 1) + '.png'
    plot_segmentation(Y_h[30:-30, 30:-30], savefn=fn_segs)

    fn_segs = fn + 'SCK_segl' + str(n + 1) + '_r' + str(r + 1) + '.png'
    plot_clustering(X[30:-30, 30:-30, 0],
X2, y2 = make_blobs(n_samples=250, centers=1, random_state=42)
X2 = X2 + [6, -8]
X = np.r_[X1, X2]
y = np.r_[y1, y2]

# Train model
# Setting n_components higher than needed:
# BGM weights zero for unnecessary clusters
bgm = BayesianGaussianMixture(n_components=10, n_init=10, random_state=42)
bgm.fit(X)
print("EM Estimates", bgm.weights_)
print("EM Means", bgm.means_[:4])
print("EM Covariances", bgm.covariances_[:3])
print("Convergence, and iterations", bgm.converged_, bgm.n_iter_)
print("Hard clustering predictions", bgm.predict(X))
print("Hard clustering probabilities", bgm.predict_proba(X)[:1])

# Train models with high weight concentrations on datapoints:
# weight_concentration_prior 0.01 and 10000, with the weight priors
# dictating the optimal number of clusters
bgm_low = BayesianGaussianMixture(n_components=10, max_iter=1000, n_init=1,
                                  weight_concentration_prior=0.01,
                                  random_state=42)
bgm_high = BayesianGaussianMixture(n_components=10, max_iter=1000, n_init=1,
                                   weight_concentration_prior=10000,
                                   random_state=42)

nn = 73
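bgm_low and bgm_high are constructed above but not fitted in this excerpt. A short sketch, assuming the same X, of how the concentration prior shows up in the learned weights:

bgm_low.fit(X)
bgm_high.fit(X)
# A low concentration prior drives unneeded weights toward zero,
# while a high one spreads mass across more components.
print("low prior weights ", np.round(bgm_low.weights_, 2))
print("high prior weights", np.round(bgm_high.weights_, 2))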
random_seed = 27132
n_components = 10
rng = np.random.RandomState(seed=random_seed)  # fix a seed

DProc = BayesianGaussianMixture(
    n_components=n_components,
    weight_concentration_prior_type="dirichlet_process",
    weight_concentration_prior=1e-1,
    n_init=7,
    init_params='kmeans',      # default 'kmeans'
    random_state=random_seed   # if int, then taken as random seed
).fit(Y)  # random_state=random_state

results = DProc.predict(Y)
probs = DProc.predict_proba(Y)
res_prob = np.column_stack((probs, results))
# res_prob = np.around(res_prob, decimals=3)  # around() for arrays

# context manager controls precision within the next block of print commands
with printoptions(precision=3, suppress=True):
    print(results)
    print("\nposterior prob:\n", probs)
    print("\nmean:\n", DProc.means_)
    print("\ncovariances\n", DProc.covariances_)
    print("\nweights", DProc.weights_)

print("\nCount the clusters\n")
print(Counter(results).keys())    # equals to list(set(words))
print(Counter(results).values())  # count freq of the elements

np.savetxt('BGM_hypo4_Out.csv', results, fmt='%1.1f', delimiter=',')
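printoptions is used above as a context manager but not defined here. A common recipe matching that usage, wrapping numpy's global print options so they are restored afterwards:

from contextlib import contextmanager
import numpy as np

@contextmanager
def printoptions(*args, **kwargs):
    original = np.get_printoptions()
    np.set_printoptions(*args, **kwargs)
    try:
        yield
    finally:
        np.set_printoptions(**original)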
def cluster_bayesian_gmm(onehot_input):
    bgmm = BayesianGaussianMixture(n_components=10).fit(onehot_input)
    proba = bgmm.predict_proba(onehot_input)  # soft cluster memberships
    return np.array(proba)
def fit(self, X, Y, epochs, batch_size):
    EPOCHS = epochs
    BATCH_SIZE = batch_size
    n = len(X)
    XY = np.concatenate((X, Y), axis=1)
    # df = n - 1
    self._X = X.copy()
    hidden_neurons = self.hidden_neurons

    if self.n_mixtures == -1:
        # choose the number of mixtures by BIC over several covariance types
        lowest_bic = np.inf
        bic = []
        n_components_range = range(1, 7)
        cv_types = ['spherical', 'tied', 'diag', 'full']
        for cv_type in cv_types:
            for n_components in n_components_range:
                # Fit a Gaussian mixture with EM
                gmm = GaussianMixture(n_components=n_components,
                                      covariance_type=cv_type,
                                      max_iter=10000)
                gmm.fit(XY)
                bic.append(gmm.bic(XY))
                if bic[-1] < lowest_bic:
                    lowest_bic = bic[-1]
                    best_gmm = gmm
                    self.n_mixtures = n_components
        clusterer = HDBSCAN()
        clusterer.fit(XY)
        if len(np.unique(clusterer.labels_)) < self.n_mixtures:
            self.n_mixtures = len(np.unique(clusterer.labels_))
        if self.gmm_boost:
            if len(np.unique(clusterer.labels_)) < self.n_mixtures:
                clusterer = HDBSCAN()
                clusterer.fit(X)
                clusters = clusterer.labels_
            else:
                clusterer = best_gmm
                clusterer.fit(X)
                clusters = clusterer.predict_proba(X)
            self._clusterer = clusterer
            X = np.concatenate((X, clusters), axis=1)
    elif self.gmm_boost:
        clusterer1 = BayesianGaussianMixture(n_components=self.n_mixtures,
                                             covariance_type='full',
                                             max_iter=10000)
        clusterer1.fit(X)
        clusters = clusterer1.predict_proba(X)
        self._clusterer = clusterer1
        clusterer2 = HDBSCAN()
        clusterer2.fit(X)
        if len(np.unique(clusterer2.labels_)) < self.n_mixtures:
            clusters = clusterer2.labels_
            self._clusterer = clusterer2
        X = np.concatenate((X, clusters), axis=1)

    self._y = Y.copy()
    dataset = tf.compat.v1.data.Dataset \
        .from_tensor_slices((X, Y)) \
        .repeat(EPOCHS).shuffle(len(X)).batch(BATCH_SIZE)
    iter_ = tf.compat.v1.data.make_one_shot_iterator(dataset)
    x, y = iter_.get_next()

    K = self.n_mixtures
    self.K = K
    self.x = x

    # map activation names to TF functions ('linear' means no activation)
    activations = {
        'crelu': tf.nn.crelu, 'relu6': tf.nn.relu6, 'elu': tf.nn.elu,
        'selu': tf.nn.selu, 'leaky_relu': tf.nn.leaky_relu,
        'relu': tf.nn.relu, 'swish': tf.nn.swish, 'tanh': tf.nn.tanh,
        'linear': None, 'softplus': tf.nn.softplus,
        'sigmoid': tf.nn.sigmoid, 'softmax': tf.nn.softmax,
    }
    input_actv = activations.get(self.input_activation.lower(), tf.nn.relu)
    h_actv = activations.get(self.hidden_activation.lower(), tf.nn.relu)

    n_layer = len(hidden_neurons)
    if n_layer < 1:
        self.layer_last = tf.layers.dense(x, units=self.input_neurons,
                                          activation=input_actv)
    else:
        self.layer_1 = tf.layers.dense(x, units=self.input_neurons,
                                       activation=input_actv)
        # build hidden layers layer_2 ... layer_last on top of layer_1
        prev = self.layer_1
        for i in range(2, n_layer + 2):
            n_neurons = hidden_neurons[i - 2]
            name = 'layer_last' if i == n_layer + 1 else 'layer_%d' % i
            setattr(self, name,
                    tf.layers.dense(prev, units=n_neurons, activation=h_actv))
            prev = getattr(self, name)
    self.mu = tf.layers.dense(self.layer_last, units=K, activation=None,
                              name="mu")
    self.var = tf.exp(
        tf.layers.dense(self.layer_last, units=K, activation=None,
                        name="sigma"))
    self.pi = tf.layers.dense(self.layer_last, units=K,
                              activation=tf.nn.softmax, name="mixing")

    if not self.tf_mixture_family:
        # ---------------- Not using TF Mixture Family ------------------------
        if self.dist.lower() == 'normal':
            self.likelihood = tfp.distributions.Normal(loc=self.mu,
                                                       scale=self.var)
        elif self.dist.lower() in ('laplacian', 'laplace'):
            self.likelihood = tfp.distributions.Laplace(loc=self.mu,
                                                        scale=self.var)
        elif self.dist.lower() == 'lognormal':
            self.likelihood = tfp.distributions.LogNormal(loc=self.mu,
                                                          scale=self.var)
        elif self.dist.lower() == 'gamma':
            alpha = (self.mu**2) / self.var
            beta = self.var / self.mu
            self.likelihood = tfp.distributions.Gamma(concentration=alpha,
                                                      rate=beta)
        else:
            self.likelihood = tfp.distributions.Normal(loc=self.mu,
                                                       scale=self.var)
        self.out = self.likelihood.prob(y)
        self.out = tf.multiply(self.out, self.pi)
        self.out = tf.reduce_sum(self.out, 1, keepdims=True)
        self.out = -tf.log(self.out + 1e-10)
        self.mean_loss = tf.reduce_mean(self.out)
    else:
        # -------------------- Using TF Mixture Family ------------------------
        self.mixture_distribution = tfp.distributions.Categorical(
            probs=self.pi)
        if self.dist.lower() == 'normal':
            self.distribution = tfp.distributions.Normal(loc=self.mu,
                                                         scale=self.var)
        elif self.dist.lower() in ('laplacian', 'laplace'):
            self.distribution = tfp.distributions.Laplace(loc=self.mu,
                                                          scale=self.var)
        elif self.dist.lower() == 'lognormal':
            # self.distribution = tfp.edward2.LogNormal(loc=self.mu, scale=self.var)
            self.distribution = tfp.distributions.LogNormal(loc=self.mu,
                                                            scale=self.var)
        elif self.dist.lower() == 'gamma':
            alpha = (self.mu**2) / self.var
            beta = self.var / self.mu
            self.distribution = tfp.distributions.Gamma(concentration=alpha,
                                                        rate=beta)
        else:
            self.distribution = tfp.distributions.Normal(loc=self.mu,
                                                         scale=self.var)
        self.likelihood = tfp.distributions.MixtureSameFamily(
            mixture_distribution=self.mixture_distribution,
            components_distribution=self.distribution)
        self.log_likelihood = -self.likelihood.log_prob(tf.transpose(y))
        self.mean_loss = tf.reduce_mean(self.log_likelihood)

    # ----------------------------------------------------------------------
    self.global_step = tf.Variable(0, trainable=False)

    # map optimizer names to TF optimizer classes
    optimizers = {
        'adam': tf.compat.v1.train.AdamOptimizer,
        'adadelta': tf.compat.v1.train.AdadeltaOptimizer,
        'adagradda': tf.compat.v1.train.AdagradDAOptimizer,
        'adagrad': tf.compat.v1.train.AdagradOptimizer,
        'ftrl': tf.compat.v1.train.FtrlOptimizer,
        'gradientdescent': tf.compat.v1.train.GradientDescentOptimizer,
        'momentum': tf.compat.v1.train.MomentumOptimizer,
        'proximaladagrad': tf.compat.v1.train.ProximalAdagradOptimizer,
        'proximalgradientdescent': tf.compat.v1.train.ProximalGradientDescentOptimizer,
        'rmsprop': tf.compat.v1.train.RMSPropOptimizer,
    }
    opt_cls = optimizers.get(self.optimizer.lower(),
                             tf.compat.v1.train.AdamOptimizer)
    self.train_op = opt_cls(
        learning_rate=self.learning_rate).minimize(self.mean_loss)

    self.init = tf.compat.v1.global_variables_initializer()

    # Initialize coefficients
    self.sess = tf.compat.v1.Session()
    self.sess.run(self.init)

    best_loss = 1e+10
    self.stopping_step = 0
    for i in range(EPOCHS * (n // BATCH_SIZE)):
        _, loss, mu, var, pi, x__ = self.sess.run([
            self.train_op, self.mean_loss, self.mu, self.var, self.pi, self.x
        ])
        if loss < best_loss:
            self.stopping_step = 0
            self.best_loss = loss
            best_mu = mu
            best_var = var
            best_pi = pi
            best_mean_y = mu[:, 0]
            best_x = x__
            best_loss = loss
            print("Epoch: {} Loss: {:3.3f}".format(i, loss))
        else:
            self.stopping_step += 1
            if self.stopping_step >= self.early_stopping:
                self.should_stop = True
                print("Early stopping triggered at step: {} loss: {}".format(
                    i, loss))
                return

    self._mean_y_train = mu[:, 0]
    self._dist_mu_train = mu
    self._dist_var_train = var
    self._dist_pi_train = pi
    self._x_data_train = x__
print(input4bgmm.shape)

# clustering
grouper = BGM(n_components=nCluster)
grouper.fit(input4bgmm)
if tosavemodel:
    # save the model
    pickle.dump(grouper, open(savename, 'wb'))

Tprocess1 = time.time()
print('\n', '## CLUSTERING RUNTIME:', Tprocess1 - Tprocess0)  # Timer end

# brief examination
y_pred = grouper.predict(input4bgmm)
y_max = np.max(y_pred)
y_proba = grouper.predict_proba(input4bgmm)  # probability of being a certain group

# group = [(number of group members): images, group label, probability for each group]
group = [[] for _ in range(y_max + 1)]
id_group = [[] for _ in range(y_max + 1)]
group_noise = []  # not in any group
for ix in range(len(y_pred)):
    for ig in range(len(group)):
        if y_pred[ix] == ig:
            tmp = [
                X_train[ix].reshape(imagesize[0], imagesize[1]),
                y_proba[ix]
            ]
            group[ig].append(tmp)
            id_group[ig].append(id_train[ix])
        elif y_pred[ix] == -1:
            tmp = [
class Pyxelate:

    CONVOLUTIONS = np.array(
        [[[2, 2], [2, 2]],
         [[11, -1], [-1, -1]], [[-1, 11], [-1, -1]],
         [[-1, -1], [11, -1]], [[-1, -1], [-1, 11]],
         [[5, 5], [-1, -1]], [[-1, -1], [5, 5]],
         [[5, -1], [5, -1]], [[-1, 5], [-1, 5]],
         [[5, -1], [-1, 5]], [[-1, 5], [5, -1]],
         [[-1, 3], [3, 3]], [[3, -1], [3, 3]],
         [[3, 3], [-1, 3]], [[3, 3], [3, -1]]], dtype="int")

    SOLUTIONS = np.array(
        [[[1, 1], [1, 1]],
         [[0, 1], [1, 1]], [[1, 0], [1, 1]],
         [[1, 1], [0, 1]], [[1, 1], [1, 0]],
         [[1, 1], [0, 0]], [[0, 0], [1, 1]],
         [[1, 0], [1, 0]], [[0, 1], [0, 1]],
         [[1, 0], [1, 0]], [[0, 1], [0, 1]],
         [[1, 0], [0, 0]], [[0, 1], [0, 0]],
         [[0, 0], [1, 0]], [[0, 0], [0, 1]]], dtype="bool")

    ITER = 2

    def __init__(self, height, width, color=8, dither=True,
                 regenerate_palette=True, random_state=0):
        """Create instance for generating similar pixel arts."""
        self.height = int(height)
        self.width = int(width)
        if self.width < 1 or self.height < 1:
            raise ValueError("Result can not be smaller than 1x1 pixels.")
        self.color = int(color)
        if self.color < 2:
            raise ValueError("The minimum number of colors is 2.")
        if dither:
            self.dither = 1 / (self.color + 1)
        else:
            self.dither = 0.
        self.regenerate_palette = bool(regenerate_palette)
        self.is_fitted = False
        self.random_state = int(random_state)
        self.model = BayesianGaussianMixture(
            n_components=self.color,
            max_iter=256,
            covariance_type="tied",
            weight_concentration_prior_type="dirichlet_distribution",
            mean_precision_prior=1. / 256.,
            warm_start=False,
            random_state=self.random_state)

    def convert(self, image):
        """Generate pixel art from image"""
        # apply adaptive contrast
        image = equalize_adapthist(image) * 255 * 1.14
        image[image <= 8.] = 0.
        # create sample for finding palette
        if self.regenerate_palette or not self.is_fitted:
            examples = resize(image, (32, 32),
                              anti_aliasing=False).reshape(-1, 3).astype("int")
            self.model.fit(examples)
            self.is_fitted = True
        # resize image to 4 times the desired width and height
        image = resize(
            image, (self.height * self.ITER * 2, self.width * self.ITER * 2),
            anti_aliasing=True)
        # generate pixelated image with desired width / height
        image = self._reduce(image)
        # apply palette
        height, width, depth = image.shape
        reshaped = np.reshape(image, (height * width, depth))
        probs = self.model.predict_proba(reshaped)
        y = np.argmax(probs, axis=1)
        # increase hue and snap color values to multiples of 8
        palette = rgb2hsv(self.model.means_.reshape(-1, 1, 3))
        palette[:, :, 1] *= 1.14
        palette = hsv2rgb(palette).reshape(self.color, 3) // 8 * 8
        # generate recolored image
        image = palette[y]
        # apply dither over threshold if it's not zero
        if self.dither:
            # get second best probability by removing the best one
            probs[np.arange(len(y)), y] = 0
            # get new best and values
            v = np.max(probs, axis=1)
            y = np.argmax(probs, axis=1)
            # replace every second pixel with second best color
            pad = not bool(width % 2)
            for i in range(0, len(image), 2):
                if pad:
                    i += (i // width) % 2
                if v[i] > self.dither:
                    image[i] = palette[y[i]]
        image = np.reshape(image, (height, width, depth))
        return np.clip(image.astype("int"), 0, 255)

    def _reduce(self, image):
        """Apply convolutions on image ITER times and generate a smaller image
        based on the highest magnitude of gradients"""

        # self is visible to decorated function
        @adapt_rgb(each_channel)
        def _wrapper(dim):
            # apply median filter for noise reduction
            dim = median(dim, square(4))
            for i in range(self.ITER):
                h, w = dim.shape
                h, w = h // 2, w // 2
                new_image = np.zeros((h * w)).astype("int")
                view = view_as_blocks(dim, (2, 2))
                flatten = view.reshape(-1, 2, 2)
                for i, f in enumerate(flatten):
                    conv = np.abs(
                        np.sum(np.multiply(self.CONVOLUTIONS,
                                           f.reshape(-1, 2, 2)).reshape(-1, 4),
                               axis=1))
                    new_image[i] = np.mean(f[self.SOLUTIONS[np.argmax(conv)]])
                new_image = new_image.reshape((h, w))
                dim = new_image.copy()
            return new_image

        return _wrapper(image)
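A hypothetical usage sketch for the class above; it assumes scikit-image for I/O, and the file names are placeholders:

from skimage import io

img = io.imread("input.png")  # placeholder path, RGB image expected
pyx = Pyxelate(height=32, width=32, color=8, dither=True)
art = pyx.convert(img)
io.imsave("pixel_art.png", art.astype("uint8"))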
def main():
    parser = argparse.ArgumentParser(
        description=
        "Train Parallel WaveGAN (See detail in parallel_wavegan/bin/train.py)."
    )
    parser.add_argument("--outdir", type=str, required=True,
                        help="Path of output directory.")
    parser.add_argument("--config", type=str, required=True,
                        help="Path of config file.")
    parser.add_argument("--verbose", type=int, default=1, help="verbose")
    args = parser.parse_args()
    logging.info("get args")

    # set logger
    if args.verbose > 1:
        logging.basicConfig(
            level=logging.DEBUG,
            stream=sys.stdout,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    elif args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            stream=sys.stdout,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    else:
        logging.basicConfig(
            level=logging.WARN,
            stream=sys.stdout,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
        logging.warning("Skip DEBUG/INFO messages")

    # load and save config
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    config.update(vars(args))
    for key, value in config.items():
        logging.info(f"{key} = {value}")

    train_features = pd.read_csv("../input/lish-moa/train_features.csv")
    test_features = pd.read_csv("../input/lish-moa/test_features.csv")
    GENES = [col for col in train_features.columns if col.startswith("g-")]
    CELLS = [col for col in train_features.columns if col.startswith("c-")]
    logging.info("load data.")

    if config["norm_type"] == "RankGauss":
        train_features, test_features = apply_rank_gauss(
            train_features,
            test_features,
            columns=GENES + CELLS,
            config=config["QuantileTransformer"],
        )
        logging.info("Normalize by RankGauss.")
    elif config["norm_type"] == "zscore":
        train_features, test_features = apply_zscore(train_features,
                                                     test_features,
                                                     columns=GENES + CELLS)
        logging.info("Normalize by zscore.")

    dpgmm = BayesianGaussianMixture(**config["BayesianGaussianMixture_g"])
    dpgmm.fit(train_features[GENES])
    with open(os.path.join(args.outdir, f"dpgmm_{config['norm_type']}_g.job"),
              "wb") as f:
        joblib.dump(dpgmm, f)
    proba = dpgmm.predict_proba(train_features[GENES])
    plt.figure()
    plt.imshow(proba, aspect="auto")
    plt.title("train_dpgmm_g")
    plt.colorbar()
    plt.savefig(os.path.join(args.outdir, "train_dpgmm_g.png"))
    plt.close()
    proba = dpgmm.predict_proba(test_features[GENES])
    plt.figure()
    plt.imshow(proba, aspect="auto")
    plt.title("test_dpgmm_g")
    plt.colorbar()
    plt.savefig(os.path.join(args.outdir, "test_dpgmm_g.png"))
    plt.close()
    logging.info("finish g.")

    dpgmm = BayesianGaussianMixture(**config["BayesianGaussianMixture_c"])
    dpgmm.fit(train_features[CELLS])
    with open(os.path.join(args.outdir, f"dpgmm_{config['norm_type']}_c.job"),
              "wb") as f:
        joblib.dump(dpgmm, f)
    proba = dpgmm.predict_proba(train_features[CELLS])
    plt.figure()
    plt.imshow(proba, aspect="auto")
    plt.title("train_dpgmm_c")
    plt.colorbar()
    plt.savefig(os.path.join(args.outdir, "train_dpgmm_c.png"))
    plt.close()
    proba = dpgmm.predict_proba(test_features[CELLS])
    plt.figure()
    plt.imshow(proba, aspect="auto")
    plt.title("test_dpgmm_c")
    plt.colorbar()
    plt.savefig(os.path.join(args.outdir, "test_dpgmm_c.png"))
    plt.close()
    logging.info("finish c.")
class Pyxelate:

    CONVOLUTIONS = np.array(
        [[[2, 2], [2, 2]],
         [[11, -1], [-1, -1]], [[-1, 11], [-1, -1]],
         [[-1, -1], [11, -1]], [[-1, -1], [-1, 11]],
         [[5, 5], [-1, -1]], [[-1, -1], [5, 5]],
         [[5, -1], [5, -1]], [[-1, 5], [-1, 5]],
         [[5, -1], [-1, 5]], [[-1, 5], [5, -1]],
         [[-1, 3], [3, 3]], [[3, -1], [3, 3]],
         [[3, 3], [-1, 3]], [[3, 3], [3, -1]]], dtype="int")

    SOLUTIONS = np.array(
        [[[1, 1], [1, 1]],
         [[0, 1], [1, 1]], [[1, 0], [1, 1]],
         [[1, 1], [0, 1]], [[1, 1], [1, 0]],
         [[1, 1], [0, 0]], [[0, 0], [1, 1]],
         [[1, 0], [1, 0]], [[0, 1], [0, 1]],
         [[1, 0], [1, 0]], [[0, 1], [0, 1]],
         [[1, 0], [0, 0]], [[0, 1], [0, 0]],
         [[0, 0], [1, 0]], [[0, 0], [0, 1]]], dtype="bool")

    ITER = 2

    def __init__(self, height, width, color=8, dither=True, alpha=.6,
                 regenerate_palette=True, keyframe=.6, sensitivity=.07,
                 random_state=0):
        """Create instance for generating similar pixel arts."""
        self.height = int(height)
        self.width = int(width)
        if self.width < 1 or self.height < 1:
            raise ValueError("Result can not be smaller than 1x1 pixels.")
        self.color = int(color)
        if self.color < 2:
            raise ValueError("The minimum number of colors is 2.")
        elif self.color > 32:
            raise ValueError("The maximum number of colors is 32.")
        if dither:
            self.dither = 1 / (self.color + 1)
        else:
            self.dither = 0.
        self.alpha = float(alpha)  # threshold for opacity
        self.regenerate_palette = bool(regenerate_palette)
        self.keyframe = keyframe  # threshold for differences between keyframes
        self.sensitivity = sensitivity  # threshold for differences between parts of keyframes

        # BGM
        self.is_fitted = False
        self.random_state = int(random_state)
        self.model = BayesianGaussianMixture(
            n_components=self.color,
            max_iter=256,
            covariance_type="tied",
            weight_concentration_prior_type="dirichlet_distribution",
            mean_precision_prior=1. / 256.,
            warm_start=False,
            random_state=self.random_state)

    def convert(self, image):
        """Generate pixel art from image"""
        return self._convert(image, False, False)

    def _convert(self, image, override_adapthist=False, override_dither=False):
        """Generate pixel art from image or sequence of images"""
        # does the image have alpha channel?
        if self._is_transparent(image):
            # remove artifacts from transparent edges
            image = self._dilate(image)
            # create alpha mask
            mask = resize(image[:, :, 3], (self.height, self.width),
                          anti_aliasing=True)
            # mask for colors
            color_mask = resize(image[:, :, 3], (32, 32),
                                anti_aliasing=False).ravel()
        else:
            mask = None
            color_mask = None
        # apply adaptive contrast
        if not override_adapthist:
            image = self._fix_hist(image)
        # create sample for finding palette
        if self.regenerate_palette or not self.is_fitted:
            examples = resize(image[:, :, :3], (32, 32),
                              anti_aliasing=False).reshape(-1, 3).astype("int")
            if color_mask is not None:
                # transparent colors should be ignored
                examples = examples[color_mask >= self.alpha]
            self._fit_model(examples)
        # resize image to 4 times the desired width and height
        image = resize(
            image[:, :, :3],
            (self.height * self.ITER * 2, self.width * self.ITER * 2),
            anti_aliasing=True)
        # generate pixelated image with desired width / height
        image = self._reduce(image)
        # apply palette
        height, width, depth = image.shape
        reshaped = np.reshape(image, (height * width, depth))
        probs = self.model.predict_proba(reshaped)
        y = np.argmax(probs, axis=1)
        # increase hue and snap color values to multiples of 8
        palette = rgb2hsv(self.model.means_.reshape(-1, 1, 3))
        palette[:, :, 1] *= 1.14  # empirical magic number
        palette = hsv2rgb(palette).reshape(self.color, 3) // 8 * 8
        palette[palette == 248] = 255  # clamping // 8 * 8 would rarely allow 255 values
        # generate recolored image
        image = palette[y]
        # apply dither over threshold if it's not zero
        if not override_dither and self.dither:
            # get second best probability by removing the best one
            probs[np.arange(len(y)), y] = 0
            # get new best and values
            v = np.max(probs, axis=1) > self.dither
            y = np.argmax(probs, axis=1)
            # replace every second pixel with second best color
            pad = not bool(width % 2)
            if pad:
                # make sure to alternate between starting positions
                # bottleneck
                for i in range(0, len(image), 2):
                    i += (i // width) % 2
                    if v[i]:
                        image[i] = palette[y[i]]
            else:
                i = np.argwhere(v[::2]) * 2
                image[i] = palette[y[i]]
        image = np.reshape(image, (height, width, depth))
        if mask is not None:
            # use transparency from original image, but make it either 0 or 255
            mask[mask >= self.alpha] = 255
            mask[mask < self.alpha] = 0
            image = np.dstack((image, mask))  # result has lost its alpha channel
        return np.clip(image.astype("int"), 0, 255).astype("uint8")

    def convert_sequence(self, images):
        """Generates sequence of pixel arts from a list of images"""
        try:
            _ = np.array(images, dtype=float)
        except ValueError:
            # image sizes are different == setting an array element with a sequence
            raise ValueError("Shape of images in list are different.")
        # apply adaptive histogram on each
        images = [self._fix_hist(image) for image in images]
        transparent = self._is_transparent(images[0])
        keyframe_limit = self.keyframe * np.prod(images[0].shape) * 255.
        sensitivity_limit = self.sensitivity * 255.
        diff_images, key_frames = [], []
        # create new images that are just the differences between sequences
        for image in images:
            # add first image
            if diff_images:
                diff = np.abs(image[:, :, :3] - diff_images[-1][:, :, :3])
                # image is not too different, from previous one, create mask
                if np.sum(diff) < keyframe_limit:
                    diff = resize(np.mean(diff, axis=2),
                                  (self.height, self.width),
                                  anti_aliasing=True)
                    over, under = diff > sensitivity_limit, diff <= sensitivity_limit
                    diff[over], diff[under] = 255, 0.
                    diff = resize(diff, (image.shape[0], image.shape[1]),
                                  anti_aliasing=False)
                    # was the image already transparent?
                    if transparent:
                        image[:, :, 3] = diff
                    else:
                        image = np.dstack((image, diff))
                    key_frames.append(False)
                else:
                    key_frames.append(True)
            else:
                key_frames.append(True)
            # add transparency layer for keyframes also, for easier broadcasting
            if not self._is_transparent(image):
                image = np.dstack(
                    (image, np.ones((image.shape[0], image.shape[1]))))
            diff_images.append(image)
        # create a palette from all images if possible
        if self.regenerate_palette:
            warnings.warn(
                "using regenerate_palette=True will result in flickering, "
                "as the palette will be regenerated for each image!", Warning)
        else:
            self._palette_from_list(diff_images)
        # merge keyframes and differences
        last = None
        for image, key in zip(diff_images, key_frames):
            current = self._convert(image, True, ~key)  # pyxelate keyframe / change
            if last is None:
                last = current
            else:
                # merge differences to previous images
                mask = ~np.logical_xor(last[:, :, 3], current[:, :, 3])
                last[mask] = current[mask]
            # generator
            yield last.copy()

    def _palette_from_list(self, images):
        """Fit model to find palette using all images in list at once"""
        transparency = self._is_transparent(images[0])
        examples = []
        color_masks = []
        # sample from all images
        for image in images:
            examples.append(
                resize(image[:, :, :3], (16, 16),
                       anti_aliasing=False).reshape(-1, 3).astype("int"))
            if transparency:
                color_masks.append(
                    resize(images[0][:, :, 3], (16, 16), anti_aliasing=False))
        # concatenate to a single matrix
        examples = np.concatenate(examples)
        if transparency:
            # transparent colors should be ignored
            color_masks = np.concatenate(color_masks).ravel()
            examples = examples[color_masks >= self.alpha]
        self._fit_model(examples)

    def _fit_model(self, X):
        """Fit model while suppressing warnings from sklearn"""
        converge = True
        with warnings.catch_warnings(record=True) as w:
            # fit model
            self.model.fit(X)
            if w and w[-1].category == ConvergenceWarning:
                warnings.filterwarnings('ignore', category=ConvergenceWarning)
                converge = False
        if not converge:
            warnings.warn(
                "the model has failed to converge, try a different number "
                "of colors for better results!", Warning)
        self.is_fitted = True

    def _reduce(self, image):
        """Apply convolutions on image ITER times and generate a smaller image
        based on the highest magnitude of gradients"""

        # self is visible to decorated function
        @adapt_rgb(each_channel)
        def _wrapper(dim):
            # apply median filter for noise reduction
            dim = median(dim, square(4))
            for n in range(self.ITER):
                h, w = dim.shape
                h, w = h // 2, w // 2
                flatten = view_as_blocks(dim, (2, 2)).reshape(-1, 2, 2)
                # bottleneck
                new_image = np.fromiter(
                    (self._reduce_conv(f) for f in flatten),
                    flatten.dtype).reshape((h, w))
                if n < self.ITER - 1:
                    dim = new_image.copy()
            return new_image

        return _wrapper(image)

    def _reduce_conv(self, f):
        """The actual function that selects the right pixels based on
        the gradients 2x2 square"""
        return np.mean(f[self.SOLUTIONS[np.argmax(
            np.sum(np.multiply(self.CONVOLUTIONS,
                               f.reshape(-1, 2, 2)).reshape(-1, 4),
                   axis=1))]])

    def _dilate(self, image):
        """Dilate semi-transparent edges to remove artifacts
        (unwanted edges, caused by transparent pixels having different colors)"""

        @adapt_rgb(each_channel)
        def _wrapper(dim):
            return dilation(dim, selem=square(4))

        # use dilated pixels for semi-transparent ones
        mask = image[:, :, 3]
        alter = _wrapper(image[:, :, :3])
        image[:, :, :3][mask < self.alpha] = alter[mask < self.alpha]
        return image

    @staticmethod
    def _fix_hist(image):
        """Apply adaptive histogram"""
        image = equalize_adapthist(image) * 255 * 1.14  # empirical magic number
        image[image <= 8.] = 0.
        return image

    @staticmethod
    def _is_transparent(image):
        """Returns True if there is an additional dimension for transparency"""
        return bool(image.shape[2] == 4)
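A hypothetical sketch of the animation path above; imageio and the frame file names are assumptions, and regenerate_palette=False avoids the flicker warning by fitting one palette for all frames:

import imageio

frames = [imageio.imread(f"frame_{i}.png") for i in range(3)]  # placeholders
pyx = Pyxelate(height=48, width=48, color=8, regenerate_palette=False)
for i, art in enumerate(pyx.convert_sequence(frames)):
    imageio.imwrite(f"pixel_{i}.png", art)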
class FisherVectorGMM:
    """
    Fisher Vector derived from GMM
    ---
    Attributes
    -----------
    n_kernels: int
        number of kernels in GMM
    convars_type: str
        covariance type for GMM
    use_bayesian: bool
        whether or not to use Bayesian GMM
    gmm: GaussianMixture() or BayesianGaussianMixture()
        GMM instance in sklearn
    means: np.array()
        means learned in GMM
    covars: np.array()
        covariance learned in GMM
    weights: np.array()
        weights learned in GMM
    ---------------------------------------
    Functions
    -----------
    fit(): public
        fit raw data into GMM
    predict(): public
        predict FV for one video (variable frames)
    predict_alternative(): public
        predict FV for one video (variable frames), alternative, not validated
    save(): public
        save GMM model into external file
    load(): public
        load GMM model from external file
    """

    def __init__(self, n_kernels=1, convars_type='diag', use_bayesian=False):
        # para n_kernels:
        # para convars_type:
        # para use_bayesian:
        assert convars_type in ['diag', 'full']
        assert n_kernels >= 0  # == 0 dummy instance

        self.name = 'kernels%d_convars%s_bayes%d' % (n_kernels, convars_type,
                                                     use_bayesian)
        self.n_kernels = n_kernels
        self.convars_type = convars_type
        self.use_bayesian = use_bayesian
        self.fitted = False
        self.config = json.load(open('./config/model.json', 'r'))['fisher_vector']
        self.save_dir = self.config['save_dir']
        self.data_dir = self.config['data_dir']
        self.means = None
        self.covars = None
        self.weights = None
        if not self.use_bayesian:
            self.gmm = GaussianMixture(n_components=self.n_kernels,
                                       covariance_type=self.convars_type,
                                       max_iter=1000,
                                       verbose=2)
        else:
            self.gmm = BayesianGaussianMixture(
                n_components=self.n_kernels,
                covariance_type=self.convars_type,
                max_iter=1000,
                verbose=2)

    def fit(self, X):
        # para X: shape [n_frames, n_features, n_feature_dim]
        # if os.path.isfile(os.path.join(self.save_dir, self.name, 'gmm.model')):
        #     print("\nmodel already trained ---", self.name)
        #     self.load()
        #     return
        # elif not os.path.isdir(os.path.join(self.save_dir, self.name)):
        #     os.mkdir(os.path.join(self.save_dir, self.name))
        self.feature_dim = X.shape[-1]
        # X = X.reshape(-1, X.shape[-1])
        print("\nfitting data into GMM with %d kernels" % self.n_kernels)
        self.gmm.fit(X)
        self.means = self.gmm.means_
        self.covars = self.gmm.covariances_
        self.weights = self.gmm.weights_
        print("\nfitting completed")
        # if cov_type is diagonal - make sure that covars holds a diagonal matrix
        if self.convars_type == 'diag':
            cov_matrices = np.empty(
                shape=(self.n_kernels, self.covars.shape[1],
                       self.covars.shape[1]))
            for i in range(self.n_kernels):
                cov_matrices[i, :, :] = np.diag(self.covars[i, :])
            self.covars = cov_matrices
        assert self.covars.ndim == 3
        print("\nmodel trained ---", self.name)
        # self.save()

    def score(self, X):
        return self.gmm.score(X.reshape(-1, X.shape[-1]))

    def predict(self, X, normalized=True):
        # para X: shape [n_frames, n_feature_dim]
        assert X.ndim == 2
        assert X.shape[0] >= self.n_kernels, \
            'n_frames should be greater than n_kernels'
        print("\ninferring fisher vectors with given GMM ...")
        X_matrix = X.reshape(-1, X.shape[-1])  # [n_frames, n_feature_dim]
        # set equal weights to predict likelihood ratio
        self.gmm.weights_ = np.ones(self.n_kernels) / self.n_kernels
        likelihood_ratio = self.gmm.predict_proba(X_matrix).reshape(
            X.shape[0], self.n_kernels)  # [n_frames, n_kernels]
        var = np.diagonal(self.covars, axis1=1, axis2=2)  # [n_kernels, n_feature_dim]
        # decrease the memory use
        norm_dev_from_modes = np.tile(X[:, None, :], (1, self.n_kernels, 1))
        np.subtract(norm_dev_from_modes, self.means[None, :],
                    out=norm_dev_from_modes)
        np.divide(norm_dev_from_modes, var[None, :], out=norm_dev_from_modes)
        """
        norm_dev_from_modes: (X - mean) / var
        [n_frames, n_kernels, n_feature_dim]
        """
        # mean deviation
        mean_dev = np.multiply(likelihood_ratio[:, :, None],
                               norm_dev_from_modes).mean(axis=0)  # [n_kernels, n_feature_dim]
        mean_dev = np.multiply(1 / np.sqrt(self.weights[:, None]),
                               mean_dev)  # [n_kernels, n_feature_dim]
        # covariance deviation
        cov_dev = np.multiply(likelihood_ratio[:, :, None],
                              norm_dev_from_modes**2 - 1).mean(axis=0)  # [n_kernels, n_feature_dim]
        cov_dev = np.multiply(1 / np.sqrt(2 * self.weights[:, None]),
                              cov_dev)  # [n_kernels, n_feature_dim]
        # stack vectors of mean and covariance
        fisher_vector = np.concatenate([mean_dev, cov_dev], axis=1)
        if normalized:
            fisher_vector = np.sqrt(np.abs(fisher_vector)) * np.sign(fisher_vector)  # power normalization
            fisher_vector = fisher_vector / np.linalg.norm(fisher_vector, axis=0)  # L2 normalization
        # fisher_vector[fisher_vector < 10**-4] = 0  # threshold
        print("\ninferring completed.")
        assert fisher_vector.ndim == 2
        return fisher_vector

    def predict_alternative(self, X, normalized=True):
        X = np.atleast_2d(X)
        N = X.shape[0]
        # Compute posterior probabilities.
        Q = self.gmm.predict_proba(X)  # NxK
        # Compute the sufficient statistics of descriptors.
        Q_sum = np.sum(Q, 0)[:, np.newaxis] / N
        Q_X = np.dot(Q.T, X) / N
        Q_XX_2 = np.dot(Q.T, X**2) / N
        # compute derivatives with respect to mixing weights, means and variances.
        d_pi = Q_sum.squeeze() - self.gmm.weights_
        d_mu = Q_X - Q_sum * self.gmm.means_
        d_sigma = (-Q_XX_2 - Q_sum * self.gmm.means_**2 +
                   Q_sum * self.gmm.covariances_ + 2 * Q_X * self.gmm.means_)
        # merge derivatives into a vector.
        fisher_vector = np.hstack((d_pi, d_mu.flatten(), d_sigma.flatten()))
        if normalized:
            fisher_vector = np.sqrt(np.abs(fisher_vector)) * np.sign(fisher_vector)  # power normalization
            fisher_vector = fisher_vector / np.linalg.norm(fisher_vector, axis=0)  # L2 norm
        return fisher_vector

    def save(self):
        with open(os.path.join(self.save_dir, self.name, 'gmm.model'),
                  'wb') as out_gmm:
            pickle.dump(self.gmm, out_gmm, protocol=3)
        with open(os.path.join(self.save_dir, self.name, 'covars.data'),
                  'wb') as out_covars:
            pickle.dump(self.covars, out_covars, protocol=3)
        print("\nmodel saved. --- ", self.name)

    def load(self):
        with open(os.path.join(self.save_dir, self.name, 'gmm.model'),
                  'rb') as in_gmm:
            self.gmm = pickle.load(in_gmm)
        with open(os.path.join(self.save_dir, self.name, 'covars.data'),
                  'rb') as in_covars:
            self.covars = pickle.load(in_covars)
        if not self.use_bayesian:
            assert isinstance(self.gmm, GaussianMixture)
        else:
            assert isinstance(self.gmm, BayesianGaussianMixture)
        self.means = self.gmm.means_
        self.weights = self.gmm.weights_
        print("\nmodel loaded. --- ", self.name)

    def save_vector(self, fisher_vector, partition, dynamics=False, label=False):
        if not label:
            filename = 'vector_%s_%d' % (partition, self.n_kernels) \
                if dynamics else 'fisher_vector_%s_%d' % (partition, self.n_kernels)
            np.save(os.path.join(self.data_dir, filename), fisher_vector)
        else:
            filename = 'label_%s' % partition
            np.save(os.path.join(self.data_dir, filename), fisher_vector)

    def load_vector(self, partition, dynamics=False, label=False, bic=False):
        if not label:
            if not bic:
                filename = 'vector_%s_%d.npy' % (partition, self.n_kernels) \
                    if dynamics else 'fisher_vector_%s_%d.npy' % (partition, self.n_kernels)
            else:
                filename = 'vector_%s_0.npy' % partition \
                    if dynamics else 'fisher_vector_%s_0.npy' % partition
            fisher_vector = np.load(os.path.join(self.data_dir, filename),
                                    allow_pickle=True)
            return fisher_vector
        else:
            filename = 'label_%s.npy' % partition
            label = np.load(os.path.join(self.data_dir, filename))
            return label
df[(df['Class'] == 3) & (df['Similarity'] >= 0.99)]['OBO'].mean()
df[(df['pct_1'] >= 0.95) & (df['pct_2'] >= 0.95)]['OBO'].mean()
df[(df['pct'] >= 0.95)]['OBO'].mean()

# ================================================
ddgmm = BayesianGaussianMixture(
    n_components=5,
    covariance_type='full',
    weight_concentration_prior=100,
    weight_concentration_prior_type="dirichlet_distribution",
    max_iter=100,
    random_state=1337).fit(X)
pred = ddgmm.predict(X)
df_train['Class'] = pred
df_train['Class'].value_counts()

dpgmm = BayesianGaussianMixture(
    n_components=5,
    covariance_type='full',
    weight_concentration_prior=1,
    weight_concentration_prior_type='dirichlet_process',
    max_iter=100,
    random_state=1337).fit(X)
pred = dpgmm.predict(X)
df_train['Class'] = pred
df_train['Class'].value_counts()

dpgmm.predict(X_test)
dpgmm.predict_proba(X_test)
import pandas as pd
import sqlite3
from sklearn.mixture import BayesianGaussianMixture
from sklearn.model_selection import train_test_split

conn = sqlite3.connect('../data_collection/binary.db')
query = (
    "select s._id, s.Title, s.Artist, s.Album, s.Acousticness, s.Danceability, s.Energy, s.Instrumentalness, "
    "s.MusicalKey, s.Liveness, s.Tempo, s.Valence, t.Name as Tag from Relationship as r\n"
    "join Song as s on s._id = r.Song_id\n"
    "join Tag as t on t._id = r.Tag_id;")
df = pd.read_sql_query(query, conn)

X = df.drop(
    labels=['_id', 'Title', 'Artist', 'Album', 'MusicalKey', 'Tempo', 'Tag'],
    axis=1)
y = df['Tag']
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.5,
                                                    random_state=42)

k_value = 2
bgmm = BayesianGaussianMixture(n_components=k_value)
bgmm.fit(X_train)
probs = bgmm.predict_proba(X_test)
def main():
    """
    Get data from db and save it as csv
    """
    bq = BQHandler()
    io = IO(gs_bucket=options.gs_bucket)
    viz = Viz(io)

    starttime, endtime = io.get_dates(options)
    logging.info('Using dataset {} and time range {} - {}'.format(
        options.feature_dataset, starttime.strftime('%Y-%m-%d'),
        endtime.strftime('%Y-%m-%d')))

    all_param_names = options.label_params + options.feature_params + options.meta_params
    aggs = io.get_aggs_from_param_names(options.feature_params)

    if options.model == 'bgm':
        model = BayesianGaussianMixture(
            weight_concentration_prior_type="dirichlet_process",
            n_components=options.n_components)
    elif options.model == 'gaussiannb':
        model = GaussianNB()
    elif options.model == 'rfc':
        model = RandomForestClassifier(n_jobs=-1)
    elif options.model == 'svc':
        params = {'kernel': 'rbf', 'gamma': 0.5, 'C': 1, 'probability': True}
        model = SVC(**params)
    else:
        raise ValueError(
            'Model not specified or wrong. Add for example "model: bgm" to config file.'
        )

    if options.pca:
        ipca = IncrementalPCA(n_components=options.pca_components,
                              whiten=options.whiten,
                              copy=False)

    sum_columns = ['delay']
    if options.reason_code_table is not None:
        sum_columns = ['count']

    logging.info('Reading data...')
    data = bq.get_rows(starttime,
                       endtime,
                       loc_col='trainstation',
                       project=options.project,
                       dataset=options.feature_dataset,
                       table=options.feature_table,
                       parameters=all_param_names,
                       reason_code_table=options.reason_code_table,
                       only_winters=options.only_winters)

    data = io.filter_train_type(labels_df=data,
                                train_types=options.train_types,
                                sum_types=True,
                                train_type_column='train_type',
                                location_column='trainstation',
                                time_column='time',
                                sum_columns=sum_columns,
                                aggs=aggs)

    # Sorting is actually not necessary. It's been useful for debugging.
    data.sort_values(by=['time', 'trainstation'], inplace=True)
    data.set_index('time', inplace=True)
    logging.info('Data contain {} rows...'.format(len(data)))

    logging.info('Adding binary class to the dataset with limit {}...'.format(
        options.delay_limit))
    # logging.info('Adding binary class to the dataset with limit {}...'.format(limit))
    # data['class'] = data['count'].map(lambda x: 1 if x > options.delay_count_limit else -1)
    data['class'] = data['delay'].map(
        lambda x: 1 if x > options.delay_limit else -1)
    io.log_class_dist(data.loc[:, 'class'].values, labels=[-1, 1])

    if options.balance:
        logging.info('Balancing dataset...')
        count = data.groupby('class').size().min()
        data = pd.concat([
            data.loc[data['class'] == -1].sample(n=count),
            data.loc[data['class'] == 1].sample(n=count)
        ])
        io.log_class_dist(data.loc[:, 'class'].values, labels=[-1, 1])

    if options.month:
        logging.info('Adding month to the dataset...')
        data['month'] = data.index.map(lambda x: x.month)
        options.feature_params.append('month')

    target = data.loc[:, 'class'].astype(np.int32).values.ravel()
    features = data.loc[:, options.feature_params].astype(np.float32).values

    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        target,
                                                        test_size=0.3)

    if options.normalize:
        logging.info('Normalizing data...')
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    logging.debug('Features shape after pre-processing: {}'.format(
        X_train.shape))

    if options.cv:
        logging.info('Doing random search for hyper parameters...')
        if options.model == 'bgm':
            param_grid = {
                "n_components": [1, 2, 4, 8, 16],
                "covariance_type": ['full', 'tied', 'diag', 'spherical'],
                "init_params": ['kmeans', 'random']
            }
        elif options.model == 'rfc':
            raise NotImplementedError("Not implemented. Get back to work!")
        elif options.model == 'svc':
            features_combinations = [
                [
                    'lat', 'lon', 'pressure', 'max_temperature',
                    'min_temperature', 'mean_temperature', 'mean_dewpoint',
                    'mean_humidity', 'mean_winddirection', 'mean_windspeedms',
                    'max_windgust', 'max_precipitation1h', 'max_snowdepth',
                    'max_n', 'min_vis', 'min_clhb', 'max_precipitation3h'
                ],
                [
                    'pressure', 'max_temperature', 'min_temperature',
                    'mean_temperature', 'mean_dewpoint', 'mean_humidity',
                    'mean_winddirection', 'mean_windspeedms', 'max_windgust',
                    'max_precipitation1h', 'max_snowdepth', 'max_n',
                    'min_vis', 'min_clhb', 'max_precipitation3h'
                ],
                [
                    'pressure', 'min_temperature', 'mean_dewpoint',
                    'mean_winddirection', 'mean_windspeedms', 'max_windgust',
                    'max_precipitation1h', 'max_snowdepth', 'max_n',
                    'min_vis', 'min_clhb', 'max_precipitation3h'
                ],
                [
                    'pressure', 'min_temperature', 'mean_dewpoint',
                    'mean_winddirection', 'mean_windspeedms', 'max_snowdepth',
                    'max_n', 'min_vis', 'min_clhb', 'max_precipitation3h'
                ],
                [
                    'pressure', 'min_temperature', 'mean_dewpoint',
                    'mean_winddirection', 'mean_windspeedms', 'max_snowdepth',
                    'max_n', 'min_vis', 'min_clhb', 'max_precipitation1h'
                ],
                [
                    'pressure', 'min_temperature', 'mean_dewpoint',
                    'mean_winddirection', 'mean_windspeedms', 'max_snowdepth',
                    'min_vis', 'max_precipitation1h'
                ],
                [
                    'pressure', 'min_temperature', 'mean_winddirection',
                    'mean_windspeedms', 'max_snowdepth', 'max_precipitation1h'
                ]
            ]
            param_grid = {
                "C": [0.0001, 0.001, 0.01, 0.1, 1],
                "kernel": ['rbf', 'poly'],
                "degree": [2, 3],
                "gamma": [0.5],
                "coef0": [0.1],
                "probability": [True],
                "features": features_combinations
            }
            from lib.svc import SVCF
            model = SVCF(all_features=options.feature_params)
        else:
            raise ValueError(
                "No param_grid set for given model ({})".format(options.model))

        print(model.get_params().keys())
        ftwo_scorer = make_scorer(fbeta_score, beta=2)
        scoring = {
            'accuracy': 'accuracy',
            'precision': 'precision',
            'recall': 'recall',
            'f1': 'f1',
            'f2': ftwo_scorer
        }
        random_search = RandomizedSearchCV(model,
                                           param_distributions=param_grid,
                                           n_iter=int(options.n_iter_search),
                                           verbose=1,
                                           scoring=scoring,
                                           refit='recall',
                                           n_jobs=-1)
        random_search.fit(X_train, y_train)
        logging.info("RandomizedSearchCV done.")

        scores = ['accuracy', 'precision', 'recall', 'f1', 'f2']
        fname = options.output_path + '/random_search_cv_results.txt'
        io.report_cv_results(random_search.cv_results_,
                             scores=scores,
                             filename=fname,
                             ext_filename=fname)

        model = random_search.best_estimator_
        io.save_scikit_model(model,
                             filename=options.save_file,
                             ext_filename=options.save_file)
        if options.normalize:
            fname = options.save_path + '/xscaler.pkl'
            io.save_scikit_model(scaler, filename=fname, ext_filename=fname)
    else:
        logging.info('Training...')
        model.fit(X_train, y_train)

        # Save model and xscaler (no reason to save xscaler before
        # the model has fitted as well)
        io.save_scikit_model(model,
                             filename=options.save_file,
                             ext_filename=options.save_file)
        if options.normalize:
            fname = options.save_path + '/xscaler.pkl'
            io.save_scikit_model(scaler, filename=fname, ext_filename=fname)

    # Metrics
    y_pred_proba = model.predict_proba(X_test)
    y_pred = np.argmax(y_pred_proba, axis=1)
    y_pred[y_pred == 0] = -1  # we want [-1, 1] classes, matching the y values

    acc = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='binary')
    recall = recall_score(y_test, y_pred, average='binary')
    f1 = f1_score(y_test, y_pred, average='binary')

    logging.info('Accuracy: {}'.format(acc))
    logging.info('Precision: {}'.format(precision))
    logging.info('Recall: {}'.format(recall))
    logging.info('F1 score: {}'.format(f1))
    io.log_class_dist(y_pred, labels=[-1, 1])

    error_data = {
        'acc': [acc],
        'precision': [precision],
        'recall': [recall],
        'f1': [f1]
    }
    fname = '{}/training_time_validation_errors.csv'.format(options.output_path)
    io.write_csv(error_data, filename=fname, ext_filename=fname)

    # Confusion matrices
    fname = '{}/confusion_matrix_validation.png'.format(options.output_path)
    viz.plot_confusion_matrix(y_test, y_pred, np.arange(2), filename=fname)

    fname = '{}/confusion_matrix_validation_normalised.png'.format(
        options.output_path)
    viz.plot_confusion_matrix(y_test, y_pred, np.arange(2), True,
                              filename=fname)

    # Precision-recall curve
    fname = '{}/precision-recall-curve.png'.format(options.output_path)
    viz.prec_rec_curve(y_test, y_pred_proba, filename=fname)

    # ROC
    fname = '{}/roc.png'.format(options.output_path)
    viz.plot_binary_roc(y_test, y_pred_proba, filename=fname)

    ############################################################################
    # EVALUATE
    ############################################################################
    if options.evaluate:
        logging.info('Loading test data...')
        test_data = bq.get_rows(dt.datetime.strptime('2010-01-01', "%Y-%m-%d"),
                                dt.datetime.strptime('2019-01-01', "%Y-%m-%d"),
                                loc_col='trainstation',
                                project=options.project,
                                dataset=options.feature_dataset,
                                table=options.test_table,
                                parameters=all_param_names)

        test_data = io.filter_train_type(labels_df=test_data,
                                         train_types=['K', 'L'],
                                         sum_types=True,
                                         train_type_column='train_type',
                                         location_column='trainstation',
                                         time_column='time',
                                         sum_columns=['delay'],
                                         aggs=aggs)

        # Sorting is actually not necessary. It's been useful for debugging.
        test_data.sort_values(by=['time', 'trainstation'], inplace=True)
        test_data.set_index('time', inplace=True)
        logging.info('Test data contain {} rows...'.format(len(test_data)))

        logging.info(
            'Adding binary class to the test dataset with limit {}...'.format(
                options.delay_limit))
        test_data['class'] = test_data['delay'].map(
            lambda x: 1 if x > options.delay_limit else -1)
        io.log_class_dist(test_data.loc[:, 'class'].values, labels=[-1, 1])

        if options.month:
            logging.info('Adding month to the test dataset...')
            test_data['month'] = test_data.index.map(lambda x: x.month)

        times = [('2011-02-01', '2011-03-01'), ('2016-06-01', '2016-07-01'),
                 ('2017-02-01', '2017-03-01'), ('2011-02-01', '2017-03-01')]
        for start, end in times:
            try:
                y_pred_proba, y_pred, y = predict_timerange(
                    test_data, options.feature_params, model, scaler,
                    start, end)
                perf_metrics(y_pred_proba, y_pred, y, start, end, viz, io)
            except EmptyDataError:
                logging.info('No data for {} - {}'.format(start, end))
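predict_timerange is called in the evaluation loop above but not shown. A minimal sketch under the assumption that it slices the time-indexed frame, applies the scaler, and returns probabilities, [-1, 1] predictions, and true labels; EmptyDataError is taken to be pandas' own:

import numpy as np
from pandas.errors import EmptyDataError

def predict_timerange(test_data, feature_params, model, scaler, start, end):
    # Assumed behaviour, not the original implementation.
    df = test_data.loc[start:end]
    if df.empty:
        raise EmptyDataError('No rows in {} - {}'.format(start, end))
    X = df.loc[:, feature_params].astype(np.float32).values
    if scaler is not None:
        X = scaler.transform(X)
    y_pred_proba = model.predict_proba(X)
    y_pred = np.argmax(y_pred_proba, axis=1)
    y_pred[y_pred == 0] = -1  # map back to [-1, 1]
    y = df.loc[:, 'class'].astype(np.int32).values.ravel()
    return y_pred_proba, y_pred, y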
scaler = StandardScaler()
X = scaler.fit_transform(X)
# 1 corresponds to data_thr.rate and 4=5-1 to data_thr.rateC
w = w / np.sqrt(scaler.var_[1:])
# w = np.exp(-np.exp(3 * w.mean(axis=1)))
w = 1. / w.mean(axis=1) ** 2

Html_file = open("gmm_sklearn_files/gmm3_sklearn.html", "w")

# alpha_prior / beta_prior only existed in a pre-release API; the released
# estimator takes weight_concentration_prior / mean_precision_prior instead
gmm = BayesianGaussianMixture(n_components=3,
                              weight_concentration_prior=0.1,
                              mean_precision_prior=1,
                              n_init=5)
gmm.fit(X)  # sample weights (weights=w) are not implemented in sklearn yet
preds = gmm.predict(X)
probs = gmm.predict_proba(X)

data_thr['preds'] = pd.Series(preds).astype("category")
color_key = ["red", "blue", "yellow", "grey", "black", "purple", "pink",
             "brown", "green", "orange"]  # Spectral9
color_key = color_key[:len(set(preds)) + 1]

covs = gmm.covariances_
means = gmm.means_

# transform cov for non-standardized data:
covs = np.array([np.dot(np.diag(np.sqrt(scaler.var_)),
                        np.dot(covs[j], np.diag(np.sqrt(scaler.var_))))
                 for j in range(covs.shape[0])])
means = np.array([scaler.inverse_transform(means[j].reshape(1, -1)).T
                  for j in range(means.shape[0])])  # (mirrors the covs loop above)
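# Sketch of the back-transform above: if X was standardized with
# StandardScaler, a covariance C fitted in scaled space maps back to the
# original space as S @ C @ S with S = diag(scaler.scale_), and means map back
# via inverse_transform. Synthetic data only; not the snippet's actual inputs.
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import BayesianGaussianMixture

rng = np.random.RandomState(0)
X_raw = rng.randn(200, 2) * [3.0, 0.5] + [10.0, -2.0]
scaler_demo = StandardScaler()
X_std = scaler_demo.fit_transform(X_raw)

bgm_demo = BayesianGaussianMixture(n_components=2, random_state=0).fit(X_std)
S = np.diag(scaler_demo.scale_)  # scale_ == sqrt(var_)
covs_orig = np.array([S @ C @ S for C in bgm_demo.covariances_])
means_orig = scaler_demo.inverse_transform(bgm_demo.means_)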
def _st_smooth(self, var_idx, x_v, y_v=None, n_component=1,
               thresh_hold=0.3, dp=False):
    mixture_dist = []
    for task_idx in range(self.num_task):
        if y_v is not None:
            mean = self.params_mean[task_idx][var_idx][x_v][y_v]
            var = self.transform_var(
                self.params_var[task_idx][var_idx][x_v][y_v])
        else:
            mean = self.params_mean[task_idx][var_idx][x_v]
            var = self.transform_var(self.params_var[task_idx][var_idx][x_v])
        mixture_dist.append({'kwargs': {'loc': mean, 'scale': var}})

    alpha = 0.3
    alpha_list = [(1 - alpha) / (self.num_task - 1)] * (self.num_task - 1)
    alpha_list.append(alpha)
    sample = create_mixture(mixture_dist, alpha_list=alpha_list)

    if dp:
        gmm = DPGMM(max_iter=1000, n_components=n_component,
                    covariance_type='spherical')
    else:
        gmm = GMM(max_iter=500, n_components=n_component,
                  covariance_type='spherical')
    gmm.fit(sample)

    new_idx_list = []
    for task_idx in range(self.num_task):
        if y_v is not None:
            predict_probability = gmm.predict_proba(
                np.array(
                    self.params_mean[task_idx][var_idx][x_v][y_v]).reshape(
                        -1, 1))
        else:
            predict_probability = gmm.predict_proba(
                np.array(self.params_mean[task_idx][var_idx][x_v]).reshape(
                    -1, 1))

        # keep the most probable component whose mixture weight clears the
        # threshold; zero out under-weighted candidates and retry
        f_ = True
        while f_:
            if gmm.weights_[np.argmax(predict_probability)] > thresh_hold:
                new_idx = np.argmax(predict_probability)
                f_ = False
            else:
                predict_probability[0][np.argmax(predict_probability)] = 0.0
                # self.num_merged_params += 1

        if new_idx in new_idx_list:
            self.num_merged_params += 1
        new_idx_list.append(new_idx)

        if y_v is not None:
            self.params_mean[task_idx][var_idx][x_v][y_v] = gmm.means_[new_idx]
            self.params_var[task_idx][var_idx][x_v][y_v] = \
                self.retransform_var(gmm.covariances_[new_idx])
        else:
            self.params_mean[task_idx][var_idx][x_v] = gmm.means_[new_idx]
            self.params_var[task_idx][var_idx][x_v] = self.retransform_var(
                gmm.covariances_[new_idx])
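# Standalone sketch of the threshold rule in _st_smooth: prefer the
# highest-posterior component, but skip components whose mixture weight is
# below thresh_hold by zeroing their posterior and re-taking the argmax.
# gmm_demo and x are placeholders, not the class's state; the loop assumes at
# least one component's weight clears the threshold.
import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.RandomState(0)
sample = np.concatenate([rng.normal(0, 1, 100), rng.normal(5, 1, 100)])
gmm_demo = GaussianMixture(n_components=2, covariance_type='spherical',
                           random_state=0).fit(sample.reshape(-1, 1))

x = np.array([[4.8]])
post = gmm_demo.predict_proba(x)[0].copy()
thresh_hold = 0.3
while gmm_demo.weights_[np.argmax(post)] <= thresh_hold:
    post[np.argmax(post)] = 0.0  # discard under-weighted component
new_idx = int(np.argmax(post))   # component used for the merged parameter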
class Pyxelate:

    CONVOLUTIONS = np.array([
        [[2, 2], [2, 2]],
        [[11, -1], [-1, -1]],
        [[-1, 11], [-1, -1]],
        [[-1, -1], [11, -1]],
        [[-1, -1], [-1, 11]],
        [[5, 5], [-1, -1]],
        [[-1, -1], [5, 5]],
        [[5, -1], [5, -1]],
        [[-1, 5], [-1, 5]],
        [[5, -1], [-1, 5]],
        [[-1, 5], [5, -1]],
        [[-1, 3], [3, 3]],
        [[3, -1], [3, 3]],
        [[3, 3], [-1, 3]],
        [[3, 3], [3, -1]]
    ], dtype="int")

    SOLUTIONS = np.array([
        [[1, 1], [1, 1]],
        [[0, 1], [1, 1]],
        [[1, 0], [1, 1]],
        [[1, 1], [0, 1]],
        [[1, 1], [1, 0]],
        [[1, 1], [0, 0]],
        [[0, 0], [1, 1]],
        [[1, 0], [1, 0]],
        [[0, 1], [0, 1]],
        [[1, 0], [1, 0]],
        [[0, 1], [0, 1]],
        [[1, 0], [0, 0]],
        [[0, 1], [0, 0]],
        [[0, 0], [1, 0]],
        [[0, 0], [0, 1]],
    ], dtype="bool")

    ITER = 2

    def __init__(self, height, width, color=8, dither=True, alpha=.6,
                 regenerate_palette=True, random_state=0):
        """Create instance for generating similar pixel arts."""
        self.height = int(height)
        self.width = int(width)
        if self.width < 1 or self.height < 1:
            raise ValueError("Result can not be smaller than 1x1 pixels.")

        self.color = int(color)
        if self.color < 2:
            raise ValueError("The minimum number of colors is 2.")
        elif self.color > 32:
            raise ValueError("The maximum number of colors is 32.")

        if dither:
            self.dither = 1 / (self.color + 1)
        else:
            self.dither = 0.

        self.alpha = float(alpha)
        self.regenerate_palette = bool(regenerate_palette)

        # BGM
        self.is_fitted = False
        self.random_state = int(random_state)
        self.model = BayesianGaussianMixture(
            n_components=self.color,
            max_iter=256,
            covariance_type="tied",
            weight_concentration_prior_type="dirichlet_distribution",
            mean_precision_prior=1. / 256.,
            warm_start=False,
            random_state=self.random_state)

    def convert(self, image):
        """Generate pixel art from image"""
        # does the image have an alpha channel?
        if image.shape[2] == 4:
            # remove artifacts from transparent edges
            image = self._dilate(image)
            # create alpha mask
            mask = resize(image[:, :, 3], (self.height, self.width),
                          anti_aliasing=True)
            # mask for colors
            color_mask = resize(image[:, :, 3], (32, 32),
                                anti_aliasing=False).ravel()
        else:
            mask = None
            color_mask = None

        # apply adaptive contrast
        image = equalize_adapthist(image) * 255 * 1.14  # empirical magic number
        image[image <= 8.] = 0.
        # create sample for finding palette
        if self.regenerate_palette or not self.is_fitted:
            examples = resize(image, (32, 32),
                              anti_aliasing=False).reshape(-1, 3).astype("int")
            if color_mask is not None:
                # transparent colors should be ignored
                examples = examples[color_mask >= self.alpha]
            self._fit_model(examples)

        # resize image to 4 times the desired width and height
        image = resize(image,
                       (self.height * self.ITER * 2,
                        self.width * self.ITER * 2),
                       anti_aliasing=True)
        # generate pixelated image with desired width / height
        image = self._reduce(image)

        # apply palette
        height, width, depth = image.shape
        reshaped = np.reshape(image, (height * width, depth))
        probs = self.model.predict_proba(reshaped)
        y = np.argmax(probs, axis=1)

        # increase hue and snap color values to multiples of 8
        palette = rgb2hsv(self.model.means_.reshape(-1, 1, 3))
        palette[:, :, 1] *= 1.14  # empirical magic number
        palette = hsv2rgb(palette).reshape(self.color, 3) // 8 * 8
        palette[palette == 248] = 255  # clamping: // 8 * 8 would rarely allow 255 values

        # generate recolored image
        image = palette[y]

        # apply dither over threshold if it's not zero
        if self.dither:
            # get second best probability by removing the best one
            probs[np.arange(len(y)), y] = 0
            # get new best and values
            v = np.max(probs, axis=1)
            y = np.argmax(probs, axis=1)

            # replace every second pixel with the second best color
            pad = not bool(width % 2)
            for i in range(0, len(image), 2):
                if pad:
                    # make sure to alternate between starting positions
                    i += (i // width) % 2
                if v[i] > self.dither:
                    image[i] = palette[y[i]]

        image = np.reshape(image, (height, width, depth))
        if mask is not None:
            # use transparency from the original image, but make it either 0 or 255
            mask[mask >= self.alpha] = 255
            mask[mask < self.alpha] = 0
            # the processed result has lost its alpha channel, so re-attach it
            image = np.dstack((image, mask))

        return np.clip(image.astype("int"), 0, 255).astype("uint8")

    def palette_from_list(self, images):
        """Fit model to find palette using all images in list at once"""
        if self.regenerate_palette:
            warnings.warn(
                "Warning, regenerate_palette=True will cause the generated "
                "palette to be lost while converting images!", Warning)

        examples = []
        color_masks = []
        transparency = bool(images[0].shape[2] == 4)

        # sample from all images
        for image in images:
            image = equalize_adapthist(image) * 255 * 1.14  # empirical magic number
            image[image <= 8.] = 0.
            examples.append(
                resize(image, (16, 16),
                       anti_aliasing=False).reshape(-1, 3).astype("int"))
            if transparency:
                color_masks.append(
                    resize(images[0][:, :, 3], (16, 16), anti_aliasing=False))

        # concatenate to a single matrix
        examples = np.concatenate(examples)
        if transparency:
            # transparent colors should be ignored
            color_masks = np.concatenate(color_masks).ravel()
            examples = examples[color_masks >= self.alpha]

        self._fit_model(examples)

    def _fit_model(self, X):
        """Fit model while suppressing warnings from sklearn"""
        converge = True
        with warnings.catch_warnings(record=True) as w:
            # fit model
            self.model.fit(X)
            if w and w[-1].category == ConvergenceWarning:
                warnings.filterwarnings('ignore', category=ConvergenceWarning)
                converge = False
        if not converge:
            warnings.warn(
                "The model has failed to converge, try a different number of "
                "colors for better results!", Warning)
        self.is_fitted = True

    def _reduce(self, image):
        """Apply convolutions on image ITER times and generate a smaller image
        based on the highest magnitude of gradients"""

        # self is visible to the decorated function
        @adapt_rgb(each_channel)
        def _wrapper(dim):
            # apply median filter for noise reduction
            dim = median(dim, square(4))
            for _ in range(self.ITER):  # avoid shadowing the block index below
                h, w = dim.shape
                h, w = h // 2, w // 2
                new_image = np.zeros((h * w)).astype("int")
                view = view_as_blocks(dim, (2, 2))
                flatten = view.reshape(-1, 2, 2)
                for i, f in enumerate(flatten):
                    conv = np.abs(
                        np.sum(np.multiply(self.CONVOLUTIONS,
                                           f.reshape(-1, 2, 2)).reshape(-1, 4),
                               axis=1))
                    new_image[i] = np.mean(f[self.SOLUTIONS[np.argmax(conv)]])
                new_image = new_image.reshape((h, w))
                dim = new_image.copy()
            return new_image

        return _wrapper(image)

    def _dilate(self, image):
        """Dilate semi-transparent edges to remove artifacts
        (unwanted edges, caused by transparent pixels having different colors)"""

        @adapt_rgb(each_channel)
        def _wrapper(dim):
            return dilation(dim, selem=square(4))

        # use dilated pixels for semi-transparent ones
        mask = image[:, :, 3]
        alter = _wrapper(image[:, :, :3])
        image[:, :, :3][mask < self.alpha] = alter[mask < self.alpha]
        return image
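# Compressed sketch of the palette step in convert(): fit the BGM on sampled
# colors, then recolor each pixel with its most probable component mean. The
# pixels here are random noise purely to make the example runnable; the real
# class samples them from a resized image.
import numpy as np
from sklearn.mixture import BayesianGaussianMixture

rng = np.random.RandomState(0)
pixels = rng.randint(0, 256, size=(64, 3))  # (n_pixels, RGB)

palette_model = BayesianGaussianMixture(
    n_components=4, max_iter=256, covariance_type="tied",
    weight_concentration_prior_type="dirichlet_distribution",
    mean_precision_prior=1. / 256., random_state=0).fit(pixels)

probs = palette_model.predict_proba(pixels)
palette = palette_model.means_.astype(int)       # one RGB color per component
recolored = palette[np.argmax(probs, axis=1)]    # snap each pixel to a palette color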
# In[4]:

train_dataset = train.values
X = train_dataset[:, 2:]
y = train_dataset[:, 1]
y = y.astype('int')

test_dataset = test.values
X_test = test_dataset[:, 2:]
print(type(X_test))
print('X.shape, y.shape, X_test.shape', X.shape, y.shape, X_test.shape)

# In[5]:

# the output frame needs one row per train *and* test sample, so collect the
# IDs from both frames (the original referenced an undefined df here)
df = pd.DataFrame({"SK_ID_CURR": pd.concat([train['SK_ID_CURR'],
                                            test['SK_ID_CURR']],
                                           ignore_index=True)})

print('BayesianGaussianMixture begins****************')
bgm = BayesianGaussianMixture(n_components=2)
print('fitting****************')
bgm.fit(X)  # unsupervised: a y argument would be ignored anyway
print('predicting****************')
bgm_X_prediction = bgm.predict_proba(X)[:, 1]
bgm_X_test_prediction = bgm.predict_proba(X_test)[:, 1]

tr_te_concatenated = np.concatenate([bgm_X_prediction, bgm_X_test_prediction])
df['bayesian_gaussian_mixture'] = tr_te_concatenated
print('final tr_te shape', df.shape)

df.to_csv('bayesian_gaussian_mixture_tr_te.csv', index=False)
print(df.head())
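# Note on the snippet above: BayesianGaussianMixture is unsupervised, so
# column 1 of predict_proba is an arbitrary component, not "the positive
# class", and component order can change between runs. A minimal sketch of
# the same train+test featurization on synthetic data (names are made up):
import numpy as np
from sklearn.mixture import BayesianGaussianMixture

rng = np.random.RandomState(0)
X_tr, X_te = rng.randn(100, 5), rng.randn(40, 5)

bgm_feat = BayesianGaussianMixture(n_components=2, random_state=0).fit(X_tr)
feat = np.concatenate([bgm_feat.predict_proba(X_tr)[:, 1],
                       bgm_feat.predict_proba(X_te)[:, 1]])
assert feat.shape[0] == len(X_tr) + len(X_te)  # one feature row per sample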
def em_stereo(self, n_component=1, dp=True, thresh_hold=0.4):
    self.num_params = 0
    # The range of len(params)
    _step = 0
    for var_idx in tqdm(range(len(self.merge_var[0]))):
        for x_v in range(len(self.merge_var[0][var_idx])):
            print('Step %d' % _step, end='\r')
            _step += 1
            try:
                for y_v in range(len(self.merge_var[0][var_idx][x_v])):
                    # print('cluster weights ....%d' % var_idx)
                    dist = []
                    for task_idx in range(len(self.merge_var)):
                        nor = np.random.normal(
                            self.merge_var[task_idx][var_idx][x_v][y_v],
                            np.log(1.0 + np.exp(
                                self.merge_uncertainty[task_idx][var_idx][x_v][y_v])),
                            200)
                        dist.append(nor)
                    dist = np.array(np.asmatrix(np.concatenate(dist)).T)

                    if dp:
                        print('Initializing DPGMM %d ... ' % _step, end='\r')
                        gmm = DPGMM(max_iter=1000, n_components=n_component,
                                    covariance_type='spherical')
                    else:
                        gmm = GMM(max_iter=200, n_components=n_component,
                                  covariance_type='spherical')
                    gmm.fit(dist)

                    new_idx_list = []
                    for task_idx in range(len(self.merge_var)):
                        # Strategy 1. Set threshold
                        predict_probability = gmm.predict_proba(
                            np.array(self.merge_var[task_idx][var_idx][x_v][y_v]).reshape(-1, 1))
                        f_ = True
                        while f_:
                            # if gmm.weights_[np.argmax(predict_probability)] > (1 / len(self.merge_var)):
                            if gmm.weights_[np.argmax(predict_probability)] > thresh_hold:
                                new_idx = np.argmax(predict_probability)
                                f_ = False
                            else:
                                predict_probability[0][np.argmax(predict_probability)] = 0.0
                                self.num_params += 1
                        # Strategy 2 (hard assignment), kept for reference:
                        # new_idx = gmm.predict(np.array(
                        #     self.merge_var[task_idx][var_idx][x_v][y_v]).reshape(-1, 1))

                        if new_idx in new_idx_list:
                            self.num_params += 1
                        new_idx_list.append(new_idx)
                        self.merge_var[task_idx][var_idx][x_v][y_v] = gmm.means_[new_idx]
                        self.merge_uncertainty[task_idx][var_idx][x_v][y_v] = \
                            np.log(np.exp(gmm.covariances_[new_idx]) - 1.0)
            except TypeError:
                # 1-D parameter (no y dimension): same procedure without y_v
                dist = []
                for task_idx in range(len(self.merge_var)):
                    nor = np.random.normal(
                        self.merge_var[task_idx][var_idx][x_v],
                        np.log(1.0 + np.exp(
                            self.merge_uncertainty[task_idx][var_idx][x_v])),
                        200)
                    dist.append(nor)
                dist = np.array(np.asmatrix(np.concatenate(dist)).T)

                if dp:
                    print('Initializing DPGMM %d ... ' % _step, end='\r')
                    gmm = DPGMM(max_iter=200, n_components=n_component,
                                covariance_type='spherical')
                else:
                    gmm = GMM(max_iter=200, n_components=n_component,
                              covariance_type='spherical')
                gmm.fit(dist)

                new_idx_list = []
                for task_idx in range(len(self.merge_var)):
                    # Strategy 1. Set threshold
                    predict_probability = gmm.predict_proba(
                        np.array(self.merge_var[task_idx][var_idx][x_v]).reshape(-1, 1))
                    f_ = True
                    while f_:
                        # if gmm.weights_[np.argmax(predict_probability)] > (1 / len(self.merge_var)):
                        if gmm.weights_[np.argmax(predict_probability)] > thresh_hold:
                            new_idx = np.argmax(predict_probability)
                            f_ = False
                        else:
                            predict_probability[0][np.argmax(predict_probability)] = 0.0
                            self.num_params += 1
                    # Strategy 2 (hard assignment), kept for reference:
                    # new_idx = gmm.predict(np.array(
                    #     self.merge_var[task_idx][var_idx][x_v]).reshape(-1, 1))
                    # if new_idx in new_idx_list:
                    #     self.num_params += 1

                    new_idx_list.append(new_idx)
                    self.merge_var[task_idx][var_idx][x_v] = gmm.means_[new_idx]
                    self.merge_uncertainty[task_idx][var_idx][x_v] = \
                        np.log(np.exp(gmm.covariances_[new_idx]) - 1.0)
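# em_stereo stores uncertainties through a softplus transform: it samples with
# sigma = log(1 + exp(u)) and writes back u = log(exp(sigma) - 1). A quick
# round-trip check of that transform pair (helper names are illustrative):
import numpy as np

def softplus(u):
    # sigma = log(1 + exp(u)), as used before np.random.normal above
    return np.log1p(np.exp(u))

def inv_softplus(s):
    # u = log(exp(sigma) - 1), as used when writing gmm.covariances_ back
    return np.log(np.expm1(s))

u = np.array([-2.0, 0.1, 3.0])
assert np.allclose(inv_softplus(softplus(u)), u)  # transforms are inverses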