Example #1
def transform(self, X, n_components=None):
  # ====== check number of components ====== #
  # specified percentage of explained variance
  if n_components is not None:
    # percentage of variances
    if n_components < 1.:
      _ = np.cumsum(self.explained_variance_ratio_)
      n_components = (_ > n_components).nonzero()[0][0] + 1
    # specific number of components
    else:
      n_components = int(n_components)
  # ====== other info ====== #
  n = X.shape[0]
  if self.batch_size is None:
    batch_size = 12 * len(self.mean_)
  else:
    batch_size = self.batch_size
  # ====== start transforming ====== #
  X_transformed = []
  for start, end in minibatch(n=n, batch_size=batch_size):
    x = super(MiniBatchPCA, self).transform(X=X[start:end])
    if n_components is not None:
      x = x[:, :n_components]
    X_transformed.append(x)
  return np.concatenate(X_transformed, axis=0)
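A minimal usage sketch of the fractional versus integer `n_components` handling above (assuming `pca` is an already-fitted MiniBatchPCA instance and `X` is a 2-D numpy array; both names are placeholders):

# keep enough components to cover 95% of the explained variance
X_95 = pca.transform(X, n_components=0.95)
# or request a fixed number of components
X_32 = pca.transform(X, n_components=32)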
Example #2
def is_binary(x: np.ndarray):
  r""" A binary array only contain 0 or 1 """
  for s, e in minibatch(batch_size=1024, n=len(x)):
    y = x[s:e]
    if not np.all(np.isin(np.unique(y), (0., 1.))):
      return False
  return True
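A quick usage sketch (assuming numpy is imported as np):

is_binary(np.array([0., 1., 1., 0.]))  # True
is_binary(np.array([0., 2., 1.]))      # False: 2 is neither 0 nor 1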
Example #3
def is_discrete(x: np.ndarray):
  r""" A discrete array contain only integer values """
  if not isinstance(x.dtype, np.integer):
    for s, e in minibatch(batch_size=1024, n=len(x)):
      y = x[s:e]
      if np.any(y.astype(np.int32) != y.astype(np.float32)):
        return False
  return True
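A short usage sketch (assuming numpy as np); an integer dtype skips the scan entirely:

is_discrete(np.arange(10))              # True: integer dtype
is_discrete(np.array([1.0, 2.0, 3.0]))  # True: whole-number floats
is_discrete(np.array([1.0, 2.5]))       # False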
Example #4
def sparsity_percentage(x, batch_size=1024):
  n_zeros = 0
  n_total = np.prod(x.shape)
  for start, end in minibatch(batch_size=batch_size, n=x.shape[0], seed=None):
    y = x[start:end]
    if hasattr(y, 'count_nonzero'):
      n_nonzeros = y.count_nonzero()
    else:
      n_nonzeros = np.count_nonzero(y)
    n_zeros += np.prod(y.shape) - n_nonzeros
  return n_zeros / n_total
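The `count_nonzero` branch lets the same loop handle scipy sparse matrices as well as dense arrays; a minimal sketch (scipy is an assumed, optional dependency here):

import numpy as np
import scipy.sparse as sp

dense = np.eye(100)  # only 1% of the entries are non-zero
print(sparsity_percentage(dense))                 # 0.99, dense path
print(sparsity_percentage(sp.csr_matrix(dense)))  # 0.99, via the sparse branch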
Example #5
def make_dnn_prediction(functions, X, batch_size=256, title=''):
    return_list = True
    if not isinstance(functions, (tuple, list)):
        functions = [functions]
        return_list = False
    n_functions = len(functions)
    results = [[] for i in range(n_functions)]
    # ====== prepare progress bar ====== #
    n_samples = len(X)
    prog = Progbar(target=n_samples,
                   print_summary=True,
                   name="Making prediction: %s" % str(title))
    # ====== for feeder ====== #
    if isinstance(X, F.Feeder):
        y_true = []
        for x, y in X.set_batch(batch_size=batch_size):
            for res, fn in zip(results, functions):
                res.append(fn(x))
            prog.add(x.shape[0])
            y_true.append(np.argmax(y, axis=-1) if y.ndim == 2 else y)
        results = [np.concatenate(res, axis=0) for res in results]
        y_true = np.concatenate(y_true, axis=0)
        if return_list:
            return results, y_true
        return results[0], y_true
    # ====== for numpy array ====== #
    else:
        for start, end in minibatch(batch_size=batch_size, n=n_samples):
            y = X[start:end]
            for res, fn in zip(results, functions):
                res.append(fn(y))
            prog.add(end - start)
        results = [np.concatenate(res, axis=0) for res in results]
        if return_list:
            return results
        return results[0]
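A hedged usage sketch for the numpy branch (`predict_fn`, `logit_fn` and `features` are hypothetical callables/arrays; Progbar and F.Feeder come from the surrounding odin code):

probs = make_dnn_prediction(predict_fn, X=features, batch_size=128,
                            title='test set')
# with a list of functions, a list of stacked outputs is returned
logits, probs = make_dnn_prediction([logit_fn, predict_fn], X=features)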
Example #6
def evaluate(model, ds, args):
    test = ds.create_dataset('test', batch_size=32)
    # === 1. marginalized llk
    n_mcmc = 100
    llk = []
    kl = []
    for x in tqdm(test.take(10)):
        qz = model.encode(x, training=False)
        batch_llk = []
        for s, e in minibatch(8, n_mcmc):
            n = e - s
            z = qz.sample(n)
            z = tf.reshape(z, (-1, z.shape[-1]))
            px = model.decode(z, training=False)
            # llk
            batch_llk.append(
                tf.reshape(px.log_prob(tf.tile(x, (n, 1, 1, 1))), (n, -1)))
            # kl: not computed in this snippet (the `kl` list above stays empty)
        batch_llk = tf.concat(batch_llk, 0)
        llk.append(batch_llk)
    llk = tf.concat(llk, axis=-1)
    print(llk.shape)
    print('LLK:', tf.reduce_mean(tf.reduce_logsumexp(llk, 0)))
Example #7
def fast_kmeans(
    X,
    *,
    n_clusters: int = 8,
    max_iter: int = 300,
    tol: float = 0.0001,
    n_init: int = 10,
    random_state: int = 1,
    init: Literal['scalable-k-means++', 'k-means||',
                  'random'] = 'scalable-k-means++',
    oversampling_factor: float = 2.0,
    max_samples_per_batch: int = 32768,
    framework: Literal['auto', 'cuml', 'sklearn'] = 'auto',
) -> MiniBatchKMeans:
    """KMeans clustering

  Parameters
  ----------
  n_clusters : int (default = 8)
      The number of centroids or clusters you want.
  max_iter : int (default = 300)
      The more iterations of EM, the more accurate, but slower.
  tol : float64 (default = 1e-4)
      Stopping criterion when centroid means do not change much.
  random_state : int (default = 1)
      If you want results to be the same when you restart Python, select a
      state.
  init : {'scalable-k-means++', 'k-means||', 'random' or an ndarray}
         (default = 'scalable-k-means++')
      'scalable-k-means++' or 'k-means||': Uses the fast and stable scalable
      k-means++ initialization.
      'random': Choose 'n_cluster' observations (rows) at random from data
      for the initial centroids.
      If an ndarray is passed, it should be of
      shape (n_clusters, n_features) and gives the initial centers.
  oversampling_factor : int (default = 2) The amount of points to sample
      in scalable k-means++ initialization for potential centroids.
      Increasing this value can lead to better initial centroids at the
      cost of memory. The total number of centroids sampled in scalable
      k-means++ is oversampling_factor * n_clusters * 8.
  max_samples_per_batch : int (default = 32768) The number of data
      samples to use for batches of the pairwise distance computation.
      This computation is done throughout both fit predict. The default
      should suit most cases. The total number of elements in the batched
      pairwise distance computation is max_samples_per_batch * n_clusters.
      It might become necessary to lower this number when n_clusters
      becomes prohibitively large.
  """
    kwargs = dict(locals())
    X = kwargs.pop('X')
    kwargs.pop('framework')
    ## fine-tuning the kwargs
    cuml = _check_cuml(framework)
    if cuml:
        from cuml.cluster import KMeans
        kwargs.pop('n_init')
    else:
        kwargs.pop('oversampling_factor')
        kwargs.pop('max_samples_per_batch')
        if kwargs['init'] in ('scalable-k-means++', 'k-means||'):
            kwargs['init'] = 'k-means++'
    ## fitting
    if not cuml:
        from odin.utils import minibatch
        kmean = MiniBatchKMeans(**kwargs)
        for s, e in minibatch(int(max_samples_per_batch),
                              n=X.shape[0],
                              seed=random_state):
            kmean.partial_fit(X[s:e])
    else:
        kmean = KMeans(verbose=False, **kwargs)
        kmean.fit(X)
    return kmean
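A hedged usage sketch (assuming a 2-D feature matrix `X`; with framework='auto' the cuML backend is only selected when it can be imported):

kmeans = fast_kmeans(X, n_clusters=16, framework='sklearn')
labels = kmeans.predict(X)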
Example #8
def fast_pca(
    *x,
    n_components: Optional[int] = None,
    algo: Literal['pca', 'ipca', 'ppca', 'sppca', 'plda', 'rpca'] = 'pca',
    y=None,
    batch_size: int = 1024,
    return_model: bool = False,
    random_state: int = 1,
):
  r""" A shortcut for many different PCA algorithms

  Arguments:
    x : {list, tuple}
      list of matrices to transform; the first matrix is used for fitting
    n_components : {None, int}
      number of PCA components
    algo : {'pca', 'ipca', 'ppca', 'sppca', 'plda', 'rpca'}
      different PCA algorithm:
        'ipca' - IncrementalPCA,
        'ppca' - Probabilistic PCA,
        'sppca' - Supervised Probabilistic PCA,
        'plda' - Probabilistic LDA,
        'rpca' - randomized PCA using randomized SVD
        'pca'  - Normal PCA
    y : {numpy.ndarray, None}
      labels, required when `algo` is 'sppca' or 'plda'
    batch_size : int (default: 1024)
      batch size, only used for IncrementalPCA
    return_model : bool (default: False)
      if True, return the trained PCA model as the FIRST return
  """
  try:
    from cuml.decomposition import PCA as cuPCA
  except ImportError:
    cuPCA = None

  batch_size = int(batch_size)
  algo = str(algo).lower()
  if algo not in ('pca', 'ipca', 'ppca', 'sppca', 'plda', 'rpca'):
    raise ValueError("`algo` must be one of the following: 'pca', "
                     "'ppca', 'plda', 'sppca', or 'rpca'; but given: '%s'" %
                     algo)
  if algo in ('sppca', 'plda') and y is None:
    raise RuntimeError("`y` must be not None if `algo='sppca'`")
  x = flatten_list(x, level=None)
  # ====== check input ====== #
  x_train = x[0]
  x_test = x[1:]
  input_shape = None
  if x_train.ndim > 2:  # only 2D for PCA
    input_shape = (-1,) + x_train.shape[1:]
    new_shape = (-1, np.prod(input_shape[1:]))
    x_train = np.reshape(x_train, new_shape)
    x_test = [np.reshape(x, new_shape) for x in x_test]
    if n_components is not None:  # no need to reshape back
      input_shape = None
  # ====== train PCA ====== #
  if algo == 'sppca':
    pca = SupervisedPPCA(n_components=n_components, random_state=random_state)
    pca.fit(x_train, y)
  elif algo == 'plda':
    from odin.ml import PLDA
    pca = PLDA(n_phi=n_components, random_state=random_state)
    pca.fit(x_train, y)
  elif algo == 'pca':
    if x_train.shape[1] > 1000 and x_train.shape[0] > 1e5 and cuPCA is not None:
      pca = cuPCA(n_components=n_components, random_state=random_state)
    else:
      pca = PCA(n_components=n_components, random_state=random_state)
    pca.fit(x_train)
  elif algo == 'rpca':
    # we copy the implementation of RandomizedPCA because
    # it is significantly faster than PCA(svd_solver='randomized')
    pca = RandomizedPCA(n_components=n_components,
                        iterated_power=2,
                        random_state=random_state)
    pca.fit(x_train)
  elif algo == 'ipca':
    pca = IncrementalPCA(n_components=n_components, batch_size=batch_size)
    prog = Progbar(target=x_train.shape[0],
                   print_report=False,
                   print_summary=False,
                   name="Fitting PCA")
    for start, end in minibatch(batch_size=batch_size,
                                n=x_train.shape[0],
                                seed=1234):
      pca.partial_fit(x_train[start:end], check_input=False)
      prog.add(end - start)
  elif algo == 'ppca':
    pca = PPCA(n_components=n_components, random_state=random_state)
    pca.fit(x_train)
  # ====== transform ====== #
  x_train = pca.transform(x_train)
  x_test = [pca.transform(x) for x in x_test]
  # reshape back to original shape if necessary
  if input_shape is not None:
    x_train = np.reshape(x_train, input_shape)
    x_test = [np.reshape(x, input_shape) for x in x_test]
  # return the results
  if len(x_test) == 0:
    return x_train if not return_model else (pca, x_train)
  if return_model:
    return tuple([pca, x_train] + x_test)
  del pca
  return tuple([x_train] + x_test)
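A hedged usage sketch (assuming 2-D numpy arrays; every extra positional matrix is transformed with the model fitted on the first one):

X_train_pca, X_test_pca = fast_pca(X_train, X_test, n_components=50, algo='pca')
# incremental PCA fitted in mini-batches, also returning the fitted model
pca, Z = fast_pca(X_train, n_components=50, algo='ipca',
                  batch_size=2048, return_model=True)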
Example #9
def evaluate(vae: VariationalAutoencoder,
             ds: ImageDataset,
             expdir: str,
             title: str,
             batch_size: int = 64,
             take_count: int = -1,
             n_images: int = 36,
             seed: int = 1):
    n_rows = int(np.sqrt(n_images))
    is_semi = vae.is_semi_supervised()
    is_hierarchical = vae.is_hierarchical()
    ds_kw = dict(batch_size=batch_size, label_percent=1.0, shuffle=False)
    ## prepare
    rand = np.random.RandomState(seed=seed)
    if not os.path.exists(expdir):
        os.makedirs(expdir)
    ## data for training semi-supervised
    train = ds.create_dataset('train', **ds_kw)
    (llkx_train, llky_train, x_org_train, x_rec_train, y_true_train,
     y_pred_train, z_train, pz_train) = _call(vae,
                                              ds=train,
                                              rand=rand,
                                              take_count=take_count,
                                              n_images=n_images,
                                              verbose=True)
    ## data for testing
    test = ds.create_dataset('test', **ds_kw)
    (llkx_test, llky_test, x_org_test, x_rec_test, y_true_test, y_pred_test,
     z_test, pz_test) = _call(vae,
                              ds=test,
                              rand=rand,
                              take_count=take_count,
                              n_images=n_images,
                              verbose=True)
    # === 0. plotting latent-factor pairs
    for idx, z in enumerate(z_test):
        z = z.mean()
        f = y_true_test
        corr_mat = Correlation.Spearman(z, f)  # [n_latents, n_factors]
        plot_latents_pairs(z, f, corr_mat, ds.labels)
        vs.plot_save(f'{expdir}/latent{idx}_factor.pdf', dpi=100, verbose=True)
    # === 0. latent traverse plot
    x_travs = x_org_test
    if x_travs.ndim == 3:  # grayscale image
        x_travs = np.expand_dims(x_travs, -1)
    else:  # color image
        x_travs = np.transpose(x_travs, (0, 2, 3, 1))
    x_travs = x_travs[rand.permutation(x_travs.shape[0])]
    n_visual_samples = 5
    n_traverse_points = 21
    n_top_latents = 10
    plt.figure(figsize=(8, 3 * n_visual_samples))
    for i in range(n_visual_samples):
        images = vae.sample_traverse(x_travs[i:i + 1],
                                     min_val=-np.min(z_test[0].mean()),
                                     max_val=np.max(z_test[0].mean()),
                                     n_best_latents=n_top_latents,
                                     n_traverse_points=n_traverse_points,
                                     mode='linear')
        images = as_tuple(images)[0]
        images = _prepare_images(images.mean().numpy(), normalize=True)
        vs.plot_images(images,
                       grids=(n_top_latents, n_traverse_points),
                       ax=(n_visual_samples, 1, i + 1))
        if i == 0:
            plt.title('Latents traverse')
    plt.tight_layout()
    vs.plot_save(f'{expdir}/latents_traverse.pdf', dpi=180, verbose=True)
    # === 0. prior sampling plot
    images = as_tuple(vae.sample_observation(n=n_images, seed=seed))[0]
    images = _prepare_images(images.mean().numpy(), normalize=True)
    plt.figure(figsize=(5, 5))
    vs.plot_images(images, grids=(n_rows, n_rows), title='Sampled')
    # === 1. reconstruction plot
    plt.figure(figsize=(15, 15))
    vs.plot_images(x_org_train,
                   grids=(n_rows, n_rows),
                   ax=(2, 2, 1),
                   title='[Train]Original')
    vs.plot_images(x_rec_train,
                   grids=(n_rows, n_rows),
                   ax=(2, 2, 2),
                   title='[Train]Reconstructed')
    vs.plot_images(x_org_test,
                   grids=(n_rows, n_rows),
                   ax=(2, 2, 3),
                   title='[Test]Original')
    vs.plot_images(x_rec_test,
                   grids=(n_rows, n_rows),
                   ax=(2, 2, 4),
                   title='[Test]Reconstructed')
    plt.tight_layout()
    ## prepare the labels
    label_type = ds.label_type
    if label_type == 'categorical':
        labels_name = ds.labels
        true = np.argmax(y_true_test, axis=-1)
        labels_true = np.array([labels_name[i] for i in true])
        labels_pred = labels_true
        if is_semi:
            pred = np.argmax(y_pred_test.mean().numpy(), axis=-1)
            labels_pred = np.array([labels_name[i] for i in pred])
    elif label_type == 'factor':  # dsprites, shapes3d
        labels_name = ['cube', 'cylinder', 'sphere', 'round'] \
          if 'shapes3d' in ds.name else ['square', 'ellipse', 'heart']
        true = y_true_test[:, 2].astype('int32')
        labels_true = np.array([labels_name[i] for i in true])
        labels_pred = labels_true
        if is_semi:
            pred = get_ymean(y_pred_test)[:, 2].astype('int32')
            labels_pred = np.array([labels_name[i] for i in pred])
    else:  # CelebA
        raise NotImplementedError
    ## confusion matrix
    if is_semi:
        plt.figure(figsize=(8, 8))
        acc = accuracy_score(y_true=true, y_pred=pred)
        vs.plot_confusion_matrix(cm=confusion_matrix(y_true=true, y_pred=pred),
                                 labels=labels_name,
                                 cbar=True,
                                 fontsize=10,
                                 title=f'{title} Acc:{acc:.2f}')
    ## save arrays for later inspections
    with open(f'{expdir}/arrays', 'wb') as f:
        pickle.dump(
            dict(z_train=z_train,
                 y_pred_train=y_pred_train,
                 y_true_train=y_true_train,
                 z_test=z_test,
                 y_pred_test=y_pred_test,
                 y_true_test=y_true_test,
                 labels=labels_name,
                 ds=ds.name,
                 label_type=label_type), f)
    print(f'Exported arrays to "{expdir}/arrays"')
    ## semi-supervised
    z_mean_train = np.concatenate(
        [z.mean().numpy().reshape(z.batch_shape[0], -1) for z in z_train], -1)
    z_mean_test = np.concatenate(
        [z.mean().numpy().reshape(z.batch_shape[0], -1) for z in z_test], -1)
    # === 2. scatter points latents plot
    n_points = 5000
    ids = rand.permutation(len(labels_true))[:n_points]
    Y_true = labels_true[ids]
    Y_pred = labels_pred[ids]
    # tsne plot
    n_latents = 0 if len(z_train) == 1 else len(z_train)
    for name, X in zip(
        ['all'] + [f'latents{i}'
                   for i in range(n_latents)], [z_mean_test[ids]] +
        [z_test[i].mean().numpy()[ids] for i in range(n_latents)]):
        print(f'Plot scatter points for {name}')
        X = X.reshape(X.shape[0], -1)  # flatten to 2D
        X = Pipeline([('zscore', StandardScaler()),
                      ('pca', PCA(min(X.shape[1], 512),
                                  random_state=seed))]).fit_transform(X)
        tsne = DimReduce.TSNE(X, n_components=2, framework='sklearn')
        kw = dict(x=tsne[:, 0], y=tsne[:, 1], grid=False, size=12.0, alpha=0.8)
        plt.figure(figsize=(12, 6))
        vs.plot_scatter(color=Y_true,
                        title=f'[True]{title}-{name}',
                        ax=(1, 2, 1),
                        **kw)
        vs.plot_scatter(color=Y_pred,
                        title=f'[Pred]{title}-{name}',
                        ax=(1, 2, 2),
                        **kw)
    ## save all plot
    vs.plot_save(f'{expdir}/analysis.pdf', dpi=180, verbose=True)

    # === 3. show the latents statistics
    n_latents = len(z_train)
    colors = sns.color_palette(n_colors=len(labels_name))
    styles = dict(grid=False,
                  ticks_off=False,
                  alpha=0.6,
                  xlabel='mean',
                  ylabel='stddev')

    # scatter between latents and labels (assume categorical distribution)
    def _show_latents_labels(Z, Y, title):
        plt.figure(figsize=(5 * n_latents, 5), dpi=150)
        for idx, z in enumerate(Z):
            if len(z.batch_shape) == 0:
                mean = np.repeat(np.expand_dims(z.mean(), 0), Y.shape[0], 0)
                stddev = z.sample(Y.shape[0]) - mean
            else:
                mean = flatten(z.mean())
                stddev = flatten(z.stddev())
            y = np.argmax(Y, axis=-1)
            data = [[], [], []]
            for y_i, c in zip(np.unique(y), colors):
                mask = (y == y_i)
                data[0].append(np.mean(mean[mask], 0))
                data[1].append(np.mean(stddev[mask], 0))
                data[2].append([labels_true[y_i]] * mean.shape[1])
            vs.plot_scatter(
                x=np.concatenate(data[0], 0),
                y=np.concatenate(data[1], 0),
                color=np.concatenate(data[2], 0),
                ax=(1, n_latents, idx + 1),
                size=15 if mean.shape[1] < 128 else 8,
                title=f'[Test-{title}]#{idx} - {mean.shape[1]} (units)',
                **styles)
        plt.tight_layout()

    # simple scatter mean-stddev each latents
    def _show_latents(Z, title):
        plt.figure(figsize=(3.5 * n_latents, 3.5), dpi=150)
        for idx, z in enumerate(Z):
            mean = flatten(z.mean())
            stddev = flatten(z.stddev())
            if mean.ndim == 2:
                mean = np.mean(mean, 0)
                stddev = np.mean(stddev, 0)
            vs.plot_scatter(
                x=mean,
                y=stddev,
                ax=(1, n_latents, idx + 1),
                size=15 if len(mean) < 128 else 8,
                title=f'[Test-{title}]#{idx} - {len(mean)} (units)',
                **styles)

    _show_latents_labels(z_test, y_true_test, 'post')
    _show_latents_labels(pz_test, y_true_test, 'prior')
    _show_latents(z_test, 'post')
    _show_latents(pz_test, 'prior')

    # KL statistics
    vs.plot_figure()
    for idx, (qz, pz) in enumerate(zip(z_test, pz_test)):
        kl = []
        qz = Normal(loc=qz.mean(), scale=qz.stddev(), name=f'posterior{idx}')
        pz = Normal(loc=pz.mean(), scale=pz.stddev(), name=f'prior{idx}')
        for s, e in minibatch(batch_size=8, n=100):
            z = qz.sample(e - s)
            # don't do this in GPU, it explodes!
            kl.append((qz.log_prob(z) - pz.log_prob(z)).numpy())
        kl = np.concatenate(kl, 0)  # (mcmc, batch, event)
        # per sample
        kl_samples = np.sum(kl, as_tuple(list(range(2, kl.ndim))))
        kl_samples = logsumexp(kl_samples, 0)
        plt.subplot(n_latents, 2, idx * 2 + 1)
        sns.histplot(kl_samples, bins=50)
        plt.title(f'Z#{idx} KL per sample (nats)')
        # per latent
        kl_latents = np.mean(flatten(logsumexp(kl, 0)), 0)
        plt.subplot(n_latents, 2, idx * 2 + 2)
        plt.plot(np.sort(kl_latents))
        plt.title(f'Z#{idx} KL per dim (nats)')
    plt.tight_layout()

    vs.plot_save(f'{expdir}/latents.pdf', dpi=180, verbose=True)
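A hedged usage sketch (VariationalAutoencoder and ImageDataset are the odin classes assumed above; the output directory and title are placeholders):

evaluate(vae, ds, expdir='/tmp/vae_eval', title='my_vae',
         batch_size=64, take_count=-1, n_images=36, seed=1)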
Example #10
# ===========================================================================
update_ops = K.optimizers.Adam(lr=0.001).minimize(loss)
K.initialize_all_variables()
# ====== initialize ====== #
record_train_loss = []
record_valid_loss = []
patience = 3
epoch = 0
# We want the rate to go up but the distortion to go down
while True:
    # ====== training ====== #
    train_losses = []
    prog = Progbar(target=X_train.shape[0], name='Epoch%d' % epoch)
    start_time = timeit.default_timer()
    for start, end in minibatch(batch_size=args.bs,
                                n=X_train.shape[0],
                                seed=K.get_rng().randint(10e8)):
        _ = K.eval(loss,
                   feed_dict={X: X_train[start:end]},
                   update_after=update_ops)
        prog.add(end - start)
        train_losses.append(_)
    # ====== training log ====== #
    print(ctext("[Epoch %d]" % epoch, 'yellow'),
          '%.2f(s)' % (timeit.default_timer() - start_time))
    print("[Training set] Loss: %.4f" % np.mean(train_losses))
    # ====== validation set ====== #
    code_samples, lo = K.eval([Z, loss], feed_dict={X: X_valid})
    print("[Valid set]    Loss: %.4f" % lo)
    # ====== record the history ====== #
    record_train_loss.append(np.mean(train_losses))