Example #1
 def __init__(self,
              algorithm='count',
              vocab_size: int = 2000,
              min_frequency: int = 2,
              max_frequency: float = 0.95,
              max_length: int = 500,
              cache_path: str = "~/nlp_data/newsgroup20",
              **kwargs):
     categories = kwargs.pop('categories', None)
     super().__init__(algorithm=algorithm,
                      vocab_size=vocab_size,
                      min_frequency=min_frequency,
                      max_frequency=max_frequency,
                      max_length=max_length,
                      cache_path=cache_path,
                      **kwargs)
     kw = dict(shuffle=True,
               random_state=1,
               categories=categories,
               remove=('headers', 'footers', 'quotes'))
     data = fetch_20newsgroups(subset='train', return_X_y=False, **kw)
     X_train, y_train = data.data, data.target
     labels_name = data.target_names
     self.X_test, y_test = fetch_20newsgroups(subset='test',
                                              return_X_y=True,
                                              **kw)
     self.X_train, self.X_valid, y_train, y_valid = train_test_split(
         X_train, y_train, test_size=0.2, shuffle=True, random_state=0)
     self._labels = np.array(labels_name)
     self.y_train = one_hot(y_train, len(self._labels))
     self.y_valid = one_hot(y_valid, len(self._labels))
     self.y_test = one_hot(y_test, len(self._labels))
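All of these snippets rely on a project-level `one_hot` helper. As a point of reference, here is a minimal NumPy sketch of what that helper is assumed to do (integer labels in, dense `(n_samples, n_classes)` matrix out); the function and argument names below are assumptions, not the project's actual signature:

import numpy as np

def one_hot_sketch(y, nb_classes, dtype='float32'):
    # Sketch only: map integer class indices to rows of an identity-like matrix.
    y = np.asarray(y, dtype='int64').ravel()
    out = np.zeros((len(y), nb_classes), dtype=dtype)
    out[np.arange(len(y)), y] = 1.
    return out

# one_hot_sketch([0, 2, 1], 3) -> [[1, 0, 0], [0, 0, 1], [0, 1, 0]]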
Example #2
  def create_dataset(self,
                     batch_size=64,
                     drop_remainder=False,
                     shuffle=1000,
                     prefetch=tf.data.experimental.AUTOTUNE,
                     cache='',
                     parallel=None,
                     partition='train',
                     inc_labels=False,
                     seed=1) -> tf.data.Dataset:
    r"""
    Arguments:
      partition : {'train', 'valid', 'test'}
      inc_labels : a Boolean or Scalar. If True, return both image and label,
        otherwise, only the image is returned.
        If a scalar is provided, it indicates the fraction of labelled data
        in the mask.

    Return :
      tensorflow.data.Dataset :
        image - `(tf.float32, (None, 32, 32, 3))`
        label - `(tf.float32, (None, 10))`
        mask  - `(tf.bool, (None, 1))` if 0. < inc_labels < 1.
      where `mask=1` means labelled data and `mask=0` means unlabelled data
    """
    X, y = get_partition(partition,
                         train=(self.X_train, self.y_train),
                         valid=(self.X_valid, self.y_valid),
                         test=(self.X_test, self.y_test))
    inc_labels = float(inc_labels)
    gen = tf.random.experimental.Generator.from_seed(seed=seed)
    assert X.shape[0] == y.shape[0]
    X = np.reshape(X, (-1, 3, 32, 32))
    X = np.transpose(X, (0, 2, 3, 1))
    y = one_hot(y, self.n_labels)

    def _process(*data):
      image = tf.cast(data[0], tf.float32)
      image = self.normalize_255(image)
      if inc_labels:
        label = tf.cast(data[1], tf.float32)
        if 0. < inc_labels < 1.:  # semi-supervised mask
          mask = gen.uniform(shape=(1,)) < inc_labels
          return dict(inputs=(image, label), mask=mask)
        return image, label
      return image

    ds = tf.data.Dataset.from_tensor_slices(X)
    if inc_labels > 0.:
      ds = tf.data.Dataset.zip((ds, tf.data.Dataset.from_tensor_slices(y)))
    ds = ds.map(_process, parallel)
    if cache is not None:
      ds = ds.cache(str(cache))
    # shuffle must be called after cache
    if shuffle is not None and shuffle > 0:
      ds = ds.shuffle(int(shuffle))
    ds = ds.batch(batch_size, drop_remainder)
    if prefetch is not None:
      ds = ds.prefetch(prefetch)
    return ds
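A hypothetical call of the method above: with a fractional `inc_labels`, each element carries a semi-supervised mask alongside the (image, label) pair. The `dataset` object is an assumption (an instance of whatever class owns this `create_dataset`).

# `dataset` is a hypothetical instance of the class defining create_dataset above
ds = dataset.create_dataset(batch_size=32, partition='train', inc_labels=0.5)
for batch in ds.take(1):
    image, label = batch['inputs']   # (32, 32, 32, 3) and (32, 10), float32
    mask = batch['mask']             # (32, 1), tf.bool: True marks labelled samples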
Example #3
 def sample_prior(self, n: int = 1, seed: int = 1, **kwargs) -> tf.Tensor:
   classes = self.classes
   y = np.concatenate([one_hot(np.mod(np.arange(n), i), i) for i in classes],
                      -1)
   z = super(M2VAE, self).sample_prior(n=n, seed=seed, **kwargs)
   z.qy_x = y
   return z
Example #4
def _load_scale_dataset(path, dsname):
    url = str(
        base64.decodebytes(
            b'aHR0cHM6Ly9haS1kYXRhc2V0cy5zMy5hbWF6b25hd3MuY29tL3NjYWxlX2RhdGFzZXRzLnppcA==\n'
        ), 'utf-8')
    md5 = r"5fc7c52108220e30a04f033e355716c0"
    path = os.path.abspath(os.path.expanduser(path))
    if not os.path.exists(path):
        os.makedirs(path)
    filename = os.path.basename(url)
    filepath = os.path.join(path, filename)
    # download
    if not os.path.exists(filepath):
        print(f"Downloading {url} ...")
        urlretrieve(url, filename=filepath)
    # extract
    zip_path = os.path.join(path, 'scale_datasets')
    if not os.path.exists(zip_path):
        with zipfile.ZipFile(filepath, "r") as f:
            f.extractall(path)
    # load
    cell = np.load(os.path.join(zip_path, f"{dsname}_cell"))
    labels = np.load(os.path.join(zip_path, f"{dsname}_labels"))
    peak = np.load(os.path.join(zip_path, f"{dsname}_peak"))
    x = sparse.load_npz(os.path.join(zip_path, f"{dsname}_x"))
    ids = {key: i for i, key in enumerate(sorted(set(labels)))}
    labels = one_hot(np.array([ids[i] for i in labels]), len(ids))
    return x, labels, peak, np.array(list(ids.keys()))
Example #5
def _preprocess_xy(x, y, nb_classes):
  if x.ndim > 2:
    x = np.reshape(x, newshape=(x.shape[0], -1))
  if y is not None:
    if y.ndim == 1 and nb_classes > 2:
      y = one_hot(y, nb_classes=nb_classes)
    return x, y
  return x
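A quick sanity check of the helper above (assuming the project's `one_hot`, as in the sketch under Example #1, is importable in scope): images are flattened to 2-D and integer labels are expanded to one-hot rows.

# assumes `one_hot` from the project is available to _preprocess_xy
x = np.random.rand(16, 8, 8)              # 16 fake 8x8 "images"
y = np.random.randint(0, 3, size=16)      # 3-class integer labels
x2, y2 = _preprocess_xy(x, y, nb_classes=3)
# x2.shape == (16, 64), y2.shape == (16, 3)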
Example #6
def _preprocess_xy(x, y, nb_classes):
    if x.ndim > 2:
        x = np.reshape(x, newshape=(x.shape[0], -1))
    if y is not None:
        if y.ndim == 1 and nb_classes > 2:
            y = one_hot(y, nb_classes=nb_classes)
        return x, y
    return x
Example #7
 def process(self, name, X, *args):
     _ = []
     for transcription in args:
         if isinstance(transcription, str):
             transcription = [i for i in transcription.split(' ')
                              if len(i) > 0]
         transcription = [int(i) for i in transcription]
         transcription = one_hot(transcription, n_classes=self._n_classes)
         _.append(transcription)
     return (name, X) + tuple(_)
Example #8
 def __init__(self, path: str = "~/tensorflow_datasets/melanoma_atac"):
     path = os.path.abspath(os.path.expanduser(path))
     if not os.path.exists(path):
         os.makedirs(path)
     ### download data
     data = {}
     for url in _URL:
         fname = os.path.basename(url)
         fpath = os.path.join(path, fname)
         if not os.path.exists(fpath):
             print(f"Downloading file: {fname} ...")
             urlretrieve(url, filename=fpath)
         data[fname.split(".")[0]] = fpath
     ### load data
     try:
         import rpy2.robjects as robjects
         from rpy2.robjects import pandas2ri
         from rpy2.robjects.conversion import localconverter
         robjects.r['options'](warn=-1)
         robjects.r("library(Matrix)")
         pandas2ri.activate()
     except ImportError:
         raise ImportError("Require package 'rpy2' for reading Rdata file.")
     loaded_data = {}
     for k, v in data.items():
         robjects.r['load'](v)
         x = robjects.r[k]
         if k == "counts_mel":
             with localconverter(robjects.default_converter +
                                 pandas2ri.converter):
                 # dgCMatrix
                 x = sparse.csr_matrix(
                     (x.slots["x"], x.slots["i"], x.slots["p"]),
                     shape=tuple(robjects.r("dim")(x))[::-1],
                     dtype=np.float32)
         else:
             x = robjects.conversion.rpy2py(x)
         loaded_data[k] = x
     ### post-processing
     x = loaded_data['counts_mel']
     labels = []
     for i, j in zip(loaded_data["cellData_mel"]['cellLine'],
                     loaded_data["cellData_mel"]['LineType']):
         labels.append(i + '_' + j.split("-")[0])
      labels = np.array(labels)
     labels_name = {name: i for i, name in enumerate(sorted(set(labels)))}
     labels = one_hot(np.array([labels_name[i] for i in labels]),
                      len(labels_name))
     ### assign the data
     self.x = x
     self.y = labels
     self.xvar = np.array([f"Region{i + 1}" for i in range(x.shape[1])])
     self.yvar = np.array(list(labels_name.keys()))
Example #9
 def process(self, name, X):
     data_idx = axis_normalize(axis=self.data_idx,
                               ndim=len(X),
                               return_tuple=True)
     X_new = []
     for idx, x in enumerate(X):
         # transform into one-label y
         if idx in data_idx:
             x = np.array(x, dtype='int32')
             x = one_hot(x, nb_classes=self.nb_classes)
         X_new.append(x)
     return name, X_new
Example #10
def read_scale_dataset(dsname="leukemia",
                       filtered_genes=True,
                       override=False,
                       verbose=True) -> SingleCellOMIC:
    r""" Datasets provided by (Xiong et al. 2019), four datasets are supported:

    - 'breast_tumor'
    - 'forebrain'
    - 'leukemia'
    - 'insilico'

  Reference:
    Xiong, L. et al. SCALE method for single-cell ATAC-seq analysis via latent
      feature extraction. Nat Commun 10, 4576 (2019).

  """
    datasets = {'breast_tumor', 'forebrain', 'leukemia', 'insilico'}
    assert dsname in datasets, \
      f"Cannot find dataset with name {dsname}, available datasets are: {datasets}"
    download_path = os.path.join(DOWNLOAD_DIR, f"scale_dataset")
    preprocessed_path = os.path.join(DATA_DIR, f"scale_preprocessed")
    if not os.path.exists(download_path):
        os.makedirs(download_path)
    if not os.path.exists(preprocessed_path):
        os.makedirs(preprocessed_path)
    ### Download data
    url = str(base64.decodebytes(_URL), 'utf-8')
    path = os.path.join(download_path, os.path.basename(url))
    download_file(url, path, override=False, md5=_MD5)
    ### extract the data
    if len(os.listdir(preprocessed_path)) == 0:
        with zipfile.ZipFile(path, "r") as f:
            for info in f.filelist:
                name = os.path.basename(info.filename)
                if len(name) == 0:
                    continue
                with open(os.path.join(preprocessed_path, name), 'wb') as fout:
                    fout.write(f.read(info))
    ### load the data
    cell = np.load(os.path.join(preprocessed_path, f"{dsname}_cell"))
    labels = np.load(os.path.join(preprocessed_path, f"{dsname}_labels"))
    peak = np.load(os.path.join(preprocessed_path, f"{dsname}_peak"))
    x = sparse.load_npz(os.path.join(preprocessed_path, f"{dsname}_x"))
    sco = SingleCellOMIC(X=x,
                         cell_id=cell,
                         gene_id=peak,
                         omic=OMIC.atac,
                         name=dsname)
    ids = {key: i for i, key in enumerate(sorted(set(labels)))}
    sco.add_omic(OMIC.celltype,
                 X=one_hot(np.array([ids[i] for i in labels]), len(ids)),
                 var_names=list(ids.keys()))
    return sco
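Hypothetical usage of the loader above, with one of the four dataset names listed in its docstring (the first call downloads and caches the data; `SingleCellOMIC` is assumed to expose an AnnData-style `.shape`):

# hypothetical call; requires the project's DOWNLOAD_DIR/DATA_DIR setup and network access
sco = read_scale_dataset(dsname='forebrain', filtered_genes=True, verbose=False)
print(sco.shape)   # (n_cells, n_peaks) ATAC counts, with one-hot cell types attached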
Example #11
def _read_scvi_dataset(name, clazz_name, override, verbose):
    preprocessed_path = select_path(os.path.join(DATA_DIR,
                                                 '%s_preprocessed' % name),
                                    create_new=True)
    if override:
        shutil.rmtree(preprocessed_path)
        os.mkdir(preprocessed_path)
    # ====== copy the dataset from scVI ====== #
    if not os.path.exists(os.path.join(preprocessed_path, 'X')):
        try:
            import scvi.dataset as scvi_dataset
        except ImportError:
            raise RuntimeError("Require `scVI` package for PBMC dataset")
        clazz = getattr(scvi_dataset, clazz_name)
        gene_dataset = clazz(save_path=DOWNLOAD_DIR)

        X = gene_dataset._X
        if hasattr(X, 'todense'):
            X = np.array(X.todense())

        gene_names = np.array(gene_dataset.gene_names)
        # convert gene identifier to gene symbol (i.e. name)
        if hasattr(gene_dataset, 'de_metadata'):
            from sisua.data.utils import get_gene_id2name
            meta = gene_dataset.de_metadata
            converter = {i: j for i, j in zip(meta.ENSG, meta.GS)}
            pbmc8kconverter = get_gene_id2name()
            gene_names = np.array([
                pbmc8kconverter[i] if i in pbmc8kconverter else converter[i]
                for i in gene_names
            ])
        assert len(gene_names) == X.shape[1]

        label_names = np.array(gene_dataset.cell_types)
        y = one_hot(gene_dataset.labels.ravel(), nb_classes=len(label_names))
        assert len(label_names) == y.shape[1]

        cell_names = np.array(['Cell#%d' % i for i in range(X.shape[0])])
        _save_data_to_path(preprocessed_path, X, y, gene_names, label_names,
                           cell_names, verbose)
    # ====== read preprocessed data ====== #
    ds = Dataset(preprocessed_path, read_only=True)
    return ds
Example #12
def get_data():
    """ batch_size = 128 """
    batch = []
    batch_trans = []
    for name, start, end in indices:
        start = int(start)
        end = int(end)
        data = ds['mspec'][start:end]
        data = (data - data.mean(0)) / data.std(0)
        data = (data - mean) / std
        data = np.vstack([
            data[i:i + 21].reshape(1, -1) for i in range(0, data.shape[0], 21)
            if i + 21 < data.shape[0]
        ])
        trans = transcription[name]
        trans = np.array([int(i) for i in trans.split(' ') if len(i) > 0])
        trans = np.vstack([
            trans[i + 11].reshape(1, -1) for i in range(0, trans.shape[0], 21)
            if i + 21 < trans.shape[0]
        ])
        batch.append(data)
        batch_trans.append(trans)
        if len(batch) == cache:
            batch = np.vstack(batch)
            trans = one_hot(np.vstack(batch_trans).ravel(), 10)

            idx = np.random.permutation(batch.shape[0])
            batch = batch[idx]
            trans = trans[idx]

            i = 0
            while i < batch.shape[0]:
                start = i
                end = i + 128
                yield batch[start:end], trans[start:end]
                i = end

            batch = []
            batch_trans = []
Example #13
def read_melanoma_cisTopicData(filtered_genes=True,
                               override=False,
                               verbose=True):
  r""" melanoma ATAC data from (Bravo González-Blas, et al. 2019)

  Reference:
    Bravo González-Blas, C. et al. cisTopic: cis-regulatory topic modeling
      on single-cell ATAC-seq data. Nat Methods 16, 397–400 (2019).
    Verfaillie, A. et al. Decoding the regulatory landscape of melanoma
      reveals TEADS as regulators of the invasive cell state.
      Nat Commun 6, (2015).
  """
  download_dir = os.path.join(DOWNLOAD_DIR, 'cistopic')
  if not os.path.exists(download_dir):
    os.makedirs(download_dir)
  preprocessed_path = os.path.join(DATA_DIR, 'cistopic_preprocessed')
  if not os.path.exists(preprocessed_path):
    os.makedirs(preprocessed_path)
  ### downloading the data
  data = {}
  for url in _URL:
    fname = os.path.basename(url)
    fpath = os.path.join(download_dir, fname)
    if not os.path.exists(fpath):
      if verbose:
        print(f"Downloading file: {fname} ...")
      urlretrieve(url, filename=fpath)
    data[fname.split(".")[0]] = fpath
  ### preprocess data
  if len(os.listdir(preprocessed_path)) == 0:
    try:
      import rpy2.robjects as robjects
      from rpy2.robjects import pandas2ri
      from rpy2.robjects.conversion import localconverter
      robjects.r['options'](warn=-1)
      robjects.r("library(Matrix)")
      pandas2ri.activate()
    except ImportError:
      raise ImportError("Require package 'rpy2' for reading Rdata file.")
    for k, v in data.items():
      robjects.r['load'](v)
      x = robjects.r[k]
      outpath = os.path.join(preprocessed_path, k)
      if k == "counts_mel":
        with localconverter(robjects.default_converter + pandas2ri.converter):
          # dgCMatrix
          x = sparse.csr_matrix((x.slots["x"], x.slots["i"], x.slots["p"]),
                                shape=tuple(robjects.r("dim")(x))[::-1],
                                dtype=np.float32)
      else:
        x = robjects.conversion.rpy2py(x)
      with open(outpath, "wb") as f:
        pickle.dump(x, f)
      if verbose:
        print(f"Loaded file: {k} - {type(x)} - {x.shape}")
    pandas2ri.deactivate()
  ### load_data
  data = {}
  for name in os.listdir(preprocessed_path):
    with open(os.path.join(preprocessed_path, name), 'rb') as f:
      data[name] = pickle.load(f)
  ### sco
  # print(data["dm3_CtxRegions"])
  x = data['counts_mel']
  sco = SingleCellOMIC(X=x,
                       cell_id=data["cellData_mel"].index,
                       gene_id=[f"Region{i + 1}" for i in range(x.shape[1])],
                       omic=OMIC.atac)
  # celltype
  labels = []
  for i, j in zip(data["cellData_mel"]['cellLine'],
                  data["cellData_mel"]['LineType']):
    labels.append(i + '_' + j.split("-")[0])
  labels = np.array(labels)
  labels_name = {name: i for i, name in enumerate(sorted(set(labels)))}
  labels = np.array([labels_name[i] for i in labels])
  sco.add_omic(OMIC.celltype, one_hot(labels, len(labels_name)),
               list(labels_name.keys()))
  return sco
Example #14
    def create_dataset(self,
                       batch_size=64,
                       drop_remainder=False,
                       shuffle=1000,
                       prefetch=tf.data.experimental.AUTOTUNE,
                       cache='',
                       parallel=None,
                       partition='train',
                       inc_labels=False,
                       seed=1) -> tf.data.Dataset:
        r"""
    Arguments:
      partition : {'train', 'valid', 'test'}
      inc_labels : a Boolean or Scalar. If True, return both image and label,
        otherwise, only the image is returned.
        If a scalar is provided, it indicates the fraction of labelled data
        in the mask.

    Return :
      tensorflow.data.Dataset :
        image - `(tf.float32, (None, 64, 64, 1))`
        label - `(tf.float32, (None, 5))`
        mask  - `(tf.bool, (None, 1))` if 0. < inc_labels < 1.
      where `mask=1` means labelled data and `mask=0` means unlabelled data
    """
        inc_labels = float(inc_labels)
        gen = tf.random.experimental.Generator.from_seed(seed=seed)
        x = self.transform(partition)
        y = get_partition(partition,
                          train=self.train_labels,
                          valid=self.valid_labels,
                          test=self.test_labels)
        # remove empty docs
        indices = np.array(np.sum(x, axis=-1) > 0).ravel()
        x = x[indices]
        if len(y) > 0:
            y = y[indices]
        # convert to one-hot
        if inc_labels > 0 and len(y) > 0 and y.ndim == 1:
            y = one_hot(y, self.n_labels)

        def _process(*data):
            data = tuple([
                tf.cast(
                    tf.sparse.to_dense(i)
                    if isinstance(i, tf.SparseTensor) else i, tf.float32)
                for i in data
            ])
            if inc_labels:
                if 0. < inc_labels < 1.:  # semi-supervised mask
                    mask = gen.uniform(shape=(1, )) < inc_labels
                    return dict(inputs=tuple(data), mask=mask)
                return data
            return data[0]

        # prepare the sparse matrices
        if isinstance(x, spmatrix):
            x = tf.SparseTensor(indices=sorted(zip(*x.nonzero())),
                                values=x.data,
                                dense_shape=x.shape)
        ds = tf.data.Dataset.from_tensor_slices(x)
        if inc_labels > 0:
            if isinstance(y, spmatrix):
                y = tf.SparseTensor(indices=sorted(zip(*y.nonzero())),
                                    values=y.data,
                                    dense_shape=y.shape)
            y = tf.data.Dataset.from_tensor_slices(y)
            ds = tf.data.Dataset.zip((ds, y))
        # configurate dataset
        ds = ds.map(_process, parallel)
        if cache is not None:
            ds = ds.cache(str(cache))
        # shuffle must be called after cache
        if shuffle is not None and shuffle > 0:
            ds = ds.shuffle(int(shuffle))
        ds = ds.batch(batch_size, drop_remainder)
        if prefetch is not None:
            ds = ds.prefetch(prefetch)
        return ds
Example #15
def read_centenarian(override=False, verbose=False):
    r""" Data used in:

    "Single-cell transcriptomics reveals expansion of cytotoxic CD4 T-cells in
    supercentenarians" | bioRxiv [WWW Document], n.d.
      URL https://www.biorxiv.org/content/10.1101/643528v1 (accessed 5.21.20).

  """
    download_path = os.path.join(DOWNLOAD_DIR, "SuperCentenarian_original")
    if not os.path.exists(download_path):
        os.mkdir(download_path)
    preprocessed_path = os.path.join(DATA_DIR, 'SuperCentenarian_preprocessed')
    if override and os.path.exists(preprocessed_path):
        shutil.rmtree(preprocessed_path)
    if not os.path.exists(preprocessed_path):
        os.mkdir(preprocessed_path)
    # ******************** preprocessed ******************** #
    if not os.path.exists(os.path.join(preprocessed_path, 'X')):
        labels = download_file(
            outpath=os.path.join(download_path, os.path.basename(_URL[2])),
            url=_URL[2],
        )
        data = []
        with gzip.open(labels, mode='rb') as f:
            for line in f:
                line = str(line, 'utf-8').strip().split('\t')
                assert line[1][:2] == line[2]
                data.append(line)
        labels = np.array(data)
        y_col = sorted(set(labels[:, 1]))
        y = one_hot(np.array([y_col.index(i) for i in labels[:, 1]]),
                    len(y_col)).astype('float32')
        y_col = np.array(y_col)
        #
        raw = download_file(
            outpath=os.path.join(download_path, os.path.basename(_URL[0])),
            url=_URL[0],
        )
        if verbose:
            print("Unzip and reading raw UMI ...")
        X_raw, cell_id1, gene_id1 = read_gzip_csv(raw)
        #
        norm = download_file(
            outpath=os.path.join(download_path, os.path.basename(_URL[1])),
            url=_URL[1],
        )
        if verbose:
            print("Unzip and reading log-norm UMI ...")
        X_norm, cell_id2, gene_id2 = read_gzip_csv(norm)
        #
        assert np.all(cell_id1 == cell_id2) and np.all(labels[:, 0] == cell_id1) and \
          np.all(gene_id1 == gene_id2)
        assert X_raw.shape[0] == X_norm.shape[0] == len(cell_id1) and \
          X_raw.shape[1] == X_norm.shape[1] == len(gene_id1)
        #
        if verbose:
            print(f"Saving data to {preprocessed_path} ...")
        save_to_dataset(preprocessed_path,
                        X=X_raw,
                        X_col=gene_id1,
                        y=y,
                        y_col=y_col,
                        rowname=cell_id1,
                        print_log=verbose)
        with MmapArrayWriter(os.path.join(preprocessed_path, 'X_log'),
                             shape=(0, X_norm.shape[1]),
                             dtype='float32',
                             remove_exist=True) as f:
            for s, e in batching(batch_size=2048, n=X_norm.shape[0]):
                f.write(X_norm[s:e])
    # ====== read preprocessed data ====== #
    ds = Dataset(preprocessed_path, read_only=True)
    return ds
Example #16
def streamline_classifier(Z_train,
                          y_train,
                          Z_test,
                          y_test,
                          labels_name,
                          mode='ovr',
                          title='',
                          plot_train_results=False,
                          show_plot=True,
                          return_figure=False):
    r"""
  Arguments:
    fig : Figure or tuple (`float`, `float`), optional (default=`None`)
      width, height in inches

  Returns:
    (results_train, results_test), (fig_train, fig_test)
      results is a dictionary of scores
      {
        F1micro=f1_micro * 100,
        F1macro=f1_macro * 100,
        F1weight=f1_weight * 100,
        F1_[classname]=...
      }
  """
    mode = mode.strip().lower()
    assert mode in ('ovr', 'ovo'), \
      "Only 'ovr' (one-vs-rest) and 'ovo' (one-vs-one) modes are supported for the streamline classifier"

    labels_name = [standardize_protein_name(i) for i in labels_name]

    results_train = {}
    results_test = {}
    labels_name = np.array(labels_name)

    with catch_warnings_ignore(FutureWarning):
        with catch_warnings_ignore(RuntimeWarning):
            n_classes = len(labels_name)
            # ====== preprocessing ====== #
            if y_train.ndim == 1 or y_train.shape[1] == 1:
                y_train = one_hot(y_train.ravel(), nb_classes=n_classes)
            if y_test.ndim == 1 or y_test.shape[1] == 1:
                y_test = one_hot(y_test.ravel(), nb_classes=n_classes)
            is_binary_classes = sorted(np.unique(
                y_train.astype('float32'))) == [0., 1.]
            # ====== not binary classes ====== #
            if not is_binary_classes:
                gmm = ProbabilisticEmbedding()
                gmm.fit(np.concatenate((y_train, y_test), axis=0))
                y_train = gmm.predict(y_train)
                y_test = gmm.predict(y_test)
            # kernel : 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'
            if mode == 'ovr':
                classifier = OneVsRestClassifier(SVC(
                    kernel='linear', random_state=UNIVERSAL_RANDOM_SEED),
                                                 n_jobs=n_classes)
                classifier.fit(X=Z_train, y=y_train)
            else:
                raise NotImplementedError
                classifier = SVC(kernel='linear',
                                 decision_function_shape='ovo',
                                 random_state=UNIVERSAL_RANDOM_SEED)
                classifier.fit(X=Z_train, y=y_train)
            # ====== return ====== #
            from sklearn.exceptions import UndefinedMetricWarning
            with catch_warnings_ignore(UndefinedMetricWarning):
                results_train = plot_evaluate_classifier(
                    y_pred=classifier.predict(Z_train),
                    y_true=y_train,
                    labels=labels_name,
                    title='[train]' + title,
                    show_plot=show_plot and plot_train_results,
                    return_figure=True)
                results_test = plot_evaluate_classifier(
                    y_pred=classifier.predict(Z_test),
                    y_true=y_test,
                    labels=labels_name,
                    title='[test]' + title,
                    show_plot=show_plot,
                    return_figure=True)

            if show_plot:
                if plot_train_results:
                    results_train, fig_train = results_train[0], results_train[
                        1]
                else:
                    fig_train = None
                results_test, fig_test = results_test[0], results_test[1]
            results_train = OrderedDict(
                sorted(results_train.items(), key=lambda x: x[0]))
            results_test = OrderedDict(
                sorted(results_test.items(), key=lambda x: x[0]))
            results = (results_train, results_test)

            if show_plot and return_figure:
                return results, (fig_train, fig_test)
            return results
Example #17
                   order='word', engine='odin'
    )
    tk.fit(texts, vocabulary=None)
    cPickle.dump(tk, open(tokenizer_path, 'w'), protocol=cPickle.HIGHEST_PROTOCOL)
print('========== Summary ==========')
for i, j in tk.summary.items():
    print(i, ':', j)

# ===========================================================================
# Build dataset
# ===========================================================================
X = tk.transform(texts, mode='seq', maxlen=MAX_SEQ_LEN,
                 end_document=None, token_not_found='ignore')

y = [labels_set.index(i) for i in labels]
y = one_hot(np.array(y, dtype='int32'), nb_classes=nb_labels)

n = X.shape[0]
np.random.seed(1208)
idx = np.random.permutation(n)
X = X[idx]
y = y[idx]

X_train = X[:int(0.8 * n)]
y_train = y[:int(0.8 * n)]
X_valid = X[int(0.8 * n):]
y_valid = y[int(0.8 * n):]

print('X:', X.shape, 'y:', y.shape)
print('X_train:', X_train.shape, 'y_train:', y_train.shape)
print('X_valid:', X_valid.shape, 'y_valid:', y_valid.shape)
Example #18
MODEL_PATH = utils.get_modelpath(name='cifar10_%s' % MODEL_NAME, override=True)
LOG_PATH = utils.get_logpath(name='cifar10_%s.log' % MODEL_NAME, override=True)
stdio(LOG_PATH)
# ===========================================================================
# Some handmade constants
# ===========================================================================
NB_EPOCH = 10
LEARNING_RATE = 0.001
# ===========================================================================
# Load dataset
# ===========================================================================
ds = F.CIFAR10.get_dataset()
nb_labels = 10
print(ds)
X_train = ds['X_train'][:].astype('float32') / 255.
y_train = one_hot(ds['y_train'][:], nb_classes=nb_labels)
X_test = ds['X_test'][:].astype('float32') / 255.
y_test = one_hot(ds['y_test'][:], nb_classes=nb_labels)
# ===========================================================================
# Create network
# ===========================================================================
inputs = [K.placeholder(shape=(None,) + X_train.shape[1:], name='X', dtype='float32'),
          K.placeholder(shape=(None, nb_labels), name='y', dtype='float32')]
print("Inputs:", inputs)
model = N.Lambda.search(MODEL_NAME, prefix='models_cifar')
outputs = model(*inputs)
# ====== create losses ====== #
ce = tf.losses.softmax_cross_entropy(inputs[-1], outputs['logit'])
acc = K.metrics.categorical_accuracy(outputs['prob'], inputs[-1])
cm = K.metrics.confusion_matrix(y_pred=outputs['prob'],
                                y_true=inputs[-1],
Example #19
def plot_evaluate_classifier(y_pred,
                             y_true,
                             labels,
                             title,
                             show_plot=True,
                             return_figure=False):
    r"""
  Arguments:
    fig : Figure or tuple (`float`, `float`), optional (default=`None`)
      width, height in inches

  Returns:
    Return a dictionary of scores
    {
        F1micro=f1_micro * 100,
        F1macro=f1_macro * 100,
        F1weight=f1_weight * 100,
        F1_[classname]=...
    }
  """
    from matplotlib import pyplot as plt
    fontsize = 12
    num_classes = len(labels)
    nrow = int(np.ceil(num_classes / 5))
    ncol = int(np.ceil(num_classes / nrow))

    if y_pred.ndim == 1:
        y_pred = one_hot(y_pred, nb_classes=num_classes)
    if y_true.ndim == 1:
        y_true = one_hot(y_true, nb_classes=num_classes)

    if show_plot:
        fig = plot_figure(nrow=4 * nrow + 3, ncol=4 * ncol)

    f1_classes = []
    for i, (name, pred, true) in enumerate(zip(labels, y_pred.T, y_true.T)):
        f1_classes.append(f1_score(true, pred))
        if show_plot:
            plot_confusion_matrix(confusion_matrix(y_true=true, y_pred=pred),
                                  labels=[0, 1],
                                  fontsize=fontsize,
                                  ax=(nrow, ncol, i + 1),
                                  title=name + '\n')

    f1_micro = f1_score(y_true=y_true.ravel(), y_pred=y_pred.ravel())
    f1_macro = np.mean(f1_classes)
    f1_weight = f1_score(y_true=y_true, y_pred=y_pred, average='weighted')

    if show_plot:
        plt.suptitle('%s\nF1-micro:%.2f  F1-macro:%.2f  F1-weight:%.2f' %
                     (title, f1_micro * 100, f1_macro * 100, f1_weight * 100),
                     fontsize=fontsize + 6)
        plt.tight_layout(rect=[0, 0.04, 1, 0.96])
    results = dict(
        F1micro=f1_micro * 100,
        F1macro=f1_macro * 100,
        F1weight=f1_weight * 100,
    )
    for name, f1 in zip(labels, f1_classes):
        results['F1_' + name] = f1 * 100
    if show_plot and return_figure:
        return results, fig
    return results
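A hypothetical call with random predictions for a 4-class problem; with `show_plot=False` only the dictionary of F1 scores is returned, so no plotting backend is touched. The class names here are made up for illustration.

# hypothetical usage; assumes the project's one_hot and sklearn's f1_score are importable
rng = np.random.RandomState(0)
y_true = rng.randint(0, 4, size=200)
y_pred = rng.randint(0, 4, size=200)
scores = plot_evaluate_classifier(y_pred, y_true,
                                  labels=['A', 'B', 'C', 'D'],
                                  title='demo', show_plot=False)
print(scores['F1micro'], scores['F1macro'], scores['F1weight'])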
Example #20
def evaluate(y_true, y_pred_proba=None, y_pred_log_proba=None,
             labels=None, title='', path=None,
             xlims=None, ylims=None, print_log=True):
  from odin.backend import to_llr
  from odin.backend.metrics import (det_curve, compute_EER, roc_curve,
                                    compute_Cavg, compute_Cnorm,
                                    compute_minDCF)

  def format_score(s):
    return ctext('%.4f' % s if is_number(s) else s, 'yellow')
  nb_classes = None
  # ====== check y_pred ====== #
  if y_pred_proba is None and y_pred_log_proba is None:
    raise ValueError("At least one of `y_pred_proba` or `y_pred_log_proba` "
                     "must not be None")
  y_pred_llr = to_llr(y_pred_proba) if y_pred_log_proba is None \
      else to_llr(y_pred_log_proba)
  nb_classes = y_pred_llr.shape[1]
  y_pred = np.argmax(y_pred_llr, axis=-1)
  # ====== check y_true ====== #
  if isinstance(y_true, (tuple, list)):
    y_true = np.array(y_true)
  if y_true.ndim == 2: # convert one-hot to labels
    y_true = np.argmax(y_true, axis=-1)
  # ====== check labels ====== #
  if labels is None:
    labels = [str(i) for i in range(nb_classes)]
  # ====== scoring ====== #
  if y_pred_proba is None:
    ll = 'unknown'
  else:
    ll = log_loss(y_true=y_true, y_pred=y_pred_proba)
  acc = accuracy_score(y_true=y_true, y_pred=y_pred)
  cm = confusion_matrix(y_true=y_true, y_pred=y_pred)
  # C_norm
  cnorm, cnorm_arr = compute_Cnorm(y_true=y_true,
                                   y_score=y_pred_llr,
                                   Ptrue=[0.1, 0.5],
                                   probability_input=False)
  if y_pred_log_proba is not None:
    cnorm_, cnorm_arr_ = compute_Cnorm(y_true=y_true,
                                       y_score=y_pred_log_proba,
                                       Ptrue=[0.1, 0.5],
                                       probability_input=False)
    if np.mean(cnorm) > np.mean(cnorm_): # smaller is better
      cnorm, cnorm_arr = cnorm_, cnorm_arr_
  # DET
  Pfa, Pmiss = det_curve(y_true=y_true, y_score=y_pred_llr)
  eer = compute_EER(Pfa=Pfa, Pmiss=Pmiss)
  minDCF = compute_minDCF(Pfa, Pmiss)[0]
  # PRINT LOG
  if print_log:
    print(ctext("--------", 'red'), ctext(title, 'cyan'))
    print("Log loss :", format_score(ll))
    print("Accuracy :", format_score(acc))
    print("C_norm   :", format_score(np.mean(cnorm)))
    print("EER      :", format_score(eer))
    print("minDCF   :", format_score(minDCF))
    print(print_confusion(arr=cm, labels=labels))
  # ====== save report to PDF files if necessary ====== #
  if path is not None:
    if y_pred_proba is None:
      y_pred_proba = y_pred_llr
    from matplotlib import pyplot as plt
    plt.figure(figsize=(nb_classes, nb_classes + 1))
    plot_confusion_matrix(cm, labels)
    # Cavg
    plt.figure(figsize=(nb_classes + 1, 3))
    plot_Cnorm(cnorm=cnorm_arr, labels=labels, Ptrue=[0.1, 0.5],
               fontsize=14)
    # binary classification
    if nb_classes == 2 and \
    (y_pred_proba.ndim == 1 or (y_pred_proba.ndim == 2 and
                                y_pred_proba.shape[1] == 1)):
      fpr, tpr = roc_curve(y_true=y_true, y_score=y_pred_proba.ravel())
      # det curve
      plt.figure()
      plot_detection_curve(Pfa, Pmiss, curve='det',
                           xlims=xlims, ylims=ylims, linewidth=1.2)
      # roc curve
      plt.figure()
      plot_detection_curve(fpr, tpr, curve='roc')
    # multiclasses
    else:
      y_true = one_hot(y_true, nb_classes=nb_classes)
      fpr_micro, tpr_micro, _ = roc_curve(y_true=y_true.ravel(),
                                          y_score=y_pred_proba.ravel())
      Pfa_micro, Pmiss_micro = Pfa, Pmiss
      fpr, tpr = [], []
      Pfa, Pmiss = [], []
      for i, yi in enumerate(y_true.T):
        curve = roc_curve(y_true=yi, y_score=y_pred_proba[:, i])
        fpr.append(curve[0])
        tpr.append(curve[1])
        curve = det_curve(y_true=yi, y_score=y_pred_llr[:, i])
        Pfa.append(curve[0])
        Pmiss.append(curve[1])
      plt.figure()
      plot_detection_curve(fpr_micro, tpr_micro, curve='roc',
                           linewidth=1.2, title="ROC Micro")
      plt.figure()
      plot_detection_curve(fpr, tpr, curve='roc',
                           labels=labels, linewidth=1.0,
                           title="ROC for each classes")
      plt.figure()
      plot_detection_curve(Pfa_micro, Pmiss_micro, curve='det',
                           xlims=xlims, ylims=ylims, linewidth=1.2,
                           title="DET Micro")
      plt.figure()
      plot_detection_curve(Pfa, Pmiss, curve='det',
                           xlims=xlims, ylims=ylims,
                           labels=labels, linewidth=1.0,
                           title="DET for each classes")
    plot_save(path)
Example #21
                 protocol=cPickle.HIGHEST_PROTOCOL)
print('========== Summary ==========')
for i, j in tk.summary.items():
    print(i, ':', j)

# ===========================================================================
# Build dataset
# ===========================================================================
X = tk.transform(texts,
                 mode='seq',
                 maxlen=MAX_SEQ_LEN,
                 end_document=None,
                 token_not_found='ignore')

y = [labels_set.index(i) for i in labels]
y = one_hot(np.array(y, dtype='int32'), nb_classes=nb_labels)

n = X.shape[0]
np.random.seed(1208)
idx = np.random.permutation(n)
X = X[idx]
y = y[idx]

X_train = X[:int(0.8 * n)]
y_train = y[:int(0.8 * n)]
X_valid = X[int(0.8 * n):]
y_valid = y[int(0.8 * n):]

print('X:', X.shape, 'y:', y.shape)
print('X_train:', X_train.shape, 'y_train:', y_train.shape)
print('X_valid:', X_valid.shape, 'y_valid:', y_valid.shape)
Example #22
    def create_dataset(self,
                       batch_size=64,
                       image_size=64,
                       drop_remainder=False,
                       shuffle=1000,
                       prefetch=tf.data.experimental.AUTOTUNE,
                       cache='',
                       parallel=tf.data.experimental.AUTOTUNE,
                       partition='train',
                       inc_labels=True,
                       seed=1) -> tf.data.Dataset:
        r"""
    Arguments:
      partition : {'train', 'valid', 'test', 'unlabeled'}
      inc_labels : a Boolean or Scalar. If True, return both image and label,
        otherwise, only the image is returned.
        If a scalar is provided, it indicates the fraction of labelled data
        in the mask.

    Return :
      tensorflow.data.Dataset :
        image - `(tf.float32, (None, 64, 64, 3))`
        label - `(tf.float32, (None, 10))`
        mask  - `(tf.bool, (None, 1))` if 0. < inc_labels < 1.
      where `mask=1` means labelled data and `mask=0` means unlabelled data
    """
        if isinstance(image_size, Number) and image_size == 96:
            image_size = None
        ### select partition
        images_path, labels_path = _partition(
            partition,
            train=(self.bin_files['train_X'], self.bin_files['train_y']),
            test=(self.bin_files['test_X'], self.bin_files['test_y']),
            unlabeled=(self.bin_files['unlabeled_X'], None),
        )
        X = np.reshape(np.fromfile(images_path, dtype=np.uint8),
                       (-1, ) + SLT10.IMAGE_SHAPE)
        if labels_path is None:  # unlabled data
            inc_labels = False
        inc_labels = float(inc_labels)
        gen = tf.random.experimental.Generator.from_seed(seed=seed)
        if inc_labels:
            y = np.fromfile(labels_path, dtype=np.uint8) - 1
            y = one_hot(y, len(self.class_names)).astype(np.float32)
        ### read and resize the data
        def resize(img):
            img = tf.clip_by_value(
                tf.cast(img, tf.float32) / 255., 1e-6, 1. - 1e-6)
            img = tf.transpose(img, perm=(1, 2, 0))
            if image_size is not None:
                img = tf.image.resize(img, (image_size, image_size),
                                      preserve_aspect_ratio=False,
                                      antialias=False)
            return img

        def masking(image, label):
            mask = gen.uniform(shape=(1, )) < inc_labels
            return dict(inputs=(image, label), mask=mask)

        ### processing
        images = tf.data.Dataset.from_tensor_slices(X).map(resize, parallel)
        if inc_labels:
            labels = tf.data.Dataset.from_tensor_slices(y)
            images = tf.data.Dataset.zip((images, labels))
            if 0. < inc_labels < 1.:  # semi-supervised mask
                images = images.map(masking)
        # cache data
        if cache is not None:
            images = images.cache(str(cache))
        # shuffle must be called after cache
        if shuffle is not None:
            images = images.shuffle(int(shuffle))
        images = images.batch(batch_size, drop_remainder)
        if prefetch is not None:
            images = images.prefetch(prefetch)
        return images
Example #23
def read_human_embryos(filtered_genes=True,
                       override=False,
                       verbose=True) -> SingleCellOMIC:
    r""" Transcriptional map of human embryo development, including the sequenced
    transcriptomes of 1529 individual cells from 88 human preimplantation
    embryos. These data show that cells undergo an intermediate state of
    co-expression of lineage-specific genes, followed by a concurrent
    establishment of the trophectoderm, epiblast, and primitive endoderm
    lineages, which coincide with blastocyst formation.

  References:
    Petropoulos S, Edsgärd D, Reinius B, et al. Single-Cell RNA-Seq Reveals
      Lineage and X Chromosome Dynamics in Human Preimplantation Embryos.
      Cell. 2016 Sep

  Note:
    Gene expression levels (RefSeq annotations) were estimated in terms of
      reads per kilobase exon model and per million mapped reads (RPKM)
      using rpkmforgenes
    Genes were filtered, keeping 15633/26178 genes that
      * were expressed in at least 5 out of 1919 sequenced cells (RPKM >= 10).
        and
      * for which cells with expression came from at least two
        different embryos.
    Cells were quality-filtered based on 4 criteria, keeping 1529/1919 cells.
      * First, Spearman correlations, using the RPKM expression levels of
        all genes, for every possible pair of cells were calculated and a
        histogram of the maximum correlation obtained for each cell,
        corresponding to the most similar cell, was used to identify 305
        outlier cells with a maximum pair-wise correlations below 0.63.
      * Second, a histogram of the number of expressed genes per cell was
        used to identify 330 outlier cells with less than 5000 expressed
        genes.
      * Third, a histogram of the total transcriptional expression output
        from the sex chromosomes (RPKM sum) was used to identify 33 cells
        with indeterminable sex, or a called sex that was inconsistent with
        other cells of that embryo
      * Fourth, 13 outlier cells were identified using PCA and t-SNE
        dimensionality reduction.

  """
    download_dir = os.path.join(DOWNLOAD_DIR, 'human_embryos')
    if not os.path.exists(download_dir):
        os.makedirs(download_dir)
    preprocessed_path = os.path.join(DATA_DIR, 'human_embryos_preprocessed')
    if override:
        shutil.rmtree(preprocessed_path)
        if verbose:
            print(f"Override preprocessed data at {preprocessed_path}")
    if not os.path.exists(preprocessed_path):
        os.makedirs(preprocessed_path)
    ### download data
    files = []
    for url, md5 in zip(_URLs, _MD5s):
        path = download_file(url=url,
                             filename=os.path.join(download_dir,
                                                   os.path.basename(url)),
                             override=False,
                             md5=md5)
        files.append(path)
    ### preprocessing
    if len(os.listdir(preprocessed_path)) == 0:
        data_map = {}
        for f in files:
            zipname = os.path.basename(f)
            with zipfile.ZipFile(f, mode="r") as f:
                for dat_file in f.filelist:
                    filename = dat_file.filename
                    dat = str(f.read(filename), 'utf-8')
                    x = []
                    for line in dat.split('\n'):
                        if len(line) == 0:
                            continue
                        line = line.split('\t')
                        x.append(line)
                    x = np.asarray(x).T
                    row_name = x[1:, 0]
                    col_name = x[0, 1:]
                    x = x[1:, 1:].astype(np.float32)
                    x = sparse.coo_matrix(x)
                    data_map[filename] = (x, row_name, col_name)
                    print(f"Read: {zipname} - {filename}")
                    print(f" * Matrix: {x.shape}")
                    print(f" * Row   : {row_name.shape}-{row_name[:3]}")
                    print(f" * Col   : {col_name.shape}-{col_name[:3]}")
        # save loaded data to disk
        for name, (x, row, col) in data_map.items():
            with open(os.path.join(preprocessed_path, f"{name}:x"), "wb") as f:
                sparse.save_npz(f, x)
            with open(os.path.join(preprocessed_path, f"{name}:row"),
                      "wb") as f:
                np.save(f, row)
            with open(os.path.join(preprocessed_path, f"{name}:col"),
                      "wb") as f:
                np.save(f, col)
        del data_map
    ### read the data
    # counts.txt (1529, 26178)
    # ercc.counts.txt (1529, 92)
    # rpkm.txt (1529, 26178)
    # ercc.rpkm.txt (1529, 92)
    data = {}
    genes_path = os.path.join(preprocessed_path, "filtered_genes")
    for path in os.listdir(preprocessed_path):
        if path == os.path.basename(genes_path):
            continue
        name, ftype = os.path.basename(path).split(':')
        with open(os.path.join(preprocessed_path, path), 'rb') as f:
            if ftype == 'x':
                x = sparse.load_npz(f).tocsr()
            else:
                x = np.load(f)
        data[f"{name}_{ftype}"] = x
    rpkm = data['rpkm.txt_x']
    counts = data['counts.txt_x']
    genes = data['counts.txt_col']
    cells = data['counts.txt_row']
    ### filter genes
    if not os.path.exists(genes_path):
        # filter genes by rpkm
        ids = np.asarray(np.sum(rpkm, axis=0) >= 10).ravel()
        rpkm = rpkm[:, ids]
        counts = counts[:, ids]
        genes = genes[ids]
        # filter genes by min 5 cells
        ids = np.asarray(np.sum(counts > 0, axis=0) >= 5).ravel()
        rpkm = rpkm[:, ids]
        counts = counts[:, ids]
        genes = genes[ids]
        # filter highly variable genes
        sco = SingleCellOMIC(X=counts, cell_id=cells, gene_id=genes)
        sco.normalize(omic=OMIC.transcriptomic, log1p=True)
        sco.filter_highly_variable_genes(n_top_genes=2000)
        filtered = sco.var_names.to_numpy()
        with open(genes_path, 'wb') as f:
            pickle.dump([genes, filtered], f)
        del sco
    else:
        with open(genes_path, 'rb') as f:
            ids, filtered = pickle.load(f)
        ids = set(ids)
        ids = np.asarray([i in ids for i in genes])
        rpkm = rpkm[:, ids]
        counts = counts[:, ids]
        genes = genes[ids]
    # last filtering
    if filtered_genes:
        filtered = set(filtered)
        ids = np.asarray([i in filtered for i in genes])
        rpkm = rpkm[:, ids]
        counts = counts[:, ids]
        genes = genes[ids]
    ### create the SingleCellOMIC
    sco = SingleCellOMIC(X=counts,
                         cell_id=cells,
                         gene_id=genes,
                         omic=OMIC.transcriptomic,
                         name="HumanEmbryos")
    sco.add_omic(omic=OMIC.rpkm, X=rpkm, var_names=genes)
    labels = ['.'.join(i.split('.')[:-2]) for i in sco.obs_names]
    labels = ['E7' if i == 'E7.4' else i for i in labels]
    labels_name = {j: i for i, j in enumerate(sorted(set(labels)))}
    labels = np.array([labels_name[i] for i in labels])
    sco.add_omic(omic=OMIC.celltype,
                 X=one_hot(labels, len(labels_name)),
                 var_names=list(labels_name.keys()))
    sco.add_omic(omic=OMIC.ercc,
                 X=data['ercc.counts.txt_x'],
                 var_names=data['ercc.counts.txt_col'])
    return sco
Example #24
  def __init__(self, path='~/tensorflow_datasets/mnist'):
    path = os.path.abspath(os.path.expanduser(path))
    save_path = os.path.join(path, 'mnist.npz')
    if not os.path.exists(path):
      os.makedirs(path)
    assert os.path.isdir(path)

    ## check exist processed file
    all_data = None
    if os.path.exists(save_path):
      if not os.path.isfile(save_path):
        raise ValueError("path to %s must be a file" % save_path)
      if md5_checksum(save_path) != MNIST.MD5:
        print("Miss match MD5 remove file at: ", save_path)
        os.remove(save_path)
      else:
        all_data = np.load(save_path)
    ## download and extract
    if all_data is None:
      from tqdm import tqdm

      def dl_progress(count, block_size, total_size):
        kB = block_size * count / 1024.
        prog.update(kB - prog.n)

      read32 = lambda b: np.frombuffer(
          b, dtype=np.dtype(np.uint32).newbyteorder('>'))[0]

      all_data = {}
      for name, url in MNIST.URL.items():
        basename = os.path.basename(url)
        zip_path = os.path.join(path, basename)
        prog = tqdm(desc="Downloading %s" % basename, unit='kB')
        urlretrieve(url, zip_path, dl_progress)
        prog.clear()
        prog.close()
        with gzip.open(zip_path, "rb") as f:
          magic = read32(f.read(4))
          if magic not in (2051, 2049):
            raise ValueError('Invalid magic number %d in MNIST image file: %s' %
                             (magic, zip_path))
          n = read32(f.read(4))
          # images
          if 'X_' in name:
            rows = read32(f.read(4))
            cols = read32(f.read(4))
            buf = f.read(rows * cols * n)
            data = np.frombuffer(buf, dtype=np.uint8)
            data = data.reshape(n, rows, cols, 1)
          # labels
          else:
            buf = f.read(n)
            data = np.frombuffer(buf, dtype=np.uint8)
            data = one_hot(data, 10)
          all_data[name] = data
      np.savez_compressed(save_path, **all_data)
    ## split train, valid, test
    rand = np.random.RandomState(seed=1)
    ids = rand.permutation(all_data['X_train'].shape[0])
    X_train = all_data['X_train'][ids]
    y_train = all_data['y_train'][ids]
    X_valid = X_train[:5000]
    y_valid = y_train[:5000]
    X_train = X_train[5000:]
    y_train = y_train[5000:]
    X_test = all_data['X_test']
    y_test = all_data['y_test']
    to_ds = lambda images, labels: tf.data.Dataset.zip(
        (tf.data.Dataset.from_tensor_slices(images),
         tf.data.Dataset.from_tensor_slices(labels)))
    self.train = to_ds(X_train, y_train)
    self.valid = to_ds(X_valid, y_valid)
    self.test = to_ds(X_test, y_test)
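Hypothetical usage of the loader above (the class name `MNIST` is inferred from the constants referenced inside the constructor): iterate one batch of image/label pairs from the training split.

# hypothetical call; downloads the raw MNIST files on first use
mnist = MNIST()
for image, label in mnist.train.batch(32).take(1):
    print(image.shape)   # (32, 28, 28, 1), uint8 pixels
    print(label.shape)   # (32, 10), one-hot labels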
Example #25
File: base.py Project: imito/odin
def evaluate(y_true, y_pred_proba=None, y_pred_log_proba=None,
             labels=None, title='', path=None,
             xlims=None, ylims=None, print_log=True):
  from odin.backend import to_llr
  from odin.backend.metrics import (det_curve, compute_EER, roc_curve,
                                    compute_Cavg, compute_Cnorm,
                                    compute_minDCF)

  def format_score(s):
    return ctext('%.4f' % s if is_number(s) else s, 'yellow')
  nb_classes = None
  # ====== check y_pred ====== #
  if y_pred_proba is None and y_pred_log_proba is None:
    raise ValueError("At least one of `y_pred_proba` or `y_pred_log_proba` "
                     "must not be None")
  y_pred_llr = to_llr(y_pred_proba) if y_pred_log_proba is None \
      else to_llr(y_pred_log_proba)
  nb_classes = y_pred_llr.shape[1]
  y_pred = np.argmax(y_pred_llr, axis=-1)
  # ====== check y_true ====== #
  if isinstance(y_true, Data):
    y_true = y_true.array
  if isinstance(y_true, (tuple, list)):
    y_true = np.array(y_true)
  if y_true.ndim == 2: # convert one-hot to labels
    y_true = np.argmax(y_true, axis=-1)
  # ====== check labels ====== #
  if labels is None:
    labels = [str(i) for i in range(nb_classes)]
  # ====== scoring ====== #
  if y_pred_proba is None:
    ll = 'unknown'
  else:
    ll = log_loss(y_true=y_true, y_pred=y_pred_proba)
  acc = accuracy_score(y_true=y_true, y_pred=y_pred)
  cm = confusion_matrix(y_true=y_true, y_pred=y_pred)
  # C_norm
  cnorm, cnorm_arr = compute_Cnorm(y_true=y_true,
                                   y_score=y_pred_llr,
                                   Ptrue=[0.1, 0.5],
                                   probability_input=False)
  if y_pred_log_proba is not None:
    cnorm_, cnorm_arr_ = compute_Cnorm(y_true=y_true,
                                       y_score=y_pred_log_proba,
                                       Ptrue=[0.1, 0.5],
                                       probability_input=False)
    if np.mean(cnorm) > np.mean(cnorm_): # smaller is better
      cnorm, cnorm_arr = cnorm_, cnorm_arr_
  # DET
  Pfa, Pmiss = det_curve(y_true=y_true, y_score=y_pred_llr)
  eer = compute_EER(Pfa=Pfa, Pmiss=Pmiss)
  minDCF = compute_minDCF(Pfa, Pmiss)[0]
  # PRINT LOG
  if print_log:
    print(ctext("--------", 'red'), ctext(title, 'cyan'))
    print("Log loss :", format_score(ll))
    print("Accuracy :", format_score(acc))
    print("C_norm   :", format_score(np.mean(cnorm)))
    print("EER      :", format_score(eer))
    print("minDCF   :", format_score(minDCF))
    print(print_confusion(arr=cm, labels=labels))
  # ====== save report to PDF files if necessary ====== #
  if path is not None:
    if y_pred_proba is None:
      y_pred_proba = y_pred_llr
    from matplotlib import pyplot as plt
    plt.figure(figsize=(nb_classes, nb_classes + 1))
    plot_confusion_matrix(cm, labels)
    # Cavg
    plt.figure(figsize=(nb_classes + 1, 3))
    plot_Cnorm(cnorm=cnorm_arr, labels=labels, Ptrue=[0.1, 0.5],
               fontsize=14)
    # binary classification
    if nb_classes == 2 and \
    (y_pred_proba.ndim == 1 or (y_pred_proba.ndim == 2 and
                                y_pred_proba.shape[1] == 1)):
      fpr, tpr = roc_curve(y_true=y_true, y_score=y_pred_proba.ravel())
      # det curve
      plt.figure()
      plot_detection_curve(Pfa, Pmiss, curve='det',
                           xlims=xlims, ylims=ylims, linewidth=1.2)
      # roc curve
      plt.figure()
      plot_detection_curve(fpr, tpr, curve='roc')
    # multiclasses
    else:
      y_true = one_hot(y_true, nb_classes=nb_classes)
      fpr_micro, tpr_micro, _ = roc_curve(y_true=y_true.ravel(),
                                          y_score=y_pred_proba.ravel())
      Pfa_micro, Pmiss_micro = Pfa, Pmiss
      fpr, tpr = [], []
      Pfa, Pmiss = [], []
      for i, yi in enumerate(y_true.T):
        curve = roc_curve(y_true=yi, y_score=y_pred_proba[:, i])
        fpr.append(curve[0])
        tpr.append(curve[1])
        curve = det_curve(y_true=yi, y_score=y_pred_llr[:, i])
        Pfa.append(curve[0])
        Pmiss.append(curve[1])
      plt.figure()
      plot_detection_curve(fpr_micro, tpr_micro, curve='roc',
                           linewidth=1.2, title="ROC Micro")
      plt.figure()
      plot_detection_curve(fpr, tpr, curve='roc',
                           labels=labels, linewidth=1.0,
                           title="ROC for each classes")
      plt.figure()
      plot_detection_curve(Pfa_micro, Pmiss_micro, curve='det',
                           xlims=xlims, ylims=ylims, linewidth=1.2,
                           title="DET Micro")
      plt.figure()
      plot_detection_curve(Pfa, Pmiss, curve='det',
                           xlims=xlims, ylims=ylims,
                           labels=labels, linewidth=1.0,
                           title="DET for each classes")
    plot_save(path)
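
The equal error rate reported above is read off the DET curve by compute_EER. As a rough, minimal numpy sketch of that idea (not the project's compute_EER, which may interpolate more carefully): the EER is the operating point where the false-alarm rate Pfa and the miss rate Pmiss cross.

import numpy as np

def eer_from_det(Pfa, Pmiss):
  # Pick the operating point where Pfa and Pmiss are closest, then average
  # the two values to smooth over the discrete grid of thresholds.
  Pfa, Pmiss = np.asarray(Pfa), np.asarray(Pmiss)
  idx = np.argmin(np.abs(Pfa - Pmiss))
  return (Pfa[idx] + Pmiss[idx]) / 2.0
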
Example #26
0
def read_mouse_ATLAS(filtered_genes=True,
                     override=False,
                     verbose=True) -> SingleCellOMIC:
    r""" sci-ATAC-seq, to profile genome-wide chromatin accessibility in ∼100,000
  single cells from 13 adult mouse tissues:

    - The regulatory landscape of adult mouse tissues mapped by single-cell
      chromatin assay
    - Characterization of 85 distinct chromatin patterns across 13 different
      tissues
    - Annotation of key regulators and regulatory sequences in diverse
      mammalian cell types
    - Dataset allows resolution of cell types underlying common human traits
      and diseases

  References:
    Cusanovich, D. A. et al. A Single-Cell Atlas of In Vivo Mammalian Chromatin
      Accessibility. Cell 174, 1309-1324.e18 (2018).
    Link https://atlas.gs.washington.edu/mouse-atac/
  """
    download_path = os.path.join(DOWNLOAD_DIR, "mouse_atac")
    preprocessed_path = os.path.join(DATA_DIR, "mouse_atac_preprocessed")
    if not os.path.exists(download_path):
        os.makedirs(download_path)
    if not os.path.exists(preprocessed_path):
        os.makedirs(preprocessed_path)
    ### Download data
    files = {}
    for name, (url, md5) in _URLs.items():
        filepath = os.path.join(download_path, os.path.basename(url))
        files[name] = download_file(url, filepath, override=False, md5=md5)
    ### save counts matrix
    path = os.path.join(preprocessed_path, 'counts')
    if not os.path.exists(path):
        print("Reading counts matrix ...")
        counts = mmread(files['counts'])
        counts: sparse.coo_matrix
        counts = counts.astype(np.uint8)
        with open(path, 'wb') as f:
            sparse.save_npz(f, counts, compressed=False)
    ### save metadata
    path = os.path.join(preprocessed_path, 'metadata')
    if not os.path.exists(path):
        with open(files['cellids'], 'r') as f:
            cell = np.array([i for i in f.read().split('\n') if len(i) > 0])
        with open(files['peakids'], 'r') as f:
            peak = np.array([i for i in f.read().split('\n') if len(i) > 0])
        metadata = pd.read_csv(files['metadata'], sep="\t")
        assert metadata.shape[0] == len(cell)
        tissue = metadata['tissue'].to_numpy()
        celltype = metadata['cell_label'].to_numpy()
        with open(path, 'wb') as f:
            np.savez(f, cell=cell, peak=peak, tissue=tissue, celltype=celltype)
    ### Read all data and create SCO
    counts = sparse.csr_matrix(
        sparse.load_npz(os.path.join(preprocessed_path, 'counts')))
    metadata = np.load(os.path.join(preprocessed_path, 'metadata'),
                       allow_pickle=True)
    cell = metadata['cell']
    peak = metadata['peak']
    tissue = metadata['tissue']
    celltype = metadata['celltype']
    # need to transpose here, counts matrix is [peaks, cells]
    sco = SingleCellOMIC(X=counts.T,
                         cell_id=cell,
                         gene_id=peak,
                         omic=OMIC.atac,
                         name="mouse_atlas")
    # add celltype
    labels = {name: i for i, name in enumerate(sorted(set(celltype)))}
    sco.add_omic(OMIC.celltype,
                 X=one_hot(np.array([labels[i] for i in celltype]),
                           len(labels)),
                 var_names=list(labels.keys()))
    # add tissue type
    labels = {name: i for i, name in enumerate(sorted(set(tissue)))}
    sco.add_omic(OMIC.tissue,
                 X=one_hot(np.array([labels[i] for i in tissue]), len(labels)),
                 var_names=list(labels.keys()))
    return sco
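
Both add_omic calls above use the same encoding pattern: build a sorted label vocabulary, map each string label to its integer index, then one-hot encode. A self-contained numpy sketch of that pattern (independent of the SingleCellOMIC API; the label values are made up):

import numpy as np

celltype = np.array(['Bcell', 'Tcell', 'Bcell', 'NK'])
labels = {name: i for i, name in enumerate(sorted(set(celltype)))}
indices = np.array([labels[c] for c in celltype])        # (4,) integer ids
onehot = np.eye(len(labels), dtype=np.float32)[indices]  # (4, 3) one-hot matrix
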
Example #27
0
def to_array(x):
    """ pytorch tensor to numpy array """
    if hasattr(x, 'todense'):
        return np.array(x.todense())
    if hasattr(x, 'cpu'):
        return x.data.cpu().numpy()
    return x
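
# Hedged usage sketch for to_array (illustrative, not part of the original
# script): scipy sparse matrices are densified via .todense(), torch tensors
# are moved to host memory via .cpu(), and plain numpy arrays pass through:
#   to_array(sparse.csr_matrix(np.eye(3)))  # -> (3, 3) numpy array
#   to_array(torch.zeros(2, 2))             # -> (2, 2) numpy array
#   to_array(np.arange(4))                  # returned unchanged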


# Load dataset
cortex = CortexDataset(save_path=SAVE_DATA_PATH)
X = cortex.X
labels = cortex.cell_types
n_labels = len(labels)
Y = one_hot(cortex.labels.ravel(), n_labels)

# ===========================================================================
# scVI
# ===========================================================================
scvi = VAE(n_input=cortex.nb_genes,
           n_batch=0,
           n_labels=0,
           n_hidden=n_hidden,
           n_latent=n_latent,
           n_layers=n_layer,
           dispersion=dispersion,
           dropout_rate=dropout_rate,
           log_variational=log_variational)
trainer = UnsupervisedTrainer(model=scvi,
                              gene_dataset=cortex,
Example #28
0
  def create_dataset(self,
                     batch_size=64,
                     drop_remainder=False,
                     shuffle=1000,
                     prefetch=tf.data.experimental.AUTOTUNE,
                     cache='',
                     parallel=tf.data.experimental.AUTOTUNE,
                     partition='train',
                     inc_labels=True,
                     seed=1) -> tf.data.Dataset:
    r"""
    Arguments:
      partition : {'train', 'train_labelled', 'valid', 'test', 'unlabelled'}
        - 'train' : combination of both the train and unlabelled sets
        - 'train_labelled' : only the labelled train data
      inc_labels : a Boolean or Scalar. If True, return both image and label,
        otherwise, only the image is returned.
        If a scalar is provided, it indicates the fraction of labelled data
        in the mask.

    Return :
      tensorflow.data.Dataset :
        image - `(tf.float32, (None, 64, 64, 3))`
        label - `(tf.float32, (None, 10))`
        mask  - `(tf.bool, (None, 1))` if 0. < inc_labels < 1.
      where `mask=1` means labelled data and `mask=0` means unlabelled data
    """
    image_size = self.image_size
    if isinstance(image_size, Number) and image_size == 96:
      image_size = None
    ### select partition
    images_path, labels_path = get_partition(
        partition,
        train=((self.bin_files['train_X'], self.bin_files['unlabeled_X']),
               self.bin_files['train_y']),
        train_labelled=(self.bin_files['train_X'], self.bin_files['train_y']),
        test=(self.bin_files['test_X'], self.bin_files['test_y']),
        unlabeled=(self.bin_files['unlabeled_X'], None),
        unlabelled=(self.bin_files['unlabeled_X'], None),
    )

    X = [
        np.reshape(np.fromfile(path, dtype=np.uint8), (-1,) + STL10.IMAGE_SHAPE)
        for path in tf.nest.flatten(images_path)
    ]
    is_unlabelled = (labels_path is None)
    inc_labels = float(inc_labels)
    gen = tf.random.experimental.Generator.from_seed(seed=seed)
    # load the labels
    if inc_labels:
      if is_unlabelled:
        y = [np.zeros(shape=(X[0].shape[0], self.n_labels), dtype=np.float32)]
      else:
        y = np.fromfile(labels_path, dtype=np.uint8) - 1
        y = [one_hot(y, self.n_labels).astype(np.float32)]
        if len(X) == 2:  # combination of both the train and unlabelled sets
          y.append(
              np.zeros(shape=(X[1].shape[0], self.n_labels), dtype=np.float32))
      assert len(y) == len(X)

    ### read and resize the data
    def resize(img):
      img = tf.cast(img, tf.float32)
      img = self.normalize_255(img)
      img = tf.transpose(img, perm=(2, 1, 0))
      if image_size is not None:
        img = tf.image.resize(img, (image_size, image_size),
                              preserve_aspect_ratio=True,
                              antialias=False)
      return img

    def masking(image, label):
      mask = tf.logical_and(
          gen.uniform(shape=(1,)) < inc_labels,
          tf.reduce_sum(label) > 0.)
      return dict(inputs=(image, label), mask=mask)

    ### processing
    datasets = None
    must_masking = inc_labels > 0. and any(np.all(i == 0.) for i in y)
    for x_i, y_i in zip(X, y if inc_labels else X):
      images = tf.data.Dataset.from_tensor_slices(x_i).map(resize, parallel)
      if inc_labels:
        labels = tf.data.Dataset.from_tensor_slices(y_i)
        images = tf.data.Dataset.zip((images, labels))
        if 0. < inc_labels < 1. or must_masking:  # semi-supervised mask
          images = images.map(masking)
      datasets = images if datasets is None else datasets.concatenate(images)
    # cache data
    if cache is not None:
      datasets = datasets.cache(str(cache))
    # shuffle must be called after cache
    if shuffle is not None and shuffle > 0:
      datasets = datasets.shuffle(int(shuffle) * len(X))
    datasets = datasets.batch(batch_size, drop_remainder)
    if prefetch is not None:
      datasets = datasets.prefetch(prefetch)
    # return
    return datasets
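
# Illustrative sketch (added for clarity, not part of the original class) of
# the semi-supervised masking described in the docstring above: each
# (image, label) pair receives a boolean mask so that roughly `inc_labels`
# of the examples keep their labels and the rest count as unlabelled.
import numpy as np
import tensorflow as tf

inc_labels = 0.1  # keep labels for ~10% of the examples
gen = tf.random.experimental.Generator.from_seed(1)

def _mask(image, label):
  keep = gen.uniform(shape=(1,)) < inc_labels
  return dict(inputs=(image, label), mask=keep)

images = tf.data.Dataset.from_tensor_slices(
    np.zeros((8, 64, 64, 3), dtype=np.float32))
labels = tf.data.Dataset.from_tensor_slices(
    np.eye(10, dtype=np.float32)[np.arange(8) % 10])
toy_ds = tf.data.Dataset.zip((images, labels)).map(_mask).batch(4)
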
def _celltypes(y):
  r""" Convert an array of cell-type name strings into one-hot labels and
  cleaned, lower-cased label names """
  labels = sorted(np.unique(y))
  index = {name: i for i, name in enumerate(labels)}
  y = one_hot(np.array([index[i] for i in y], dtype=np.int32),
              nb_classes=len(labels))
  return y, [i.replace("_Like", '').lower() for i in labels]
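
For reference, a hypothetical call to the helper above (the cell-type strings are made up; `np` and `one_hot` come from the surrounding module):

y_onehot, names = _celltypes(np.array(['B_Like', 'T_Like', 'B_Like']))
# y_onehot is a (3, 2) one-hot matrix; names == ['b', 't']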