Example #1
    def create_dataset(self,
                       input_files: OneOrMany[str],
                       data_dir: Optional[str] = None,
                       shard_size: Optional[int] = None) -> DiskDataset:
        """Creates a `Dataset` from input FASTA files.

    At present, FASTA support is limited: it only allows one-hot
    featurization and doesn't support sharding.

    Parameters
    ----------
    input_files: OneOrMany[str]
      FASTA file or list of FASTA files.
    data_dir: str, optional (default None)
      Name of directory where featurized data is stored.
    shard_size: int, optional (default None)
      For now, this argument is ignored and each FASTA file gets its
      own shard.

    Returns
    -------
    A `Dataset` object containing a featurized representation of data
    from `input_files`.
    """
        if isinstance(input_files, str):
            input_files = [input_files]

        def shard_generator():
            for input_file in input_files:
                X = encode_fasta_sequence(input_file)
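                # FASTA records carry no explicit ids, so placeholder ids are used.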
                ids = np.ones(len(X))
                # (X, y, w, ids)
                yield X, None, None, ids

        return DiskDataset.create_dataset(shard_generator(), data_dir)
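
A minimal usage sketch for this method, assuming DeepChem's `FASTALoader` exposes it as shown above (the file name is illustrative):

    import deepchem as dc

    # Hypothetical input; a single FASTA file or a list of files both work.
    loader = dc.data.FASTALoader()
    dataset = loader.create_dataset("sequences.fasta")
    print(dataset.X.shape)  # one-hot encoded sequences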
Example #2
    def _to_singletask(dataset, task_dirs):
        """Transforms a multitask dataset to a collection of singletask datasets."""
        tasks = dataset.get_task_names()
        assert len(tasks) == len(task_dirs)
        log("Splitting multitask dataset into singletask datasets",
            dataset.verbose)
        task_datasets = [
            DiskDataset.create_dataset([], task_dirs[task_num], [task])
            for (task_num, task) in enumerate(tasks)
        ]
        for shard_num, (X, y, w, ids) in enumerate(dataset.itershards()):
            log("Processing shard %d" % shard_num, dataset.verbose)
            for task_num, task in enumerate(tasks):
                log("\tTask %s" % task, dataset.verbose)
                w_task = w[:, task_num]
                y_task = y[:, task_num]

                # Extract those datapoints which are present for this task
                X_nonzero = X[w_task != 0]
                num_datapoints = X_nonzero.shape[0]
                y_nonzero = np.reshape(y_task[w_task != 0],
                                       (num_datapoints, 1))
                w_nonzero = np.reshape(w_task[w_task != 0],
                                       (num_datapoints, 1))
                ids_nonzero = ids[w_task != 0]

                task_datasets[task_num].add_shard(X_nonzero, y_nonzero,
                                                  w_nonzero, ids_nonzero)

        return task_datasets
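
To make the weight-mask step concrete, here is a small self-contained sketch of how the per-task weight column selects the datapoints present for a task (toy arrays, not library code):

    import numpy as np

    # Toy multitask shard: 4 datapoints, 2 tasks.
    X = np.arange(8).reshape(4, 2)
    y = np.array([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6], [0.7, 0.8]])
    w = np.array([[1, 0], [1, 1], [0, 1], [1, 0]])

    task_num = 0
    w_task = w[:, task_num]     # [1, 1, 0, 1]: datapoint 2 is absent for task 0
    X_nonzero = X[w_task != 0]  # keeps rows 0, 1, 3
    y_nonzero = y[w_task != 0, task_num].reshape(-1, 1)  # shape (3, 1)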
Example #3
    def featurize(self, input_files, data_dir=None, shard_size=8192):
        """Featurize provided files and write to specified location."""
        log("Loading raw samples now.", self.verbose)
        log("shard_size: %d" % shard_size, self.verbose)

        if not isinstance(input_files, list):
            input_files = [input_files]

        def shard_generator():
            for shard_num, shard in enumerate(
                    self.get_shards(input_files, shard_size)):
                time1 = time.time()
                X, valid_inds = self.featurize_shard(shard)
                ids, y, w = convert_df_to_numpy(shard, self.tasks,
                                                self.id_field)
                # Filter out examples where featurization failed.
                ids, y, w = (ids[valid_inds], y[valid_inds], w[valid_inds])
                assert len(X) == len(ids) == len(y) == len(w)
                time2 = time.time()
                log(
                    "TIMING: featurizing shard %d took %0.3f s" %
                    (shard_num, time2 - time1), self.verbose)
                yield X, y, w, ids

        return DiskDataset.create_dataset(shard_generator(), data_dir,
                                          self.tasks)
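
The contract with `DiskDataset.create_dataset` is simply an iterable of `(X, y, w, ids)` tuples, one tuple per shard. A sketch with synthetic data (shapes and task names are illustrative):

    import numpy as np
    from deepchem.data import DiskDataset

    def toy_shard_generator(n_shards=3, shard_size=10, n_tasks=2):
      for shard_num in range(n_shards):
        X = np.random.rand(shard_size, 4)
        y = np.random.rand(shard_size, n_tasks)
        w = np.ones((shard_size, n_tasks))
        # Offset ids so they stay unique across shards.
        ids = np.arange(shard_num * shard_size, (shard_num + 1) * shard_size)
        yield X, y, w, ids

    dataset = DiskDataset.create_dataset(toy_shard_generator(),
                                         tasks=["task0", "task1"])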
Example #4
  def _to_singletask(dataset, task_dirs):
    """Transforms a multitask dataset to a collection of singletask datasets."""
    tasks = dataset.get_task_names()
    assert len(tasks) == len(task_dirs)
    log("Splitting multitask dataset into singletask datasets", dataset.verbose)
    task_datasets = [DiskDataset.create_dataset([], task_dirs[task_num], [task])
                    for (task_num, task) in enumerate(tasks)]
    for shard_num, (X, y, w, ids) in enumerate(dataset.itershards()):
      log("Processing shard %d" % shard_num, dataset.verbose)
      for task_num, task in enumerate(tasks):
        log("\tTask %s" % task, dataset.verbose)
        w_task = w[:, task_num]
        y_task = y[:, task_num]

        # Extract those datapoints which are present for this task
        X_nonzero = X[w_task != 0]
        num_datapoints = X_nonzero.shape[0]
        y_nonzero = np.reshape(y_task[w_task != 0], (num_datapoints, 1))
        w_nonzero = np.reshape(w_task[w_task != 0], (num_datapoints, 1))
        ids_nonzero = ids[w_task != 0]

        task_datasets[task_num].add_shard(X_nonzero, y_nonzero, w_nonzero,
                                          ids_nonzero)

    return task_datasets
Example #5
  def featurize(self, input_files, data_dir=None, shard_size=8192):
    """Featurize provided files and write to specified location."""
    log("Loading raw samples now.", self.verbose)
    log("shard_size: %d" % shard_size, self.verbose)

    if not isinstance(input_files, list):
      input_files = [input_files]

    def shard_generator():
      for shard_num, shard in enumerate(
          self.get_shards(input_files, shard_size)):
        time1 = time.time()
        X, valid_inds = self.featurize_shard(shard)
        ids = shard[self.id_field].values
        ids = ids[valid_inds]
        if len(self.tasks) > 0:
          # Featurize task results iff they exist.
          y, w = convert_df_to_numpy(shard, self.tasks, self.id_field)
          # Filter out examples where featurization failed.
          y, w = (y[valid_inds], w[valid_inds])
          assert len(X) == len(ids) == len(y) == len(w)
        else:
          # For prospective data where results are unknown, it makes
          # no sense to have y values or weights.
          y, w = (None, None)
          assert len(X) == len(ids)

        time2 = time.time()
        log("TIMING: featurizing shard %d took %0.3f s" %
            (shard_num, time2 - time1), self.verbose)
        yield X, y, w, ids

    return DiskDataset.create_dataset(
        shard_generator(), data_dir, self.tasks, verbose=self.verbose)
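
When `self.tasks` is empty, the generator yields `y = w = None` and the resulting dataset stores only features and ids. A hedged usage sketch (the loader class, field names, and file name are illustrative):

    import deepchem as dc

    # Prospective screening data: structures exist but no measured labels yet.
    loader = dc.data.CSVLoader(tasks=[],
                               feature_field="smiles",
                               featurizer=dc.feat.CircularFingerprint())
    dataset = loader.create_dataset("candidates.csv")
    # dataset.X and dataset.ids are populated; no labels or weights are stored.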
Example #6
    def create_dataset(self, input_files, data_dir=None, shard_size=8192):
        """Creates and returns a `Dataset` object by featurizing provided files.

    Reads in `input_files` and uses `self.featurizer` to featurize the
    data in these input files.  For large files, automatically shards
    into smaller chunks of `shard_size` datapoints for convenience.
    Returns a `Dataset` object that contains the featurized dataset.

    This implementation assumes that the helper methods `_get_shards`
    and `_featurize_shard` are implemented and that each shard
    returned by `_get_shards` is a pandas dataframe.  You may choose
    to reuse or override this method in your subclass implementations.

    Parameters
    ----------
    input_files: list
      List of input filenames.
    data_dir: str, optional
      Directory to store featurized dataset.
    shard_size: int, optional
      Number of examples stored in each shard.

    Returns
    -------
    A `Dataset` object containing a featurized representation of data
    from `input_files`.
    """
        logger.info("Loading raw samples now.")
        logger.info("shard_size: %d" % shard_size)

        if not isinstance(input_files, list):
            input_files = [input_files]

        def shard_generator():
            for shard_num, shard in enumerate(
                    self._get_shards(input_files, shard_size)):
                time1 = time.time()
                X, valid_inds = self._featurize_shard(shard)
                ids = shard[self.id_field].values
                ids = ids[valid_inds]
                if len(self.tasks) > 0:
                    # Featurize task results iff they exist.
                    y, w = _convert_df_to_numpy(shard, self.tasks)
                    # Filter out examples where featurization failed.
                    y, w = (y[valid_inds], w[valid_inds])
                    assert len(X) == len(ids) == len(y) == len(w)
                else:
                    # For prospective data where results are unknown, it
                    # makes no sense to have y values or weights.
                    y, w = (None, None)
                    assert len(X) == len(ids)

                time2 = time.time()
                logger.info("TIMING: featurizing shard %d took %0.3f s" %
                            (shard_num, time2 - time1))
                yield X, y, w, ids

        return DiskDataset.create_dataset(shard_generator(), data_dir,
                                          self.tasks)
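
As the docstring notes, subclasses supply `_get_shards` and `_featurize_shard`, with each shard being a pandas dataframe. A schematic subclass (the class, the CSV chunking, and the per-row featurization are illustrative, not one of the library's real loaders):

    import numpy as np
    import pandas as pd
    from deepchem.data import DataLoader

    class ToyCSVLoader(DataLoader):
      """Illustrative subclass for exposition only."""

      def _get_shards(self, input_files, shard_size):
        # Stream each file in chunks of `shard_size` rows as dataframes.
        for input_file in input_files:
          for chunk in pd.read_csv(input_file, chunksize=shard_size):
            yield chunk

      def _featurize_shard(self, shard):
        # self.featurizer is assumed to return a feature vector per row,
        # or None when featurization fails.
        feats = [self.featurizer(row["smiles"]) for _, row in shard.iterrows()]
        valid_inds = np.array([f is not None for f in feats])
        X = np.array([f for f in feats if f is not None])
        return X, valid_inds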
Example #7
    def create_dataset(self,
                       inputs: Sequence[Any],
                       data_dir: Optional[str] = None,
                       shard_size: Optional[int] = 8192) -> DiskDataset:
        """Creates and returns a `Dataset` object by featurizing provided files.

    Reads in `inputs` and uses `self.featurizer` to featurize the
    data in these input files.  For large files, automatically shards
    into smaller chunks of `shard_size` datapoints for convenience.
    Returns a `Dataset` object that contains the featurized dataset.

    This implementation assumes that the helper methods `_get_shards`
    and `_featurize_shard` are implemented and that each shard
    returned by `_get_shards` is a pandas dataframe.  You may choose
    to reuse or override this method in your subclass implementations.

    Parameters
    ----------
    inputs: Sequence[Any]
      List of inputs to process. Entries can be arbitrary objects so long as
      they are understood by `self.featurizer`
    data_dir: str, optional (default None)
      Directory to store featurized dataset.
    shard_size: int, optional (default 8192)
      Number of examples stored in each shard.

    Returns
    -------
    DiskDataset
      A `DiskDataset` object containing a featurized representation of data
      from `inputs`.
    """
        logger.info("Loading raw samples now.")
        logger.info("shard_size: %s" % str(shard_size))

        if not isinstance(inputs, list):
            try:
                inputs = list(inputs)
            except TypeError:
                inputs = [inputs]

        def shard_generator():
            global_index = 0
            for shard_num, shard in enumerate(
                    self._get_shards(inputs, shard_size)):
                time1 = time.time()
                X, y, w, ids = self._featurize_shard(shard, global_index)
                global_index += len(shard)

                time2 = time.time()
                logger.info("TIMING: featurizing shard %d took %0.3f s" %
                            (shard_num, time2 - time1))
                yield X, y, w, ids

        return DiskDataset.create_dataset(shard_generator(), data_dir,
                                          self.tasks)
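
Here `global_index` carries the dataset-wide offset of each shard so that `_featurize_shard` can assign ids that are unique across shards. A sketch of a compatible helper method on the loader subclass (the id scheme is illustrative):

    import numpy as np

    def _featurize_shard(self, shard, global_index):
      X = self.featurizer(shard)  # featurize the raw inputs in this shard
      n = len(shard)
      # Offset by global_index so ids never collide across shards.
      ids = np.arange(global_index, global_index + n)
      return X, None, None, ids  # raw inputs carry no labels or weights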
Example #8
    def featurize(self, input_files, data_dir=None, shard_size=8192):
        """Featurize provided files and write to specified location.
    
    For large datasets, automatically shards into smaller chunks
    for convenience.

    Parameters
    ----------
    input_files: list
      List of input filenames.
    data_dir: str
      (Optional) Directory to store featurized dataset.
    shard_size: int
      (Optional) Number of examples stored in each shard.
    """
        log("Loading raw samples now.", self.verbose)
        log("shard_size: %d" % shard_size, self.verbose)

        if not isinstance(input_files, list):
            input_files = [input_files]

        def shard_generator():
            for shard_num, shard in enumerate(
                    self.get_shards(input_files, shard_size)):
                time1 = time.time()
                X, valid_inds = self.featurize_shard(shard)
                ids = shard[self.id_field].values
                ids = ids[valid_inds]
                if len(self.tasks) > 0:
                    # Featurize task results iff they exist.
                    y, w = convert_df_to_numpy(shard, self.tasks,
                                               self.id_field)
                    # Filter out examples where featurization failed.
                    y, w = (y[valid_inds], w[valid_inds])
                    assert len(X) == len(ids) == len(y) == len(w)
                else:
                    # For prospective data where results are unknown, it makes
                    # no sense to have y values or weights.
                    y, w = (None, None)
                    assert len(X) == len(ids)

                time2 = time.time()
                log(
                    "TIMING: featurizing shard %d took %0.3f s" %
                    (shard_num, time2 - time1), self.verbose)
                yield X, y, w, ids

        return DiskDataset.create_dataset(shard_generator(),
                                          data_dir,
                                          self.tasks,
                                          verbose=self.verbose)
Example #9
  def featurize(self, input_files, data_dir=None, shard_size=8192):
    """Featurize provided files and write to specified location."""
    log("Loading raw samples now.", self.verbose)
    log("shard_size: %d" % shard_size, self.verbose)

    if not isinstance(input_files, list):
      input_files = [input_files]

    def shard_generator():
      for shard_num, shard in enumerate(self.get_shards(input_files, shard_size)):
        time1 = time.time()
        X, valid_inds = self.featurize_shard(shard)
        ids, y, w = convert_df_to_numpy(shard, self.tasks, self.id_field)  
        # Filter out examples where featurization failed.
        ids, y, w = (ids[valid_inds], y[valid_inds], w[valid_inds])
        assert len(X) == len(ids) == len(y) == len(w)
        time2 = time.time()
        log("TIMING: featurizing shard %d took %0.3f s" % (shard_num, time2-time1),
            self.verbose)
        yield X, y, w, ids
    return DiskDataset.create_dataset(shard_generator(), data_dir, self.tasks)
Example #10
  def featurize(self, input_files, data_dir=None):
    """Featurizes fasta files.

    Parameters
    ----------
    input_files: list
      List of FASTA files.
    data_dir: str
      (Optional) Name of directory where featurized data is stored.
    """
    if not isinstance(input_files, list):
      input_files = [input_files]

    def shard_generator():
      for input_file in input_files:
        X = encode_fasta_sequence(input_file)
        ids = np.ones(len(X))
        # (X, y, w, ids)
        yield X, None, None, ids

    return DiskDataset.create_dataset(shard_generator(), data_dir)
Example #11
    def create_dataset(self,
                       input_files: OneOrMany[str],
                       data_dir: Optional[str] = None,
                       shard_size: Optional[int] = 8192) -> DiskDataset:
        """Creates a `Dataset` from input JSON files.

    Parameters
    ----------
    input_files: OneOrMany[str]
      List of JSON filenames.
    data_dir: Optional[str], default None
      Name of directory where featurized data is stored.
    shard_size: Optional[int], default 8192
      Shard size when loading data.

    Returns
    -------
    dataset: dc.data.Dataset
      A `Dataset` object containing a featurized representation of data
      from `input_files`.

    """
        if not isinstance(input_files, list):
            try:
                if isinstance(input_files, str):
                    input_files = [input_files]
                else:
                    input_files = list(input_files)
            except TypeError:
                raise ValueError(
                    "input_files is of an unrecognized form. Must be one filename or a list of filenames."
                )

        def shard_generator():
            """Yield X, y, w, and ids for shards."""
            for shard_num, shard in enumerate(
                    self._get_shards(input_files, shard_size)):

                time1 = time.time()
                X, valid_inds = self._featurize_shard(shard)
                if self.id_field:
                    ids = shard[self.id_field].values
                else:
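                    # No id column configured; fall back to placeholder ids.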
                    ids = np.ones(len(X))
                ids = ids[valid_inds]

                if len(self.tasks) > 0:
                    # Featurize task results if they exist.
                    y, w = _convert_df_to_numpy(shard, self.tasks)

                    if self.label_field:
                        y = shard[self.label_field]
                    if self.weight_field:
                        w = shard[self.weight_field]

                    # Filter out examples where featurization failed.
                    y, w = (y[valid_inds], w[valid_inds])
                    assert len(X) == len(ids) == len(y) == len(w)
                else:
                    # For prospective data where results are unknown, it
                    # makes no sense to have y values or weights.
                    y, w = (None, None)
                    assert len(X) == len(ids)

                time2 = time.time()
                logger.info("TIMING: featurizing shard %d took %0.3f s" %
                            (shard_num, time2 - time1))
                yield X, y, w, ids

        return DiskDataset.create_dataset(shard_generator(), data_dir)
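
A usage sketch, assuming a `JsonLoader`-style class configured with the fields used above (the file name, column names, and featurizer are illustrative):

    import deepchem as dc

    loader = dc.data.JsonLoader(tasks=["activity"],
                                feature_field="smiles",
                                id_field="compound_id",
                                label_field="activity",
                                featurizer=dc.feat.CircularFingerprint())
    dataset = loader.create_dataset("records.json", shard_size=4096)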
Example #12
  def create_dataset(self,
                     input_files: OneOrMany[str],
                     data_dir: Optional[str] = None,
                     shard_size: Optional[int] = None) -> DiskDataset:
    """Creates a `Dataset` from input FASTA files.

    At present, FASTA support is limited and doesn't allow for sharding.

    Parameters
    ----------
    input_files: OneOrMany[str]
      FASTA file or list of FASTA files.
    data_dir: str, optional (default None)
      Name of directory where featurized data is stored.
    shard_size: int, optional (default None)
      For now, this argument is ignored and each FASTA file gets its
      own shard.

    Returns
    -------
    DiskDataset
      A `DiskDataset` object containing a featurized representation of data
      from `input_files`.
    """
    if isinstance(input_files, str):
      input_files = [input_files]

    def shard_generator():  # TODO Enable sharding with shard size parameter
      for input_file in input_files:
        if self.legacy:
          X = encode_bio_sequence(input_file)
        else:
          sequences = _read_file(input_file)
          X = self.featurizer(sequences)
        ids = np.ones(len(X))
        # (X, y, w, ids)
        yield X, None, None, ids

    def _read_file(input_file: str):
      """
      Convert the FASTA file to a numpy array of FASTA-format strings.
      """

      # TODO don't convert all sequences into np array (allow shards)
      def _generate_sequences(fasta_file, header_mark=">") -> np.ndarray:
        """
        Uses a fasta_file to create a numpy array of annotated FASTA-format strings
        """
        sequences: np.ndarray = np.array([])
        sequence: np.ndarray = np.array([])
        header_read = False
        for line in fasta_file:
          # Check if line is a header
          if line.startswith(header_mark):  # New header line
            header_read = True
            sequences = _add_sequence(sequences, sequence)
            sequence = np.array([])
          elif header_read:  # Line contains sequence in FASTA format
            if line[-1:] == '\n':  # Check last character in string
              line = line[0:-1]  # Remove last character
            sequence = np.append(sequence, line)
        sequences = _add_sequence(sequences, sequence)  # Add last sequence
        return sequences

      def _add_sequence(sequences: np.ndarray,
                        sequence: np.ndarray) -> np.ndarray:
        # Skip empty sequences (e.g. the span before the first header)
        # without discarding the sequences collected so far.
        if sequence is None or len(sequence) <= 0:
          # TODO log attempts to add empty sequences every shard
          return sequences
        # Annotate start/stop of sequence
        if self.auto_add_annotations:
          sequence = np.insert(sequence, 0, "[CLS]")
          sequence = np.append(sequence, "[SEP]")
        new_sequence = ''.join(sequence)
        new_sequences = np.append(sequences, new_sequence)
        return new_sequences

      with open(input_file, 'r') as f:  # Read FASTA file
        return _generate_sequences(f)

    return DiskDataset.create_dataset(shard_generator(), data_dir)
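
To illustrate what `_generate_sequences` produces, a small trace over an in-memory FASTA snippet (for exposition; the loader itself reads from files):

    import io

    fasta = io.StringIO(">seq1\nACGT\nTT\n>seq2\nGGCC\n")
    # _generate_sequences(fasta) joins the lines of each record:
    #   ['ACGTTT', 'GGCC']
    # With auto_add_annotations=True, each entry is wrapped instead as
    #   '[CLS]ACGTTT[SEP]', '[CLS]GGCC[SEP]'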
Example #13
  def create_dataset(self,
                     inputs: OneOrMany[Any],
                     data_dir: Optional[str] = None,
                     shard_size: Optional[int] = 8192) -> Dataset:
    """Creates and returns a `Dataset` object by featurizing provided sdf files.

    Parameters
    ----------
    inputs: OneOrMany[Any]
      Input filename or list of filenames to process. Each file must be an
      .sdf file or a .zip archive containing .sdf files.
    data_dir: str, optional (default None)
      Directory to store featurized dataset.
    shard_size: int, optional (default 8192)
      Number of examples stored in each shard.

    Returns
    -------
    DiskDataset
      A `DiskDataset` object containing a featurized representation of data
      from `inputs`.
    """
    logger.info("Loading raw samples now.")
    logger.info("shard_size: %s" % str(shard_size))

    # Special case handling of single input
    if not isinstance(inputs, list):
      inputs = [inputs]

    processed_files = []
    for input_file in inputs:
      filename, extension = os.path.splitext(input_file)
      extension = extension.lower()
      if extension == ".sdf":
        processed_files.append(input_file)
      elif extension == ".zip":
        zip_dir = tempfile.mkdtemp()
        with zipfile.ZipFile(input_file, 'r') as zip_ref:
          zip_ref.extractall(path=zip_dir)
          zip_files = [
              os.path.join(zip_dir, name) for name in zip_ref.namelist()
          ]
        for zip_file in zip_files:
          _, extension = os.path.splitext(zip_file)
          extension = extension.lower()
          if extension in [".sdf"]:
            processed_files.append(zip_file)
      else:
        raise ValueError("Unsupported file format: %s" % extension)

    inputs = processed_files

    def shard_generator():
      for shard_num, shard in enumerate(self._get_shards(inputs, shard_size)):
        time1 = time.time()
        X, valid_inds = self._featurize_shard(shard)
        ids = shard[self.id_field].values
        ids = ids[valid_inds]
        if len(self.tasks) > 0:
          # Featurize task results iff they exist.
          y, w = _convert_df_to_numpy(shard, self.tasks)
          # Filter out examples where featurization failed.
          y, w = (y[valid_inds], w[valid_inds])
          assert len(X) == len(ids) == len(y) == len(w)
        else:
          # For prospective data where results are unknown, it
          # makes no sense to have y values or weights.
          y, w = (None, None)
          assert len(X) == len(ids)

        time2 = time.time()
        logger.info("TIMING: featurizing shard %d took %0.3f s" %
                    (shard_num, time2 - time1))
        yield X, y, w, ids

    return DiskDataset.create_dataset(shard_generator(), data_dir, self.tasks)
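
A usage sketch, assuming an `SDFLoader`-style class built around this method (file names and featurizer are illustrative):

    import deepchem as dc

    loader = dc.data.SDFLoader(tasks=["logp"],
                               featurizer=dc.feat.CircularFingerprint())
    # Accepts .sdf files directly, or .zip archives containing .sdf files.
    dataset = loader.create_dataset(["molecules.sdf", "more_molecules.zip"])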