def create_dataset(self,
                   input_files: OneOrMany[str],
                   data_dir: Optional[str] = None,
                   shard_size: Optional[int] = None) -> DiskDataset:
    """Creates a `Dataset` from input FASTA files.

    At present, FASTA support is limited: only one-hot featurization is
    allowed, and sharding is not supported.

    Parameters
    ----------
    input_files: list
        List of FASTA files.
    data_dir: str, optional
        Name of directory where featurized data is stored.
    shard_size: int, optional
        For now, this argument is ignored and each FASTA file gets its
        own shard.

    Returns
    -------
    A `Dataset` object containing a featurized representation of data
    from `input_files`.
    """
    if isinstance(input_files, str):
        input_files = [input_files]

    def shard_generator():
        for input_file in input_files:
            X = encode_fasta_sequence(input_file)
            ids = np.ones(len(X))
            # (X, y, w, ids)
            yield X, None, None, ids

    return DiskDataset.create_dataset(shard_generator(), data_dir)
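# Hedged usage sketch for the method above, assuming it is bound to a
# DeepChem-style FASTA loader; the file path is an illustrative assumption.
import deepchem as dc

loader = dc.data.FASTALoader()
dataset = loader.create_dataset("sequences.fasta", data_dir="./featurized")
print(dataset.X.shape)  # one shard of one-hot encoded sequences per input file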
def _to_singletask(dataset, task_dirs):
    """Transforms a multitask dataset into a collection of singletask datasets."""
    tasks = dataset.get_task_names()
    assert len(tasks) == len(task_dirs)
    log("Splitting multitask dataset into singletask datasets", dataset.verbose)
    task_datasets = [
        DiskDataset.create_dataset([], task_dirs[task_num], [task])
        for (task_num, task) in enumerate(tasks)
    ]
    for shard_num, (X, y, w, ids) in enumerate(dataset.itershards()):
        log("Processing shard %d" % shard_num, dataset.verbose)
        for task_num, task in enumerate(tasks):
            log("\tTask %s" % task, dataset.verbose)
            w_task = w[:, task_num]
            y_task = y[:, task_num]
            # Extract those datapoints which are present for this task.
            X_nonzero = X[w_task != 0]
            num_datapoints = X_nonzero.shape[0]
            y_nonzero = np.reshape(y_task[w_task != 0], (num_datapoints, 1))
            w_nonzero = np.reshape(w_task[w_task != 0], (num_datapoints, 1))
            ids_nonzero = ids[w_task != 0]
            task_datasets[task_num].add_shard(X_nonzero, y_nonzero, w_nonzero,
                                              ids_nonzero)
    return task_datasets
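# Self-contained sketch of the masking step above: rows whose weight is 0 for
# a given task are dropped before that task's singletask shard is written.
import numpy as np

X = np.arange(8).reshape(4, 2)                          # 4 samples, 2 features
w = np.array([[1., 0.], [1., 1.], [0., 1.], [1., 0.]])  # 4 samples, 2 tasks
y = np.ones((4, 2))

task_num = 0
w_task = w[:, task_num]
X_nonzero = X[w_task != 0]                              # keeps rows 0, 1, 3
y_nonzero = y[w_task != 0, task_num].reshape(-1, 1)
print(X_nonzero.shape, y_nonzero.shape)                 # (3, 2) (3, 1)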
def featurize(self, input_files, data_dir=None, shard_size=8192):
    """Featurize provided files and write to specified location."""
    log("Loading raw samples now.", self.verbose)
    log("shard_size: %d" % shard_size, self.verbose)

    if not isinstance(input_files, list):
        input_files = [input_files]

    def shard_generator():
        for shard_num, shard in enumerate(
                self.get_shards(input_files, shard_size)):
            time1 = time.time()
            X, valid_inds = self.featurize_shard(shard)
            ids, y, w = convert_df_to_numpy(shard, self.tasks, self.id_field)
            # Filter out examples where featurization failed.
            ids, y, w = (ids[valid_inds], y[valid_inds], w[valid_inds])
            assert len(X) == len(ids) == len(y) == len(w)
            time2 = time.time()
            log("TIMING: featurizing shard %d took %0.3f s" %
                (shard_num, time2 - time1), self.verbose)
            yield X, y, w, ids

    return DiskDataset.create_dataset(shard_generator(), data_dir, self.tasks)
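# Standalone sketch of the `valid_inds` filtering above: entries whose
# featurization failed are dropped from ids/y/w so all arrays stay aligned.
import numpy as np

ids = np.array(["mol_a", "mol_b", "mol_c"])
y = np.ones((3, 2))
w = np.ones((3, 2))
valid_inds = np.array([True, False, True])  # featurization failed for mol_b
ids, y, w = (ids[valid_inds], y[valid_inds], w[valid_inds])
assert len(ids) == len(y) == len(w) == 2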
def featurize(self, input_files, data_dir=None, shard_size=8192):
    """Featurize provided files and write to specified location."""
    log("Loading raw samples now.", self.verbose)
    log("shard_size: %d" % shard_size, self.verbose)

    if not isinstance(input_files, list):
        input_files = [input_files]

    def shard_generator():
        for shard_num, shard in enumerate(
                self.get_shards(input_files, shard_size)):
            time1 = time.time()
            X, valid_inds = self.featurize_shard(shard)
            ids = shard[self.id_field].values
            ids = ids[valid_inds]
            if len(self.tasks) > 0:
                # Featurize task results iff they exist.
                y, w = convert_df_to_numpy(shard, self.tasks, self.id_field)
                # Filter out examples where featurization failed.
                y, w = (y[valid_inds], w[valid_inds])
                assert len(X) == len(ids) == len(y) == len(w)
            else:
                # For prospective data where results are unknown, it makes
                # no sense to have y values or weights.
                y, w = (None, None)
                assert len(X) == len(ids)
            time2 = time.time()
            log("TIMING: featurizing shard %d took %0.3f s" %
                (shard_num, time2 - time1), self.verbose)
            yield X, y, w, ids

    return DiskDataset.create_dataset(
        shard_generator(), data_dir, self.tasks, verbose=self.verbose)
def create_dataset(self, input_files, data_dir=None, shard_size=8192):
    """Creates and returns a `Dataset` object by featurizing provided files.

    Reads in `input_files` and uses `self.featurizer` to featurize the data
    in these input files. For large files, automatically shards into smaller
    chunks of `shard_size` datapoints for convenience. Returns a `Dataset`
    object that contains the featurized dataset.

    This implementation assumes that the helper methods `_get_shards` and
    `_featurize_shard` are implemented and that each shard returned by
    `_get_shards` is a pandas dataframe. You may choose to reuse or override
    this method in your subclass implementations.

    Parameters
    ----------
    input_files: list
        List of input filenames.
    data_dir: str, optional
        Directory to store featurized dataset.
    shard_size: int, optional
        Number of examples stored in each shard.

    Returns
    -------
    A `Dataset` object containing a featurized representation of data
    from `input_files`.
    """
    logger.info("Loading raw samples now.")
    logger.info("shard_size: %d" % shard_size)

    if not isinstance(input_files, list):
        input_files = [input_files]

    def shard_generator():
        for shard_num, shard in enumerate(
                self._get_shards(input_files, shard_size)):
            time1 = time.time()
            X, valid_inds = self._featurize_shard(shard)
            ids = shard[self.id_field].values
            ids = ids[valid_inds]
            if len(self.tasks) > 0:
                # Featurize task results iff they exist.
                y, w = _convert_df_to_numpy(shard, self.tasks)
                # Filter out examples where featurization failed.
                y, w = (y[valid_inds], w[valid_inds])
                assert len(X) == len(ids) == len(y) == len(w)
            else:
                # For prospective data where results are unknown, it makes
                # no sense to have y values or weights.
                y, w = (None, None)
                assert len(X) == len(ids)
            time2 = time.time()
            logger.info("TIMING: featurizing shard %d took %0.3f s" %
                        (shard_num, time2 - time1))
            yield X, y, w, ids

    return DiskDataset.create_dataset(shard_generator(), data_dir, self.tasks)
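# Hedged usage sketch for the generic loader above, assuming a DeepChem-style
# CSV loader subclass supplies `_get_shards`/`_featurize_shard`. The task name,
# column name, and file path are illustrative; depending on the library version
# the column argument may be named `feature_field` or `smiles_field`.
import deepchem as dc

loader = dc.data.CSVLoader(tasks=["solubility"],
                           feature_field="smiles",
                           featurizer=dc.feat.CircularFingerprint(size=1024))
dataset = loader.create_dataset("compounds.csv", shard_size=8192)
print(len(dataset), dataset.get_task_names())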
def create_dataset(self,
                   inputs: Sequence[Any],
                   data_dir: Optional[str] = None,
                   shard_size: Optional[int] = 8192) -> DiskDataset:
    """Creates and returns a `Dataset` object by featurizing provided inputs.

    Reads in `inputs` and uses `self.featurizer` to featurize the data in
    these inputs. For large files, automatically shards into smaller chunks
    of `shard_size` datapoints for convenience. Returns a `Dataset` object
    that contains the featurized dataset.

    This implementation assumes that the helper methods `_get_shards` and
    `_featurize_shard` are implemented and that each shard returned by
    `_get_shards` is a pandas dataframe. You may choose to reuse or override
    this method in your subclass implementations.

    Parameters
    ----------
    inputs: Sequence[Any]
        List of inputs to process. Entries can be arbitrary objects so long
        as they are understood by `self.featurizer`.
    data_dir: str, optional (default None)
        Directory to store featurized dataset.
    shard_size: int, optional (default 8192)
        Number of examples stored in each shard.

    Returns
    -------
    DiskDataset
        A `DiskDataset` object containing a featurized representation of
        data from `inputs`.
    """
    logger.info("Loading raw samples now.")
    logger.info("shard_size: %s" % str(shard_size))

    if not isinstance(inputs, list):
        try:
            inputs = list(inputs)
        except TypeError:
            inputs = [inputs]

    def shard_generator():
        global_index = 0
        for shard_num, shard in enumerate(self._get_shards(inputs, shard_size)):
            time1 = time.time()
            X, y, w, ids = self._featurize_shard(shard, global_index)
            global_index += len(shard)
            time2 = time.time()
            logger.info("TIMING: featurizing shard %d took %0.3f s" %
                        (shard_num, time2 - time1))
            yield X, y, w, ids

    return DiskDataset.create_dataset(shard_generator(), data_dir, self.tasks)
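# Standalone sketch of the `global_index` bookkeeping above: each shard's
# featurization call receives the offset of its first row, so ids can be
# assigned globally across shards.
def shards(seq, shard_size):
    for i in range(0, len(seq), shard_size):
        yield seq[i:i + shard_size]

global_index = 0
for shard in shards(list(range(10)), 4):
    ids = [global_index + i for i in range(len(shard))]
    global_index += len(shard)
    print(ids)  # [0, 1, 2, 3], then [4, 5, 6, 7], then [8, 9]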
def featurize(self, input_files, data_dir=None, shard_size=8192):
    """Featurize provided files and write to specified location.

    For large datasets, automatically shards into smaller chunks for
    convenience.

    Parameters
    ----------
    input_files: list
        List of input filenames.
    data_dir: str (Optional)
        Directory to store featurized dataset.
    shard_size: int (Optional)
        Number of examples stored in each shard.
    """
    log("Loading raw samples now.", self.verbose)
    log("shard_size: %d" % shard_size, self.verbose)

    if not isinstance(input_files, list):
        input_files = [input_files]

    def shard_generator():
        for shard_num, shard in enumerate(
                self.get_shards(input_files, shard_size)):
            time1 = time.time()
            X, valid_inds = self.featurize_shard(shard)
            ids = shard[self.id_field].values
            ids = ids[valid_inds]
            if len(self.tasks) > 0:
                # Featurize task results iff they exist.
                y, w = convert_df_to_numpy(shard, self.tasks, self.id_field)
                # Filter out examples where featurization failed.
                y, w = (y[valid_inds], w[valid_inds])
                assert len(X) == len(ids) == len(y) == len(w)
            else:
                # For prospective data where results are unknown, it makes
                # no sense to have y values or weights.
                y, w = (None, None)
                assert len(X) == len(ids)
            time2 = time.time()
            log("TIMING: featurizing shard %d took %0.3f s" %
                (shard_num, time2 - time1), self.verbose)
            yield X, y, w, ids

    return DiskDataset.create_dataset(shard_generator(), data_dir, self.tasks,
                                      verbose=self.verbose)
def featurize(self, input_files, data_dir=None):
    """Featurizes FASTA files.

    Parameters
    ----------
    input_files: list
        List of FASTA files.
    data_dir: str (Optional)
        Name of directory where featurized data is stored.
    """
    if not isinstance(input_files, list):
        input_files = [input_files]

    def shard_generator():
        for input_file in input_files:
            X = encode_fasta_sequence(input_file)
            ids = np.ones(len(X))
            # (X, y, w, ids)
            yield X, None, None, ids

    return DiskDataset.create_dataset(shard_generator(), data_dir)
def create_dataset(self,
                   input_files: OneOrMany[str],
                   data_dir: Optional[str] = None,
                   shard_size: Optional[int] = 8192) -> DiskDataset:
    """Creates a `Dataset` from input JSON files.

    Parameters
    ----------
    input_files: OneOrMany[str]
        List of JSON filenames.
    data_dir: Optional[str], default None
        Name of directory where featurized data is stored.
    shard_size: Optional[int], default 8192
        Shard size when loading data.

    Returns
    -------
    dataset: dc.data.Dataset
        A `Dataset` object containing a featurized representation of data
        from `input_files`.
    """
    if not isinstance(input_files, list):
        try:
            if isinstance(input_files, str):
                input_files = [input_files]
            else:
                input_files = list(input_files)
        except TypeError:
            raise ValueError(
                "input_files is of an unrecognized form. Must be one "
                "filename or a list of filenames.")

    def shard_generator():
        """Yield X, y, w, and ids for shards."""
        for shard_num, shard in enumerate(
                self._get_shards(input_files, shard_size)):
            time1 = time.time()
            X, valid_inds = self._featurize_shard(shard)
            if self.id_field:
                ids = shard[self.id_field].values
                ids = ids[valid_inds]
            else:
                # X is already filtered, so generate placeholder ids to match.
                ids = np.ones(len(X))
            if len(self.tasks) > 0:
                # Featurize task results if they exist.
                y, w = _convert_df_to_numpy(shard, self.tasks)
                if self.label_field:
                    y = shard[self.label_field]
                if self.weight_field:
                    w = shard[self.weight_field]
                # Filter out examples where featurization failed.
                y, w = (y[valid_inds], w[valid_inds])
                assert len(X) == len(ids) == len(y) == len(w)
            else:
                # For prospective data where results are unknown, it
                # makes no sense to have y values or weights.
                y, w = (None, None)
                assert len(X) == len(ids)
            time2 = time.time()
            logger.info("TIMING: featurizing shard %d took %0.3f s" %
                        (shard_num, time2 - time1))
            yield X, y, w, ids

    return DiskDataset.create_dataset(shard_generator(), data_dir)
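# Hedged usage sketch for the JSON path above, assuming a DeepChem-style
# JsonLoader; the task name, column names, and file path are illustrative.
import deepchem as dc

loader = dc.data.JsonLoader(tasks=["activity"],
                            feature_field="smiles",
                            id_field="compound_id",
                            featurizer=dc.feat.CircularFingerprint())
dataset = loader.create_dataset("records.json", shard_size=8192)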
def create_dataset(self,
                   input_files: OneOrMany[str],
                   data_dir: Optional[str] = None,
                   shard_size: Optional[int] = None) -> DiskDataset:
    """Creates a `Dataset` from input FASTA files.

    At present, FASTA support is limited and doesn't allow for sharding.

    Parameters
    ----------
    input_files: List[str]
        List of FASTA files.
    data_dir: str, optional (default None)
        Name of directory where featurized data is stored.
    shard_size: int, optional (default None)
        For now, this argument is ignored and each FASTA file gets its
        own shard.

    Returns
    -------
    DiskDataset
        A `DiskDataset` object containing a featurized representation of
        data from `input_files`.
    """
    if isinstance(input_files, str):
        input_files = [input_files]

    def _read_file(input_file: str):
        """Convert a FASTA file to a numpy array of FASTA-format strings."""

        # TODO don't convert all sequences into np array (allow shards)
        def _generate_sequences(fasta_file, header_mark=">") -> np.ndarray:
            """Uses fasta_file to create a numpy array of annotated
            FASTA-format strings."""
            sequences: np.ndarray = np.array([])
            sequence: np.ndarray = np.array([])
            header_read = False
            for line in fasta_file:
                # Check if line is a header
                if line.startswith(header_mark):  # New header line
                    header_read = True
                    sequences = _add_sequence(sequences, sequence)
                    sequence = np.array([])
                elif header_read:  # Line contains sequence in FASTA format
                    if line[-1:] == '\n':  # Check last character in string
                        line = line[0:-1]  # Remove trailing newline
                    sequence = np.append(sequence, line)
            sequences = _add_sequence(sequences, sequence)  # Add last sequence
            return sequences

        def _add_sequence(sequences: np.ndarray,
                          sequence: np.ndarray) -> np.ndarray:
            # Handle empty sequence: return the accumulated sequences
            # unchanged rather than discarding them.
            if sequence is None or len(sequence) <= 0:
                # TODO log attempts to add empty sequences every shard
                return sequences
            # Annotate start/stop of sequence
            if self.auto_add_annotations:
                sequence = np.insert(sequence, 0, "[CLS]")
                sequence = np.append(sequence, "[SEP]")
            new_sequence = ''.join(sequence)
            return np.append(sequences, new_sequence)

        with open(input_file, 'r') as f:  # Read FASTA file
            return _generate_sequences(f)

    def shard_generator():
        # TODO Enable sharding with shard size parameter
        for input_file in input_files:
            if self.legacy:
                X = encode_bio_sequence(input_file)
            else:
                sequences = _read_file(input_file)
                X = self.featurizer(sequences)
            ids = np.ones(len(X))
            # (X, y, w, ids)
            yield X, None, None, ids

    return DiskDataset.create_dataset(shard_generator(), data_dir)
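# Self-contained sketch of the FASTA parsing logic in `_read_file`, run on an
# in-memory file so it can be executed without a FASTA file on disk.
import io
import numpy as np

fasta = io.StringIO(">seq1\nACGT\nTTAA\n>seq2\nGGCC\n")
sequences, current = [], []
for line in fasta:
    line = line.rstrip("\n")
    if line.startswith(">"):      # header line starts a new record
        if current:
            sequences.append("".join(current))
        current = []
    else:                         # sequence lines accumulate until next header
        current.append(line)
if current:
    sequences.append("".join(current))
print(np.array(sequences))        # ['ACGTTTAA' 'GGCC']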
def create_dataset(self,
                   inputs: OneOrMany[Any],
                   data_dir: Optional[str] = None,
                   shard_size: Optional[int] = 8192) -> Dataset:
    """Creates and returns a `Dataset` object by featurizing provided sdf files.

    Parameters
    ----------
    inputs: List
        List of inputs to process. Entries can be filenames or arbitrary
        objects. Each file should be in a supported format (.sdf) or a
        compressed folder of .sdf files.
    data_dir: str, optional (default None)
        Directory to store featurized dataset.
    shard_size: int, optional (default 8192)
        Number of examples stored in each shard.

    Returns
    -------
    DiskDataset
        A `DiskDataset` object containing a featurized representation of
        data from `inputs`.
    """
    logger.info("Loading raw samples now.")
    logger.info("shard_size: %s" % str(shard_size))

    # Special case handling of single input
    if not isinstance(inputs, list):
        inputs = [inputs]

    processed_files = []
    for input_file in inputs:
        filename, extension = os.path.splitext(input_file)
        extension = extension.lower()
        if extension == ".sdf":
            processed_files.append(input_file)
        elif extension == ".zip":
            zip_dir = tempfile.mkdtemp()
            zip_ref = zipfile.ZipFile(input_file, 'r')
            zip_ref.extractall(path=zip_dir)
            zip_files = [
                os.path.join(zip_dir, name) for name in zip_ref.namelist()
            ]
            zip_ref.close()
            for zip_file in zip_files:
                _, extension = os.path.splitext(zip_file)
                extension = extension.lower()
                if extension in [".sdf"]:
                    processed_files.append(zip_file)
        else:
            raise ValueError("Unsupported file format")
    inputs = processed_files

    def shard_generator():
        for shard_num, shard in enumerate(self._get_shards(inputs, shard_size)):
            time1 = time.time()
            X, valid_inds = self._featurize_shard(shard)
            ids = shard[self.id_field].values
            ids = ids[valid_inds]
            if len(self.tasks) > 0:
                # Featurize task results iff they exist.
                y, w = _convert_df_to_numpy(shard, self.tasks)
                # Filter out examples where featurization failed.
                y, w = (y[valid_inds], w[valid_inds])
                assert len(X) == len(ids) == len(y) == len(w)
            else:
                # For prospective data where results are unknown, it
                # makes no sense to have y values or weights.
                y, w = (None, None)
                assert len(X) == len(ids)
            time2 = time.time()
            logger.info("TIMING: featurizing shard %d took %0.3f s" %
                        (shard_num, time2 - time1))
            yield X, y, w, ids

    return DiskDataset.create_dataset(shard_generator(), data_dir, self.tasks)
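# Standalone sketch of the zip-expansion step above: write a zip holding one
# .sdf member, then expand it the same way `create_dataset` does. Paths are
# temporary and illustrative.
import os
import tempfile
import zipfile

zip_path = os.path.join(tempfile.mkdtemp(), "mols.zip")
with zipfile.ZipFile(zip_path, "w") as zf:
    zf.writestr("mols.sdf", "fake sdf contents")

zip_dir = tempfile.mkdtemp()
with zipfile.ZipFile(zip_path, "r") as zf:
    names = zf.namelist()
    zf.extractall(path=zip_dir)
sdf_files = [os.path.join(zip_dir, n) for n in names
             if os.path.splitext(n)[1].lower() == ".sdf"]
print(sdf_files)  # one extracted .sdf path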