def create_dataset(shard_generator, data_dir=None, tasks=None, verbose=True):
    """Creates a new DiskDataset

    Writes each (X, y, w, ids) tuple from shard_generator to its own shard
    on disk, then builds and saves the metadata describing the shards.

    Parameters
    ----------
    shard_generator: Iterable
        An iterable (either a list or generator) that provides tuples of data
        (X, y, w, ids). Each tuple will be written to a separate shard on disk.
    data_dir: str
        Filename for data directory. Creates a temp directory if none specified.
    tasks: list
        List of tasks for this dataset. Defaults to an empty list.
    verbose: bool
        Whether to log timing information.

    Returns
    -------
    DiskDataset
        A DiskDataset backed by data_dir.
    """
    # Avoid a mutable default argument: a shared `[]` default would leak
    # state between calls. `None` is the sentinel for "no tasks".
    if tasks is None:
        tasks = []
    if data_dir is None:
        data_dir = tempfile.mkdtemp()
    elif not os.path.exists(data_dir):
        os.makedirs(data_dir)

    metadata_rows = []
    time1 = time.time()
    for shard_num, (X, y, w, ids) in enumerate(shard_generator):
        basename = "shard-%d" % shard_num
        metadata_rows.append(
            DiskDataset.write_data_to_disk(data_dir, basename, tasks, X, y, w,
                                           ids))
    metadata_df = DiskDataset._construct_metadata(metadata_rows)
    save_metadata(tasks, metadata_df, data_dir)
    time2 = time.time()
    log("TIMING: dataset construction took %0.3f s" % (time2 - time1), verbose)
    return DiskDataset(data_dir, verbose=verbose)
def load_metadata(self):
    """Load this dataset's tasks and metadata dataframe from disk.

    First tries the current on-disk format (a JSON tasks file plus a
    gzip-compressed CSV metadata file, located via
    ``self._get_metadata_filename()``). If that fails for any reason, falls
    back to the obsolete joblib format and, when found, re-saves it in the
    new format so the next load takes the fast path.

    Returns
    -------
    tuple
        (tasks, metadata_df) where tasks is the list of task names and
        metadata_df is a pandas DataFrame describing the shards.

    Raises
    ------
    ValueError
        If no metadata is found on disk in either format.
    """
    try:
        tasks_filename, metadata_filename = self._get_metadata_filename()
        with open(tasks_filename) as fin:
            tasks = json.load(fin)
        metadata_df = pd.read_csv(metadata_filename, compression='gzip')
        # Normalize NaN cells to None so downstream code can use `is None`.
        metadata_df = metadata_df.where((pd.notnull(metadata_df)), None)
        return tasks, metadata_df
    except Exception:
        # Deliberate best-effort: any failure here falls through to the
        # legacy-format loader below. Log the error instead of silently
        # swallowing it, so real corruption is still diagnosable.
        import logging
        logging.getLogger(__name__).debug(
            "Failed to load metadata in current format; trying legacy format.",
            exc_info=True)

    # Load obsolete format -> save in new format
    metadata_filename = os.path.join(self.data_dir, "metadata.joblib")
    if os.path.exists(metadata_filename):
        tasks, metadata_df = load_from_disk(metadata_filename)
        # These columns existed only in the legacy format.
        del metadata_df['task_names']
        del metadata_df['basename']
        save_metadata(tasks, metadata_df, self.data_dir)
        return tasks, metadata_df
    raise ValueError("No Metadata Found On Disk")
def save_to_disk(self):
    """Persist this dataset's task list and metadata dataframe to its data directory."""
    tasks = self.tasks
    metadata_df = self.metadata_df
    target_dir = self.data_dir
    save_metadata(tasks, metadata_df, target_dir)