Example #1
 def get_shard_size(self):
   """Gets size of shards on disk."""
   if not len(self.metadata_df):
     raise ValueError("No data in dataset.")
   sample_y = load_from_disk(
       os.path.join(self.data_dir,
                    next(self.metadata_df.iterrows())[1]['y']))
   return len(sample_y)
Example #2
 def __len__(self):
   """
   Finds number of elements in dataset.
   """
   total = 0
   for _, row in self.metadata_df.iterrows():
     # The ids file stores one id per datapoint, so its length is the shard size.
     ids = load_from_disk(os.path.join(self.data_dir, row['ids']))
     total += len(ids)
   return total
Example #3
 def get_data_shape(self):
   """
   Gets array shape of datapoints in this dataset.
   """
   if not len(self.metadata_df):
     raise ValueError("No data in dataset.")
   sample_X = load_from_disk(
       os.path.join(self.data_dir,
                    next(self.metadata_df.iterrows())[1]['X']))
   # Drop the leading shard dimension: if X has shape (shard_size, n_features),
   # this returns (n_features,).
   return np.shape(sample_X)[1:]
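
Taken together, the accessors above describe the dataset's geometry without loading every shard. A rough usage sketch, assuming a dataset object that exposes these methods (the variable names below are illustrative, not part of the original code):

import numpy as np

# Preallocate a buffer large enough for every datapoint in the dataset.
n_datapoints = len(dataset)                # __len__ walks the ids file of each shard
feature_shape = dataset.get_data_shape()   # per-datapoint shape, e.g. (n_features,)
buffer = np.zeros((n_datapoints,) + tuple(feature_shape))
print("shard size:", dataset.get_shard_size())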
Example #4
 def iterate(dataset):
   for _, row in dataset.metadata_df.iterrows():
     X = torch.Tensor(load_from_disk(os.path.join(dataset.data_dir, row['X'])))
     # Ids are strings, so keep them in a NumPy object array rather than a tensor.
     ids = np.array(
         load_from_disk(os.path.join(dataset.data_dir, row['ids'])),
         dtype=object)
     # These columns may be missing if the dataset is unlabelled.
     if row['y'] is not None:
       y = torch.Tensor(load_from_disk(os.path.join(dataset.data_dir, row['y'])))
     else:
       y = None
     if row['w'] is not None:
       w_filename = os.path.join(dataset.data_dir, row['w'])
       if os.path.exists(w_filename):
         w = torch.Tensor(load_from_disk(w_filename))
       else:
         w = torch.ones(y.shape)
     else:
       w = None
     yield (X, y, w, ids)
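
The generator yields one (X, y, w, ids) tuple per shard, so callers can stream the dataset without materialising it all at once. A minimal consumption sketch, assuming dataset exposes metadata_df and data_dir as in the examples above:

for X, y, w, ids in iterate(dataset):
  # X is a torch.Tensor of features; y and w may be None for unlabelled shards.
  label = "labelled" if y is not None else "unlabelled"
  print("shard with", X.shape[0], "datapoints,", label)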
Example #5
  def get_shard(self, i):
    """Retrieves data for the i-th shard from disk."""
    row = self.metadata_df.iloc[i]
    X = torch.Tensor(load_from_disk(os.path.join(self.data_dir, row['X'])))

    if row['y'] is not None:
      y = torch.Tensor(load_from_disk(os.path.join(self.data_dir, row['y'])))
    else:
      y = None

    if row['w'] is not None:
      # TODO (ytz): Under what condition does this exist but the file itself doesn't?
      w_filename = os.path.join(self.data_dir, row['w'])
      if os.path.exists(w_filename):
        w = torch.Tensor(load_from_disk(w_filename))
      else:
        w = torch.ones(y.shape)
    else:
      w = None

    # Ids are strings, so keep them in a NumPy object array rather than a tensor.
    ids = np.array(
        load_from_disk(os.path.join(self.data_dir, row['ids'])), dtype=object)
    return (X, y, w, ids)
Example #6
  def load_metadata(self):
    try:
      tasks_filename, metadata_filename = self._get_metadata_filename()
      with open(tasks_filename) as fin:
        tasks = json.load(fin)
      metadata_df = pd.read_csv(metadata_filename, compression='gzip')
      # Replace NaN entries with None so callers can test `row[col] is not None`.
      metadata_df = metadata_df.where((pd.notnull(metadata_df)), None)
      return tasks, metadata_df
    except Exception:
      # Fall back to the legacy metadata format below.
      pass

    # Load obsolete format -> save in new format
    metadata_filename = os.path.join(self.data_dir, "metadata.joblib")
    if os.path.exists(metadata_filename):
      tasks, metadata_df = load_from_disk(metadata_filename)
      del metadata_df['task_names']
      del metadata_df['basename']
      save_metadata(tasks, metadata_df, self.data_dir)
      return tasks, metadata_df
    raise ValueError("No Metadata Found On Disk")
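
Every snippet above indexes metadata_df by the columns X, y, w and ids, each holding a per-shard filename or None. A hand-built, purely illustrative two-shard table (the filenames are made up) shows the layout these methods assume:

import pandas as pd

# Hypothetical layout only; real metadata is written by the dataset's own save path.
metadata_df = pd.DataFrame({
    'X': ['shard-0-X.joblib', 'shard-1-X.joblib'],
    'y': ['shard-0-y.joblib', None],    # None when a shard has no labels
    'w': ['shard-0-w.joblib', None],    # None when a shard has no weights
    'ids': ['shard-0-ids.joblib', 'shard-1-ids.joblib'],
})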