def __iter__(self):
    """Yield row-shuffled DataFrames re-batched to the first frame's length.

    Every row of every frame produced by ``self.source_datapipe`` is pulled
    out individually via ``df_wrapper.get_item``; the complete list of rows is
    shuffled in place with ``random.shuffle`` and then glued back together
    with ``df_wrapper.concat``.  The batch size is the length of the first
    source frame, and a trailing partial batch is yielded when rows remain.

    All rows are collected before anything is yielded.  If the first frame is
    empty the batch size is 0, which a non-empty batch never equals, so every
    row ends up in the single trailing batch.
    """
    batch_size = None
    shuffled_rows = []
    for frame in self.source_datapipe:
        row_count = df_wrapper.get_len(frame)
        if batch_size is None:
            # The first frame seen fixes the size of every emitted batch.
            batch_size = row_count
        shuffled_rows.extend(
            df_wrapper.get_item(frame, row_idx) for row_idx in range(row_count)
        )
    random.shuffle(shuffled_rows)

    pending = []
    for row in shuffled_rows:
        pending.append(row)
        if len(pending) != batch_size:
            continue
        yield df_wrapper.concat(pending)
        pending = []
    if pending:
        # Leftover rows that did not fill a complete batch.
        yield df_wrapper.concat(pending)
def _returnIfTrue(self, data):
    """Apply ``self.filter_fn`` to ``data`` and return whatever passes.

    The filter is invoked as ``self.filter_fn(data, *self.args, **self.kwargs)``
    and its result is interpreted in one of two ways:

    * If ``df_wrapper.is_column`` recognises the result as a DataFrame column,
      it is used as a per-row mask: rows of ``data`` whose mask entry is
      truthy are extracted with ``df_wrapper.get_item`` and joined with
      ``df_wrapper.concat``.
    * Otherwise the result must be a plain ``bool`` that accepts or rejects
      ``data`` as a whole.

    Returns:
        The filtered DataFrame (column-mask case) or ``data`` itself (boolean
        case) when something passes the filter; ``None`` when nothing does.

    Raises:
        ValueError: if ``filter_fn`` returns neither a column nor a ``bool``.
    """
    condition = self.filter_fn(data, *self.args, **self.kwargs)

    if df_wrapper.is_column(condition):
        # We are operating on DataFrames filter here
        kept_rows = [
            df_wrapper.get_item(data, idx)
            for idx, mask in enumerate(df_wrapper.iterate(condition))
            if mask
        ]
        # Nothing is concatenated when no row survives; signal "filtered out".
        return df_wrapper.concat(kept_rows) if kept_rows else None

    if not isinstance(condition, bool):
        # Build one message string; passing the type as a second positional
        # argument makes the exception render as a tuple repr.
        raise ValueError(
            "Boolean output is required for `filter_fn` of FilterIterDataPipe, "
            "got {}".format(type(condition))
        )
    return data if condition else None