def __init__(self, **kwds: Any) -> None:
    """Initialize the Wait module.

    Raises
    ------
    ProgressiveError
        When neither ``delay`` nor ``reads`` has been configured:
        ``delay`` defaults to NaN and ``reads`` to -1, so this state
        leaves the module with no termination criterion.
    """
    super(Wait, self).__init__(**kwds)
    # Both parameters are still at their "unset" sentinel values.
    if np.isnan(self.params.delay) and self.params.reads == -1:
        raise ProgressiveError(
            # Message fixed: the check fires when NEITHER parameter is
            # provided; the old text ("..., not both") described the
            # opposite situation.
            "Module %s needs either a delay or a number of reads",
            self.pretty_typename(),
        )
def __init__(
    self,
    filepath_or_buffer: Optional[Any] = None,
    filter_: Optional[Callable[[pd.DataFrame], pd.DataFrame]] = None,
    force_valid_ids: bool = True,
    fillvalues: Optional[Dict[str, Any]] = None,
    as_array: Optional[Any] = None,
    timeout: Optional[float] = None,
    save_context: Optional[Any] = None,  # FIXME seems more like a bool
    recovery: int = 0,  # FIXME seems more like a bool
    recovery_tag: Union[str, int] = "",
    recovery_table_size: int = 3,
    save_step_size: int = 100000,
    **kwds: Any,
) -> None:
    """Progressive CSV loader.

    Parameters
    ----------
    filepath_or_buffer:
        Path/URL/stream forwarded (indirectly) to ``pd.read_csv``.  When
        None, filenames are expected on an input slot instead (see
        ``validate_parser``).
    filter_:
        Optional callable mapping a DataFrame to a DataFrame; stored in
        ``self._filter``.  Must be callable or None.
    fillvalues:
        Default values forwarded to the result-table parameters.
    recovery / recovery_tag / recovery_table_size / save_step_size:
        Crash-recovery configuration; the tag is embedded in the names of
        the two recovery tables created below.
    **kwds:
        Remaining keywords are split between this module and
        ``pd.read_csv`` via ``filter_kwds``.
    """
    super(CSVLoader, self).__init__(**kwds)
    self.tags.add(self.TAG_SOURCE)
    self.default_step_size = kwds.get("chunksize", 1000)  # initial guess
    # Ensure pandas is always called in chunked mode.
    kwds.setdefault("chunksize", self.default_step_size)
    # Filter out the module keywords from the csv loader keywords
    csv_kwds = filter_kwds(kwds, pd.read_csv)
    # When called with a specified chunksize, it returns a parser
    self.filepath_or_buffer = filepath_or_buffer
    self.force_valid_ids = force_valid_ids
    self.parser: Optional[Parser] = None
    self.csv_kwds = csv_kwds
    # Compression/encoding are remembered here and removed from the
    # pandas kwargs — the input stream is decoded/decompressed upstream.
    self._compression = csv_kwds.get("compression", "infer")
    csv_kwds["compression"] = None
    self._encoding = csv_kwds.get("encoding", None)
    csv_kwds["encoding"] = None
    self._rows_read = 0
    if filter_ is not None and not callable(filter_):
        raise ProgressiveError(
            "filter parameter should be callable or None")
    self._filter = filter_
    # self._input_stream: Optional[Any] = (
    #     None  # stream that returns a position through the 'tell()' method
    # )
    self._input_encoding = None
    self._input_compression = None
    self._input_size = 0  # length of the file or input stream when available
    self._timeout_csv = timeout
    self._table_params: Dict[str, Any] = dict(name=self.name,
                                              fillvalues=fillvalues)
    self._as_array = as_array
    # NOTE(review): an explicitly passed save_context value is ignored —
    # only `save_context is None` with a recoverable input enables saving.
    # Looks intentional given the FIXME above, but worth confirming.
    self._save_context = (True
                          if save_context is None
                          and is_recoverable(filepath_or_buffer)
                          else False)
    self._recovery = recovery
    self._recovery_table_size = recovery_table_size
    self._recovery_table: Optional[Table] = None
    self._recovery_table_name = f"csv_loader_recovery_{recovery_tag}"
    self._recovery_table_inv: Optional[Table] = None
    self._recovery_table_inv_name = f"csv_loader_recovery_invariant_{recovery_tag}"
    self._save_step_size = save_step_size
    self._last_saved_id = 0
    # Recovery is only honored when both recovery tables already exist;
    # otherwise fall back to a fresh start and truncate stale tables.
    if self._recovery and not self.recovery_tables_exist():
        self._recovery = False
    if not self._recovery:
        self.trunc_recovery_tables()
def _validate_descriptors(descriptor_list: List[SlotDescriptor]) -> Dict[str, Any]:
    """Build an empty slot table (name -> None) from slot descriptors.

    Raises
    ------
    ProgressiveError
        If two descriptors share the same name.
    """
    slots: Dict[str, Any] = {}
    for desc in descriptor_list:
        if desc.name in slots:
            # Bug fix: the old message concatenated "...name %s" with an
            # f-string, leaving a literal, never-substituted "%s" in the
            # emitted error text.  Use a single f-string instead.
            raise ProgressiveError(
                f"Duplicate slot name {desc.name} in slot descriptor"
            )
        slots[desc.name] = None
    return slots
def _add_class(
    self,
    name: str,
    x_column: str,
    y_column: str,
    sample: Union[Literal["default"], Module] = "default",
    sample_slot: str = "result",
    input_module: Optional[Module] = None,
    input_slot: Optional[str] = None,
) -> None:
    """Create a ``_DataClass`` for (*x_column*, *y_column*) named *name*
    and connect its histogram (and optional sample) outputs to this
    module's "table" input.

    The input module/slot pair comes either from the instance defaults
    (``self.input_module`` / ``self.input_slot``) or from the arguments;
    providing it both ways (inconsistently) raises ProgressiveError.
    """
    # Exactly one source for the input module: instance default XOR argument.
    if self.input_module is None and input_module is None:
        raise ProgressiveError("Input module is not defined!")
    if self.input_module is not None and input_module is not None:
        raise ProgressiveError("Input module is defined twice!")
    if self.input_slot is None and input_slot is None:
        raise ProgressiveError("Input slot is not defined!")
    # The slot may be given twice only if both mentions agree.
    if (self.input_slot is not None and input_slot is not None
            and self.input_slot != input_slot):
        raise ProgressiveError("Input slot is defined twice!")
    data_class = _DataClass(
        name,
        self,
        x_column,
        y_column,
        approximate=self._approximate,
        scheduler=self._scheduler,
    )
    data_class.sample = sample
    # Resolve the effective input source (argument wins when provided).
    input_module = input_module or self.input_module
    input_slot = input_slot or self.input_slot
    if input_module is not None and input_slot is not None:
        data_class.create_dependent_modules(input_module, input_slot)
    # Map the generic x/y labels onto this class's concrete columns.
    col_translation = {self._x_label: x_column, self._y_label: y_column}
    hist_meta = dict(inp="hist", class_=name, **col_translation)
    if data_class.histogram2d is not None:
        self.input["table", hist_meta] = data_class.histogram2d.output.result
    # Only wire the sample when it is an actual Module (not the
    # "default" marker string).
    if isinstance(data_class.sample, Module):
        meta = dict(inp="sample", class_=name, **col_translation)
        self.input["table", meta] = data_class.sample.output[sample_slot]
    self._data_class_dict[name] = data_class
def collect_dependencies(self) -> Dict[str, Set[str]]:
    """Return, for each valid module name, the set of module names it depends on."""
    problems = self.validate()
    if problems:
        raise ProgressiveError(f"Invalid dataflow: {problems}")
    # A module depends on every module feeding one of its input slots.
    return {
        node.name: {
            slot.output_module.name
            for slot in self.inputs[node.name].values()
        }
        for node in self.valid
    }
def connect(
    self,
    output_module: Module,
    output_name: str,
    input_module: Module,
    input_name: str,
) -> None:
    """Create a slot from an output to an input and register it.

    The slot is type-checked before registration; a type mismatch
    raises ProgressiveError.
    """
    new_slot = output_module.create_slot(output_name, input_module, input_name)
    if new_slot.validate_types():
        self.add_connection(new_slot)
        return
    raise ProgressiveError(
        "Incompatible types for slot (%s,%s) in %s"
        % (output_name, input_name, str(new_slot)))
def add_connection(self, slot: Optional[Slot], rename: bool = True) -> None:
    """Declare a connection between two module slots.

    Registers *slot* in the dataflow's input and output tables.  When the
    target input slot accepts multiple connections and *rename* is true,
    the slot gets a unique numbered name derived from its original name.
    Re-adding the very same slot is logged as redundant; connecting a
    different slot to an occupied input raises ProgressiveError.  The
    cached ``valid`` list is reset in all registering paths.

    Fixes: missing spaces in the "redundant connection" / "already
    connected" messages, and deprecated ``logger.warn`` -> ``warning``.
    """
    if not slot:
        return  # nothing to register
    output_module = slot.output_module
    output_name = slot.output_name
    input_module = slot.input_module
    # Prefer the pre-rename name so repeated calls on a multiple slot
    # renumber from the same base.
    input_name = slot.original_name or slot.input_name
    if input_module is None:
        return
    assert input_name is not None
    if input_module.input_slot_multiple(input_name):
        if rename:
            slot.original_name = input_name
            # Zero-padded counter keeps generated names unique and sortable.
            input_name += f".{self.multiple_slots_name_generator:04}"
            self.multiple_slots_name_generator += 1
            logger.info(f"{slot.original_name} renamed {input_name}")
            slot.input_name = input_name
    else:
        input_name = slot.input_name
    if input_name in self.inputs[input_module.name]:
        if slot is self.inputs[input_module.name][input_name]:
            logger.warning(
                "redundant connection: "
                "Input slot %s already connected to "
                "slot %s in module %s",
                input_name,
                self.inputs[input_module.name][input_name],
                input_module.name,
            )
        else:
            raise ProgressiveError(
                "Input slot %s already connected to "
                "slot %s in module %s" % (
                    input_name,
                    self.inputs[input_module.name][input_name],
                    input_module.name,
                ))
    assert input_name is not None
    self.inputs[input_module.name][input_name] = slot
    # Maintain the reverse (output) index, appending when this output
    # already feeds other inputs.
    if output_module.name not in self.outputs:
        self.outputs[output_module.name] = {output_name: [slot]}
    elif output_name not in self.outputs[output_module.name]:
        self.outputs[output_module.name][output_name] = [slot]
    else:
        self.outputs[output_module.name][output_name].append(slot)
    self.valid = []  # reset cached validation result (original note: "Not sure")
def _add_module(self, module: Module) -> None:
    """Register *module* under its name with fresh, empty slot tables."""
    key = module.name
    if key in self.inputs:
        raise ProgressiveError("Module %s already exists" % key)
    self._modules[key] = module
    self.inputs[key] = {}
    self.outputs[key] = {}
def __init__(
    self,
    name: Optional[str] = None,
    group: Optional[str] = None,
    scheduler: Optional[Scheduler] = None,
    storagegroup: Optional[Group] = None,
    **kwds: Any,
) -> None:
    """Create a module and register it in the scheduler's current dataflow.

    Parameters
    ----------
    name:
        Unique module name; generated from the pretty type name when
        None.  A clash with an existing name raises ProgressiveError.
    group:
        Optional group label; falls back to ``GroupContext.group``.
    scheduler:
        Scheduler to attach to; defaults to ``Scheduler.default``.  It
        must have an active dataflow.
    storagegroup:
        Storage group for tracing; a default internal group with a
        randomized name is created when None.
    **kwds:
        Forwarded to ``self._parse_parameters``.

    NOTE: initialization order matters — the name is set early (so
    cleanup-on-exception can find the module) and dataflow registration
    happens last, once the module is fully built.
    """
    self._args: Sequence[Tuple[str, Any]]
    self._kwds: Dict[str, Any]
    if scheduler is None:
        scheduler = Scheduler.default
    self._scheduler: Scheduler = scheduler
    if scheduler.dataflow is None:
        raise ProgressiveError("No valid context in scheduler")
    dataflow: Dataflow = scheduler.dataflow
    if name is None:
        name = dataflow.generate_name(self.pretty_typename())
    elif name in dataflow:
        raise ProgressiveError(
            "module already exists in scheduler," " delete it first"
        )
    self.name = name  # need to set the name so exception can remove it
    predictor = TimePredictor.default()
    predictor.name = name
    self.predictor = predictor
    storage = StorageManager.default
    self.storage = storage
    if storagegroup is None:
        assert Group.default_internal is not None
        storagegroup = Group.default_internal(get_random_name(name + "_tracer"))
    self.storagegroup: Group = storagegroup
    tracer = Tracer.default(name, storagegroup)
    # Copy the current tag set so later global changes don't leak in.
    self.tags = set(ModuleTag.tags)
    self.order: int = -1
    self.group: Optional[str] = group or GroupContext.group
    self.tracer = tracer
    # Run-time bookkeeping.
    self._start_time: float = 0
    self._end_time: float = 0
    self._last_update: int = 0
    self._state: ModuleState = Module.state_created
    self._saved_state: ModuleState = Module.state_invalid
    self._had_error = False
    self._parse_parameters(kwds)
    # always present
    input_descriptors = self.all_inputs
    output_descriptors = self.all_outputs
    # Slot tables start as name -> None; duplicates raise.
    self._input_slots: Dict[str, Optional[Slot]] = self._validate_descriptors(
        input_descriptors
    )
    self.input_descriptors: Dict[str, SlotDescriptor] = {
        d.name: d for d in input_descriptors
    }
    # self.input_multiple: Dict[str, int] = {
    #     d.name: 0 for d in input_descriptors if d.multiple
    # }
    self._output_slots: Dict[
        str, Optional[List[Slot]]
    ] = self._validate_descriptors(output_descriptors)
    self.output_descriptors: Dict[str, SlotDescriptor] = {
        d.name: d for d in output_descriptors
    }
    self.default_step_size: int = 100
    self.input = InputSlots(self)
    self.output = OutputSlots(self)
    self.steps_acc: int = 0
    # self.wait_expr = aio.FIRST_COMPLETED
    self.context: Optional[_Context] = None
    # callbacks
    self._start_run = ModuleCallbackList()
    self._after_run = ModuleCallbackList()
    self._ending: List[ModuleCb] = []
    # Register module
    dataflow.add_module(self)
def __setattr__(self, name: str, slot: Slot) -> None:
    """Reject attribute assignment: output slots are read-only."""
    message = "Output slots cannot be assigned, only read"
    raise ProgressiveError(message)
def __getitem__(self, name: str) -> Slot:
    """Reject subscript reads: input slots are assignment-only."""
    message = "Input slots cannot be read, only assigned to"
    raise ProgressiveError(message)
def validate_parser(self, run_number: int) -> ModuleState:
    """Ensure a CSV parser exists, creating one if needed, and return
    the resulting module state.

    Three creation paths:
    1. A configured file/buffer, fresh start: open it with ``read_csv``.
    2. A configured file/buffer with recovery enabled: locate a valid
       snapshot in the recovery tables, truncate the result table past
       it and resume parsing from the snapshot.
    3. No file/buffer: pull filenames one by one from the "filenames"
       input slot (this path does not support recovery).

    Returns ``state_terminated`` on unrecoverable errors,
    ``state_blocked`` when waiting for more filenames, and
    ``state_ready`` otherwise.
    """
    if self.parser is None:
        if self.filepath_or_buffer is not None:
            if not self._recovery:
                # Path 1: plain open of the configured input.
                try:
                    self.parser = read_csv(
                        self.create_input_source(self.filepath_or_buffer),
                        **self.csv_kwds,
                    )
                except IOError as e:
                    logger.error("Cannot open file %s: %s",
                                 self.filepath_or_buffer, e)
                    self.parser = None
                    return self.state_terminated
                self.filepath_or_buffer = None
            else:  # do recovery
                # Path 2: reopen the recovery tables and the saved result
                # table.
                try:
                    if self._recovery_table is None:
                        self._recovery_table = Table(
                            name=self._recovery_table_name, create=False)
                    if self._recovery_table_inv is None:
                        self._recovery_table_inv = Table(
                            name=self._recovery_table_inv_name, create=False)
                    if self.result is None:
                        # The invariant table stores the saved result
                        # table's name.
                        self._table_params[
                            "name"] = self._recovery_table_inv[
                            "table_name"].loc[0]
                        self._table_params["create"] = False
                        table = Table(**self._table_params)
                        self.result = table
                        # Bare access — presumably forces lazy metadata
                        # loading; TODO confirm.
                        table.last_id
                except Exception as e:  # TODO: specify the exception?
                    logger.error(f"Cannot acces recovery table {e}")
                    return self.state_terminated
                table = self.table
                try:
                    # Look for a snapshot matching the current table
                    # length exactly.
                    last_ = self._recovery_table.eval("last_id=={}".format(
                        len(table)), as_slice=False)
                    len_last = len(last_)
                    if len_last > 1:
                        logger.error("Inconsistent recovery table")
                        return self.state_terminated
                    # last_ = self._recovery_table.argmax()['offset']
                    snapshot: Optional[Dict[str, Any]] = None
                    if len_last == 1:
                        row = self._recovery_table.row(last_[0])
                        assert row is not None
                        snapshot = row.to_dict(ordered=True)
                        if not check_snapshot(snapshot):
                            snapshot = None
                    # i.e. snapshot not yet found or inconsistent:
                    # fall back to the latest valid snapshot strictly
                    # before the current length, then truncate the table
                    # to match it.
                    if (snapshot is None):
                        max_ = -1
                        for i in self._recovery_table.eval(
                                "last_id<{}".format(len(table)),
                                as_slice=False):
                            row = self._recovery_table.row(i)
                            assert row is not None
                            sn: Dict[str, Any] = row.to_dict(ordered=True)
                            if check_snapshot(sn) and sn["last_id"] > max_:
                                max_, snapshot = sn["last_id"], sn
                        if max_ < 0:
                            # No usable snapshot at all.
                            # logger.error('Cannot acces recovery table (max_<0)')
                            return self.state_terminated
                        table.drop(slice(max_ + 1, None, None), truncate=True)
                    assert snapshot
                    self._recovered_csv_table_name = snapshot["table_name"]
                except Exception as e:
                    logger.error("Cannot read the snapshot %s", e)
                    return self.state_terminated
                # Resume parsing from the snapshot position.
                try:
                    self.parser = recovery(snapshot, self.filepath_or_buffer,
                                           **self.csv_kwds)
                except Exception as e:
                    logger.error("Cannot recover from snapshot %s, %s",
                                 snapshot, e)
                    self.parser = None
                    return self.state_terminated
                self.filepath_or_buffer = None
        else:
            # this case does not support recovery
            # Path 3: consume filenames from the "filenames" input slot.
            fn_slot = None
            if self.has_input_slot("filenames"):
                fn_slot = self.get_input_slot("filenames")
            if fn_slot is None or fn_slot.output_module is None:
                return self.state_terminated
            # fn_slot.update(run_number)
            if fn_slot.deleted.any() or fn_slot.updated.any():
                raise ProgressiveError("Cannot handle input file changes")
            df = fn_slot.data()
            # Try filenames until one opens or none remain this run.
            while self.parser is None:
                indices = fn_slot.created.next(length=1)
                assert isinstance(indices, slice)
                if indices.stop == indices.start:
                    return self.state_blocked
                filename = df.at[indices.start, "filename"]
                try:
                    self.parser = read_csv(
                        self.create_input_source(filename), **self.csv_kwds)
                except IOError as e:
                    logger.error("Cannot open file %s: %s", filename, e)
                    self.parser = None
                    # fall through
    return self.state_ready