def subscribe(self, var, vocabulary):
    """
    Example: vocabulary = {'x': 'longitude', 'y': 'latitude'}
    """
    if not isinstance(var, Variable):
        raise ProgressiveError('Expecting a Variable module')
    if not isinstance(vocabulary, dict):
        raise ProgressiveError('Expecting a dictionary')
    if frozenset(vocabulary.keys()) != self._key or not all_string(
            vocabulary.values()):
        raise ProgressiveError('Inconsistent vocabulary')
    self._subscriptions.append((var, vocabulary))
def __init__(self, init_val: Optional[Dict[str, Any]] = None,
             translation: Optional[Dict[str, Any]] = None,
             **kwds: Any) -> None:
    super().__init__(**kwds)
    self.tags.add(self.TAG_INPUT)
    self._has_input = False
    if not (translation is None or isinstance(translation, dict)):
        raise ProgressiveError("translation must be a dictionary")
    self._translation = translation
    if not (init_val is None or isinstance(init_val, dict)):
        raise ProgressiveError("init_val must be a dictionary")
    self.result = PsDict({} if init_val is None else init_val)
def from_input(self, input_):
    if not isinstance(input_, dict):
        raise ProgressiveError('Expecting a dictionary')
    for var, vocabulary in self._subscriptions:
        translation = {vocabulary[k]: v for k, v in input_.items()}
        var.from_input(translation)
    return ''
def __init__(self, names, **kwds):
    if not all_string(names):
        raise ProgressiveError(
            'names {} must be a set of strings'.format(names))
    self._names = names
    self._key = frozenset(names)
    self._subscriptions = []
    table = None
    super(VirtualVariable, self).__init__(table, **kwds)
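# Hypothetical usage sketch (not part of the original source): wire a
# VirtualVariable defined over the abstract names {'x', 'y'} to a concrete
# Variable whose own keys are 'longitude' and 'latitude', reusing the
# vocabulary from the subscribe() docstring above. The import paths and the
# `scheduler` keyword are assumptions about the surrounding ProgressiVis API,
# and the example relies on the synchronous from_input() shown above.
from progressivis import Scheduler                      # assumed import path
from progressivis.io import Variable, VirtualVariable   # assumed import path

s = Scheduler()
pos = VirtualVariable(['x', 'y'], scheduler=s)
geo = Variable(scheduler=s)
pos.subscribe(geo, {'x': 'longitude', 'y': 'latitude'})
# Forwarded to geo as {'longitude': 2.35, 'latitude': 48.85}.
pos.from_input({'x': 2.35, 'y': 48.85})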
def get_dataset(name: str, **kwds: Any) -> str:
    if not os.path.isdir(DATA_DIR):
        os.mkdir(DATA_DIR)
    if name == "bigfile":
        kw = _check_kwds(kwds, rows=1_000_000, cols=30)
        return generate_random_csv("%s/bigfile.csv" % DATA_DIR, **kw)
    if name == "bigfile_multiscale":
        kw = _check_kwds(kwds, rows=5_000_000)
        return generate_multiscale_random_csv(
            "%s/bigfile_multiscale.csv" % DATA_DIR, **kw)
    if name == "bigfile_mvn":
        kw = _check_kwds(kwds, rows=900_000)
        return generate_random_multivariate_normal_csv(
            "%s/bigfile_mvn.csv" % DATA_DIR, **kw)
    if name == "smallfile":
        kw = _check_kwds(kwds, rows=30_000, cols=10)
        return generate_random_csv("%s/smallfile.csv" % DATA_DIR, **kw)
    if name == "warlogs":
        return wget_file(
            filename="%s/warlogs.vec.bz2" % DATA_DIR,
            url="http://www.cs.ubc.ca/labs/imager/video/2014/QSNE/warlogs.vec.bz2",
            **kwds,
        )
    if name == "mnist_784":
        # This file [mnist_784.csv] is made available under the Public Domain
        # Dedication and License v1.0 whose full text can be found at:
        # http://opendatacommons.org/licenses/pddl/1.0/
        return wget_file(
            filename="%s/mnist_784.csv" % DATA_DIR,
            url="https://datahub.io/machine-learning/mnist_784/r/mnist_784.csv",
            **kwds,
        )
    if name == "nyc_taxis":
        nyc_taxis_file = f"{DATA_DIR}/nyc_taxis.csv"
        if not os.path.exists(nyc_taxis_file):
            df = pd.read_csv(
                "https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2015-01.csv",
                index_col=False,
                nrows=200_000,
            )
            df.to_csv(nyc_taxis_file)
        return nyc_taxis_file
    if name.startswith("cluster:"):
        fname = name[len("cluster:"):] + ".txt"
        return wget_file(
            filename="%s/%s" % (DATA_DIR, fname),
            url="http://cs.joensuu.fi/sipu/datasets/%s" % fname,
        )
    raise ProgressiveError("Unknown dataset %s" % name)
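# Minimal usage sketch (assumptions: get_dataset and pandas are importable in
# the caller's environment, and the generated CSV has no header row, hence
# header=None below).
path = get_dataset("smallfile")          # path to a 30_000 x 10 random CSV under DATA_DIR
df = pd.read_csv(path, header=None)
print(df.shape)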
def __init__(self, filepath_or_buffer=None, filter_=None,
             force_valid_ids=True, fillvalues=None, timeout=None,
             save_context=None, recovery=0, recovery_table_size=3,
             save_step_size=100000, **kwds):
    self._add_slots(
        kwds, 'input_descriptors',
        [SlotDescriptor('filenames', type=Table, required=False)])
    super(CSVLoader, self).__init__(**kwds)
    self.default_step_size = kwds.get('chunksize', 1000)  # initial guess
    kwds.setdefault('chunksize', self.default_step_size)
    # Filter out the module keywords from the csv loader keywords
    csv_kwds = self._filter_kwds(kwds, pd.read_csv)
    # When called with a specified chunksize, it returns a parser
    self.filepath_or_buffer = filepath_or_buffer
    self.force_valid_ids = force_valid_ids
    self.parser = None
    self.csv_kwds = csv_kwds
    self._compression = csv_kwds.get('compression', "infer")
    csv_kwds['compression'] = None
    self._encoding = csv_kwds.get('encoding', None)
    csv_kwds['encoding'] = None
    self._rows_read = 0
    if filter_ is not None and not callable(filter_):
        raise ProgressiveError('filter parameter should be callable or None')
    self._filter = filter_
    # stream that returns a position through the 'tell()' method
    self._input_stream = None
    self._input_encoding = None
    self._input_compression = None
    # length of the file or input stream when available
    self._input_size = 0
    self._timeout = timeout
    self._table_params = dict(name=self.name, fillvalues=fillvalues)
    self._save_context = True if save_context is None and is_recoverable(
        filepath_or_buffer) else False
    self._recovery = recovery
    self._recovery_table_size = recovery_table_size
    self._recovery_table = None
    self._recovery_table_inv = None
    self._save_step_size = save_step_size
    self._last_saved_id = 0
    self._table = None
async def from_input(self, input_: JSon) -> str:
    if not isinstance(input_, dict):
        raise ProgressiveError("Expecting a dictionary")
    last = PsDict(self.psdict)  # shallow copy
    values = input_
    if self._translation is not None:
        res = {}
        for k, v in values.items():
            for syn in self._translation[k]:
                res[syn] = v
        values = res
    for (k, v) in input_.items():
        last[k] = v
    await self.scheduler().for_input(self)
    self.psdict.update(values)
    self._has_input = True
    return ""
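# Standalone illustration of the translation step above (plain Python, no
# ProgressiVis dependency): each incoming key is expanded to all of its
# declared synonyms before the stored dictionary is updated. The mapping and
# values below are made up for the example.
translation = {"x": ["xmin", "xmax"], "y": ["ymin", "ymax"]}
values = {"x": 0.5, "y": 1.5}
res = {}
for k, v in values.items():
    for syn in translation[k]:
        res[syn] = v
assert res == {"xmin": 0.5, "xmax": 0.5, "ymin": 1.5, "ymax": 1.5}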
def set_centroid(self, c, values):
    try:
        c = int(c)
    except ValueError:
        pass
    centroids = self._table
    # idx = centroids.id_to_index(c)
    if len(values) != len(self.columns):
        raise ProgressiveError('Expected %s values, received %s'
                               % (len(self.columns), values))
    _ = self.scheduler().for_input(self)
    centroids.loc[c, self.columns] = values
    # TODO unpack the table
    self.mbk.cluster_centers_[c] = centroids.loc[c, self.columns]
    return values
def __init__(self,
             filepath_or_buffer: Optional[Any] = None,
             filter_: Optional[Callable[[pd.DataFrame], pd.DataFrame]] = None,
             force_valid_ids: bool = True,
             fillvalues: Optional[Dict[str, Any]] = None,
             throttle: Union[bool, int, float] = False,
             **kwds: Any) -> None:
    super().__init__(**kwds)
    self.default_step_size = kwds.get("chunksize", 1000)  # initial guess
    kwds.setdefault("chunksize", self.default_step_size)
    # Filter out the module keywords from the csv loader keywords
    csv_kwds: Dict[str, Any] = filter_kwds(kwds, pd.read_csv)
    # When called with a specified chunksize, it returns a parser
    self.filepath_or_buffer = filepath_or_buffer
    self.force_valid_ids = force_valid_ids
    if throttle and isinstance(throttle, integer_types + (float,)):
        self.throttle = throttle
    else:
        self.throttle = False
    self.parser: Optional[pd.TextReader] = None
    self.csv_kwds = csv_kwds
    self._compression: Any = csv_kwds.get("compression", "infer")
    csv_kwds["compression"] = None
    self._encoding: Any = csv_kwds.get("encoding", None)
    csv_kwds["encoding"] = None
    self._nrows = csv_kwds.get("nrows")
    csv_kwds["nrows"] = None  # nrows clashes with chunksize
    self._rows_read = 0
    if filter_ is not None and not callable(filter_):
        raise ProgressiveError("filter parameter should be callable or None")
    self._filter: Optional[Callable[[pd.DataFrame], pd.DataFrame]] = filter_
    # stream that returns a position through the 'tell()' method
    self._input_stream: Optional[io.IOBase] = None
    self._input_encoding: Optional[str] = None
    self._input_compression: Optional[str] = None
    # length of the file or input stream when available
    self._input_size = 0
    self._file_mode = False
    self._table_params: Dict[str, Any] = dict(name=self.name,
                                              fillvalues=fillvalues)
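# Hypothetical usage sketch (not part of the original source): load one of the
# generated datasets progressively with this loader. The import paths and the
# `scheduler` keyword are assumptions about the surrounding ProgressiVis API.
from progressivis import Scheduler        # assumed import path
from progressivis.io import CSVLoader     # assumed import path

s = Scheduler()
csv = CSVLoader(get_dataset("bigfile"), index_col=False, header=None,
                scheduler=s)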
def set_centroid(self, c: int, values: List[float]) -> List[float]:
    try:
        c = int(c)
    except ValueError:
        pass
    centroids = self.table
    # idx = centroids.id_to_index(c)
    dfslot = self.get_input_slot("table")
    input_df = dfslot.data()
    columns = self.get_columns(input_df, "table")
    if len(values) != len(columns):
        raise ProgressiveError(
            f"Expected {len(columns)} values, received {values}")
    centroids.loc[c, columns] = values
    # TODO unpack the table
    centers = centroids.loc[c, columns]
    assert isinstance(centers, BaseTable)
    self.mbk.cluster_centers_[c] = list(centers)
    return self.mbk.cluster_centers_.tolist()
async def from_input(self, input_: JSon) -> str:
    if not isinstance(input_, dict):
        raise ProgressiveError("Expecting a dictionary")
    if self.result is None and self.get_input_slot("like") is None:
        error = f"Variable {self.name} with no initial value and no input slot"
        logger.error(error)
        return error
    if self.result is None:
        error = f"Variable {self.name} has to run once before receiving input"
        logger.error(error)
        return error
    last: PsDict = copy.copy(self.psdict)
    error = ""
    for (k, v) in input_.items():
        if k in last:
            last[k] = v
        else:
            error += f"Invalid key {k} ignored. "
    await self.scheduler().for_input(self)
    self.psdict.update(last)
    return error
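# Standalone illustration of the key-validation loop above (plain Python, no
# ProgressiVis dependency): only keys already present in the current
# dictionary are accepted; unknown keys are reported and ignored. The data is
# made up for the example.
last = {"a": 1, "b": 2}
error = ""
for k, v in {"a": 10, "z": 99}.items():
    if k in last:
        last[k] = v
    else:
        error += f"Invalid key {k} ignored. "
assert last == {"a": 10, "b": 2}
assert error == "Invalid key z ignored. "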
def get_dataset(name, **kwds):
    if not os.path.isdir(DATA_DIR):
        os.mkdir(DATA_DIR)
    if name == 'bigfile':
        return generate_random_csv('%s/bigfile.csv' % DATA_DIR, 1000000, 30)
    if name == 'bigfile_mvn':
        return generate_random_multivariate_normal_csv(
            '%s/bigfile_mvn.csv' % DATA_DIR, 900000)
    if name == 'smallfile':
        return generate_random_csv('%s/smallfile.csv' % DATA_DIR, 30000, 10)
    if name == 'warlogs':
        return wget_file(
            filename='%s/warlogs.vec.bz2' % DATA_DIR,
            url='http://www.cs.ubc.ca/labs/imager/video/2014/QSNE/warlogs.vec.bz2',
            **kwds)
    if name.startswith('cluster:'):
        fname = name[len('cluster:'):] + ".txt"
        return wget_file(filename='%s/%s' % (DATA_DIR, fname),
                         url='http://cs.joensuu.fi/sipu/datasets/%s' % fname)
    raise ProgressiveError('Unknown dataset %s' % name)
def from_input(self, input_):
    if not isinstance(input_, dict):
        raise ProgressiveError('Expecting a dictionary')
    if self._table is None and self.get_input_slot('like') is None:
        error = ('Variable %s with no initial value and no input slot'
                 % self.name)
        logger.error(error)
        return error
    last = self._table.last()
    if last is None:
        last = {v: None for v in self._table.columns}
    else:
        last = last.to_json()
    error = ''
    for (k, v) in six.iteritems(input_):
        if k in last:
            last[k] = v
        else:
            error += 'Invalid key %s ignored. ' % k
    _ = self.scheduler().for_input(self)
    # last['_update'] = run_number
    self._table.add(last)
    return error
def from_input(self, input):
    _ = input
    if not isinstance(input, dict):
        raise ProgressiveError('Expecting a dictionary')
    if self._table is None:
        error = 'AddToRow %s with no initial value and no input slot' % self.name
        logger.error(error)
        return error
    run_number = 0
    for (row_, value) in six.iteritems(input):
        # self._df.loc[row, self.get_columns(self._df)] += value
        current_row = self._table.row(row_).to_dict(ordered=True)
        vals = np.array(list(current_row.values()))
        vals += value
        self._table.loc[row_, :] = vals
        # TODO: implement __iadd__() on Table
        if run_number == 0:
            run_number = self.scheduler().for_input(self)
        # self._df.at[row, UPDATE_COLUMN] = run_number
    if run_number != 0:
        self._last_update = run_number
    return "OK"
def validate_parser(self, run_number: int) -> ModuleState:
    if self.parser is None:
        if self.filepath_or_buffer is not None:
            try:
                self.parser = pd.read_csv(
                    self.open(self.filepath_or_buffer), **self.csv_kwds)
            except IOError as e:
                logger.error("Cannot open file %s: %s",
                             self.filepath_or_buffer, e)
                self.parser = None
                return self.state_terminated
            self.filepath_or_buffer = None
            self._file_mode = True
        else:
            if not self.has_input_slot("filenames"):
                return self.state_terminated
            fn_slot = self.get_input_slot("filenames")
            if fn_slot.output_module is None:
                return self.state_terminated
            fn_slot.update(run_number)
            if fn_slot.deleted.any() or fn_slot.updated.any():
                raise ProgressiveError("Cannot handle input file changes")
            df = fn_slot.data()
            while self.parser is None:
                indices = fn_slot.created.next(length=1)
                assert isinstance(indices, slice)
                if indices.stop == indices.start:
                    return self.state_blocked
                filename = df.at[indices.start, "filename"]
                try:
                    self.parser = pd.read_csv(
                        self.open(filename), **self.csv_kwds)
                except IOError as e:
                    logger.error("Cannot open file %s: %s", filename, e)
                    self.parser = None
                    # fall through
    return self.state_ready
def validate_parser(self, run_number):
    if self.parser is None:
        if self.filepath_or_buffer is not None:
            if not self._recovery:
                try:
                    self.parser = read_csv(
                        self.create_input_source(self.filepath_or_buffer),
                        **self.csv_kwds)
                except IOError as e:
                    logger.error('Cannot open file %s: %s',
                                 self.filepath_or_buffer, e)
                    self.parser = None
                    return self.state_terminated
                self.filepath_or_buffer = None
            else:  # do recovery
                try:
                    if self._recovery_table is None:
                        self._recovery_table = Table(
                            name='csv_loader_recovery', create=False)
                    if self._recovery_table_inv is None:
                        self._recovery_table_inv = Table(
                            name='csv_loader_recovery_invariant',
                            create=False)
                    if self._table is None:
                        self._table_params['name'] = \
                            self._recovery_table_inv['table_name'].loc[0]
                        self._table_params['create'] = False
                        self._table = Table(**self._table_params)
                except Exception as e:  # TODO: specify the exception?
                    logger.error('Cannot access recovery table %s', e)
                    return self.state_terminated
                try:
                    last_ = self._recovery_table.eval(
                        "last_id=={}".format(len(self._table)),
                        as_slice=False)
                    len_last = len(last_)
                    if len_last > 1:
                        logger.error("Inconsistent recovery table")
                        return self.state_terminated
                    # last_ = self._recovery_table.argmax()['offset']
                    snapshot = None
                    if len_last == 1:
                        snapshot = self._recovery_table.row(
                            last_[0]).to_dict(ordered=True)
                        if not check_snapshot(snapshot):
                            snapshot = None
                    if snapshot is None:
                        # i.e. snapshot not yet found or inconsistent
                        max_ = -1
                        for i in self._recovery_table.eval(
                                "last_id<{}".format(len(self._table)),
                                as_slice=False):
                            sn = self._recovery_table.row(i).to_dict(
                                ordered=True)
                            if check_snapshot(sn) and sn['last_id'] > max_:
                                max_, snapshot = sn['last_id'], sn
                        if max_ < 0:
                            logger.error('Cannot access recovery table')
                            return self.state_terminated
                        self._table.drop(slice(max_, None, None))
                    self._recovered_csv_table_name = snapshot['table_name']
                except Exception as e:
                    logger.error('Cannot read the snapshot %s', e)
                    return self.state_terminated
                try:
                    self.parser = recovery(snapshot, self.filepath_or_buffer,
                                           **self.csv_kwds)
                except Exception as e:
                    # print('Cannot recover from snapshot {}, {}'.format(snapshot, e))
                    logger.error('Cannot recover from snapshot %s, %s',
                                 snapshot, e)
                    self.parser = None
                    return self.state_terminated
                self.filepath_or_buffer = None
        else:  # this case does not support recovery
            fn_slot = self.get_input_slot('filenames')
            if fn_slot is None or fn_slot.output_module is None:
                return self.state_terminated
            with fn_slot.lock:
                fn_slot.update(run_number)
                if fn_slot.deleted.any() or fn_slot.updated.any():
                    raise ProgressiveError('Cannot handle input file changes')
                df = fn_slot.data()
                while self.parser is None:
                    indices = fn_slot.created.next(1)
                    if indices.stop == indices.start:
                        return self.state_blocked
                    filename = df.at[indices.start, 'filename']
                    try:
                        self.parser = read_csv(
                            self.create_input_source(filename),
                            **self.csv_kwds)
                    except IOError as e:
                        logger.error('Cannot open file %s: %s', filename, e)
                        self.parser = None
                        # fall through
    return self.state_ready