Example #1
 def subscribe(self, var, vocabulary):
     """
     Example: vocabulary = {'x': 'longitude', 'y': 'latitude'}
     """
     if not isinstance(var, Variable):
         raise ProgressiveError('Expecting a Variable module')
     if not isinstance(vocabulary, dict):
         raise ProgressiveError('Expecting a dictionary')
     if frozenset(vocabulary.keys()) != self._key or not all_string(
             vocabulary.values()):
         raise ProgressiveError('Inconsistent vocabulary')
     self._subscriptions.append((var, vocabulary))
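A minimal wiring sketch, assuming the VirtualVariable constructor from Example #4 and a concrete Variable module; the names, the vocabulary, and the scheduler keyword are illustrative assumptions:

# Hypothetical setup: 's' is an existing progressivis scheduler.
vv = VirtualVariable(['x', 'y'], scheduler=s)
var = Variable(scheduler=s)
# Route 'x' to 'longitude' and 'y' to 'latitude' on the subscribed variable.
vv.subscribe(var, {'x': 'longitude', 'y': 'latitude'})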
Example #2
 def __init__(self,
              init_val: Optional[Dict[str, Any]] = None,
              translation: Optional[Dict[str, Any]] = None,
              **kwds: Any) -> None:
     super().__init__(**kwds)
     self.tags.add(self.TAG_INPUT)
     self._has_input = False
     if not (translation is None or isinstance(translation, dict)):
         raise ProgressiveError("translation must be a dictionary")
     self._translation = translation
     if not (init_val is None or isinstance(init_val, dict)):
         raise ProgressiveError("init_val must be a dictionary")
     self.result = PsDict({} if init_val is None else init_val)
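A construction sketch, assuming the enclosing class is the Variable module seen in Example #11; the key, its synonyms, and the scheduler keyword are made up for illustration:

# Each translation entry maps a key to an iterable of synonyms;
# Example #7 shows every synonym receiving the input value.
v = Variable(init_val={'a': 0.0},
             translation={'a': ['alpha', 'A']},
             scheduler=s)  # 's' is an assumed scheduler instance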
Example #3
 def from_input(self, input_):
     if not isinstance(input_, dict):
         raise ProgressiveError('Expecting a dictionary')
     for var, vocabulary in self._subscriptions:
         translation = {vocabulary[k]: v for k, v in input_.items()}
         var.from_input(translation)
     return ''
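The renaming step is a plain dict comprehension over the subscription's vocabulary; with the vocabulary from Example #1, for instance:

vocabulary = {'x': 'longitude', 'y': 'latitude'}
input_ = {'x': 3.0, 'y': 5.0}
translation = {vocabulary[k]: v for k, v in input_.items()}
assert translation == {'longitude': 3.0, 'latitude': 5.0}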
Example #4
 def __init__(self, names, **kwds):
     if not all_string(names):
         raise ProgressiveError(
             'names {} must be a set of strings'.format(names))
     self._names = names
     self._key = frozenset(names)
     self._subscriptions = []
     table = None
     super(VirtualVariable, self).__init__(table, **kwds)
Example #5
def get_dataset(name: str, **kwds: Any) -> str:
    if not os.path.isdir(DATA_DIR):
        os.mkdir(DATA_DIR)
    if name == "bigfile":
        kw = _check_kwds(kwds, rows=1_000_000, cols=30)
        return generate_random_csv("%s/bigfile.csv" % DATA_DIR, **kw)
    if name == "bigfile_multiscale":
        kw = _check_kwds(kwds, rows=5_000_000)
        return generate_multiscale_random_csv(
            "%s/bigfile_multiscale.csv" % DATA_DIR, **kw)
    if name == "bigfile_mvn":
        kw = _check_kwds(kwds, rows=900_000)
        return generate_random_multivariate_normal_csv(
            "%s/bigfile_mvn.csv" % DATA_DIR, **kw)
    if name == "smallfile":
        kw = _check_kwds(kwds, rows=30_000, cols=10)
        return generate_random_csv("%s/smallfile.csv" % DATA_DIR, **kw)
    if name == "warlogs":
        return wget_file(
            filename="%s/warlogs.vec.bz2" % DATA_DIR,
            url="http://www.cs.ubc.ca/labs/imager/video/2014/QSNE/warlogs.vec.bz2",
            **kwds,
        )
    if name == "mnist_784":
        # This file [mnist_784.csv] is made available under the Public Domain
        # Dedication and License v1.0, whose full text can be found at:
        # http://opendatacommons.org/licenses/pddl/1.0/
        return wget_file(
            filename="%s/mnist_784.csv" % DATA_DIR,
            url="https://datahub.io/machine-learning/mnist_784/r/mnist_784.csv",
            **kwds,
        )
    if name == "nyc_taxis":
        nyc_taxis_file = f"{DATA_DIR}/nyc_taxis.csv"
        if not os.path.exists(nyc_taxis_file):
            df = pd.read_csv(
                "https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2015-01.csv",
                index_col=False,
                nrows=200_000,
            )
            df.to_csv(nyc_taxis_file)
        return nyc_taxis_file
    if name.startswith("cluster:"):
        fname = name[len("cluster:"):] + ".txt"
        return wget_file(
            filename="%s/%s" % (DATA_DIR, fname),
            url="http://cs.joensuu.fi/sipu/datasets/%s" % fname,
        )
    raise ProgressiveError("Unknown dataset %s" % name)
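Typical use: the function returns a local path under DATA_DIR, generating or downloading the file only when it is missing. A short sketch (the exact layout of the generated CSV is an assumption):

path = get_dataset("smallfile")  # generated on first call, reused afterwards
df = pd.read_csv(path)           # about 30_000 rows x 10 columns per the defaults above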
Example #6
 def __init__(self,
              filepath_or_buffer=None,
              filter_=None,
              force_valid_ids=True,
              fillvalues=None,
              timeout=None,
              save_context=None,
              recovery=0,
              recovery_table_size=3,
              save_step_size=100000,
              **kwds):
     self._add_slots(
         kwds, 'input_descriptors',
         [SlotDescriptor('filenames', type=Table, required=False)])
     super(CSVLoader, self).__init__(**kwds)
     self.default_step_size = kwds.get('chunksize', 1000)  # initial guess
     kwds.setdefault('chunksize', self.default_step_size)
     # Filter out the module keywords from the csv loader keywords
     csv_kwds = self._filter_kwds(kwds, pd.read_csv)
     # When called with a specified chunksize, it returns a parser
     self.filepath_or_buffer = filepath_or_buffer
     self.force_valid_ids = force_valid_ids
     self.parser = None
     self.csv_kwds = csv_kwds
     self._compression = csv_kwds.get('compression', "infer")
     csv_kwds['compression'] = None
     self._encoding = csv_kwds.get('encoding', None)
     csv_kwds['encoding'] = None
     self._rows_read = 0
     if filter_ is not None and not callable(filter_):
         raise ProgressiveError(
             'filter parameter should be callable or None')
     self._filter = filter_
     self._input_stream = None  # stream that returns a position through the 'tell()' method
     self._input_encoding = None
     self._input_compression = None
     self._input_size = 0  # length of the file or input stream when available
     self._timeout = timeout
     self._table_params = dict(name=self.name, fillvalues=fillvalues)
      self._save_context = (save_context is None
                            and is_recoverable(filepath_or_buffer))
     self._recovery = recovery
     self._recovery_table_size = recovery_table_size
     self._recovery_table = None
     self._recovery_table_inv = None
     self._save_step_size = save_step_size
     self._last_saved_id = 0
     self._table = None
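A minimal construction sketch; the scheduler keyword and the forwarded pd.read_csv option are assumptions based on the signature and the keyword filtering above:

loader = CSVLoader(
    get_dataset('smallfile'),  # file path, or any buffer pd.read_csv accepts
    index_col=False,           # forwarded to pd.read_csv after keyword filtering
    scheduler=s,               # assumed module keyword consumed by the superclass
)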
Example #7
 async def from_input(self, input_: JSon) -> str:
     if not isinstance(input_, dict):
         raise ProgressiveError("Expecting a dictionary")
     last = PsDict(self.psdict)  # shallow copy
     values = input_
     if self._translation is not None:
         res = {}
         for k, v in values.items():
             for syn in self._translation[k]:
                 res[syn] = v
         values = res
     for (k, v) in input_.items():
         last[k] = v
     await self.scheduler().for_input(self)
     self.psdict.update(values)
     self._has_input = True
     return ""
Example #8
    def set_centroid(self, c, values):
        try:
            c = int(c)
        except ValueError:
            pass

        centroids = self._table
        #idx = centroids.id_to_index(c)

        if len(values) != len(self.columns):
            raise ProgressiveError('Expected %s values, received %s' %
                                   (len(self.columns), values))
        _ = self.scheduler().for_input(self)
        centroids.loc[c, self.columns] = values
        #TODO unpack the table
        self.mbk.cluster_centers_[c] = centroids.loc[c, self.columns]
        return values
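A usage sketch: values must supply exactly one entry per tracked column, otherwise the ProgressiveError above is raised (module and column names are hypothetical):

# Assuming the module tracks two columns such as ['x', 'y']:
module.set_centroid(0, [1.5, -2.0])  # accepted: two values for two columns
module.set_centroid(0, [1.5])        # raises ProgressiveError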
Example #9
    def __init__(self,
                 filepath_or_buffer: Optional[Any] = None,
                 filter_: Optional[Callable[[pd.DataFrame],
                                            pd.DataFrame]] = None,
                 force_valid_ids: bool = True,
                 fillvalues: Optional[Dict[str, Any]] = None,
                 throttle: Union[bool, int, float] = False,
                 **kwds: Any) -> None:
        super().__init__(**kwds)
        self.default_step_size = kwds.get("chunksize", 1000)  # initial guess
        kwds.setdefault("chunksize", self.default_step_size)
        # Filter out the module keywords from the csv loader keywords
        csv_kwds: Dict[str, Any] = filter_kwds(kwds, pd.read_csv)
        # When called with a specified chunksize, it returns a parser
        self.filepath_or_buffer = filepath_or_buffer
        self.force_valid_ids = force_valid_ids
        if throttle and isinstance(throttle, integer_types + (float, )):
            self.throttle = throttle
        else:
            self.throttle = False
        self.parser: Optional[pd.TextReader] = None
        self.csv_kwds = csv_kwds
        self._compression: Any = csv_kwds.get("compression", "infer")
        csv_kwds["compression"] = None
        self._encoding: Any = csv_kwds.get("encoding", None)
        csv_kwds["encoding"] = None
        self._nrows = csv_kwds.get("nrows")
        csv_kwds["nrows"] = None  # nrows clashes with chunksize

        self._rows_read = 0
        if filter_ is not None and not callable(filter_):
            raise ProgressiveError(
                "filter parameter should be callable or None")
        self._filter: Optional[Callable[[pd.DataFrame],
                                        pd.DataFrame]] = filter_
        # stream that returns a position through the 'tell()' method
        self._input_stream: Optional[io.IOBase] = None
        self._input_encoding: Optional[str] = None
        self._input_compression: Optional[str] = None
        self._input_size = 0  # length of the file or input stream when available
        self._file_mode = False
        self._table_params: Dict[str, Any] = dict(name=self.name,
                                                  fillvalues=fillvalues)
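The filter_ parameter accepts any DataFrame-to-DataFrame callable, and a numeric throttle caps the rows ingested per step; a sketch with a made-up file and column name:

loader = CSVLoader(
    'data.csv',                          # hypothetical input file
    filter_=lambda df: df[df['a'] > 0],  # keep only rows with a positive 'a'
    throttle=10_000,                     # ints/floats enable throttling, False disables it
)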
Example #10
    def set_centroid(self, c: int, values: List[float]) -> List[float]:
        try:
            c = int(c)
        except ValueError:
            pass

        centroids = self.table
        # idx = centroids.id_to_index(c)

        dfslot = self.get_input_slot("table")
        input_df = dfslot.data()
        columns = self.get_columns(input_df, "table")
        if len(values) != len(columns):
            raise ProgressiveError(
                f"Expected {len(columns)} values, received {values}")
        centroids.loc[c, columns] = values
        # TODO unpack the table
        centers = centroids.loc[c, columns]
        assert isinstance(centers, BaseTable)
        self.mbk.cluster_centers_[c] = list(centers)
        return self.mbk.cluster_centers_.tolist()
Example #11
 async def from_input(self, input_: JSon) -> str:
     if not isinstance(input_, dict):
         raise ProgressiveError("Expecting a dictionary")
     if self.result is None and self.get_input_slot("like") is None:
         error = f"Variable {self.name} with no initial value and no input slot"
         logger.error(error)
         return error
     if self.result is None:
         error = f"Variable {self.name} has to run once before receiving input"
         logger.error(error)
         return error
     last: PsDict = copy.copy(self.psdict)
     error = ""
     for (k, v) in input_.items():
         if k in last:
             last[k] = v
         else:
             error += f"Invalid key {k} ignored. "
     await self.scheduler().for_input(self)
     self.psdict.update(last)
     return error
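Unknown keys are not fatal here: they are skipped and reported through the returned string, which callers can inspect:

err = await var.from_input({'known': 1, 'bogus': 2})  # 'bogus' is not in the dict
if err:
    print(err)  # e.g. "Invalid key bogus ignored. "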
Example #12
def get_dataset(name, **kwds):
    if not os.path.isdir(DATA_DIR):
        os.mkdir(DATA_DIR)
    if name == 'bigfile':
        return generate_random_csv('%s/bigfile.csv' % DATA_DIR, 1000000, 30)
    if name == 'bigfile_mvn':
        return generate_random_multivariate_normal_csv(
            '%s/bigfile_mvn.csv' % DATA_DIR, 900000)
    if name == 'smallfile':
        return generate_random_csv('%s/smallfile.csv' % DATA_DIR, 30000, 10)
    if name == 'warlogs':
        return wget_file(
            filename='%s/warlogs.vec.bz2' % DATA_DIR,
            url='http://www.cs.ubc.ca/labs/imager/video/2014/QSNE/warlogs.vec.bz2',
            **kwds)
    if name.startswith('cluster:'):
        fname = name[len('cluster:'):] + ".txt"
        return wget_file(filename='%s/%s' % (DATA_DIR, fname),
                         url='http://cs.joensuu.fi/sipu/datasets/%s' % fname)
    raise ProgressiveError('Unknown dataset %s' % name)
Example #13
 def from_input(self, input_):
     if not isinstance(input_, dict):
         raise ProgressiveError('Expecting a dictionary')
     if self._table is None and self.get_input_slot('like') is None:
         error = 'Variable %s with no initial value and no input slot' % self.name
         logger.error(error)
         return error
     last = self._table.last()
     if last is None:
         last = {v: None for v in self._table.columns}
     else:
         last = last.to_json()
     error = ''
     for (k, v) in six.iteritems(input_):
         if k in last:
             last[k] = v
         else:
             error += 'Invalid key %s ignored. ' % k
     _ = self.scheduler().for_input(self)
     #last['_update'] = run_number
     self._table.add(last)
     return error
Example #14
    def from_input(self, input):
        if not isinstance(input, dict):
            raise ProgressiveError('Expecting a dictionary')
        if self._table is None:
            error = 'AddToRow %s with no initial value and no input slot' % self.name
            logger.error(error)
            return error

        run_number = 0
        for (row_, value) in six.iteritems(input):
            #self._df.loc[row, self.get_columns(self._df)] += value
            current_row = self._table.row(row_).to_dict(ordered=True)
            vals = np.array(list(current_row.values()))
            vals += value
            self._table.loc[row_, :] = vals # TODO: implement __iadd__() on Table
            if run_number == 0:
                run_number = self.scheduler().for_input(self)
            #self._df.at[row, UPDATE_COLUMN] = run_number
        if run_number != 0:
            self._last_update = run_number
        return "OK"
Example #15
 def validate_parser(self, run_number: int) -> ModuleState:
     if self.parser is None:
         if self.filepath_or_buffer is not None:
             try:
                 self.parser = pd.read_csv(
                     self.open(self.filepath_or_buffer), **self.csv_kwds)
             except IOError as e:
                 logger.error("Cannot open file %s: %s",
                              self.filepath_or_buffer, e)
                 self.parser = None
                 return self.state_terminated
             self.filepath_or_buffer = None
             self._file_mode = True
         else:
             if not self.has_input_slot("filenames"):
                 return self.state_terminated
             fn_slot = self.get_input_slot("filenames")
             if fn_slot.output_module is None:
                 return self.state_terminated
             fn_slot.update(run_number)
             if fn_slot.deleted.any() or fn_slot.updated.any():
                 raise ProgressiveError("Cannot handle input file changes")
             df = fn_slot.data()
             while self.parser is None:
                 indices = fn_slot.created.next(length=1)
                 assert isinstance(indices, slice)
                 if indices.stop == indices.start:
                     return self.state_blocked
                 filename = df.at[indices.start, "filename"]
                 try:
                     self.parser = pd.read_csv(self.open(filename),
                                               **self.csv_kwds)
                 except IOError as e:
                     logger.error("Cannot open file %s: %s", filename, e)
                     self.parser = None
                     # fall through
     return self.state_ready
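The parser built here relies on standard pandas behavior: calling read_csv with a chunksize returns a TextFileReader that yields DataFrame chunks rather than a single frame. A standalone illustration:

import pandas as pd

parser = pd.read_csv('data.csv', chunksize=1000)  # a TextFileReader, not a DataFrame
chunk = next(parser)                              # first chunk of up to 1000 rows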
Example #16
    def validate_parser(self, run_number):
        if self.parser is None:
            if self.filepath_or_buffer is not None:
                if not self._recovery:
                    try:
                        self.parser = read_csv(
                            self.create_input_source(self.filepath_or_buffer),
                            **self.csv_kwds)
                    except IOError as e:
                        logger.error('Cannot open file %s: %s',
                                     self.filepath_or_buffer, e)
                        self.parser = None
                        return self.state_terminated
                    self.filepath_or_buffer = None
                else:  # do recovery
                    try:
                        if self._recovery_table is None:
                            self._recovery_table = Table(
                                name='csv_loader_recovery', create=False)
                        if self._recovery_table_inv is None:
                            self._recovery_table_inv = Table(
                                name='csv_loader_recovery_invariant',
                                create=False)
                        if self._table is None:
                            self._table_params['name'] = (
                                self._recovery_table_inv['table_name'].loc[0])
                            self._table_params['create'] = False
                            self._table = Table(**self._table_params)
                    except Exception as e:  # TODO: specify the exception?
                        logger.error('Cannot access recovery table: %s', e)
                        return self.state_terminated
                    try:
                        last_ = self._recovery_table.eval(
                            "last_id=={}".format(len(self._table)),
                            as_slice=False)
                        len_last = len(last_)
                        if len_last > 1:
                            logger.error("Inconsistent recovery table")
                            return self.state_terminated
                        #last_ = self._recovery_table.argmax()['offset']
                        snapshot = None
                        if len_last == 1:
                            snapshot = self._recovery_table.row(
                                last_[0]).to_dict(ordered=True)
                            if not check_snapshot(snapshot):
                                snapshot = None
                        if snapshot is None:  # i.e. snapshot not yet found or inconsistent
                            max_ = -1
                            for i in self._recovery_table.eval(
                                    "last_id<{}".format(len(self._table)),
                                    as_slice=False):
                                sn = self._recovery_table.row(i).to_dict(
                                    ordered=True)
                                if check_snapshot(sn) and sn['last_id'] > max_:
                                    max_, snapshot = sn['last_id'], sn
                            if max_ < 0:
                                logger.error('Cannot access recovery table')
                                return self.state_terminated
                            self._table.drop(slice(max_, None, None))
                        self._recovered_csv_table_name = snapshot['table_name']
                    except Exception as e:
                        logger.error('Cannot read the snapshot %s', e)
                        return self.state_terminated
                    try:
                        self.parser = recovery(snapshot,
                                               self.filepath_or_buffer,
                                               **self.csv_kwds)
                    except Exception as e:
                        #print('Cannot recover from snapshot {}, {}'.format(snapshot, e))
                        logger.error('Cannot recover from snapshot %s, %s',
                                     snapshot, e)
                        self.parser = None
                        return self.state_terminated
                    self.filepath_or_buffer = None

            else:  # this case does not support recovery
                fn_slot = self.get_input_slot('filenames')
                if fn_slot is None or fn_slot.output_module is None:
                    return self.state_terminated
                with fn_slot.lock:
                    fn_slot.update(run_number)
                    if fn_slot.deleted.any() or fn_slot.updated.any():
                        raise ProgressiveError(
                            'Cannot handle input file changes')
                    df = fn_slot.data()
                    while self.parser is None:
                        indices = fn_slot.created.next(1)
                        if indices.stop == indices.start:
                            return self.state_blocked
                        filename = df.at[indices.start, 'filename']
                        try:
                            self.parser = read_csv(
                                self.create_input_source(filename),
                                **self.csv_kwds)
                        except IOError as e:
                            logger.error('Cannot open file %s: %s', filename,
                                         e)
                            self.parser = None
                        # fall through
        return self.state_ready