Example #1
    def insert_items(self, val_key_pairs, prepare_inserts=r.prepare_inserts):
        keys = [k for (v, k) in val_key_pairs]
        adjustments, new_keys = prepare_inserts(self._slist, keys)
        if adjustments:
            self.num_update_events += 1
            self.num_updated_keys += len(adjustments)

        # Updating items is a bit tricky: we have to do it without violating order (just changing
        # the key of an existing item easily might), so we remove the items first. And we can only
        # rely on indices if we scan items in backwards order.
        items = [
            self._slist.pop(index) for (index, key) in reversed(adjustments)
        ]
        items.reverse()
        for (index, key), item in izip(adjustments, items):
            item.key = key
        self._slist.update(items)

        # Now add the new items.
        self._slist.update(
            Item(val, new_key)
            for (val, _), new_key in izip(val_key_pairs, new_keys))

        # For testing, pass along the return value from prepare_inserts.
        return adjustments, new_keys
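The comment above describes the re-keying dance for a sorted container: pop the affected items by index (in reverse, so earlier indices stay valid), mutate their keys while they are outside the container, then bulk re-insert. A minimal sketch of the same idea, assuming the underlying _slist behaves like a sortedcontainers.SortedKeyList; the Item class, sample keys and adjustment pairs here are invented stand-ins, not part of the original code.

from sortedcontainers import SortedKeyList

class Item(object):
    def __init__(self, val, key):
        self.val, self.key = val, key

slist = SortedKeyList((Item(v, k) for k, v in enumerate("abcd")),
                      key=lambda it: it.key)
# (index, new_key) pairs for items whose keys must change.
adjustments = [(1, 10), (2, 20)]

# Pop in reverse so earlier indices stay valid, then re-key and re-add.
items = [slist.pop(index) for index, _ in reversed(adjustments)]
items.reverse()
for (_, new_key), item in zip(adjustments, items):
    item.key = new_key
slist.update(items)

print([(it.key, it.val) for it in slist])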
Example #2
def safe_str_cmp(a, b):
    """This function compares strings in somewhat constant time.  This
    requires that the length of at least one string is known in advance.

    Returns `True` if the two strings are equal, or `False` if they are not.

    .. versionadded:: 0.7
    """
    if isinstance(a, text_type):
        a = a.encode('utf-8')
    if isinstance(b, text_type):
        b = b.encode('utf-8')

    if _builtin_safe_str_cmp is not None:
        return _builtin_safe_str_cmp(a, b)

    if len(a) != len(b):
        return False

    rv = 0
    if PY2:
        for x, y in izip(a, b):
            rv |= ord(x) ^ ord(y)
    else:
        for x, y in izip(a, b):
            rv |= x ^ y

    return rv == 0
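The XOR-accumulate loop above is the classic constant-time comparison: every byte pair is inspected regardless of where the first mismatch occurs, so the timing does not leak the mismatch position. A minimal sketch of the same idea on Python 3 bytes, alongside the stdlib primitive (hmac.compare_digest) that modern code would normally reach for; the sample secrets are made up.

import hmac

def naive_constant_time_eq(a, b):
    """XOR-accumulate every byte pair; only the final result is checked."""
    if len(a) != len(b):
        return False
    rv = 0
    for x, y in zip(a, b):
        rv |= x ^ y
    return rv == 0

stored = b"s3cr3t-token"
print(naive_constant_time_eq(stored, b"s3cr3t-token"))  # True
print(naive_constant_time_eq(stored, b"s3cr3t-tokex"))  # False
print(hmac.compare_digest(stored, b"s3cr3t-token"))     # True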
Example #3
def test_adagrad():
    """
    Make sure that learning_rule.AdaGrad obtains the same parameter values as
    with a hand-crafted AdaGrad implementation, given a dummy model and
    learning rate scaler for each parameter.
    Reference:
    "Adaptive subgradient methods for online learning and
    stochastic optimization", Duchi J, Hazan E, Singer Y.
    """

    cost, model, dataset, sgd, state = prepare_adagrad_test()

    def adagrad_manual(model, state):
        rval = []
        for scale, param in izip(scales, model.get_params()):
            pstate = state[param]
            param_val = param.get_value()
            # begin adagrad
            pstate['sg2'] += param_val**2
            dx_t = -(scale * learning_rate / np.sqrt(pstate['sg2']) *
                     param_val)
            rval += [param_val + dx_t]
        return rval

    manual = adagrad_manual(model, state)
    sgd.train(dataset=dataset)
    assert all(
        np.allclose(manual_param, sgd_param.get_value())
        for manual_param, sgd_param in izip(manual, model.get_params()))

    manual = adagrad_manual(model, state)
    sgd.train(dataset=dataset)
    assert all(
        np.allclose(manual_param, sgd_param.get_value())
        for manual_param, sgd_param in izip(manual, model.get_params()))
Example #4
    def todok(self):
        from .dok import dok_matrix

        dok = dok_matrix((self.shape), dtype=self.dtype)

        dok.update(izip(izip(self.row, self.col), self.data))

        return dok
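The one-liner above pairs each (row, col) coordinate with its value and loads the pairs into the dict-of-keys matrix in one update; this reflects an older internal implementation (newer SciPy versions restrict calling dok_matrix.update directly). The pairing itself is ordinary zip, as this small sketch with a toy COO matrix shows:

import numpy as np
from scipy.sparse import coo_matrix

coo = coo_matrix(np.array([[0, 2], [3, 0]]))

# Pair each (row, col) coordinate with its value, as the izip chain does.
mapping = {(int(r), int(c)): int(v)
           for r, c, v in zip(coo.row, coo.col, coo.data)}
print(mapping)            # {(0, 1): 2, (1, 0): 3}
print(coo.todok()[0, 1])  # 2 -- the public conversion stores the same entries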
Example #5
    def __getitem__(self, key):
        try:
            if isinstance(key, int) and (key >= 0):
                if key in self.cache:
                    return self.cache[key]
                elif key < self.stop:
                    self.stop = 0
                    self.iterator = iter(self.f())

                delta = key - self.stop
                result = next(islice(self.iterator, delta, delta + 1))
                self.cache[key] = result
                self.stop = key + 1
                return result

            elif isinstance(key, slice):
                if key.start is None and key.stop is None:
                    # Whole sequence is asked
                    return list(self.f())
                start = key.start or 0
                step = key.step or 1

                indexes = count(start, step)
                index_upd = start
                while (key.stop is None or index_upd < key.stop) and index_upd in self.cache:
                    index_upd += step

                if index_upd < self.stop and (key.stop is None or index_upd < key.stop):
                    self.iterator = iter(self.f())
                    result = list(islice(self.iterator, start, key.stop, step))
                    for i, value in izip(indexes, result):
                        self.cache[i] = value
                    self.stop = i + 1 if key.stop is None else key.stop
                    return result

                else:
                    result = [self.cache[i] for i in six.moves.xrange(start, index_upd, step)]

                    if key.stop is None:
                        result_upd = list(islice(self.iterator, index_upd - self.stop, None, step))
                    elif index_upd < key.stop:
                        result_upd = list(islice(self.iterator, index_upd - self.stop, key.stop - self.stop, step))
                    else:
                        result_upd = []
                    for i, value in izip(indexes, result_upd):
                        self.cache[i] = value
                    self.stop = key.stop
                    return result + result_upd

            else:
                raise KeyError("Key must be non-negative integer or slice, not {}"
                               .format(key))

        except StopIteration:
            self.iterator = self.f()
            self.stop = 0
            raise
Example #6
def test_adadelta():
    """
    Make sure that learning_rule.AdaDelta obtains the same parameter values as
    with a hand-crafted AdaDelta implementation, given a dummy model and
    learning rate scaler for each parameter.

    Reference:
    "AdaDelta: An Adaptive Learning Rate Method", Matthew D. Zeiler.
    """

    # We include a cost other than SumOfParams so that data is actually
    # queried from the training set, and the expected number of updates
    # are applied.
    cost = SumOfCosts([SumOfOneHalfParamsSquared(), (0., DummyCost())])
    model = DummyModel(shapes, lr_scalers=scales)
    dataset = ArangeDataset(1)
    decay = 0.95

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=AdaDelta(decay),
              batch_size=1)

    sgd.setup(model=model, dataset=dataset)

    state = {}
    for param in model.get_params():
        param_shape = param.get_value().shape
        state[param] = {}
        state[param]['g2'] = np.zeros(param_shape)
        state[param]['dx2'] = np.zeros(param_shape)

    def adadelta_manual(model, state):
        rval = []
        for scale, param in izip(scales, model.get_params()):
            pstate = state[param]
            param_val = param.get_value()
            # begin adadelta
            pstate['g2'] = decay * pstate['g2'] + (1 - decay) * param_val**2
            rms_g_t = np.sqrt(pstate['g2'] + scale * learning_rate)
            rms_dx_tm1 = np.sqrt(pstate['dx2'] + scale * learning_rate)
            dx_t = -rms_dx_tm1 / rms_g_t * param_val
            pstate['dx2'] = decay * pstate['dx2'] + (1 - decay) * dx_t**2
            rval += [param_val + dx_t]
        return rval

    manual = adadelta_manual(model, state)
    sgd.train(dataset=dataset)
    assert all(
        np.allclose(manual_param, sgd_param.get_value())
        for manual_param, sgd_param in izip(manual, model.get_params()))

    manual = adadelta_manual(model, state)
    sgd.train(dataset=dataset)
    assert all(
        np.allclose(manual_param, sgd_param.get_value())
        for manual_param, sgd_param in izip(manual, model.get_params()))
Example #7
 def get_sample(self, fit, factor=4, num=1):
     vals = numpy.array(fit.model.thawedpars)
     scales = self.scale.get_scales(fit)
     samples = [numpy.random.uniform(val - factor * abs(scale),
                                     val + factor * abs(scale),
                                     int(num)) for val, scale in izip(vals, scales)]
     return numpy.asarray(samples).T
Example #8
    def get_scales(self, fit, myscales=None):

        scales = []
        thawedpars = [par for par in fit.model.pars if not par.frozen]

        if myscales is None:

            oldestmethod = fit.estmethod

            covar = Covariance()
            covar.config['sigma'] = self.sigma
            fit.estmethod = Covariance()

            try:
                r = fit.est_errors()
            finally:
                fit.estmethod = oldestmethod

            for par, val, lo, hi in izip(thawedpars, r.parvals, r.parmins, r.parmaxes):
                scale = None
                if lo is not None and hi is not None:
                    scale = numpy.abs(lo)
                else:
                    warning("Covariance failed for '%s', trying Confidence..." %
                            par.fullname)

                    conf = Confidence()
                    conf.config['sigma'] = self.sigma
                    fit.estmethod = conf
                    try:
                        t = fit.est_errors(parlist=(par,))
                        if t.parmins[0] is not None and t.parmaxes[0] is not None:
                            scale = numpy.abs(t.parmins[0])

                        else:

                            if t.parmins[0] is None and t.parmaxes[0] is not None:
                                scale = numpy.abs(t.parmaxes[0])

                            else:

                                warning('1 sigma bounds for parameter ' +
                                        par.fullname +
                                        ' could not be found, using soft limit minimum')
                                if 0.0 == numpy.abs(par.min):
                                    scale = 1.0e-16
                                else:
                                    scale = numpy.abs(par.min)

                    finally:
                        fit.estmethod = oldestmethod
                scales.append(scale)

        else:
            if not numpy.iterable(myscales):
                raise TypeError(
                    "scales option must be iterable of length %d " % len(thawedpars))
            scales = list(map(abs, myscales))
        scales = numpy.asarray(scales).transpose()
        return scales
Example #9
def azip(*iterables, **kwargs):
    """Move `axis` (default -1) to the front of ndarrays in `iterables`."""
    from six.moves import map as imap, zip as izip
    return izip(*(
        imap(kwargs.get('func', unmask),
             np.rollaxis(i, kwargs.get('axis', -1), kwargs.get('start', 0)))
        if isinstance(i, np.ndarray) else i for i in iterables))
Example #10
 def unpack(self, buff):
     """
     Unpack the given binary buffer into the fields.  The result
     is a dictionary mapping field names to values.
     """
     args = struct.unpack_from(self._fmt, buff[:self._size])
     return dict(izip(self._names, args))
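The pattern above (struct.unpack_from plus zipping the resulting tuple with the field names) is a compact way to turn a fixed binary layout into a dict. A self-contained sketch on Python 3, with a made-up three-field header format:

import struct

fmt = "<HHI"                      # two unsigned shorts and an unsigned int, little-endian
names = ("version", "flags", "length")
buff = struct.pack(fmt, 1, 0, 512)

values = struct.unpack_from(fmt, buff[:struct.calcsize(fmt)])
fields = dict(zip(names, values))
print(fields)  # {'version': 1, 'flags': 0, 'length': 512}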
Example #11
    def __init__(self, res, hascontent, duration=0, has_payload = False):
        """
        - **snippets**: An optional dictionary of the form {field: snippet_size} for snippet formatting
        """

        self.total = res[0]
        self.duration = duration
        self.docs = []

        step = 1
        if hascontent:
            step = 3 if has_payload else 2
        else:
            # we can't have nocontent and payloads in the same response
            has_payload = False

        for i in xrange(1, len(res), step):
            id = to_string(res[i])
            payload = to_string(res[i+1]) if has_payload else None
            fields_offset = 2 if has_payload else 1

            fields = {}
            if hascontent:
                fields = dict(izip(map(to_string, res[i + fields_offset][::2]),
                                   map(to_string, res[i + fields_offset][1::2])))
            try:
                del fields['id']
            except KeyError:
                pass

            doc = Document(id, payload=payload, **fields)
            self.docs.append(doc)
Example #12
def _parse_set_weight_values(argvish):

    new_cmd_format, opts, args = validate_args(argvish)

    # We'll either parse the all-in-one-string format or the
    # --options format,
    # but not both. If both are specified, raise an error.
    try:
        devs = []
        if not new_cmd_format:
            if len(args) % 2 != 0:
                print(Commands.set_weight.__doc__.strip())
                exit(EXIT_ERROR)

            devs_and_weights = izip(islice(argvish, 0, len(argvish), 2),
                                    islice(argvish, 1, len(argvish), 2))
            for devstr, weightstr in devs_and_weights:
                devs.extend(builder.search_devs(
                    parse_search_value(devstr)) or [])
                weight = float(weightstr)
                _set_weight_values(devs, weight)
        else:
            if len(args) != 1:
                print(Commands.set_weight.__doc__.strip())
                exit(EXIT_ERROR)

            devs.extend(builder.search_devs(
                parse_search_values_from_opts(opts)) or [])
            weight = float(args[0])
            _set_weight_values(devs, weight)
    except ValueError as e:
        print(e)
        exit(EXIT_ERROR)
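The izip/islice pair above walks a flat argument list two items at a time, pairing each device search string with the weight that follows it. A minimal sketch of that grouping trick on Python 3, with made-up argument values:

from itertools import islice

argvish = ["d1", "1.0", "z2", "2.5"]

# Pair element 0 with 1, 2 with 3, ... by zipping two strided slices.
devs_and_weights = zip(islice(argvish, 0, len(argvish), 2),
                       islice(argvish, 1, len(argvish), 2))
for devstr, weightstr in devs_and_weights:
    print(devstr, float(weightstr))
# d1 1.0
# z2 2.5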
Example #13
    def refresh(self, items, consistent=False):
        """
        Overwrite model data with freshest from database

        Parameters
        ----------
        items : list or :class:`~flywheel.models.Model`
            Models to sync
        consistent : bool, optional
            If True, force a consistent read from the db. (default False)

        """
        if isinstance(items, Model):
            items = [items]
        if not items:
            return

        tables = defaultdict(list)
        for item in items:
            tables[item.meta_.ddb_tablename(self.namespace)].append(item)

        for tablename, items in six.iteritems(tables):
            keys = [item.pk_dict_ for item in items]
            results = self.dynamo.batch_get(tablename, keys,
                                            consistent=consistent)
            for item, data in izip(items, results):
                with item.loading_(self):
                    for key, val in data.items():
                        item.set_ddb_val_(key, val)
Example #14
    def eval_model_to_fit(self, modelfuncs):
        total_model = []

        for func, data in izip(modelfuncs, self.datasets):
            total_model.append(data.eval_model_to_fit(func))

        return numpy.concatenate(total_model)
Example #15
    def compactify(self):
        """
        Assign new word ids to all words.

        This is done to make the ids more compact, e.g. after some tokens have
        been removed via :func:`filter_tokens` and there are gaps in the id series.
        Calling this method will remove the gaps.
        """
        logger.debug("rebuilding dictionary, shrinking gaps")

        # build mapping from old id -> new id
        idmap = dict(
            izip(sorted(itervalues(self.token2id)),
                 xrange(len(self.token2id))))

        # reassign mappings to new ids
        self.token2id = {
            token: idmap[tokenid]
            for token, tokenid in iteritems(self.token2id)
        }
        self.id2token = {}
        self.dfs = {
            idmap[tokenid]: freq
            for tokenid, freq in iteritems(self.dfs)
        }
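compactify builds an old-id -> new-id map by zipping the sorted existing ids with a fresh 0..n-1 range, then rewrites every id-bearing mapping through it. A small sketch of the remapping on plain Python 3 dicts (the token2id/dfs values are invented):

token2id = {"cat": 0, "dog": 4, "fish": 7}   # gaps left by removed tokens
dfs = {0: 3, 4: 1, 7: 2}                     # document frequencies keyed by old id

# Old id -> new id: sorted old ids paired with a compact 0..n-1 range.
idmap = dict(zip(sorted(token2id.values()), range(len(token2id))))

token2id = {token: idmap[old_id] for token, old_id in token2id.items()}
dfs = {idmap[old_id]: freq for old_id, freq in dfs.items()}

print(token2id)  # {'cat': 0, 'dog': 1, 'fish': 2}
print(dfs)       # {0: 3, 1: 1, 2: 2}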
Example #16
def _parse_set_weight_values(argvish):

    new_cmd_format, opts, args = validate_args(argvish)

    # We'll either parse the all-in-one-string format or the
    # --options format,
    # but not both. If both are specified, raise an error.
    try:
        devs = []
        if not new_cmd_format:
            if len(args) % 2 != 0:
                print(Commands.set_weight.__doc__.strip())
                exit(EXIT_ERROR)

            devs_and_weights = izip(islice(argvish, 0, len(argvish), 2),
                                    islice(argvish, 1, len(argvish), 2))
            for devstr, weightstr in devs_and_weights:
                devs.extend(
                    builder.search_devs(parse_search_value(devstr)) or [])
                weight = float(weightstr)
                _set_weight_values(devs, weight)
        else:
            if len(args) != 1:
                print(Commands.set_weight.__doc__.strip())
                exit(EXIT_ERROR)

            devs.extend(
                builder.search_devs(parse_search_values_from_opts(opts)) or [])
            weight = float(args[0])
            _set_weight_values(devs, weight)
    except ValueError as e:
        print(e)
        exit(EXIT_ERROR)
Example #17
def extract_all_features():
    def get_group(block_id):
        group = groupby.get_group(block_id)
        return group

    pool = Pool(processes=56,
                initializer=init_process,
                initargs=(netatmo_groups, netatmo_anns))
    res = list(pool.imap(sleep_30_sec, xrange(56)))
    group_generator = imap(get_group, groups.keys()[:])
    feature_iterator = pool.imap(extract_features, group_generator)

    X, y, block_ids = [], [], []
    save_id = 0
    for block_id, features in izip(groups.keys()[:], tqdm(feature_iterator)):
        group = groupby.get_group(block_id)
        X.append(features)
        y.append(group.iloc[0]['rain'])
        block_ids.append(block_id + (group.iloc[0]["hours_since"], ))

    X = pd.DataFrame(X)
    y = np.array(y)
    block_ids = pd.DataFrame(
        block_ids,
        columns=["city_code", "sq_x", "sq_y", "hour_hash", "hours_since"])
    return X, y, block_ids
Example #18
 def __init__(self, fname, id2word=None, metadata=True):
     self.metadata = metadata
     MmCorpus.__init__(self, fname=fname)
     self.doc_metadata = {}
     self.metadata = metadata
     if not id2word:
         # build a list of all word types in the corpus (distinct words)
         logger.info("extracting vocabulary from the corpus")
         all_terms = set()
         self.use_wordids = False  # return documents as (word, wordCount) 2-tuples
         for doc in self:
             all_terms.update(word for word, wordCnt in doc)
         all_terms = sorted(
             all_terms
         )  # sort the list of all words; rank in that list = word's integer id
         # build a mapping of word id(int) -> word (string)
         self.id2word = dict(izip(xrange(len(all_terms)), all_terms))
     else:
         logger.info("using provided word mapping (%i ids)", len(id2word))
         self.id2word = id2word
     if metadata:
         self.doc_with_meta(fname)
     self.doc_id_to_postgres_id = {}
     self.postgres_id_to_doc_id = {}
     self.__build_relation_dictionaries()
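The id2word construction above zips a 0-based range with the sorted vocabulary; dict(enumerate(...)) builds the identical mapping and reads a little more directly. A tiny sketch with an invented vocabulary:

all_terms = sorted({"dog", "cat", "fish"})

id2word = dict(zip(range(len(all_terms)), all_terms))
print(id2word)                      # {0: 'cat', 1: 'dog', 2: 'fish'}
print(dict(enumerate(all_terms)))   # identical mapping, slightly shorter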
Example #19
 def unpack(self, buff):
     """
     Unpack the given binary buffer into the fields.  The result
     is a dictionary mapping field names to values.
     """
     args = struct.unpack_from(self._fmt, buff[:self._size])
     return dict(izip(self._names, args))
Example #20
    def return_docs(self, return_doc_cb):
        """Return the changed documents and their last change generation
        repeatedly invoking the callback return_doc_cb.

        The final step of a sync exchange.

        :param: return_doc_cb(doc, gen, trans_id): is a callback
                used to return the documents with their last change generation
                to the target replica.
        :return: None
        """
        changes_to_return = self.changes_to_return
        # return docs, including conflicts
        changed_doc_ids = [doc_id for doc_id, _, _ in changes_to_return]
        self._trace('before get_docs')
        docs = self._db.get_docs(
            changed_doc_ids, check_for_conflicts=False, include_deleted=True)

        docs_by_gen = izip(
            docs, (gen for _, gen, _ in changes_to_return),
            (trans_id for _, _, trans_id in changes_to_return))
        _outgoing_trace = []  # for tests
        for doc, gen, trans_id in docs_by_gen:
            return_doc_cb(doc, gen, trans_id)
            _outgoing_trace.append((doc.doc_id, doc.rev))
        # for tests
        self._db._last_exchange_log['return'] = {
            'docs': _outgoing_trace,
            'last_gen': self.new_gen}
Example #21
    def get_gradients(self, model, data, **kwargs):
        """
        Provides the gradients of the cost function with respect to the model
        parameters.

        These are not necessarily those obtained by theano.tensor.grad;
        you may wish to use approximate or even intentionally incorrect
        gradients in some cases.

        Parameters
        ----------
        model : a pylearn2 Model instance
        data : a batch in cost.get_data_specs() form
        kwargs : dict
            Optional extra arguments, not used by the base class.

        Returns
        -------
        gradients : OrderedDict
            a dictionary mapping from the model's parameters
            to their gradients
            The default implementation is to compute the gradients
            using T.grad applied to the value returned by expr.
            However, subclasses may return other values for the gradient.
            For example, an intractable cost may return a sampling-based
            approximation to its gradient.
        updates : OrderedDict
            a dictionary mapping shared variables to updates that must
            be applied to them each time these gradients are computed.
            This is to facilitate computation of sampling-based approximate
            gradients.
            The parameters should never appear in the updates dictionary.
            This would imply that computing their gradient changes
            their value, thus making the gradient value outdated.
        """

        try:
            cost = self.expr(model=model, data=data, **kwargs)
        except TypeError:
            # If anybody knows how to add type(self) to the exception message
            # but still preserve the stack trace, please do so
            # The current code does neither
            message = "Error while calling " + str(type(self)) + ".expr"
            reraise_as(TypeError(message))

        if cost is None:
            raise NotImplementedError(
                str(type(self)) + " represents an intractable cost and "
                "does not provide a gradient "
                "approximation scheme.")

        params = list(model.get_params())

        grads = T.grad(cost, params, disconnected_inputs='ignore')

        gradients = OrderedDict(izip(params, grads))

        updates = OrderedDict()

        return gradients, updates
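T.grad returns one gradient expression per parameter, in the same order as params, so the two lists zip straight into the OrderedDict the interface promises, and updates stays empty because the gradients are exact. A trivial stand-in sketch of that pairing (strings here take the place of Theano variables and gradient expressions):

from collections import OrderedDict

params = ["W", "b"]                 # stand-ins for the model's shared variables
grads = ["dcost/dW", "dcost/db"]    # stand-ins for the list returned by T.grad(cost, params)

gradients = OrderedDict(zip(params, grads))   # same pairing as OrderedDict(izip(params, grads))
updates = OrderedDict()                       # exact gradients need no extra shared-variable updates

print(list(gradients.items()))  # [('W', 'dcost/dW'), ('b', 'dcost/db')]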
Example #22
def set_arrays(filename, args, fields=None, ascii=True, clobber=False):

    if os.path.isfile(filename) and not clobber:
        raise IOErr("filefound", filename)

    if not numpy.iterable(args) or len(args) == 0:
        raise IOErr('noarrayswrite')

    if not numpy.iterable(args[0]):
        raise IOErr('noarrayswrite')

    size = len(args[0])
    for arg in args:
        if not numpy.iterable(arg):
            raise IOErr('noarrayswrite')
        elif len(arg) != size:
            raise IOErr('arraysnoteq')

    if ascii and '[' not in filename and ']' not in filename:
        filename += "[opt kernel=text/simple]"

    tbl = pycrates.TABLECrate()

    if fields is None:
        fields = ['col%i' % (ii + 1) for ii in range(len(args))]

    if len(args) != len(fields):
        raise IOErr('toomanycols', str(len(fields)), str(len(args)))

    for val, name in izip(args, fields):
        _set_column(tbl, name, val)

    pycrates.write_file(tbl, filename, clobber=True)
    close_crate_dataset(tbl.get_dataset())
Example #23
def lazy_load_trees(skeleton_ids, node_properties):
    """ Return a lazy collection of pairs of (long, DiGraph)
    representing (skeleton_id, tree).
    The node_properties is a list of strings, each being a name of a column
    in the django model of the Treenode table that is not the treenode id, parent_id
    or skeleton_id. """

    values_list = ('id', 'parent_id', 'skeleton_id')
    props = tuple(set(node_properties) - set(values_list))
    values_list += props

    ts = Treenode.objects.filter(skeleton__in=skeleton_ids) \
            .order_by('skeleton') \
            .values_list(*values_list)
    skid = None
    tree = None
    for t in ts:
        if t[2] != skid:
            if tree:
                yield (skid, tree)
            # Prepare for the next one
            skid = t[2]
            tree = DiGraph()

        fields = {k: v for k,v in izip(props, islice(t, 3, 3 + len(props)))}
        tree.add_node(t[0], fields)

        if t[1]:
            # From child to parent
            tree.add_edge(t[0], t[1])

    if tree:
        yield (skid, tree)
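Each row t arrives as a flat tuple: the three fixed columns (id, parent_id, skeleton_id) followed by whatever extra properties were requested, so zipping the property names against islice(t, 3, ...) rebuilds a per-node attribute dict. A small stand-in sketch with an invented row:

from itertools import islice

props = ("confidence", "radius")
# (id, parent_id, skeleton_id, *props) as returned by values_list(*values_list)
t = (101, 100, 7, 5, -1.0)

fields = {k: v for k, v in zip(props, islice(t, 3, 3 + len(props)))}
print(fields)  # {'confidence': 5, 'radius': -1.0}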
Example #24
 def _extract_content(self,
                      extraction_page,
                      start_index,
                      end_index,
                      ignored_regions=None,
                      **kwargs):
     """extract content between annotation indexes"""
     if ignored_regions and (
             _int_cmp(start_index, 'le', ignored_regions[0].start_index)
             and _int_cmp(end_index, 'ge', ignored_regions[-1].end_index)):
         starts = [start_index] + [
             i.end_index for i in ignored_regions if i.end_index is not None
         ]
         ends = [i.start_index for i in ignored_regions]
         if starts[-1] is not None:
             ends.append(end_index)
         included_regions = izip(starts, ends)
         if ends[0] is None:
             included_regions.next()
         regions = starmap(extraction_page.htmlpage_region_inside,
                           included_regions)
         region = FragmentedHtmlPageRegion(extraction_page.htmlpage,
                                           list(regions))
     else:
         region = extraction_page.htmlpage_region_inside(
             start_index, end_index)
     validated = self.content_validate(region)
     return [(self.annotation.surrounds_attribute,
              validated)] if validated else []
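The starts/ends bookkeeping above carves the annotated span into the gaps between ignored regions: the starts are the annotation start plus each ignored region's end, the ends are each ignored region's start plus the annotation end, and zipping them yields the included regions. A simplified sketch with plain integer indices (the None-sentinel handling from the original is omitted):

start_index, end_index = 0, 100
ignored = [(10, 20), (40, 50)]        # (start_index, end_index) of regions to skip

starts = [start_index] + [end for _, end in ignored]
ends = [start for start, _ in ignored] + [end_index]

included_regions = list(zip(starts, ends))
print(included_regions)   # [(0, 10), (20, 40), (50, 100)]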
Example #25
    def eval_model_to_fit(self, modelfuncs):
        total_model = []

        for func, data in izip(modelfuncs, self.datasets):
            total_model.append(data.eval_model_to_fit(func))

        return numpy.concatenate(total_model)
Example #26
def set_arrays(filename, args, fields=None, ascii=True, clobber=False):

    if os.path.isfile(filename) and not clobber:
        raise IOErr("filefound", filename)

    if not numpy.iterable(args) or len(args) == 0:
        raise IOErr('noarrayswrite')

    if not numpy.iterable(args[0]):
        raise IOErr('noarrayswrite')

    size = len(args[0])
    for arg in args:
        if not numpy.iterable(arg):
            raise IOErr('noarrayswrite')
        elif len(arg) != size:
            raise IOErr('arraysnoteq')

    if ascii and '[' not in filename and ']' not in filename:
        filename += "[opt kernel=text/simple]"

    tbl = pycrates.TABLECrate()

    if fields is None:
        fields = ['col%i' % (ii + 1) for ii in range(len(args))]

    if len(args) != len(fields):
        raise IOErr('toomanycols', str(len(fields)), str(len(args)))

    for val, name in izip(args, fields):
        _set_column(tbl, name, val)

    pycrates.write_file(tbl, filename, clobber=True)
    close_crate_dataset(tbl.get_dataset())
Example #27
    def multiupdate_metadata(self, keys, metadatas):
        """ Update the metadata for a collection of keys.

        Where supported by an implementation, this should perform the whole
        collection of sets as a single transaction.

        Like zip(), if keys and metadatas have different lengths, then any excess
        values in the longer list should be silently ignored.

        Parameters
        ----------
        keys : iterable of strings
            The keys for the resources in the key-value store.  Each key is a
            unique identifier for a resource within the key-value store.
        metadatas : iterable of dicts
            An iterator that provides the metadata dictionaries for the
            corresponding keys.

        Events
        ------
        StoreSetEvent :
            On successful completion of a transaction, a StoreSetEvent should be
            emitted with the key & metadata for each key that was set.

        """
        with self.transaction('Updating metadata for '+', '.join('"%s"' % key for key in keys)):
            for key, metadata in izip(keys, metadatas):
                self.update_metadata(key, metadata)
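The "like zip()" wording in the docstring is literal: pairing the two iterables with zip means the loop stops at the shorter one and excess entries are dropped silently. A quick sketch of that truncation, with invented keys and metadata:

keys = ["a", "b", "c"]
metadatas = [{"size": 1}, {"size": 2}]   # one entry short

# zip() stops at the shorter iterable, so key "c" is silently skipped.
for key, metadata in zip(keys, metadatas):
    print(key, metadata)
# a {'size': 1}
# b {'size': 2}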
Example #28
 def par_at_boundary( low, val, high, tol ):
     for par_min, par_val, par_max in izip( low, val, high ):
         if sao_fcmp( par_val, par_min, tol ) == 0:
             return True
         if sao_fcmp( par_val, par_max, tol ) == 0:
             return True
     return False
Example #29
def azip(*iterables, **kwargs):
    """Move `axis` (default -1) to the front of ndarrays in `iterables`."""
    from six.moves import map as imap, zip as izip
    return izip(
        *(imap(kwargs.get('func', unmask),
               np.rollaxis(i, kwargs.get('axis', -1), kwargs.get('start', 0))
               ) if isinstance(i, np.ndarray) else i for i in iterables))
Example #30
def build_stacked_ae(nvis,
                     nhids,
                     act_enc,
                     act_dec,
                     tied_weights=False,
                     irange=1e-3,
                     rng=None,
                     corruptor=None,
                     contracting=False):
    """
    .. todo::

        WRITEME properly

    Allocate a stack of autoencoders.
    """
    rng = make_np_rng(rng, which_method='randn')
    layers = []
    final = {}
    # "Broadcast" arguments if they are singular, or accept sequences if
    # they are the same length as nhids
    for c in [
            'corruptor', 'contracting', 'act_enc', 'act_dec', 'tied_weights',
            'irange'
    ]:
        if type(locals()[c]) is not str and hasattr(locals()[c], '__len__'):
            assert len(nhids) == len(locals()[c])
            final[c] = locals()[c]
        else:
            final[c] = [locals()[c]] * len(nhids)
    # The number of visible units in each layer is the initial input
    # size and the first k-1 hidden unit sizes.
    nviss = [nvis] + nhids[:-1]
    seq = izip(
        nhids,
        nviss,
        final['act_enc'],
        final['act_dec'],
        final['corruptor'],
        final['contracting'],
        final['tied_weights'],
        final['irange'],
    )
    # Create each layer.
    for (nhid, nvis, act_enc, act_dec, corr, cae, tied, ir) in seq:
        args = (nvis, nhid, act_enc, act_dec, tied, ir, rng)
        if cae and corr is not None:
            raise ValueError("Can't specify denoising and contracting "
                             "objectives simultaneously")
        elif cae:
            autoenc = ContractiveAutoencoder(*args)
        elif corr is not None:
            autoenc = DenoisingAutoencoder(corr, *args)
        else:
            autoenc = Autoencoder(*args)
        layers.append(autoenc)

    # Create the stack
    return StackedBlocks(layers)
Example #31
    def calc(self, p, x, xhi=None, *args, **kwargs):
        pha = self.pha

        # TODO: this should probably include AREASCAL

        user_grid = False
        try:

            if self._check_for_user_grid(x, xhi):
                user_grid = True
                self._startup_user_grid(x, xhi)

            # Slow
            if self.table is None:
                # again, fit() never comes in here b/c it calls startup()
                src = self.source
                vals = []
                for model, args in izip(self.models, self.grid):
                    elo, ehi = lo, hi = args
                    if pha.units == 'wavelength':
                        lo = DataPHA._hc / ehi
                        hi = DataPHA._hc / elo
                    vals.append(model(src(lo, hi)))
                self.orders = vals
            # Fast
            else:
                xlo, xhi = self.elo, self.ehi
                if pha.units == 'wavelength':
                    xlo, xhi = self.lo, self.hi

                src = self.source(xlo, xhi)  # hi-res grid of all ARF grids

                # Fold summed intervals through the associated response.
                self.orders = \
                    [model(sum_intervals(src, interval[0], interval[1]))
                     for model, interval in izip(self.models, self.table)]

            vals = sum(self.orders)
            if self.mask is not None:
                vals = vals[self.mask]

        finally:
            if user_grid:
                self._teardown_user_grid()

        return vals
Example #32
 def calc(self, p, arglist):
     vals = []
     for model, args in izip(self.models, arglist):
         # FIXME: we're not using p here (and therefore assuming that the
         # parameter values have already been updated to match the contents
         # of p)
         vals.append(model(*args))
     return sum(vals)
Example #33
def populate_connectors(chunkIDs, chunks, cs, connectors):
    # Build up edges via the connectors
    for c in cs:
        # c is (treenode_id, connector_id, relation_id, confidence)
        for chunkID, chunk in izip(chunkIDs, chunks):
            if c[0] in chunk:
                connectors[c[1]][c[2]].append((chunkID, c[3]))
                break
Example #34
 def calc(self, p, arglist):
     vals = []
     for model, args in izip(self.models, arglist):
         # FIXME: we're not using p here (and therefore assuming that the
         # parameter values have already been updated to match the contents
         # of p)
         vals.append(model(*args))
     return sum(vals)
Example #35
    def __str__(self):
        """
        Return a listing of the attributes listed in self._fields and,
        if present, self._extra_fields.
        """

        fields = self._fields + getattr(self, '_extra_fields', ())
        fdict = dict(izip(fields, [getattr(self, f) for f in fields]))
        return print_fields(fields, fdict)
Example #36
 def BulkUpdateRecord(self, table_id, row_ids, columns):
   table_data = self.all_tables[table_id]
   rowid_map = {r:i for i, r in enumerate(table_data.row_ids)}
   table_indices = [rowid_map[r] for r in row_ids]
   for col, values in six.iteritems(columns):
     if col in table_data.columns:
       col_values = table_data.columns[col]
       for i, v in izip(table_indices, values):
         col_values[i] = v
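For each updated column, the row ids are first translated into positions within the stored column, and zip then scatters the new values into exactly those positions. A minimal in-memory sketch (the table layout and values are invented):

row_ids_in_table = [11, 12, 13, 14]
columns = {"name": ["a", "b", "c", "d"]}

update_row_ids = [12, 14]
update_values = {"name": ["B", "D"]}

rowid_map = {r: i for i, r in enumerate(row_ids_in_table)}
table_indices = [rowid_map[r] for r in update_row_ids]
for col, values in update_values.items():
    col_values = columns[col]
    for i, v in zip(table_indices, values):
        col_values[i] = v

print(columns)  # {'name': ['a', 'B', 'c', 'D']}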
Example #37
    def __str__(self):
        """
        Return a listing of the attributes listed in self._fields and,
        if present, self._extra_fields.
        """

        fields = self._fields + getattr(self, '_extra_fields', ())
        fdict = dict(izip(fields, [getattr(self, f) for f in fields]))
        return print_fields(fields, fdict)
Example #38
    def __new__(cls, *seq):
        if len(seq) != length:
            raise TypeError('Length mismatch')

        for i, j in izip(seq, cls._fields):
            if not isinstance(i, Number):
                raise TypeError(j + ' is not a Number')

        return baseClass.__new__(cls, *seq)
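The constructor zips the incoming positional values against the class's _fields so each component can be type-checked and the error can name the offending field. A self-contained sketch of such a type, built here on a namedtuple (the Point name and its fields are invented):

from collections import namedtuple
from numbers import Number

_PointBase = namedtuple("Point", ("x", "y"))

class Point(_PointBase):
    def __new__(cls, *seq):
        if len(seq) != len(cls._fields):
            raise TypeError("Length mismatch")
        # Pair each value with its field name so errors can say which field failed.
        for value, name in zip(seq, cls._fields):
            if not isinstance(value, Number):
                raise TypeError(name + " is not a Number")
        return _PointBase.__new__(cls, *seq)

print(Point(1, 2.5))    # Point(x=1, y=2.5)
# Point(1, "oops") would raise: TypeError: y is not a Number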
Example #39
def _yield_text_from_framed_data(framed_data, parse=lambda x: x):
    parts = [parse(x) for x in framed_data.split(BOUNDARY)]
    for text_length, text in izip(parts[1::2], parts[2::2]):
        if text_length != str(len(text)):
            warning = 'invalid declared length=%s for packet_text=%s' % (
                text_length, text)
            _log.warn('[packet error] %s', warning)
            continue
        yield text
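Splitting on BOUNDARY produces an alternating sequence of declared lengths and payloads, so zipping parts[1::2] with parts[2::2] walks the frames in (length, text) pairs and lets malformed frames be skipped. A small sketch with an invented boundary and frame stream:

BOUNDARY = "|"

def yield_text_from_framed_data(framed_data):
    parts = framed_data.split(BOUNDARY)
    # parts[0] is whatever precedes the first boundary; then lengths and texts alternate.
    for text_length, text in zip(parts[1::2], parts[2::2]):
        if text_length != str(len(text)):
            continue  # declared length does not match, drop the frame
        yield text

framed = "|5|hello|3|abc|9|short"
print(list(yield_text_from_framed_data(framed)))  # ['hello', 'abc']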
Example #40
    def calc(self, p, x, xhi=None, *args, **kwargs):
        pha = self.pha

        user_grid = False
        try:

            if self._check_for_user_grid(x, xhi):
                user_grid = True
                self._startup_user_grid(x, xhi)

            # Slow
            if self.table is None:
                # again, fit() never comes in here b/c it calls startup()
                src = self.source
                vals = []
                for model, args in izip(self.models, self.grid):
                    elo, ehi = lo, hi = args
                    if pha.units == 'wavelength':
                        lo = DataPHA._hc / ehi
                        hi = DataPHA._hc / elo
                    vals.append(model(src(lo, hi)))
                self.orders = vals
            # Fast
            else:
                xlo, xhi = self.elo, self.ehi
                if pha.units == 'wavelength':
                    xlo, xhi = self.lo, self.hi

                src = self.source(xlo, xhi)  # hi-res grid of all ARF grids

                # Fold summed intervals through the associated response.
                self.orders = \
                    [model(sum_intervals(src, interval[0], interval[1]))
                     for model, interval in izip(self.models, self.table)]

            vals = sum(self.orders)
            if self.mask is not None:
                vals = vals[self.mask]

        finally:
            if user_grid:
                self._teardown_user_grid()

        return vals
Example #41
    def loadBulk(self, oids):
        """
        Storage API to return multiple objects.
        We load a unique set of them, just in case.

        :param list oids: Iterable oids to load at once
        :return: Loaded oid objects
        :rtype: list
        """
        # First, try to get whatever possible from cache
        self._load_lock.acquire()
        try:
            self._lock.acquire()  # for atomic processing of invalidations
            try:
                result = []
                for oid in oids:
                    out = self._cache.load(oid)
                    if not out:
                        self._load_oids[oid] = 1
                    else:
                        result.append(out)
            finally:
                self._lock.release()
            if len(self._load_oids) == 0:
                return result
            # If we ever get here, we need to load some more stuff
            # self._load_oids dictionary is protected by self._load_lock

            if self._server is None:
                raise ClientDisconnected()

            load_oids = list(self._load_oids.keys())

            # [(data, tid), (data, tid), ...]
            bulk_data = self._server.rpc.call("loadBulk", load_oids)

            data_size = 0
            for oid, (data, tid) in izip(load_oids, bulk_data):
                data_size += len(data)
                self._lock.acquire()  # for atomic processing of invalidations
                try:
                    if self._load_oids[
                            oid]:  # Update cache only when there was no invalidation
                        self._cache.store(oid, tid, None, data)
                    del self._load_oids[oid]
                    result.append(
                        (data, tid)
                    )  # XXX shouldn't we provide a recent value from cache then?
                finally:
                    self._lock.release()
            logging.debug("Bulk-loaded {0} objects of size {1}".format(
                len(load_oids), data_size))
        finally:
            self._load_lock.release()

        return result
Example #42
def _histogram(xlo, xhi, y, yerr=None, title=None, xlabel=None, ylabel=None,
               overplot=False, clearwindow=True,
               yerrorbars=False,
               errstyle=None,
               errcolor=None,
               errthickness=None,
               fillcolor=None,
               fillopacity=None,
               fillstyle=None,
               xlog=False,
               ylog=False,
               linestyle=chips.chips_solid,
               linecolor=None,
               linethickness=None,
               symbolangle=None,
               symbolcolor=None,
               symbolfill=None,
               symbolsize=None,
               symbolstyle=chips.chips_none):

    if (not overplot) and clearwindow:
        _clear_window()

    if yerrorbars and yerr is not None:
        chips.add_histogram(xlo, xhi, y, yerr)
    else:
        chips.add_histogram(xlo, xhi, y)

    for var in ('errstyle', 'errcolor', 'errthickness',
                'fillcolor', 'fillopacity', 'fillstyle',
                'linestyle', 'linecolor', 'linethickness',
                'symbolangle', 'symbolcolor', 'symbolfill', 'symbolsize',
                'symbolstyle'):
        val = locals()[var]
        if val is not None:
            if 'color' in var:
                val = _check_hex_color(val)
            getattr(chips.advanced, 'set_histogram_' + var)(val)

    if not overplot:
        for log_axis, axis_id in izip((xlog, ylog),
                                      (chips.X_AXIS, chips.Y_AXIS)):
            if log_axis:
                chips.log_scale(axis_id)
            else:
                chips.linear_scale(axis_id)

        if title:
            ttl = title.replace('_', '\\_')
            chips.set_plot_title(ttl)
        if xlabel:
            xlbl = xlabel.replace('_', '\\_')
            chips.set_plot_xlabel(xlbl)
        if ylabel:
            ylbl = ylabel.replace('_', '\\_')
            chips.set_plot_ylabel(ylbl)
Example #43
def bigrams(seq):
    """
    Yields bigrams from the given sequence.

    >>> list(bigrams(range(4)))
    [(0, 1), (1, 2), (2, 3)]
    """
    first, second = tee(seq, 2)
    second = islice(second, 1, None)
    return izip(first, second)
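tee duplicates the iterator and islice advances the second copy by one, so the zip yields overlapping (previous, current) pairs; unlike the naive zip(seq, seq[1:]) this also works for one-shot iterators, not just sequences. A quick usage check on Python 3, where the built-in zip is already lazy:

from itertools import islice, tee

def bigrams(seq):
    first, second = tee(seq, 2)
    second = islice(second, 1, None)
    return zip(first, second)

print(list(bigrams(range(4))))        # [(0, 1), (1, 2), (2, 3)]
print(list(bigrams(iter("abcd"))))    # works on a bare iterator: [('a', 'b'), ('b', 'c'), ('c', 'd')]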
Example #44
    def compactify(self):
        """Assign new word ids to all words, shrinking gaps."""

        # build mapping from old id -> new id
        idmap = dict(izip(sorted(self.token2id.values()), xrange(len(self.token2id))))

        # reassign mappings to new ids
        self.token2id = {token: idmap[tokenid] for token, tokenid in self.token2id.items()}
        self.id2token = {}
        self.word_freq = {idmap[tokenid]: freq for tokenid, freq in self.word_freq.items()}
Example #45
def bigrams(seq):
    """
    Yields bigrams from the given sequence.

    >>> list(bigrams(range(4)))
    [(0, 1), (1, 2), (2, 3)]
    """
    first, second = tee(seq, 2)
    second = islice(second, 1, None)
    return izip(first, second)
Example #46
def _contour(x0, x1, y, levels=None, title=None, xlabel=None, ylabel=None,
             overcontour=False, clearwindow=True,
             xlog=False,
             ylog=False,
             style=None,
             color=None,
             thickness=None,
             axis_pad=0.05):

    if (not overcontour) and clearwindow:
        _clear_window()

    # Catch NaNs before sending to ChIPS
    bad = numpy.where(numpy.isnan(y))[0]
    bad_vals = numpy.array(y[bad])
    y[bad] = 0.0

    if levels is None:
        chips.add_contour(x0, x1, y)
    else:
        levels = numpy.asarray(levels, numpy.float_)
        chips.add_contour(x0, x1, y, levels)

    y[bad] = bad_vals

    for var in ('style', 'color', 'thickness'):
        val = locals()[var]
        if val is not None:
            if 'color' in var:
                val = _check_hex_color(val)
            getattr(chips.advanced, 'set_contour_' + var)(val)

    chips.advanced.set_axis_pad(axis_pad)

    chips.set_data_aspect_ratio()
    chips.limits(chips.X_AXIS, x0.min(), x0.max())
    chips.limits(chips.Y_AXIS, x1.min(), x1.max())

    if not overcontour:
        for log_axis, axis_id in izip((xlog, ylog),
                                      (chips.X_AXIS, chips.Y_AXIS)):
            if log_axis:
                chips.log_scale(axis_id)
            else:
                chips.linear_scale(axis_id)

        if title:
            ttl = title.replace('_', '\\_')
            chips.set_plot_title(ttl)
        if xlabel:
            xlbl = xlabel.replace('_', '\\_')
            chips.set_plot_xlabel(xlbl)
        if ylabel:
            ylbl = ylabel.replace('_', '\\_')
            chips.set_plot_ylabel(ylbl)
Example #47
    def __iter__(self):
        """

        Yields
        ------
        list of (int, float)
            Document in BoW format.

        """
        for indprev, indnow in izip(self.sparse.indptr, self.sparse.indptr[1:]):
            yield list(zip(self.sparse.indices[indprev:indnow], self.sparse.data[indprev:indnow]))
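In CSR layout, indptr[i]:indptr[i+1] brackets the stored entries of row i, so zipping indptr with its shifted self walks the rows and each slice of indices/data becomes one bag-of-words document. A small sketch on a scipy CSR matrix:

import numpy as np
from scipy.sparse import csr_matrix

sparse = csr_matrix(np.array([[0.0, 2.0, 0.0],
                              [1.0, 0.0, 3.0]]))

for indprev, indnow in zip(sparse.indptr, sparse.indptr[1:]):
    doc = [(int(i), float(v))
           for i, v in zip(sparse.indices[indprev:indnow], sparse.data[indprev:indnow])]
    print(doc)
# [(1, 2.0)]
# [(0, 1.0), (2, 3.0)]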
Example #48
    def compactify(self):
        """Assign new word ids to all words, shrinking gaps."""
        logger.debug("rebuilding dictionary, shrinking gaps")

        # build mapping from old id -> new id
        idmap = dict(izip(sorted(itervalues(self.token2id)), xrange(len(self.token2id))))

        # reassign mappings to new ids
        self.token2id = {token: idmap[tokenid] for token, tokenid in iteritems(self.token2id)}
        self.id2token = {}
        self.dfs = {idmap[tokenid]: freq for tokenid, freq in iteritems(self.dfs)}
Example #49
    def __getitem__(self, query):
        """Get similarities of document `query` to all documents in the corpus.

        **or**

        If `query` is a corpus (iterable of documents), return a matrix of similarities
        of all query documents vs. all corpus documents. This batch query is more
        efficient than computing the similarities one document after another.
        """
        self.close_shard()  # no-op if no documents added to index since last query

        # reset num_best and normalize parameters, in case they were changed dynamically
        for shard in self.shards:
            shard.num_best = self.num_best
            shard.normalize = self.norm

        # there are 4 distinct code paths, depending on whether input `query` is
        # a corpus (or numpy/scipy matrix) or a single document, and whether the
        # similarity result should be a full array or only num_best most similar
        # documents.
        pool, shard_results = self.query_shards(query)
        if self.num_best is None:
            # user asked for all documents => just stack the sub-results into a single matrix
            # (works for both corpus / single doc query)
            result = numpy.hstack(shard_results)
        else:
            # the following uses a lot of lazy evaluation and (optionally) parallel
            # processing, to improve query latency and minimize memory footprint.
            offsets = numpy.cumsum([0] + [len(shard) for shard in self.shards])
            convert = lambda doc, shard_no: [(doc_index + offsets[shard_no], sim)
                                             for doc_index, sim in doc]
            is_corpus, query = utils.is_corpus(query)
            is_corpus = is_corpus or hasattr(query, 'ndim') and query.ndim > 1 and query.shape[0] > 1
            if not is_corpus:
                # user asked for num_best most similar and query is a single doc
                results = (convert(result, shard_no) for shard_no, result in enumerate(shard_results))
                result = heapq.nlargest(self.num_best, itertools.chain(*results), key=lambda item: item[1])
            else:
                # the trickiest combination: returning num_best results when query was a corpus
                results = []
                for shard_no, result in enumerate(shard_results):
                    shard_result = [convert(doc, shard_no) for doc in result]
                    results.append(shard_result)
                result = []
                for parts in izip(*results):
                    merged = heapq.nlargest(self.num_best, itertools.chain(*parts), key=lambda item: item[1])
                    result.append(merged)
        if pool:
            # gc doesn't seem to collect the Pools, eventually leading to
            # "IOError 24: too many open files". so let's terminate it manually.
            pool.terminate()

        return result
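In the corpus-query branch, each shard returns its own per-document hit lists; izip(*results) regroups them so the i-th lists from every shard line up, and heapq.nlargest then merges them into one global top-num_best list per query document. A small stand-in sketch with two shards, two query documents, and invented (doc_id, similarity) hits:

import heapq
import itertools

num_best = 2
# results[shard][query_doc] -> list of (global_doc_index, similarity)
results = [
    [[(0, 0.9), (1, 0.2)], [(0, 0.1), (1, 0.4)]],   # shard 0
    [[(2, 0.5), (3, 0.8)], [(2, 0.7), (3, 0.3)]],   # shard 1
]

merged_per_query = []
for parts in zip(*results):   # align the same query document across shards
    merged = heapq.nlargest(num_best, itertools.chain(*parts), key=lambda item: item[1])
    merged_per_query.append(merged)

print(merged_per_query)
# [[(0, 0.9), (3, 0.8)], [(2, 0.7), (1, 0.4)]]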
Example #50
    def loadBulk(self, oids):
        """
        Storage API to return multiple objects.
        We load a unique set of them, just in case.

        :param list oids: Iterable oids to load at once
        :return: Loaded oid objects
        :rtype: list
        """
        # First, try to get whatever possible from cache
        self._load_lock.acquire()
        try:
            self._lock.acquire()    # for atomic processing of invalidations
            try:
                result = []
                for oid in oids:
                    out = self._cache.load(oid)
                    if not out:
                        self._load_oids[oid] = 1
                    else:
                        result.append(out)
            finally:
                self._lock.release()
            if len(self._load_oids) == 0:
                return result
            # If we ever get here, we need to load some more stuff
            # self._load_oids dictionary is protected by self._load_lock

            if self._server is None:
                raise ClientDisconnected()

            load_oids = list(self._load_oids.keys())

            # [(data, tid), (data, tid), ...]
            bulk_data = self._server.rpc.call("loadBulk", load_oids)

            data_size = 0
            for oid, (data, tid) in izip(load_oids, bulk_data):
                data_size += len(data)
                self._lock.acquire()    # for atomic processing of invalidations
                try:
                    if self._load_oids[oid]:  # Update cache only when there was no invalidation
                        self._cache.store(oid, tid, None, data)
                    del self._load_oids[oid]
                    result.append((data, tid))  # XXX shouldn't we provide a recent value from cache then?
                finally:
                    self._lock.release()
            logging.debug("Bulk-loaded {0} objects of size {1}".format(len(load_oids), data_size))
        finally:
            self._load_lock.release()

        return result
Example #51
def pbkdf2_bin(data, salt, iterations=DEFAULT_PBKDF2_ITERATIONS,
               keylen=None, hashfunc=None):
    """Returns a binary digest for the PBKDF2 hash algorithm of `data`
    with the given `salt`. It iterates `iterations` times and produces a
    key of `keylen` bytes. By default, SHA-1 is used as hash function;
    a different hashlib `hashfunc` can be provided.

    .. versionadded:: 0.9

    :param data: the data to derive.
    :param salt: the salt for the derivation.
    :param iterations: the number of iterations.
    :param keylen: the length of the resulting key.  If not provided
                   the digest size will be used.
    :param hashfunc: the hash function to use.  This can either be the
                     string name of a known hash function or a function
                     from the hashlib module.  Defaults to sha1.
    """
    if isinstance(hashfunc, string_types):
        hashfunc = _hash_funcs[hashfunc]
    elif not hashfunc:
        hashfunc = hashlib.sha1
    data = to_bytes(data)
    salt = to_bytes(salt)

    # If we're on Python with pbkdf2_hmac we can try to use it for
    # compatible digests.
    if _has_native_pbkdf2:
        _test_hash = hashfunc()
        if hasattr(_test_hash, 'name') and \
           _test_hash.name in _hash_funcs:
            return hashlib.pbkdf2_hmac(_test_hash.name,
                                       data, salt, iterations,
                                       keylen)

    mac = hmac.HMAC(data, None, hashfunc)
    if not keylen:
        keylen = mac.digest_size

    def _pseudorandom(x, mac=mac):
        h = mac.copy()
        h.update(x)
        return bytearray(h.digest())
    buf = bytearray()
    for block in range(1, -(-keylen // mac.digest_size) + 1):
        rv = u = _pseudorandom(salt + _pack_int(block))
        for i in range_type(iterations - 1):
            u = _pseudorandom(bytes(u))
            rv = bytearray(starmap(xor, izip(rv, u)))
        buf.extend(rv)
    return bytes(buf[:keylen])
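The fallback loop above is the textbook PBKDF2 construction: for each output block, the HMAC value is fed back through itself `iterations` times and the successive values are XORed together byte-wise (the starmap(xor, izip(rv, u)) line). When the digest is one hashlib knows about, the function short-circuits to the native primitive, which current code can call directly:

import hashlib

# Native PBKDF2-HMAC: hash name, password, salt, iteration count, key length.
# The concrete parameter values here are illustrative only.
key = hashlib.pbkdf2_hmac("sha256", b"secret-password", b"per-user-salt", 100000, 32)
print(key.hex())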
Example #52
def parallel_est(estfunc, limit_parnums, pars, numcores=_ncpus):

    tasks = []

    def worker(out_q, err_q, parids, parnums, parvals, lock):
        results = []
        for parid, singleparnum in izip(parids, parnums):
            try:
                result = estfunc(parid, singleparnum, lock)
                results.append((parid, result))
            except EstNewMin:
                # catch the EstNewMin exception and include the exception
                # class and the modified parameter values to the error queue.
                # These modified parvals determine the new lower statistic.
                # The exception class will be instantiated and re-raised with the
                # parameter values attached.  C++ Python exceptions are not
                # picklable for use in the queue.
                err_q.put(EstNewMin(parvals))
                return
            except Exception as e:
                #err_q.put( e.__class__() )
                err_q.put(e)
                return

        out_q.put(results)

    # The multiprocessing manager provides references to process-safe
    # shared objects like Queue and Lock
    manager = multiprocessing.Manager()
    out_q = manager.Queue()
    err_q = manager.Queue()
    lock = manager.Lock()

    size = len(limit_parnums)
    parids = numpy.arange(size)

    # if len(limit_parnums) is less than numcores, only use length number of
    # processes
    if size < numcores:
        numcores = size

    # group limit_parnums into numcores-worth of chunks
    limit_parnums = numpy.array_split(limit_parnums, numcores)
    parids = numpy.array_split(parids, numcores)

    tasks = [multiprocessing.Process(target=worker,
                                     args=(out_q, err_q, parid, parnum, pars, lock))
             for parid, parnum in izip(parids, limit_parnums)]

    return run_tasks(tasks, out_q, err_q, size)
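Before the workers are spawned, the parameter numbers and their ids are both array_split into numcores chunks and zipped back together, so each process receives a matching slice of ids and parameter numbers. A small sketch of that chunking (the parameter numbers are invented):

import numpy as np

limit_parnums = np.array([3, 5, 7, 11, 13, 17, 19])   # parameter numbers to analyse
numcores = 3

parids = np.arange(len(limit_parnums))
# Split both arrays the same way, then zip so each worker gets matching ids and parnums.
for parid, parnum in zip(np.array_split(parids, numcores),
                         np.array_split(limit_parnums, numcores)):
    print(parid, parnum)
# [0 1 2] [ 3  5  7]
# [3 4] [11 13]
# [5 6] [17 19]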
Example #53
    def query(self, queryobj=None, skip=None, limit=None, prefetch=True, **kw):
        """
        Smart proxy to catalog's query.
        One can add <field=...> keyword arguments to make queries where fields
        are equal to specified values

        :param zerodb.catalog.query.Query queryobj: Query object supporting all
            sorts of logical and range queries, etc.
        :param int skip: Offset to start the result iteration from
        :param int limit: Limit number of results to this
        """
        # Catalog's query returns only integers
        # We must be smart here and return objects
        # But no, we must be even smarter and batch-preload objects
        # Most difficult part is preloading TreeSets for index when needed
        # (when we do complex queries which require composite index)
        # We also probably should do something like lazy query(...)[ofs:...]
        # if no limit, skip are used

        # Work needed on skip and limit because zope didn't support them well...
        skip = skip or 0
        if limit:
            kw["limit"] = skip + limit

        eq_args = []
        for k in list(kw.keys()):
            if k not in set(["sort_index", "sort_type", "reverse", "names", "limit"]):
                eq_args.append(Eq(k, kw.pop(k)))

        if queryobj:
            Q = optimize(optimize(queryobj) & And(*eq_args))
        else:
            Q = And(*eq_args)

        q = lambda: self._catalog.query(Q, **kw)

        if limit:
            _, q = q()
            # XXX islice -> [:]
            qids = list(itertools.islice(q, skip, skip + limit))
            objects = [self._objects[uid] for uid in qids]
            if objects and prefetch:
                self._db._connection.prefetch(objects)
            for obj, uid in izip(objects, qids):
                obj._p_uid = uid
            return objects

        else:
            db_list = DBListPrefetch if prefetch else DBList
            return db_list(q, self)