def prepare_objects(self, _cube, objects, mtime):
    '''
    Validate a batch of objects and stamp them for insertion into a cube.

    Each object has its ``_start``/``_end`` values checked, is hashed with
    ``jsonhash`` (with ``_start``/``_end`` removed) and is then given
    ``_hash``, ``_start``, ``_end`` and a freshly generated string ``_id``.
    The objects are modified in place.

    Objects without an ``_end`` whose ``_hash`` matches a document already
    in ``_cube`` with ``_end`` of None are left out of the result. Objects
    that carry an ``_end`` are always kept.

    :param _cube: collection-like object; only ``find`` is called on it
    :param list objects: dictionaries that will be converted to mongodb docs
    :param int mtime: timestamp to apply as _start for objects without one
    :returns: list of the prepared objects that still need to be saved

    Validation failures are reported through ``self._raise(400, message)``.
    '''
    new_obj_hashes = []
    for obj in objects:
        # pull the time bounds out first so they do not influence the hash
        _start = obj.pop('_start', None)
        _end = obj.pop('_end', None)

        if _end is not None and _start is None:
            self._raise(400, "objects with _end must have _start")
        # NOTE(review): any falsy _start (including an explicit 0) is
        # replaced by mtime here -- confirm that 0 is never a real timestamp
        if not _start:
            _start = mtime
        if not isinstance(_start, (int, float)):
            self._raise(400, "_start must be float/int")
        if _end is not None and not isinstance(_end, (int, float)):
            self._raise(400, "_end must be float/int/None")

        # these fields are generated below; callers may not supply them
        if '_id' in obj:
            self._raise(400, "_id field CAN NOT be defined: %s" % obj)
        if '_hash' in obj:
            self._raise(400, "_hash field CAN NOT be defined: %s" % obj)
        if '_oid' not in obj:
            self._raise(400, "_oid field MUST be defined: %s" % obj)

        # hash the object (minus _start/_end)
        _hash = jsonhash(obj)
        obj['_hash'] = _hash
        # only versions without an _end are checked against the cube
        if _end is None:
            new_obj_hashes.append(_hash)

        # add back _start and _end properties
        obj['_start'] = _start
        obj['_end'] = _end

        # we want to avoid serializing in and out later
        obj['_id'] = str(ObjectId())

    # FIXME: refactor this so we split the _hashes
    # mongodb lookups iterate across 16M max spec docs...
    # (idea: estimate the size of the hash list first and query in chunks)

    # Filter out objects whose most recent version did not change
    _dup_hashes = set()
    if new_obj_hashes:
        # NOTE(review): '_id': -1 looks intended to leave _id out of the
        # returned fields; kept as-is -- confirm against the driver in use
        docs = _cube.find({'_hash': {'$in': new_obj_hashes}, '_end': None},
                          fields={'_hash': 1, '_id': -1})
        _dup_hashes = set(doc['_hash'] for doc in docs)

    # Only versions without an _end were looked up, so only those may be
    # dropped as duplicates; a version with an _end that happens to share a
    # hash with an unchanged one must still be returned.
    return [obj for obj in objects
            if obj['_end'] is not None or obj['_hash'] not in _dup_hashes]
def test_jsonhash():
    '''
    Check jsonhash against known digests and confirm list order matters.
    '''
    from metriqued.utils import jsonhash

    dct = {'a': [3, 2, 1],
           'z': ['a', 'c', 'b', 1],
           'b': {1: [], 3: {}}}

    # Same content, but with the 'z' list in sorted order. The sorted list
    # is written out literally: sorting a list that mixes str and int
    # raises TypeError on Python 3, while Python 2 yields exactly this order.
    dct_sorted_z = copy(dct)
    dct_sorted_z['z'] = [1, 'a', 'b', 'c']

    # Same content with the 'z' key removed altogether.
    dct_diff = copy(dct)
    del dct_diff['z']

    DCT = '541d0fa961265d976d9a27e8632787875dc58406'
    DCT_SORTED_Z = 'ca4631674276933bd251bd4bc86372138a841a4b'
    DCT_DIFF = '07d6c518867fb6b6c77c0ec1d835fb800419fc24'

    assert dct != dct_sorted_z

    assert jsonhash(dct) == DCT
    assert jsonhash(dct_sorted_z) == DCT_SORTED_Z
    assert jsonhash(dct_diff) == DCT_DIFF

    # list sort order is an identifier of a unique object
    assert jsonhash(dct) != jsonhash(dct_sorted_z)