def enrichEntity(self, entity, decorations, max_iterations=None, timestamp=None):
    """
    Enrich `entity` in place using every source that applies to it.

    (might be named enrichedEntityWithSources)

    Args:
        entity: an entity schema object (defined in api/Schemas.py).
        decorations: output dict of decorations. It is opaque to this class -
            only group objects and sources understand its format. The group
            method syncDecorations() handles all propagation of source-local
            decorations into this dict.
        max_iterations: upper bound on the number of passes over the sources;
            a falsy value selects the instance default.
        timestamp: forwarded to setNow() before enrichment starts.

    Returns:
        A bool indicating whether the entity was enriched.
    """
    self.setNow(timestamp)
    if not max_iterations:
        max_iterations = self.__default_max_iterations

    logs.debug("Begin enrichment: %s (%s)" % (entity.title, entity.entity_id))
    enriched_any = False

    # Sources are visited over several passes: data added by one source can
    # make a previously unresolvable source resolvable. A pass in which no
    # source modified anything ends the loop early.
    for _ in range(max_iterations):
        pass_modified = False

        for source in self.__sources:
            if entity.kind not in source.kinds:
                continue
            if entity.types and source.types:
                if not set(entity.types).intersection(source.types):
                    continue

            eligible = set(
                name for name in source.getGroups(entity)
                if self.shouldEnrich(name, source.sourceName, entity)
            )
            if not eligible:
                continue

            # There are groups eligible for enrichment; the source works on a
            # copy rebuilt from the entity's export, not on the entity itself.
            working_copy = buildEntity(entity.dataExport())

            # Handed down to the source. If the source enriches a group, it
            # adds a mapping from the group name to the time of enrichment.
            # When the external data is identical to what we already have,
            # presence of the group in this map is the only way to tell that
            # fresh data was received.
            # TODO: This is a dictionary for legacy reasons, it should really be a set.
            enriched_at = {}
            # Opaque decorations, for group object based extensions (i.e. Menus)
            source_decorations = {}

            logs.debug("Enriching with '%s' for groups %s" % (source.sourceName, sorted(eligible)))
            group_objs = [self.getGroup(name) for name in eligible]

            try:
                if source.enrichEntity(working_copy, group_objs, self, source_decorations, enriched_at):
                    for group_obj in group_objs:
                        fields_changed = group_obj.syncFields(working_copy, entity)
                        decorations_changed = group_obj.syncDecorations(source_decorations, decorations)
                        if not (fields_changed or group_obj.groupName in enriched_at or decorations_changed):
                            continue
                        group_obj.setTimestamp(entity, self.now)
                        group_obj.setSource(entity, source.sourceName)
                        pass_modified = True
            except Exception:
                # A failing source is reported and skipped; the remaining
                # sources still get their turn.
                report()

        if not pass_modified:
            break
        enriched_any |= pass_modified

    return enriched_any
def _convertFromMongo(self, document):
    """
    Turn a raw Mongo stamp document into a stamp object.

    The document is modified in place: 'search_blurb' is removed, '_id' is
    replaced by a string id under the primary key, and the embedded entity is
    reduced to its 'entity_id' before import. The full embedded entity is then
    rebuilt as a mini entity and attached to the stamp.

    Returns None when `document` is None, otherwise the imported stamp.
    """
    if document is None:
        return None

    document.pop('search_blurb', None)
    document = self._upgradeDocument(document)

    if '_id' in document and self._primary_key is not None:
        object_id = document['_id']
        document[self._primary_key] = self._getStringFromObjectId(object_id)
        del document['_id']

    # Import with only the entity id in place; the full entity data is
    # attached separately below.
    embedded = document.pop('entity')
    document['entity'] = {'entity_id': embedded['entity_id']}
    stamp = self._obj().dataImport(document, overflow=self._overflow)

    # Best effort: if the embedded entity cannot be built, the stamp is still
    # returned with whatever the import produced.
    try:
        stamp.entity = buildEntity(embedded, mini=True)
    except Exception:
        logs.warning("Unable to upgrade entity embedded within stamp '%s'" % (stamp.stamp_id))

    return stamp
def _convertFromMongo(self, document): if document is None: return None if '_id' in document and self._primary_key is not None: document[self._primary_key] = self._getStringFromObjectId(document['_id']) del(document['_id']) document.pop('titlel') entity = buildEntity(document) return entity
def _convertFromMongo(self, document):
    """
    Turn a raw Mongo document into a RawTodo.

    Keep in mind this is returning a RawTodo, which is less-enriched than a
    Todo. The document is modified in place while it is converted.

    Returns None when `document` is None, otherwise the imported raw todo
    with a mini entity attached.
    """
    if document is None:
        return None

    if "_id" in document and self._primary_key is not None:
        object_id = document["_id"]
        document[self._primary_key] = self._getStringFromObjectId(object_id)
        del document["_id"]

    # The embedded entity is built as a mini entity; only its id stays in the
    # document that gets imported.
    mini_entity = buildEntity(document.pop("entity"), mini=True)
    document["entity"] = {"entity_id": mini_entity.entity_id}

    # A single embedded 'stamp' is rewritten as a one-element
    # 'source_stamp_ids' list.
    stamp_data = document.pop("stamp", None)
    if stamp_data is not None:
        document["source_stamp_ids"] = [stamp_data["stamp_id"]]

    raw_todo = self._obj().dataImport(document, overflow=self._overflow)
    raw_todo.entity = mini_entity
    return raw_todo
def _convertFromMongo(self, document, mini=False): if document is None: return None if '_id' in document and self._primary_key is not None: document[self._primary_key] = self._getStringFromObjectId(document['_id']) del(document['_id']) ### HACK: Verify that 'created' timestamp exists for entity if 'timestamp' not in document or 'created' not in document['timestamp']: try: created = ObjectId(document[self._primary_key]).generation_time.replace(tzinfo=None) except: report() raise document['timestamp'] = { 'created' : created } document.pop('titlel', None) document.pop('search_tokens', None) entity = buildEntity(document, mini=mini) return entity
def checkIntegrity(self, key, repair=False, api=None):
    """
    Check the stamp stored under `key` and verify the following things:

    - Stamp has the current structure (not an old schema version)
    - Linked user exists
    - Credited users exist
    - Linked entity exists and is not tombstoned
    - Embedded entity matches the minimized linked entity
    - Stamp number is unique for the user
    - This is the user's only stamp for the entity

    With `repair` set, the problems that can be fixed are logged and fixed,
    and the stamp is written back if anything changed. Without it, the first
    problem found raises StampedDataError. Missing users or entities and
    duplicate stamps always raise. The stamp stats integrity check runs last
    with the same arguments.

    Returns True when the check completes.
    """
    document = self._getMongoDocumentFromId(key)
    assert document is not None

    needs_save = False

    # Check if old schema version
    if 'contents' not in document or 'credit' in document or 'search_blurb' not in document:
        msg = "%s: Old schema" % key
        if not repair:
            raise StampedDataError(msg)
        logs.info(msg)
        needs_save = True

    stamp = self._convertFromMongo(document)

    # Verify that user exists
    user_id = stamp.user.user_id
    database = self._collection._database
    owner_lookup = database['users'].find({'_id': self._getObjectIdFromString(user_id)})
    if owner_lookup.count() == 0:
        raise StampedDataError("%s: User not found (%s)" % (key, user_id))

    # Verify that any credited users exist
    if stamp.credits is not None:
        valid_credits = []
        for credit in stamp.credits:
            credited_id = credit.user.user_id
            credited_query = {'_id': self._getObjectIdFromString(credited_id)}
            if database['users'].find(credited_query).count() == 1:
                valid_credits.append(credit)
                continue

            msg = "%s: Credited user not found (%s)" % (key, credited_id)
            if not repair:
                raise StampedDataError(msg)
            logs.info(msg)
            needs_save = True

        if valid_credits:
            stamp.credits = valid_credits
        else:
            # Logged in either mode; the field is only removed when repairing.
            logs.info("%s: Cleaning up credits" % key)
            if repair:
                del stamp.credits
                needs_save = True

    # Verify that entity exists
    entity_id = stamp.entity.entity_id
    entity_doc = database['entities'].find_one({'_id': self._getObjectIdFromString(entity_id)})
    if entity_doc is None:
        raise StampedDataError("%s: Entity not found (%s)" % (key, entity_id))
    entity = buildEntity(entity_doc)

    if entity.sources.tombstone_id is not None:
        # Check if entity has been tombstoned and update entity if so
        msg = "%s: Entity tombstoned to new entity" % (key)
        if not repair:
            raise StampedDataError(msg)
        logs.info(msg)

        tombstone_id = entity.sources.tombstone_id
        replacement = database['entities'].find_one({'_id': self._getObjectIdFromString(tombstone_id)})
        if replacement is None:
            raise StampedDataError("%s: New tombstone entity not found (%s)" % (key, tombstone_id))
        stamp.entity = buildEntity(replacement).minimize()
        needs_save = True
    elif stamp.entity != entity.minimize():
        # Check if entity stub has been updated
        msg = "%s: Embedded entity is stale" % key
        if not repair:
            raise StampedDataError(msg)
        logs.info(msg)
        stamp.entity = entity.minimize()
        needs_save = True

    # Verify that stamp number is unique
    stamp_num = stamp.stats.stamp_num
    same_number = self._collection.find({'user.user_id': user_id, 'stats.stamp_num': stamp_num})
    if same_number.count() > 1:
        raise StampedDataError(
            "%s: Multiple stamps exist for userId '%s' and stampNum '%s'" % (key, user_id, stamp_num))

    # Verify that this is the only stamp for this user for this entity
    same_entity = self._collection.find({'user.user_id': user_id, 'entity.entity_id': stamp.entity.entity_id})
    if same_entity.count() > 1:
        raise StampedDataError(
            "%s: Multiple stamps exist for user '%s' and entity '%s'" % (key, user_id, stamp.entity.entity_id))

    ### TODO
    # Check if temp_image_url exists -> kick off async process
    # Check that image[s] have dimensions
    # Verify image url exists?
    # Check if stats need to be updated?

    if needs_save and repair:
        self._collection.update({'_id': key}, self._convertToMongo(stamp))

    # Check integrity for stats
    self.stamp_stats.checkIntegrity(key, repair=repair, api=api)

    return True
def checkIntegrity(self, key, repair=False, api=None):
    """
    Check the raw todo to verify the following things:

    - Todo has the proper structure (updated schema)
    - Linked user exists
    - Linked entity exists and is not tombstoned
    - Entity mini matches linked entity
    - If associated with a stamp, verify that the stamp still exists
    - Check if it's been stamped

    Args:
        key: id of the todo document; also used as the '_id' of the update
            when a repaired todo is written back.
        repair: when true, the problems that can be fixed are logged and
            fixed, and the todo is written back if anything changed. When
            false, the first such problem raises StampedDataError.
        api: accepted for signature parity with other integrity checks; not
            used here.

    Returns:
        True when the check completes.

    Raises:
        AssertionError: if no document is returned for `key`.
        StampedDataError: for a missing user or entity, a missing tombstone
            target, multiple stamps for the same user and entity, or any
            repairable problem when `repair` is false.
    """
    document = self._getMongoDocumentFromId(key)
    assert document is not None

    modified = False

    # Check if old schema version
    if "stamp" in document:
        msg = "%s: Old schema" % key
        if repair:
            logs.info(msg)
            modified = True
        else:
            raise StampedDataError(msg)

    todo = self._convertFromMongo(document)

    # Verify that user exists
    userId = todo.user_id
    if self._collection._database["users"].find({"_id": self._getObjectIdFromString(userId)}).count() == 0:
        msg = "%s: User not found (%s)" % (key, userId)
        raise StampedDataError(msg)

    # Verify that entity exists
    entityId = todo.entity.entity_id
    entityDocument = self._collection._database["entities"].find_one({"_id": self._getObjectIdFromString(entityId)})
    if entityDocument is None:
        msg = "%s: Entity not found (%s)" % (key, entityId)
        raise StampedDataError(msg)
    entity = buildEntity(entityDocument)

    # Check if entity has been tombstoned and update entity if so
    if entity.sources.tombstone_id is not None:
        msg = "%s: Entity tombstoned to new entity" % (key)
        if repair:
            logs.info(msg)
            tombstoneId = entity.sources.tombstone_id
            tombstone = self._collection._database["entities"].find_one(
                {"_id": self._getObjectIdFromString(tombstoneId)}
            )
            if tombstone is None:
                msg = "%s: New tombstone entity not found (%s)" % (key, tombstoneId)
                raise StampedDataError(msg)
            todo.entity = buildEntity(tombstone).minimize()
            modified = True
        else:
            raise StampedDataError(msg)

    # Check if entity stub has been updated
    elif todo.entity != entity.minimize():
        msg = "%s: Embedded entity is stale" % key
        if repair:
            logs.info(msg)
            todo.entity = entity.minimize()
            modified = True
        else:
            raise StampedDataError(msg)

    # Check if source stamps are still valid
    if todo.source_stamp_ids is not None:
        stampIds = []
        for stampId in todo.source_stamp_ids:
            query = {"_id": self._getObjectIdFromString(stampId)}
            if self._collection._database["stamps"].find(query).count() == 1:
                stampIds.append(stampId)
            else:
                msg = "%s: Sourced stamp not found (%s)" % (key, stampId)
                if repair:
                    logs.info(msg)
                    modified = True
                else:
                    raise StampedDataError(msg)

        if stampIds:
            todo.source_stamp_ids = stampIds
        else:
            # Logged in either mode; the field is only removed when repairing.
            msg = "%s: Cleaning up source stamp ids" % key
            logs.info(msg)
            if repair:
                del todo.source_stamp_ids
                modified = True

    # Check if todo has been stamped and verify only one possible todo exists
    query = {"user.user_id": todo.user_id, "entity.entity_id": todo.entity.entity_id}
    stamps = self._collection._database["stamps"].find(query, fields=["_id"])
    # Built as a real list: len() and indexing below need a sequence, and
    # map() only returns one on Python 2.
    stampIds = [str(stamp["_id"]) for stamp in stamps]
    if len(stampIds) == 1:
        if todo.stamp_id is None or todo.stamp_id != stampIds[0]:
            msg = "%s: Replacing stamp id" % key
            if repair:
                logs.info(msg)
                todo.stamp_id = stampIds[0]
                modified = True
            else:
                raise StampedDataError(msg)
    elif len(stampIds) > 1:
        msg = "%s: Multiple stamps exist for user '%s' and entity '%s'" % (key, todo.user_id, todo.entity.entity_id)
        raise StampedDataError(msg)

    if modified and repair:
        self._collection.update({"_id": key}, self._convertToMongo(todo))

    return True
def __init__(self, entity):
    """
    Store a private entity rebuilt from the export of `entity`.

    The instance keeps the result of buildEntity(), not the object that was
    passed in.
    """
    exported = entity.dataExport()
    self.__entity = buildEntity(exported)