def createOrUpdateBranches(self, importBranchesEncodedPayload: bytes) -> None:
    """ Convert Import Branch Tuples

    This method takes import branch tuples and converts them to the branch
    format used throughout the diagram plugin.
    (That's the packed JSON wrapped by an accessor class)

    """
    # Decode importBranches payload
    importBranches: List[ImportBranchTuple] = (
        Payload().fromEncodedPayload(importBranchesEncodedPayload).tuples)

    # Validate the input importBranches
    _validateNewBranchIndexs(importBranches)

    # Do the import
    groupedBranches = _convertImportBranchTuples(importBranches)

    startTime = datetime.now(pytz.utc)

    dbSession = CeleryDbConn.getDbSession()

    engine = CeleryDbConn.getDbEngine()
    conn = engine.connect()
    transaction = conn.begin()

    try:
        for (modelSetKey, modelSetId, coordSetId), branches in groupedBranches.items():
            _insertOrUpdateBranches(conn, modelSetKey, modelSetId, branches)

            newDisps, dispIdsToCompile = _convertBranchDisps(branches)

            # NO TRANSACTION
            # Bulk load the Disps
            _bulkInsertDisps(engine, newDisps)

            # Queue the compiler
            DispCompilerQueueController.queueDispIdsToCompileWithSession(
                dispIdsToCompile, conn)

            transaction.commit()
            dbSession.commit()

            logger.debug(
                "Completed importing %s branches for coordSetId %s in %s",
                len(branches), coordSetId, (datetime.now(pytz.utc) - startTime))

    except Exception as e:
        dbSession.rollback()
        transaction.rollback()
        logger.debug("Retrying createOrUpdateBranches, %s", e)
        logger.exception(e)
        raise self.retry(exc=e, countdown=3)

    finally:
        dbSession.close()
        conn.close()


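# --- Illustrative sketch (not part of the plugin code above) ---
# These functions are bound Celery tasks: "self" is the task instance, and
# "raise self.retry(exc=e, countdown=3)" re-queues the task after a failure.
# A minimal, hypothetical example of that pattern; the app name, broker URL
# and task body are placeholders, not the plugin's real configuration.
from celery import Celery

demoApp = Celery("worker_demo", broker="memory://")


@demoApp.task(bind=True, max_retries=3)
def demoRetryingTask(self, payload: bytes) -> None:
    try:
        # ... do the database work here ...
        pass
    except Exception as e:
        # Ask Celery to run this task again in 3 seconds.
        raise self.retry(exc=e, countdown=3)

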
def deleteTraceConfig(self, modelSetKey: str, traceConfigKeys: List[str]) -> None:
    startTime = datetime.now(pytz.utc)

    traceConfigTable = GraphDbTraceConfig.__table__

    engine = CeleryDbConn.getDbEngine()
    conn = engine.connect()
    transaction = conn.begin()
    try:
        modelSetIdByKey = _loadModelSets()
        modelSetId = modelSetIdByKey[modelSetKey]

        conn.execute(
            traceConfigTable.delete(
                and_(traceConfigTable.c.key.in_(traceConfigKeys),
                     traceConfigTable.c.modelSetId == modelSetId))
        )

        transaction.commit()
        logger.info("Deleted %s trace configs in %s",
                    len(traceConfigKeys),
                    (datetime.now(pytz.utc) - startTime))

    except Exception as e:
        transaction.rollback()
        logger.debug("Retrying deleteTraceConfig, %s", e)
        raise self.retry(exc=e, countdown=3)

    finally:
        conn.close()


def _loadModelSets() -> Dict[str, int]:
    # Get the model set
    engine = CeleryDbConn.getDbEngine()
    conn = engine.connect()
    try:
        results = list(
            conn.execute(
                select(columns=[_modelSetTable.c.id, _modelSetTable.c.key])))
        modelSetIdByKey = {o.key: o.id for o in results}
        del results
    finally:
        conn.close()

    return modelSetIdByKey


def _loadCoordSets(modelSetId: int) -> Dict[str, int]:
    # Get the coord sets for this model set
    engine = CeleryDbConn.getDbEngine()
    conn = engine.connect()
    try:
        results = list(
            conn.execute(
                select(columns=[_coordSetTable.c.id, _coordSetTable.c.key],
                       whereclause=_coordSetTable.c.modelSetId == modelSetId)))
        coordSetIdByKey = {o.key: o.id for o in results}
        del results
    finally:
        conn.close()

    return coordSetIdByKey


def _bulkLoadDispsTask(importGroupHash: str, disps: List):
    """ Bulk Load Disps

    1) Drop all disps with matching importGroupHash

    2) set the coordSetId

    :param importGroupHash:
    :param disps: An array of disp objects to import
    :return:
    """
    dispTable = DispBase.__table__
    gridKeyIndexTable = GridKeyIndex.__table__
    gridQueueTable = GridKeyCompilerQueue.__table__

    engine = CeleryDbConn.getDbEngine()
    conn = engine.connect()
    transaction = conn.begin()

    try:
        stmt = select([gridKeyIndexTable.c.coordSetId, gridKeyIndexTable.c.gridKey]) \
            .where(dispTable.c.importGroupHash == importGroupHash) \
            .select_from(join(gridKeyIndexTable, dispTable,
                              gridKeyIndexTable.c.dispId == dispTable.c.id)) \
            .distinct()

        ins = gridQueueTable.insert().from_select(['coordSetId', 'gridKey'], stmt)
        conn.execute(ins)

        conn.execute(dispTable.delete().where(
            dispTable.c.importGroupHash == importGroupHash))

        transaction.commit()

        _bulkInsertDisps(engine, disps)

    except Exception:
        transaction.rollback()
        raise

    finally:
        conn.close()


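# --- Illustrative sketch (not part of the plugin code above) ---
# _bulkLoadDispsTask queues the affected grid keys with an
# "INSERT ... FROM SELECT" before deleting the old disps. The minimal,
# self-contained example below shows that SQLAlchemy Core pattern using
# hypothetical throwaway tables and an in-memory SQLite engine.
from sqlalchemy import Column, Integer, MetaData, String, Table, create_engine, select


def _demoInsertFromSelect():
    engine = create_engine("sqlite://")
    metadata = MetaData()
    src = Table("src", metadata,
                Column("coordSetId", Integer),
                Column("gridKey", String))
    queue = Table("queue", metadata,
                  Column("coordSetId", Integer),
                  Column("gridKey", String))
    metadata.create_all(engine)

    with engine.connect() as conn:
        conn.execute(src.insert(), [dict(coordSetId=1, gridKey="1|1.5x5")])

        # Select the rows to queue, then insert them in one statement, as the
        # task does with gridKeyIndexTable and gridQueueTable.
        stmt = select([src.c.coordSetId, src.c.gridKey]).distinct()
        conn.execute(queue.insert().from_select(["coordSetId", "gridKey"], stmt))

        return conn.execute(select([queue.c.gridKey])).fetchall()

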
def deleteSegment(self, modelSetKey: str, importGroupHashes: List[str]) -> None:
    startTime = datetime.now(pytz.utc)

    segmentTable = GraphDbSegment.__table__
    queueTable = GraphDbCompilerQueue.__table__

    engine = CeleryDbConn.getDbEngine()
    conn = engine.connect()
    transaction = conn.begin()
    try:
        modelSetIdByKey = _loadModelSets()
        modelSetId = modelSetIdByKey[modelSetKey]

        chunkKeys = conn.execute(
            select([segmentTable.c.modelSetId, segmentTable.c.chunkKey],
                   and_(segmentTable.c.importGroupHash.in_(importGroupHashes),
                        segmentTable.c.modelSetId == modelSetId))).fetchall()

        if chunkKeys:
            conn.execute(
                segmentTable.delete(
                    and_(segmentTable.c.importGroupHash.in_(importGroupHashes),
                         segmentTable.c.modelSetId == modelSetId)))

            conn.execute(queueTable.insert(), chunkKeys)

        deleteItemKeys(conn, modelSetId, importGroupHashes)

        transaction.commit()
        logger.info("Deleted %s, queued %s chunks in %s",
                    len(importGroupHashes),
                    len(chunkKeys),
                    (datetime.now(pytz.utc) - startTime))

    except Exception as e:
        transaction.rollback()
        logger.debug("Retrying graphDb deleteSegment, %s", e)
        raise self.retry(exc=e, countdown=3)

    finally:
        conn.close()


def compileBranchIndexChunk(self, payloadEncodedArgs: bytes) -> List[int]:
    """ Compile BranchIndex Index Task

    :param self: A bound parameter from celery
    :param payloadEncodedArgs: An encoded payload containing the queue tuples.
    :returns: A list of chunk keys that have been updated.
    """
    argData = Payload().fromEncodedPayload(payloadEncodedArgs).tuples
    queueItems = argData[0]
    queueItemIds: List[int] = argData[1]

    engine = CeleryDbConn.getDbEngine()
    conn = engine.connect()
    transaction = conn.begin()

    try:
        queueItemsByModelSetId = defaultdict(list)

        for queueItem in queueItems:
            queueItemsByModelSetId[queueItem.modelSetId].append(queueItem)

        for modelSetId, modelSetQueueItems in queueItemsByModelSetId.items():
            _compileBranchIndexChunk(conn, transaction, modelSetId,
                                     modelSetQueueItems)

        queueTable = BranchIndexCompilerQueue.__table__

        transaction = conn.begin()
        conn.execute(queueTable.delete(queueTable.c.id.in_(queueItemIds)))
        transaction.commit()

    except Exception as e:
        transaction.rollback()
        logger.debug("RETRYING task - %s", e)
        raise self.retry(exc=e, countdown=10)

    finally:
        conn.close()

    return list(set([i.chunkKey for i in queueItems]))


def _insertToDb(dispIds, gridCompiledQueueItems, gridKeyIndexesByDispId,
                locationCompiledQueueItems, locationIndexByDispId, queueIds):
    """ Insert to DB

    This method provides the DB inserts and deletes after the data has
    been calculated.

    """
    startTime = datetime.now(pytz.utc)

    dispBaseTable = DispBase.__table__
    dispQueueTable = DispIndexerQueue.__table__

    gridKeyIndexTable = GridKeyIndex.__table__
    gridQueueTable = GridKeyCompilerQueue.__table__

    locationIndexTable = LocationIndex.__table__
    locationIndexCompilerQueueTable = LocationIndexCompilerQueue.__table__

    engine = CeleryDbConn.getDbEngine()
    conn = engine.connect()
    transaction = conn.begin()
    try:
        lockedDispIds = conn.execute(
            Select(whereclause=dispBaseTable.c.id.in_(dispIds),
                   columns=[dispBaseTable.c.id],
                   for_update=True))

        lockedDispIds = [o[0] for o in lockedDispIds]

        # Ensure that the Disps exist, otherwise we get an integrity error.
        gridKeyIndexes = []
        locationIndexes = []
        for dispId in lockedDispIds:
            gridKeyIndexes.extend(gridKeyIndexesByDispId[dispId])

            if dispId in locationIndexByDispId:
                locationIndexes.append(locationIndexByDispId[dispId])

        # Delete existing items in the location and grid index

        # grid index
        conn.execute(
            gridKeyIndexTable.delete(gridKeyIndexTable.c.dispId.in_(dispIds)))

        # location index
        conn.execute(
            locationIndexTable.delete(
                locationIndexTable.c.dispId.in_(dispIds)))

        # ---------------
        # Insert the Grid Key indexes
        if gridKeyIndexes:
            conn.execute(gridKeyIndexTable.insert(), gridKeyIndexes)

        # Directly insert into the Grid compiler queue.
        if gridCompiledQueueItems:
            conn.execute(gridQueueTable.insert(), [
                dict(coordSetId=i.coordSetId, gridKey=i.gridKey)
                for i in gridCompiledQueueItems
            ])

        # ---------------
        # Insert the Location indexes
        if locationIndexes:
            conn.execute(locationIndexTable.insert(), locationIndexes)

        # Directly insert into the Location compiler queue.
        if locationCompiledQueueItems:
            conn.execute(locationIndexCompilerQueueTable.insert(), [
                dict(modelSetId=i.modelSetId, indexBucket=i.indexBucket)
                for i in locationCompiledQueueItems
            ])

        # ---------------
        # Finally, delete the disp queue items
        conn.execute(dispQueueTable.delete(dispQueueTable.c.id.in_(queueIds)))

        transaction.commit()
        logger.debug("Committed %s GridKeyIndex in %s",
                     len(gridKeyIndexes), (datetime.now(pytz.utc) - startTime))

    except Exception:
        transaction.rollback()
        raise

    finally:
        conn.close()


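# --- Illustrative sketch (not part of the plugin code above) ---
# _insertToDb first locks the disp rows with SELECT ... FOR UPDATE so another
# worker can't remove them between the index delete and re-insert. A minimal
# sketch of that locking construct in SQLAlchemy Core, using a hypothetical
# table and compiled for PostgreSQL rather than executed:
from sqlalchemy import Column, Integer, MetaData, Table, select
from sqlalchemy.dialects import postgresql


def _demoRowLockStatement(dispIds):
    metadata = MetaData()
    dispBase = Table("DispBase", metadata, Column("id", Integer, primary_key=True))

    # Modern spelling of the legacy Select(..., for_update=True) used above.
    stmt = (select([dispBase.c.id])
            .where(dispBase.c.id.in_(dispIds))
            .with_for_update())

    # Renders "SELECT ... WHERE ... IN (...) FOR UPDATE"
    return str(stmt.compile(dialect=postgresql.dialect()))

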
def _insertOrUpdateObjects(newDocuments: List[ImportDocumentTuple],
                           modelSetId: int,
                           docTypeIdsByName: Dict[str, int]) -> None:
    """ Insert or Update Objects

    1) Find objects and update them

    2) Insert objects if they are missing

    """
    documentTable = DocDbDocument.__table__
    queueTable = DocDbCompilerQueue.__table__

    startTime = datetime.now(pytz.utc)

    engine = CeleryDbConn.getDbEngine()
    conn = engine.connect()
    transaction = conn.begin()

    try:
        dontDeleteObjectIds = []
        objectIdByKey: Dict[str, int] = {}

        objectKeys = [o.key for o in newDocuments]
        chunkKeysForQueue: Set[Tuple[int, str]] = set()

        # Query existing objects
        results = list(
            conn.execute(
                select(columns=[documentTable.c.id, documentTable.c.key,
                                documentTable.c.chunkKey,
                                documentTable.c.documentJson],
                       whereclause=and_(
                           documentTable.c.key.in_(objectKeys),
                           documentTable.c.modelSetId == modelSetId))))

        foundObjectByKey = {o.key: o for o in results}
        del results

        # Get the IDs that we need
        newIdGen = CeleryDbConn.prefetchDeclarativeIds(
            DocDbDocument, len(newDocuments) - len(foundObjectByKey))

        # Create state arrays
        inserts = []
        updates = []
        processedKeys = set()

        # Work out which objects have been updated or need inserting
        for importDocument in newDocuments:
            if importDocument.key in processedKeys:
                raise Exception("Key %s exists in import data twice"
                                % importDocument.key)
            processedKeys.add(importDocument.key)

            existingObject = foundObjectByKey.get(importDocument.key)
            importDocumentTypeId = docTypeIdsByName[
                importDocument.documentTypeKey]

            packedJsonDict = {
                k: v
                for k, v in importDocument.document.items()
                if v is not None and v != ''
            }  # 0 / false allowed
            packedJsonDict['_dtid'] = importDocumentTypeId
            packedJsonDict['_msid'] = modelSetId
            documentJson = json.dumps(packedJsonDict, sort_keys=True)

            # Work out if we need to update the object type
            if existingObject:
                updates.append(
                    dict(b_id=existingObject.id,
                         b_typeId=importDocumentTypeId,
                         b_documentJson=documentJson))
                dontDeleteObjectIds.append(existingObject.id)

            else:
                id_ = next(newIdGen)
                existingObject = DocDbDocument(
                    id=id_,
                    modelSetId=modelSetId,
                    documentTypeId=importDocumentTypeId,
                    key=importDocument.key,
                    importGroupHash=importDocument.importGroupHash,
                    chunkKey=makeChunkKey(importDocument.modelSetKey,
                                          importDocument.key),
                    documentJson=documentJson)
                inserts.append(existingObject.tupleToSqlaBulkInsertDict())

            objectIdByKey[existingObject.key] = existingObject.id
            chunkKeysForQueue.add((modelSetId, existingObject.chunkKey))

        # Insert the DocDb Objects
        if inserts:
            conn.execute(documentTable.insert(), inserts)

        if updates:
            stmt = (documentTable.update()
                    .where(documentTable.c.id == bindparam('b_id'))
                    .values(documentTypeId=bindparam('b_typeId'),
                            documentJson=bindparam('b_documentJson')))
            conn.execute(stmt, updates)

        if chunkKeysForQueue:
            conn.execute(
                queueTable.insert(),
                [dict(modelSetId=m, chunkKey=c) for m, c in chunkKeysForQueue])

        if inserts or updates or chunkKeysForQueue:
            transaction.commit()
        else:
            transaction.rollback()

        logger.debug("Inserted %s updated %s queued %s chunks in %s",
                     len(inserts), len(updates), len(chunkKeysForQueue),
                     (datetime.now(pytz.utc) - startTime))

    except Exception:
        transaction.rollback()
        raise

    finally:
        conn.close()


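# --- Illustrative sketch (not part of the plugin code above) ---
# The update branch of _insertOrUpdateObjects uses bindparam() placeholders so
# a single UPDATE statement can be executed "executemany" style with one dict
# per row. A self-contained sketch of that pattern with a hypothetical table
# and an in-memory SQLite engine:
from sqlalchemy import Column, Integer, MetaData, String, Table, bindparam, create_engine


def _demoBulkUpdate():
    engine = create_engine("sqlite://")
    metadata = MetaData()
    doc = Table("doc", metadata,
                Column("id", Integer, primary_key=True),
                Column("documentJson", String))
    metadata.create_all(engine)

    with engine.connect() as conn:
        conn.execute(doc.insert(), [dict(id=1, documentJson="{}"),
                                    dict(id=2, documentJson="{}")])

        # One statement, many parameter sets; b_id / b_documentJson mirror the
        # b_-prefixed keys built up in the updates list above.
        stmt = (doc.update()
                .where(doc.c.id == bindparam("b_id"))
                .values(documentJson=bindparam("b_documentJson")))
        conn.execute(stmt, [dict(b_id=1, b_documentJson='{"a": 1}'),
                            dict(b_id=2, b_documentJson='{"b": 2}')])

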
def importLiveDbItems(self, modelSetKey: str,
                      newItems: List[ImportLiveDbItemTuple]) -> List[str]:
    """ Import LiveDB Items

    :param self: A celery reference to this task
    :param modelSetKey: The model set name
    :param newItems: The list of new items
    :returns: A list of the keys that were newly inserted.
    """
    startTime = datetime.now(pytz.utc)

    session = CeleryDbConn.getDbSession()
    engine = CeleryDbConn.getDbEngine()
    conn = engine.connect()
    transaction = conn.begin()

    liveDbTable = LiveDbItem.__table__

    try:
        liveDbModelSet = getOrCreateLiveDbModelSet(session, modelSetKey)

        # This will remove duplicates
        itemsByKey = {i.key: i for i in newItems}

        allKeys = list(itemsByKey)
        existingKeys = set()

        # Query for existing keys, in chunks of 1000
        chunkSize = 1000
        offset = 0
        while True:
            chunk = allKeys[offset:offset + chunkSize]
            if not chunk:
                break
            offset += chunkSize
            stmt = (select([liveDbTable.c.key])
                    .where(liveDbTable.c.modelSetId == liveDbModelSet.id)
                    .where(makeCoreValuesSubqueryCondition(
                        engine, liveDbTable.c.key, chunk)))

            result = conn.execute(stmt)

            existingKeys.update([o[0] for o in result.fetchall()])

        inserts = []
        newKeys = []

        for newItem in itemsByKey.values():
            if newItem.key in existingKeys:
                continue

            inserts.append(dict(
                modelSetId=liveDbModelSet.id,
                key=newItem.key,
                dataType=newItem.dataType,
                rawValue=newItem.rawValue,
                displayValue=newItem.displayValue,
                importHash=newItem.importHash
            ))

            newKeys.append(newItem.key)

        if not inserts:
            return []

        conn.execute(LiveDbItem.__table__.insert(), inserts)

        transaction.commit()

        logger.info("Inserted %s LiveDbItems, %s already existed, in %s",
                    len(inserts), len(existingKeys),
                    (datetime.now(pytz.utc) - startTime))

        return newKeys

    except Exception as e:
        transaction.rollback()
        logger.debug("Task failed, but it will retry. %s", e)
        raise self.retry(exc=e, countdown=10)

    finally:
        conn.close()
        session.close()


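# --- Illustrative sketch (not part of the plugin code above) ---
# importLiveDbItems checks which keys already exist by querying in slices of
# 1000 so each values condition stays a manageable size. The chunking loop on
# its own, as a small generator with a hypothetical name:
from typing import Iterable, List


def _chunks(items: List, chunkSize: int = 1000) -> Iterable[List]:
    offset = 0
    while True:
        chunk = items[offset:offset + chunkSize]
        if not chunk:
            break
        offset += chunkSize
        yield chunk


# Usage: each chunk would be passed to one SELECT against LiveDbItem.key
# for chunk in _chunks(allKeys):
#     ...query the existing keys for this chunk...

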
def compileSearchIndexChunk(self, payloadEncodedArgs: bytes) -> List[str]:
    """ Compile Search Index Task

    :param self: A celery reference to this task
    :param payloadEncodedArgs: An encoded payload containing the queue tuples.
    :returns: A list of chunk keys that have been updated.
    """
    argData = Payload().fromEncodedPayload(payloadEncodedArgs).tuples
    queueItems = argData[0]
    queueItemIds: List[int] = argData[1]

    chunkKeys = list(set([i.chunkKey for i in queueItems]))

    queueTable = SearchIndexCompilerQueue.__table__
    compiledTable = EncodedSearchIndexChunk.__table__
    lastUpdate = datetime.now(pytz.utc).isoformat()

    startTime = datetime.now(pytz.utc)

    engine = CeleryDbConn.getDbEngine()
    conn = engine.connect()
    transaction = conn.begin()

    try:
        logger.debug("Starting compile of %s queueItems in %s",
                     len(queueItems), (datetime.now(pytz.utc) - startTime))

        # Get Model Sets

        total = 0

        existingHashes = _loadExistingHashes(conn, chunkKeys)
        encKwPayloadByChunkKey = _buildIndex(conn, chunkKeys)
        chunksToDelete = []

        inserts = []
        for chunkKey, searchIndexChunkEncodedPayload in encKwPayloadByChunkKey.items():
            m = hashlib.sha256()
            m.update(searchIndexChunkEncodedPayload)
            encodedHash = b64encode(m.digest()).decode()

            # Compare the hash, AND delete the chunk key
            if chunkKey in existingHashes:
                # At this point we could decide to do an update instead,
                # but inserts are quicker
                if encodedHash == existingHashes.pop(chunkKey):
                    continue

                chunksToDelete.append(chunkKey)

            inserts.append(
                dict(chunkKey=chunkKey,
                     encodedData=searchIndexChunkEncodedPayload,
                     encodedHash=encodedHash,
                     lastUpdate=lastUpdate))

        # Add any chunks that we need to delete that we don't have new data for, here
        chunksToDelete.extend(list(existingHashes))

        if chunksToDelete:
            # Delete the old chunks
            conn.execute(
                compiledTable.delete(
                    compiledTable.c.chunkKey.in_(chunksToDelete)))

        if inserts:
            newIdGen = CeleryDbConn.prefetchDeclarativeIds(
                SearchIndex, len(inserts))
            for insert in inserts:
                insert["id"] = next(newIdGen)

        transaction.commit()
        transaction = conn.begin()

        if inserts:
            conn.execute(compiledTable.insert(), inserts)

        logger.debug("Compiled %s SearchIndexes, %s missing, in %s",
                     len(inserts), len(chunkKeys) - len(inserts),
                     (datetime.now(pytz.utc) - startTime))

        total += len(inserts)

        conn.execute(queueTable.delete(queueTable.c.id.in_(queueItemIds)))

        transaction.commit()
        logger.info("Compiled and Committed %s EncodedSearchIndexChunks in %s",
                    total, (datetime.now(pytz.utc) - startTime))

        return chunkKeys

    except Exception as e:
        transaction.rollback()
        # logger.warning(e)  # Just a warning, it will retry
        logger.exception(e)
        raise self.retry(exc=e, countdown=10)

    finally:
        conn.close()


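# --- Illustrative sketch (not part of the plugin code above) ---
# The compiler decides whether a chunk needs rewriting by hashing the freshly
# built encoded payload and comparing it against the stored encodedHash.
# The hash helper on its own (hypothetical function name):
import hashlib
from base64 import b64encode


def _encodedPayloadHash(encodedPayload: bytes) -> str:
    m = hashlib.sha256()
    m.update(encodedPayload)
    return b64encode(m.digest()).decode()


# Unchanged chunks are skipped; changed ones are deleted and re-inserted,
# because a delete plus insert is quicker here than an update:
# if _encodedPayloadHash(newPayload) == existingHashes.get(chunkKey): skip

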
def updateBranches(self, modelSetId: int, branchEncodedPayload: bytes) -> None:
    """ Update Branch

    This method is called from the UI to update a single branch.
    It could be called from a server API as well.

    All the branches must be for the same model set.
    """
    # Decode BranchTuples payload
    updatedBranches: List[BranchTuple] = (
        Payload().fromEncodedPayload(branchEncodedPayload).tuples
    )

    startTime = datetime.now(pytz.utc)

    queueTable = BranchIndexCompilerQueue.__table__
    dispBaseTable = DispBase.__table__
    gridKeyIndexTable = GridKeyIndex.__table__
    gridKeyCompilerQueueTable = GridKeyCompilerQueue.__table__

    branchesByCoordSetId: Dict[int, List[BranchTuple]] = defaultdict(list)
    chunkKeys: Set[str] = set()

    newBranchesToInsert = []

    # Create a lookup of CoordSets by ID
    dbSession = CeleryDbConn.getDbSession()
    try:
        # Get the latest lookups
        modelSet = dbSession.query(ModelSet).filter(ModelSet.id == modelSetId).one()
        coordSetById = {i.id: i for i in dbSession.query(ModelCoordSet).all()}
        dbSession.expunge_all()

        # Update the branches
        # This will be a performance problem if lots of branches are updated,
        # however, on first writing this will just be used by the UI for updating
        # individual branches.
        for branch in updatedBranches:
            try:
                if str(branch.id).startswith("NEW_"):
                    branch.id = None

                if branch.id is None:
                    branchIndex = dbSession.query(BranchIndex) \
                        .filter(BranchIndex.coordSetId == branch.coordSetId) \
                        .filter(BranchIndex.key == branch.key) \
                        .one()
                else:
                    branchIndex = dbSession.query(BranchIndex) \
                        .filter(BranchIndex.id == branch.id) \
                        .one()

                branch.id = branchIndex.id
                branchIndex.packedJson = branch.packJson()
                branchIndex.updatedDate = branch.updatedDate

            except NoResultFound:
                newBranchesToInsert.append(branch)

            branchesByCoordSetId[branch.coordSetId].append(branch)
            chunkKeys.add(makeChunkKeyForBranchIndex(modelSet.key, branch.key))

        dbSession.commit()

    except Exception as e:
        dbSession.rollback()
        logger.debug("Retrying updateBranch, %s", e)
        logger.exception(e)
        raise self.retry(exc=e, countdown=3)

    finally:
        dbSession.close()

    dbSession = CeleryDbConn.getDbSession()
    try:
        if newBranchesToInsert:
            _insertOrUpdateBranches(dbSession, modelSet.key, modelSet.id,
                                    newBranchesToInsert)
            dbSession.commit()

        # Make an array of all branch IDs
        allBranchIds = []
        for branches in branchesByCoordSetId.values():
            allBranchIds.extend([b.id for b in branches])

        # Find out all the existing grids affected by this branch.
        gridsToRecompile = dbSession.execute(
            select(distinct=True,
                   columns=[gridKeyIndexTable.c.gridKey,
                            gridKeyIndexTable.c.coordSetId],
                   whereclause=dispBaseTable.c.branchId.in_(allBranchIds))
            .select_from(gridKeyIndexTable.join(dispBaseTable))
        ).fetchall()

        allNewDisps = []
        allDispIdsToCompile = []
        packedJsonUpdates = []

        # Recompile the BranchGridIndexes
        for coordSetId, branches in branchesByCoordSetId.items():
            coordSet = coordSetById[coordSetId]
            assert coordSet.modelSetId == modelSetId, "Branches not all from one model"

            newDisps, dispIdsToCompile = _convertBranchDisps(branches)
            allNewDisps.extend(newDisps)
            allDispIdsToCompile.extend(dispIdsToCompile)

            packedJsonUpdates.extend([
                dict(b_id=b.id, b_packedJson=b.packJson()) for b in branches
            ])

        dbSession.execute(
            dispBaseTable.delete(dispBaseTable.c.branchId.in_(allBranchIds))
        )

        dbSession.commit()

        # NO TRANSACTION
        # Bulk load the Disps
        _bulkInsertDisps(CeleryDbConn.getDbEngine(), allNewDisps)

        # Queue the compiler
        DispCompilerQueueController.queueDispIdsToCompileWithSession(
            allDispIdsToCompile, dbSession
        )

        # Update the packed JSON back into the BranchIndex table
        stmt = BranchIndex.__table__.update() \
            .where(BranchIndex.__table__.c.id == bindparam('b_id')) \
            .values(packedJson=bindparam('b_packedJson'))
        dbSession.execute(stmt, packedJsonUpdates)

        # 3) Queue chunks for recompile
        dbSession.execute(
            queueTable.insert(),
            [dict(modelSetId=modelSetId, chunkKey=c) for c in chunkKeys]
        )

        # 4) Queue grids for recompile
        if gridsToRecompile:
            dbSession.execute(
                gridKeyCompilerQueueTable.insert(),
                [dict(coordSetId=item.coordSetId, gridKey=item.gridKey)
                 for item in gridsToRecompile]
            )

        dbSession.commit()

        logger.debug("Updated %s BranchIndexes queued %s chunks in %s",
                     len(updatedBranches), len(chunkKeys),
                     (datetime.now(pytz.utc) - startTime))

    except Exception as e:
        dbSession.rollback()
        logger.debug("Retrying updateBranch, %s", e)
        logger.exception(e)
        raise self.retry(exc=e, countdown=3)

    finally:
        dbSession.close()


def importDispLinks(coordSet: ModelCoordSet,
                    importGroupHash: str,
                    importDispLinks: List[ImportLiveDbDispLinkTuple]
                    ) -> List[ImportLiveDbItemTuple]:
    """ Import Disp Links

    1) Drop all disp links with matching importGroupHash

    2) set the coordSetId

    :param coordSet:
    :param importGroupHash:
    :param importDispLinks: An array of import LiveDB Disp Links to import
    :return:
    """
    dispLinkTable = LiveDbDispLink.__table__
    dispLinkIdIterator = prefetchDeclarativeIds(LiveDbDispLink, len(importDispLinks))

    startTime = datetime.now(pytz.utc)

    ormSession = CeleryDbConn.getDbSession()
    try:
        ormSession.execute(dispLinkTable
                           .delete()
                           .where(dispLinkTable.c.importGroupHash == importGroupHash))

        if not importDispLinks:
            return []

        liveDbItemsToImportByKey = {}

        dispLinkInserts = []

        for importDispLink in importDispLinks:
            dispLink = _convertImportDispLinkTuple(coordSet, importDispLink)
            dispLink.id = next(dispLinkIdIterator)

            liveDbItem = _makeImportLiveDbItem(
                importDispLink, liveDbItemsToImportByKey
            )

            dispLink.liveDbKey = liveDbItem.key
            dispLinkInserts.append(dispLink.tupleToSqlaBulkInsertDict())

        # if dispLinkInserts:
        #     ormSession.execute(LiveDbDispLink.__table__.insert(), dispLinkInserts)

        ormSession.commit()

        if dispLinkInserts:
            # This commits itself
            rawConn = CeleryDbConn.getDbEngine().raw_connection()
            pgCopyInsert(rawConn, LiveDbDispLink.__table__, dispLinkInserts)
            rawConn.commit()

        logger.info(
            "Inserted %s LiveDbDispLinks in %s",
            len(dispLinkInserts),
            (datetime.now(pytz.utc) - startTime)
        )

        return list(liveDbItemsToImportByKey.values())

    finally:
        ormSession.close()


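# --- Illustrative sketch (not part of the plugin code above) ---
# pgCopyInsert (not shown here) bulk loads the rows through PostgreSQL's COPY
# rather than one INSERT per row. The sketch below is an assumption about what
# such a helper might look like with psycopg2's copy_expert; it is not the
# plugin's actual implementation and it ignores details such as NULL handling.
import csv
import io


def _copyInsertSketch(rawConn, tableName, columnNames, rows):
    # Serialise the row dicts to CSV in memory
    buf = io.StringIO()
    writer = csv.writer(buf)
    for row in rows:
        writer.writerow([row[c] for c in columnNames])
    buf.seek(0)

    columnSql = ", ".join('"%s"' % c for c in columnNames)
    with rawConn.cursor() as cursor:
        cursor.copy_expert(
            'COPY "%s" (%s) FROM STDIN WITH CSV' % (tableName, columnSql), buf)
    # The caller commits, as importDispLinks does with rawConn.commit()

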
def _insertOrUpdateObjects(newSegments: List[GraphDbImportSegmentTuple],
                           modelSetId: int,
                           modelSetKey: str) -> None:
    """ Insert or Update Objects

    1) Find objects and update them

    2) Insert objects if they are missing

    """
    segmentTable = GraphDbSegment.__table__
    queueTable = GraphDbCompilerQueue.__table__

    startTime = datetime.now(pytz.utc)

    importHashSet = set()

    chunkKeysForQueue: Set[Tuple[int, str]] = set()

    # Get the IDs that we need
    newIdGen = CeleryDbConn.prefetchDeclarativeIds(GraphDbSegment, len(newSegments))

    # Create state arrays
    inserts = []
    newItemKeys = []

    # Work out which objects have been updated or need inserting
    for importSegment in newSegments:
        importHashSet.add(importSegment.importGroupHash)

        segmentJson = importSegment.packJson()

        id_ = next(newIdGen)
        existingObject = GraphDbSegment(
            id=id_,
            modelSetId=modelSetId,
            key=importSegment.key,
            importGroupHash=importSegment.importGroupHash,
            chunkKey=makeChunkKeyForSegmentKey(importSegment.modelSetKey,
                                               importSegment.key),
            segmentJson=segmentJson)
        inserts.append(existingObject.tupleToSqlaBulkInsertDict())

        chunkKeysForQueue.add((modelSetId, existingObject.chunkKey))

        for edge in importSegment.edges:
            newItemKeys.append(
                ItemKeyImportTuple(
                    importGroupHash=importSegment.importGroupHash,
                    itemKey=edge.key,
                    itemType=ItemKeyTuple.ITEM_TYPE_EDGE,
                    segmentKey=importSegment.key))

        for vertex in importSegment.vertexes:
            newItemKeys.append(
                ItemKeyImportTuple(
                    importGroupHash=importSegment.importGroupHash,
                    itemKey=vertex.key,
                    itemType=ItemKeyTuple.ITEM_TYPE_VERTEX,
                    segmentKey=importSegment.key))

    # TODO: If this fails, we could potentially delete by segment key.
    # But that seems a bit hackish, the agents should delete the old first.
    # Or should they. That might leave a temporary gap in the network.
    # Delete old stuff
    if importHashSet:
        deleteSegment(modelSetKey=modelSetKey,
                      importGroupHashes=list(importHashSet))

    engine = CeleryDbConn.getDbEngine()
    conn = engine.connect()
    transaction = conn.begin()
    try:
        # Insert the GraphDb Objects
        if inserts:
            conn.execute(segmentTable.insert(), inserts)

        if chunkKeysForQueue:
            conn.execute(
                queueTable.insert(),
                [dict(modelSetId=m, chunkKey=c) for m, c in chunkKeysForQueue])

        loadItemKeys(conn, newItemKeys, modelSetId, modelSetKey)

        if inserts or chunkKeysForQueue or newItemKeys:
            transaction.commit()
        else:
            transaction.rollback()

        logger.info("Inserted %s queued %s chunks in %s",
                    len(inserts), len(chunkKeysForQueue),
                    (datetime.now(pytz.utc) - startTime))

    except Exception:
        transaction.rollback()
        raise

    finally:
        conn.close()


def compileGrids(self, payloadEncodedArgs: bytes) -> List[str]:
    """ Compile Grids Task

    :param self: A celery reference to this task
    :param payloadEncodedArgs: An encoded payload containing the queue tuples.
    :returns: A list of grid keys that have been updated.
    """
    argData = Payload().fromEncodedPayload(payloadEncodedArgs).tuples
    queueItems = argData[0]
    queueItemIds: List[int] = argData[1]

    gridKeys = list(set([i.gridKey for i in queueItems]))
    coordSetIdByGridKey = {i.gridKey: i.coordSetId for i in queueItems}

    queueTable = GridKeyCompilerQueue.__table__
    gridTable = GridKeyIndexCompiled.__table__

    startTime = datetime.now(pytz.utc)

    session = CeleryDbConn.getDbSession()
    engine = CeleryDbConn.getDbEngine()
    conn = engine.connect()
    transaction = conn.begin()

    try:
        logger.debug("Starting compile of %s queueItems in %s",
                     len(queueItems), (datetime.now(pytz.utc) - startTime))

        total = 0

        dispData = _qryDispData(session, gridKeys)

        conn.execute(gridTable.delete(gridTable.c.gridKey.in_(gridKeys)))

        transaction.commit()
        transaction = conn.begin()

        inserts = []
        for gridKey, dispJsonStr in dispData.items():
            m = hashlib.sha256()
            m.update(gridKey.encode())
            m.update(dispJsonStr.encode())
            gridTupleHash = b64encode(m.digest()).decode()

            gridTuple = GridTuple(
                gridKey=gridKey,
                dispJsonStr=dispJsonStr,
                lastUpdate=gridTupleHash
            )

            encodedGridTuple = Payload(tuples=[gridTuple]).toEncodedPayload()

            inserts.append(dict(coordSetId=coordSetIdByGridKey[gridKey],
                                gridKey=gridKey,
                                lastUpdate=gridTupleHash,
                                encodedGridTuple=encodedGridTuple))

        if inserts:
            conn.execute(gridTable.insert(), inserts)

        logger.debug("Compiled %s gridKeys, %s missing, in %s",
                     len(inserts), len(gridKeys) - len(inserts),
                     (datetime.now(pytz.utc) - startTime))

        total += len(inserts)

        conn.execute(queueTable.delete(queueTable.c.id.in_(queueItemIds)))

        transaction.commit()
        logger.info("Compiled and Committed %s GridKeyIndexCompileds in %s",
                    total, (datetime.now(pytz.utc) - startTime))

        return gridKeys

    except NotAllDispsCompiledException as e:
        logger.warning("Retrying, Not all disps for gridKey %s are compiled",
                       gridKeys)
        raise self.retry(exc=e, countdown=1)

    except Exception as e:
        transaction.rollback()
        logger.debug("Compile of grids failed, retrying : %s", gridKeys)
        raise self.retry(exc=e, countdown=2)

    finally:
        conn.close()
        session.close()


def compileLocationIndex(self, payloadEncodedArgs: bytes) -> List[str]:
    """ Compile Location Index Task

    :param self: A celery reference to this task
    :param payloadEncodedArgs: An encoded payload containing the queue tuples.
    :returns: A list of index buckets that have been updated.
    """
    argData = Payload().fromEncodedPayload(payloadEncodedArgs).tuples
    queueItems = argData[0]
    queueItemIds: List[int] = argData[1]

    indexBuckets = list(set([i.indexBucket for i in queueItems]))
    modelSetIdByIndexBucket = {i.indexBucket: i.modelSetId for i in queueItems}

    queueTable = LocationIndexCompilerQueue.__table__
    compiledTable = LocationIndexCompiled.__table__
    lastUpdate = datetime.now(pytz.utc).isoformat()

    startTime = datetime.now(pytz.utc)

    session = CeleryDbConn.getDbSession()
    engine = CeleryDbConn.getDbEngine()
    conn = engine.connect()
    transaction = conn.begin()

    try:
        logger.debug("Starting compile of %s queueItems in %s",
                     len(queueItems), (datetime.now(pytz.utc) - startTime))

        # Get Model Sets
        modelSetIds = list(set(modelSetIdByIndexBucket.values()))
        modelSetQry = (
            session.query(ModelSet.key, ModelSet.id)
            .filter(ModelSet.id.in_(modelSetIds))
        )
        modelSetKeyByModelSetId = {o.id: o.key for o in modelSetQry}

        total = 0

        dispData = _buildIndex(session, indexBuckets)

        conn.execute(compiledTable.delete(
            makeCoreValuesSubqueryCondition(engine, compiledTable.c.indexBucket,
                                            indexBuckets)
        ))

        transaction.commit()
        transaction = conn.begin()

        inserts = []
        for indexBucket, jsonStr in dispData.items():
            modelSetId = modelSetIdByIndexBucket[indexBucket]
            modelSetKey = modelSetKeyByModelSetId[modelSetId]

            m = hashlib.sha256()
            m.update(modelSetKey.encode())
            m.update(jsonStr.encode())
            dataHash = b64encode(m.digest()).decode()

            locationIndexTuple = LocationIndexTuple(
                modelSetKey=modelSetKey,
                indexBucket=indexBucket,
                jsonStr=jsonStr,
                lastUpdate=dataHash
            )

            blobData = Payload(tuples=[locationIndexTuple]).toEncodedPayload()

            inserts.append(dict(modelSetId=modelSetId,
                                indexBucket=indexBucket,
                                lastUpdate=dataHash,
                                blobData=blobData))

        if inserts:
            conn.execute(compiledTable.insert(), inserts)

        logger.debug("Compiled %s LocationIndexes, %s missing, in %s",
                     len(inserts), len(indexBuckets) - len(inserts),
                     (datetime.now(pytz.utc) - startTime))

        total += len(inserts)

        conn.execute(queueTable.delete(
            makeCoreValuesSubqueryCondition(engine, queueTable.c.id, queueItemIds)
        ))

        transaction.commit()
        logger.info("Compiled and Committed %s LocationIndexCompileds in %s",
                    total, (datetime.now(pytz.utc) - startTime))

        return indexBuckets

    except Exception as e:
        transaction.rollback()
        # logger.warning(e)  # Just a warning, it will retry
        logger.exception(e)
        raise self.retry(exc=e, countdown=2)

    finally:
        conn.close()
        session.close()


def removeBranches(self, modelSetKey: str, coordSetKey: str,
                   keys: List[str]) -> None:
    """ Remove Branches

    This worker task removes branches from the indexes.

    """
    startTime = datetime.now(pytz.utc)

    branchIndexTable = BranchIndex.__table__
    queueTable = BranchIndexCompilerQueue.__table__

    # Create a lookup of CoordSets by ID
    dbSession = CeleryDbConn.getDbSession()
    try:
        coordSet = dbSession.query(ModelCoordSet) \
            .filter(ModelCoordSet.modelSet.has(key=modelSetKey)) \
            .filter(ModelCoordSet.key == coordSetKey) \
            .one()

        dbSession.expunge_all()

    finally:
        dbSession.close()

    engine = CeleryDbConn.getDbEngine()
    conn = engine.connect()
    transaction = conn.begin()

    try:
        items = conn.execute(select(
            distinct=True,
            columns=[branchIndexTable.c.id, branchIndexTable.c.chunkKey],
            whereclause=and_(branchIndexTable.c.key.in_(keys),
                             branchIndexTable.c.coordSetId == coordSet.id)
        )).fetchall()

        branchIndexIds = [i.id for i in items]
        chunkKeys = set([i.chunkKey for i in items])

        _deleteBranchDisps(conn, branchIndexIds)

        # 1) Delete existing branches
        conn.execute(
            branchIndexTable.delete(branchIndexTable.c.id.in_(branchIndexIds))
        )

        # 2) Queue chunks for recompile
        conn.execute(
            queueTable.insert(),
            [dict(modelSetId=coordSet.modelSetId, chunkKey=c) for c in chunkKeys]
        )

        transaction.commit()

        logger.debug("Deleted %s BranchIndexes queued %s chunks in %s",
                     len(branchIndexIds), len(chunkKeys),
                     (datetime.now(pytz.utc) - startTime))

    except Exception as e:
        transaction.rollback()
        logger.debug("Retrying removeBranches, %s", e)
        logger.exception(e)
        raise self.retry(exc=e, countdown=3)

    finally:
        conn.close()
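

# --- Illustrative sketch (not part of the plugin code above) ---
# removeBranches filters ModelCoordSet by the key of its related ModelSet.
# At class level the relationship attribute has no .key, so the query above
# uses the relationship's .has() comparator. A minimal, self-contained example
# of that ORM pattern with hypothetical Parent/Child models:
from sqlalchemy import Column, ForeignKey, Integer, String, create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import Session, relationship

DemoBase = declarative_base()


class DemoParent(DemoBase):
    __tablename__ = "parent"
    id = Column(Integer, primary_key=True)
    key = Column(String)


class DemoChild(DemoBase):
    __tablename__ = "child"
    id = Column(Integer, primary_key=True)
    key = Column(String)
    parentId = Column(Integer, ForeignKey("parent.id"))
    parent = relationship(DemoParent)


def _demoRelationshipFilter():
    engine = create_engine("sqlite://")
    DemoBase.metadata.create_all(engine)
    session = Session(engine)
    session.add(DemoChild(key="coordSetA", parent=DemoParent(key="modelSetA")))
    session.commit()

    # Equivalent to ModelCoordSet.modelSet.has(key=modelSetKey) above
    return (session.query(DemoChild)
            .filter(DemoChild.parent.has(key="modelSetA"))
            .filter(DemoChild.key == "coordSetA")
            .one())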