def _queueCtreeDisable(self, settings, groupId, sourceName):
    args = {}
    filename = cs.getChimpScriptFilenameToUse(settings.paths["repository"],
                                              ("specifications", self.specificationName, "resources", "sql", "indexes"),
                                              "drop_ctree_%s_closure_indexes.sql" % (sourceName))
    args["filename"] = filename
    self.queue.queueTask(groupId, self.stream, "script", "Drop %s closure indexes" % (sourceName),
                         None, None, None, json.dumps(args), False)

    args = {}
    filename = cs.getChimpScriptFilenameToUse(settings.paths["repository"],
                                              ("specifications", self.specificationName, "resources", "sql", "ctree"),
                                              "%s_disable.sql" % (sourceName))
    args["filename"] = filename
    self.queue.queueTask(groupId, self.stream, "script", "Disable %s closure tree" % (sourceName),
                         None, None, None, json.dumps(args), False)
def _queueRemoveDuplicatesTask(self, groupId, stream, specificationName, toleranceLevel,
                               commitFrequency, checkpointBehaviour, removeDuplicates, paths):
    if removeDuplicates:
        self.queue.queueCheckpoint(groupId, stream, "major", toleranceLevel, commitFrequency, checkpointBehaviour)
        args = {}
        filename = cs.getChimpScriptFilenameToUse(paths["repository"],
                                                  ("specifications", specificationName, "resources", "sql", "import"),
                                                  "remove_%s_duplicates_from_stage.sql" % (specificationName))
        args["filename"] = filename
        self.queue.queueTask(groupId, stream, "script", "Remove duplicates", None, None, None, json.dumps(args), False)
        self.queue.queueCheckpoint(groupId, stream, "major", toleranceLevel, commitFrequency, checkpointBehaviour)
def __init__(self, queue, supportConnection, supportCursor, dataConnection, dataCursor,
             taskId, specification, paths, commitThreshold, appLogger):
    self.appLogger = appLogger
    self.commitThreshold = int(commitThreshold)
    self.queue = queue
    self.supportConnection = supportConnection
    self.supportCursor = supportCursor
    self.dataConnection = dataConnection
    self.dataCursor = dataCursor
    self.taskId = taskId
    self.specification = specification
    self.paths = paths

    # Prepare
    # =======
    self.lineCount = 0
    self.successCount = 0
    self.exceptionCount = 0
    self.errorCount = 0
    self.warningCount = 0
    self.noticeCount = 0
    self.ignoredCount = 0
    self.action = None
    self.importData = []
    self.messageSql = "select shared.add_task_message(%s,%s,%s,%s,%s,%s,%s,%s,%s)"

    # Load the per-record stage transformer modules
    self.transformFunctions = {}
    for thisRecord in specification.records:
        if thisRecord.useful:
            moduleFilename = cs.getChimpScriptFilenameToUse(paths["repository"],
                                                            ("specifications", specification.name, "resources", "py", "transformation", "stage"),
                                                            "%s_stage_transformer.py" % (thisRecord.table))
            module = imp.load_source("%s_stage_transformer.py" % (thisRecord.table), moduleFilename)
            self.transformFunctions[thisRecord.table] = module.transformSuppliedValues

    # Set simple variables for speed
    if specification.qualifier is not None:
        self.q = str(specification.qualifier)
    else:
        self.q = None
    self.d = str(specification.delimiter)
    if len(specification.records) == 1:
        self.onlyOneRecord = True
    else:
        self.onlyOneRecord = False
def queueTasks(queuer, settings, stream, specificationRestriction, groupId, appLogger):
    """Queue a syncCustomColumn task for every calc.custom_registry entry (optionally restricted)."""
    appLogger.debug("")
    appLogger.debug(" Custom column tasks")
    appLogger.debug(" -------------------")
    sql = "select specification_name, source_schema,source_name,output_column_list,seq,(select max(seq) from calc.custom_registry as m where m.specification_name=r.specification_name and m.source_schema=r.source_schema and m.source_name=r.source_name) as max_seq from calc.custom_registry as r"
    if specificationRestriction is not None:
        sql += " where specification_name in({0})".format(specificationRestriction)
    sql += " order by specification_name,seq"
    queuer.supportCursor.execute(sql)
    specificationCustomSources = queuer.supportCursor.fetchall()
    for custom in specificationCustomSources:
        specificationName = custom[0]
        inputSourceSchema = custom[1]
        inputSourceName = custom[2]
        outputCustomList = custom[3].split(",")
        seq = custom[4]
        maxSeq = custom[5]
        processorFilename = cs.getChimpScriptFilenameToUse(settings.paths["repository"],
                                                           ("specifications", specificationName, "resources", "py", "calculated"),
                                                           "{0}_calculated_data_processor.py".format(inputSourceName))
        processorFilename = processorFilename.replace("\\", "\\\\")
        args = {}
        args["inputSourceSchema"] = inputSourceSchema
        args["inputSourceName"] = inputSourceName
        args["customList"] = outputCustomList
        args["processorFilename"] = processorFilename
        args["flushQueue"] = (seq == maxSeq)
        queuer.queue.queueTask(groupId, stream, "syncCustomColumn",
                               "Refresh custom columns {0} on {1}".format(outputCustomList, inputSourceName),
                               None, None, None, json.dumps(args), False)
        appLogger.debug(" syncCustomColumn [{0}]".format(args))
    queuer.queue.queueCheckpoint(groupId, stream, "major", settings.args.tolerancelevel,
                                 queuer.commitFrequency, queuer.checkpointBehaviour)
    queuer.supportCursor.connection.commit()
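# Worked example for queueTasks() above (registry contents hypothetical): three
# calc.custom_registry rows for source "parcels" with seq 1, 2 and 3 queue three
# syncCustomColumn tasks in seq order; only the seq == max_seq task is queued with
# flushQueue=True, so the downstream queue is flushed once, after the last column set.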
def processSendToImport(self, loopConnection, dataConnection, dataCursor, settings, taskId,
                        processLimit, specification, args):

    def getAction(sendMode, identification):
        if sendMode == "full":
            action = "insert"
        elif sendMode == "change":
            action = identification
        elif sendMode == "sync":
            action = "merge"
        else:
            raise ValueError("Unknown send mode: {0}".format(sendMode))
        return action

    def getSendMode(importMode, fileIntent, hasData, appLogger):
        # Settle on what it is we're doing
        #
        # importMode - auto
        #            - full
        #            - change
        #            - sync
        #
        # fileIntent - undefined
        #            - full
        #            - change
        #            - mixed
        #
        if importMode == "auto":
            if fileIntent == "undefined":
                if hasData:
                    mode = "sync"
                else:
                    mode = "full"
            elif fileIntent == "full":
                mode = "full"
            elif fileIntent == "change":
                mode = "change"
            elif fileIntent == "mixed":
                raise ValueError("Imports of mixed file intents not supported")
        elif importMode == "full":
            mode = "full"
        elif importMode == "change":
            mode = "change"
        elif importMode == "sync":
            mode = "sync"
        else:
            raise ValueError("Unknown import mode: {0}".format(importMode))
        appLogger.debug("| {0} (importMode={1} fileIntent={2} hasData={3})".format(mode, importMode, fileIntent, hasData))
        return mode

    appLogger = settings.appLogger
    commitThreshold = int(settings.env["dataCommitThreshold"])
    table = args["table"]
    importMode = args["importMode"]
    fileIntent = args["fileIntent"]
    strategy = args["strategy"]
    hasData = args["hasData"]
    sendMode = getSendMode(importMode, fileIntent, hasData, appLogger)
    self.queue.startTask(taskId, True)

    sql = "select count(*) from stage.{0}".format(table)
    self.supportCursor.execute(sql)
    scanCount = self.supportCursor.fetchone()[0]
    self.queue.setScanResults(taskId, scanCount)
    appLogger.debug("| Scan count = {0}".format(scanCount))

    lineCount = 0
    successCount = 0
    exceptionCount = 0
    errorCount = 0
    warningCount = 0
    noticeCount = 0
    ignoredCount = 0

    # Grab record
    for r in specification.records:
        if r.table == table:
            record = r

    appLogger.debug("|")
    appLogger.debug("| {0}".format(table))

    # BUILD DML STATEMENTS FOR THIS RECORD
    # ------------------------------------
    selectColumns = []
    insertPlaceholder = "select * from import.{0}_insert(".format(table)
    insertPlaceholder += "%s,%s"
    if not record.editable:
        insertPlaceholder += ",%s,%s"
    updatePlaceholder = "select * from import.{0}_update(".format(table)
    updatePlaceholder += "%s"
    if not record.editable:
        updatePlaceholder += ",%s,%s"
    mergePlaceholder = "select * from import.{0}_merge(".format(table)
    mergePlaceholder += "%s,%s"
    if not record.editable:
        mergePlaceholder += ",%s,%s"
    if record.hasPrimaryKey():
        deletePlaceholder = "select * from import.{0}_delete(%s".format(record.table)
        for column in record.primaryKeyColumns:
            deletePlaceholder += ",%s"
        deletePlaceholder += ")"
    else:
        deletePlaceholder = None
    for thisField in record.fields:
        if thisField.column is not None:
            selectColumns.append(thisField.column)
            insertPlaceholder += ",%s"
            updatePlaceholder += ",%s"
            mergePlaceholder += ",%s"
    for thisField in record.additionalFields:
        insertPlaceholder += ",%s"
        updatePlaceholder += ",%s"
        mergePlaceholder += ",%s"
    insertPlaceholder += ")"
    updatePlaceholder += ")"
    mergePlaceholder += ")"

    # Grab transformer functions
    moduleFilename = cs.getChimpScriptFilenameToUse(settings.paths["repository"],
                                                    ("specifications", specification.name, "resources", "py", "transformation", "import"),
                                                    "{0}_import_transformer.py".format(table))
    module = imp.load_source("{0}_import_transformer.py".format(record.table), moduleFilename)
    transformer = module.transformSuppliedValues

    loopSql = "select id,task_id,{0},identification from stage.{1}".format(",".join(selectColumns), table)
    selectCount = 3 + len(selectColumns)

    # DEBUG:
    appLogger.debug("| Pre-computed statements:")
    appLogger.debug("| loopSql : {0}".format(loopSql))
    appLogger.debug("| insertPlaceholder : {0}".format(insertPlaceholder))
    appLogger.debug("| updatePlaceholder : {0}".format(updatePlaceholder))
    appLogger.debug("| mergePlaceholder : {0}".format(mergePlaceholder))
    appLogger.debug("| deletePlaceholder : {0}".format(deletePlaceholder))

    # Loop through all staged records
    loopCursor = loopConnection.makeCursor("loopCursor", True, True)
    loopCursor.execute(loopSql)
    for data in loopCursor:
        if lineCount % 1000 == 0:
            self.queue.setTaskProgress(taskId, successCount, exceptionCount, errorCount,
                                       warningCount, noticeCount, ignoredCount)
        lineCount = lineCount + 1
        if lineCount % commitThreshold == 0:
            appLogger.debug("| << Transaction size threshold reached ({0}): COMMIT >>".format(lineCount))
            dataConnection.connection.commit()
        identification = data["identification"]
        workingRow = data
        del data[selectCount - 1]
        workingRow = transformer(dataCursor, workingRow)
        action = getAction(sendMode, identification)
        if action == "insert":
            dataCursor.execute(insertPlaceholder, tuple(workingRow))
        elif action == "update":
            del workingRow[0]
            dataCursor.execute(updatePlaceholder, tuple(workingRow))
        elif action == "delete":
            pass
            # deleteParams=[]
            # deleteParams.append(stagedRow[1])
            # for thisPkColumn in pkColumnLists[data[0]]:
            #     deleteParams.append(stagedRow[thisPkColumn])
            # sql = deletePlaceholders[data[0]]
            # dataCursor.execute(sql, tuple(deleteParams))
        elif action == "merge":
            dataCursor.execute(mergePlaceholder, tuple(workingRow))

        warningFlag = False
        errorFlag = False
        exceptionFlag = False
        messages = dataCursor.fetchall()
        success = True
        for thisMessage in messages:
            msgLevel = thisMessage[0]
            msgCode = thisMessage[1]
            msgTitle = thisMessage[2]
            msgAffectedColumns = thisMessage[3]
            msgAffectedRowCount = thisMessage[4]
            msgContent = thisMessage[5]
            self.queue.addTaskMessage(taskId, record.table, lineCount, msgLevel, msgCode, msgTitle,
                                      msgAffectedColumns, msgAffectedRowCount,
                                      "{0}: {1}".format(msgContent, data))
            if msgLevel == "warning":
                warningFlag = True
                success = False
            elif msgLevel == "error":
                errorFlag = True
                success = False
            elif msgLevel == "exception":
                exceptionFlag = True
                success = False
            elif msgLevel == "notice":
                noticeCount += 1
        if success:
            successCount = successCount + 1
        else:
            if exceptionFlag:
                exceptionCount += 1
            elif errorFlag:
                errorCount += 1
            elif warningFlag:
                warningCount += 1
    loopCursor.close()
    return (successCount, exceptionCount, errorCount, warningCount, ignoredCount, noticeCount)
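# A minimal standalone sketch (illustrative only; never called by the pipeline) of
# how processSendToImport() resolves a per-row action: getSendMode() picks the
# overall mode from importMode/fileIntent/hasData, then getAction() maps the mode,
# per staged row, onto that row's identification column.
def _exampleResolveAction(importMode, fileIntent, hasData, identification):
    if importMode == "auto":
        if fileIntent == "undefined":
            mode = "sync" if hasData else "full"
        else:
            mode = fileIntent          # "full" or "change"; "mixed" is rejected
    else:
        mode = importMode              # explicit "full", "change" or "sync"
    if mode == "full":
        return "insert"                # every staged row is treated as new
    elif mode == "change":
        return identification          # the staged row names its own action
    else:
        return "merge"                 # sync mode merges every row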
def processSendToEditable(self, loopConnection, dataConnection, dataCursor, settings, taskId,
                          processLimit, specification, args):
    """Propagate import-schema changes (deletes first, then inserts/updates) into the editable schema."""
    commitThreshold = int(settings.env["dataCommitThreshold"])
    appLogger = settings.appLogger
    self.queue.startTask(taskId, True)

    # Get last time schemas synchronised
    sql = "select last_sent_to_editable from shared.specification_registry where name=%s"
    self.supportCursor.execute(sql, (specification.name,))
    lastImportTimestamp = self.supportCursor.fetchone()[0]
    appLogger.debug("| lastImportTimestamp : {0}".format(lastImportTimestamp))

    # Grab record
    table = args["table"]
    for r in specification.records:
        if r.table == table:
            thisRecord = r

    # Scanning
    # ========
    affectedRecordCount = 0
    appLogger.debug("| Scanning {0}:".format(table))

    # Count records that have been inserted/updated
    if lastImportTimestamp is None:
        sql = "select count(*) from import.%s" % (table)
        self.supportCursor.execute(sql)
    else:
        sql = "select count(*) from import.%s where modified >" % (table)
        sql = sql + "%s"
        self.supportCursor.execute(sql, (lastImportTimestamp,))
    recordsModified = self.supportCursor.fetchone()[0]
    appLogger.debug("| {0} (modified)".format(recordsModified))
    affectedRecordCount = affectedRecordCount + recordsModified

    # Count records that have been deleted
    if lastImportTimestamp is None:
        sql = "select count(*) from history.import_%s_deletes" % (table)
        self.supportCursor.execute(sql)
    else:
        sql = "select count(*) from history.import_%s_deletes where deleted >" % (table)
        sql = sql + "%s"
        self.supportCursor.execute(sql, (lastImportTimestamp,))
    recordsModified = self.supportCursor.fetchone()[0]
    appLogger.debug("| {0} (deleted)".format(recordsModified))
    affectedRecordCount = affectedRecordCount + recordsModified

    appLogger.debug("| affectedRecordCount : {0} (total)".format(affectedRecordCount))
    self.queue.setScanResults(taskId, affectedRecordCount)

    lineCount = 0
    successCount = 0
    exceptionCount = 0
    errorCount = 0
    warningCount = 0
    noticeCount = 0
    ignoredCount = 0

    # Fire off the deletes
    # ====================
    appLogger.debug("|")
    appLogger.debug("| PROCESSING:")
    appLogger.debug("|")
    appLogger.debug("| DELETES")
    appLogger.debug("| {0}".format(thisRecord.table))
    sql = "select id from history.import_%s_deletes" % (thisRecord.table)
    if lastImportTimestamp is None:
        params = None
    else:
        sql = sql + " where deleted > %s"
        params = (lastImportTimestamp,)
    deleteDml = "delete from editable.%s" % (thisRecord.table)
    deleteDml = deleteDml + " where id = %s"
    loopCursor = loopConnection.makeCursor("loopCursor", True, True)
    loopCursor.execute(sql, params)
    for data in loopCursor:
        if lineCount % 1000 == 0:
            self.queue.setTaskProgress(taskId, successCount, exceptionCount, errorCount,
                                       warningCount, noticeCount, ignoredCount)
        lineCount = lineCount + 1
        if lineCount % commitThreshold == 0:
            appLogger.debug("| << Transaction size threshold reached ({0}): COMMIT >>".format(lineCount))
            dataConnection.connection.commit()
        # Decision call to go here
        deleteAllowed = True
        if deleteAllowed:
            successCount = successCount + 1
            dataCursor.execute(deleteDml, (data[0],))
        else:
            warningCount = warningCount + 1
    loopCursor.connection.commit()

    # Fire off the inserts/updates
    # ============================
    appLogger.debug("|")
    appLogger.debug("| INSERT/UPDATE")
    placeholder = "%s,%s,%s,%s"
    for thisField in thisRecord.fields:
        if thisField.column is not None:
            placeholder = placeholder + ",%s"
    for thisField in thisRecord.additionalFields:
        placeholder = placeholder + ",%s"
    appLogger.debug("| {0}".format(thisRecord.table))

    # OPTIMISE:
    # Is there any data for this record in editable?
    # If not, then don't bother with the costly merge view.
    sql = "select exists (select 1 from editable.{0} limit 1)".format(thisRecord.table)
    self.supportCursor.execute(sql)
    dataExists = self.supportCursor.fetchone()
    dataExists = dataExists[0]
    appLogger.debug("| dataExists: {0}".format(dataExists))

    # Build SQL statement to find
    # all affected records
    columnList = []
    columnList.append("id")
    if dataExists:
        columnList.append("editable_record_exists")
        importSliceStart = 2
    else:
        importSliceStart = 1
    importSliceEnd = importSliceStart - 1
    for thisField in thisRecord.fields:
        if thisField.column is not None:
            columnList.append(thisField.column)
            importSliceEnd = importSliceEnd + 1
    for thisField in thisRecord.additionalFields:
        columnList.append(thisField.column)
        importSliceEnd = importSliceEnd + 1
    columnList.append("created")
    columnList.append("modified")
    if dataExists:
        for thisField in thisRecord.fields:
            if thisField.column is not None:
                columnList.append("e_%s" % (thisField.column))
        for thisField in thisRecord.additionalFields:
            columnList.append("e_%s" % (thisField.column))
        columnList.append("e_visibility")
        columnList.append("e_security")
    originalEnd = len(columnList) - 1
    if dataExists:
        source = "shared.{0}_to_merge_into_editable".format(thisRecord.table)
    else:
        source = "import.{0}".format(thisRecord.table)
    sql = "select {0} from {1}".format(",".join(columnList), source)
    if lastImportTimestamp is None:
        params = None
    else:
        sql = sql + " where modified > %s::timestamp"
        params = (lastImportTimestamp,)

    # BUILD DML Statements
    placeholder = "%s,%s,%s,%s"
    for thisField in thisRecord.fields:
        if thisField.column is not None:
            placeholder = placeholder + ",%s"
    for thisField in thisRecord.additionalFields:
        placeholder = placeholder + ",%s"
    insertDml = "select * from editable.%s_insert(%s)" % (thisRecord.table, placeholder)
    updateDml = "select * from editable.%s_update(%s)" % (thisRecord.table, placeholder)

    # Grab transformer function
    moduleFilename = cs.getChimpScriptFilenameToUse(settings.paths["repository"],
                                                    ("specifications", specification.name, "resources", "py", "transformation", "editable"),
                                                    "%s_editable_transformer.py" % (thisRecord.table))
    module = imp.load_source("%s_editable_transformer.py" % (thisRecord.table), moduleFilename)
    transformFunction = module.transformSuppliedValues

    # Loop through all inserted/updated records
    appLogger.debug("| loopSql : {0}".format(sql))
    appLogger.debug("| insertDml : {0}".format(insertDml))
    appLogger.debug("| updateDml : {0}".format(updateDml))
    loopCursor = loopConnection.makeCursor("loopCursor", True, True)
    loopCursor.execute(sql, params)
    for data in loopCursor:
        if lineCount % 1000 == 0:
            self.queue.setTaskProgress(taskId, successCount, exceptionCount, errorCount,
                                       warningCount, noticeCount, ignoredCount)
        lineCount = lineCount + 1
        if lineCount % commitThreshold == 0:
            appLogger.debug("| << Transaction size threshold reached ({0}): COMMIT >>".format(lineCount))
            dataConnection.connection.commit()

        # Transform values
        transformedValues = transformFunction(dataCursor, data)

        # Assemble values to apply
        applyValues = [data[0], "import"]
        applyValues.extend(data[importSliceStart : importSliceEnd + 1])
        applyValues.extend(transformedValues[originalEnd + 1 :])
        if dataExists:
            if data["editable_record_exists"]:
                dataCursor.execute(updateDml, applyValues)
                messages = dataCursor.fetchall()
            else:
                dataCursor.execute(insertDml, applyValues)
                messages = dataCursor.fetchall()
        else:
            dataCursor.execute(insertDml, applyValues)
            messages = dataCursor.fetchall()
        success = True
        for thisMessage in messages:
            msgLevel = thisMessage[0]
            msgCode = thisMessage[1]
            msgTitle = thisMessage[2]
            msgAffectedColumns = thisMessage[3]
            msgAffectedRowCount = thisMessage[4]
            msgContent = thisMessage[5]
            self.queue.addTaskMessage(taskId, thisRecord.table, lineCount, msgLevel, msgCode, msgTitle,
                                      msgAffectedColumns, msgAffectedRowCount,
                                      "{0}: {1}".format(msgContent, transformedValues))
            if msgLevel == "warning":
                warningCount += 1
                success = False
            elif msgLevel == "error":
                errorCount += 1
                success = False
            elif msgLevel == "exception":
                exceptionCount += 1
                success = False
            elif msgLevel == "notice":
                noticeCount += 1
        if success:
            successCount = successCount + 1
    loopCursor.close()
    return (successCount, exceptionCount, errorCount, warningCount, ignoredCount, noticeCount)
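# The editable.<table>_insert/_update functions called above are assumed to return
# message rows shaped (level, code, title, affected_columns, affected_row_count,
# content) -- e.g. ('error', 'E042', 'Null key', 'id', 1, '...'), where the code
# and title shown here are hypothetical. A row whose messages are all notices (or
# that returns no messages) still counts as a success.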
def makeEditableFile(self, loopConnection, dataConnection, dataCursor, settings, taskId,
                     processLimit, specification, args):
    """Write a SQL file of editable.<table>_insert calls for the import data (only when editable is empty)."""
    table = args["table"]
    appLogger = settings.appLogger
    self.queue.startTask(taskId, True)
    appLogger.debug("| {0}:".format(table))

    # Any editable data here already?
    # ===============================
    sql = "select exists (select 1 from editable.{0} limit 1)".format(table)
    self.supportCursor.execute(sql)
    dataExists = self.supportCursor.fetchone()
    dataExists = dataExists[0]
    appLogger.debug("| dataExists: {0}".format(dataExists))

    # Get current timestamp
    # =====================
    sql = "select now()"
    self.supportCursor.execute(sql)
    thisImportStartTimestamp = self.supportCursor.fetchone()[0]
    appLogger.debug("| thisImportStartTimestamp : {0}".format(thisImportStartTimestamp))

    # Get last time schemas synchronised
    # ==================================
    sql = "select last_sent_to_editable from shared.specification_registry where name=%s"
    self.supportCursor.execute(sql, (specification.name,))
    lastImportTimestamp = self.supportCursor.fetchone()[0]
    appLogger.debug("| lastImportTimestamp : {0}".format(lastImportTimestamp))

    # Scanning
    # ========
    appLogger.debug("| Scanning")

    # Modified
    scanSql = "select count(*) from import.{0}".format(table)
    if lastImportTimestamp is not None:
        scanSql += " where modified >%s"
        self.supportCursor.execute(scanSql, (lastImportTimestamp,))
    else:
        self.supportCursor.execute(scanSql)
    modifiedCount = self.supportCursor.fetchone()[0]
    appLogger.debug("| Modified = {0}".format(modifiedCount))

    # Deleted
    scanSql = "select count(*) from history.import_{0}_deletes".format(table)
    if lastImportTimestamp is not None:
        scanSql += " where deleted >%s"
        self.supportCursor.execute(scanSql, (lastImportTimestamp,))
    else:
        self.supportCursor.execute(scanSql)
    deletedCount = self.supportCursor.fetchone()[0]
    appLogger.debug("| Deleted = {0}".format(deletedCount))

    totalCount = modifiedCount + deletedCount
    appLogger.debug("| {0}".format(totalCount))
    self.queue.setScanResults(taskId, totalCount)

    # Grab transformer function
    # =========================
    moduleFilename = cs.getChimpScriptFilenameToUse(settings.paths["repository"],
                                                    ("specifications", specification.name, "resources", "py", "transformation", "editable"),
                                                    "%s_editable_transformer.py" % (table))
    module = imp.load_source("%s_editable_transformer.py" % (table), moduleFilename)
    transformFunction = module.transformSuppliedValues

    # Establish files
    # ===============
    filename = os.path.join(settings.env["tempPath"], "insert_into_editable_{0}.sql".format(table))
    appLogger.debug("|")
    appLogger.debug("| Filename: {0}".format(filename))
    insertFile = open(filename, "w")

    # Calculate DML placeholders
    # ==========================
    insertDml = "execute editable.{0}_insert(%s,%s".format(table)
    i = args["selectListLength"]
    while i > 0:
        insertDml += ",%s"
        i -= 1
    insertDml += ',"import");'
    appLogger.debug("| insertDml : {0}".format(insertDml))

    loopSql = "select {0} from import.{1}".format(args["selectList"], table)
    loopCursor = loopConnection.makeCursor("loopCursor", True, True)
    loopCursor.execute(loopSql)

    lineCount = 0
    successCount = 0
    exceptionCount = 0
    errorCount = 0
    warningCount = 0
    noticeCount = 0
    ignoredCount = 0
    if not dataExists:
        for data in loopCursor:
            if lineCount % 1000 == 0:
                self.queue.setTaskProgress(taskId, successCount, exceptionCount, errorCount,
                                           warningCount, noticeCount, ignoredCount)
            lineCount = lineCount + 1
            transformedValues = transformFunction(dataCursor, data)
            quoted = str(psycopg2.extensions.adapt(transformedValues).getquoted())
            quoted = quoted[8:-2]  # strip the adapter wrapping (b"ARRAY[" ... "]") to leave a bare value list; assumes a list input
            line = "select editable.{0}_insert({1}, 'import');\n".format(table, quoted)
            insertFile.write(line)
            successCount += 1
            # line = self.supportCursor.mogrify(insertDml,transformedValues)
    insertFile.close()
    loopCursor.close()
    appLogger.debug("| Finished.")
    self.supportConnection.connection.commit()
    return (successCount, exceptionCount, errorCount, warningCount, ignoredCount, noticeCount)
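# Each line makeEditableFile() writes is a self-contained insert call, e.g. (table
# name and values hypothetical):
#     select editable.parcel_insert(42,'ABC-123','SMITH', 'import');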
def processSolrDocuments(queue, supportConnection, supportCursor, loopConnection, dataConnection,
                         dataCursor, settings, taskId, processLimit, args):
    """Format queued Solr documents and apply them to the Solr document store."""
    # Init
    lineCount = 0
    successCount = 0
    exceptionCount = 0
    errorCount = 0
    warningCount = 0
    noticeCount = 0
    ignoredCount = 0
    appLogger = settings.appLogger
    commitThreshold = int(settings.env["dataCommitThreshold"])
    messageSql = "select shared.add_task_message(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    documentName = args["documentName"]
    serverName = args["serverName"]
    fieldCount = args["fieldCount"] - 1

    filename = "{0}_document_formatter.py".format(documentName)
    moduleToUse = cs.getChimpScriptFilenameToUse(settings.paths["repository"],
                                                 ["specifications", args["specification"], "resources", "py", "solr_formatting"],
                                                 filename)
    module = imp.load_source(filename, moduleToUse)
    conversionFunctions = module.DocumentFormatter()
    conversionFunction = conversionFunctions.getSolrDocument

    # Publish count
    queue.startTask(taskId, True)
    sql = "select count(*) from {0}.{1}_solr_document_queue_view".format(CALC_SCHEMA, documentName)
    supportCursor.execute(sql)
    documentCount = supportCursor.fetchone()[0]
    queue.setScanResults(taskId, documentCount)
    appLogger.info(" | documentCount : {0}".format(documentCount))

    sql = "select exists(select 1 from {0}.{1} where document_type=%s limit 1)".format(SOLR_SCHEMA, serverName)
    supportCursor.execute(sql, (documentName,))
    documentsExist = supportCursor.fetchone()[0]
    appLogger.info(" | documentsExist: {0}".format(documentsExist))

    # Apply
    applySql = "select * from {0}.apply_{1}(".format(SOLR_SCHEMA, serverName)
    i = 0
    while i < fieldCount:
        applySql += "%s,"
        i += 1
    # if documentsExist:
    #     applySql += "true)"
    # else:
    #     applySql += "false)"
    applySql += "false)"
    appLogger.info(" | applySql : {0}".format(applySql))

    # Establish main loop
    loopSql = "select * from {0}.{1}_solr_document_queue_view as a".format(CALC_SCHEMA, documentName)
    appLogger.info(" | loopSql : {0}".format(loopSql))

    # Flushing
    if documentsExist:
        loopCursor = loopConnection.makeCursor("solrFlush", True, True)
        loopCursor.execute(loopSql)
        appLogger.info(" | Flushing:")
        flushDml = "delete from {0}.{1} where document_type=%s and document_key=%s".format(SOLR_SCHEMA, serverName)
        appLogger.info(" | flushDml: {0}".format(flushDml))
        for record in loopCursor:
            solrDocument = None
            solrDocument = conversionFunction(supportCursor, record)
            appLogger.info(" | {0}".format(solrDocument[2]))
            dataCursor.execute(flushDml, (documentName, solrDocument[2]))
        loopCursor.close()

    loopCursor = loopConnection.makeCursor("solr", True, True)
    loopCursor.execute(loopSql)
    lineCount = 0

    # Truncate table
    truncateDml = "delete from {0}.{1}_solr_document_queue".format(CALC_SCHEMA, documentName)
    appLogger.info(" | truncateDml : {0}".format(truncateDml))

    for record in loopCursor:
        if lineCount % 1000 == 0:
            queue.setTaskProgress(taskId, successCount, 0, 0, 0, 0, 0)
        lineCount = lineCount + 1
        if lineCount % commitThreshold == 0:
            appLogger.debug("| << Transaction size threshold reached ({0}): COMMIT >>".format(lineCount))
            dataConnection.connection.commit()
        try:
            solrDocument = None
            solrDocument = conversionFunction(supportCursor, record)
            dataCursor.execute(applySql, solrDocument)
            messages = dataCursor.fetchall()
            messagesFound = False
            raisedWarning = False
            raisedError = False
            raisedException = False
            for thisMessage in messages:
                messagesFound = True
                messageLevel = thisMessage[0]
                messageCode = thisMessage[1]
                messageTitle = thisMessage[2]
                messageAffectedColumns = thisMessage[3]
                messageAffectedRowCount = thisMessage[4]
                messageContent = "{0}\n\nDocument data being applied:\n{1}".format(thisMessage[5], solrDocument)
                supportCursor.execute(messageSql, (taskId, None, lineCount, messageLevel, messageCode,
                                                   messageTitle, messageAffectedColumns,
                                                   messageAffectedRowCount, messageContent))
                if messageLevel == "warning":
                    raisedWarning = True
                elif messageLevel == "error":
                    raisedError = True
                elif messageLevel == "exception":
                    raisedException = True
                elif messageLevel == "notice":
                    noticeCount = noticeCount + 1
            if messagesFound:
                if raisedException:
                    exceptionCount = exceptionCount + 1
                elif raisedError:
                    errorCount = errorCount + 1
                elif raisedWarning:
                    warningCount = warningCount + 1
                else:
                    successCount = successCount + 1
            else:
                successCount = successCount + 1
        except Exception as detail:
            exceptionCount = exceptionCount + 1
            if exceptionCount < 4:
                print('Error processing Solr document (see logs)')
            appLogger.error(" |")
            appLogger.error(" | EXCEPTION PROCESSING SOLR DOCUMENT")
            appLogger.error(" | Filename: {0} ({1})".format(filename, moduleToUse))
            appLogger.error(" | ConversionFunction: {0}".format(conversionFunction))
            appLogger.error(" | ApplySql: {0}".format(applySql))
            appLogger.error(" | {0}".format(str(detail)))
            appLogger.error(" | Record: {0}".format(record))
            appLogger.error(" | SolrDocument: {0}".format(solrDocument))
            appLogger.error(" |")
            queue.addTaskMessage(taskId, None, lineCount, "exception", "EXP",
                                 "Exception processing SolrDocument", None, 1,
                                 "ERROR: {0} RECORD: {1}".format(detail, record))
    loopCursor.close()

    if (exceptionCount > 0 or errorCount > 0):
        dataConnection.connection.rollback()
    else:
        dataCursor.execute(truncateDml)
    queue.finishTask(taskId, successCount, exceptionCount, errorCount, warningCount, noticeCount, ignoredCount)
    return (successCount, exceptionCount, errorCount, warningCount, ignoredCount, noticeCount)
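# An illustrative args payload for processSolrDocuments() (all values hypothetical):
#     {"documentName": "property", "serverName": "main_core",
#      "fieldCount": 12, "specification": "land"}
# Note that applySql is built with fieldCount - 1 placeholders plus a literal
# trailing false, i.e. the apply_<server>() function is assumed to take fieldCount
# arguments with a boolean flag last.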
def queueTasks(queuer, settings, stream, specificationRestriction, groupId, appLogger):
    """Queue a syncPins task for every calc.pin_registry entry (optionally restricted)."""
    appLogger.debug("")
    appLogger.debug(" Pin tasks")
    appLogger.debug(" ---------")
    sql = "select specification_name, pin_name, input_id_column, input_x_column, input_y_column, input_schema, input_source_name, input_column_list, output_column_list, where_clause from calc.pin_registry"
    if specificationRestriction is not None:
        sql += " where specification_name in ({0})".format(specificationRestriction)
    queuer.supportCursor.execute(sql)
    specificationPins = queuer.supportCursor.fetchall()

    # pin_registry columns:
    #  [0] specification_name   [1] pin_name           [2] input_id_column
    #  [3] input_x_column       [4] input_y_column     [5] input_schema
    #  [6] input_source_name    [7] input_column_list  [8] output_column_list
    #  [9] where_clause
    for pin in specificationPins:
        specificationName = pin[0]
        pinName = pin[1]
        appLogger.debug(" * {0}".format(pinName))
        sql = "select pinhead.%s_exists()" % (pinName)
        queuer.supportCursor.execute(sql)
        pinsExist = queuer.supportCursor.fetchone()[0]
        if not pinsExist:
            args = {}
            filename = cs.getChimpScriptFilenameToUse(settings.paths["repository"],
                                                      ("specifications", specificationName, "resources", "sql", "indexes"),
                                                      "drop_pinhead_%s_indexes.sql" % (pinName))
            appLogger.debug(" No pins... drop via '{0}'".format(filename))
            args["filename"] = filename
            queuer.queue.queueTask(groupId, stream, "script", "Drop %s pin indexes" % (pinName),
                                   None, None, None, json.dumps(args), False)
            queuer.queue.queueCheckpoint(groupId, stream, "major", queuer.toleranceLevel,
                                         queuer.commitFrequency, queuer.checkpointBehaviour)

        sourceName = pin[6]
        args = {}
        args["pinName"] = pinName
        args["inputIdColumn"] = pin[2]
        args["inputXColumn"] = pin[3]
        args["inputYColumn"] = pin[4]
        args["inputSchema"] = pin[5]
        args["inputSourceName"] = sourceName
        args["inputColumnList"] = pin[7]
        args["outputColumnList"] = pin[8]
        args["whereClause"] = pin[9]
        processorFilename = cs.getChimpScriptFilenameToUse(settings.paths["repository"],
                                                           ("specifications", specificationName, "resources", "py", "calculated"),
                                                           "{0}_calculated_data_processor.py".format(sourceName))
        processorFilename = processorFilename.replace("\\", "\\\\")
        args["processorFilename"] = processorFilename
        queuer.queue.queueTask(groupId, stream, "syncPins", "Refresh %s pins" % (pinName),
                               None, None, None, json.dumps(args), False)
        appLogger.debug(" syncPins [{0}]".format(args))

        if not pinsExist:
            args = {}
            filename = cs.getChimpScriptFilenameToUse(settings.paths["repository"],
                                                      ("specifications", specificationName, "resources", "sql", "indexes"),
                                                      "create_pinhead_%s_indexes.sql" % (pinName))
            appLogger.debug(" Rebuild pins... via '{0}'".format(filename))
            args["filename"] = filename
            queuer.queue.queueTask(groupId, stream, "script", "Build %s pin indexes" % (pinName),
                                   None, None, None, json.dumps(args), False)
            queuer.queue.queueCheckpoint(groupId, stream, "major", settings.args.tolerancelevel,
                                         queuer.commitFrequency, queuer.checkpointBehaviour)
    queuer.supportCursor.connection.commit()
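# Illustrative pin_registry row for queueTasks() above (values hypothetical): pin
# "site_pin" over input source "sites" with id/x/y columns "id","easting","northing"
# queues one syncPins task; the drop/create pinhead index scripts are queued around
# it only when pinhead.site_pin_exists() reports that no pins exist yet.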
def _queueFinishStageTask(self, groupId, stream, specificationName, paths):
    args = {}
    filename = cs.getChimpScriptFilenameToUse(paths["repository"],
                                              ("specifications", specificationName, "resources", "sql", "import"),
                                              "post_%s_staging.sql" % (specificationName))
    args["filename"] = filename
    self.queue.queueTask(groupId, stream, "script", "Finish stage", None, None, None, json.dumps(args), False)
def queueImport(self, groupId):
    settings = self.settings
    if settings.specification.dedicatedStagingAreaName is None:
        nativeStageSchema = "stage"
    else:
        nativeStageSchema = settings.specification.dedicatedStagingAreaName
    enableMv = False
    enableCtree = False
    # self.stream = settings.args.streamname
    # self.specificationName = settings.specification.name
    #
    # supportConnection = settings.db.makeConnection("support")
    # supportCursor = supportConnection.makeCursor("supportCursor", False, False)
    # self.commitFrequency = settings.args.commitfrequency
    # self.checkpointBehaviour = settings.args.checkpointbehaviour
    self.importMode = settings.args.importmode
    # (supportConnection, supportCursor) = settings.db.makeConnection("support", False, False)
    self.removeDuplicates = settings.specification.autoRemoveStageDuplicates

    # ===============
    # [1] Queue files
    # ===============
    if settings.args.json is not None:
        (queuedTasks, minTaskId, maxTaskId) = self._queueJSON(
            groupId, settings.specification, self.stream, self.specificationName,
            settings.args.limit, settings.args.tolerancelevel, self.commitFrequency,
            self.checkpointBehaviour, settings.paths, self.removeDuplicates, self.importMode)
        fileIntent = "undefined"
    elif settings.specification.sourceType == "csv":
        (queuedTasks, fileIntent, minTaskId, maxTaskId) = self._queueCsvFiles(
            groupId, settings.specification, self.stream, self.specificationName,
            settings.args.limit, settings.args.tolerancelevel, self.commitFrequency,
            self.checkpointBehaviour, settings.args.files, settings.paths,
            self.removeDuplicates, settings.args.recurse, settings.args.filenameregex,
            self.importMode)
    elif settings.specification.sourceType == "external":
        (queuedTasks, minTaskId, maxTaskId) = self._queueExternalLoaderFiles(
            groupId, self.stream, self.specificationName,
            settings.specification.externalLoaderName, nativeStageSchema,
            settings.specification.externalLoaderProfile,
            settings.specification.externalLoaderVariables,
            settings.args.limit, settings.args.tolerancelevel, self.commitFrequency,
            self.checkpointBehaviour, settings.args.files, settings.paths,
            settings.db.credentials, settings.env, self.removeDuplicates,
            settings.args.recurse, settings.args.filenameregex, self.importMode)
        fileIntent = "full"

    args = {}
    args["specification"] = self.specificationName

    # =======================
    sql = "select import.%s_exists()" % (self.specificationName)
    self.supportCursor.execute(sql)
    hasData = self.supportCursor.fetchone()[0]

    if not hasData:
        # ADD RECORD INDEX DROPS
        for thisRecord in settings.specification.records:
            if thisRecord.useful:
                args = {}
                filename = cs.getChimpScriptFilenameToUse(settings.paths["repository"],
                                                          ("specifications", self.specificationName, "resources", "sql", "indexes"),
                                                          "drop_import_%s_indexes.sql" % (thisRecord.table))
                args["filename"] = filename
                self.queue.queueTask(groupId, self.stream, "script", "Drop import.%s indexes" % (thisRecord.table),
                                     None, None, None, json.dumps(args), False)
        # ADD CHECKPOINT
        self.queue.queueCheckpoint(groupId, self.stream, "major", settings.args.tolerancelevel,
                                   self.commitFrequency, self.checkpointBehaviour)

        # ADD ENTITY RECORD INDEX DROPS AND DISABLE
        for thisEntity in settings.specification.entities:
            enableMv = True
            args = {}
            filename = cs.getChimpScriptFilenameToUse(settings.paths["repository"],
                                                      ("specifications", self.specificationName, "resources", "sql", "indexes"),
                                                      "drop_mv_%s_indexes.sql" % (thisEntity.name))
            args["filename"] = filename
            self.queue.queueTask(groupId, self.stream, "script", "Drop %s mv indexes" % (thisEntity.name),
                                 None, None, None, json.dumps(args), False)
            args = {}
            filename = cs.getChimpScriptFilenameToUse(settings.paths["repository"],
                                                      ("specifications", self.specificationName, "resources", "sql", "mv"),
                                                      "%s_disable.sql" % (thisEntity.name))
            args["filename"] = filename
            self.queue.queueTask(groupId, self.stream, "script", "Disable %s mv" % (thisEntity.name),
                                 None, None, None, json.dumps(args), False)

        # ADD CHECKPOINT
        if enableMv:
            self.queue.queueCheckpoint(groupId, self.stream, "major", settings.args.tolerancelevel,
                                       self.commitFrequency, self.checkpointBehaviour)

        # ADD CTREE INDEX DROPS AND DISABLE
        for thisRecord in settings.specification.records:
            if thisRecord.useful:
                if thisRecord.hasCtree():
                    enableCtree = True
                    self._queueCtreeDisable(settings, groupId, thisRecord.table)
        for thisEntity in settings.specification.entities:
            if thisEntity.hasCtree():
                enableCtree = True
                self._queueCtreeDisable(settings, groupId, thisEntity.name)
        if enableCtree:
            self.queue.queueCheckpoint(groupId, self.stream, "major", settings.args.tolerancelevel,
                                       self.commitFrequency, self.checkpointBehaviour)

    # ADD SENT TO IMPORT
    for record in settings.specification.records:
        if record.useful:
            args = {}
            args["specification"] = self.specificationName
            args["importMode"] = self.importMode
            args["fileIntent"] = fileIntent
            args["strategy"] = "speed"
            args["table"] = record.table
            args["hasData"] = hasData
            self.queue.queueTask(groupId, self.stream, "sendtoimport",
                                 "Send '{0}' to import".format(record.table),
                                 None, None, None, json.dumps(args), False)
            self.queue.queueCheckpoint(groupId, self.stream, "major", settings.args.tolerancelevel,
                                       self.commitFrequency, self.checkpointBehaviour)
            self.queue.queueAVacuum(settings.args.vacuumstrategy, groupId, self.stream, "import", record.table)

    # If we're in sync mode then we may need to delete some things
    if self.importMode == "sync":
        for record in settings.specification.records:
            if record.useful:
                args = {}
                args["specification"] = self.specificationName
                args["importMode"] = self.importMode
                args["fileIntent"] = fileIntent
                args["minTaskId"] = minTaskId
                args["maxTaskId"] = maxTaskId
                args["table"] = record.table
                args["hasData"] = hasData
                self.queue.queueTask(groupId, self.stream, "importsyncdeletes",
                                     "Process '{0}' sync deletes".format(record.table),
                                     None, None, None, json.dumps(args), False)
                self.queue.queueCheckpoint(groupId, self.stream, "major", settings.args.tolerancelevel,
                                           self.commitFrequency, self.checkpointBehaviour)
                self.queue.queueAVacuum(settings.args.vacuumstrategy, groupId, self.stream, "import", record.table)

    committedForIndexes = False
    if not hasData:
        for thisRecord in settings.specification.records:
            if thisRecord.useful:
                if not committedForIndexes:
                    committedForIndexes = True
                    self.queue.queueCheckpoint(groupId, self.stream, "major", settings.args.tolerancelevel,
                                               self.commitFrequency, self.checkpointBehaviour)
                # ADD INDEXES
                args = {}
                filename = cs.getChimpScriptFilenameToUse(settings.paths["repository"],
                                                          ("specifications", self.specificationName, "resources", "sql", "indexes"),
                                                          "create_import_%s_indexes.sql" % (thisRecord.table))
                args["filename"] = filename
                self.queue.queueTask(groupId, self.stream, "script", "Create import.%s indexes" % (thisRecord.table),
                                     None, None, None, json.dumps(args), False)
        self.queue.queueCheckpoint(groupId, self.stream, "major", settings.args.tolerancelevel,
                                   self.commitFrequency, self.checkpointBehaviour)

    # ================================
    args = None
    atLeastOneEditable = False
    for quickCheck in settings.specification.records:
        if quickCheck.editable:
            atLeastOneEditable = True

    # =================
    if atLeastOneEditable:
        sql = "select editable.%s_exists()" % (self.specificationName)
        self.supportCursor.execute(sql)
        hasData = self.supportCursor.fetchone()[0]
        if not hasData:
            for thisRecord in settings.specification.records:
                if thisRecord.useful:
                    args = {}
                    filename = cs.getChimpScriptFilenameToUse(settings.paths["repository"],
                                                              ("specifications", self.specificationName, "resources", "sql", "indexes"),
                                                              "drop_editable_%s_indexes.sql" % (thisRecord.table))
                    args["filename"] = filename
                    self.queue.queueTask(groupId, self.stream, "script", "Drop editable.%s indexes" % (thisRecord.table),
                                         None, None, None, json.dumps(args), False)
            self.queue.queueCheckpoint(groupId, self.stream, "major", settings.args.tolerancelevel,
                                       self.commitFrequency, self.checkpointBehaviour)

        firstEditable = True
        for record in settings.specification.records:
            if record.useful:
                if firstEditable:
                    firstEditable = False
                    args = {}
                    self.queue.queueTask(groupId, self.stream, "recordtimestamp", "Record current timestamp",
                                         None, None, None, json.dumps(args), False)
                args = {}
                args["specification"] = self.specificationName
                args["table"] = record.table
                args["hasData"] = hasData
                self.queue.queueTask(groupId, self.stream, "sendtoeditable",
                                     "Make '{0}' editable".format(record.table),
                                     None, None, None, json.dumps(args), False)
                self.queue.queueCheckpoint(groupId, self.stream, "major", settings.args.tolerancelevel,
                                           self.commitFrequency, self.checkpointBehaviour)
                self.queue.queueAVacuum(settings.args.vacuumstrategy, groupId, self.stream, "editable", record.table)
        args = {}
        args["specification"] = self.specificationName
        self.queue.queueTask(groupId, self.stream, "finisheditable", "Finish send to editable process",
                             None, None, None, json.dumps(args), False)

        if not hasData:
            for thisRecord in settings.specification.records:
                if thisRecord.useful:
                    args = {}
                    filename = cs.getChimpScriptFilenameToUse(settings.paths["repository"],
                                                              ("specifications", self.specificationName, "resources", "sql", "indexes"),
                                                              "create_editable_%s_indexes.sql" % (thisRecord.table))
                    args["filename"] = filename
                    self.queue.queueTask(groupId, self.stream, "script", "Create editable.%s indexes" % (thisRecord.table),
                                         None, None, None, json.dumps(args), False)
            self.queue.queueCheckpoint(groupId, self.stream, "major", settings.args.tolerancelevel,
                                       self.commitFrequency, self.checkpointBehaviour)

    # ======================
    if enableCtree:
        for thisRecord in settings.specification.records:
            if thisRecord.useful:
                if thisRecord.hasCtree():
                    self._queueCtreeEnable(settings, groupId, thisRecord.table)
        for thisEntity in settings.specification.entities:
            if thisEntity.hasCtree():
                self._queueCtreeEnable(settings, groupId, thisEntity.name)
        self.queue.queueCheckpoint(groupId, self.stream, "major", settings.args.tolerancelevel,
                                   self.commitFrequency, self.checkpointBehaviour)

    # ======================
    for thisRecord in settings.specification.records:
        if thisRecord.useful:
            if thisRecord.hasCtree():
                if thisRecord.editable:
                    schemaRestriction = "editable"
                else:
                    schemaRestriction = "import"
                queueCtree.queueTasks(self, settings, schemaRestriction, self.stream,
                                      "'{0}'".format(self.specificationName), groupId, settings.appLogger)

    # ======================
    if enableMv:
        self.queue.queueCheckpoint(groupId, self.stream, "major", settings.args.tolerancelevel,
                                   self.commitFrequency, self.checkpointBehaviour)
        for thisEntity in settings.specification.entities:
            args = {}
            filename = cs.getChimpScriptFilenameToUse(settings.paths["repository"],
                                                      ("specifications", self.specificationName, "resources", "sql", "mv"),
                                                      "%s_enable_and_recreate.sql" % (thisEntity.name))
            args["filename"] = filename
            self.queue.queueTask(groupId, self.stream, "script", "Enable %s mv" % (thisEntity.name),
                                 None, None, None, json.dumps(args), False)
            self.queue.queueCheckpoint(groupId, self.stream, "major", settings.args.tolerancelevel,
                                       self.commitFrequency, self.checkpointBehaviour)
            args = {}
            filename = cs.getChimpScriptFilenameToUse(settings.paths["repository"],
                                                      ("specifications", self.specificationName, "resources", "sql", "indexes"),
                                                      "create_mv_%s_indexes.sql" % (thisEntity.name))
            args["filename"] = filename
            self.queue.queueTask(groupId, self.stream, "script", "Create %s indexes" % (thisEntity.name),
                                 None, None, None, json.dumps(args), False)
            self.queue.queueCheckpoint(groupId, self.stream, "major", settings.args.tolerancelevel,
                                       self.commitFrequency, self.checkpointBehaviour)

    # if enableCtree:
    #     for thisRecord in settings.specification.records:
    #         if thisRecord.useful:
    #             if thisRecord.ancestorColumn is not None or thisRecord.descendantColumn is not None:
    #                 enableCtree = True
    #                 args = {}
    #                 filename = cs.getChimpScriptFilenameToUse(settings.paths["repository"], ("specification files",self.specificationName,"sql","ctree"), "%s_enable_and_recreate.sql" % (thisRecord.table))
    #                 args["filename"] = filename
    #                 self.queue.queueTask(groupId, self.stream, "script" , "Build %s closure tree" %(thisRecord.table), None, None, None, json.dumps(args), False)
    #
    #     self.queue.queueCheckpoint(groupId, self.stream, "major", settings.args.tolerancelevel, self.commitFrequency, self.checkpointBehaviour)

    # OLD SEARCH WENT HERE
    # if settings.args.chainsearch:
    #     sql = "select domain_name,source_type,source_schema,source_name,specification_name,last_synchronized,config_location from search.active_sources where specification_name=%s"
    #     self.supportCursor.execute(sql, (self.specificationName,))
    #     sources = self.supportCursor.fetchall()
    #
    #     domains=[]
    #     for thisSource in sources:
    #         if thisSource[0] not in domains:
    #             domains.append(thisSource[0])
    #
    #     domainsToRebuild=[]
    #     for thisDomain in domains:
    #         sql = "select search.is_there_any_%s_data()" %(thisDomain)
    #         self.supportCursor.execute(sql, (self.specificationName,))
    #         hasData = self.supportCursor.fetchone()[0]
    #         if not hasData:
    #             domainsToRebuild.append(thisDomain)
    #             args = {}
    #             filename = cs.getChimpScriptFilenameToUse(settings.paths["repository"], ("search domain files",thisDomain,"sql","indexes"), "drop_search_%s_indexes.sql" % (thisDomain))
    #             args["filename"] = filename
    #             self.queue.queueTask(groupId, self.stream, "script" , "Drop search.%s indexes" %(thisDomain), None, None, None, json.dumps(args), False)
    #     for thisSource in sources:
    #         args = {}
    #         args["domainName"] = thisSource[0]
    #         args["sourceType"] = thisSource[1]
    #         args["sourceSchema"] = thisSource[2]
    #         args["sourceName"] = thisSource[3]
    #         args["specification"] = thisSource[4]
    #         args["lastSynchronized"] = thisSource[5]
    #         args["configLocation"] = thisSource[6]
    #         args["recordLimit"] = None
    #         self.queue.queueTask(groupId, self.stream, "syncSearchSource" , "Refresh %s (%s)" %(thisSource[0], thisSource[3]), None, None, None, json.dumps(args), False)
    #
    #     for thisDomain in domainsToRebuild:
    #         args = {}
    #         filename = cs.getChimpScriptFilenameToUse(settings.paths["repository"], ("search domain files",thisDomain,"sql","indexes"), "create_search_%s_indexes.sql" % (thisDomain))
    #         args["filename"] = filename
    #         self.queue.queueTask(groupId, self.stream, "script" , "Create search.%s indexes" %(thisDomain), None, None, None, json.dumps(args), False)
    #
    #     self.queue.queueCheckpoint(groupId, self.stream, "major", settings.args.tolerancelevel, self.commitFrequency, self.checkpointBehaviour)
    # OLD PINHEAD WENT HERE

    # =======================================================================
    # Queue calculated data tasks for this specification
    # for record in settings.specification.records:
    #     if record.useful:
    #         record.computedData.addTasks(settings, self, groupId, self.stream)
    # for entity in settings.specification.entities:
    #     entity.computedData.addTasks(settings, self, groupId, self.stream)

    self.queue.queueCheckpoint(groupId, self.stream, "major", settings.args.tolerancelevel,
                               self.commitFrequency, self.checkpointBehaviour)
    self.queue.queueAVacuum(settings.args.vacuumstrategy, groupId, self.stream, None, None)
    self.queue.queueCheckpoint(groupId, self.stream, "major", settings.args.tolerancelevel,
                               self.commitFrequency, self.checkpointBehaviour)
    self.supportCursor.connection.commit()
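# queueImport() ordering, as queued above: [1] stage the source files; [2] when the
# import schema starts empty, drop import/mv/ctree indexes and disable the mvs and
# closure trees; [3] send each useful record to import (plus sync deletes in sync
# mode), vacuuming between checkpoints; [4] rebuild import indexes; [5] repeat the
# drop/send/rebuild cycle for editable records; [6] re-enable closure trees and
# materialised views; [7] finish with a final vacuum between major checkpoints.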
def _queueCtreeEnable(self, settings, groupId, sourceName):
    args = {}
    filename = cs.getChimpScriptFilenameToUse(settings.paths["repository"],
                                              ("specifications", self.specificationName, "resources", "sql", "ctree"),
                                              "%s_enable_and_recreate.sql" % (sourceName))
    args["filename"] = filename
    self.queue.queueTask(groupId, self.stream, "script", "Build %s closure tree" % (sourceName),
                         None, None, None, json.dumps(args), False)