def main(settings):
    # USE A FILE
    if settings.source.filename != None:
        settings.destination.alias = settings.destination.index
        settings.destination.index = ElasticSearch.proto_name(settings.destination.alias)
        schema = CNV.JSON2object(File(settings.source.schema_filename).read())
        if transform_bugzilla.USE_ATTACHMENTS_DOT:
            schema = CNV.JSON2object(CNV.object2JSON(schema).replace("attachments_", "attachments."))

        dest = ElasticSearch.create_index(settings.destination, schema, limit_replicas=True)
        dest.set_refresh_interval(-1)  # DISABLE REFRESH DURING BULK LOAD
        extract_from_file(settings.source, dest)
        dest.set_refresh_interval(1)   # RE-ENABLE REFRESH

        # DROP OLD PROTOTYPE INDEXES, THEN POINT THE ALIAS AT THE NEW ONE
        dest.delete_all_but(settings.destination.alias, settings.destination.index)
        dest.add_alias(settings.destination.alias)
        return

    # SYNCH WITH source ES INDEX
    source = ElasticSearch(settings.source)
    destination = get_or_create_index(settings["destination"], source)

    # GET LAST UPDATED
    time_file = File(settings.param.last_replication_time)
    from_file = None
    if time_file.exists:
        from_file = CNV.milli2datetime(CNV.value2int(time_file.read()))
    from_es = get_last_updated(destination)
    # RESUME FROM THE EARLIER OF THE TWO TIMESTAMPS, DEFAULTING TO EPOCH
    last_updated = nvl(MIN(from_file, from_es), CNV.milli2datetime(0))
    current_time = datetime.utcnow()

    pending = get_pending(source, last_updated)
    with ThreadedQueue(destination, size=1000) as data_sink:
        replicate(source, data_sink, pending, last_updated)

    # RECORD LAST UPDATED
    time_file.write(unicode(CNV.datetime2milli(current_time)))
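# A minimal sketch (not from the source) of the settings structure main() reads.
# The source/destination/param keys mirror the attributes accessed above; the
# host/index connection fields, paths, and values are illustrative assumptions only.
EXAMPLE_SETTINGS = {
    "source": {
        "filename": None,                        # set to a dump-file path to take the file branch
        "schema_filename": "./bug_version.schema.json",
        "host": "http://source-es:9200",         # assumed ElasticSearch connection fields
        "index": "bugs"
    },
    "destination": {
        "host": "http://dest-es:9200",
        "index": "bugs"
    },
    "param": {
        "last_replication_time": "./last_replication_time.txt"
    }
}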
Log.warning("can not resume ETL, restarting", e) File(settings.param.first_run_time).delete() return setup_es(settings, db, es, es_comments) else: # START ETL FROM BEGINNING, MAKE NEW INDEX last_run_time = 0 if not es: # BUG VERSIONS schema = File(settings.es.schema_file).read() if transform_bugzilla.USE_ATTACHMENTS_DOT: schema = schema.replace("attachments_", "attachments\\.") schema=CNV.JSON2object(schema, paths=True) schema.settings=jsons.expand_dot(schema.settings) if not settings.es.alias: settings.es.alias = settings.es.index settings.es.index = ElasticSearch.proto_name(settings.es.alias) es = ElasticSearch.create_index(settings.es, schema, limit_replicas=True) # BUG COMMENTS comment_schema = File(settings.es_comments.schema_file).read() comment_schema=CNV.JSON2object(comment_schema, paths=True) comment_schema.settings=jsons.expand_dot(comment_schema.settings) if not settings.es_comments.alias: settings.es_comments.alias = settings.es_comments.index settings.es_comments.index = ElasticSearch.proto_name(settings.es_comments.alias) es_comments = ElasticSearch.create_index(settings.es_comments, comment_schema, limit_replicas=True) File(settings.param.first_run_time).write(unicode(CNV.datetime2milli(current_run_time))) return current_run_time, es, es_comments, last_run_time
Log.warning("can not resume ETL, restarting", e) File(settings.param.first_run_time).delete() return setup_es(settings, db, es, es_comments) else: # START ETL FROM BEGINNING, MAKE NEW INDEX last_run_time = 0 if not es: # BUG VERSIONS schema = File(settings.es.schema_file).read() if transform_bugzilla.USE_ATTACHMENTS_DOT: schema = schema.replace("attachments_", "attachments\\.") schema = CNV.JSON2object(schema, paths=True) schema.settings = jsons.expand_dot(schema.settings) if not settings.es.alias: settings.es.alias = settings.es.index settings.es.index = ElasticSearch.proto_name(settings.es.alias) es = ElasticSearch.create_index(settings.es, schema, limit_replicas=True) # BUG COMMENTS comment_schema = File(settings.es_comments.schema_file).read() comment_schema = CNV.JSON2object(comment_schema, paths=True) comment_schema.settings = jsons.expand_dot(comment_schema.settings) if not settings.es_comments.alias: settings.es_comments.alias = settings.es_comments.index settings.es_comments.index = ElasticSearch.proto_name( settings.es_comments.alias) es_comments = ElasticSearch.create_index(settings.es_comments, comment_schema, limit_replicas=True)