def __init__(
    self,
    host,
    user,
    password,
    table,
    meta,  # REDSHIFT COPY COMMAND REQUIRES A BUCKET TO HOLD PARAMETERS
    database=None,
    port=5439,
    settings=None
):
    self.settings = settings
    self.db = Redshift(settings)
    INDEX_CACHE[settings.table] = wrap({"name": settings.table})  # HACK TO GET parse_columns TO WORK
    columns = parse_columns(settings.table, settings.mapping.test_result.properties)
    nested = [c.name for c in columns if c.type == "nested"]
    self.columns = wrap([c for c in columns if c.type not in ["object"] and not any(c.name.startswith(n + ".") for n in nested)])

    try:
        self.db.execute("""
            CREATE TABLE {{table_name}} (
                "_id" character varying UNIQUE,
                {{columns}}
            )""",
            {
                "table_name": self.db.quote_column(settings.table),
                "columns": SQL(",\n".join(
                    self.db.quote_column(c.name) + " " + self.db.es_type2pg_type(c.type)
                    for c in self.columns
                ))
            },
            retry=False
        )
    except Exception, e:
        if "already exists" in e:
            Log.alert("Table {{table}} exists in Redshift", table=settings.table)
        else:
            Log.error("Could not make table", e)
def __init__(
    self,
    index,  # NAME OF THE INDEX, EITHER ALIAS NAME OR FULL VERSION NAME
    type,  # SCHEMA NAME
    alias=None,
    explore_metadata=True,  # PROBING THE CLUSTER FOR METADATA IS ALLOWED
    timeout=None,  # NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests)
    debug=False,  # DO NOT SHOW THE DEBUG STATEMENTS
    settings=None
):
    if index == None or type == None:
        Log.error("not allowed")
    if index == alias:
        Log.error("must have a unique index name")

    self.cluster_state = None
    self.cluster_metadata = None
    self.debug = debug
    if self.debug:
        Log.alert("elasticsearch debugging for index {{index}} is on", index=settings.index)

    self.settings = settings
    self.cluster = Cluster(settings)

    try:
        index = self.get_index(index)
        if index and alias == None:
            settings.alias = settings.index
            settings.index = index
        if index == None:
            Log.error("not allowed")
    except Exception, e:
        # EXPLORING (get_metadata()) IS NOT ALLOWED ON THE PUBLIC CLUSTER
        pass
def not_monitor(self, please_stop):
    Log.alert("metadata scan has been disabled")
    please_stop.on_go(lambda: self.todo.add(Thread.STOP))
    while not please_stop:
        c = self.todo.pop()
        if c == Thread.STOP:
            break

        if not c.last_updated or c.last_updated >= Date.now() - TOO_OLD:
            continue

        with self.meta.columns.locker:
            self.meta.columns.update({
                "set": {
                    "last_updated": Date.now()
                },
                "clear": [
                    "count",
                    "cardinality",
                    "partitions",
                ],
                "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
            })
        Log.note("Could not get {{col.es_index}}.{{col.es_column}} info", col=c)
def _monitor(self, please_stop):
    self.service.wait()
    if DEBUG:
        Log.alert(
            "{{name}} stopped with returncode={{returncode}}",
            name=self.name,
            returncode=self.service.returncode
        )
    self.stdin.add(Thread.STOP)
    self.service_stopped.go()
def _monitor(self, please_stop):
    self.service.wait()
    if self.debug:
        Log.alert("{{name}} stopped with returncode={{returncode}}", name=self.name, returncode=self.service.returncode)
    self.stdin.add(Thread.STOP)
    self.service_stopped.go()
def try_till_response(self, *args, **kwargs):
    while True:
        try:
            response = self.server.get(*args, **kwargs)
            return response
        except Exception, e:
            e = Except.wrap(e)
            if "No connection could be made because the target machine actively refused it" in e:
                Log.alert("Problem connecting")
            else:
                Log.error("Server raised exception", e)
def rollback(self):
    if self.pending:
        pending, self.pending = self.pending, []
        for p in pending:
            m = Message()
            m.set_body(p.get_body())
            self.queue.write(m)

        for p in pending:
            self.queue.delete_message(p)

        if self.settings.debug:
            Log.alert("{{num}} messages returned to queue", num=len(pending))
def main():
    try:
        settings = startup.read_settings()
        constants.set(settings.constants)
        Log.start(settings.debug)

        branches = _get_branches_from_hg(settings.hg)

        es = elasticsearch.Cluster(settings=settings.hg.branches).get_or_create_index(settings=settings.hg.branches)
        es.add_alias()
        es.extend({"id": b.name + " " + b.locale, "value": b} for b in branches)
        Log.alert("DONE!")
    except Exception, e:
        Log.error("Problem with etl", e)
def __init__(
    self,
    bucket,  # NAME OF THE BUCKET
    aws_access_key_id=None,  # CREDENTIAL
    aws_secret_access_key=None,  # CREDENTIAL
    region=None,  # NAME OF AWS REGION, REQUIRED FOR SOME BUCKETS
    public=False,
    debug=False,
    settings=None
):
    self.uid = None
    self.bucket = s3.Bucket(settings=settings)
    Log.alert("Using {{bucket}} for S3 storage", bucket=self.bucket.name)
    self.temp_queue = PersistentQueue(bucket + "_queue.txt")
    self._figure_out_start_point()
    self.push_to_s3 = Thread.run("pushing to " + bucket, self._worker)
def __init__(
    self,
    alias,  # NAME OF THE ALIAS
    type=None,  # SCHEMA NAME, WILL HUNT FOR ONE IF None
    explore_metadata=True,  # IF PROBING THE CLUSTER FOR METADATA IS ALLOWED
    debug=False,
    timeout=None,  # NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests)
    settings=None
):
    self.debug = debug
    if self.debug:
        Log.alert("Elasticsearch debugging on {{index|quote}} is on", index=settings.index)
    if alias == None:
        Log.error("Alias can not be None")
    self.settings = settings
    self.cluster = Cluster(settings)

    if type == None:
        if not explore_metadata:
            Log.error("Alias() was given no `type` (aka schema) and not allowed to explore metadata. Do not know what to do now.")

        indices = self.cluster.get_metadata().indices
        if not self.settings.alias or self.settings.alias == self.settings.index:
            alias_list = self.cluster.get("/_alias/" + self.settings.index)
            candidates = [(name, i) for name, i in alias_list.items() if self.settings.index in i.aliases.keys()]
            full_name = jx.sort(candidates, 0).last()[0]
            index = self.cluster.get("/" + full_name + "/_mapping")[full_name]
        else:
            index = self.cluster.get("/" + self.settings.index + "/_mapping")[self.settings.index]

        # FIND MAPPING WITH MOST PROPERTIES (AND ASSUME THAT IS THE CANONICAL TYPE)
        max_prop = -1
        for _type, mapping in index.mappings.items():
            if _type == "_default_":
                continue
            num_prop = len(mapping.properties.keys())
            if max_prop < num_prop:
                max_prop = num_prop
                self.settings.type = _type
                type = _type

        if type == None:
            Log.error("Can not find schema type for index {{index}}", index=coalesce(self.settings.alias, self.settings.index))

    self.path = "/" + alias + "/" + type
def main():
    try:
        settings = startup.read_settings()
        constants.set(settings.constants)
        Log.start(settings.debug)

        branches = get_branches(settings.hg)

        es = elasticsearch.Cluster(settings=settings.hg.branches).get_or_create_index(settings=settings.hg.branches)
        es.add_alias()
        es.extend({"id": b.name + " " + b.locale, "value": b} for b in branches)
        Log.alert("DONE!")
    except Exception, e:
        Log.error("Problem with etl", e)
def __init__(
    self,
    service_url,  # location of the ActiveData server we are testing
    backend_es,  # the ElasticSearch settings for filling the backend
    fastTesting=False,
    settings=None
):
    if backend_es.schema == None:
        Log.error("Expecting backend_es to have a schema defined")

    letters = unicode(ascii_lowercase)
    self.random_letter = letters[int(Date.now().unix / 30) % 26]
    self.service_url = service_url
    self.backend_es = backend_es
    self.settings = settings
    self._es_test_settings = None
    self._es_cluster = None
    self._index = None

    if not containers.config.default:
        containers.config.default = {
            "type": "elasticsearch",
            "settings": backend_es
        }

    if not fastTesting:
        self.server = http
    else:
        Log.alert("TESTS WILL RUN FAST, BUT NOT ALL TESTS ARE RUN!\nEnsure `file://tests/config/elasticsearch.json#fastTesting=true` is set to turn on the network response tests.")
        # WE WILL USE THE ActiveServer CODE, AND CONNECT TO ES DIRECTLY.
        # THIS MAKES FOR SLIGHTLY FASTER TEST TIMES BECAUSE THE PROXY IS
        # MISSING
        self.server = FakeHttp()
        containers.config.default = {
            "type": "elasticsearch",
            "settings": settings.backend_es.copy()
        }
def parse_short_desc(bug):
    parts = bug.short_desc.split("|")
    if len(parts) in [2, 3]:
        bug.result.test = parts[0].strip()
        bug.result.message = parts[1].strip()
    elif any(map(parts[0].strip().endswith, [".html", ".py", ".js", ".xul"])) and len(parts) > 2:
        bug.result.test = parts[0].strip()
        bug.result.message = parts[1].strip()
    elif len(parts) >= 4:
        set_default(bug.result, parse_status(parts[0]))
        bug.result.test = parts[1].strip()
        bug.result.message = parts[3].strip()
    elif any(black in bug.short_desc for black in blacklist):
        Log.note("IGNORED {{line}}", line=bug.short_desc)
    elif bug.bug_id in [1165765]:
        Log.note("IGNORED {{line}}", line=bug.short_desc)
    elif "###" in bug.short_desc:
        bug.short_desc = bug.short_desc.replace("###", " | ")
        parse_short_desc(bug)
    else:
        Log.alert("can not handle {{bug_id}}: {{line}}", line=bug.short_desc, bug_id=bug.bug_id)

    if bug.result.test.lower().startswith("intermittent "):
        bug.result.test = bug.result.test[13:]
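# --- ILLUSTRATIVE SKETCH (NOT PART OF THE ORIGINAL SOURCE) ---
# A HEDGED EXAMPLE OF THE TWO-PART "test | message" BRANCH ABOVE, PLUS THE
# "Intermittent " PREFIX STRIPPING AT THE END.  THE bug VALUES ARE INVENTED,
# AND Dict IS ASSUMED TO BE pyLibrary'S DOT-ACCESS DICT (IMPORT PATH MAY DIFFER).
from pyLibrary.dot import Dict

bug = Dict(
    bug_id=999999,                                               # HYPOTHETICAL BUG ID
    short_desc="Intermittent browser_test.js | Test timed out"   # TWO PARTS -> len(parts) == 2
)
parse_short_desc(bug)
# EXPECTED: bug.result.test == "browser_test.js"   (LEADING "Intermittent " REMOVED)
#           bug.result.message == "Test timed out"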
def _dispatch_work(self, source_block):
    """
    source_block POINTS TO THE bucket AND key TO PROCESS
    :return: False IF THERE IS NOTHING LEFT TO DO
    """
    source_keys = listwrap(coalesce(source_block.key, source_block.keys))

    if not isinstance(source_block.bucket, basestring):  # FIX MISTAKE
        source_block.bucket = source_block.bucket.bucket
    bucket = source_block.bucket

    work_actions = [w for w in self.settings.workers if w.source.bucket == bucket]
    if not work_actions:
        Log.note(
            "No worker defined for records from {{bucket}}, {{action}}.\n{{message|indent}}",
            bucket=source_block.bucket,
            message=source_block,
            action="skipping" if self.settings.keep_unknown_on_queue else "deleting"
        )
        return not self.settings.keep_unknown_on_queue

    for action in work_actions:
        try:
            source_key = unicode(source_keys[0])
            if len(source_keys) > 1:
                multi_source = action._source
                source = ConcatSources([multi_source.get_key(k) for k in source_keys])
                source_key = MIN(source_key)
            else:
                source = action._source.get_key(source_key)
                source_key = source.key

            Log.note(
                "Execute {{action}} on bucket={{source}} key={{key}}",
                action=action.name,
                source=source_block.bucket,
                key=source_key
            )

            if action.transform_type == "bulk":
                old_keys = set()
            else:
                old_keys = action._destination.keys(prefix=source_block.key)

            new_keys = set(action._transformer(source_key, source, action._destination, resources=self.resources, please_stop=self.please_stop))

            # VERIFY KEYS
            if len(new_keys) == 1 and list(new_keys)[0] == source_key:
                pass  # ok
            else:
                etls = map(key2etl, new_keys)
                etls = qb.sort(etls, "id")
                for i, e in enumerate(etls):
                    if i != e.id:
                        Log.error("expecting keys to have dense order: {{ids}}", ids=etls.id)

            # VERIFY KEYS EXIST
            if hasattr(action._destination, "get_key"):
                for k in new_keys:
                    action._destination.get_key(k)

            for n in action._notify:
                for k in new_keys:
                    n.add(k)

            if action.transform_type == "bulk":
                continue

            # DUE TO BUGS THIS INVARIANT IS NOW BROKEN
            # TODO: FIGURE OUT HOW TO FIX THIS (CHANGE NAME OF THE SOURCE BLOCK KEY?)
            # for n in new_keys:
            #     if not n.startswith(source_key):
            #         Log.error("Expecting new keys ({{new_key}}) to start with source key ({{source_key}})", new_key=n, source_key=source_key)

            if not new_keys and old_keys:
                Log.alert(
                    "Expecting some new keys after etl of {{source_key}}, especially since there were old ones\n{{old_keys}}",
                    old_keys=old_keys,
                    source_key=source_key
                )
                continue
            elif not new_keys:
                Log.alert(
                    "Expecting some new keys after processing {{source_key}}",
                    old_keys=old_keys,
                    source_key=source_key
                )
                continue

            for k in new_keys:
                if len(k.split(".")) == 3 and action.destination.type != "test_result":
                    Log.error("two dots have not been needed yet, this is a consistency check")

            delete_me = old_keys - new_keys
            if delete_me:
                if action.destination.bucket == "ekyle-test-result":
                    for k in delete_me:
                        action._destination.delete_key(k)
                else:
                    Log.note("delete keys?\n{{list}}", list=sorted(delete_me))
                    # for k in delete_me:

            # WE DO NOT PUT KEYS ON WORK QUEUE IF ALREADY NOTIFYING SOME OTHER
            # AND NOT GOING TO AN S3 BUCKET
            if not action._notify and isinstance(action._destination, (aws.s3.Bucket, S3Bucket)):
                for k in old_keys | new_keys:
                    self.work_queue.add(Dict(
                        bucket=action.destination.bucket,
                        key=k
                    ))
        except Exception, e:
            if "Key {{key}} does not exist" in e:
                err = Log.warning
            elif "multiple keys in {{bucket}}" in e:
                err = Log.warning
                if source_block.bucket == "ekyle-test-result":
                    for k in action._source.list(prefix=key_prefix(source_key)):
                        action._source.delete_key(strip_extension(k.key))
            elif "expecting keys to have dense order" in e:
                err = Log.warning
                if source_block.bucket == "ekyle-test-result":
                    # WE KNOW OF THIS ETL MISTAKE, REPROCESS
                    self.work_queue.add({
                        "key": unicode(key_prefix(source_key)),
                        "bucket": "ekyle-pulse-logger"
                    })
            elif "Expecting a pure key" in e:
                err = Log.warning
            else:
                err = Log.error

            err(
                "Problem transforming {{action}} on bucket={{source}} key={{key}} to destination={{destination}}",
                {
                    "action": action.name,
                    "source": source_block.bucket,
                    "key": source_key,
                    "destination": coalesce(action.destination.name, action.destination.index)
                },
                e
            )
buildbot_summary.etl = {
    "id": 0,
    "name": "unittest",
    "timestamp": Date.now().unix,
    "source": etl_header,
    "type": "join",
    "revision": get_git_revision(),
    "duration": timer.duration
}
buildbot_summary.run.stats = summary.stats
buildbot_summary.run.stats.duration = summary.stats.end_time - summary.stats.start_time

if DEBUG:
    age = Date.now() - Date(buildbot_summary.run.stats.start_time)
    if age > Duration.DAY:
        Log.alert("Test is {{days|round(decimal=1)}} days old", days=age / Duration.DAY)
    Log.note("Done\n{{data|indent}}", data=buildbot_summary.run.stats)

new_keys = []
new_data = []

if not summary.tests:
    key = source_key + ".0"
    new_keys.append(key)
    new_data.append({"id": key, "value": buildbot_summary})
else:
    for i, t in enumerate(summary.tests):
        key = source_key + "." + unicode(i)
        new_keys.append(key)
def _update_cardinality(self, c):
    """
    QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN
    """
    if c.type in ["object", "nested"]:
        Log.error("not supported")
    try:
        if c.table == "meta.columns":
            with self.columns.locker:
                partitions = qb.sort([g[c.abs_name] for g, _ in qb.groupby(self.columns, c.abs_name) if g[c.abs_name] != None])
                self.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.columns),
                        "cardinality": len(partitions),
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"table": c.table, "abs_name": c.abs_name}}
                })
            return
        if c.table == "meta.tables":
            with self.columns.locker:
                partitions = qb.sort([g[c.abs_name] for g, _ in qb.groupby(self.tables, c.abs_name) if g[c.abs_name] != None])
                self.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.tables),
                        "cardinality": len(partitions),
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"table": c.table, "name": c.name}}
                })
            return

        es_index = c.table.split(".")[0]
        result = self.default_es.post("/" + es_index + "/_search", data={
            "aggs": {c.name: _counting_query(c)},
            "size": 0
        })
        r = result.aggregations.values()[0]
        count = result.hits.total
        cardinality = coalesce(r.value, r._nested.value)
        if cardinality == None:
            Log.error("logic error")

        query = Dict(size=0)
        if c.type in ["object", "nested"]:
            Log.note("{{field}} has {{num}} parts", field=c.name, num=cardinality)
            with self.columns.locker:
                self.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"table": c.table, "name": c.name}}
                })
            return
        elif cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99):
            Log.note("{{field}} has {{num}} parts", field=c.name, num=cardinality)
            with self.columns.locker:
                self.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"table": c.table, "name": c.name}}
                })
            return
        elif c.type in _elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
            Log.note("{{field}} has {{num}} parts", field=c.name, num=cardinality)
            with self.columns.locker:
                self.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"table": c.table, "name": c.name}}
                })
            return
        elif c.nested_path:
            query.aggs[literal_field(c.name)] = {
                "nested": {"path": listwrap(c.nested_path)[0]},
                "aggs": {"_nested": {"terms": {"field": c.abs_name, "size": 0}}}
            }
        else:
            query.aggs[literal_field(c.name)] = {"terms": {"field": c.abs_name, "size": 0}}

        result = self.default_es.post("/" + es_index + "/_search", data=query)
        aggs = result.aggregations.values()[0]
        if aggs._nested:
            parts = qb.sort(aggs._nested.buckets.key)
        else:
            parts = qb.sort(aggs.buckets.key)

        Log.note("{{field}} has {{parts}}", field=c.name, parts=parts)
        with self.columns.locker:
            self.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "partitions": parts,
                    "last_updated": Date.now()
                },
                "where": {"eq": {"table": c.table, "abs_name": c.abs_name}}
            })
    except Exception, e:
        if "IndexMissingException" in e and c.table.startswith("testing"):
            Log.alert("{{col.table}} does not exist", col=c)
        else:
            self.columns.update({
                "set": {
                    "last_updated": Date.now()
                },
                "clear": [
                    "count",
                    "cardinality",
                    "partitions",
                ],
                "where": {"eq": {"table": c.table, "abs_name": c.abs_name}}
            })
            Log.warning("Could not get {{col.table}}.{{col.abs_name}} info", col=c, cause=e)
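# --- ILLUSTRATIVE SKETCH (NOT PART OF THE ORIGINAL SOURCE) ---
# FOR ORIENTATION: THE SECOND _search BODY BUILT ABOVE, FOR A PLAIN (NON-NESTED)
# COLUMN, REDUCES TO A SINGLE terms AGGREGATION.  THE COLUMN NAME IS INVENTED,
# AND literal_field() IS ASSUMED TO ESCAPE DOTS IN THE AGGREGATION KEY.
query = {
    "size": 0,
    "aggs": {
        "build\\.platform": {                                   # literal_field("build.platform")
            "terms": {"field": "build.platform", "size": 0}     # size=0 -> RETURN ALL BUCKETS
        }
    }
}
# POSTED TO /<es_index>/_search; THE BUCKET KEYS BECOME THE COLUMN'S "partitions"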
    output_bytes = convert.unicode2utf8(convert.value2json(result))
    return wrap({
        "status_code": 200,
        "all_content": output_bytes,
        "content": output_bytes
    })


global_settings = jsons.ref.get("file://tests/config/elasticsearch.json")
constants.set(global_settings.constants)
NEXT = 0

container_types = Dict(
    elasticsearch=ESUtils,
    sqlite=SQLiteUtils
)

# read_alternate_settings
utils = None
try:
    filename = os.environ.get("TEST_CONFIG")
    if filename:
        global_settings = jsons.ref.get("file://" + filename)
    else:
        Log.alert("No TEST_CONFIG environment variable to point to config file. Using /tests/config/elasticsearch.json")

    if not global_settings.use:
        Log.error('Must have a {"use": type} set in the config file')
    utils = container_types[global_settings.use](global_settings)
except Exception, e:
    Log.warning("problem", e)
class Index(Features):
    """
    AN ElasticSearch INDEX LIFETIME MANAGEMENT TOOL

    ElasticSearch'S REST INTERFACE WORKS WELL WITH PYTHON AND JAVASCRIPT
    SO HARDLY ANY LIBRARY IS REQUIRED.  IT IS SIMPLER TO MAKE HTTP CALLS
    DIRECTLY TO ES USING YOUR FAVORITE HTTP LIBRARY.  I HAVE SOME
    CONVENIENCE FUNCTIONS HERE, BUT IT'S BETTER TO MAKE YOUR OWN.

    THIS CLASS IS TO HELP DURING ETL, CREATING INDEXES, MANAGING ALIASES
    AND REMOVING INDEXES WHEN THEY HAVE BEEN REPLACED.  IT USES A STANDARD
    SUFFIX (YYYYMMDD-HHMMSS) TO TRACK AGE AND RELATIONSHIP TO THE ALIAS,
    IF ANY YET.
    """

    @use_settings
    def __init__(
        self,
        index,  # NAME OF THE INDEX, EITHER ALIAS NAME OR FULL VERSION NAME
        type=None,  # SCHEMA NAME, (DEFAULT TO TYPE IN INDEX, IF ONLY ONE)
        alias=None,
        explore_metadata=True,  # PROBING THE CLUSTER FOR METADATA IS ALLOWED
        read_only=True,
        tjson=False,  # STORED AS TYPED JSON
        timeout=None,  # NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests)
        debug=False,  # DO NOT SHOW THE DEBUG STATEMENTS
        settings=None
    ):
        if index == None:
            Log.error("not allowed")
        if index == alias:
            Log.error("must have a unique index name")

        self.cluster_state = None
        self.debug = debug
        self.settings = settings
        self.cluster = Cluster(settings)

        try:
            full_index = self.get_index(index)
            if full_index and alias == None:
                settings.alias = settings.index
                settings.index = full_index
            if full_index == None:
                Log.error("not allowed")
            if type == None:
                # NO type PROVIDED, MAYBE THERE IS A SUITABLE DEFAULT?
                with self.cluster.metadata_locker:
                    index_ = self.cluster._metadata.indices[self.settings.index]
                if not index_:
                    indices = self.cluster.get_metadata().indices
                    index_ = indices[self.settings.index]

                candidate_types = list(index_.mappings.keys())
                if len(candidate_types) != 1:
                    Log.error("Expecting `type` parameter")
                self.settings.type = type = candidate_types[0]
        except Exception, e:
            # EXPLORING (get_metadata()) IS NOT ALLOWED ON THE PUBLIC CLUSTER
            Log.error("not expected", cause=e)

        if not type:
            Log.error("not allowed")

        self.path = "/" + full_index + "/" + type

        if self.debug:
            Log.alert("elasticsearch debugging for {{url}} is on", url=self.url)
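# --- ILLUSTRATIVE SKETCH (NOT PART OF THE ORIGINAL SOURCE) ---
# ROUGH USAGE OF Index: @use_settings LETS A SINGLE SETTINGS BUNDLE SUPPLY THE
# PARAMETERS.  HOST, INDEX AND TYPE BELOW ARE HYPOTHETICAL, AND wrap IS ASSUMED
# TO BE pyLibrary'S DOT-DICT WRAPPER (IMPORT PATH MAY DIFFER).
from pyLibrary.dot import wrap

settings = wrap({
    "host": "http://localhost",    # ASSUMED LOCAL CLUSTER
    "port": 9200,
    "index": "unittest",           # ALIAS, OR FULL VERSIONED NAME (unittestYYYYMMDD_HHMMSS)
    "type": "test_result",
    "debug": True
})
index = Index(settings=settings)   # RESOLVES ALIAS -> FULL INDEX NAME AND SETS self.path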
class Cluster(object):

    @use_settings
    def __new__(cls, host, port=9200, settings=None):
        if not isinstance(port, int):
            Log.error("port must be integer")
        cluster = known_clusters.get((host, port))
        if cluster:
            return cluster

        cluster = object.__new__(cls)
        known_clusters[(host, port)] = cluster
        return cluster

    @use_settings
    def __init__(self, host, port=9200, explore_metadata=True, settings=None):
        """
        settings.explore_metadata == True - IF PROBING THE CLUSTER FOR METADATA IS ALLOWED
        settings.timeout == NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests)
        """
        if hasattr(self, "settings"):
            return

        self.settings = settings
        self.cluster_state = None
        self._metadata = None
        self.metadata_locker = Lock()
        self.debug = settings.debug
        self.version = None
        self.path = settings.host + ":" + unicode(settings.port)

        self.get_metadata()

    @use_settings
    def get_or_create_index(
        self,
        index,
        alias=None,
        schema=None,
        limit_replicas=None,
        read_only=False,
        tjson=False,
        settings=None
    ):
        best = self._get_best(settings)
        if not best:
            output = self.create_index(settings=settings, schema=schema, limit_replicas=limit_replicas)
            return output
        elif best.alias != None:
            settings.alias = best.alias
            settings.index = best.index
        elif settings.alias == None:
            settings.alias = settings.index
            settings.index = best.index

        index = settings.index
        meta = self.get_metadata()
        columns = parse_properties(index, [], meta.indices[index].mappings.values()[0].properties)
        if len(columns) != 0:
            settings.tjson = tjson or any(c.name.endswith("$value") for c in columns)

        return Index(settings)

    def _get_best(self, settings):
        from pyLibrary.queries import jx
        aliases = self.get_aliases()
        indexes = jx.sort([
            a
            for a in aliases
            if (a.alias == settings.index and settings.alias == None) or
            (re.match(re.escape(settings.index) + r'\d{8}_\d{6}', a.index) and settings.alias == None) or
            (a.index == settings.index and (a.alias == None or a.alias == settings.alias))
        ], "index")
        return indexes.last()

    @use_settings
    def get_index(self, index, type=None, alias=None, read_only=True, settings=None):
        """
        TESTS THAT THE INDEX EXISTS BEFORE RETURNING A HANDLE
        """
        if read_only:
            # GET EXACT MATCH, OR ALIAS
            aliases = self.get_aliases()
            if index in aliases.index:
                return Index(settings)
            if index in aliases.alias:
                match = [a for a in aliases if a.alias == index][0]
                settings.alias = match.alias
                settings.index = match.index
                return Index(settings)
            Log.error("Can not find index {{index_name}}", index_name=settings.index)
        else:
            # GET BEST MATCH, INCLUDING PROTOTYPE
            best = self._get_best(settings)
            if not best:
                Log.error("Can not find index {{index_name}}", index_name=settings.index)

            if best.alias != None:
                settings.alias = best.alias
                settings.index = best.index
            elif settings.alias == None:
                settings.alias = settings.index
                settings.index = best.index
            return Index(settings)

    def get_alias(self, alias):
        """
        RETURN REFERENCE TO ALIAS (MANY INDEXES)
        USER MUST BE SURE NOT TO SEND UPDATES
        """
        aliases = self.get_aliases()
        if alias in aliases.alias:
            settings = self.settings.copy()
            settings.alias = alias
            settings.index = alias
            return Index(read_only=True, settings=settings)
        Log.error("Can not find any index with alias {{alias_name}}", alias_name=alias)

    def get_prototype(self, alias):
        """
        RETURN ALL INDEXES THAT ARE INTENDED TO BE GIVEN alias, BUT
        HAVE NO ALIAS YET BECAUSE INCOMPLETE
        """
        output = sort([
            a.index
            for a in self.get_aliases()
            if re.match(re.escape(alias) + "\\d{8}_\\d{6}", a.index) and not a.alias
        ])
        return output

    @use_settings
    def create_index(
        self,
        index,
        alias=None,
        schema=None,
        limit_replicas=None,
        read_only=False,
        tjson=False,
        settings=None
    ):
        if not settings.alias:
            settings.alias = settings.index
            index = settings.index = proto_name(settings.alias)

        if settings.alias == index:
            Log.error("Expecting index name to conform to pattern")

        if settings.schema_file:
            Log.error('schema_file attribute not supported. Use {"$ref":<filename>} instead')

        if schema == None:
            Log.error("Expecting a schema")
        elif isinstance(schema, basestring):
            schema = convert.json2value(schema, leaves=True)
        else:
            schema = convert.json2value(convert.value2json(schema), leaves=True)

        if limit_replicas:
            # DO NOT ASK FOR TOO MANY REPLICAS
            health = self.get("/_cluster/health")
            if schema.settings.index.number_of_replicas >= health.number_of_nodes:
                Log.warning(
                    "Reduced number of replicas: {{from}} requested, {{to}} realized",
                    {"from": schema.settings.index.number_of_replicas},
                    to=health.number_of_nodes - 1
                )
                schema.settings.index.number_of_replicas = health.number_of_nodes - 1

        self.post(
            "/" + index,
            data=schema,
            headers={"Content-Type": "application/json"}
        )

        # CONFIRM INDEX EXISTS
        while True:
            try:
                state = self.get("/_cluster/state")
                if index in state.metadata.indices:
                    break
                Log.note("Waiting for index {{index}} to appear", index=index)
            except Exception, e:
                Log.warning("Problem while waiting for index {{index}} to appear", index=index, cause=e)
            Thread.sleep(seconds=1)
        Log.alert("Made new index {{index|quote}}", index=index)

        es = Index(settings=settings)
        return es
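# --- ILLUSTRATIVE SKETCH (NOT PART OF THE ORIGINAL SOURCE) ---
# ROUGH Cluster USAGE: __new__ DEDUPLICATES BY (host, port), AND get_or_create_index()
# EITHER REUSES THE BEST MATCHING VERSIONED INDEX OR CALLS create_index().  ALL VALUES
# BELOW ARE HYPOTHETICAL, AND wrap IS ASSUMED TO BE pyLibrary'S DOT-DICT WRAPPER.
from pyLibrary.dot import wrap

settings = wrap({
    "host": "http://localhost",
    "port": 9200,
    "index": "unittest",           # ALIAS; create_index() WOULD MAKE unittestYYYYMMDD_HHMMSS
    "type": "test_result",
    "schema": {"mappings": {"test_result": {"properties": {}}}}
})

cluster = Cluster(settings)
index = cluster.get_or_create_index(settings=settings)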