def get_branches(settings):
    # GET MAIN PAGE
    response = http.get(settings.url)
    doc = BeautifulSoup(response.all_content)

    all_repos = doc("table")[1]
    branches = UniqueIndex(["name", "locale"], fail_on_dup=False)
    for i, r in enumerate(all_repos("tr")):
        dir, name = [v.text.strip() for v in r("td")]
        b = get_branch(settings, name, dir.lstrip("/"))
        branches.extend(b)

    # branches.add(set_default({"name": "release-mozilla-beta"}, branches["mozilla-beta", DEFAULT_LOCALE]))
    for b in list(branches["mozilla-beta", ]):
        branches.add(set_default({"name": "release-mozilla-beta"}, b))

    for b in list(branches["mozilla-release", ]):
        branches.add(set_default({"name": "release-mozilla-release"}, b))

    for b in list(branches["mozilla-aurora", ]):
        if b.locale == "en-US":
            continue
        branches.add(set_default({"name": "comm-aurora"}, b))

    return branches
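# A hedged sketch (not from the original source) of how set_default() builds the
# release-* aliases above: it returns a new record that takes "name" from the
# first argument and fills every other field from b, leaving b itself unchanged.
# The import path is an assumption and may differ between library versions.
from pyLibrary.dot import set_default, wrap

b = wrap({"name": "mozilla-beta", "locale": "de"})
alias = set_default({"name": "release-mozilla-beta"}, b)
# alias.name == "release-mozilla-beta", alias.locale == "de", b.name is untouched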
def _select(template, data, fields, depth):
    output = DictList()
    deep_path = []
    deep_fields = UniqueIndex(["name"])
    for d in data:
        if isinstance(d, Dict):
            Log.error("programmer error, _select can not handle Dict")

        record = template.copy()
        children = None
        for f in fields:
            index, c = _select_deep(d, f, depth, record)
            children = c if children is None else children
            if index:
                path = f.value[0:index:]
                if not deep_fields[f]:
                    deep_fields.add(f)  # KEEP TRACK OF WHICH FIELDS NEED DEEPER SELECT
                short = MIN([len(deep_path), len(path)])
                if path[:short:] != deep_path[:short:]:
                    Log.error("Dangerous to select into more than one branch at a time")
                if len(deep_path) < len(path):
                    deep_path = path
        if not children:
            output.append(record)
        else:
            output.extend(_select(record, children, deep_fields, depth + 1))

    return output
def __or__(self, other):
    output = UniqueIndex(self._keys)
    for v in self:
        output.add(v)
    for v in other:
        output.add(v)
    return output
def _select(template, data, fields, depth):
    output = DictList()
    deep_path = []
    deep_fields = UniqueIndex(["name"])
    for d in data:
        if isinstance(d, Dict):
            Log.error("programmer error, _select can not handle Dict")

        record = template.copy()
        children = None
        for f in fields:
            index, c = _select_deep(d, f, depth, record)
            children = c if children is None else children
            if index:
                path = f.value[0:index:]
                if not deep_fields[f]:
                    deep_fields.add(f)  # KEEP TRACK OF WHICH FIELDS NEED DEEPER SELECT
                short = MIN(len(deep_path), len(path))
                if path[:short:] != deep_path[:short:]:
                    Log.error("Dangerous to select into more than one branch at a time")
                if len(deep_path) < len(path):
                    deep_path = path
        if not children:
            output.append(record)
        else:
            output.extend(_select(record, children, deep_fields, depth + 1))

    return output
def _extend(self, key, documents):
    # TODO: FIND OUT IF THIS FUNCTION IS EVER USED (TALOS MAYBE?)
    if self.bucket.name == "ekyle-test-result":
        # TODO: PUT THIS LOGIC ELSEWHERE (LIKE settings)
        # WE DO NOT CARE WHAT'S IN THE BUCKET, OVERWRITE ALL
        self.bucket.write_lines(key, map(convert.value2json, documents))
        return

    meta = self.bucket.get_meta(key)
    if meta != None:
        documents = UniqueIndex(keys="etl.id", data=documents)
        try:
            content = self.bucket.read_lines(key)
            old_docs = UniqueIndex(keys="etl.id", data=map(convert.json2value, content))
        except Exception, _:
            # OLD FORMAT (etl header, followed by list of records)
            old_docs = UniqueIndex(keys="etl.id")

        residual = old_docs - documents
        overlap = old_docs & documents

        # IS IT CHEAPER TO SEE IF THERE IS A DIFF, RATHER THAN WRITE NEW DATA TO S3?
        # CAN NOT PERFORM FUZZY MATCH, THE etl PROPERTY WILL HAVE CHANGED
        # fuzzytestcase.assertAlmostEqual(documents._data, overlap._data)
        if residual:
            documents = documents | residual
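# A minimal in-memory sketch (not from the original source) of the merge logic
# in _extend() above: the old and new batches are keyed by "etl.id", then the
# set operators defined on UniqueIndex reconcile them, no S3 involved. The
# import path is an assumption and may differ between library versions.
from pyLibrary.collections.unique_index import UniqueIndex

old_docs = UniqueIndex(keys="etl.id", data=[{"etl": {"id": 0}}, {"etl": {"id": 1}}])
new_docs = UniqueIndex(keys="etl.id", data=[{"etl": {"id": 1}}, {"etl": {"id": 2}}])

residual = old_docs - new_docs   # old records the new batch does not replace
overlap = old_docs & new_docs    # records present in both batches
merged = new_docs | residual     # the full set that would be written back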
def __init__(self, **desc):
    Domain.__init__(self, **desc)
    desc = wrap(desc)

    self.type = "set"
    self.order = {}
    self.NULL = Null
    self.partitions = DictList()

    if isinstance(self.key, set):
        Log.error("problem")

    if isinstance(desc.partitions[0], basestring):
        # ASSUME PARTS ARE STRINGS, CONVERT TO REAL PART OBJECTS
        self.key = ("value", )
        self.map = {}
        self.order[None] = len(desc.partitions)
        for i, p in enumerate(desc.partitions):
            part = {"name": p, "value": p}
            self.partitions.append(part)
            self.map[p] = part
            self.order[p] = i
    elif desc.partitions and desc.dimension.fields and len(desc.dimension.fields) > 1:
        self.key = desc.key
        self.map = UniqueIndex(keys=desc.dimension.fields)
    elif desc.partitions and isinstance(desc.key, (list, set)):
        # TODO: desc.key CAN BE MUCH LIKE A SELECT, WHICH UniqueIndex CAN NOT HANDLE
        self.key = desc.key
        self.map = UniqueIndex(keys=desc.key)
    elif desc.partitions and isinstance(desc.partitions[0][desc.key], Mapping):
        self.key = desc.key
        self.map = UniqueIndex(keys=desc.key)
        # self.key = UNION(set(d[desc.key].keys()) for d in desc.partitions)
        # self.map = UniqueIndex(keys=self.key)
    elif desc.key == None:
        Log.error("Domains must have keys")
    elif self.key:
        self.key = desc.key
        self.map = dict()
        self.map[None] = self.NULL
        self.order[None] = len(desc.partitions)
        for i, p in enumerate(desc.partitions):
            self.map[p[self.key]] = p
            self.order[p[self.key]] = i
    elif all(p.esfilter for p in self.partitions):
        # EVERY PART HAS AN esfilter DEFINED, SO USE THEM
        for i, p in enumerate(self.partitions):
            p.dataIndex = i
    else:
        Log.error("Can not handle")

    self.label = coalesce(self.label, "name")

    if isinstance(desc.partitions, list):
        self.partitions = desc.partitions.copy()
    else:
        Log.error("expecting a list of partitions")
def _get_branches_from_hg(settings):
    # GET MAIN PAGE
    response = http.get(settings.url)
    doc = BeautifulSoup(response.all_content)

    all_repos = doc("table")[1]
    branches = UniqueIndex(["name", "locale"], fail_on_dup=False)
    for i, r in enumerate(all_repos("tr")):
        dir, name = [v.text.strip() for v in r("td")]
        b = _get_single_branch_from_hg(settings, name, dir.lstrip("/"))
        branches.extend(b)

    # branches.add(set_default({"name": "release-mozilla-beta"}, branches["mozilla-beta", DEFAULT_LOCALE]))
    for b in list(branches["mozilla-beta", ]):
        branches.add(set_default({"name": "release-mozilla-beta"}, b))  # THIS IS THE l10n "name"
        b.url = "https://hg.mozilla.org/releases/mozilla-beta"  # THIS IS THE

    for b in list(branches["mozilla-release", ]):
        branches.add(set_default({"name": "release-mozilla-release"}, b))

    for b in list(branches["mozilla-aurora", ]):
        if b.locale == "en-US":
            continue
        branches.add(set_default({"name": "comm-aurora"}, b))
        # b.url = "https://hg.mozilla.org/releases/mozilla-aurora"

    return branches
def test_double_key(self):
    data = [
        {"a": 1, "b": "w"},
        {"a": 2, "b": "x"},
        {"a": 3, "b": "y"},
        {"a": 4, "b": "z"}
    ]
    i = UniqueIndex(["a", "b"], data=data)
    s = UniqueIndex(["a", "b"])
    s.add({"a": 4, "b": "x"})

    self.assertEqual(i - s, data)
    self.assertEqual(i | s, data + [{"a": 4, "b": "x"}])  # UNION ADDS THE ONE RECORD NOT ALREADY IN i
    self.assertEqual(i & s, [])
def unique_index(data, keys=None, fail_on_dup=True):
    """
    RETURN dict THAT USES KEYS TO INDEX DATA
    ONLY ONE VALUE ALLOWED PER UNIQUE KEY
    """
    o = UniqueIndex(listwrap(keys), fail_on_dup=fail_on_dup)

    for d in data:
        try:
            o.add(d)
        except Exception, e:
            Log.error(
                "index {{index}} is not unique {{key}} maps to both {{value1}} and {{value2}}",
                index=keys,
                key=select([d], keys)[0],
                value1=o[d],
                value2=d,
                cause=e
            )
    return o
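# Hedged usage sketch for unique_index() (the data values are illustrative):
# indexing succeeds while keys are unique; a duplicate key is reported through
# Log.error with both conflicting records.
docs = [{"id": 1, "v": "a"}, {"id": 2, "v": "b"}]
by_id = unique_index(docs, keys="id")
# by_id[1] -> {"id": 1, "v": "a"}
# unique_index(docs + [{"id": 2, "v": "c"}], keys="id")  # would fail: id 2 maps to two values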
def _get_managed_instances(ec2_conn, name):
    requests = UniqueIndex(
        ["instance_id"],
        data=_get_managed_spot_requests(ec2_conn, name).filter(lambda r: r.instance_id != None)
    )
    reservations = ec2_conn.get_all_instances()

    output = []
    for res in reservations:
        for instance in res.instances:
            if instance.tags.get('Name', '').startswith(name) and instance._state.name == "running":
                instance.request = requests[instance.id]
                output.append(dictwrap(instance))
    return wrap(output)
def get_branches(self):
    if not self.settings.repo:
        from testlog_etl import etl_hg_branch

        return etl_hg_branch.get_branches(settings={"url": "https://hg.mozilla.org"})

    # TRY ES
    es = elasticsearch.Index(settings=self.settings.branches)
    query = {
        "query": {"match_all": {}},
        "size": 20000
    }
    docs = es.search(query).hits.hits._source
    for d in docs:
        d.name = d.name.lower()
    try:
        return UniqueIndex(["name", "locale"], data=docs, fail_on_dup=False)
    except Exception, e:
        Log.error("Bad branch in ES index", cause=e)
def get_branches(hg, branches, use_cache=True, settings=None):
    if not settings.branches or not use_cache:
        found_branches = _get_branches_from_hg(hg)

        es = elasticsearch.Cluster(settings=branches).get_or_create_index(settings=branches)
        es.add_alias()
        es.extend({"id": b.name + " " + b.locale, "value": b} for b in found_branches)
        es.flush()
        return found_branches

    # TRY ES
    try:
        es = elasticsearch.Cluster(settings=branches).get_index(settings=branches)
        query = {
            "query": {"match_all": {}},
            "size": 20000
        }
        docs = es.search(query).hits.hits._source

        # IF IT IS TOO OLD, THEN PULL FROM HG
        oldest = Date(Math.MAX(docs.etl.timestamp))
        if Date.now() - oldest > OLD_BRANCH:
            return get_branches(use_cache=False, settings=settings)

        try:
            return UniqueIndex(["name", "locale"], data=docs, fail_on_dup=False)
        except Exception, e:
            Log.error("Bad branch in ES index", cause=e)
    except Exception, e:
        if "Can not find index " in e:
            return get_branches(use_cache=False, settings=settings)
        Log.error("problem getting branches", cause=e)
def __sub__(self, other):
    output = UniqueIndex(self._keys)
    for v in self:
        if v not in other:
            output.add(v)
    return output
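# A short sketch (the import path is an assumption) exercising the __or__ and
# __sub__ operators above, mirroring the semantics test_double_key() relies on.
from pyLibrary.collections.unique_index import UniqueIndex

a = UniqueIndex(["name"], data=[{"name": "x"}, {"name": "y"}])
b = UniqueIndex(["name"], data=[{"name": "y"}, {"name": "z"}])

union = a | b        # one record per key: x, y, z
difference = a - b   # records of a whose key is absent from b: x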
def _get_branches_from_hg(settings):
    # GET MAIN PAGE
    response = http.get(settings.url)
    doc = BeautifulSoup(response.all_content)

    all_repos = doc("table")[1]
    branches = UniqueIndex(["name", "locale"], fail_on_dup=False)
    for i, r in enumerate(all_repos("tr")):
        dir, name = [v.text.strip() for v in r("td")]
        b = _get_single_branch_from_hg(settings, name, dir.lstrip("/"))
        branches.extend(b)

    # branches.add(set_default({"name": "release-mozilla-beta"}, branches["mozilla-beta", DEFAULT_LOCALE]))
    for b in list(branches["mozilla-beta", ]):
        branches.add(set_default({"name": "release-mozilla-beta"}, b))  # THIS IS THE l10n "name"
        b.url = "https://hg.mozilla.org/releases/mozilla-beta"  # THIS IS THE

    for b in list(branches["mozilla-release", ]):
        branches.add(set_default({"name": "release-mozilla-release"}, b))

    for b in list(branches["mozilla-aurora", ]):
        if b.locale == "en-US":
            continue
        branches.add(set_default({"name": "comm-aurora"}, b))
        # b.url = "https://hg.mozilla.org/releases/mozilla-aurora"

    for b in list(branches):
        if b.name.startswith("mozilla-esr"):
            branches.add(set_default({"name": "release-" + b.name}, b))  # THIS IS THE l10n "name"
            b.url = "https://hg.mozilla.org/releases/" + b.name

    # CHECKS
    for b in branches:
        if b.name != b.name.lower():
            Log.error("Expecting lowercase name")
        if not b.locale:
            Log.error("Not expected")
        if not b.url.startswith("http"):
            Log.error("Expecting a valid url")
        if not b.etl.timestamp:
            Log.error("Expecting a timestamp")

    return branches
def es_deepop(es, query):
    columns = query.frum.get_columns(query.frum.name)
    query_path = query.frum.query_path
    columns = UniqueIndex(
        keys=["name"],
        data=sorted(
            columns,
            lambda a, b: cmp(len(listwrap(b.nested_path)), len(listwrap(a.nested_path)))
        ),
        fail_on_dup=False
    )
    map_to_es_columns = {c.name: c.es_column for c in columns}
    map_to_local = {
        c.name: "_inner" + c.es_column[len(listwrap(c.nested_path)[0]):] if c.nested_path else "fields." + literal_field(c.es_column)
        for c in columns
    }
    # TODO: FIX THE GREAT SADNESS CAUSED BY EXECUTING post_expressions
    # THE EXPRESSIONS SHOULD BE PUSHED TO THE CONTAINER: ES ALLOWS
    # {"inner_hit": {"script_fields": [{"script": ""}...]}}, BUT THEN YOU
    # LOSE "_source" BUT GAIN "fields", FORCING ALL FIELDS TO BE EXPLICIT
    post_expressions = {}
    es_query, es_filters = es14.util.es_query_template(query.frum.name)

    # SPLIT WHERE CLAUSE BY DEPTH
    wheres = split_expression_by_depth(query.where, query.frum, map_to_es_columns)
    for i, f in enumerate(es_filters):
        # PROBLEM IS {"match_all": {}} DOES NOT SURVIVE set_default()
        for k, v in unwrap(simplify_esfilter(AndOp("and", wheres[i]).to_esfilter())).items():
            f[k] = v

    if not wheres[1]:
        more_filter = {
            "and": [
                simplify_esfilter(AndOp("and", wheres[0]).to_esfilter()),
                {"not": {
                    "nested": {
                        "path": query_path,
                        "filter": {"match_all": {}}
                    }
                }}
            ]
        }
    else:
        more_filter = None

    es_query.size = coalesce(query.limit, queries.query.DEFAULT_LIMIT)
    es_query.sort = jx_sort_to_es_sort(query.sort)
    es_query.fields = []

    is_list = isinstance(query.select, list)
    new_select = DictList()

    def get_pull(column):
        if column.nested_path:
            return "_inner" + column.es_column[len(listwrap(column.nested_path)[0]):]
        else:
            return "fields." + literal_field(column.es_column)

    i = 0
    for s in listwrap(query.select):
        if isinstance(s.value, LeavesOp):
            if isinstance(s.value.term, Variable):
                if s.value.term.var == ".":
                    # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
                    for c in columns:
                        if c.relative and c.type not in ["nested", "object"]:
                            if not c.nested_path:
                                es_query.fields += [c.es_column]
                            new_select.append({
                                "name": c.name,
                                "pull": get_pull(c),
                                "nested_path": listwrap(c.nested_path)[0],
                                "put": {"name": literal_field(c.name), "index": i, "child": "."}
                            })
                            i += 1

                    # REMOVE DOTS IN PREFIX IF NAME NOT AMBIGUOUS
                    col_names = [c.name for c in columns if c.relative]
                    for n in new_select:
                        if n.name.startswith("..") and n.name.lstrip(".") not in col_names:
                            n.name = n.put.name = n.name.lstrip(".")
                else:
                    column = s.value.term.var + "."
                    prefix = len(column)
                    for c in columns:
                        if c.name.startswith(column) and c.type not in ["object", "nested"]:
                            pull = get_pull(c)
                            if len(listwrap(c.nested_path)) == 0:
                                es_query.fields += [c.es_column]
                            new_select.append({
                                "name": s.name + "." + c.name[prefix:],
                                "pull": pull,
                                "nested_path": listwrap(c.nested_path)[0],
                                "put": {"name": s.name + "." + literal_field(c.name[prefix:]), "index": i, "child": "."}
                            })
                            i += 1
        elif isinstance(s.value, Variable):
            if s.value.var == ".":
                for c in columns:
                    if c.relative and c.type not in ["nested", "object"]:
                        if not c.nested_path:
                            es_query.fields += [c.es_column]
                        new_select.append({
                            "name": c.name,
                            "pull": get_pull(c),
                            "nested_path": listwrap(c.nested_path)[0],
                            "put": {"name": ".", "index": i, "child": c.es_column}
                        })
                        i += 1
            elif s.value.var == "_id":
                new_select.append({
                    "name": s.name,
                    "value": s.value.var,
                    "pull": "_id",
                    "put": {"name": s.name, "index": i, "child": "."}
                })
                i += 1
            else:
                column = columns[(s.value.var, )]
                parent = column.es_column + "."
                prefix = len(parent)
                net_columns = [c for c in columns if c.es_column.startswith(parent) and c.type not in ["object", "nested"]]
                if not net_columns:
                    pull = get_pull(column)
                    if not column.nested_path:
                        es_query.fields += [column.es_column]
                    new_select.append({
                        "name": s.name,
                        "pull": pull,
                        "nested_path": listwrap(column.nested_path)[0],
                        "put": {"name": s.name, "index": i, "child": "."}
                    })
                else:
                    done = set()
                    for n in net_columns:
                        # THE COLUMNS CAN HAVE DUPLICATE REFERENCES TO THE SAME ES_COLUMN
                        if n.es_column in done:
                            continue
                        done.add(n.es_column)
                        pull = get_pull(n)
                        if not n.nested_path:
                            es_query.fields += [n.es_column]
                        new_select.append({
                            "name": s.name,
                            "pull": pull,
                            "nested_path": listwrap(n.nested_path)[0],
                            "put": {"name": s.name, "index": i, "child": n.es_column[prefix:]}
                        })
                i += 1
        else:
            expr = s.value
            for v in expr.vars():
                for n in columns:
                    if n.name == v:
                        if not n.nested_path:
                            es_query.fields += [n.es_column]
            pull = EXPRESSION_PREFIX + s.name
            post_expressions[pull] = compile_expression(expr.map(map_to_local).to_python())
            new_select.append({
                "name": s.name if is_list else ".",
                "pull": pull,
                "value": expr.to_dict(),
                "put": {"name": s.name, "index": i, "child": "."}
            })
            i += 1

    # <COMPLICATED> ES needs two calls to get all documents
    more = []

    def get_more(please_stop):
        more.append(es09.util.post(
            es,
            Dict(filter=more_filter, fields=es_query.fields),
            query.limit
        ))

    if more_filter:
        need_more = Thread.run("get more", target=get_more)

    with Timer("call to ES") as call_timer:
        data = es09.util.post(es, es_query, query.limit)

    # EACH HIT IS RETURNED MULTIPLE TIMES, ONCE FOR EACH INNER HIT, WITH THE INNER HIT INCLUDED
    def inners():
        for t in data.hits.hits:
            for i in t.inner_hits[literal_field(query_path)].hits.hits:
                t._inner = i._source
                for k, e in post_expressions.items():
                    t[k] = e(t)
                yield t
        if more_filter:
            Thread.join(need_more)
            for t in more[0].hits.hits:
                yield t
    # </COMPLICATED>

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        output = formatter(inners(), new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception, e:
        Log.error("problem formatting", e)
def __init__(self, **desc):
    Domain.__init__(self, **desc)
    desc = wrap(desc)

    self.type = "set"
    self.order = {}
    self.NULL = Null
    self.partitions = DictList()
    self.primitive = True  # True IF DOMAIN IS A PRIMITIVE VALUE SET

    if isinstance(self.key, set):
        Log.error("problem")

    if not desc.key and isinstance(desc.partitions[0], (basestring, Number)):
        # ASSUME PARTS ARE STRINGS, CONVERT TO REAL PART OBJECTS
        self.key = "value"
        self.map = {}
        self.order[None] = len(desc.partitions)
        for i, p in enumerate(desc.partitions):
            part = {"name": p, "value": p, "dataIndex": i}
            self.partitions.append(part)
            self.map[p] = part
            self.order[p] = i
        self.label = coalesce(self.label, "name")
        self.primitive = True
        return

    if desc.partitions and desc.dimension.fields and len(desc.dimension.fields) > 1:
        self.key = desc.key
        self.map = UniqueIndex(keys=desc.dimension.fields)
    elif desc.partitions and isinstance(desc.key, (list, set)):
        # TODO: desc.key CAN BE MUCH LIKE A SELECT, WHICH UniqueIndex CAN NOT HANDLE
        self.key = desc.key
        self.map = UniqueIndex(keys=desc.key)
    elif desc.partitions and isinstance(desc.partitions[0][desc.key], Mapping):
        self.key = desc.key
        self.map = UniqueIndex(keys=desc.key)
        # self.key = UNION(set(d[desc.key].keys()) for d in desc.partitions)
        # self.map = UniqueIndex(keys=self.key)
    elif len(desc.partitions) == 0:
        # CREATE AN EMPTY DOMAIN
        self.key = "value"
        self.map = {}
        self.order[None] = 0
        self.label = coalesce(self.label, "name")
        return
    elif desc.key == None:
        if desc.partitions and len(set(desc.partitions.value)) == len(desc.partitions):
            # TRY A COMMON KEY CALLED "value".  IT APPEARS UNIQUE
            self.key = "value"
            self.map = dict()
            self.map[None] = self.NULL
            self.order[None] = len(desc.partitions)
            for i, p in enumerate(desc.partitions):
                self.map[p[self.key]] = p
                self.order[p[self.key]] = i
            self.primitive = False
        else:
            Log.error("Domains must have keys")
    elif self.key:
        self.key = desc.key
        self.map = dict()
        self.map[None] = self.NULL
        self.order[None] = len(desc.partitions)
        for i, p in enumerate(desc.partitions):
            self.map[p[self.key]] = p
            self.order[p[self.key]] = i
        self.primitive = False
    elif all(p.esfilter for p in self.partitions):
        # EVERY PART HAS AN esfilter DEFINED, SO USE THEM
        for i, p in enumerate(self.partitions):
            p.dataIndex = i
    else:
        Log.error("Can not handle")

    self.label = coalesce(self.label, "name")

    if hasattr(desc.partitions, "__iter__"):
        self.partitions = list(desc.partitions)
    else:
        Log.error("expecting a list of partitions")