def format_table(T, select, query=None):
    """
    CONVERT THE ROWS OF T INTO TABLE FORM (header + data rows)

    EACH s IN select DESCRIBES ONE OUTPUT COLUMN:
        s.pull       - PATH INTO THE SOURCE ROW
        s.put.index  - DESTINATION COLUMN NUMBER
        s.put.child  - "." FOR A SCALAR CELL, OTHERWISE A PROPERTY NAME
                       PLACED INSIDE A Dict CELL
    """
    num_columns = Math.MAX(select.put.index) + 1

    data = []
    for row in T:
        record = [None] * num_columns
        for s in select:
            value = unwraplist(row[s.pull])
            if value == None:  # PROJECT CONVENTION: == None MATCHES Null TOO
                continue
            column, child = s.put.index, s.put.child
            if child == ".":
                record[column] = value
            else:
                # MULTIPLE SELECTS MAY SHARE ONE COLUMN; COLLECT THEM IN A Dict
                if record[column] is None:
                    record[column] = Dict()
                record[column][child] = value
        data.append(record)

    # FIRST NAME SEEN FOR A COLUMN WINS
    header = [None] * num_columns
    for s in select:
        if header[s.put.index]:
            continue
        header[s.put.index] = s.name

    return Dict(
        meta={"format": "table"},
        header=header,
        data=data
    )
def get_decoders_by_depth(query):
    """
    RETURN A LIST OF DECODER ARRAYS, ONE ARRAY FOR EACH NESTED DEPTH

    EACH EDGE (OR groupby) HAS ITS VARIABLES MAPPED TO es_column NAMES,
    THEN IS ASSIGNED TO THE output ARRAY MATCHING ITS NESTING DEPTH.
    """
    schema = query.frum
    output = DictList()
    for e in wrap(coalesce(query.edges, query.groupby, [])):
        if e.value != None and not isinstance(e.value, NullOp):
            # SIMPLE EXPRESSION EDGE: MAP VARIABLES TO ES COLUMNS
            e = e.copy()
            vars_ = e.value.vars()
            for v in vars_:
                if not schema[v]:
                    Log.error("{{var}} does not exist in schema", var=v)
            e.value = e.value.map({schema[v].name: schema[v].es_column for v in vars_})
        elif e.range:
            # RANGE EDGE: MAP VARIABLES IN BOTH min AND max EXPRESSIONS
            e = e.copy()
            min_ = e.range.min
            max_ = e.range.max
            vars_ = min_.vars() | max_.vars()
            for v in vars_:
                if not schema[v]:
                    Log.error("{{var}} does not exist in schema", var=v)
            map_ = {schema[v].name: schema[v].es_column for v in vars_}
            e.range = {
                "min": min_.map(map_),
                "max": max_.map(map_)
            }
        elif e.domain.dimension:
            # DIMENSION EDGE: MAP ITS FIELD LIST DIRECTLY
            vars_ = e.domain.dimension.fields
            e.domain.dimension = e.domain.dimension.copy()
            e.domain.dimension.fields = [schema[v].es_column for v in vars_]
        elif all(e.domain.partitions.where):
            # PARTITIONED DOMAIN: COLLECT VARIABLES FROM EVERY PARTITION FILTER
            vars_ = set()
            for p in e.domain.partitions:
                vars_ |= p.where.vars()

        try:
            depths = set(len(schema[v].nested_path) - 1 for v in vars_)
            if -1 in depths:
                Log.error(
                    "Do not know of column {{column}}",
                    column=unwraplist([v for v in vars_ if schema[v] == None])
                )
            if len(depths) > 1:
                Log.error("expression {{expr}} spans tables, can not handle", expr=e.value)
            max_depth = Math.MAX(depths)
            while len(output) <= max_depth:
                output.append([])
        except Exception:
            # BUG FIX: WAS `except Exception, e`, WHICH RE-BOUND THE LOOP
            # VARIABLE e TO THE EXCEPTION, SO AggsDecoder RECEIVED THE
            # EXCEPTION INSTEAD OF THE EDGE ON THIS FALLBACK PATH
            # USUALLY THE SCHEMA IS EMPTY, SO WE ASSUME THIS IS A SIMPLE QUERY
            max_depth = 0
            output.append([])

        limit = 0
        output[max_depth].append(AggsDecoder(e, query, limit))
    # BUG FIX: THE ORIGINAL NEVER RETURNED output, LEAVING CALLERS WITH None
    return output
def diff(settings, please_stop=None):
    """
    QUEUE THE S3 KEYS THAT ARE MISSING FROM ELASTICSEARCH SO THEY GET
    INSERTED
    """
    # EVERYTHING FROM ELASTICSEARCH
    destination = MultiDayIndex(settings.elasticsearch, queue_size=100000)

    keys_in_es = get_all_in_es(destination)
    missing_keys = get_all_s3(keys_in_es, settings)

    # IGNORE THE 500 MOST RECENT BLOCKS, BECAUSE THEY ARE PROBABLY NOT DONE
    missing_keys = missing_keys[500:500 + settings.limit:]

    Log.note(
        "Queueing {{num}} keys (from {{min}} to {{max}}) for insertion to ES",
        num=len(missing_keys),
        min=Math.MIN(missing_keys),
        max=Math.MAX(missing_keys)
    )

    queue = aws.Queue(settings=settings.work_queue)
    queue.extend(missing_keys)
def get_branches(hg, branches, use_cache=True, settings=None):
    """
    RETURN THE KNOWN BRANCHES, UNIQUE BY (name, locale)

    :param hg: SETTINGS FOR REACHING THE hg SERVER
    :param branches: SETTINGS FOR THE ES INDEX USED AS A CACHE
    :param use_cache: False TO FORCE A PULL FROM hg
    :param settings: WRAPPER SETTINGS (settings.branches GATES CACHE USE)
    """
    if not settings.branches or not use_cache:
        # PULL FRESH FROM hg, THEN WRITE THROUGH TO THE ES CACHE
        found_branches = _get_branches_from_hg(hg)

        es = elasticsearch.Cluster(settings=branches).get_or_create_index(settings=branches)
        es.add_alias()
        es.extend({
            "id": b.name + " " + b.locale,
            "value": b
        } for b in found_branches)
        es.flush()
        return found_branches

    # TRY ES
    try:
        es = elasticsearch.Cluster(settings=branches).get_index(settings=branches)
        query = {
            "query": {"match_all": {}},
            "size": 20000
        }
        docs = es.search(query).hits.hits._source

        # IF IT IS TOO OLD, THEN PULL FROM HG
        # NOTE: `oldest` IS REALLY THE NEWEST TIMESTAMP; WE REFRESH WHEN
        # EVEN THE NEWEST CACHED DOC IS STALE
        oldest = Date(Math.MAX(docs.etl.timestamp))
        if Date.now() - oldest > OLD_BRANCH:
            # BUG FIX: RECURSIVE CALL WAS MISSING THE REQUIRED hg AND
            # branches POSITIONAL ARGUMENTS, RAISING TypeError
            return get_branches(hg, branches, use_cache=False, settings=settings)

        try:
            return UniqueIndex(["name", "locale"], data=docs, fail_on_dup=False)
        except Exception as e:
            Log.error("Bad branch in ES index", cause=e)
    except Exception as e:
        if "Can not find index " in e:
            # CACHE INDEX DOES NOT EXIST YET; BUILD IT FROM hg
            # (BUG FIX: SAME MISSING-ARGUMENT DEFECT AS ABOVE)
            return get_branches(hg, branches, use_cache=False, settings=settings)
        Log.error("problem getting branches", cause=e)