def window(data, param):
    """
    MAYBE WE CAN DO THIS WITH NUMPY (no, the edges of windows are not graceful with numpy)

    ASSIGN A CALCULATED (AND OPTIONALLY WINDOW-AGGREGATED) VALUE TO EVERY RECORD, IN PLACE

    data - list of records
    param - description of the window; the attributes read here are
        name      - column to assign window function result
        edges     - columns to group by (edges.value is handed to groupby)
        where     - filter applied to data before anything else
        sort      - columns to sort by
        value     - expression converted to a function called as f(record, rownum, rows)
        aggregate - WindowFunction to apply; called with no arguments, the result is
                    driven through add(), sub() and end()
        range     - of form {"min":-10, "max":0} to specify the size and relative
                    position of window ("start"/"stop" are the fallbacks for min/max)

    RETURNS None - results are written into the records
    """
    name = param.name
    edges = param.edges
    where = param.where
    sortColumns = param.sort
    calc_value = wrap_function(qb_expression_to_function(param.value))
    aggregate = param.aggregate
    _range = param.range

    def _assign(rows, column):
        # WRITE THE CALCULATED VALUE OF EVERY ROW INTO column
        for rownum, r in enumerate(rows):
            r[column] = calc_value(r, rownum, rows)

    data = filter(data, where)

    if not aggregate and not edges:
        # SIMPLE CALCULATED VALUE
        if sortColumns:
            data = sort(data, sortColumns)
        _assign(data, name)
        return

    if not aggregate or aggregate == "none":
        # CALCULATED VALUE, BUT ROW NUMBERS ARE RELATIVE TO EACH SORTED GROUP
        for _, values in groupby(data, edges.value):
            if not values:
                continue  # CAN DO NOTHING WITH THIS ZERO-SAMPLE
            _assign(sort(values, sortColumns), name)
        return

    for _, values in groupby(data, edges.value):
        if not values:
            continue  # CAN DO NOTHING WITH THIS ZERO-SAMPLE

        sequence = sort(values, sortColumns)
        _assign(sequence, "__temp__")

        head = coalesce(_range.max, _range.stop)
        tail = coalesce(_range.min, _range.start)

        # PRELOAD total
        # NOTE(review): indexes outside the sequence are used here and below;
        # presumably the sorted sequence tolerates them - confirm
        total = aggregate()
        for offset in range(tail, head):
            total.add(sequence[offset].__temp__)

        # WINDOW FUNCTION APPLICATION: RECORD THE RESULT, THEN SLIDE THE WINDOW ONE ROW
        for position, r in enumerate(sequence):
            r[name] = total.end()
            total.add(sequence[position + head].__temp__)
            total.sub(sequence[position + tail].__temp__)

    # CLEANUP
    for r in data:
        r["__temp__"] = None
def filter(data, where):
    """
    RETURN THE RECORDS OF data THAT SATISFY where

    data  - a Container (delegates to data.filter), or a list/set of records
    where - filter expression; for lists and sets it is converted to a function that
            accepts (record, rownum, rows) and returns boolean

    data IS RETURNED UNTOUCHED WHEN IT IS EMPTY, OR WHEN where IS None OR TRUE_FILTER
    """
    nothing_to_do = len(data) == 0 or where == None or where == TRUE_FILTER
    if nothing_to_do:
        return data

    if isinstance(data, Container):
        return data.filter(where)

    if not isinstance(data, (list, set)):
        Log.error("Do not know how to handle type {{type}}", type=data.__class__.__name__)

        # ONLY REACHED IF Log.error RETURNS
        # NOTE(review): presumably Log.error raises, making this fallback dead code - confirm
        try:
            return drill_filter(where, data)
        except Exception:
            # WOW!  THIS IS INEFFICIENT!
            as_objects = [DictObject(d) for d in data]
            return wrap([unwrap(d) for d in drill_filter(where, as_objects)])

    predicate = qb_expression_to_function(where)
    all_rows = wrap(data)
    return [
        record
        for rownum, record in enumerate(data)
        if predicate(wrap(record), rownum, all_rows)
    ]
def list_aggs(frum, query):
    """
    AGGREGATE THE ROWS OF frum INTO A Cube

    frum  - rows to aggregate (wrapped on entry); frum.select() supplies the values
            used to build missing edge domains
    query - reads query.select, query.edges and query.where

    SIDE EFFECTS: edges with a DefaultDomain get a SimpleSetDomain assigned, and every
    select clause gets its compiled expression stored under "exec"

    RETURNS Cube(select, query.edges, result) WITH ONE Matrix PER SELECT CLAUSE
    """
    frum = wrap(frum)
    select = listwrap(query.select)

    # EDGES WITHOUT AN EXPLICIT DOMAIN GET ONE BUILT FROM THE DISTINCT VALUES SEEN
    for edge in query.edges:
        if isinstance(edge.domain, DefaultDomain):
            distinct = set(frum.select(edge))
            edge.domain = SimpleSetDomain(partitions=list(sorted(distinct)))

    for s in listwrap(query.select):
        s["exec"] = qb_expression_to_function(s.value)

    # ONE MATRIX PER SELECT CLAUSE; ONE DIMENSION PER EDGE (PLUS A SLOT WHEN allowNulls)
    result = {}
    for s in select:
        dims = [len(e.domain.partitions) + (1 if e.allowNulls else 0) for e in query.edges]
        result[s.name] = Matrix(dims=dims, zeros=s.aggregate == "count")

    where = qb_expression_to_function(query.where)
    # NOTE(review): filter is called with the predicate first and the rows second;
    # confirm which filter is in scope at this point
    for d in filter(where, frum):
        d = d.copy()
        # LIST OF MATCHING COORDINATE FAMILIES, USUALLY ONLY ONE PER FAMILY BUT JOINS WITH EDGES CAN CAUSE MORE
        coord = [get_matches(e, d) for e in query.edges]

        for s in select:
            mat = result[s.name]
            agg = s.aggregate
            var = s.value

            if agg == "count":
                count_every_row = var == "." or var == None
                for c in itertools.product(*coord):
                    if count_every_row:
                        mat[c] += 1
                        continue
                    for e, cc in zip(query.edges, c):
                        d[e.name] = cc
                    val = s["exec"](d, c, frum)
                    if val != None:
                        mat[c] += 1
                continue

            for c in itertools.product(*coord):
                acc = mat[c]
                if acc == None:
                    # FIRST VALUE FOR THIS CELL: MAKE ITS ACCUMULATOR
                    factory = windows.name2accumulator.get(agg)
                    if factory == None:
                        Log.error("select aggregate {{agg}} is not recognized", agg= agg)
                    acc = factory(**s)
                    mat[c] = acc
                # BECAUSE WE DO NOT KNOW IF s.exec NEEDS THESE EDGES, SO WE PASS THEM ANYWAY
                for e, cc in zip(query.edges, c):
                    d[e.name] = e.domain.partitions[cc]
                acc.add(s["exec"](d, c, frum))

    # REPLACE ACCUMULATORS WITH THEIR FINAL VALUES (COUNTS ARE ALREADY PLAIN NUMBERS)
    for s in select:
        if s.aggregate == "count":
            continue
        m = result[s.name]
        for c, cell in m.items():
            if cell != None:
                m[c] = cell.end()

    return Cube(select, query.edges, result)
def cube_aggs(frum, query):
    """
    AGGREGATE THE CELLS OF CUBE frum INTO A NEW Cube

    frum  - source cube; reads frum.select, frum.edges and frum.values()
    query - reads query.select, query.edges and query.where

    SIDE EFFECTS: query edges are updated in place (value, and domain when it was a
    DefaultDomain); a row gets its edge value filled in when exactly one partition
    matches and the row had none

    RETURNS Cube(select, query.edges, result) WITH ONE Matrix PER SELECT CLAUSE
    """
    select = listwrap(query.select)

    # MATCH EDGES IN QUERY TO ONES IN frum
    for e in query.edges:
        for fs in frum.select:
            if fs.name == e.value:
                Log.error("Not implemented yet")

        # DEFAULT DOMAINS CAN EASILY BE LOOKED UP FROM frum
        take_domain_from_frum = isinstance(e.domain, DefaultDomain)
        for fe in frum.edges:
            if fe.name == e.value:
                if take_domain_from_frum:
                    e.domain = SimpleSetDomain(**fe.domain.as_dict())
                e.value = e.value + "." + fe.domain.key
                break

    # ONE MATRIX PER SELECT CLAUSE; ONE DIMENSION PER EDGE (PLUS A SLOT WHEN allowNulls)
    result = {
        s.name: Matrix(
            dims=[len(e.domain.partitions) + (1 if e.allowNulls else 0) for e in query.edges],
            zeros=s.aggregate == "count"
        )
        for s in select
    }

    # COMPILE EACH SELECT EXPRESSION ONCE, INSTEAD OF ONCE PER ROW PER SELECT CLAUSE
    compiled = [qb_expression_to_function(s.value) for s in select]

    where = qb_expression_to_function(query.where)
    # NOTE(review): filter is called with the predicate first and the rows second;
    # confirm which filter is in scope at this point
    for d in filter(where, frum.values()):
        coord = []  # LIST OF MATCHING COORDINATE FAMILIES, USUALLY ONLY ONE PER FAMILY BUT JOINS WITH EDGES CAN CAUSE MORE
        for e in query.edges:
            matches = get_matches(e, d)
            coord.append(matches)
            if len(matches) == 1 and d[e.name] == None:
                d[e.name] = e.domain.partitions[matches[0]]

        for s, expr in zip(select, compiled):
            mat = result[s.name]
            agg = s.aggregate
            var = s.value
            val = expr(d)

            if agg == "count":
                # COUNT EVERY ROW WHEN NO VALUE IS NAMED, OTHERWISE ONLY NON-NULL VALUES
                if var == "." or var == None or val != None:
                    for c in itertools.product(*coord):
                        mat[c] += 1
                continue

            for c in itertools.product(*coord):
                acc = mat[c]
                if acc == None:
                    # FIRST VALUE FOR THIS CELL: MAKE ITS ACCUMULATOR
                    acc = windows.name2accumulator.get(agg)
                    if acc == None:
                        Log.error("select aggregate {{agg}} is not recognized", agg= agg)
                    acc = acc(**s)
                    mat[c] = acc
                acc.add(val)

    # REPLACE ACCUMULATORS WITH THEIR FINAL VALUES (COUNTS ARE ALREADY PLAIN NUMBERS)
    for s in select:
        if s.aggregate == "count":
            continue
        m = result[s.name]
        for c, var in m.items():
            if var != None:
                m[c] = var.end()

    return Cube(select, query.edges, result)
def cube_aggs(frum, query):
    """
    AGGREGATE THE CELLS OF CUBE frum INTO A NEW Cube

    frum  - source cube; reads frum.select, frum.edges and frum.values()
    query - reads query.select, query.edges and query.where

    SIDE EFFECTS: query edges are updated in place (value, and domain when it was a
    DefaultDomain); a row gets its edge value filled in when exactly one partition
    matches and the row had none

    RETURNS Cube(select, query.edges, result) WITH ONE Matrix PER SELECT CLAUSE
    """
    select = listwrap(query.select)

    # MATCH EDGES IN QUERY TO ONES IN frum
    for e in query.edges:
        for fs in frum.select:
            if fs.name == e.value:
                Log.error("Not implemented yet")

        # DEFAULT DOMAINS CAN EASILY BE LOOKED UP FROM frum
        default_domain = isinstance(e.domain, DefaultDomain)
        for fe in frum.edges:
            if fe.name != e.value:
                continue
            if default_domain:
                e.domain = SimpleSetDomain(**fe.domain.as_dict())
            e.value = e.value + "." + fe.domain.key
            break

    # ONE MATRIX PER SELECT CLAUSE; ONE DIMENSION PER EDGE (PLUS A SLOT WHEN allowNulls)
    result = {
        s.name: Matrix(
            dims=[
                len(e.domain.partitions) + (1 if e.allowNulls else 0)
                for e in query.edges
            ],
            zeros=s.aggregate == "count")
        for s in select
    }

    # EXPRESSIONS DO NOT CHANGE FROM ROW TO ROW, SO COMPILE THEM ONCE UP FRONT
    value_of = [qb_expression_to_function(s.value) for s in select]

    where = qb_expression_to_function(query.where)
    # NOTE(review): filter is called with the predicate first and the rows second;
    # confirm which filter is in scope at this point
    for d in filter(where, frum.values()):
        # LIST OF MATCHING COORDINATE FAMILIES, USUALLY ONLY ONE PER FAMILY BUT JOINS WITH EDGES CAN CAUSE MORE
        coord = []
        for e in query.edges:
            matches = get_matches(e, d)
            coord.append(matches)
            if len(matches) == 1 and d[e.name] == None:
                d[e.name] = e.domain.partitions[matches[0]]

        for s, expr in zip(select, value_of):
            mat = result[s.name]
            agg = s.aggregate
            var = s.value
            val = expr(d)

            if agg == "count":
                # COUNT EVERY ROW WHEN NO VALUE IS NAMED, OTHERWISE ONLY NON-NULL VALUES
                if var == "." or var == None or val != None:
                    for c in itertools.product(*coord):
                        mat[c] += 1
                continue

            for c in itertools.product(*coord):
                acc = mat[c]
                if acc == None:
                    # FIRST VALUE FOR THIS CELL: MAKE ITS ACCUMULATOR
                    acc = windows.name2accumulator.get(agg)
                    if acc == None:
                        Log.error(
                            "select aggregate {{agg}} is not recognized",
                            agg=agg)
                    acc = acc(**s)
                    mat[c] = acc
                acc.add(val)

    # REPLACE ACCUMULATORS WITH THEIR FINAL VALUES (COUNTS ARE ALREADY PLAIN NUMBERS)
    for s in select:
        if s.aggregate == "count":
            continue
        m = result[s.name]
        for c, var in m.items():
            if var != None:
                m[c] = var.end()

    return Cube(select, query.edges, result)
def sort(data, fieldnames=None):
    """
    PASS A FIELD NAME, OR LIST OF FIELD NAMES, OR LIST OF STRUCTS WITH {"field":field_name, "sort":direction}

    data       - iterable of records; None gives Null
    fieldnames - what to sort by; when empty the records themselves are compared
                 with value_compare

    RETURNS A NEW SORTED SEQUENCE; data ITSELF IS NOT REORDERED
    NOTE: a sort descriptor passed as a struct is normalized in place (its sort and
    value attributes are overwritten)
    NOTE: no outer error handler is installed here; errors propagate to the caller
    """
    if data == None:
        return Null

    if not fieldnames:
        return wrap(sorted(data, value_compare))

    fieldnames = listwrap(fieldnames)
    if len(fieldnames) == 1:
        # SPECIAL CASE, ONLY ONE FIELD TO SORT BY
        fieldnames = fieldnames[0]
        if fieldnames == ".":
            return wrap(sorted(data))
        if isinstance(fieldnames, (basestring, int)):
            fieldnames = wrap({"value": fieldnames, "sort": 1})

        # EXPECTING {"field":f, "sort":i} FORMAT
        fieldnames.sort = sort_direction.get(fieldnames.sort, 1)
        fieldnames.value = coalesce(fieldnames.value, fieldnames.field)
        if fieldnames.value == None:
            Log.error("Expecting sort to have 'value' attribute")

        if fieldnames.value == ".":
            # VALUE COMPARE
            def _compare_v(l, r):
                return value_compare(l, r, fieldnames.sort)

            return DictList([unwrap(d) for d in sorted(data, cmp=_compare_v)])
        elif isinstance(fieldnames.value, Mapping):
            # THE SORT VALUE IS AN EXPRESSION, COMPILE IT ONCE
            func = qb_expression_to_function(fieldnames.value)

            def _compare_o(left, right):
                return value_compare(func(coalesce(left)), func(coalesce(right)), fieldnames.sort)

            return DictList([unwrap(d) for d in sorted(data, cmp=_compare_o)])
        else:
            # THE SORT VALUE IS A PLAIN FIELD NAME (OR INDEX)
            def _compare_o(left, right):
                return value_compare(coalesce(left)[fieldnames.value], coalesce(right)[fieldnames.value], fieldnames.sort)

            return DictList([unwrap(d) for d in sorted(data, cmp=_compare_o)])

    # MANY FIELDS: COMPARE FIELD BY FIELD UNTIL ONE OF THEM DIFFERS
    formal = query._normalize_sort(fieldnames)
    for f in formal:
        f.func = qb_expression_to_function(f.value)

    def comparer(left, right):
        left = coalesce(left)
        right = coalesce(right)
        for f in formal:
            try:
                result = value_compare(f.func(left), f.func(right), f.sort)
                if result != 0:
                    return result
            except Exception as e:
                Log.error("problem with compare", e)
        return 0

    if isinstance(data, list):
        output = DictList([unwrap(d) for d in sorted(data, cmp=comparer)])
    elif hasattr(data, "__iter__"):
        output = DictList([unwrap(d) for d in sorted(list(data), cmp=comparer)])
    else:
        Log.error("Do not know how to handle")
        output = None

    return output
def window(data, param):
    """
    MAYBE WE CAN DO THIS WITH NUMPY (no, the edges of windows are not graceful with numpy)

    ASSIGN A CALCULATED (AND OPTIONALLY WINDOW-AGGREGATED) VALUE TO EVERY RECORD, IN PLACE

    data - list of records
    param - description of the window; the attributes read here are
        name      - column to assign window function result
        edges     - columns to group by (edges.value is handed to groupby)
        where     - filter applied to data before anything else
        sort      - columns to sort by
        value     - expression converted to a function called as f(record, rownum, rows)
        aggregate - WindowFunction to apply; called with no arguments, the result is
                    driven through add(), sub() and end()
        range     - of form {"min":-10, "max":0} to specify the size and relative
                    position of window ("start"/"stop" are the fallbacks for min/max)

    RETURNS None - results are written into the records
    """
    name = param.name
    edges = param.edges
    where = param.where
    sortColumns = param.sort
    calc_value = wrap_function(qb_expression_to_function(param.value))
    aggregate = param.aggregate
    _range = param.range

    def _fill(rows, column):
        # STORE calc_value OF EACH ROW UNDER column
        for rownum, r in enumerate(rows):
            r[column] = calc_value(r, rownum, rows)

    data = filter(data, where)

    if not aggregate and not edges:
        # SIMPLE CALCULATED VALUE
        if sortColumns:
            data = sort(data, sortColumns)
        _fill(data, name)
        return

    if not aggregate or aggregate == "none":
        # NO AGGREGATION: CALCULATE PER ROW, WITH ROW NUMBERS RELATIVE TO THE SORTED GROUP
        for _, values in groupby(data, edges.value):
            if not values:
                continue  # CAN DO NOTHING WITH THIS ZERO-SAMPLE
            _fill(sort(values, sortColumns), name)
        return

    for _, values in groupby(data, edges.value):
        if not values:
            continue  # CAN DO NOTHING WITH THIS ZERO-SAMPLE

        sequence = sort(values, sortColumns)
        _fill(sequence, "__temp__")

        head = coalesce(_range.max, _range.stop)
        tail = coalesce(_range.min, _range.start)

        # PRELOAD total
        # NOTE(review): indexes outside the sequence are used here and below;
        # presumably the sorted sequence tolerates them - confirm
        total = aggregate()
        for k in range(tail, head):
            total.add(sequence[k].__temp__)

        # WINDOW FUNCTION APPLICATION: RECORD THE RESULT, THEN SLIDE THE WINDOW ONE ROW
        for k, r in enumerate(sequence):
            r[name] = total.end()
            total.add(sequence[k + head].__temp__)
            total.sub(sequence[k + tail].__temp__)

    # CLEANUP
    for r in data:
        r["__temp__"] = None
def list_aggs(frum, query):
    """
    AGGREGATE THE ROWS OF frum INTO A Cube

    frum  - rows to aggregate; frum.select() supplies the values used to build
            missing edge domains
    query - reads query.select, query.edges and query.where

    SIDE EFFECTS: edges with a DefaultDomain get a SimpleSetDomain assigned, and every
    select clause gets its compiled expression stored under "exec"

    RETURNS Cube(select, query.edges, result) WITH ONE Matrix PER SELECT CLAUSE
    """
    select = listwrap(query.select)

    # EDGES WITHOUT AN EXPLICIT DOMAIN GET ONE BUILT FROM THE DISTINCT VALUES SEEN
    for edge in query.edges:
        if isinstance(edge.domain, DefaultDomain):
            distinct = set(frum.select(edge.value))
            edge.domain = SimpleSetDomain(partitions=list(sorted(distinct)))

    for s in listwrap(query.select):
        s["exec"] = qb_expression_to_function(s.value)

    # ONE MATRIX PER SELECT CLAUSE; ONE DIMENSION PER EDGE (PLUS A SLOT WHEN allowNulls)
    result = {}
    for s in select:
        dims = [
            len(e.domain.partitions) + (1 if e.allowNulls else 0)
            for e in query.edges
        ]
        result[s.name] = Matrix(dims=dims, zeros=s.aggregate == "count")

    where = qb_expression_to_function(query.where)
    # NOTE(review): filter is called with the predicate first and the rows second;
    # confirm which filter is in scope at this point
    for d in filter(where, frum):
        d = d.copy()
        # LIST OF MATCHING COORDINATE FAMILIES, USUALLY ONLY ONE PER FAMILY BUT JOINS WITH EDGES CAN CAUSE MORE
        coord = [get_matches(e, d) for e in query.edges]

        for s in select:
            mat = result[s.name]
            agg = s.aggregate
            var = s.value

            if agg == "count":
                count_every_row = var == "." or var == None
                for c in itertools.product(*coord):
                    if count_every_row:
                        mat[c] += 1
                        continue
                    for e, cc in zip(query.edges, c):
                        d[e.name] = cc
                    val = s["exec"](d, c, frum)
                    if val != None:
                        mat[c] += 1
                continue

            for c in itertools.product(*coord):
                acc = mat[c]
                if acc == None:
                    # FIRST VALUE FOR THIS CELL: MAKE ITS ACCUMULATOR
                    factory = windows.name2accumulator.get(agg)
                    if factory == None:
                        Log.error(
                            "select aggregate {{agg}} is not recognized",
                            agg=agg)
                    acc = factory(**s)
                    mat[c] = acc
                # BECAUSE WE DO NOT KNOW IF s.exec NEEDS THESE EDGES, SO WE PASS THEM ANYWAY
                for e, cc in zip(query.edges, c):
                    d[e.name] = e.domain.partitions[cc]
                acc.add(s["exec"](d, c, frum))

    # REPLACE ACCUMULATORS WITH THEIR FINAL VALUES (COUNTS ARE ALREADY PLAIN NUMBERS)
    for s in select:
        if s.aggregate == "count":
            continue
        m = result[s.name]
        for c, cell in m.items():
            if cell != None:
                m[c] = cell.end()

    return Cube(select, query.edges, result)