Python qb_expression_to_function示例，pyLibrary.queries.expressions.qb_expression_to_function Python示例

示例#1

0

显示文件

文件： qb.py 项目： klahnakoski/MoDevETL

def window(data, param):
    """
    ASSIGN A CALCULATED VALUE, OR A SLIDING-WINDOW AGGREGATE, TO EVERY RECORD (IN PLACE)

    data  - list of records
    param - window description; the attributes read are name, edges, where,
            sort, value, aggregate and range
    RETURNS None: RESULTS ARE WRITTEN TO record[param.name]

    (NUMPY WAS CONSIDERED FOR THIS, BUT THE EDGES OF WINDOWS ARE NOT GRACEFUL WITH NUMPY)
    """
    # function called as (record, rownum, rows) that returns the value to assign/aggregate
    calc_value = wrap_function(qb_expression_to_function(param.value))
    target = param.name            # column to assign window function result
    group_by = param.edges         # columns to group by
    ordering = param.sort          # columns to sort by
    accumulator = param.aggregate  # WindowFunction to apply
    span = param.range             # of form {"min":-10, "max":0} to specify the size and relative position of window

    def _assign(rows, column):
        # WRITE THE CALCULATED VALUE OF EVERY ROW INTO column
        for position, record in enumerate(rows):
            record[column] = calc_value(record, position, rows)

    # ONLY THE ROWS RETURNED BY filter(data, where) ARE PROCESSED
    data = filter(data, param.where)

    if not accumulator and not group_by:
        # SIMPLE CALCULATED VALUE, NO GROUPING AND NO AGGREGATION
        _assign(sort(data, ordering) if ordering else data, target)
        return

    if not accumulator or accumulator == "none":
        # CALCULATED VALUE PER GROUP, ROW NUMBERS RESTART IN EACH GROUP
        for _, members in groupby(data, group_by.value):
            if members:  # CAN DO NOTHING WITH A ZERO-SAMPLE
                _assign(sort(members, ordering), target)
        return

    for _, members in groupby(data, group_by.value):
        if not members:
            continue     # CAN DO NOTHING WITH THIS ZERO-SAMPLE

        ordered = sort(members, ordering)
        # STAGE THE PER-ROW VALUE IN A SCRATCH COLUMN SO THE WINDOW CAN READ IT
        _assign(ordered, "__temp__")

        upper = coalesce(span.max, span.stop)
        lower = coalesce(span.min, span.start)

        # PRELOAD THE ACCUMULATOR WITH THE WINDOW OF THE FIRST ROW
        running = accumulator()
        for offset in range(lower, upper):
            running.add(ordered[offset].__temp__)

        # WINDOW FUNCTION APPLICATION: EMIT, THEN SLIDE THE WINDOW ONE ROW
        for position, record in enumerate(ordered):
            record[target] = running.end()
            running.add(ordered[position + upper].__temp__)
            running.sub(ordered[position + lower].__temp__)

    for record in data:
        record["__temp__"] = None  # CLEANUP

示例#2

0

显示文件

文件： qb.py 项目： klahnakoski/MoDevETL

def filter(data, where):
    """
    RETURN THE RECORDS OF data THAT SATISFY where

    data  - a Container, list or set of records
    where - a filter expression; for a list or set it is compiled with
            qb_expression_to_function() and the compiled function is called
            as f(record, rownum, rows) and expected to return a boolean
    RETURNS data UNCHANGED WHEN data IS EMPTY, OR where IS None OR TRUE_FILTER
    """
    # NOTHING TO FILTER, OR NO FILTER GIVEN.
    # == None (NOT "is None") IS KEPT ON PURPOSE - presumably so the library's
    # Null object is also treated as "no filter"; confirm before changing
    if len(data) == 0 or where == None or where == TRUE_FILTER:
        return data

    if isinstance(data, Container):
        # CONTAINERS KNOW HOW TO FILTER THEMSELVES
        return data.filter(where)

    if isinstance(data, (list, set)):
        temp = qb_expression_to_function(where)
        dd = wrap(data)
        return [d for i, d in enumerate(data) if temp(wrap(d), i, dd)]
    else:
        Log.error("Do not know how to handle type {{type}}", type=data.__class__.__name__)

    # NOTE(review): every path above either returns or calls Log.error, so this
    # fallback only runs if Log.error returns instead of raising - it looks
    # like dead code; confirm and remove
    try:
        return drill_filter(where, data)
    except Exception:
        # WOW!  THIS IS INEFFICIENT!
        return wrap([unwrap(d) for d in drill_filter(where, [DictObject(d) for d in data])])

示例#3

0

显示文件

文件： aggs.py 项目： klahnakoski/MoDevETL

def list_aggs(frum, query):
    """
    AGGREGATE A LIST OF RECORDS INTO A Cube, ONE Matrix PER SELECT CLAUSE

    frum  - the records to aggregate (wrapped on entry)
    query - query description; query.select, query.edges and query.where are read
    RETURNS Cube(select, query.edges, result) WHERE result MAPS EACH SELECT
    NAME TO A Matrix WITH ONE DIMENSION PER EDGE

    SIDE EFFECTS ON query: EDGES WITH A DefaultDomain GET A SimpleSetDomain
    BUILT FROM THE DATA, AND EVERY SELECT CLAUSE GETS AN "exec" ENTRY HOLDING
    ITS COMPILED VALUE FUNCTION
    """
    frum = wrap(frum)
    select = listwrap(query.select)

    for e in query.edges:
        if isinstance(e.domain, DefaultDomain):
            e.domain = SimpleSetDomain(partitions=sorted(set(frum.select(e))))

    # COMPILE EACH SELECT EXPRESSION ONCE, BEFORE SCANNING THE DATA
    for s in select:
        s["exec"] = qb_expression_to_function(s.value)

    result = {
        s.name: Matrix(
            # ONE EXTRA SLOT PER EDGE THAT ALLOWS NULLS
            dims=[len(e.domain.partitions) + (1 if e.allowNulls else 0) for e in query.edges],
            zeros=s.aggregate == "count"
        )
        for s in select
    }
    where = qb_expression_to_function(query.where)
    for d in filter(where, frum):
        d = d.copy()  # EDGE VALUES ARE WRITTEN INTO d BELOW, DO NOT TOUCH THE SOURCE RECORD
        # LIST OF MATCHING COORDINATE FAMILIES, USUALLY ONLY ONE PER FAMILY BUT JOINS WITH EDGES CAN CAUSE MORE
        coord = [get_matches(e, d) for e in query.edges]

        for s in select:
            mat = result[s.name]
            agg = s.aggregate
            var = s.value
            if agg == "count":
                # COUNTING RECORDS (NOT VALUES) NEEDS NO EXPRESSION EVALUATION
                count_all = var == "." or var == None
                for c in itertools.product(*coord):
                    if count_all:
                        mat[c] += 1
                        continue

                    for e, cc in zip(query.edges, c):
                        d[e.name] = cc
                    val = s["exec"](d, c, frum)
                    if val != None:
                        mat[c] += 1
            else:
                for c in itertools.product(*coord):
                    acc = mat[c]
                    if acc == None:
                        # FIRST VALUE FOR THIS CELL: CREATE ITS ACCUMULATOR
                        acc = windows.name2accumulator.get(agg)
                        if acc == None:
                            Log.error("select aggregate {{agg}} is not recognized",  agg= agg)
                        acc = acc(**s)
                        mat[c] = acc
                    for e, cc in zip(query.edges, c):  # BECAUSE WE DO NOT KNOW IF s.exec NEEDS THESE EDGES, SO WE PASS THEM ANYWAY
                        d[e.name] = e.domain.partitions[cc]
                    val = s["exec"](d, c, frum)
                    acc.add(val)

    # REPLACE EACH ACCUMULATOR WITH ITS FINAL VALUE (COUNTS ARE ALREADY NUMBERS)
    for s in select:
        if s.aggregate == "count":
            continue
        m = result[s.name]
        for c, var in m.items():
            if var != None:
                m[c] = var.end()

    return Cube(select, query.edges, result)

示例#4

0

显示文件

文件： aggs.py 项目： klahnakoski/MoDevETL

def cube_aggs(frum, query):
    """
    AGGREGATE THE RECORDS OF frum INTO A NEW Cube, ONE Matrix PER SELECT CLAUSE

    frum  - source of the data; frum.select, frum.edges and frum.values() are read
    query - query description; query.select, query.edges and query.where are read
    RETURNS Cube(select, query.edges, result) WHERE result MAPS EACH SELECT
    NAME TO A Matrix WITH ONE DIMENSION PER QUERY EDGE

    SIDE EFFECTS: A QUERY EDGE THAT NAMES AN EDGE OF frum HAS ITS value (AND,
    FOR A DefaultDomain, ITS domain) REWRITTEN IN PLACE; RECORDS MISSING AN
    EDGE VALUE MAY HAVE IT FILLED IN
    """
    select = listwrap(query.select)

    # MATCH EDGES IN QUERY TO ONES IN frum
    for e in query.edges:
        for fs in frum.select:
            if fs.name == e.value:
                Log.error("Not implemented yet")
        for fe in frum.edges:
            if fe.name == e.value:
                if isinstance(e.domain, DefaultDomain):
                    # DEFAULT DOMAINS CAN EASILY BE LOOKED UP FROM frum
                    e.domain = SimpleSetDomain(**fe.domain.as_dict())
                e.value = e.value + "." + fe.domain.key
                break

    result = {
        s.name: Matrix(
            # ONE EXTRA SLOT PER EDGE THAT ALLOWS NULLS
            dims=[len(e.domain.partitions) + (1 if e.allowNulls else 0) for e in query.edges],
            zeros=s.aggregate == "count"
        )
        for s in select
    }
    # COMPILE EACH SELECT EXPRESSION ONCE, NOT ONCE PER RECORD
    exprs = [qb_expression_to_function(s.value) for s in select]
    where = qb_expression_to_function(query.where)
    for d in filter(where, frum.values()):
        coord = []  # LIST OF MATCHING COORDINATE FAMILIES, USUALLY ONLY ONE PER FAMILY BUT JOINS WITH EDGES CAN CAUSE MORE
        for e in query.edges:
            matches = get_matches(e, d)
            coord.append(matches)
            if len(matches) == 1 and d[e.name] == None:
                # UNAMBIGUOUS MATCH: FILL IN THE MISSING EDGE VALUE
                d[e.name] = e.domain.partitions[matches[0]]

        for s, expr in zip(select, exprs):
            mat = result[s.name]
            agg = s.aggregate
            var = s.value
            val = expr(d)
            if agg == "count":
                if var == "." or var == None:
                    # COUNTING RECORDS, NOT VALUES
                    for c in itertools.product(*coord):
                        mat[c] += 1
                    continue

                if val != None:
                    for c in itertools.product(*coord):
                        mat[c] += 1
            else:
                for c in itertools.product(*coord):
                    acc = mat[c]
                    if acc == None:
                        # FIRST VALUE FOR THIS CELL: CREATE ITS ACCUMULATOR
                        acc = windows.name2accumulator.get(agg)
                        if acc == None:
                            Log.error("select aggregate {{agg}} is not recognized",  agg= agg)
                        acc = acc(**s)
                        mat[c] = acc
                    acc.add(val)

    # REPLACE EACH ACCUMULATOR WITH ITS FINAL VALUE (COUNTS ARE ALREADY NUMBERS)
    for s in select:
        if s.aggregate == "count":
            continue
        m = result[s.name]
        for c, var in m.items():
            if var != None:
                m[c] = var.end()

    return Cube(select, query.edges, result)

示例#5

0

显示文件

文件： aggs.py 项目： mozilla/ActiveData-ETL

def cube_aggs(frum, query):
    """
    AGGREGATE THE RECORDS OF frum INTO A NEW Cube, ONE Matrix PER SELECT CLAUSE

    frum  - source of the data; frum.select, frum.edges and frum.values() are read
    query - query description; query.select, query.edges and query.where are read
    RETURNS Cube(select, query.edges, result) WHERE result MAPS EACH SELECT
    NAME TO A Matrix WITH ONE DIMENSION PER QUERY EDGE

    SIDE EFFECTS: A QUERY EDGE THAT NAMES AN EDGE OF frum HAS ITS value (AND,
    FOR A DefaultDomain, ITS domain) REWRITTEN IN PLACE; RECORDS MISSING AN
    EDGE VALUE MAY HAVE IT FILLED IN
    """
    select = listwrap(query.select)

    # MATCH EDGES IN QUERY TO ONES IN frum
    for e in query.edges:
        for fs in frum.select:
            if fs.name == e.value:
                Log.error("Not implemented yet")
        for fe in frum.edges:
            if fe.name != e.value:
                continue
            if isinstance(e.domain, DefaultDomain):
                # DEFAULT DOMAINS CAN EASILY BE LOOKED UP FROM frum
                e.domain = SimpleSetDomain(**fe.domain.as_dict())
            e.value = e.value + "." + fe.domain.key
            break

    result = {
        s.name: Matrix(
            # ONE EXTRA SLOT PER EDGE THAT ALLOWS NULLS
            dims=[
                len(e.domain.partitions) + (1 if e.allowNulls else 0)
                for e in query.edges
            ],
            zeros=s.aggregate == "count"
        )
        for s in select
    }
    # THE VALUE EXPRESSION OF A SELECT DOES NOT DEPEND ON THE RECORD,
    # SO COMPILE IT ONCE HERE INSTEAD OF ONCE PER RECORD
    exprs = [qb_expression_to_function(s.value) for s in select]
    where = qb_expression_to_function(query.where)
    for d in filter(where, frum.values()):
        # LIST OF MATCHING COORDINATE FAMILIES, USUALLY ONLY ONE PER FAMILY BUT JOINS WITH EDGES CAN CAUSE MORE
        coord = []
        for e in query.edges:
            matches = get_matches(e, d)
            coord.append(matches)
            if len(matches) == 1 and d[e.name] == None:
                # UNAMBIGUOUS MATCH: FILL IN THE MISSING EDGE VALUE
                d[e.name] = e.domain.partitions[matches[0]]

        for s, expr in zip(select, exprs):
            mat = result[s.name]
            agg = s.aggregate
            var = s.value
            val = expr(d)
            if agg == "count":
                if var == "." or var == None:
                    # COUNTING RECORDS, NOT VALUES
                    for c in itertools.product(*coord):
                        mat[c] += 1
                    continue

                if val != None:
                    for c in itertools.product(*coord):
                        mat[c] += 1
            else:
                for c in itertools.product(*coord):
                    acc = mat[c]
                    if acc == None:
                        # FIRST VALUE FOR THIS CELL: CREATE ITS ACCUMULATOR
                        acc = windows.name2accumulator.get(agg)
                        if acc == None:
                            Log.error(
                                "select aggregate {{agg}} is not recognized",
                                agg=agg)
                        acc = acc(**s)
                        mat[c] = acc
                    acc.add(val)

    # REPLACE EACH ACCUMULATOR WITH ITS FINAL VALUE (COUNTS ARE ALREADY NUMBERS)
    for s in select:
        if s.aggregate == "count":
            continue
        m = result[s.name]
        for c, var in m.items():
            if var != None:
                m[c] = var.end()

    return Cube(select, query.edges, result)

示例#6

0

显示文件

文件： qb.py 项目： klahnakoski/MoDevETL

def sort(data, fieldnames=None):
    """
    RETURN data SORTED BY fieldnames

    PASS A FIELD NAME, OR LIST OF FIELD NAMES, OR LIST OF STRUCTS WITH
    {"value":expression_or_field_name, "sort":direction}
    ("field" IS ACCEPTED WHEN "value" IS MISSING)

    data       - the records to sort; None GIVES Null
    fieldnames - what to sort by; WHEN EMPTY THE VALUES THEMSELVES ARE
                 COMPARED WITH value_compare
    """
    try:
        if data == None:
            return Null

        if not fieldnames:
            # NO SORT KEY: COMPARE THE VALUES THEMSELVES (value_compare IS PASSED AS cmp)
            return wrap(sorted(data, value_compare))

        fieldnames = listwrap(fieldnames)
        if len(fieldnames) == 1:
            fieldnames = fieldnames[0]
            # SPECIAL CASE, ONLY ONE FIELD TO SORT BY
            if fieldnames == ".":
                return wrap(sorted(data))
            if isinstance(fieldnames, (basestring, int)):
                # A BARE NAME (OR INDEX) MEANS ASCENDING SORT ON THAT FIELD
                fieldnames = wrap({"value": fieldnames, "sort": 1})

            # EXPECTING {"field":f, "sort":i} FORMAT
            # A DIRECTION NOT FOUND IN sort_direction FALLS BACK TO 1
            fieldnames.sort = sort_direction.get(fieldnames.sort, 1)
            fieldnames.value = coalesce(fieldnames.value, fieldnames.field)
            if fieldnames.value == None:
                Log.error("Expecting sort to have 'value' attribute")

            if fieldnames.value == ".":
                #VALUE COMPARE
                def _compare_v(l, r):
                    return value_compare(l, r, fieldnames.sort)
                return DictList([unwrap(d) for d in sorted(data, cmp=_compare_v)])
            elif isinstance(fieldnames.value, Mapping):
                # THE SORT KEY IS AN EXPRESSION: COMPILE IT AND COMPARE ITS RESULTS
                func = qb_expression_to_function(fieldnames.value)
                def _compare_o(left, right):
                    return value_compare(func(coalesce(left)), func(coalesce(right)), fieldnames.sort)
                return DictList([unwrap(d) for d in sorted(data, cmp=_compare_o)])
            else:
                # THE SORT KEY IS A PLAIN FIELD: COMPARE THE FIELD VALUES
                def _compare_o(left, right):
                    return value_compare(coalesce(left)[fieldnames.value], coalesce(right)[fieldnames.value], fieldnames.sort)
                return DictList([unwrap(d) for d in sorted(data, cmp=_compare_o)])

        # GENERAL CASE: SEVERAL SORT KEYS, EACH COMPILED TO A FUNCTION
        formal = query._normalize_sort(fieldnames)
        for f in formal:
            f.func = qb_expression_to_function(f.value)

        def comparer(left, right):
            # COMPARE KEY BY KEY; THE FIRST KEY THAT DIFFERS DECIDES THE ORDER
            left = coalesce(left)
            right = coalesce(right)
            for f in formal:
                try:
                    result = value_compare(f.func(left), f.func(right), f.sort)
                    if result != 0:
                        return result
                except Exception, e:
                    Log.error("problem with compare", e)
            return 0

        if isinstance(data, list):
            output = DictList([unwrap(d) for d in sorted(data, cmp=comparer)])
        elif hasattr(data, "__iter__"):
            # OTHER ITERABLES ARE COPIED INTO A LIST BEFORE SORTING
            output = DictList([unwrap(d) for d in sorted(list(data), cmp=comparer)])
        else:
            Log.error("Do not know how to handle")
            output = None

        return output
    # NOTE(review): the except/finally clause that closes the try above is not
    # visible in this excerpt

示例#7

0

显示文件

文件： qb.py 项目： mozilla/ActiveData-ETL

def window(data, param):
    """
    ADD A COLUMN TO EVERY RECORD (IN PLACE): A CALCULATED VALUE, OR AN
    AGGREGATE OVER A SLIDING WINDOW OF THE RECORD'S GROUP

    data  - list of records
    param - window description; the attributes read are name, edges, where,
            sort, value, aggregate and range
    RETURNS None: RESULTS ARE WRITTEN TO record[param.name]

    (NUMPY WAS CONSIDERED FOR THIS, BUT THE EDGES OF WINDOWS ARE NOT GRACEFUL WITH NUMPY)
    """
    column = param.name    # column to assign window function result
    grouping = param.edges  # columns to group by
    order_by = param.sort  # columns to sort by
    # function called as (record, rownum, rows) that returns the value to assign/aggregate
    compute = wrap_function(qb_expression_to_function(param.value))
    window_function = param.aggregate  # WindowFunction to apply
    bounds = param.range  # of form {"min":-10, "max":0} to specify the size and relative position of window

    # ONLY THE ROWS RETURNED BY filter(data, where) ARE PROCESSED
    rows = filter(data, param.where)

    if not window_function and not grouping:
        # SIMPLE CALCULATED VALUE, NO GROUPING AND NO AGGREGATION
        if order_by:
            rows = sort(rows, order_by)
        for index, row in enumerate(rows):
            row[column] = compute(row, index, rows)
        return

    if not window_function or window_function == "none":
        # CALCULATED VALUE PER GROUP, ROW NUMBERS RESTART IN EACH GROUP
        for _, group in groupby(rows, grouping.value):
            if group:  # CAN DO NOTHING WITH A ZERO-SAMPLE
                in_order = sort(group, order_by)
                for index, row in enumerate(in_order):
                    row[column] = compute(row, index, in_order)
        return

    for _, group in groupby(rows, grouping.value):
        if not group:
            continue  # CAN DO NOTHING WITH THIS ZERO-SAMPLE

        in_order = sort(group, order_by)

        # STAGE THE PER-ROW VALUE IN A SCRATCH COLUMN SO THE WINDOW CAN READ IT
        for index, row in enumerate(in_order):
            row["__temp__"] = compute(row, index, in_order)

        leading = coalesce(bounds.max, bounds.stop)
        trailing = coalesce(bounds.min, bounds.start)

        # PRELOAD THE ACCUMULATOR WITH THE WINDOW OF THE FIRST ROW
        acc = window_function()
        for k in range(trailing, leading):
            acc.add(in_order[k].__temp__)

        # WINDOW FUNCTION APPLICATION: EMIT, THEN SLIDE THE WINDOW ONE ROW
        for index, row in enumerate(in_order):
            row[column] = acc.end()
            acc.add(in_order[index + leading].__temp__)
            acc.sub(in_order[index + trailing].__temp__)

    for row in rows:
        row["__temp__"] = None  # CLEANUP

示例#8

0

显示文件

def list_aggs(frum, query):
    """
    AGGREGATE A LIST OF RECORDS INTO A Cube, ONE Matrix PER SELECT CLAUSE

    frum  - the records to aggregate; frum.select(...) is called on it
    query - query description; query.select, query.edges and query.where are read
    RETURNS Cube(select, query.edges, result) WHERE result MAPS EACH SELECT
    NAME TO A Matrix WITH ONE DIMENSION PER EDGE

    SIDE EFFECTS ON query: EDGES WITH A DefaultDomain GET A SimpleSetDomain
    BUILT FROM THE DATA, AND EVERY SELECT CLAUSE GETS AN "exec" ENTRY HOLDING
    ITS COMPILED VALUE FUNCTION
    """
    select = listwrap(query.select)

    for e in query.edges:
        if isinstance(e.domain, DefaultDomain):
            e.domain = SimpleSetDomain(
                partitions=sorted(set(frum.select(e.value))))

    # COMPILE EACH SELECT EXPRESSION ONCE, BEFORE SCANNING THE DATA
    for s in select:
        s["exec"] = qb_expression_to_function(s.value)

    result = {
        s.name: Matrix(
            # ONE EXTRA SLOT PER EDGE THAT ALLOWS NULLS
            dims=[
                len(e.domain.partitions) + (1 if e.allowNulls else 0)
                for e in query.edges
            ],
            zeros=s.aggregate == "count"
        )
        for s in select
    }
    where = qb_expression_to_function(query.where)
    for d in filter(where, frum):
        d = d.copy()  # EDGE VALUES ARE WRITTEN INTO d BELOW, DO NOT TOUCH THE SOURCE RECORD
        # LIST OF MATCHING COORDINATE FAMILIES, USUALLY ONLY ONE PER FAMILY BUT JOINS WITH EDGES CAN CAUSE MORE
        coord = [get_matches(e, d) for e in query.edges]

        for s in select:
            mat = result[s.name]
            agg = s.aggregate
            var = s.value
            if agg == "count":
                # COUNTING RECORDS (NOT VALUES) NEEDS NO EXPRESSION EVALUATION
                count_all = var == "." or var == None
                for c in itertools.product(*coord):
                    if count_all:
                        mat[c] += 1
                        continue

                    for e, cc in zip(query.edges, c):
                        d[e.name] = cc
                    val = s["exec"](d, c, frum)
                    if val != None:
                        mat[c] += 1
            else:
                for c in itertools.product(*coord):
                    acc = mat[c]
                    if acc == None:
                        # FIRST VALUE FOR THIS CELL: CREATE ITS ACCUMULATOR
                        acc = windows.name2accumulator.get(agg)
                        if acc == None:
                            Log.error(
                                "select aggregate {{agg}} is not recognized",
                                agg=agg)
                        acc = acc(**s)
                        mat[c] = acc
                    # BECAUSE WE DO NOT KNOW IF s.exec NEEDS THESE EDGES, SO WE PASS THEM ANYWAY
                    for e, cc in zip(query.edges, c):
                        d[e.name] = e.domain.partitions[cc]
                    val = s["exec"](d, c, frum)
                    acc.add(val)

    # REPLACE EACH ACCUMULATOR WITH ITS FINAL VALUE (COUNTS ARE ALREADY NUMBERS)
    for s in select:
        if s.aggregate == "count":
            continue
        m = result[s.name]
        for c, var in m.items():
            if var != None:
                m[c] = var.end()

    return Cube(select, query.edges, result)