Пример #1
0
Файл: awk.py Проект: cblp/tabkit
def awk_filter_map(data_desc, filter_strs, map_strs):
    """
    Translate filter and map expression strings into an awk command.

    Each string in ``map_strs`` is parsed into statements.  A bare
    ``__all__`` statement passes every input field through; a bare
    ``__rest__`` statement passes through only those input fields that
    have not been set in the expression context yet; any other statement
    is parsed as a row expression and stored under its target name.

    Every statement found in ``filter_strs`` becomes a filter condition;
    when there is more than one, they are joined with ``&&``.

    Returns a ``(awk_cmd, desc)`` pair.  ``desc`` is the output description
    produced for the mapped fields (with ``meta`` copied from ``data_desc``),
    or ``data_desc`` itself when no output description was produced.

    >>> from tabkit.header import parse_header
    >>> awk, desc = awk_filter_map(
    ...     parse_header('# d p e s c m'),
    ...     ['e==157 and (s>100 or s in [15,30,45])'],
    ...     ['ctr=c/s', 'cpm=ctr*m']
    ... )
    >>> print desc
    DataDesc([DataField('ctr', 'any'), DataField('cpm', 'any')])
    >>> print awk.cmd_line()
    LC_ALL=C awk  -F $'\\t' 'BEGIN{OFS="\\t";}{if((($3 == 157) && (($4 > 100) || (($4 == 15) || ($4 == 30) || ($4 == 45))))){ctr = ($5 / $4);print(ctr,(ctr * $6));}}'
    >>> awk, desc = awk_filter_map(parse_header('# a b'), [], ['__all__'])
    >>> print desc
    DataDesc([DataField('a', 'any'), DataField('b', 'any')])
    """
    ctx = ExprContext(data_desc)

    def is_bare_name(stmt, wanted):
        # True for an expression statement consisting of the single name `wanted`.
        return (
            isinstance(stmt, _ast.Expr)
            and isinstance(stmt.value, _ast.Name)
            and stmt.value.id == wanted
        )

    def pass_through(field_name):
        # Map an input field onto an output variable of the same name.
        source = RowExprField(ctx, field_name)
        ctx.set_var(field_name, RowExprAssign(field_name, source))

    # Map expressions go first, so the filter below is parsed against a
    # context that already holds the mapped variables.
    for map_str in map_strs:
        for stmt in parse(map_str).body:
            if is_bare_name(stmt, '__all__'):
                for field in data_desc.fields:
                    pass_through(field.name)
            elif is_bare_name(stmt, '__rest__'):
                for field in data_desc.fields:
                    if not ctx.has_var(field.name):
                        pass_through(field.name)
            else:
                row_expr = parse_rowexpr(ctx, stmt)
                ctx.set_var(row_expr.target, row_expr)

    # Collect every statement of every filter string, then fold them
    # into a single condition (or none at all).
    filter_nodes = []
    for filter_str in filter_strs:
        filter_nodes.extend(parse(filter_str).body)

    if not filter_nodes:
        filter_expr = None
    elif len(filter_nodes) == 1:
        filter_expr = parse_expr(ctx, filter_nodes[0])
    else:
        conditions = [parse_expr(ctx, stmt) for stmt in filter_nodes]
        filter_expr = RowExprOp('&&', conditions)

    awk_cmd, output_desc = awk_filter_map_from_context(ctx, filter_expr, data_desc.order)
    if output_desc:
        output_desc.meta = data_desc.meta
    result_desc = output_desc or data_desc
    return awk_cmd, result_desc
Пример #2
0
def awk_grp(data_desc, key_str, grp_expr_tuples, output_only_assigned=True, expose_groups=False):
    """
    Build an awk script that aggregates runs of consecutive rows sharing a key.

    The generated script computes the key expressions for every row and
    compares them with the keys remembered from the previous row; a change
    closes the current group.

    Arguments:
        data_desc: description of the input data; used to build the key and
            row expression contexts and to supply ``order`` for the output.
        key_str: source text of the key expression(s).  A falsy value is
            replaced by the constant key ``'1'``.  When it is exactly
            ``None`` the script sets ``__print_last`` to 1, so the END block
            prints even if no rows were read.
        grp_expr_tuples: iterable of ``(grp_type, expr_str)`` pairs where
            ``grp_type`` is ``'acc'`` or ``'grp'``.  Functions found in
            ``'grp'`` expressions are re-initialised whenever the key
            changes; those in ``'acc'`` expressions are initialised only
            once, in BEGIN.
        output_only_assigned: when true, a key expression that is not an
            assignment is used for grouping only and is not output.  When
            false, such an expression must be a bare name, which is then
            also used as the output name.
        expose_groups: when true, the output is printed for every input row
            and the END block is left empty; otherwise it is printed when
            the key changes and once more in END.

    Returns:
        ``(awk, output_desc)`` - the assembled ``AwkScript`` and the
        description of its output.

    Raises:
        Exception: on a malformed key assignment, on an unassigned key
            expression when ``output_only_assigned`` is false, on an unknown
            grouping type, or when no output fields are produced.
    """
    # One namer is shared by every context and function maker below.
    namer = Namer()
    acc_maker = GrpExprFuncMaker('__acc_', namer)
    grp_maker = GrpExprFuncMaker('__grp_', namer)
    key_ctx = ExprContext(data_desc, namer)
    row_ctx = ExprContext(data_desc, namer)
    acc_ctx = ExprContext(DataDesc([],[]), namer)
    grp_ctx = ExprContext(DataDesc([],[]), namer)
    out_ctx = ExprContext(DataDesc([],[]), namer)

    # parse key expr
    keys = []
    key_ins_pos = 0  # named keys are inserted at the front of the output, in order
    for node in parse(key_str or '1').body:
        assigned_name = None
        if isinstance(node, _ast.Assign):
            if len(node.targets) != 1 or not isinstance(node.targets[0], _ast.Name):
                raise Exception('Bad assignment in %r' % (key_str,))
            expr = parse_expr(key_ctx, node.value)
            assigned_name = node.targets[0].id
        else:
            expr = parse_expr(key_ctx, node)
            if not output_only_assigned:
                # An unassigned key can only be output if it is a bare name,
                # which then doubles as the output name.
                if isinstance(node, _ast.Expr) and isinstance(node.value, _ast.Name):
                    assigned_name = node.value.id
                else:
                    raise Exception('Please assign expression to a variable in %r' % (key_str,))
        # Names are derived from the expression text before the optional
        # string-forcing wrap applied further down.
        key_name = namer.get_name('__key', expr.tostr())
        key_row_name = namer.get_name('__row_key', expr.tostr())
        if assigned_name:
            out_ctx.set_var(
                key_name,
                RowExprAssign(key_name, expr)
            )
            out_ctx.set_var(
                assigned_name,
                RowExprAssign(assigned_name, RowExprVar(out_ctx, key_name)),
                insert_at = key_ins_pos
            )
            key_ins_pos += 1
            # Register the named key in the 'grp' context as well.
            grp_ctx.set_var(
                assigned_name,
                RowExprAssign(assigned_name, RowExprVar(out_ctx, key_name)),
            )
        if isinstance(expr, RowExprField): # force str assuming if node is field
            # NOTE(review): presumably the empty operator renders as awk
            # concatenation with "" so the key is treated as a string - confirm.
            expr = RowExprOp('', [expr, RowExprConst("")])
        keys.append((expr, key_name, key_row_name))

    # parse grouping expressions; both kinds are also registered for output
    for grp_type, expr_str in grp_expr_tuples:
        for ast_expr in parse(expr_str).body:
            if grp_type == 'acc':
                expr = parse_assign_grpexpr(acc_ctx, ast_expr, row_ctx, acc_maker)
                acc_ctx.set_var(expr.target, expr)
                out_ctx.set_var(expr.target, expr)
            elif grp_type == 'grp':
                expr = parse_assign_grpexpr(grp_ctx, ast_expr, row_ctx, grp_maker)
                grp_ctx.set_var(expr.target, expr)
                out_ctx.set_var(expr.target, expr)
            else:
                raise Exception('Unknown grouping type %r' % (grp_type,))


    # construct awk script
    print_awk, output_desc = awk_filter_map_from_context(
        out_ctx,
        order = data_desc.order,
    )
    if output_desc is None:
        raise Exception('No output fields specified')
    # Only the BEGIN and main parts of the printing script are reused below.
    assert not print_awk.end

    init_grps = AwkBlock()
    init_accs = AwkBlock()
    calc_row_keys = AwkBlock()
    keys_changed = []
    update_keys = AwkBlock()
    update_grps = AwkBlock()
    update_accs = AwkBlock()
    end_grps = AwkBlock()

    for expr, name, row_name in keys:
        calc_row_keys.append(row_name + ' = ' + expr.tostr())
        update_keys.append(name + ' = ' + row_name)
        keys_changed.append(name + '!=' + row_name)

    for name, val in find_grp_funcs(grp_ctx):
        init_grps.append(val.init_str())
        update_grps.extend(val.update_str(recursive=True))
        end_grps.append(val.end_str())

    # Finalisation code of 'acc' functions goes into the same end block
    # as that of 'grp' functions.
    for name, val in find_grp_funcs(acc_ctx):
        init_accs.append(val.init_str())
        update_accs.extend(val.update_str(recursive=True))
        end_grps.append(val.end_str())

    keys_changed_str = ' || '.join(keys_changed)

    awk = AwkScript(
        begin = (
            print_awk.begin
            + init_grps
            + init_accs
            + AwkBlock(['__print_last = ' + str(int(key_str is None))])
        ),
        # Flush the last group at END, unless every row was already printed.
        end = AwkBlock() if expose_groups else AwkBlock([AwkHeadBlock(
            'if(NR!=0 || __print_last==1)',
            end_grps + print_awk.main
        )]),
        main = (
            calc_row_keys
            # The first row only records its keys; later rows close the
            # previous group when any key differs.
            + AwkHeadBlock('if(NR==1)', update_keys)
            + AwkHeadBlock('else', AwkBlock([
                AwkHeadBlock('if(' + keys_changed_str + ')',
                    end_grps
                    + (print_awk.main if not expose_groups else AwkBlock())
                    + update_keys
                    + init_grps
                )])
            )
            + update_grps
            + update_accs
            + (print_awk.main if expose_groups else AwkBlock())
        )
    )

    return awk, output_desc