Example #1
    def main(self):

        if not self.args.pivot_agg or self.args.pivot_agg == 'list':
            print_available_aggregates(self.output_file)
            return

        self.handle_standard_args()

        agtable = agate.Table.from_csv(self.input_file,
                                       skip_lines=self.args.skip_lines,
                                       sniff_limit=self.args.sniff_limit,
                                       **self.reader_kwargs)
        column_names = agtable.column_names

        # parse the aggregate argument into its class (`foo`) and its parsed arguments,
        # then instantiate the agate aggregation
        _a = parse_aggregate_string_arg(self.args.pivot_agg,
                                        valid_columns=column_names)
        pivot_agg = _a.foo(*_a.args)

        _prow_ids = parse_column_identifiers(
            self.args.pivot_rows,
            column_names,
            column_offset=self.get_column_offset(),
            excluded_columns=None,
        ) if self.args.pivot_rows else None
        pivot_row_names = [column_names[i]
                           for i in _prow_ids] if _prow_ids else None

        _pcol_ids = parse_column_identifiers(
            self.args.pivot_column,
            column_names,
            column_offset=self.get_column_offset(),
            excluded_columns=None,
        ) if self.args.pivot_column else None

        if _pcol_ids and len(_pcol_ids) > 1:
            raise ArgumentErrorTK(
                f'Only one --pivot-column is allowed, not {len(_pcol_ids)}: {_pcol_ids}'
            )
        pivot_col_name = column_names[_pcol_ids[0]] if _pcol_ids else None
        # print(f"{column_names=}")
        # print(f"{pivot_col_name=}\n{pivot_row_names=}")
        pivot = agtable.pivot(key=pivot_row_names,
                              pivot=pivot_col_name,
                              aggregation=pivot_agg)
        pivot.to_csv(self.output_file, **self.writer_kwargs)
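
For reference, a minimal standalone sketch of the agate pivot call this command wraps; the table data, column names, and output path are illustrative, and only the agate library is assumed:

import agate

rows = [
    {'state': 'IA', 'party': 'D', 'votes': 10},
    {'state': 'IA', 'party': 'R', 'votes': 20},
    {'state': 'TX', 'party': 'R', 'votes': 30},
]
table = agate.Table.from_object(rows)

# one output row per state, one output column per party,
# each cell holding the summed votes for that (state, party) pair
pivoted = table.pivot(key='state', pivot='party', aggregation=agate.Sum('votes'))
pivoted.to_csv('pivoted.csv')
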
Example #2
    def main(self):
        if self.args.aggregates == ['list'] or self.args.aggregates == ['']:
            print_available_aggregates(self.output_file)
            return

        self.handle_standard_args()

        rawtable = agate.Table.from_csv(self.input_file,
                                        skip_lines=self.args.skip_lines,
                                        sniff_limit=self.args.sniff_limit,
                                        **self.reader_kwargs)
        column_names = rawtable.column_names

        self.aggregates = self.handle_aggregate_args(
            valid_columns=column_names)

        _gcol_ids = parse_column_identifiers(
            self.args.columns,
            column_names,
            column_offset=self.get_column_offset(),
            excluded_columns=None,
        )
        group_colnames = [column_names[i] for i in _gcol_ids]

        # outtable = rawtable.group_by(key=pivot_row_names, pivot=pivot_col_name, aggregation=pivot_agg)

        # successive group_by calls build a nested agate TableSet, one level per grouping column
        gtable = rawtable
        for col in group_colnames:
            gtable = gtable.group_by(key=col)

        # build (output_column_name, aggregation_instance) pairs for the aggregate() call
        g_aggs = []
        for a in self.aggregates:
            if a.colname:
                colname = a.colname
            else:
                if a.args:
                    colname = f'{a.foo.__name__}_of_{slugify(a.args)}'
                else:
                    colname = a.foo.__name__
            agg = a.foo(*a.args)
            g_aggs.append((colname, agg))

        xtable = gtable.aggregate(g_aggs)
        xtable.to_csv(self.output_file, **self.writer_kwargs)
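
Similarly, a minimal standalone sketch of the group-and-aggregate chain used above (illustrative data and names; only agate is assumed):

import agate

rows = [
    {'state': 'IA', 'party': 'D', 'votes': 10},
    {'state': 'IA', 'party': 'R', 'votes': 20},
    {'state': 'TX', 'party': 'R', 'votes': 30},
]
table = agate.Table.from_object(rows)

# group_by returns a TableSet; aggregate() takes (name, aggregation) pairs,
# mirroring the g_aggs list assembled above
grouped = table.group_by('state')
summary = grouped.aggregate([
    ('count', agate.Count()),
    ('sum_of_votes', agate.Sum('votes')),
])
summary.to_csv('summary.csv')
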
Example #3
def filter_rows(
    rows: typeIterable,
    pattern_str: str,
    columns_str: str,
    column_names: list,
    default_column_ids: list,
    literal_match: bool,
    column_offset: int,
    inverse: bool,
    any_match: bool,
    # not_columns,
) -> FilteringCSVReader:

    if literal_match:
        pattern = pattern_str
    else:  # regex match: compile the pattern
        pattern = re.compile(pattern_str)

    if columns_str:
        expr_col_ids = parse_column_identifiers(
            columns_str,
            column_names,
            column_offset,
        )
    else:
        expr_col_ids = default_column_ids

    # apply the same pattern to every column selected for searching
    epatterns = {eid: pattern for eid in expr_col_ids}

    filtered_rows = FilteringCSVReader(
        rows,
        header=False,
        patterns=epatterns,
        inverse=inverse,
        any_match=any_match,
    )
    return filtered_rows
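
A hypothetical usage sketch of filter_rows; the file name, pattern, and column name are illustrative, and parse_column_identifiers/FilteringCSVReader are assumed to come from the csvkit helpers this module already uses:

import csv

with open('people.csv', newline='') as f:
    reader = csv.reader(f)
    # consume the header ourselves, since FilteringCSVReader is created with header=False
    column_names = next(reader)
    matched = filter_rows(
        reader,                              # remaining data rows
        pattern_str=r'(?i)smith',            # regex to search for
        columns_str='last_name',             # restrict the search to this column
        column_names=column_names,
        default_column_ids=list(range(len(column_names))),
        literal_match=False,                 # compile pattern_str as a regex
        column_offset=1,                     # csvkit-style 1-based column indices
        inverse=False,
        any_match=True,
    )
    for row in matched:
        print(row)
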
Example #4
    def main(self):

        self.statements = self.handle_statements()
        self.handle_standard_args()

        # _input_name = self.args.input_path if self.args.input_path else 'stdin'
        # sys.stderr.write(f'{_input_name=}\n')

        def _build_conditional_foo(column_names, operator, operand,
                                   idnum) -> typeTuple:
            """
            returns:

                'all(row[col] [OPERATOR] [OPERAND] for col in column_names)', {column_names_idnum: column_names operator_idnum=operator, operand_idnum=operand}
            """
            operand_label = f"operand_{idnum}"
            colnames_label = f"column_names_{idnum}"
            funcstr = f"""all(row[col] {operator} {operand_label} for col in {colnames_label})"""
            funclocals = {operand_label: operand, colnames_label: column_names}
            return (funcstr, funclocals)
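        # For example (illustrative values), _build_conditional_foo(['age', 'height'], '>', 18, 0)
        # would return:
        #   ('all(row[col] > operand_0 for col in column_names_0)',
        #    {'operand_0': 18, 'column_names_0': ['age', 'height']})
        # The string is later eval()'d against each row with those locals in scope.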

        rawtable = agate.Table.from_csv(self.input_file,
                                        skip_lines=self.args.skip_lines,
                                        sniff_limit=self.args.sniff_limit,
                                        **self.reader_kwargs)

        column_names = rawtable.column_names

        # _input_name = self.args.input_path if self.args.input_path else 'stdin'
        # sys.stderr.write(f'{_input_name=}\n')

        # sys.stderr.write("Statements:\n")
        # for c in self.statements:
        #     sys.stderr.write(f"\t{c}\n")

        func_str = ""
        func_locals = {}

        for i, state in enumerate(self.statements):
            _col_ids = parse_column_identifiers(
                state['columns'],
                column_names,
                column_offset=self.get_column_offset(),
                excluded_columns=None)
            state_colnames = [column_names[i]
                              for i in _col_ids] if _col_ids else None
            if not state_colnames:
                raise ValueError(
                    f"Did not find valid column names in: {state['columns']}")

            opvalue = _rough_cast_value(state['operand'], state['datatype'])

            fstr, flocals = _build_conditional_foo(state_colnames,
                                                   state['operator'], opvalue,
                                                   i)

            # the first statement starts the expression; later ones are joined
            # with their lowercased bool_type ('and'/'or')
            if state['bool_type'] == 'FIRST':
                func_str += fstr
            else:
                func_str += f" {state['bool_type'].lower()} {fstr}"
            func_locals.update(flocals)

        # sys.stderr.write(f'{func_str=}\n\n')
        # for k, v in func_locals.items():
        #     sys.stderr.write(f'{k}: {v}\n')

        xrows = []

        # preserve row names only if the source table has them
        if rawtable._row_names is not None:
            rrow_names = []
        else:
            rrow_names = None

        for i, row in enumerate(rawtable._rows):
            row_locals = func_locals.copy()
            row_locals.update({'row': row})
            # evaluate the assembled boolean expression against this row
            test_row = eval(func_str, row_locals)
            if test_row:
                xrows.append(row)

                if rrow_names is not None:
                    rrow_names.append(rawtable._row_names[i])

        xtable = rawtable._fork(xrows, row_names=rrow_names)
        # xtable = xtable.where(lambda row: )
        #     # xtable = gtable.aggregate(g_aggs)
        xtable.to_csv(self.output_file, **self.writer_kwargs)
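
The commented-out line above gestures at agate's public row-filtering API, Table.where, which takes a row predicate instead of forking the table's private internals; a minimal sketch under that assumption (the column name and condition are illustrative):

xtable = rawtable.where(lambda row: row['age'] is not None and row['age'] > 18)
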
Example #5
class CSVSed(JustTextUtility):
    description = """Replaces all instances of [PATTERN] with [REPL]"""

    override_flags = ["f", "L", "blanks", "date-format", "datetime-format"]

    def add_arguments(self):
        self.argparser.add_argument(
            "-c",
            "--columns",
            dest="columns",
            help=
            'A comma separated list of column indices, names or ranges to be searched, e.g. "1,id,3-5".',
        )

        self.argparser.add_argument(
            "-E",
            "--expr",
            dest="expressions_list",
            # required=True,
            nargs="*",
            action="append",
            type=str,
            help=r"""
                                        When you want to do multiple sed_expressions:
                                            -E 'PATTERN' 'REPL' '[names_of_columns]'

                                            'names_of_columns' is a comma-delimited list of columns; it cannot refer to
                                                columns *not included* in the `-c/--columns` flag; leave blank to match all columns

                                        e.g.
                                        -E '(?i)\b(bob|bobby|rob)\b' 'Robert' 'first_name' \
                                        -E '^(?i)smith$' 'SMITH' 'last_name' \
                                        -E '(\d{2})-(\d{3})' '$1:$2' '' \
                                        """,
        )

        self.argparser.add_argument(
            "-m",
            "--match-literal",
            dest="literal_match",
            action="store_true",
            default=False,
            help=
            "By default, [PATTERN] is assumed to be a regex. Set this flag to make it a literal text find/replace",
        )

        self.argparser.add_argument(
            "-G",
            "--like-grep",
            dest="like_grep",
            action="store_true",
            default=False,
            help=
            """Only return rows in which [PATTERN] was a match (BEFORE any transformations) – i.e. like grep''s traditional behavior""",
        )

        self.argparser.add_argument(
            "-R",
            "--replace",
            dest="replace_value",
            action="store_true",
            default=False,
            help=
            "Replace entire field with [REPL], instead of just the substring matched by [PATTERN]",
        )

        self.argparser.add_argument(
            "--max",
            dest="max_match_count",
            action="store",
            default=0,
            type=int,
            help=
            "Max number of matches to replace PER FIELD. Default is 0, i.e. no limit",
        )

        self.argparser.add_argument(
            metavar="PATTERN",
            dest="first_pattern",
            type=str,
            # nargs='?',
            help="A pattern to search for",
        )

        self.argparser.add_argument(
            metavar="REPL",
            dest="first_repl",
            type=str,
            # nargs='?',
            help="A replacement pattern",
        )

        self.argparser.add_argument(
            metavar="FILE",
            nargs="?",
            dest="input_path",
            help=
            "The CSV file to operate on. If omitted, will accept input as piped data via STDIN.",
        )

    def run(self):
        """
        A wrapper around the main loop of the utility which handles opening and
        closing files.

        TK: This is copy-pasted from CSVKitUtil because we have to override 'f'; maybe there's
            a way to refactor this...

        csvsed has special functionality, in which the presence of `-E/--expr` changes the command signature,
            i.e. from: csvsed PATTERN REPL input.csv
                 to: csvsed -E 'PATTERN' 'REPL' 'COLUMNS' -E x y z input.csv
        """

        self.last_expr = []
        if not self.args.input_path:
            # then it must have been eaten by an -E flag; we assume the input file is in last_expr[-1],
            # where `last_expr` is the last member of expressions_list

            # TODO: THIS IS CRAP
            if self.args.expressions_list:
                self.last_expr = self.args.expressions_list[-1]

                if len(self.last_expr) > 2:
                    # could be either 3 or 4
                    self.args.input_path = self.last_expr.pop()
                elif len(self.last_expr) == 2:
                    pass
                    # do nothing, but be warned that if there is no stdin,
                    # then -E might have eaten up the input_file argument
                    # and interpreted it as pattern
                else:
                    # else, last_expr has an implied third argument, and
                    # input_path is hopefully stdin
                    self.args.input_path = None

            # # # error handling
            # #  if self.args.pattern or self.args.repl:
            # #      self.argparser.error("If using -E/--expr, [PATTERN] and [REPL] arguments cannot be filled in")

            # #  if not self.args.input_path and self.args.pattern and not self.args.repl:
            # #      self.args.input_path = self.args.pattern
            # #      delattr(self.args, 'pattern')
            # #      delattr(self.args, 'repl')
            # #  elif self.args.input_path and self.args.pattern:
            # #      # if input_path was given AND self.args.pattern (i.e. any other positional args besides INPUT_PATH)
            # #      self.argparser.error(f"""Got an unexpected positional argument; either:
            # #          - More than 3 arguments for -E/--expr {exes[-1]}
            # #          - Or, a PATTERN argument, which is invalid when using -E/--expr
            # #      """)
            # #  else:
            # #      self.argparser.error("Some other unhandled positional arg thingy [TODO]")

        self.input_file = self._open_input_file(self.args.input_path)

        try:
            with warnings.catch_warnings():
                if getattr(self.args, "no_header_row", None):
                    warnings.filterwarnings(
                        action="ignore",
                        message="Column names not specified",
                        module="agate",
                    )

                self.main()
        finally:
            self.input_file.close()

    def _handle_sed_expressions(self) -> typeList:
        # TODO: fix this spaghetti CRAP: maybe make expressions handle dicts/named tuples instead of lists

        first_col_str = self.args.columns if self.args.columns else ""
        first_expr = [
            self.args.first_pattern, self.args.first_repl, first_col_str
        ]
        expressions = [first_expr]
        if list_expressions := getattr(self.args, "expressions_list", []):
            for _e in list_expressions:
                ex = _e.copy()

                if len(ex) < 2 or len(ex) > 3:
                    self.argparser.error(
                        f"-E/--expr takes 2 or 3 arguments; you provided {len(ex)}: {ex}"
                    )

                if len(ex) == 2:
                    ex.append(first_col_str)

                expressions.append(ex)

        for ex in expressions:
            # second pass: strip the escaping backslash from a leading dash in the repl, e.g. r'\-x' -> '-x'
            if ex[1][0:2] == r"\-":
                ex[1] = ex[1][1:]

            # compile the pattern into a regex
            if not self.literal_match_mode:
                ex[0] = re.compile(ex[0])

            # set the column_ids
            ex[2] = parse_column_identifiers(ex[2], self.all_column_names,
                                             self.column_offset, None)
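
        # After this loop each entry of `expressions` has the normalized shape
        #   [pattern (compiled regex, or literal str in -m/--match-literal mode), repl str, list of column ids]
        # e.g. (illustrative): [re.compile(r'(?i)\bbob\b'), 'Robert', [0]]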

        return expressions