Пример #1
0
    def main(self):
        """Write each input record as a series of ``[fieldname, value]`` rows.

        After validating the command-line options, a header row taken from
        ``FLAT_COLUMNS['names']`` is written, followed by one or more output
        rows per field of every input record:

        * a multi-line value produces one output row per line;
        * when ``-L/--chop-length`` is set, every line is further split into
          chunks of at most that many characters;
        * the first output row of a field carries the field name; later rows
          carry ``<fieldname>~<n>`` (``n`` counting up from 1) when
          ``-B/--chop-label`` is set, otherwise an empty name;
        * if ``self.end_of_record_marker`` is set, a marker row is written
          between consecutive records.

        Errors are reported through ``self.argparser.error``.
        """
        if self.additional_input_expected():
            self.argparser.error(
                'You must provide an input file or piped data.')

        if self.args.label_chopped_values and not self.args.chop_length:
            self.argparser.error(
                '-B/--chop-label is an invalid option unless -L/--chop-length is specified'
            )

        # presumably sets self.end_of_record_marker -- helper is not visible here
        self._figure_out_record_marker()

        maxvallength = self.args.chop_length
        if maxvallength:
            # greedy chunks of 1..maxvallength characters
            valpattern = re.compile(fr'[^\n]{{1,{maxvallength}}}')
        else:
            # no chopping: the whole line is a single chunk
            valpattern = re.compile(r'.+')

        # Only a literal True switches labelling on (same test as before).
        label_continuations = self.args.label_chopped_values is True

        myio = self.init_io(write_header=False)
        myio.output.writerow(FLAT_COLUMNS['names'])

        for y, row in enumerate(myio.rows):
            if self.end_of_record_marker and y > 0:
                # separator row between the previous record and this one
                myio.output.writerow((self.end_of_record_marker, None))

            for x, fieldname in enumerate(myio.column_names):
                # NOTE(review): an empty or whitespace-only value yields no
                # lines here, so the field produces no output row at all --
                # confirm that this is intended.
                rows_written = 0
                for value in row[x].strip().splitlines():
                    # A blank line inside the value is emitted as one empty
                    # chunk so the vertical spacing survives flattening.
                    chunks = valpattern.findall(value) if value else [""]
                    for chunk in chunks:
                        if rows_written == 0:
                            fname = fieldname
                        elif label_continuations:
                            fname = f'{fieldname}~{rows_written}'
                        else:
                            fname = None
                        # Label first, then count: blank lines and text chunks
                        # share one numbering, so labels never repeat or skip.
                        rows_written += 1
                        myio.output.writerow([fname, chunk])
Пример #2
0
    def main(self):
        """Append regex capture results from one column as new columns.

        ``self.args.target_column`` must resolve to exactly one column. Its
        value in every row is searched with ``self.args.pattern`` and the
        result is appended to the row:

        * pattern without capture groups: one new column,
          ``<column>_<DEFAULT_COL_PREFIX>``, holding the whole match;
        * pattern with capture groups: one new column per group, in group
          order. A named group yields ``<column>_<group name>``; an unnamed
          group yields ``<column>_<DEFAULT_COL_PREFIX><group number>``.

        Rows in which the pattern does not match get ``None`` in every new
        column.

        Raises:
            ArgumentErrorTK: if the column argument does not resolve to
                exactly one column.
        """
        if self.additional_input_expected():
            self.argparser.error('You must provide an input file or piped data.')

        self.args.columns = self.args.target_column
        myio = self.init_io(write_header=False)

        if len(myio.column_ids) != 1:
            raise ArgumentErrorTK(f"[COLUMN] argument expects exactly one column identifier, not {len(myio.column_ids)} columns: {myio.column_names}")

        x_cid = myio.column_ids[0]
        x_cname = myio.column_names[x_cid]

        pattern = re.compile(self.args.pattern)

        if pattern.groups == 0:
            # no capturing groups: a single new column with a generic name
            xcol_names = [f'{x_cname}_{DEFAULT_COL_PREFIX}']
        else:
            # Name every group by its index so the header always has exactly
            # as many new columns as Match.groups() has values, including for
            # patterns that mix named and unnamed groups.
            names_by_index = {idx: name for name, idx in pattern.groupindex.items()}
            xcol_names = [
                f'{x_cname}_{names_by_index[idx]}'
                if idx in names_by_index
                else f'{x_cname}_{DEFAULT_COL_PREFIX}{idx}'
                for idx in range(1, pattern.groups + 1)
            ]

        n_xcols = len(xcol_names)
        myio.output.writerow(myio.column_names + xcol_names)

        for row in myio.rows:
            rxmatch = pattern.search(row[x_cid])
            if rxmatch is None:
                newvals = [None] * n_xcols
            elif pattern.groups == 0:
                # just a single pattern match, no capture group
                newvals = [rxmatch.group()]
            else:
                newvals = rxmatch.groups()

            row.extend(newvals)
            myio.output.writerow(row)
Пример #3
0
    def main(self):
        """Append one column holding the joined regex matches of a column.

        ``self.args.target_column`` must resolve to exactly one column. Each
        row's value in that column is scanned with ``re.findall``; the first
        ``self.args.n_matches`` results (all of them when it is 0) are joined
        with ``self.args.xfind_delimiter`` and appended to the row as
        ``<column>_<DEFAULT_COL_PREFIX>``. Rows without a match get ``None``.

        Raises:
            ArgumentErrorTK: if the column argument does not resolve to
                exactly one column.
        """
        if self.additional_input_expected():
            self.argparser.error(
                'You must provide an input file or piped data.')

        self.args.columns = self.args.target_column
        myio = self.init_io(write_header=False)

        if len(myio.column_ids) != 1:
            raise ArgumentErrorTK(
                f"[COLUMN] argument expects exactly one column identifier, not {len(myio.column_ids)} columns: {myio.column_names}"
            )

        target_id = myio.column_ids[0]
        target_name = myio.column_names[target_id]

        regex = re.compile(self.args.pattern)
        joiner = self.args.xfind_delimiter

        # A limit of 0 means "no limit"; None makes the slice below open-ended.
        limit = self.args.n_matches
        if limit == 0:
            limit = None

        # Exactly one new column is added to the header.
        header = myio.column_names + [f'{target_name}_{DEFAULT_COL_PREFIX}']
        myio.output.writerow(header)

        for row in myio.rows:
            found = regex.findall(row[target_id])
            row.append(joiner.join(found[:limit]) if found else None)
            myio.output.writerow(row)
Пример #4
0
def filter_rows(
    rows: typeIterable,
    pattern_str: str,
    columns_str: str,
    column_names: list,
    default_column_ids: list,
    literal_match: bool,
    column_offset: int,
    inverse: bool,
    any_match: bool,
    # not_columns,
) -> FilteringCSVReader:
    """Wrap ``rows`` in a ``FilteringCSVReader`` applying one pattern to a set of columns.

    Args:
        rows: the rows to filter.
        pattern_str: the text to look for; compiled as a regular expression
            unless ``literal_match`` is true.
        columns_str: column identifiers to search, resolved with
            ``parse_column_identifiers``; when empty, ``default_column_ids``
            is used instead.
        column_names: names passed to ``parse_column_identifiers``.
        default_column_ids: column ids used when ``columns_str`` is empty.
        literal_match: use ``pattern_str`` as-is instead of compiling it.
        column_offset: offset passed to ``parse_column_identifiers``.
        inverse: forwarded to ``FilteringCSVReader``.
        any_match: forwarded to ``FilteringCSVReader``.

    Returns:
        The ``FilteringCSVReader`` built with ``header=False``.
    """
    # Literal mode keeps the plain string; otherwise the text is a regex.
    pattern = pattern_str if literal_match else re.compile(pattern_str)

    expr_col_ids = default_column_ids
    if columns_str:
        expr_col_ids = parse_column_identifiers(
            columns_str,
            column_names,
            column_offset,
        )

    # Every selected column is tested against the same pattern.
    return FilteringCSVReader(
        rows,
        header=False,
        patterns=dict.fromkeys(expr_col_ids, pattern),
        inverse=inverse,
        any_match=any_match,
    )
Пример #5
0
class CSVSed(JustTextUtility):
    description = """Replaces all instances of [PATTERN] with [REPL]"""

    override_flags = ["f", "L", "blanks", "date-format", "datetime-format"]

    def add_arguments(self):
        """Register the command-line arguments on ``self.argparser``.

        Options: ``-c/--columns``, ``-E/--expr`` (repeatable, each occurrence
        collects its own list of values), ``-m/--match-literal``,
        ``-G/--like-grep``, ``-R/--replace`` and ``--max``.

        Positionals, in order: ``PATTERN``, ``REPL`` and an optional ``FILE``.
        """
        self.argparser.add_argument(
            "-c",
            "--columns",
            dest="columns",
            help=
            'A comma separated list of column indices, names or ranges to be searched, e.g. "1,id,3-5".',
        )

        # NOTE(review): the last help example uses '$1:$2' as replacement;
        # Python's re.sub expects \1 or \g<1> for backreferences -- confirm
        # whether the replacement is translated elsewhere before use.
        self.argparser.add_argument(
            "-E",
            "--expr",
            dest="expressions_list",
            # required=True,
            nargs="*",
            action="append",
            type=str,
            help=r"""
                                        When you want to do multiple sed_expressions:
                                            -E 'PATTERN' 'REPL' '[names_of_columns]'

                                            'names_of_columns' is a comma-delimited list of columns; it cannot refer to
                                                columns *not included* in the `-c/--columns` flag; leave blank to match all columns

                                        e.g.
                                        -E '(?i)\b(bob|bobby|rob)\b' 'Robert' 'first_name' \
                                        -E '(?i)^smith$' 'SMITH' 'last_name' \
                                        -E '(\d{2})-(\d{3})' '$1:$2' '' \
                                        """,
        )

        self.argparser.add_argument(
            "-m",
            "--match-literal",
            dest="literal_match",
            action="store_true",
            default=False,
            help=
            "By default, [PATTERN] is assumed to be a regex. Set this flag to make it a literal text find/replace",
        )

        self.argparser.add_argument(
            "-G",
            "--like-grep",
            dest="like_grep",
            action="store_true",
            default=False,
            help=
            """Only return rows in which [PATTERN] was a match (BEFORE any transformations) – i.e. like grep's traditional behavior""",
        )

        self.argparser.add_argument(
            "-R",
            "--replace",
            dest="replace_value",
            action="store_true",
            default=False,
            help=
            "Replace entire field with [REPL], instead of just the substring matched by [PATTERN]",
        )

        self.argparser.add_argument(
            "--max",
            dest="max_match_count",
            action="store",
            default=0,
            type=int,
            help=
            "Max number of matches to replace PER FIELD. Default is 0, i.e. no limit",
        )

        self.argparser.add_argument(
            metavar="PATTERN",
            dest="first_pattern",
            type=str,
            # nargs='?',
            help="A pattern to search for",
        )

        self.argparser.add_argument(
            metavar="REPL",
            dest="first_repl",
            type=str,
            # nargs='?',
            help="A replacement pattern",
        )

        self.argparser.add_argument(
            metavar="FILE",
            nargs="?",
            dest="input_path",
            help=
            "The CSV file to operate on. If omitted, will accept input as piped data via STDIN.",
        )

    def run(self):
        """
        Open the input, run ``self.main()``, and always close the input again.

        TK: this is copy-pasted from CSVKitUtil because 'f' has to be
            overridden; maybe there's a way to refactor this...

        csvsed has special functionality: the presence of `-E/--expr` changes
        the command signature,
            i.e. from: csvsed PATTERN REPL input.csv
                 to: csvsed -E 'PATTERN' 'REPL' 'COLUMNS' -E x y z input.csv

        Because `-E` collects any number of values, the trailing input file
        may end up inside the last `-E` list; it is recovered from there when
        no FILE positional was parsed.
        """
        self.last_expr = []

        # TODO: this recovery is a heuristic and should be redesigned.
        if not self.args.input_path and self.args.expressions_list:
            self.last_expr = self.args.expressions_list[-1]
            n_values = len(self.last_expr)

            if n_values > 2:
                # 3 or 4 values: the final one is taken to be the input file
                # and is removed from the expression itself.
                self.args.input_path = self.last_expr.pop()
            elif n_values < 2:
                # The expression has implied arguments; input is hopefully
                # arriving on stdin.
                self.args.input_path = None
            # Exactly 2 values: nothing is changed. Be warned that if there is
            # no stdin, -E may have eaten the input file argument and treated
            # it as part of the expression.

        # TODO: there is no validation here for conflicting or leftover
        # positional arguments when -E/--expr is used.

        self.input_file = self._open_input_file(self.args.input_path)

        try:
            with warnings.catch_warnings():
                headerless = getattr(self.args, "no_header_row", None)
                if headerless:
                    warnings.filterwarnings(
                        action="ignore",
                        message="Column names not specified",
                        module="agate",
                    )

                self.main()
        finally:
            self.input_file.close()

    def _handle_sed_expressions(self) -> typeList:
        """Collect and normalize every sed expression given on the command line.

        The positional PATTERN/REPL pair, scoped to ``-c/--columns``, always
        comes first, followed by one entry per ``-E/--expr`` occurrence. An
        ``-E`` entry with only two values inherits the ``-c/--columns`` string
        as its column list; any other length than 2 or 3 is reported through
        ``self.argparser.error``.

        Returns:
            A list of three-item lists ``[pattern, repl, column_ids]`` where
            ``pattern`` is a compiled regex unless ``self.literal_match_mode``
            is set, ``repl`` has the escape removed from a leading ``\\-``,
            and ``column_ids`` is the result of ``parse_column_identifiers``.
        """
        # TODO: consider dicts/named tuples instead of positional lists.
        default_columns = self.args.columns or ""

        expressions = [
            [self.args.first_pattern, self.args.first_repl, default_columns]
        ]

        # Phase 1: validate the -E lists and pad them to three items. Copies
        # are used so the parsed argument lists stay untouched.
        extra_expressions = getattr(self.args, "expressions_list", [])
        if extra_expressions:
            for raw_expr in extra_expressions:
                ex = raw_expr.copy()

                if not 2 <= len(ex) <= 3:
                    self.argparser.error(
                        f"-E/--expr takes 2 or 3 arguments; you provided {len(ex)}: {ex}"
                    )

                if len(ex) == 2:
                    ex.append(default_columns)

                expressions.append(ex)

        # Phase 2: normalize each expression in place.
        for ex in expressions:
            # A replacement starting with an escaped dash (needed so the shell
            # argument is not read as an option) loses the backslash.
            if ex[1].startswith(r"\-"):
                ex[1] = ex[1][1:]

            # Patterns are regexes unless literal matching was requested.
            if not self.literal_match_mode:
                ex[0] = re.compile(ex[0])

            # Resolve the column string into column ids.
            ex[2] = parse_column_identifiers(
                ex[2], self.all_column_names, self.column_offset, None
            )

        return expressions