def main(self):
    """Flatten each CSV record into (fieldname, value) rows, optionally chopping
    long values into chunks of at most `-L/--chop-length` characters.

    Output columns come from FLAT_COLUMNS['names']; an end-of-record marker row
    is emitted between records when `self.end_of_record_marker` is set.
    """
    if self.additional_input_expected():
        self.argparser.error(
            'You must provide an input file or piped data.')
    # -B/--label-chopped-values only makes sense when values are being chopped
    # NOTE(review): the message below has a stray leading double-quote and says
    # "-B/--chop-label", which looks inconsistent with dest=label_chopped_values
    # — confirm the real flag spelling before changing the string.
    if self.args.label_chopped_values and not self.args.chop_length:
        self.argparser.error(
            '"-B/--chop-label is an invalid option unless -L/--chop-length is specified'
        )

    self._figure_out_record_marker()

    maxvallength = self.args.chop_length
    if maxvallength:
        # match runs of up to maxvallength non-newline chars (the "chunks")
        valpattern = re.compile(fr'[^\n]{{1,{maxvallength}}}')
    else:
        # no chop length: each line is one single chunk
        valpattern = re.compile(r'.+')

    myio = self.init_io(write_header=False)
    myio.output.writerow(FLAT_COLUMNS['names'])

    for y, row in enumerate(myio.rows):
        # separate consecutive records with a marker row (skip before first record)
        if self.end_of_record_marker and y > 0:
            myio.output.writerow((self.end_of_record_marker, None))

        for x, fieldname in enumerate(myio.column_names):
            linevalues = row[x].strip().splitlines()
            # _linecount tracks how many output rows this field has produced so far;
            # the first row keeps the bare fieldname, continuations get "name~N"
            _linecount = 0
            for i, value in enumerate(linevalues):
                if not value:  # i.e. a blank new line
                    _linecount += 1
                    fname = f'{fieldname}~{_linecount}' if self.args.label_chopped_values is True else None
                    myio.output.writerow([fname, ""])
                else:
                    chunks = valpattern.findall(value)
                    for j, chunk in enumerate(chunks):
                        if _linecount == 0:
                            # very first row for this field: plain fieldname
                            fname = fieldname
                        else:
                            # continuation row: labeled only if -B was given
                            fname = f'{fieldname}~{_linecount}' if self.args.label_chopped_values is True else None
                        _linecount += 1
                        myio.output.writerow([fname, chunk])
def main(self):
    """Apply a regex to one target column and append the captured values as
    new columns to every row.

    New-column naming:
      - no capture groups: one column named '{col}_{DEFAULT_COL_PREFIX}'
      - named groups:      one column per named group, '{col}_{groupname}'
      - positional groups: '{col}_{DEFAULT_COL_PREFIX}1', ..., one per group

    Raises ArgumentErrorTK if [COLUMN] resolves to more than one column.
    """
    if self.additional_input_expected():
        self.argparser.error('You must provide an input file or piped data.')

    self.args.columns = self.args.target_column
    myio = self.init_io(write_header=False)

    if len(myio.column_ids) != 1:
        raise ArgumentErrorTK(
            f"[COLUMN] argument expects exactly one column identifier, not {len(myio.column_ids)} columns: {myio.column_names}"
        )

    x_cid = myio.column_ids[0]
    x_cname = myio.column_names[x_cid]
    pattern = re.compile(self.args.pattern)

    if pattern.groups == 0:
        # regex contains no capturing groups, so just one new column
        # is created with a generic name
        n_xcols = 1
        xcol_names = [f'{x_cname}_{DEFAULT_COL_PREFIX}']
    elif pattern.groupindex:
        # i.e. has named captured groups
        n_xcols = len(pattern.groupindex)
        xcol_names = [f'{x_cname}_{k}' for k in pattern.groupindex.keys()]
    else:
        # just regular captured groups
        n_xcols = pattern.groups
        xcol_names = [f'{x_cname}_{DEFAULT_COL_PREFIX}{i+1}' for i in range(n_xcols)]

    new_fieldnames = myio.column_names + xcol_names
    myio.output.writerow(new_fieldnames)

    for row in myio.rows:
        xval = row[x_cid]
        rxmatch = pattern.search(xval)
        if not rxmatch:
            newvals = [None] * n_xcols
        elif pattern.groups == 0:
            # just a single pattern match, no capture group
            newvals = [rxmatch.group()]
        elif pattern.groupindex:
            # BUGFIX: extract only the NAMED groups. The previous code used
            # rxmatch.groups(), which returns ALL groups; when a pattern mixes
            # named and unnamed groups, that produced more values than the
            # header had columns, misaligning the output.
            newvals = [rxmatch.group(name) for name in pattern.groupindex]
        else:
            newvals = list(rxmatch.groups())
        row.extend(newvals)
        myio.output.writerow(row)
def main(self):
    """Find every regex match in one target column and append a single new
    column holding all matches joined by the xfind delimiter.

    `--n-matches` of 0 means unlimited; a row with no matches gets None.
    Raises ArgumentErrorTK if [COLUMN] resolves to more than one column.
    """
    if self.additional_input_expected():
        self.argparser.error(
            'You must provide an input file or piped data.')

    self.args.columns = self.args.target_column
    myio = self.init_io(write_header=False)
    if len(myio.column_ids) != 1:
        raise ArgumentErrorTK(
            f"[COLUMN] argument expects exactly one column identifier, not {len(myio.column_ids)} columns: {myio.column_names}"
        )

    target_id = myio.column_ids[0]
    target_name = myio.column_names[target_id]
    rx = re.compile(self.args.pattern)
    joiner = self.args.xfind_delimiter
    # 0 means "no cap"; None as a slice bound keeps every match
    match_cap = None if self.args.n_matches == 0 else self.args.n_matches

    # exactly one column is added, with the generic prefix name
    header = myio.column_names + [f'{target_name}_{DEFAULT_COL_PREFIX}']
    myio.output.writerow(header)

    for row in myio.rows:
        found = rx.findall(row[target_id])
        row.append(joiner.join(found[0:match_cap]) if found else None)
        myio.output.writerow(row)
def filter_rows(
    rows: typeIterable,
    pattern_str: str,
    columns_str: str,
    column_names: list,
    default_column_ids: list,
    literal_match: bool,
    column_offset: int,
    inverse: bool,
    any_match: bool,
) -> FilteringCSVReader:
    """Wrap `rows` in a FilteringCSVReader that keeps (or, with `inverse`,
    drops) rows matching `pattern_str` in the selected columns.

    When `literal_match` is true the pattern is used as a plain substring;
    otherwise it is compiled as a regex. `columns_str`, if non-empty, is parsed
    into column ids; otherwise `default_column_ids` is used.
    """
    # literal matches stay plain strings; regexes are compiled up front
    pattern = pattern_str if literal_match else re.compile(pattern_str)

    if columns_str:
        target_ids = parse_column_identifiers(
            columns_str,
            column_names,
            column_offset,
        )
    else:
        target_ids = default_column_ids

    # FilteringCSVReader expects a {column_id: pattern} mapping
    return FilteringCSVReader(
        rows,
        header=False,
        patterns={cid: pattern for cid in target_ids},
        inverse=inverse,
        any_match=any_match,
    )
class CSVSed(JustTextUtility):
    """sed-like find/replace over CSV fields, with support for multiple
    `-E/--expr` expressions, literal matching, grep-like filtering, and
    whole-field replacement."""

    description = """Replaces all instances of [PATTERN] with [REPL]"""

    override_flags = ["f", "L", "blanks", "date-format", "datetime-format"]

    def add_arguments(self):
        """Register csvsed's flags and positional arguments on self.argparser."""
        self.argparser.add_argument(
            "-c",
            "--columns",
            dest="columns",
            help='A comma separated list of column indices, names or ranges to be searched, e.g. "1,id,3-5".',
        )

        self.argparser.add_argument(
            "-E",
            "--expr",
            dest="expressions_list",
            # required=True,
            nargs="*",
            action="append",
            type=str,
            help=r"""
            When you want to do multiple sed_expressions:
                -E 'PATTERN' 'REPL' '[names_of_columns]'

            'names_of_columns' is a comma-delimited list of columns; it cannot refer to
            columns *not included* in the `-c/--columns` flag; leave blank to match all columns

            e.g.
            -E '(?i)\b(bob|bobby|rob)\b' 'Robert' 'first_name' \
            -E '^(?i)smith$' 'SMITH' 'last_name' \
            -E '(\d{2})-(\d{3})' '$1:$2' '' \
            """,
        )

        self.argparser.add_argument(
            "-m",
            "--match-literal",
            dest="literal_match",
            action="store_true",
            default=False,
            help="By default, [PATTERN] is assumed to be a regex. Set this flag to make it a literal text find/replace",
        )

        self.argparser.add_argument(
            "-G",
            "--like-grep",
            dest="like_grep",
            action="store_true",
            default=False,
            help="""Only return rows in which [PATTERN] was a match (BEFORE any transformations) – i.e. like grep''s traditional behavior""",
        )

        self.argparser.add_argument(
            "-R",
            "--replace",
            dest="replace_value",
            action="store_true",
            default=False,
            help="Replace entire field with [REPL], instead of just the substring matched by [PATTERN]",
        )

        self.argparser.add_argument(
            "--max",
            dest="max_match_count",
            action="store",
            default=0,
            type=int,
            help="Max number of matches to replace PER FIELD. Default is 0, i.e. no limit",
        )

        self.argparser.add_argument(
            metavar="PATTERN",
            dest="first_pattern",
            type=str,
            # nargs='?',
            help="A pattern to search for",
        )

        self.argparser.add_argument(
            metavar="REPL",
            dest="first_repl",
            type=str,
            # nargs='?',
            help="A replacement pattern",
        )

        self.argparser.add_argument(
            metavar="FILE",
            nargs="?",
            dest="input_path",
            help="The CSV file to operate on. If omitted, will accept input as piped data via STDIN.",
        )

    def run(self):
        """
        A wrapper around the main loop of the utility which handles opening and
        closing files.

        TK: This is copy-pasted form CSVKitUtil because we have to override 'f';
        maybe there's a way to refactor this...

        csvsed has special functionality, in which the presence of `-E/--expr`
        changes the command signature, i.e. from:

            csvsed PATTERN REPL input.csv

        to:

            csvsed -E 'PATTERN' 'REPL' 'COLUMNS' -E x y z input.csv
        """
        self.last_expr = []

        if not self.args.input_path:
            # then it must have been eaten by an -E flag; we assume the input file
            # is in last_expr[-1], where `last_expr` is the last member of
            # expressions_list
            # TODO: THIS IS CRAP
            if self.args.expressions_list:
                self.last_expr = self.args.expressions_list[-1]

                if len(self.last_expr) > 2:
                    # could be either 3 or 4 members; the trailing one is the file path
                    self.args.input_path = self.last_expr.pop()
                elif len(self.last_expr) == 2:
                    pass
                    # do nothing, but be warned that if there is no stdin,
                    # then -E might have eaten up the input_file argument
                    # and interpreted it as pattern
                else:
                    # else, last_expr has an implied third argument, and
                    # input_path is hopefully stdin
                    self.args.input_path = None

        # TODO(review): a large block of commented-out positional-argument error
        # handling (ambiguous PATTERN/REPL vs. -E combinations) was removed here;
        # it never ran and the cases it covered are still unhandled.

        self.input_file = self._open_input_file(self.args.input_path)

        try:
            with warnings.catch_warnings():
                # suppress agate's "Column names not specified" warning when
                # running with --no-header-row
                if getattr(self.args, "no_header_row", None):
                    warnings.filterwarnings(
                        action="ignore",
                        message="Column names not specified",
                        module="agate",
                    )
                self.main()
        finally:
            self.input_file.close()

    def _handle_sed_expressions(self) -> typeList:
        """Collect the positional PATTERN/REPL pair plus every -E expression into
        a list of [compiled_pattern, repl, column_ids] triples.

        NOTE(review): reads self.literal_match_mode, self.all_column_names and
        self.column_offset, which are not set anywhere in this class as shown —
        presumably assigned in main() before this is called; confirm.
        """
        # TODO: fix this spaghetti CRAP: maybe make expressions handle dicts/named typles instead of lists
        first_col_str = self.args.columns if self.args.columns else ""
        first_expr = [
            self.args.first_pattern, self.args.first_repl, first_col_str
        ]
        expressions = [first_expr]
        if list_expressions := getattr(self.args, "expressions_list", []):
            for i, _e in enumerate(list_expressions):
                ex = _e.copy()

                if len(ex) < 2 or len(ex) > 3:
                    self.argparser.error(
                        f"-E/--expr takes 2 or 3 arguments; you provided {len(ex)}: {ex}"
                    )

                if len(ex) == 2:
                    # no columns given: inherit the top-level -c/--columns selection
                    ex.append(first_col_str)
                expressions.append(ex)

        for ex in expressions:
            # this branch re-loops through the_expressions and fixes any leading
            # dashes in the repls
            if ex[1][0:2] == r"\-":
                ex[1] = ex[1][1:]
            # compile the pattern into a regex
            if not self.literal_match_mode:
                ex[0] = re.compile(ex[0])
            # set the column_ids
            ex[2] = parse_column_identifiers(ex[2], self.all_column_names,
                                             self.column_offset, None)

        return expressions