l2 = len(tgt_line) if l1 > l2: l1, l2 = l2, l1 assert l1 <= l2 if l1 < args.min: continue if args.relative: rel = l2/l1 if rel <= args.d: continue else: if l2-l1 <= args.d: continue lines.append(linenr) if args.v > 0: sys.stdout.write("line: %s\n" %(linenr)) if args.v > 1: sys.stdout.write(" src: %s\n" %(" ".join(src_line))) sys.stdout.write(" tgt: %s\n" %(" ".join(tgt_line))) sys.stdout.write("found %s lines\n" %(len(lines))) if args.write: write_numbers(lines, args.write)
# Scan UTF-8 text from stdin and report every line that contains more than
# three characters outside the allowed set `chars`.
#
# Reads:  args.no_ws, args.verbose, args.uniq, args.write, chars
# Writes: diagnostics and a summary to stdout; optionally the offending line
#         numbers via write_numbers(strange_lines, args.write).
in_stream = codecs.getreader("utf-8")(sys.stdin)    # read from stdin
out_stream = codecs.getwriter("utf-8")(sys.stdout)  # write to stdout

# Raw string so that "\s" is a regex escape, not a (deprecated) string escape.
re_whitespace = re.compile(r"\s+")

strange_lines = []  # 0-based numbers (from enumerate) of offending lines
for linenr, line in enumerate(in_stream):
    line = line.strip()
    if args.no_ws:
        # Pattern.sub takes (repl, string): collapse every whitespace run
        # in the line to a single space.
        line = re_whitespace.sub(u" ", line)

    # A list (not a set) so that repeated offending characters each count
    # towards the threshold.
    strange_chars = [c for c in line if c not in chars]
    if len(strange_chars) > 3:
        if args.verbose:
            out_stream.write(u"line %s offending characters:" % (linenr))
            if args.uniq:
                # Report each offending character only once.
                strange_chars = set(strange_chars)
            for c in strange_chars:
                out_stream.write(u" %s (%s)" % (c, repr(c)))
            out_stream.write(u"\n")
            # NOTE(review): assumes the offending line itself is echoed only
            # in verbose mode -- confirm against the intended output format.
            out_stream.write(line + u"\n")
        strange_lines.append(linenr)

sys.stdout.write("found %s lines\n" % (len(strange_lines)))
if args.write:
    write_numbers(strange_lines, args.write)
# chars.update(set(u""))
# Typographic quotation marks count as ordinary characters.
chars.update(u"“”")

# UTF-8 wrappers around the standard streams.
in_stream = codecs.getreader("utf-8")(sys.stdin)
out_stream = codecs.getwriter("utf-8")(sys.stdout)

# Filter stdin line by line: every line is re-emitted with the characters
# outside `chars` removed; lines with more than args.n such characters are
# additionally recorded by their 0-based line number.
strange_lines = []
for linenr, raw_line in enumerate(in_stream):
    line = raw_line.strip()

    # Split the line into disallowed and allowed characters in one pass,
    # keeping the original order (and duplicates) in both lists.
    strange_chars = []
    normal_chars = []
    for c in line:
        if c in chars:
            normal_chars.append(c)
        else:
            strange_chars.append(c)

    n_strange = len(strange_chars)
    if n_strange > args.n:
        if args.verbose:
            # NOTE(review): assumes the raw line is echoed only in verbose
            # mode -- confirm against the intended output format.
            report = u"line %s: %s offending characters: |%s|\n" % (
                linenr, n_strange, u"|".join(strange_chars))
            out_stream.write(report)
            out_stream.write(line + u"\n")
        strange_lines.append(linenr)

    cleaned = u"".join(normal_chars)
    out_stream.write("%s\n" % (cleaned))
out_stream.flush()

# The summary line stays disabled; presumably because stdout carries the
# cleaned text itself -- TODO confirm.
# sys.stdout.write("found %s lines\n" %(len(strange_lines)))
if args.write:
    write_numbers(strange_lines, args.write)