def own_latin1_test(inArgs):
    """ Exercise char_map.simpler_ascii() on ISO8859-1 (Latin-1) sample text.

    :param inArgs: argument list (only echoed to stdout)
    :return: 0 on success (asserts on failure)
    """
    args = inArgs
    print("ARGS:", args)
    print("")
    a = """ISO8859-1 (Latin-1) text:
"c\xE3o vir\xE1 na dire\xE7\xE3o certa, abre a p\xE1gina diz \xD3scar \xE0 \xE9gua!"
\xE1\xE9\xED\xF3\xFA \xE0....
\xC1\xC9\xCD\xD3\xDA \xC0....
\xE3..\xF5.
\xC3..\xD5.
Cedil:
\x09\xE7\xC7
"""
    aList = a.split("\n")
    # Fix: initialize lastS so an all-empty simplification cannot raise
    # UnboundLocalError at the final check.
    lastS = ""
    for lineStr in aList:
        s = char_map.simpler_ascii(lineStr)
        t = char_map.simpler_ascii(lineStr, 1)
        if s == "":
            continue
        print("s:", s)
        print("t:", t)
        lastS = s
    print("")
    # Last non-empty line is tab + c-cedilla + C-cedilla -> "cC" once stripped
    isOk = lastS.strip() == "cC"
    assert isOk
    return 0
def dump_file(out, name, do_txc, opts=None) -> int:
    """ Dump (text-like) file """
    options = DEF_DUMP_OPTS if opts is None else opts
    verbose = options["verbose"]
    kind = 1 if do_txc or verbose > 0 else 0
    _, data, codex = read_txc(name, do_txc)
    shown = (data.strip() + "\n") if do_txc else data
    if not out:
        return 0
    # Anything other than stdout (or an explicit output encoding) is
    # treated as a byte stream.
    streamed = out != sys.stdout
    out_encode = options["encode-out"]
    if out_encode:
        streamed = True
    else:
        out_encode = codex
    if options["simplify"]:
        text = char_map.simpler_ascii(shown, kind)
        out.write(text.encode("ascii") if streamed else text)
    else:
        out.write(shown.encode(out_encode) if streamed else shown)
    return 0
def simpler_list(a, sep=None):
    """ Simplify every element of a list/tuple to plain ASCII.

    A non-sequence input is simplified directly; otherwise returns either
    the list of simplified strings or, when 'sep' is given, their join.
    """
    if not isinstance(a, (list, tuple)):
        return char_map.simpler_ascii(a)
    simplified = [char_map.simpler_ascii(elem) for elem in a]
    return simplified if sep is None else sep.join(simplified)
def check_country_accs() -> bool:
    """ Sanity-check simplified bank-account strings from ibanpt.

    :return: True when every entry passes (asserts otherwise)
    """
    alist = ibanpt.bank_accounts()
    idx = 0
    for entry in alist:
        idx += 1
        shown = char_map.simpler_ascii(entry)
        print("\n#{}/{}:\n>>>{}<<<"
              "".format(idx, len(alist), shown))
        # No leading/trailing newlines expected
        assert shown.strip('\n') == shown
        # Fix: check that no double blanks remain; the previous
        # replace(" ", " ") compared the string with itself (a no-op).
        assert shown.replace("  ", " ") == shown
    return True
def dig_throu(astr) -> tuple:
    """ Split a simplified, tab-separated line into (iban, bank_id, inst_name). """
    basic = char_map.simpler_ascii(astr)
    iban, bank_id, inst_name, type_of = basic.split('\t')
    assert int(bank_id) >= 0
    int(iban)  # validation only: raises ValueError when not numeric
    # NOTE(review): split(' ') followed by ' '.join() is an identity, so this
    # assert always holds; if the intent was to reject multiple blanks,
    # split() with no argument would be needed -- confirm.
    kind = " ".join(type_of.split(' '))
    assert kind == type_of
    return (iban, bank_id, inst_name)
def _normal_s_value(self, s):
    """ Return the normalized (simplified ASCII) form of a string value;
    non-strings, or strings when strict_ch is unset, pass through unchanged.
    Registers a reference whenever the simplification changed the text.
    """
    if not isinstance(s, str) or self.strict_ch is None:
        return s
    res = char_map.simpler_ascii(s)
    if res != s:
        self._add_ref(res, s)
    return res
def _convert_to_ids(self, tags) -> dict:
    """ Partition id3v2 tags: keys that survive Latin-1 simplification
    unchanged go to "id3v2"; altered keys are recorded in "id3v2:out".
    """
    dct = {
        "@id3v2": valid_id3v2(tags),
        "id3v2": dict(),
        "id3v2:out": list(),
    }
    for akey in tags:
        if self.is_excluded(akey):
            continue
        ukey = char_map.simpler_ascii(akey, 1)  # Latin-1 conversion
        if ukey == akey:
            dct["id3v2"][akey] = tags[akey]
        else:
            dct["id3v2:out"].append((ukey, akey))
    return dct
def simple_ascii(s, special=None):
    """ Similar to simpler_ascii(), but allows a few extra chars.

    NOTE(review): the value of 'special' is currently ignored -- any
    non-None value only disables the default extras; confirm intent.
    """
    if special is None:
        extra = {0xc5: "A",  # A with ring above
                 0xe5: "a",  # a with ring above
                 }
    else:
        extra = dict()
    parts = []
    for a_chr in s:
        mapped = extra.get(ord(a_chr))
        parts.append(char_map.simpler_ascii(a_chr) if mapped is None else mapped)
    return "".join(parts)
def simplified(astr) -> str:
    """ Collapse runs of blanks and simplify the string to plain ASCII.

    :param astr: input string
    :return: simplified ASCII string
    """
    assert isinstance(astr, str)
    newstr = astr
    while True:
        # Fix: collapse double blanks into one until a fixpoint is reached.
        # The previous replace(" ", " ") returned the string unchanged, so
        # the loop exited immediately and blanks were never de-duplicated.
        this = newstr.replace("  ", " ")
        if this == newstr:
            break
        newstr = this
    res = char_map.simpler_ascii(newstr)
    if DEBUG > 0:
        if res != newstr:
            # Show each char with its decimal code point for debugging
            print(''.join([
                f"{char_map.simpler_ascii(ch)}({ord(ch)}d)" for ch in newstr
            ]))
    return res
def run_test_cat(notes, opts, d):
    """ Scan directory 'd' for TSV tables, read them, and record any table
    whose flowed content is not plain ASCII into 'notes'.

    :param notes: dict, receives {table-name: flowed-content} for non-ASCII tables
    :param opts: options dict; "ext" optionally overrides the scanned extension
    :param d: directory path to scan
    :return: 1 on invalid path, 2 when no files were found, 0 otherwise
    """
    debug = 1
    tap = TaPath(d)
    if not tap.ok_path():
        print("Invalid path:", tap)
        return 1
    if not tap.is_dir():
        # NOTE(review): not treated as fatal -- processing continues below
        print("Not a directory:", tap)
    if tap.path.startswith("../"):
        pass  # parent-relative path: keep the current working directory
    else:
        tap.cd_path()
    print("Dir: {}, abs_path: {}".format(tap, tap.abs_path))
    ttb = ttext.TsvBase("any-db")
    if opts["ext"]:
        ttb.ext = opts["ext"]
    rel_names = ttb.scan_tsv(tap.path)
    tbl = ttb.get_multiple_subnames()
    assert not tbl  # duplicate sub-names are not expected here
    fails = ttb.read_files(rel_names, debug=debug)
    print("ttb.read_files(rel_names={}) returned fails={}".format(
        rel_names, fails))
    if not ttb.names:
        print("No files found: {} (ext: {}).".format(d, opts["ext"]))
        return 2
    print("Tables:")
    print(expand_list(ttb.names, "\t- ", 1))
    print(
        expand_list(util.strlist.dict_order(ttb.names, "z")[0], "\t=",
                    post=" (reverse order)\n"))
    for name in ttb.names:
        cont = ttb.get_content(name)
        tbl = ttb.get_table(name)
        # Split each raw line into columns, then flow into a single string
        shown = [astr.split("\t") for astr in cont]
        flown = flow_list(shown)
        s_str = char_map.simpler_ascii(flown)
        if s_str != flown:
            # ...except UnicodeEncodeError (Avoid that!)
            notes[name] = flown
        msgs = tbl[3]
        print("Error msgs ({}): {}\n...\n".format(type(msgs), msgs))
    return 0
def dump_import(imp, opts, out=None) -> dict:
    """ Convert imported stock rows into lists keyed by market.

    :param imp: import object exposing a .content iterable of rows
    :param opts: dict with "filter" (currency code or None) and "pre" (line prefix)
    :param out: optional output stream for the filtered/selected rows
    :return: dict with "list" (tuples), "markets" and "market-isin" entries
    """
    err = sys.stderr
    stocks = list()
    isins, symbs = dict(), dict()
    filtered = opts["filter"]
    pre = opts["pre"]
    # Fix: 'post' was only assigned when 'pre' was truthy, so writing to
    # 'out' with an empty prefix raised NameError; define it unconditionally.
    post = "," if pre else ""
    for row in imp.content:
        alist = char_map.simpler_ascii(row)
        # Column reorder: (coin, isin, symbol, name, market)
        tup = alist[4], alist[1], alist[2], alist[0], alist[3]
        coin, isin, symb = tup[0], tup[1], tup[2]
        if isin in isins:
            err.write(f"Duplicate ISIN {isin}: {isins[isin]}\n")
            continue
        shown = tup[:-1]
        if filtered is None or filtered == coin:
            if out:
                out.write(f"{pre}{shown}{post}\n")
            stocks.append(tup)
            isins[isin] = tup
            if symb == "-":
                continue  # '-' marks "no symbol"
            if symb in symbs:
                err.write(f"Duplicate symbol '{symb}', ISIN {isin}: {symbs[symb]}\n")
            symbs[symb] = tup
    res = {"list": stocks,
           "markets": [],
           "market-isin": dict(),
           }
    for stock in stocks:
        market = stock[-1]
        if market not in res["markets"]:
            res["markets"].append(market)
            mname = short_market_name(market)
            res["market-isin"][mname] = list()
    for stock in stocks:
        mname = short_market_name(stock[-1])
        isin = stock[1]
        res["market-isin"][mname].append(isin)
    return res
def list_smas(param, ux_find, verbose) -> str:
    """ List ...Consumos.xlsx """
    found = ""
    where = param if param else ["."]

    def display(entry):
        # Drop the leading two characters (e.g. "./") before printing
        print(entry[2:])

    for path in where:
        here = ""
        listing = Dirs(path)
        for ux_str in listing.uxnames:
            name = char_map.simpler_ascii(ux_str)
            if name != ux_find:
                continue
            display(name)
            if not found:
                found, here = name, name
        if verbose > 0:
            print(f"{path} {ux_find}:", "found" if here else "not found")
    return found
def dump_text(out, name, opts, debug=0) -> int:
    """ Dump one text file """
    # Special (Latin-1) text files are simplified and printed directly
    if name.endswith(SPECIAL_TXC):
        with open(name, "r", encoding=LATIN1_TEXT) as file:
            data = file.read()
        print(char_map.simpler_ascii(data))
        return 0
    tred = BareText(name)
    is_ok = tred.utf_file_reader() if _READ_AS_UTF else tred.file_reader()
    print("tred, ok?{}: {}".format(is_ok, tred))
    if is_ok:
        for line in tred.lines:
            out.write(line + opts["dosCR"] + "\n")
        print("Debug:", name)
        if debug > 0:
            dump_bare(out, tred)
    return 0
def show_table(outFile, param, showOpts, debug=0):
    """ Show table.
    :param outFile: output stream
    :param param: parameters
    :param showOpts: show options
    :param debug: whether debug is required
    :return: None, on parameter(s) fault, or an error-code
    """
    if param == []:
        return None
    cmd, sep, adapt, verbose = showOpts
    in_name = LPath(param[0]).to_os_path()
    assert in_name is not None
    z = ZSheets(in_name, param[1:])
    _, cont = z.sheets, z.cont
    is_bin = cmd == "cat"
    for idx, pages in enumerate(cont, start=1):
        table = ZTable(pages)
        for row_num, entry in enumerate(table.cont, start=1):
            row_str = table.alt_chr_separated(entry, adapt, sep)
            simple = char_map.simpler_ascii(row_str)
            prefix = "row#{}\t".format(row_num) if verbose > 0 else ""
            line = "{}{}\n".format(prefix, simple)
            # Binary ("cat") output streams need explicit ASCII encoding
            outFile.write(line.encode("ascii") if is_bin else line)
        shown = "{}, {}/ #{}".format(in_name, idx, len(cont))
        if debug > 0:
            print("ZTable({}) minCol={}, maxCol={}".format(
                shown, table.minCol, table.maxCol))
    return 0
def show_id3_tags(fname, exclude_tags) -> bool:
    """ Shows id3 tags (raw!) """
    audio = mpaudio.Audio(fname)
    if not audio.has_tag_ids():
        print("No tag ids:", fname)
        return False
    has_ids = audio.tag_ids() is not None
    print("Time (seconds):", audio.seconds(), has_ids)
    if not has_ids:
        return False
    for key in sorted(audio.tag_ids()):
        item = audio.tag_ids()[key]
        skipped = exclude_tags and mpaudio.tag_str_within(key, exclude_tags) != ""
        text = "[skipped]" if skipped else item.pprint()
        # Latin-1 approximation of the tag's pretty-printed value
        print("akey:", type(item), key, char_map.simpler_ascii(text, 1))
    leftovers = audio.tag_unused()
    if leftovers:
        print("Unused tags (Latin-1 approximation):",
              [tag for tag, _ in leftovers])
    return True
def run_main(args):
    """ Main basic module test. """
    aformat = CountryFormats()
    ibn = IBAN()
    abbrev = None if args == [] else args[0]
    print("IBAN format, abbrev='{}': {}"
          "".format(abbrev, ibn.gen_format(abbrev)))
    dct = banks.names.BANK_NAMES
    for bkey in dct.keys():
        yyyy = bkey[:4]
        raw_name = banks.names.get_original_name(bkey)
        if aformat.simple_latin1:
            shown_name = char_map.simpler_ascii(raw_name)
        else:
            shown_name = raw_name
        print("Bank code: {} (IBAN yyyy='{}', ccode={}): {}"
              "".format(bkey, yyyy, bkey[4:], shown_name))
        # Bank numeric code must be a positive integer
        assert int(bkey[4:]) > 0
    assert check_country_accs()
    return 0
def show_stocks(outFile, param, showOpts, debug):
    """ Show stocks from Excel file """
    _, sep, adapt, verbose = showOpts
    in_name = LPath(param[0]).to_os_path()
    assert in_name is not None
    z = ZSheets(in_name)
    _, cont = z.sheets, z.cont
    for idx, pages in enumerate(cont, start=1):
        table = ZTable(pages)
        for row_num, entry in enumerate(table.cont, start=1):
            row_str = table.alt_chr_separated(entry, adapt, sep)
            simple = char_map.simpler_ascii(row_str)
            prefix = "row#{}\t".format(row_num) if verbose > 0 else ""
            outFile.write("{}{}\n".format(prefix, simple))
        shown = "{}, {}/ #{}".format(in_name, idx, len(cont))
        if debug > 0:
            print("ZTable({}) minCol={}, maxCol={}".format(
                shown, table.minCol, table.maxCol))
    return 0
def nodified(node) -> str:
    """ Returns a string-ified node. """
    simplified = char_map.simpler_ascii(node.lines, 1)
    return f"{node.kind}={simplified}"
def simpler_str(s, subst_chr="?"):
    """ Simpler ASCII string.

    NOTE(review): 'subst_chr' is accepted for API compatibility but is
    currently unused -- confirm whether it should feed the conversion.
    """
    result = char_map.simpler_ascii(s)
    return result
def simpler_ascii(a_chr):
    """ Thin module-level wrapper over char_map.simpler_ascii(). """
    result = char_map.simpler_ascii(a_chr)
    return result
def try_markdown(md_file) -> int:
    """ Try to check pangram at markdown!

    Reads the first block-quoted line (">...") as the pangram, counts
    letter frequencies (plain and accented), and asserts the simplified
    pangram matches the reference Portuguese pangram.

    :param md_file: markdown file path (Latin-1 encoded)
    :return: 0 on success, 2 when the file does not exist
    """
    pangram = ""
    tal = 0
    try:
        file = open(md_file, "r", encoding=LATIN1_TEXT)
    except FileNotFoundError:
        print("Skipped test (file not there):", md_file)
        return 2
    # Fix: the file handle was never closed; 'with' guarantees release.
    with file:
        lines = file.read().splitlines()
    for line in lines:
        if line.startswith(">"):
            pangram = line[1:].strip()
            break
    tred = BareText(md_file)
    #tred.file_reader()
    tred.add_from_buffer(pangram)
    hist = tred.histogram
    shown = char_map.simpler_ascii(pangram)
    print(f"Pangram (len={len(pangram)}): '{shown}'")
    for letter in char_map.lowercase():
        outras = 0
        upper = letter.upper()
        # Plain occurrences (both cases) ...
        count = hist.seen[ord(letter)]
        count += hist.seen[ord(upper)]
        # ... plus accented chars that simplify to this letter
        for oth in range(128, 256):
            letra = char_map.simpler_ascii(chr(oth))
            if letra == letter:
                outras += hist.seen[oth]
        print("Letter {}: {} {}{}".
              format(upper, count, outras,
                     f" (sum: {count+outras})" \
                     if outras else ""), tal)
        tal += count + outras
    count, outras, unconv = 0, 0, []
    for letter in pangram:
        num = ord(letter)
        letra = char_map.simpler_ascii(letter)
        if letra.isalpha():
            count += 1
            outras += int(num >= 128)
        else:
            if num >= 128:
                note = f"symbol={num}d, hex=0x{num:02x}"
                unconv.append(note)
    nunc = len(unconv)
    is_ok = shown == get_pangram("pt")
    print(f"Letter (all): {tal}+{nunc}, {count+outras} ({count}+{outras})",
          f"ok? {is_ok}")
    print(f"""
Example: 94 un-accented letters from pangram; 1 unconverted (in this case {nunc})
107 letters from pangram (including accented): ({count}+{outras})
""")
    if unconv:
        print("Unconverted follows:\n" + "\n".join(unconv))
    assert is_ok
    return 0
def dump_wordlist(out, err, whash, opts: dict) -> dict:
    """ Dumps hash for each word in a file. """
    # pylint: disable=line-too-long
    show_all = bool(opts.get("show-all"))
    wset = whash.infos.stats()
    queue, hshing = wset['queue'], wset['hshing']
    arange = whash.alpha_number()
    assert arange >= 10, f"alpha_number() is usually 1000; at least 10; got {arange}"
    whash.reader()
    # Re-fetch stats after reading the word file
    wset = whash.infos.stats()
    wset['excl'] = whash.excl
    # All non-comment lines become candidate words
    words = [
        wordhash.valid_word(word.rstrip('\n')) for word in whash.lines
        if not word.startswith('#')
    ]
    # dct: hash -> [(word, simplified-word)]; bysize: word-length -> hash -> words
    dct, bysize = dict(), dict()
    for size in range(3, 7 + 1, 1):
        bysize[size] = dict()
        for hsh in range(arange):
            bysize[size][hsh] = list()
    for hsh in range(arange):
        dct[hsh] = list()
    last = ""
    wset['nwords'] = len(words)
    for aword in words:
        word = char_map.simpler_ascii(aword, 1)
        s_word = char_map.simpler_ascii(aword)
        # *if* hsh in (63, 104) and word.startswith("ch") ...; cha~ vs cha'
        hsh = wordhash.word_hash(s_word)
        dct[hsh].append((word, s_word))
        last = s_word
        size = len(s_word)
        if not size:
            continue
        # Only lengths 3..7 take part in the by-size buckets
        if 3 <= size <= 7:
            bysize[size][hsh].append(s_word)  # f"{hsh:>4} {word}\n")
        #if s_word < last:
        #    err.write(f"Word '{s_word}' is not sorted alphabetically (last was '{last}')\n")
    fname = whash.fname
    if not last:
        # No usable word at all: report and bail out early
        err.write(f"Invalid: {fname}\n")
        return wset
    # Queue one ';'-joined line per hash value
    for hsh in range(arange):
        words = dct[hsh]
        shown = ';'.join([word for word, _ in words])
        queue.append((hsh, shown))
    # info_up[0]: flat list of capitalized-word entries; info_up[1]: by hash
    info_up, excluded = list(), whash.excl['must']
    info_up += [list(), dict()]
    maxsize, where = -1, 0
    # Stats
    wset['stats-bysize'][0] = 0
    for size in range(3, 7 + 1, 1):
        wset['stats-bysize'][size] = 0
    # Main loop
    for hsh in range(arange):
        idx = 0
        candidates = list()
        for size in range(3, 7 + 1, 1):
            words = bysize[size][hsh]
            if not words:
                continue
            rest, up_words = list(), list()
            for word in words:
                if word.islower():
                    # Keep unique, non-excluded lowercase words
                    if not wordhash.was_excluded(word, excluded, whash.excl):
                        if word not in rest:
                            rest.append(word)
                elif upperwords.valid_uppercase_word(word):
                    up_words.append((word, 1))
                elif word[1:].islower():
                    # Capitalized word (first letter upper, rest lower)
                    up_words.append((word, 1 + len(word)))
            if not rest:
                if not up_words:
                    continue
                # Only capitalized/uppercase words for this (size, hsh)
                for word, up_kind in up_words:
                    tofu = {
                        'hsh': hsh,
                        'size': size,
                        'word': word,
                        'kind': up_kind,
                    }
                    info_up[0].append(tofu)
                    if hsh in info_up[1]:
                        info_up[1][hsh].append(tofu)
                    else:
                        info_up[1][hsh] = [tofu]
                continue
            candidates.append((size, hsh, rest))
        if candidates:
            # The smallest word-size wins for this hash
            size, hsh, rest = candidates[0]
            hshing.append(candidates[0])
            wset['stats-bysize'][size] += 1
            idx = size
        if idx > maxsize:
            maxsize, where = idx, hsh
        if idx <= 0:
            # No candidate at all for this hash
            hshing.append((0, hsh, ["(NADA)"]))
            #out.write(f"bysize:- {hsh:>4} (NADA)\n")
            wset['stats-bysize'][0] += 1
    wset['hsh-capital'] = info_up
    wset['where'], wset['maxsize'] = where, maxsize
    wset['wthere'] = dict()  # wset['hshing'][915] = ['word1', 'word2', ...]; here indexes a word to its hash
    msg = iterate_wording(wset['hshing'], wset['wthere'])
    assert msg == "", msg
    word_sub_info_upper(info_up, wset)
    if show_all:
        word_subcalc(out, whash, hshing, wset)
    return wset
def _from_fname(self, astr):
    """ Returns the applicable header string from filename """
    return char_map.simpler_ascii(astr)