def runonce(self, num_threads):
    """Parse the bundled test dump and sanity-check the extracted pages.

    Processes ``tests/test-pages-articles.xml.bz2`` with *num_threads*
    worker threads, tallies how often each page title occurs and how many
    times each redirect target is seen, then asserts the corpus looks
    sane: at least one redirect, more than 100 distinct titles, every
    title unique, and more than one distinct redirect target.
    """
    path = "tests/test-pages-articles.xml.bz2"
    print("Parsing test data")
    ctx = Wtp(num_threads=num_threads)
    title_counts = collections.defaultdict(int)
    redirect_counts = collections.defaultdict(int)
    for page_title, redirect_target in ctx.process(path, page_cb):
        title_counts[page_title] += 1
        if redirect_target is not None:
            redirect_counts[redirect_target] += 1
    print("Test data parsing complete")
    # The dump must contain at least one redirect and a reasonable
    # number of distinct pages.
    assert sum(redirect_counts.values()) > 0
    assert len(title_counts) > 100
    # Every page title should occur exactly once (no duplicate pages).
    assert all(n == 1 for n in title_counts.values())
    assert len(redirect_counts) > 1
NodeKind.LEVEL4, NodeKind.LEVEL5, NodeKind.LEVEL6): continue if (len(node.args) != 1 or len(node.args[0]) != 1 or not isinstance(node.args[0][0], str)): print(" {} - {}: {}".format(title, node.kind, node.children)) continue t = node.args[0][0] assert isinstance(t, str) print(" {} - {}".format(title, t)) titles.append(t) sys.stdout.flush() return title, titles, ctx.errors ctx = Wtp() ret = ctx.process(path, page_handler) counts = collections.defaultdict(int) titles_ht = collections.defaultdict(list) for page_title, titles, errors in ret: for title in titles: titles_ht[title].append(page_title) for err in errors: msg = err["msg"] counts[msg] += 1 print("=== MOST COMMON ERRORS") errors = list(sorted(counts.items(), key=lambda x: x[1], reverse=True)) for err, cnt in errors[:40]: print(cnt, err) print("=== Saving non-language titles in temp-nonlangs.json")