def iter_noslang(resource="dictionary"):
    """Yield ``(abbreviation, definition)`` pairs scraped from noslang.com.

    Walks every index page of the site (one page per starting character),
    extracts each ``<abbr>`` element's definition from its ``title``
    attribute, and cleans censored words via the ``fucking_shit`` mapping
    (defined elsewhere in this module).

    Args:
        resource: URL path segment of the noslang resource to scrape.
            TODO(review): confirm the default against the original caller;
            the value was defined outside this chunk.

    Yields:
        Tuples of (abbreviation, lower-cased uncensored definition), in the
        shape ``BytesTrie`` expects (unicode key, bytes value).
    """
    # noslang files abbreviations starting with '#' under the index page "1".
    for ch in "1" + string.ascii_lowercase:
        url = "http://www.noslang.com/{}/{}".format(resource, ch)
        print("Processing " + url)
        r = requests.get(url)
        if not r.ok:
            print("Skipping {} (status code {})".format(ch, r.status_code),
                  file=sys.stderr)
            # BUG FIX: the original printed "Skipping" but fell through and
            # parsed the failed response anyway.
            continue
        page = html.fromstring(r.text)
        for abbr in page.cssselect("abbr"):
            # The anchor immediately preceding the <abbr> carries the
            # abbreviation itself in its "name" attribute.
            a = abbr.getprevious()
            definition = abbr.attrib["title"].lower()
            if definition in fucking_shit:
                # Whole definition is a known censored token: direct lookup.
                definition = fucking_shit[definition]
            else:
                # Otherwise replace each censored substring in place.
                for stars, replacement in fucking_shit.items():
                    definition = definition.replace(stars, replacement)
            # lxml attributes are str under Python 3; BytesTrie values must
            # be bytes, so encode the definition at the boundary.
            yield a.attrib["name"], definition.encode("utf-8")


if __name__ == "__main__":
    # Expect exactly one CLI argument: the output path for the trie.
    try:
        [path] = sys.argv[1:]
    except ValueError:
        print("Usage: [prog] path/to/trie", file=sys.stderr)
        sys.exit(1)
    abbr = BytesTrie(iter_noslang())
    abbr.save(path)
import json
import sys

from marisa_trie import BytesTrie

if __name__ == "__main__":
    # Require exactly one argument: the language code (e.g. "en").
    # BUG FIX: the original indexed sys.argv[1] unguarded, so a missing
    # argument produced a raw IndexError traceback instead of usage help.
    try:
        [lang] = sys.argv[1:]
    except ValueError:
        print("Usage: [prog] lang", file=sys.stderr)
        sys.exit(1)

    print("load mention_stat")
    # JSON is UTF-8 by specification — do not rely on the platform's
    # locale default encoding.
    with open("./mention_stat_{}.json".format(lang), encoding="utf-8") as f:
        data = json.load(f)

    print("mention_stat to trie")
    # BytesTrie maps unicode keys to bytes values, so each value is
    # re-serialized to UTF-8-encoded JSON.
    trie = BytesTrie([(k, json.dumps(v).encode("utf-8"))
                      for k, v in data.items()])

    print("saving...")
    trie.save("mention_stat_{}.marisa".format(lang))
    print("Done!")