def __init__(self, lang="en", redirect_map=None, t2id=None, id2t=None, redirect_set=None):
    """Set up the title/redirect lookup tables for one language.

    Args:
        lang: Language code; used to build the default data file paths.
        redirect_map: Mapping of redirect title --> target title. Loaded
            with ``load_redirects`` from the default ``.r2t`` file when
            omitted.
        t2id: Mapping whose keys are titles. When omitted, ``id2t``,
            ``t2id`` and ``redirect_set`` are all taken from
            ``load_id2title`` on the default ``.id2t`` file, replacing
            whatever was passed for ``id2t`` and ``redirect_set``.
        id2t: Stored as ``self.id2title``; only kept as passed when
            ``t2id`` is supplied.
        redirect_set: Stored as ``self.redirect_set``; only kept as passed
            when ``t2id`` is supplied.
    """
    # Fall back to the default on-disk maps for anything not supplied.
    if t2id is None:
        id2title_path = 'data/{}wiki/idmap/{}wiki-20170520.id2t'.format(lang, lang)
        id2t, t2id, redirect_set = load_id2title(id2title_path)
    if redirect_map is None:
        redirects_path = 'data/{}wiki/idmap/{}wiki-20170520.r2t'.format(lang, lang)
        redirect_map = load_redirects(redirects_path)

    self.lang = lang
    self.call_counts = 0
    self.null_counts = 0
    self.redirect_map = redirect_map
    self.title2id = t2id
    self.id2title = id2t
    self.redirect_set = redirect_set

    # Lower-cased lookup: titles go in first, then redirects, so a redirect
    # whose lower-cased form collides with a title overrides that entry.
    lowered = {}
    for title in self.title2id:
        lowered[title.lower()] = title
    for redirect, target in self.redirect_map.items():
        lowered[redirect.lower()] = target
    self.lower2upper = lowered
type=str, required=True, help='file to write sorted counts in.') parser.add_argument('--debug', action="store_true", help='interactive') args = parser.parse_args() args = vars(args) # args = docopt("""Count popular entities, after normalizing redirects. # # Usage: # script.py <wikipath> <id2title> <redirect_map> <out> <linksout> [--debug] # # <wikipath> = path to wikipedia text generated from wikiextractor. eg. eswiki/eswiki-20170420_with_links_nodisamb/ # <redirect_map> = tsv file with redirect --> title map. eg. eswiki-20170420_with_links_nodisamb.r2t # <out> = file to write sorted counts in. # <linksout> = file to write surface --> title information. # # Options: # --debug whether to stop after AA for debugging. # """) redirect_map = load_redirects(args["redirects"]) _, t2id, _ = load_id2title(args["id2title"]) p = EntityCounter(wikipath=args["wikitext"], linksout=args["linksout"], contsout=args["contsout"], redirect_map=redirect_map, t2id=t2id, debug=args["debug"], limit=0) p.run()
# Remaining command-line options. NOTE(review): `parser` and the `--dump`
# option read below are defined before this point, outside this section.
parser.add_argument('--out', type=str, required=True,
                    help='Directory to dump csv files, e.g., mid')
parser.add_argument('--id2t', type=str, required=True, help='id --> title')
parser.add_argument('--redirects', type=str, required=True,
                    help='redirect --> title')
parser.add_argument('--lang', type=str, required=True, help='language code')
# Parse the window length as an integer up front so a malformed value is
# rejected by argparse as a usage error rather than failing later.
parser.add_argument('--window', type=int, required=True,
                    help='context window length')
args = parser.parse_args()
args = vars(args)

# Load the redirect and title maps named on the command line.
redirect2title = load_redirects(args["redirects"])
id2t, t2id, is_redirect_map = load_id2title(args["id2t"])

# Only the redirect map and the title --> id map are handed to the normalizer.
normalizer = TitleNormalizer(lang=args['lang'],
                             redirect_map=redirect2title,
                             t2id=t2id)
create_mids(link_dump_prefix=args["dump"],
            out=args["out"],
            encoding="utf-8",
            lang=args['lang'],
            window=args['window'],
            normalizer=normalizer)