def __init__(self,
             lang="en",
             redirect_map=None,
             t2id=None,
             id2t=None,
             redirect_set=None):
    """Set up the title lookup tables and the usage counters.

    Args:
        lang: language code; stored on the instance and used to build the
            default data paths when a map has to be loaded from disk.
        redirect_map: mapping of redirect title --> target title. When
            None it is loaded with ``load_redirects`` from the default
            ``.r2t`` path for ``lang``.
        t2id: title --> id mapping. When None, all three of ``id2t``,
            ``t2id`` and ``redirect_set`` are (re)loaded with
            ``load_id2title`` from the default ``.id2t`` path for ``lang``,
            replacing whatever was passed for the other two.
        id2t: id --> title mapping, stored as ``self.id2title``. It is only
            populated automatically when ``t2id`` is None.
        redirect_set: stored as-is as ``self.redirect_set``; same loading
            rule as ``id2t``.
    """
    if t2id is None:
        id_map_path = 'data/{}wiki/idmap/{}wiki-20170520.id2t'.format(
            lang, lang)
        id2t, t2id, redirect_set = load_id2title(id_map_path)
    if redirect_map is None:
        redirects_path = 'data/{}wiki/idmap/{}wiki-20170520.r2t'.format(
            lang, lang)
        redirect_map = load_redirects(redirects_path)

    # Counters start at zero; they are only initialised here.
    self.null_counts = 0
    self.call_counts = 0

    self.lang = lang
    self.redirect_map = redirect_map
    self.title2id = t2id
    self.id2title = id2t
    self.redirect_set = redirect_set

    # Case-insensitive lookup: lowercased title --> title as stored.
    # Redirects are added second, so on a lowercase collision the redirect's
    # target replaces the entry that came from title2id.
    case_lookup = {}
    for title in t2id:
        case_lookup[title.lower()] = title
    for source, target in redirect_map.items():
        case_lookup[source.lower()] = target
    self.lower2upper = case_lookup
                        type=str,
                        required=True,
                        help='file to write sorted counts in.')
    parser.add_argument('--debug', action="store_true", help='interactive')
    # Convert the parsed namespace to a plain dict so options are read by key.
    args = vars(parser.parse_args())

    # NOTE: the command line used to be declared through a docopt usage
    # string; the argparse options replace it. Hints carried over from that
    # legacy usage text (the option names may differ now -- verify):
    #   * wiki path: wikipedia text generated from wikiextractor,
    #     e.g. eswiki/eswiki-20170420_with_links_nodisamb/
    #   * redirect map: tsv file with redirect --> title map,
    #     e.g. eswiki-20170420_with_links_nodisamb.r2t
    #   * out: file to write sorted counts in
    #   * linksout: file to write surface --> title information
    #   * --debug: whether to stop after AA for debugging

    # Load the redirect map and the title --> id map; the other two values
    # returned by load_id2title are not needed here.
    redirect_map = load_redirects(args["redirects"])
    _, t2id, _ = load_id2title(args["id2title"])

    counter = EntityCounter(wikipath=args["wikitext"],
                            linksout=args["linksout"],
                            contsout=args["contsout"],
                            redirect_map=redirect_map,
                            t2id=t2id,
                            debug=args["debug"],
                            limit=0)
    counter.run()
# Example no. 3
# 0
    parser.add_argument('--out',
                        type=str,
                        required=True,
                        help='Directory to dump csv files, e.g., mid')
    parser.add_argument('--id2t', type=str, required=True, help='id --> title')
    parser.add_argument('--redirects',
                        type=str,
                        required=True,
                        help='redirect --> title')
    parser.add_argument('--lang',
                        type=str,
                        required=True,
                        help='language code')
    # Let argparse do the integer conversion: a malformed --window is then
    # rejected with a usage error at parse time, before any of the map files
    # below are loaded, instead of raising ValueError afterwards.
    parser.add_argument('--window',
                        type=int,
                        required=True,
                        help='context window length')
    # Convert the parsed namespace to a plain dict so options are read by key.
    args = vars(parser.parse_args())

    # Load the redirect --> title map and the id/title maps from the given
    # files. Only t2id is used in the statements below; id2t and
    # is_redirect_map are kept bound in case later code relies on them.
    redirect2title = load_redirects(args["redirects"])
    id2t, t2id, is_redirect_map = load_id2title(args["id2t"])

    # Passing both maps explicitly hands the normalizer the data loaded above.
    normalizer = TitleNormalizer(lang=args['lang'],
                                 redirect_map=redirect2title,
                                 t2id=t2id)
    create_mids(link_dump_prefix=args["dump"],
                out=args["out"],
                encoding="utf-8",
                lang=args['lang'],
                window=args['window'],  # already an int (see --window above)
                normalizer=normalizer)