Example #1
def _parse_img_headings(
    page_dir: str, invalidate_cache: bool, language_code: str
) -> None:
    meta_path = join(page_dir, 'img', 'meta.json')
    meta_arr = _getJSON(meta_path)['img_meta']

    if invalidate_cache:
        for m in meta_arr:
            m.pop('headings', None)
    
    text_path = join(page_dir, 'text.json')
    page_html = _getJSON(text_path)['html']
    
    image_headings = _get_image_headings(page_html, language_code)
    for filename, headings in image_headings.items():
        if not _valid_img_type(filename): continue
        if len(headings) == 0: continue
        
        res = [
            i for i, x in enumerate(meta_arr)
            if unquote(x['url']).split('/wiki/')[-1] == filename
        ]

        if len(res) != 1: continue
        i = res[0]

        # TODO: should not update when invalidate_cache=False, even though we already queried the page
        meta_arr[i]['headings'] = headings
            
    _dump(meta_path, {"img_meta": meta_arr})
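Every example in this file relies on the project-internal JSON helpers _getJSON and _dump, which are not shown. A minimal sketch of what they are assumed to do (plain UTF-8 JSON read/write); the real project may add error handling or formatting options:

import json

def _getJSON(path) -> dict:
    # Read and parse a JSON file from disk.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

def _dump(path, obj: dict) -> None:
    # Serialize obj back to disk, keeping non-ASCII text readable.
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(obj, f, ensure_ascii=False)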
Example #2
def filter_img_metadata(
    data_path: str,
    predicate: Callable[[Dict[str, Any]], bool],
    field_to_remove: str,
    offset: int = 0,
    limit: int = None,
    debug_info: bool = False,
) -> None:
    article_paths = [
        join(data_path, f)
        for f in listdir(data_path) if isdir(join(data_path, f))
    ]

    valid_limit = _validated_limit(limit, offset, len(article_paths))
    for i in range(offset, offset + valid_limit):
        path = article_paths[i]
        if debug_info: print(i, path)
    
        meta_path = join(path, 'img/', 'meta.json')
        meta_arr = _getJSON(meta_path)['img_meta']
        
        meta_arr_filtered = [x for x in meta_arr if predicate(x)]
        for x in meta_arr_filtered:
            # the field is now useless since it is always the same
            x.pop(field_to_remove, None)
                
        _dump(meta_path, {"img_meta": meta_arr_filtered})
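A hedged usage sketch for filter_img_metadata: the predicate keeps entries not flagged as icons (the is_icon field appears in later examples), while the data path and the removed field name are purely illustrative:

# Hypothetical call: drop icon entries and strip an illustrative field
# from the remaining metadata of the first 100 articles.
filter_img_metadata(
    data_path='data/',
    predicate=lambda meta: not meta.get('is_icon', False),
    field_to_remove='on_commons',  # illustrative field name, not from the original code
    limit=100,
    debug_info=True,
)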
Example #3
def generate_visual_features(
    data_path: str,
    offset: int = 0,
    limit: int = None,
    mapper: IMapper = None,
    invalidate_cache: bool = False,
    debug_info: bool = False,
) -> None:
    article_paths = [
        join(data_path, f) 
        for f in listdir(data_path) if isdir(join(data_path, f))
    ]
    
    valid_limit = _validated_limit(limit, offset, len(article_paths))
    mapper = mapper if mapper else ResNet152Mapper() 
    
    for i in range(offset, offset + valid_limit):
        path = article_paths[i]
        if debug_info: print(i, path)
    
        meta_path = join(path, 'img/', 'meta.json')
        meta_arr = _getJSON(meta_path)['img_meta']
        for meta in meta_arr:
            if 'features' in meta and not invalidate_cache: 
                continue
                
            img_path = join(path, 'img/', meta['filename'])
            try:
                meta['features'] = mapper.map(img_path)
            except Exception as e:
                print("ERROR: exception for image", img_path, '|||', str(e))
                continue
                
        _dump(meta_path, {"img_meta": meta_arr})
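generate_visual_features only requires the mapper to expose a map(img_path) method returning a feature vector; the interface below is a sketch inferred from that single call site (ResNet152Mapper itself is part of the original project and is not reproduced here):

from abc import ABC, abstractmethod
from typing import List

class IMapper(ABC):
    # Assumed interface: one method that turns an image file into a feature vector.
    @abstractmethod
    def map(self, img_path: str) -> List[float]:
        ...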
Example #4
def _remove_obsolete_imgs(img_dir: Path, img_links: PageGenerator,
                          params: "QueryParams") -> None:
    uptodate_imgs = [_get_img_path(img, img_dir) for img in img_links]
    icon_removal = params.early_icons_removal
    img_names = ([
        x[1].name
        for x in uptodate_imgs if _valid_img_type(x[0], icon_removal)
    ] + [
        x[2].name
        for x in uptodate_imgs if _valid_img_type(x[0], icon_removal)
    ])

    files = [img_dir / f for f in listdir(img_dir) if isfile(join(img_dir, f))]
    for fpath in files:
        fname = fpath.name
        if (fname not in img_names) and fname[-5:].lower() != ".json":
            print("Removing obsolete image", fpath)
            fpath.unlink()

    meta_path = img_dir / 'meta.json'
    if not meta_path.exists():
        return

    meta = _getJSON(meta_path)
    uptodate_meta = [x for x in meta['img_meta'] if x['filename'] in img_names]
    if len(meta['img_meta']) != len(uptodate_meta):
        print("META", img_dir)
        _dump(meta_path, {"img_meta": uptodate_meta})
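_valid_img_type is used throughout these examples but never defined here. The sketch below is only a guess at its behaviour, inferred from the call sites (a filename extension check with optional early icon filtering); the project's actual rules may differ:

def _valid_img_type(title: str, early_icons_removal: bool = False) -> bool:
    # Guess: accept common raster formats and optionally reject SVGs,
    # which are typically icons or logos.
    if early_icons_removal and title.lower().endswith('.svg'):
        return False
    return title.lower().endswith(('.jpg', '.jpeg', '.png', '.gif'))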
Example #5
def _query_img_captions_from_article(
    page_dir: Path,
    invalidate_cache: bool = False,
    language_code: str = 'en',
    debug_info: bool = False,
) -> None:
    meta_path = join(page_dir, 'img', 'meta.json')
    meta_arr = _getJSON(meta_path)['img_meta']

    if invalidate_cache:
        for m in meta_arr:
            m.pop('caption', None)
            m.pop('is_icon', None)

    text_path = join(page_dir, 'text.json')
    page_html = _getJSON(text_path)['html']

    image_captions = _get_image_captions(page_html, language_code, debug_info)
    for filename, caption in image_captions:
        if not _valid_img_type(filename): continue

        res = [
            i for i, x in enumerate(meta_arr)
            if unquote(x['url']).split('/wiki/')[-1] == filename
        ]
        if len(res) != 1:
            if debug_info:
                print('WARNING: Meta for page {} is missing the image {}. Either it was'
                      ' removed intentionally or the cache is outdated'.format(page_dir, filename))
            continue

        i = res[0]
        caption_match_description = (
            ('description' not in meta_arr[i]) or (caption != _remove_prefix(
                meta_arr[i]['description'], "English: ")))

        if 'caption' not in meta_arr[i] and caption_match_description:
            meta_arr[i]['caption'] = caption
            meta_arr[i]['is_icon'] = False  # preview only applies to non-icons

    _dump(meta_path, {"img_meta": meta_arr})
Example #6
def _query_img_captions_from_preview(
    page_dir: Path,
    driver: WebDriver,
    icons: Set[str],
    language_code: str = 'en',
    debug_info: bool = False,
) -> None:
    img_dir = _get_path(page_dir / "img", create_if_not_exists=False)
    meta_path = img_dir / 'meta.json'
    meta_arr = _getJSON(meta_path)['img_meta']

    page_id = basename(page_dir)
    if page_id == '':
        page_id = basename(str(page_dir)[:-1])

    for i, meta in enumerate(meta_arr):
        img_title = meta['title']
        if not _valid_img_type(img_title):
            continue

        file_label = _get_translated_file_label(language_code)
        # TODO: here we extract img_id without language-specific File: prefix
        # and later on we add it again to build a URL. Check whether we could
        # work WITH that language-specific part and thus avoid translations
        img_id = unquote(meta['url']).split('/wiki/{}'.format(file_label))[-1]

        if 'caption' in meta_arr[i]:
            if debug_info: print('Skipping cached caption', img_id)
            continue

        if 'is_icon' in meta_arr[i] and meta_arr[i]['is_icon']:
            if debug_info: print('Skipping known icon', img_id)
            continue

        caption = _parse_caption_with_js(driver, language_code, page_id,
                                         img_id, icons, debug_info)

        if caption is None: icons.add(img_id)
        meta_arr[i].pop('caption', None)
        meta_arr[i]['is_icon'] = (caption is None)

        caption_match_description = (
            ('description' not in meta_arr[i]) or (caption != _remove_prefix(
                meta_arr[i]['description'], "English: ")))

        if caption and caption_match_description:
            meta_arr[i]['caption'] = caption

    _dump(meta_path, {"img_meta": meta_arr})
Example #7
def _is_meta_outdated(meta_path: Path, img_links: PageGenerator,
                      params: "QueryParams") -> bool:
    if not meta_path.exists():
        return True

    if not params.invalidate_cache.oudated_img_meta_cache:
        return False

    meta = _getJSON(meta_path)['img_meta']
    meta_titles = [x['title'] for x in meta]
    current_titles = [
        x.title(with_ns=False) for x in img_links
        if _valid_img_type(x.title(with_ns=False), params.early_icons_removal)
    ]

    res = sorted(meta_titles) != sorted(current_titles)
    if res and params.debug_info: print("OUTDATED META", meta_path)
    return res
Example #8
def tokenize_image_titles(
    data_path: str,
    offset: int = 0,
    limit: int = None,
    invalidate_cache: bool = False,
    debug_info: bool = False,
) -> None:
    article_paths = [
        join(data_path, f) 
        for f in listdir(data_path) if isdir(join(data_path, f))
    ]
    
    valid_limit = _validated_limit(limit, offset, len(article_paths))
    tokenizer = CrazyTokenizer(hashtags='split')
    mapper = str.maketrans({x: '' for x in string.punctuation})
    regex = re.compile(r'(\d+)')

    for i in range(offset, offset + valid_limit):
        path = article_paths[i]
        if debug_info: print(i, path)
    
        meta_path = join(path, 'img/', 'meta.json')
        meta_arr = _getJSON(meta_path)['img_meta']
        for meta in meta_arr:
            if 'parsed_title' in meta and not invalidate_cache:
                continue
                
            filename = os.path.splitext(meta['title'])[0]
            sentence = filename.translate(mapper)
            sentence = regex.sub(r' \g<1> ', sentence)

            tokens = []
            for word in sentence.split():
                tokens += (
                    tokenizer.tokenize("#" + word) 
                    if not word.isdigit() 
                    else [word]
                )
            
            meta['parsed_title'] = " ".join(tokens)
                
        _dump(meta_path, {"img_meta": meta_arr})
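Examples #2, #3, #8 and #9 clamp their offset/limit window through _validated_limit, which is also not shown. A plausible sketch, assuming limit=None means "everything after offset":

def _validated_limit(limit, offset: int, total: int) -> int:
    # Clamp the requested window [offset, offset + limit) to the number of items.
    limit = total - offset if limit is None else limit
    return max(0, min(limit, total - offset))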
Example #9
def query(filename: str, params: QueryParams) -> None:
    site = pywikibot.Site(code=params.language_code,
                          fam='wikipedia',
                          user='******')
    pages = list(
        pagegenerators.TextfilePageGenerator(filename=filename, site=site))
    limit = _validated_limit(params.limit, params.offset, len(pages))

    icons: Set[str] = set()

    # TODO: don't start the driver when fill_captions=False
    options = Options()
    options.headless = True
    driver = webdriver.Firefox(options=options)

    print('Downloading... offset={}, limit={}'.format(params.offset, limit))
    tc, uc = 0, 0
    for i in range(params.offset, params.offset + limit):
        p = pages[i]
        if p.pageid == 0:
            print("\nERROR: Cannot fetch the page " + p.title())
            continue

        # onyshchak: create_if_not_exists - switch to enrich only existing data
        page_dir = _get_path(
            out_dir=params.out_dir + p.title(as_filename=True).rstrip('.'),
            create_if_not_exists=not params.only_update_cached_pages)

        if not page_dir.exists():
            continue

        if params.debug_info: print('\n{}) {}'.format(i, page_dir))
        should_download_article = lambda path: (
            not path.exists()
            or stat(path).st_size == 0
            or params.invalidate_cache.text_cache
        )

        text_path = page_dir / 'text.json'
        if should_download_article(text_path):
            if params.debug_info: print("Downloading text.json")
            page_json = {
                "title": p.title(),
                "id": p.pageid,
                "url": p.full_url(),
            }

            if params.fill_property.text_wikitext:
                page_json["wikitext"] = p.text

            if params.fill_property.text_html:
                response = urllib.request.urlopen(p.full_url())
                page_json["html"] = response.read().decode("utf-8")

            _dump(text_path, page_json)

        # downloading page images
        tc, uc = _img_download(p.imagelinks(), page_dir, params, tc, uc)

        if params.fill_property.img_caption:
            _query_img_captions(
                page_dir=page_dir,
                driver=driver,
                icons=icons,
                language_code=params.language_code,
                invalidate_cache=params.invalidate_cache.caption_cache,
                debug_info=params.debug_info,
            )

    print('\nDownloaded {} images, {} of them unavailable from Commons'.format(
        tc, uc))
    driver.quit()

    icons_json = _getJSON(_KNOWN_ICONS_PATH)
    updated_icons = icons.union(icons_json['known_icons'])
    _dump(_KNOWN_ICONS_PATH, {"known_icons": list(updated_icons)})
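Example #9 and the helpers above read a number of attributes off QueryParams. A rough sketch of the implied structure; the attribute names are taken from those accesses, while the grouping into dataclasses and the defaults are guesses:

from dataclasses import dataclass, field

@dataclass
class InvalidateCacheParams:
    text_cache: bool = False
    caption_cache: bool = False
    oudated_img_meta_cache: bool = False  # spelling as used in example #7

@dataclass
class FillPropertyParams:
    text_wikitext: bool = True
    text_html: bool = True
    img_caption: bool = True

@dataclass
class QueryParams:
    out_dir: str = 'data/'
    language_code: str = 'en'
    offset: int = 0
    limit: int = None
    debug_info: bool = False
    only_update_cached_pages: bool = False
    early_icons_removal: bool = False
    invalidate_cache: InvalidateCacheParams = field(default_factory=InvalidateCacheParams)
    fill_property: FillPropertyParams = field(default_factory=FillPropertyParams)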