def _parse_img_headings(
    page_dir: str,
    invalidate_cache: bool,
    language_code: str,
) -> None:
    meta_path = join(page_dir, 'img', 'meta.json')
    meta_arr = _getJSON(meta_path)['img_meta']
    if invalidate_cache:
        for m in meta_arr:
            m.pop('headings', None)

    text_path = join(page_dir, 'text.json')
    page_html = _getJSON(text_path)['html']
    image_headings = _get_image_headings(page_html, language_code)
    for filename, headings in image_headings.items():
        if not _valid_img_type(filename):
            continue

        if len(headings) == 0:
            continue

        res = [
            i for i, x in enumerate(meta_arr)
            if unquote(x['url']).split('/wiki/')[-1] == filename
        ]
        if len(res) != 1:
            continue

        i = res[0]
        # TODO: do not overwrite cached headings when invalidate_cache=False,
        # even though the page HTML has already been queried
        meta_arr[i]['headings'] = headings

    _dump(meta_path, {"img_meta": meta_arr})


def filter_img_metadata(
    data_path: str,
    predicate: Callable[[Dict[str, Any]], bool],
    field_to_remove: str,
    offset: int = 0,
    limit: int = None,
    debug_info: bool = False,
) -> None:
    article_paths = [
        join(data_path, f) for f in listdir(data_path)
        if isdir(join(data_path, f))
    ]
    valid_limit = _validated_limit(limit, offset, len(article_paths))
    for i in range(offset, offset + valid_limit):
        path = article_paths[i]
        if debug_info:
            print(i, path)

        meta_path = join(path, 'img/', 'meta.json')
        meta_arr = _getJSON(meta_path)['img_meta']
        meta_arr_filtered = [x for x in meta_arr if predicate(x)]
        for x in meta_arr_filtered:
            # the field is now useless, since it has the same value
            # for every remaining record
            x.pop(field_to_remove, None)

        _dump(meta_path, {"img_meta": meta_arr_filtered})


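# A hypothetical usage sketch for filter_img_metadata(). It keeps only non-icon
# records and drops the 'is_icon' field, which becomes constant (always False)
# once icons are filtered out; both the predicate and the 'data/' path are
# illustrative, not part of this module's API.
#
#     filter_img_metadata(
#         data_path='data/',
#         predicate=lambda meta: not meta.get('is_icon', False),
#         field_to_remove='is_icon',
#         debug_info=True,
#     )

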
def generate_visual_features(
    data_path: str,
    offset: int = 0,
    limit: int = None,
    mapper: IMapper = None,
    invalidate_cache: bool = False,
    debug_info: bool = False,
) -> None:
    article_paths = [
        join(data_path, f) for f in listdir(data_path)
        if isdir(join(data_path, f))
    ]
    valid_limit = _validated_limit(limit, offset, len(article_paths))
    mapper = mapper if mapper else ResNet152Mapper()
    for i in range(offset, offset + valid_limit):
        path = article_paths[i]
        if debug_info:
            print(i, path)

        meta_path = join(path, 'img/', 'meta.json')
        meta_arr = _getJSON(meta_path)['img_meta']
        for meta in meta_arr:
            if 'features' in meta and not invalidate_cache:
                continue

            img_path = join(path, 'img/', meta['filename'])
            try:
                meta['features'] = mapper.map(img_path)
            except Exception as e:
                print("ERROR: exception for image", img_path, '|||', str(e))
                continue

        _dump(meta_path, {"img_meta": meta_arr})


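# A minimal sketch of a drop-in feature mapper. generate_visual_features() only
# calls mapper.map(img_path), so a plain object exposing that method is assumed
# to be sufficient here (subclassing IMapper may also be required in practice).
# The Pillow and numpy imports are assumptions, not dependencies declared by
# this module.
class MeanPixelMapper:
    """Toy mapper: represents an image by its mean RGB values (3 floats)."""

    def map(self, img_path: str) -> list:
        from PIL import Image
        import numpy as np

        with Image.open(img_path) as img:
            pixels = np.asarray(img.convert('RGB'), dtype=float)
        return pixels.reshape(-1, 3).mean(axis=0).tolist()


# Usage sketch (hypothetical data path):
#
#     generate_visual_features('data/', mapper=MeanPixelMapper(), debug_info=True)

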
def _remove_obsolete_imgs(img_dir: Path, img_links: PageGenerator,
                          params: "QueryParams") -> None:
    uptodate_imgs = [_get_img_path(img, img_dir) for img in img_links]
    icon_removal = params.early_icons_removal
    img_names = ([
        x[1].name for x in uptodate_imgs if _valid_img_type(x[0], icon_removal)
    ] + [
        x[2].name for x in uptodate_imgs if _valid_img_type(x[0], icon_removal)
    ])

    files = [img_dir / f for f in listdir(img_dir) if isfile(join(img_dir, f))]
    for fpath in files:
        fname = fpath.name
        if (fname not in img_names) and fname[-5:].lower() != ".json":
            print("Removing obsolete image", fpath)
            fpath.unlink()

    meta_path = img_dir / 'meta.json'
    if not meta_path.exists():
        return

    meta = _getJSON(meta_path)
    uptodate_meta = [x for x in meta['img_meta'] if x['filename'] in img_names]
    if len(meta['img_meta']) != len(uptodate_meta):
        print("META", img_dir)
        _dump(meta_path, {"img_meta": uptodate_meta})


def _query_img_captions_from_article(
    page_dir: Path,
    invalidate_cache: bool = False,
    language_code: str = 'en',
    debug_info: bool = False,
) -> None:
    meta_path = join(page_dir, 'img', 'meta.json')
    meta_arr = _getJSON(meta_path)['img_meta']
    if invalidate_cache:
        for m in meta_arr:
            m.pop('caption', None)
            m.pop('is_icon', None)

    text_path = join(page_dir, 'text.json')
    page_html = _getJSON(text_path)['html']
    image_captions = _get_image_captions(page_html, language_code, debug_info)
    for filename, caption in image_captions:
        if not _valid_img_type(filename):
            continue

        res = [
            i for i, x in enumerate(meta_arr)
            if unquote(x['url']).split('/wiki/')[-1] == filename
        ]
        if len(res) != 1:
            if debug_info:
                print('WARNING: Meta for page {} is missing the image {}. Either it was'
                      ' removed intentionally or the cache is outdated'.format(page_dir, filename))
            continue

        i = res[0]
        caption_match_description = (
            ('description' not in meta_arr[i]) or
            (caption != _remove_prefix(meta_arr[i]['description'], "English: ")))
        if 'caption' not in meta_arr[i] and caption_match_description:
            meta_arr[i]['caption'] = caption
            meta_arr[i]['is_icon'] = False  # preview querying only applies to non-icons

    _dump(meta_path, {"img_meta": meta_arr})


def _query_img_captions_from_preview(
    page_dir: Path,
    driver: WebDriver,
    icons: Set[str],
    language_code: str = 'en',
    debug_info: bool = False,
) -> None:
    img_dir = _get_path(page_dir / "img", create_if_not_exists=False)
    meta_path = img_dir / 'meta.json'
    meta_arr = _getJSON(meta_path)['img_meta']
    page_id = basename(page_dir)
    if page_id == '':
        page_id = basename(str(page_dir)[:-1])

    for i, meta in enumerate(meta_arr):
        img_title = meta['title']
        if not _valid_img_type(img_title):
            continue

        file_label = _get_translated_file_label(language_code)
        # TODO: here we extract img_id without the language-specific "File:" prefix
        # and later add it back to build a URL. Check whether we could work WITH
        # that language-specific part and thus avoid the translations altogether.
        img_id = unquote(meta['url']).split('/wiki/{}'.format(file_label))[-1]
        if 'caption' in meta_arr[i]:
            if debug_info:
                print('Skipping cached caption', img_id)
            continue

        if 'is_icon' in meta_arr[i] and meta_arr[i]['is_icon']:
            if debug_info:
                print('Skipping known icon', img_id)
            continue

        caption = _parse_caption_with_js(driver, language_code, page_id, img_id,
                                         icons, debug_info)
        if caption is None:
            icons.add(img_id)
            meta_arr[i].pop('caption', None)

        meta_arr[i]['is_icon'] = (caption is None)
        caption_match_description = (
            ('description' not in meta_arr[i]) or
            (caption != _remove_prefix(meta_arr[i]['description'], "English: ")))
        if caption and caption_match_description:
            meta_arr[i]['caption'] = caption

    _dump(meta_path, {"img_meta": meta_arr})


def _is_meta_outdated(meta_path: Path, img_links: PageGenerator,
                      params: "QueryParams") -> bool:
    if not meta_path.exists():
        return True

    if not params.invalidate_cache.oudated_img_meta_cache:
        return False

    meta = _getJSON(meta_path)['img_meta']
    meta_titles = [x['title'] for x in meta]
    current_titles = [
        x.title(with_ns=False) for x in img_links
        if _valid_img_type(x.title(with_ns=False), params.early_icons_removal)
    ]

    res = sorted(meta_titles) != sorted(current_titles)
    if res and params.debug_info:
        print("OUTDATED META", meta_path)

    return res


def tokenize_image_titles(
    data_path: str,
    offset: int = 0,
    limit: int = None,
    invalidate_cache: bool = False,
    debug_info: bool = False,
) -> None:
    article_paths = [
        join(data_path, f) for f in listdir(data_path)
        if isdir(join(data_path, f))
    ]
    valid_limit = _validated_limit(limit, offset, len(article_paths))
    tokenizer = CrazyTokenizer(hashtags='split')
    mapper = str.maketrans({x: '' for x in string.punctuation})
    regex = re.compile(r'(\d+)')
    for i in range(offset, offset + valid_limit):
        path = article_paths[i]
        if debug_info:
            print(i, path)

        meta_path = join(path, 'img/', 'meta.json')
        meta_arr = _getJSON(meta_path)['img_meta']
        for meta in meta_arr:
            if 'parsed_title' in meta and not invalidate_cache:
                continue

            filename = os.path.splitext(meta['title'])[0]
            sentence = filename.translate(mapper)  # strip punctuation
            sentence = regex.sub(r' \g<1> ', sentence)  # pad digit runs with spaces
            tokens = []
            for word in sentence.split():
                tokens += (
                    tokenizer.tokenize("#" + word) if not word.isdigit()
                    else [word]
                )

            meta['parsed_title'] = " ".join(tokens)

        _dump(meta_path, {"img_meta": meta_arr})


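# Illustrative walk-through of the per-title pipeline above (hypothetical
# filename; the exact hashtag segmentation depends on CrazyTokenizer's word
# lists, so the final split shown is approximate):
#
#     'Golden_Gate_Bridge_1933.jpg'
#       -> drop extension, strip punctuation:  'GoldenGateBridge1933'
#       -> pad digit runs with spaces:         'GoldenGateBridge 1933 '
#       -> tokenize('#GoldenGateBridge') + ['1933'] -> 'golden gate bridge 1933'

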
def query(filename: str, params: QueryParams) -> None:
    site = pywikibot.Site(code=params.language_code, fam='wikipedia',
                          user='******')
    pages = list(
        pagegenerators.TextfilePageGenerator(filename=filename, site=site))
    limit = _validated_limit(params.limit, params.offset, len(pages))

    icons: Set[str] = set()
    # TODO: don't start the driver when fill_property.img_caption is False
    options = Options()
    options.headless = True
    driver = webdriver.Firefox(options=options)

    print('Downloading... offset={}, limit={}'.format(params.offset, limit))
    tc, uc = 0, 0
    for i in range(params.offset, params.offset + limit):
        p = pages[i]
        if p.pageid == 0:
            print("\nERROR: Cannot fetch the page " + p.title())
            continue

        # onyshchak: create_if_not_exists - switch to enrich only existing data
        page_dir = _get_path(
            out_dir=params.out_dir + p.title(as_filename=True).rstrip('.'),
            create_if_not_exists=not params.only_update_cached_pages)
        if not page_dir.exists():
            continue

        if params.debug_info:
            print('\n{}) {}'.format(i, page_dir))

        should_download_article = lambda path: (
            not path.exists() or stat(path).st_size == 0
            or params.invalidate_cache.text_cache)

        text_path = page_dir / 'text.json'
        if should_download_article(text_path):
            if params.debug_info:
                print("Downloading text.json")

            page_json = {
                "title": p.title(),
                "id": p.pageid,
                "url": p.full_url(),
            }
            if params.fill_property.text_wikitext:
                page_json["wikitext"] = p.text

            if params.fill_property.text_html:
                response = urllib.request.urlopen(p.full_url())
                page_json["html"] = response.read().decode("utf-8")

            _dump(text_path, page_json)

        # downloading the page images
        tc, uc = _img_download(p.imagelinks(), page_dir, params, tc, uc)
        if params.fill_property.img_caption:
            _query_img_captions(
                page_dir=page_dir,
                driver=driver,
                icons=icons,
                language_code=params.language_code,
                invalidate_cache=params.invalidate_cache.caption_cache,
                debug_info=params.debug_info,
            )

    print('\nDownloaded {} images, {} of them unavailable from Commons'
          .format(tc, uc))
    driver.quit()

    icons_json = _getJSON(_KNOWN_ICONS_PATH)
    updated_icons = icons.union(icons_json['known_icons'])
    _dump(_KNOWN_ICONS_PATH, {"known_icons": list(updated_icons)})


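# Hypothetical end-to-end invocation. It assumes QueryParams can be constructed
# with the attributes referenced above (out_dir, offset, limit, language_code,
# fill_property, invalidate_cache, debug_info, ...); the file and output paths
# are placeholders.
#
#     params = QueryParams(out_dir='data/', language_code='en', offset=0, limit=10)
#     query('page_titles.txt', params)
#
# where page_titles.txt lists one Wikipedia article title per line, as expected
# by pywikibot's pagegenerators.TextfilePageGenerator.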