def index_zim_file(zim_filename, output_dir=".", links_dir=None,
                   index_contents=True, mime_types=DEFAULT_MIME_TYPES,
                   memory_limit=DEFAULT_MEMORY_LIMIT, processors=1,
                   commit_period=DEFAULT_COMMIT_PERIOD,
                   commit_limit=DEFAULT_COMMIT_LIMIT,
                   use_progress_bar=False, **kwargs):
    """Index the articles of a ZIM file into a Whoosh index.

    Resumes an existing index when one is found under ``output_dir``
    (articles whose ``url`` is already present are skipped); otherwise a
    new index is created.  Intermediate commits happen every
    ``commit_period`` seconds or ``commit_limit`` articles (whichever
    comes first) so an interrupted run loses little work.

    :param zim_filename: path of the ZIM file to index
    :param output_dir: base directory for the per-file index directory
    :param links_dir: optional directory with precomputed link info;
        when given, forward/reverse links are attached to each article
    :param index_contents: when False, article content is not indexed
    :param mime_types: regexes selecting which mime types to index
    :param memory_limit: Whoosh writer memory limit in MB (``limitmb``)
    :param processors: number of Whoosh writer processes (``procs``)
    :param commit_period: max seconds between intermediate commits
    :param commit_limit: max articles between intermediate commits
    :param use_progress_bar: show a progress bar instead of timestamped
        log updates
    """
    zim_obj = ZimFile(zim_filename, cache_size=ZIM_CACHE_SIZE)
    logger.info("Indexing: %s", zim_filename)
    if not index_contents:
        logger.info("Not indexing article contents")

    if links_dir is not None:
        logger.debug("Loading links file")
        links_info = load_links_file(zim_filename, links_dir)
        if len(links_info) == 0:
            logger.error("No links loaded from links directory: %s", links_dir)
    else:
        links_info = {}
        logger.warning("No links directory specified.")

    # Figure out which mime type indexes from this file we will use
    logger.debug("All mime type names: %s", zim_obj.mimeTypeList)
    logger.info("Using mime types:")
    mime_type_indexes = []
    for mt_re in mime_types:
        for mt_idx, mt_name in enumerate(zim_obj.mimeTypeList):
            if re.search(mt_re, mt_name):
                mime_type_indexes.append(mt_idx)
                logger.info(mt_name)

    index_dir = index_directory_path(output_dir, zim_filename)
    if not os.path.exists(index_dir):
        logger.debug("Creating index directory: %s", index_dir)
        os.mkdir(index_dir)

    # Don't overwrite an existing index; a searcher over it lets us skip
    # articles already indexed by a previous (possibly interrupted) run.
    if index.exists_in(index_dir):
        logger.debug("Loading existing index")
        ix = index.open_dir(index_dir)
        searcher = ix.searcher()
    else:
        logger.debug("Creating new index")
        ix = index.create_in(index_dir, get_schema())
        searcher = None

    writer = ix.writer(limitmb=memory_limit, procs=processors)
    num_articles = zim_obj.header['articleCount']

    if use_progress_bar:
        pbar = ProgressBar(widgets=[Percentage(), Bar(), ETA()],
                           maxval=num_articles).start()
    else:
        logger.info("Not using progress bar, will display timestamped occasional updates.")

    # Counter for when to output occasional updates
    update_count = 0
    last_update = datetime.now()
    needs_commit = False
    for idx, article_info in enumerate(article_info_as_unicode(zim_obj.articles())):
        if use_progress_bar:
            pbar.update(idx)
        else:
            now = datetime.now()
            if update_count >= commit_limit or now > (last_update + timedelta(seconds=commit_period)):
                logger.info("%s - %d/%d - %.2f%%",
                            now.isoformat(), idx, num_articles,
                            (idx / float(num_articles)) * 100.0)
                update_count = 0
                last_update = now
                # Commit accumulated documents and open a fresh writer so
                # at most one period's work is lost on interruption.
                if needs_commit:
                    writer.commit()
                    writer = ix.writer(limitmb=memory_limit, procs=processors)
                    needs_commit = False
            else:
                update_count += 1

        # Skip articles of undesired mime types
        if article_info['mimetype'] not in mime_type_indexes:
            continue

        # Protect read of existing documents as sometimes there are
        # incomplete writes
        try:
            if searcher is not None:
                existing = searcher.document(url=article_info['url'])
            else:
                existing = None
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt /
            # SystemExit still abort the run instead of being swallowed.
            logger.exception("Unexpected exception when looking for existing indexed article for index: %d" % idx)
            existing = None

        # Skip articles that have already been indexed
        if existing is not None:
            continue

        if index_contents:
            content = content_as_text(zim_obj, article_info, idx)
            # Whoosh seems to take issue with empty content
            # and complains about it not being unicode ?!
            if content is not None and len(content.strip()) == 0:
                content = None
        else:
            content = None

        # Look for forward and backwards links
        if len(links_info) > 0:
            article_links = links_info.get(article_info['index'], None)
            if article_links is not None:
                article_info['reverse_links'] = article_links[0]
                article_info['forward_links'] = article_links[1]
            else:
                logger.debug("No links info found for index: %d", idx)

        writer.add_document(content=content, **article_info)
        needs_commit = True

    if use_progress_bar:
        pbar.finish()

    logger.info("Making final commit")
    writer.commit()
    logger.info("Finished")
def verify_indexes(zim_files, index_dir_base, indexed_count_cache=None, verbose=False):
    """Check each ZIM file's Whoosh index and log a completeness report.

    Each index is classified as missing, empty, complete, or incomplete
    (index document count vs. the number of indexable articles in the ZIM
    file).  Counting indexable articles is slow, so counts are cached in
    the ``indexed_count_cache`` pickle file when one is given; without a
    cache, completeness is not computed and only missing/empty indexes
    are reported.

    NOTE(review): this function appears to be defined twice in this file;
    the later copy shadows this one -- confirm and remove the duplicate.

    :param zim_files: iterable of ZIM file paths to check
    :param index_dir_base: base directory containing per-file index dirs
    :param indexed_count_cache: optional path to a pickle cache of
        indexable-article counts keyed by ZIM filename
    :param verbose: unused in this body -- TODO confirm against callers
    """
    missing_indexes = []
    empty_indexes = []
    complete_indexes = []
    incomplete_indexes = []

    # Load a dictionary from the index cache file
    if indexed_count_cache is not None:
        if os.path.exists(indexed_count_cache):
            logger.debug("Loading existing indexable count cache: %s", indexed_count_cache)
            # `with` closes the handle; the original leaked it via
            # pickle.load(open(...)).
            with open(indexed_count_cache, "rb") as cache_file:
                zim_indexable = pickle.load(cache_file)
        else:
            logger.debug("Opening new indexable count cache: %s", indexed_count_cache)
            zim_indexable = {}
    else:
        zim_indexable = None

    for zim_fn in zim_files:
        index_dir = index_directory_path(index_dir_base, zim_fn)
        logger.debug("ZIM File: %s", zim_fn)
        logger.debug("Index Dir: %s", index_dir)

        if not os.path.exists(index_dir):
            logger.debug("\tIndex is missing\n")
            missing_indexes.append((zim_fn, index_dir))
            continue

        with nested(closing(ZimFile(zim_fn)), closing(open_dir(index_dir))) as (zim_obj, ix):
            if ix.is_empty():
                logger.debug("\tIndex exists but is empty\n")
                empty_indexes.append((zim_fn, index_dir))
                continue

            if zim_indexable is not None:
                # Try to find indexable count from cache since it takes
                # awhile to compute these and they never change
                indexed_count = zim_indexable.get(zim_fn, None)
                if indexed_count is None:
                    mime_type_indexes = []
                    for mt_re in DEFAULT_MIME_TYPES:
                        for mt_idx, mt_name in enumerate(zim_obj.mimeTypeList):
                            if re.search(mt_re, mt_name):
                                mime_type_indexes.append(mt_idx)
                    indexed_count = 0
                    logger.debug("Checking indexable against %d articles", zim_obj.header['articleCount'])
                    for idx in xrange(zim_obj.header['articleCount']):
                        article_info = zim_obj.read_directory_entry_by_index(idx)
                        if article_info['mimetype'] in mime_type_indexes:
                            indexed_count += 1
                    zim_indexable[zim_fn] = indexed_count
                    # Store cache of indexable items in zim files; close
                    # the handle promptly (original leaked it).
                    with open(indexed_count_cache, "wb") as cache_file:
                        pickle.dump(zim_indexable, cache_file)
            else:
                indexed_count = None

            ix_count = ix.doc_count()
            zim_count = zim_obj.header['articleCount']
            logger.debug("\t%d total in ZIM file", zim_count)
            logger.debug("\t%d in index", ix_count)
            if indexed_count is not None:
                logger.debug("\t%d indexable in ZIM file", indexed_count)
                if ix_count < indexed_count:
                    incomplete_indexes.append((zim_fn, index_dir))
                    logger.debug("\tincomplete index")
                else:
                    complete_indexes.append((zim_fn, index_dir))
                    logger.debug("\tcomplete index")
            logger.debug("")

    # Now report summary information
    if len(complete_indexes) > 0:
        logger.info("----------------------")
        logger.info("Complete Index Files")
        logger.info("----------------------")
    elif zim_indexable is None:
        # Completeness is only computed when a count cache is in use, so
        # the "Not Computed" header belongs to the no-cache case.  (The
        # original tested `!= None`, which printed it exactly when counts
        # HAD been computed -- inverted.)
        logger.info("--------------------------------")
        logger.info("Complete Indexes Not Computed")
        logger.info("--------------------------------")
    for zim_fn, index_dir in complete_indexes:
        logger.info(zim_fn)

    if len(incomplete_indexes) > 0:
        logger.info("----------------------")
        logger.info("Incomplete Index Files")
        logger.info("----------------------")
    elif zim_indexable is None:
        logger.info("--------------------------------")
        logger.info("Incomplete Indexes Not Computed")
        logger.info("--------------------------------")
    for zim_fn, index_dir in incomplete_indexes:
        logger.info(zim_fn)

    if len(missing_indexes) > 0:
        logger.info("-------------------")
        logger.info("Missing Index Files")
        logger.info("-------------------")
    for zim_fn, index_dir in missing_indexes:
        logger.info(zim_fn)

    if len(empty_indexes) > 0:
        logger.info("--------------")
        logger.info("Index is Empty")
        logger.info("--------------")
    for zim_fn, index_dir in empty_indexes:
        logger.info(zim_fn)
def verify_indexes(zim_files, index_dir_base, indexed_count_cache=None, verbose=False):
    """Check each ZIM file's Whoosh index and log a completeness report.

    Each index is classified as missing, empty, complete, or incomplete
    (index document count vs. the number of indexable articles in the ZIM
    file).  Counting indexable articles is slow, so counts are cached in
    the ``indexed_count_cache`` pickle file when one is given; without a
    cache, completeness is not computed and only missing/empty indexes
    are reported.

    NOTE(review): this is a near-verbatim duplicate of an earlier
    definition of the same name in this file (this copy shadows it) --
    confirm and remove one of the two.

    :param zim_files: iterable of ZIM file paths to check
    :param index_dir_base: base directory containing per-file index dirs
    :param indexed_count_cache: optional path to a pickle cache of
        indexable-article counts keyed by ZIM filename
    :param verbose: unused in this body -- TODO confirm against callers
    """
    missing_indexes = []
    empty_indexes = []
    complete_indexes = []
    incomplete_indexes = []

    # Load a dictionary from the index cache file
    if indexed_count_cache is not None:
        if os.path.exists(indexed_count_cache):
            logger.debug("Loading existing indexable count cache: %s", indexed_count_cache)
            # `with` closes the handle; the original leaked it via
            # pickle.load(open(...)).
            with open(indexed_count_cache, "rb") as cache_file:
                zim_indexable = pickle.load(cache_file)
        else:
            logger.debug("Opening new indexable count cache: %s", indexed_count_cache)
            zim_indexable = {}
    else:
        zim_indexable = None

    for zim_fn in zim_files:
        index_dir = index_directory_path(index_dir_base, zim_fn)
        logger.debug("ZIM File: %s", zim_fn)
        logger.debug("Index Dir: %s", index_dir)

        if not os.path.exists(index_dir):
            logger.debug("\tIndex is missing\n")
            missing_indexes.append((zim_fn, index_dir))
            continue

        with nested(closing(ZimFile(zim_fn)), closing(open_dir(index_dir))) as (zim_obj, ix):
            if ix.is_empty():
                logger.debug("\tIndex exists but is empty\n")
                empty_indexes.append((zim_fn, index_dir))
                continue

            if zim_indexable is not None:
                # Try to find indexable count from cache since it takes
                # awhile to compute these and they never change
                indexed_count = zim_indexable.get(zim_fn, None)
                if indexed_count is None:
                    mime_type_indexes = []
                    for mt_re in DEFAULT_MIME_TYPES:
                        for mt_idx, mt_name in enumerate(zim_obj.mimeTypeList):
                            if re.search(mt_re, mt_name):
                                mime_type_indexes.append(mt_idx)
                    indexed_count = 0
                    logger.debug("Checking indexable against %d articles", zim_obj.header['articleCount'])
                    for idx in xrange(zim_obj.header['articleCount']):
                        article_info = zim_obj.read_directory_entry_by_index(idx)
                        if article_info['mimetype'] in mime_type_indexes:
                            indexed_count += 1
                    zim_indexable[zim_fn] = indexed_count
                    # Store cache of indexable items in zim files; close
                    # the handle promptly (original leaked it).
                    with open(indexed_count_cache, "wb") as cache_file:
                        pickle.dump(zim_indexable, cache_file)
            else:
                indexed_count = None

            ix_count = ix.doc_count()
            zim_count = zim_obj.header['articleCount']
            logger.debug("\t%d total in ZIM file", zim_count)
            logger.debug("\t%d in index", ix_count)
            if indexed_count is not None:
                logger.debug("\t%d indexable in ZIM file", indexed_count)
                if ix_count < indexed_count:
                    incomplete_indexes.append((zim_fn, index_dir))
                    logger.debug("\tincomplete index")
                else:
                    complete_indexes.append((zim_fn, index_dir))
                    logger.debug("\tcomplete index")
            logger.debug("")

    # Now report summary information
    if len(complete_indexes) > 0:
        logger.info("----------------------")
        logger.info("Complete Index Files")
        logger.info("----------------------")
    elif zim_indexable is None:
        # Completeness is only computed when a count cache is in use, so
        # the "Not Computed" header belongs to the no-cache case.  (The
        # original tested `!= None`, which printed it exactly when counts
        # HAD been computed -- inverted.)
        logger.info("--------------------------------")
        logger.info("Complete Indexes Not Computed")
        logger.info("--------------------------------")
    for zim_fn, index_dir in complete_indexes:
        logger.info(zim_fn)

    if len(incomplete_indexes) > 0:
        logger.info("----------------------")
        logger.info("Incomplete Index Files")
        logger.info("----------------------")
    elif zim_indexable is None:
        logger.info("--------------------------------")
        logger.info("Incomplete Indexes Not Computed")
        logger.info("--------------------------------")
    for zim_fn, index_dir in incomplete_indexes:
        logger.info(zim_fn)

    if len(missing_indexes) > 0:
        logger.info("-------------------")
        logger.info("Missing Index Files")
        logger.info("-------------------")
    for zim_fn, index_dir in missing_indexes:
        logger.info(zim_fn)

    if len(empty_indexes) > 0:
        logger.info("--------------")
        logger.info("Index is Empty")
        logger.info("--------------")
    for zim_fn, index_dir in empty_indexes:
        logger.info(zim_fn)