def cache_surrounding_pages(query: Query) -> None:
    """Cache the surrounding pages for the query for the user.

    Args:
        query: Query made by a user that should have its next page loaded
            into the next page cache.
    """
    utils.toggle_myaku_package_log(filename_base='web_worker')
    utils.toggle_myaku_package_log(
        filename_base='web_worker', package='search'
    )

    cache_client = NextPageCache()
    current_page_num = query.page_num
    if current_page_num < settings.MAX_SEARCH_RESULT_PAGE:
        query.page_num = current_page_num + 1
        with ArticleIndexSearcher() as searcher:
            forward_page = searcher.search_articles(query)
        cache_client.set(
            query.user_id, forward_page, NextPageDirection.FORWARD
        )

    # Don't cache the backward page unless it's a page greater than 1 because
    # page 1 is always available in the first page cache.
    if current_page_num > 2:
        query.page_num = current_page_num - 1
        with ArticleIndexSearcher() as searcher:
            backward_page = searcher.search_articles(query)
        cache_client.set(
            query.user_id, backward_page, NextPageDirection.BACKWARD
        )
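
# The two branches above form a small sliding window around the current page.
# The standalone helper below is a minimal sketch (not part of Myaku) that
# restates that window logic so it can be reasoned about in isolation; the
# name surrounding_page_nums and the max_page parameter are assumptions
# introduced only for illustration.
from typing import Dict


def surrounding_page_nums(current_page_num: int, max_page: int) -> Dict[str, int]:
    """Return the page numbers that would be cached around the current page.

    Mirrors the conditions in cache_surrounding_pages: the forward page is
    cached only if it does not exceed max_page, and the backward page is
    cached only if it is greater than 1 (page 1 lives in the first page
    cache instead).
    """
    pages = {}
    if current_page_num < max_page:
        pages['forward'] = current_page_num + 1
    if current_page_num > 2:
        pages['backward'] = current_page_num - 1
    return pages


# Example: for page 5 with a 20-page maximum, pages 6 (forward) and 4
# (backward) would be cached; for page 1, only page 2 would be cached.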
def main() -> None: """Build a shelf for JMdict data.""" utils.toggle_myaku_package_log(filename_base='build_shelf') # Creating a JapanenTextAnalyzer object will automatically create the # JMdict shelf if it's not already created. JapaneseTextAnalyzer()
def main() -> None: """Run a most recent crawl for the script arg-specified crawlers.""" utils.toggle_myaku_package_log(filename_base=LOG_NAME) stats = CrawlStats() jta = JapaneseTextAnalyzer() scorer = MyakuArticleScorer() crawler_types = parse_crawler_types_arg() for crawler_type in crawler_types: crawl_most_recent(crawler_type, jta, scorer, stats) stats.finish_stat_tracking()
REQUEST_PAGE_NUM_KEY = 'p'

MAX_QUERY_LEN = 120

_ARTICLE_LEN_GROUPS = [
    (700, 'Short length'),
    (1200, 'Medium length'),
    (2000, 'Long length')
]
_ARTICLE_LEN_GROUP_MAX_NAME = 'Very long length'

_VERY_RECENT_DAYS = 7

# Enable logging for both the myaku package and this search package to the
# same files.
utils.toggle_myaku_package_log(filename_base='myakuweb')
utils.toggle_myaku_package_log(filename_base='myakuweb', package='search')


def is_very_recent(dt: datetime) -> bool:
    """Return True if the datetime is considered very recent."""
    days_since_dt = (datetime.utcnow() - dt).days
    return days_since_dt <= _VERY_RECENT_DAYS


def json_serialize_datetime(dt: datetime) -> str:
    """Serialize a naive datetime to a UTC ISO format string."""
    return dt.isoformat(timespec='seconds') + 'Z'


class ResourceLink(NamedTuple):
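
# The _ARTICLE_LEN_GROUPS constants above suggest a simple bucketing scheme
# by character count. The helper below is an illustrative sketch of how an
# article length could be mapped to a group display name; the function name
# and its use are assumptions, not necessarily how the myakuweb views apply
# these constants.
def article_len_group_name(article_len: int) -> str:
    """Return the display name of the length group for an article length."""
    for max_len, group_name in _ARTICLE_LEN_GROUPS:
        if article_len <= max_len:
            return group_name
    return _ARTICLE_LEN_GROUP_MAX_NAME


# Example: article_len_group_name(900) -> 'Medium length';
# article_len_group_name(5000) -> 'Very long length'.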
def main() -> None: """Build the full search result first page cache.""" utils.toggle_myaku_package_log(filename_base='build_cache') with ArticleIndexDb() as db, ArticleIndexSearcher() as searcher: build_cache(db, searcher)
    cursor = db.article_collection.find(query, {'source_url': 1})
    cursor.sort('_id', pymongo.ASCENDING)

    removed_count = 0
    checker = ArticleRemovedChecker()
    for i, doc in enumerate(cursor):
        if i % 100 == 0:
            _log.info('Checked %s\tRemoved %s', i, removed_count)

        if checker.check_if_404(doc['source_url']):
            removed_count += 1
            result = db.article_collection.update_one(
                {'_id': doc['_id']},
                {'$set': {'page_removed': True}}
            )
            _log.debug(
                'Updated article with _id "%s" as removed: %s',
                doc['_id'], result.raw_result
            )
        else:
            _log.debug(
                'Article with _id "%s" has not been removed', doc['_id']
            )


if __name__ == '__main__':
    _log = logging.getLogger('myaku.runners.check_for_removed_articles')
    utils.toggle_myaku_package_log(filename_base='check_for_removed_articles')
    try:
        main()
    except BaseException:
        _log.exception('Unhandled exception in main')
        raise
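
# ArticleRemovedChecker is defined elsewhere. As a rough illustration of the
# kind of check its check_if_404 method performs, a minimal standalone
# version could look like the sketch below; this is an assumption about its
# behavior, not Myaku's actual implementation, and the use of the requests
# library here is likewise only illustrative.
import requests


def check_if_404_sketch(source_url: str, timeout: float = 10.0) -> bool:
    """Return True if requesting the URL currently yields a 404 response."""
    try:
        response = requests.get(
            source_url, timeout=timeout, allow_redirects=True
        )
    except requests.RequestException:
        # Treat network errors as "not confirmed removed" so articles are not
        # marked as removed due to transient failures.
        return False
    return response.status_code == 404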
def main() -> None: """Update the scores of the articles in the crawl db.""" utils.toggle_myaku_package_log(filename_base=LOG_NAME) timer = Timer('rescore') rescore_article_index() timer.stop()