def articles(hosts, index, report=False, dryrun=False, force=False, title=None):
    """Sync MediaWiki articles into Elasticsearch.

    Compares article titles/timestamps from the MediaWiki proxy against what
    is already indexed, then (re)saves publishable pages and deletes
    unpublishable ones from the index.

    @param hosts: Elasticsearch hosts (passed through to set_hosts_index).
    @param index: Elasticsearch index name.
    @param report: If True, only report what would be updated, then return.
    @param dryrun: If True, build Page objects but do not save/delete.
    @param force: If True, update every article currently in Elasticsearch.
    @param title: If given, update only this single article title.
    """
    i = set_hosts_index(hosts=hosts, index=index)
    logprint('debug', '------------------------------------------------------------------------')
    # authors need to be refreshed
    logprint('debug', 'getting mw_authors,articles...')
    mw_author_titles = Proxy.authors(cached_ok=False)
    mw_articles = Proxy.articles_lastmod()
    logprint('debug', 'getting es_articles...')
    es_articles = Page.pages()
    logprint('debug', 'mediawiki articles: %s' % len(mw_articles))
    logprint('debug', 'elasticsearch articles: %s' % len(es_articles))
    # Decide which titles to (re)index.
    if title:
        articles_update = [title]
    else:
        if force:
            logprint('debug', 'forcibly update all articles')
            articles_update = [page['title'] for page in es_articles]
            articles_delete = []
        else:
            logprint('debug', 'determining new,delete...')
            articles_update,articles_delete = Elasticsearch.articles_to_update(
                mw_author_titles, mw_articles, es_articles)
    logprint('debug', 'articles to update: %s' % len(articles_update))
    #logprint('debug', 'articles to delete: %s' % len(articles_delete))
    if report:
        return

    logprint('debug', 'getting encycrg titles...')
    rg_titles = Page.rg_titles()
    logprint('debug', 'encycrg titles: %s' % len(rg_titles))
    if not rg_titles:
        logprint('info', 'NO ENCYC-RG ARTICLES!!!')
        logprint('info', 'RUN "encyc articles --force" AFTER THIS PASS TO MARK rg/notrg LINKS')

    logprint('debug', 'adding articles...')
    could_not_post = []
    unpublished = []
    errors = []
    for n,title in enumerate(articles_update):
        logprint('debug', '--------------------')
        logprint('debug', '%s/%s %s' % (n+1, len(articles_update), title))
        logprint('debug', 'getting from mediawiki')
        mwpage = Proxy.page(title, rg_titles=rg_titles)
        # A missing document just means this is a brand-new article; any other
        # exception should propagate rather than be silently swallowed.
        try:
            existing_page = Page.get(title)
            logprint('debug', 'exists in elasticsearch')
        except NotFoundError:
            existing_page = None
        if (mwpage.published or config.MEDIAWIKI_SHOW_UNPUBLISHED):
            logprint('debug', 'creating page')
            page = Page.from_mw(mwpage, page=existing_page)
            if not dryrun:
                logprint('debug', 'saving %s "%s"' % ('articles', page.url_title))
                try:
                    page.save()
                except SerializationError:
                    logprint('error', 'ERROR: Could not serialize to Elasticsearch!')
                # Read the document back to confirm the save actually landed.
                try:
                    Page.get(title)
                except NotFoundError:
                    logprint('error', 'ERROR: Page(%s) NOT SAVED!' % title)
                    errors.append(title)
                logprint('debug', 'ok')
        else:
            # delete from ES if present
            logprint('debug', 'not publishable: %s' % mwpage)
            if existing_page:
                logprint('debug', 'deleting...')
                existing_page.delete()
            unpublished.append(mwpage)

    # NOTE(review): could_not_post is never appended to in this function;
    # kept for backward compatibility with the summary output below.
    if could_not_post:
        logprint('debug', '========================================================================')
        logprint('debug', 'Could not post these: %s' % could_not_post)
    if unpublished:
        logprint('debug', '========================================================================')
        logprint('debug', 'Unpublished these: %s' % unpublished)
    if errors:
        logprint('info', 'ERROR: %s titles were unpublishable:' % len(errors))
        for title in errors:
            logprint('info', 'ERROR: %s' % title)
    if not rg_titles:
        logprint('info', 'NO ENCYC-RG ARTICLES!!!')
        logprint('info', 'RUN "encyc articles --force" AFTER THIS PASS TO MARK rg/notrg LINKS')
        logprint('info', 'NOTE: ENCYC-RG MUST BE ACCESSIBLE IN ORDER TO BUILD RG ARTICLES LIST.')
    logprint('debug', 'DONE')