# Example no. 1
# 0
def articles(hosts, index, report=False, dryrun=False, force=False, title=None):
    """Copy articles from MediaWiki into Elasticsearch.

    Fetches the author and article listings from MediaWiki (via ``Proxy``)
    and the current pages from Elasticsearch (via ``Page``), works out which
    titles to process, then rebuilds each of them. Pages that are not
    publishable are removed from Elasticsearch if they are present there.

    Args:
        hosts: Passed through to ``set_hosts_index``.
        index: Passed through to ``set_hosts_index``.
        report: If true (and no ``title`` is given), log the number of
            articles that would be updated and return without touching
            any page.
        dryrun: If true, pages are fetched and built but nothing is saved
            to or deleted from Elasticsearch.
        force: If true (and no ``title`` is given), update every article
            currently in Elasticsearch instead of only the changed ones.
        title: If given, process only this one title; ``force`` and
            ``report`` are then not consulted.

    Returns:
        None. Progress and problems are reported through ``logprint``.
    """
    set_hosts_index(hosts=hosts, index=index)

    logprint('debug', '------------------------------------------------------------------------')
    # authors need to be refreshed
    logprint('debug', 'getting mw_authors,articles...')
    mw_author_titles = Proxy.authors(cached_ok=False)
    mw_articles = Proxy.articles_lastmod()
    logprint('debug', 'getting es_articles...')
    es_articles = Page.pages()
    logprint('debug', 'mediawiki articles: %s' % len(mw_articles))
    logprint('debug', 'elasticsearch articles: %s' % len(es_articles))

    if title:
        articles_update = [title]
    else:
        if force:
            logprint('debug', 'forcibly update all articles')
            articles_update = [page['title'] for page in es_articles]
        else:
            logprint('debug', 'determining new,delete...')
            # The delete list is computed by the helper but is not acted on
            # here; only the update list drives the loop below.
            articles_update, _articles_delete = Elasticsearch.articles_to_update(
                mw_author_titles, mw_articles, es_articles)
        logprint('debug', 'articles to update: %s' % len(articles_update))
        if report:
            return

    logprint('debug', 'getting encycrg titles...')
    rg_titles = Page.rg_titles()
    logprint('debug', 'encycrg titles: %s' % len(rg_titles))
    if not rg_titles:
        logprint('info', 'NO ENCYC-RG ARTICLES!!!')
        logprint('info', 'RUN "encyc articles --force" AFTER THIS PASS TO MARK rg/notrg LINKS')

    logprint('debug', 'adding articles...')
    could_not_post = []  # titles whose page could not be serialized
    unpublished = []     # mediawiki pages removed from Elasticsearch
    errors = []          # titles that were saved but could not be read back
    total = len(articles_update)
    for n, article_title in enumerate(articles_update, start=1):
        logprint('debug', '--------------------')
        logprint('debug', '%s/%s %s' % (n, total, article_title))
        logprint('debug', 'getting from mediawiki')
        mwpage = Proxy.page(article_title, rg_titles=rg_titles)
        try:
            existing_page = Page.get(article_title)
        except Exception:
            # Best effort: any lookup failure is treated as "not indexed yet".
            # Unlike a bare ``except`` this lets KeyboardInterrupt/SystemExit
            # propagate so the run can be aborted.
            existing_page = None
        else:
            logprint('debug', 'exists in elasticsearch')

        if not (mwpage.published or config.MEDIAWIKI_SHOW_UNPUBLISHED):
            # delete from ES if present
            logprint('debug', 'not publishable: %s' % mwpage)
            # A dry run must not modify the index, so skip the delete too.
            if existing_page and not dryrun:
                logprint('debug', 'deleting...')
                existing_page.delete()
                unpublished.append(mwpage)
            continue

        logprint('debug', 'creating page')
        page = Page.from_mw(mwpage, page=existing_page)
        if dryrun:
            continue

        logprint('debug', 'saving %s "%s"' % ('articles', page.url_title))
        try:
            page.save()
        except SerializationError:
            logprint('error', 'ERROR: Could not serialize to Elasticsearch!')
            # Nothing was written, so record the failure and skip the
            # read-back check (it could otherwise succeed on a stale copy).
            could_not_post.append(article_title)
            continue
        # Read the page back to confirm that the save actually landed.
        try:
            Page.get(article_title)
        except NotFoundError:
            logprint('error', 'ERROR: Page(%s) NOT SAVED!' % article_title)
            errors.append(article_title)
            continue
        logprint('debug', 'ok')

    if could_not_post:
        logprint('debug', '========================================================================')
        logprint('debug', 'Could not post these: %s' % could_not_post)
    if unpublished:
        logprint('debug', '========================================================================')
        logprint('debug', 'Unpublished these: %s' % unpublished)
    if errors:
        logprint('info', 'ERROR: %s titles were unpublishable:' % len(errors))
        for failed_title in errors:
            logprint('info', 'ERROR: %s' % failed_title)
    if not rg_titles:
        logprint('info', 'NO ENCYC-RG ARTICLES!!!')
        logprint('info', 'RUN "encyc articles --force" AFTER THIS PASS TO MARK rg/notrg LINKS')
        logprint('info', 'NOTE: ENCYC-RG MUST BE ACCESSIBLE IN ORDER TO BUILD RG ARTICLES LIST.')
    logprint('debug', 'DONE')