def articles(hosts, index, report=False, dryrun=False, force=False, title=None):
    """Sync MediaWiki articles into Elasticsearch.

    Compares article titles/timestamps from the MediaWiki proxy against what
    is already indexed, then (re)saves publishable pages and deletes
    unpublishable ones from the index.

    @param hosts: Elasticsearch hosts (passed through to set_hosts_index).
    @param index: Elasticsearch index name.
    @param report: If True, only report what would be updated, then return.
    @param dryrun: If True, build Page objects but do not save/delete.
    @param force: If True, update every article currently in Elasticsearch.
    @param title: If given, update only this single article title.
    """
    i = set_hosts_index(hosts=hosts, index=index)
    logprint('debug', '------------------------------------------------------------------------')
    # authors need to be refreshed
    logprint('debug', 'getting mw_authors,articles...')
    mw_author_titles = Proxy.authors(cached_ok=False)
    mw_articles = Proxy.articles_lastmod()
    logprint('debug', 'getting es_articles...')
    es_articles = Page.pages()
    logprint('debug', 'mediawiki articles: %s' % len(mw_articles))
    logprint('debug', 'elasticsearch articles: %s' % len(es_articles))
    # Decide which titles to (re)index.
    if title:
        articles_update = [title]
    else:
        if force:
            logprint('debug', 'forcibly update all articles')
            articles_update = [page['title'] for page in es_articles]
            articles_delete = []
        else:
            logprint('debug', 'determining new,delete...')
            articles_update,articles_delete = Elasticsearch.articles_to_update(
                mw_author_titles, mw_articles, es_articles)
    logprint('debug', 'articles to update: %s' % len(articles_update))
    #logprint('debug', 'articles to delete: %s' % len(articles_delete))
    if report:
        return

    logprint('debug', 'getting encycrg titles...')
    rg_titles = Page.rg_titles()
    logprint('debug', 'encycrg titles: %s' % len(rg_titles))
    if not rg_titles:
        logprint('info', 'NO ENCYC-RG ARTICLES!!!')
        logprint('info', 'RUN "encyc articles --force" AFTER THIS PASS TO MARK rg/notrg LINKS')

    logprint('debug', 'adding articles...')
    could_not_post = []
    unpublished = []
    errors = []
    for n,title in enumerate(articles_update):
        logprint('debug', '--------------------')
        logprint('debug', '%s/%s %s' % (n+1, len(articles_update), title))
        logprint('debug', 'getting from mediawiki')
        mwpage = Proxy.page(title, rg_titles=rg_titles)
        # A missing document just means this is a brand-new article; any other
        # exception should propagate rather than be silently swallowed.
        try:
            existing_page = Page.get(title)
            logprint('debug', 'exists in elasticsearch')
        except NotFoundError:
            existing_page = None
        if (mwpage.published or config.MEDIAWIKI_SHOW_UNPUBLISHED):
            logprint('debug', 'creating page')
            page = Page.from_mw(mwpage, page=existing_page)
            if not dryrun:
                logprint('debug', 'saving %s "%s"' % ('articles', page.url_title))
                try:
                    page.save()
                except SerializationError:
                    logprint('error', 'ERROR: Could not serialize to Elasticsearch!')
                # Read the document back to confirm the save actually landed.
                try:
                    Page.get(title)
                except NotFoundError:
                    logprint('error', 'ERROR: Page(%s) NOT SAVED!' % title)
                    errors.append(title)
                logprint('debug', 'ok')
        else:
            # delete from ES if present
            logprint('debug', 'not publishable: %s' % mwpage)
            if existing_page:
                logprint('debug', 'deleting...')
                existing_page.delete()
            unpublished.append(mwpage)

    # NOTE(review): could_not_post is never appended to in this function;
    # kept for backward compatibility with the summary output below.
    if could_not_post:
        logprint('debug', '========================================================================')
        logprint('debug', 'Could not post these: %s' % could_not_post)
    if unpublished:
        logprint('debug', '========================================================================')
        logprint('debug', 'Unpublished these: %s' % unpublished)
    if errors:
        logprint('info', 'ERROR: %s titles were unpublishable:' % len(errors))
        for title in errors:
            logprint('info', 'ERROR: %s' % title)
    if not rg_titles:
        logprint('info', 'NO ENCYC-RG ARTICLES!!!')
        logprint('info', 'RUN "encyc articles --force" AFTER THIS PASS TO MARK rg/notrg LINKS')
        logprint('info', 'NOTE: ENCYC-RG MUST BE ACCESSIBLE IN ORDER TO BUILD RG ARTICLES LIST.')
    logprint('debug', 'DONE')