Example #1
def generate_tv_cloud(search_params,
                      min_length,
                      stopwords,
                      ids=None,
                      stems=False):
    """Generate multi-document word-cloud data using the termvector approach.

    Parameters:
        search_params: dict with 'query', 'dates', 'distributions',
            'article_types' and 'pillars' keys describing the ES search.
        min_length: minimum term length to include in the cloud.
        stopwords: words to exclude from the final cloud data.
        ids: optional list of document ids. When given, the cloud is built
            from exactly these documents (time-line/burst mode) instead of
            resolving the document set from the query.
        stems: when True, count stemmed terms instead of raw terms.

    Returns the structure produced by counter2wordclouddata().
    """
    chunk_size = 1000
    wordcloud_counter = Counter()

    if not ids:
        # Normal (non-time-line) word cloud: resolve the document set from
        # the query, reporting progress against the total hit count.
        burst = False

        result = count_search_results(settings.ES_INDEX, settings.ES_DOCTYPE,
                                      search_params['query'],
                                      search_params['dates'],
                                      search_params['distributions'],
                                      search_params['article_types'],
                                      search_params['pillars'])
        doc_count = result.get('count')

        current_task.update_state(state='PROGRESS',
                                  meta={'current': 0, 'total': doc_count})

        chunks = document_id_chunks(
            chunk_size, settings.ES_INDEX, settings.ES_DOCTYPE,
            search_params['query'], search_params['dates'],
            search_params['distributions'], search_params['article_types'],
            search_params['pillars'])
        _accumulate_tv_chunks(chunks, doc_count, min_length, stems,
                              wordcloud_counter)
    else:
        # Time-line word cloud: the document ids are supplied by the caller.
        burst = True
        _accumulate_tv_chunks(utils.chunks(ids, chunk_size), len(ids),
                              min_length, stems, wordcloud_counter)

    return counter2wordclouddata(wordcloud_counter, burst, stopwords)


def _accumulate_tv_chunks(chunks, total, min_length, stems, wordcloud_counter):
    """Fold the term-vector counts of each id chunk into wordcloud_counter,
    reporting task progress after every chunk.

    Parameters:
        chunks: iterable of document-id subsets.
        total: total number of documents, used for progress reporting.
        min_length: minimum term length, passed through to
            termvector_wordcloud().
        stems: stemming flag, passed through to termvector_wordcloud().
        wordcloud_counter: Counter mutated in place with the accumulated
            term counts.
    """
    progress = 0
    for subset in chunks:
        result = termvector_wordcloud(settings.ES_INDEX,
                                      settings.ES_DOCTYPE, subset,
                                      min_length, stems)
        # In-place Counter addition has the same semantics as
        # `c = c + result` but avoids rebuilding the counter per chunk.
        wordcloud_counter += result

        progress += len(subset)
        current_task.update_state(state='PROGRESS',
                                  meta={'current': progress, 'total': total})
Example #2
File: tasks.py — project: johan--/texcavator
def generate_tv_cloud(search_params, min_length, stopwords, date_range=None, stems=False):
    """Build multi-document word-cloud data using the termvector approach.

    A caller-supplied date_range overrides the dates stored in
    search_params and marks the result as a burst (time-line) cloud.
    """
    chunk_size = settings.QUERY_DATA_CHUNK_SIZE
    word_counts = Counter()

    # An explicit date range takes precedence over the query's own dates.
    dates = date_range if date_range else search_params['dates']

    # Count the total number of matching documents up front so progress
    # can be reported as a fraction of the whole result set.
    total = count_search_results(settings.ES_INDEX,
                                 settings.ES_DOCTYPE,
                                 search_params['query'],
                                 dates,
                                 search_params['exclude_distributions'],
                                 search_params['exclude_article_types'],
                                 search_params['selected_pillars']).get('count')

    current_task.update_state(state='PROGRESS',
                              meta={'current': 0, 'total': total})

    # Accumulate term counts one id chunk at a time.
    done = 0
    for id_chunk in document_id_chunks(chunk_size,
                                       settings.ES_INDEX,
                                       settings.ES_DOCTYPE,
                                       search_params['query'],
                                       dates,
                                       search_params['exclude_distributions'],
                                       search_params['exclude_article_types'],
                                       search_params['selected_pillars']):
        chunk_counts = termvector_wordcloud(settings.ES_INDEX,
                                            settings.ES_DOCTYPE,
                                            id_chunk,
                                            min_length,
                                            stems)
        word_counts = word_counts + chunk_counts

        done += len(id_chunk)
        current_task.update_state(state='PROGRESS',
                                  meta={'current': done, 'total': total})

    # Only a caller-supplied date range flags this as a burst cloud.
    return counter2wordclouddata(word_counts, date_range is not None, stopwords)
Example #3
def generate_tv_cloud(search_params, min_length, stopwords, ids=None, stems=False):
    """Build multi-document word-cloud data using the termvector approach.

    When ids is empty/None the document set is resolved from the query in
    search_params (plain cloud); otherwise the supplied ids are used and
    the result is flagged as a burst (time-line) cloud.
    """
    chunk_size = 1000
    counts = Counter()

    def report(done, total):
        # Push task progress so callers can track completion.
        current_task.update_state(state='PROGRESS',
                                  meta={'current': done, 'total': total})

    if ids:
        # Time-line word cloud: the document set is fixed by the caller.
        burst = True
        seen = 0
        for chunk in utils.chunks(ids, chunk_size):
            counts = counts + termvector_wordcloud(settings.ES_INDEX,
                                                   settings.ES_DOCTYPE,
                                                   chunk,
                                                   min_length,
                                                   stems)
            seen += len(chunk)
            report(seen, len(ids))
    else:
        # Plain (non-time-line) word cloud: resolve the document set from
        # the query and report progress against the total hit count.
        burst = False
        total = count_search_results(settings.ES_INDEX,
                                     settings.ES_DOCTYPE,
                                     search_params['query'],
                                     search_params['dates'],
                                     search_params['distributions'],
                                     search_params['article_types'],
                                     search_params['pillars']).get('count')
        report(0, total)

        seen = 0
        for chunk in document_id_chunks(chunk_size,
                                        settings.ES_INDEX,
                                        settings.ES_DOCTYPE,
                                        search_params['query'],
                                        search_params['dates'],
                                        search_params['distributions'],
                                        search_params['article_types'],
                                        search_params['pillars']):
            counts = counts + termvector_wordcloud(settings.ES_INDEX,
                                                   settings.ES_DOCTYPE,
                                                   chunk,
                                                   min_length,
                                                   stems)
            seen += len(chunk)
            report(seen, total)

    return counter2wordclouddata(counts, burst, stopwords)