def test_chunks():
    # An empty iterable yields no chunks.
    results = [chunk for chunk in utils.chunks([], 100)]
    assert_equals(results, [])

    # Chunk size 1 yields singleton lists.
    results = [chunk for chunk in utils.chunks(range(5), 1)]
    assert_equals(results, [[0], [1], [2], [3], [4]])

    # The final chunk may be shorter than the chunk size.
    results = [chunk for chunk in utils.chunks(range(5), 2)]
    assert_equals(results, [[0, 1], [2, 3], [4]])
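# A minimal sketch of the utils.chunks helper exercised by the test above --
# an assumption, since the real implementation lives in utils and is not
# shown here. It slices a sequence into consecutive lists of at most
# chunk_size items, the last one possibly shorter.
def chunks(sequence, chunk_size):
    """Yield successive chunk_size-sized slices from sequence."""
    for start in range(0, len(sequence), chunk_size):
        yield sequence[start:start + chunk_size]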
def generate_tv_cloud(search_params, min_length, stopwords, ids=None, stems=False):
    """Generate multiple-document word clouds using the termvector approach."""
    burst = True
    chunk_size = 1000
    progress = 0
    wordcloud_counter = Counter()

    if not ids:
        # Normal (non-timeline) word cloud, based on a query.
        burst = False
        result = count_search_results(settings.ES_INDEX,
                                      settings.ES_DOCTYPE,
                                      search_params['query'],
                                      search_params['dates'],
                                      search_params['distributions'],
                                      search_params['article_types'],
                                      search_params['pillars'])
        doc_count = result.get('count')

        info = {'current': 0, 'total': doc_count}
        current_task.update_state(state='PROGRESS', meta=info)

        for subset in document_id_chunks(chunk_size,
                                         settings.ES_INDEX,
                                         settings.ES_DOCTYPE,
                                         search_params['query'],
                                         search_params['dates'],
                                         search_params['distributions'],
                                         search_params['article_types'],
                                         search_params['pillars']):
            result = termvector_wordcloud(settings.ES_INDEX,
                                          settings.ES_DOCTYPE,
                                          subset,
                                          min_length,
                                          stems)
            wordcloud_counter += result

            progress += len(subset)
            info = {'current': progress, 'total': doc_count}
            current_task.update_state(state='PROGRESS', meta=info)
    else:
        # Timeline word cloud, based on a list of document ids.
        for subset in utils.chunks(ids, chunk_size):
            result = termvector_wordcloud(settings.ES_INDEX,
                                          settings.ES_DOCTYPE,
                                          subset,
                                          min_length,
                                          stems)
            wordcloud_counter += result

            progress += len(subset)
            info = {'current': progress, 'total': len(ids)}
            current_task.update_state(state='PROGRESS', meta=info)

    return counter2wordclouddata(wordcloud_counter, burst, stopwords)
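# A hedged sketch, not part of this module: one way a caller might poll the
# PROGRESS metadata that generate_tv_cloud publishes via
# current_task.update_state(). The task_id is assumed to come from wherever
# the task was dispatched (e.g. generate_tv_cloud.delay(...).id).
from celery.result import AsyncResult

def poll_wordcloud_progress(task_id):
    """Return the fraction of documents processed so far (0.0 - 1.0)."""
    result = AsyncResult(task_id)
    if result.state == 'PROGRESS':
        info = result.info  # the meta dict: {'current': ..., 'total': ...}
        return float(info['current']) / info['total']
    return 1.0 if result.state == 'SUCCESS' else 0.0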
def handle(self, *args, **options):
    query_size = 10000
    es_retrieve = 2500

    if len(args) > 0:
        query_size = int(args[0])

    if DocID.objects.all().count() == 0:
        print 'Document ids must be gathered before query terms can be ' \
              'extracted.\nPlease execute python manage.py gatherdocids'
        sys.exit(1)

    # Empty the database table before inserting fresh terms.
    QueryTerm.objects.all().delete()

    self.stdout.write('Retrieving {} documents...'.format(query_size))

    terms = set()

    # Select random documents.
    document_set = DocID.objects.order_by('?')[0:query_size]
    doc_ids = [doc.doc_id for doc in document_set]

    for ids in utils.chunks(doc_ids, es_retrieve):
        bdy = {
            'ids': ids,
            'parameters': {
                'fields': ['article_dc_title'],
                'term_statistics': False,
                'field_statistics': False,
                'offsets': False,
                'payloads': False,
                'positions': False
            }
        }

        t_vectors = _es().mtermvectors(index=settings.ES_INDEX,
                                       doc_type=settings.ES_DOCTYPE,
                                       body=bdy)

        for doc in t_vectors.get('docs'):
            for field, data in doc.get('term_vectors').iteritems():
                for term, details in data.get('terms').iteritems():
                    # Keep only reasonably short, ASCII-safe terms. Collect
                    # the raw strings in a set so duplicates are dropped;
                    # QueryTerm instances are built once, at insert time.
                    t = term.encode('ascii', 'replace')
                    if len(t) <= 26:
                        terms.add(t)

    # Save to database.
    print 'Saving {} terms to the database.'.format(len(terms))
    QueryTerm.objects.bulk_create([QueryTerm(t) for t in terms])
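# A hedged sketch of the _es() helper used throughout these commands -- an
# assumption, since the real connection code is not shown here. It is taken
# to return a configured elasticsearch-py client; ES_HOSTS is a hypothetical
# setting standing in for the project's actual connection configuration.
from elasticsearch import Elasticsearch

def _es():
    """Return an Elasticsearch client for the project's cluster."""
    return Elasticsearch(settings.ES_HOSTS)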
def handle(self, *args, **options):
    query_size = 2500
    n_repetitions = 10
    es_retrieve = 2500

    if len(args) > 0:
        query_size = int(args[0])
    if len(args) > 1:
        n_repetitions = int(args[1])
    if len(args) > 2:
        es_retrieve = int(args[2])

    response_times = []

    for repetition in range(n_repetitions):
        c1 = time.time()
        es_time = []
        wordcloud = Counter()

        # Select random documents.
        document_set = DocID.objects.order_by('?')[0:query_size]
        doc_ids = [doc.doc_id for doc in document_set]

        for ids in utils.chunks(doc_ids, es_retrieve):
            bdy = {
                'ids': ids,
                'parameters': {
                    'fields': ['article_dc_title', 'text_content'],
                    'term_statistics': False,
                    'field_statistics': False,
                    'offsets': False,
                    'payloads': False,
                    'positions': False
                }
            }

            # Time the Elasticsearch call separately from the total loop.
            c3 = time.time()
            t_vectors = _es().mtermvectors(index='kb', doc_type='doc',
                                           body=bdy)
            c4 = time.time()
            es_time.append((c4 - c3) * 1000)

            # Accumulate term frequencies into the word cloud counter.
            for doc in t_vectors.get('docs'):
                for field, data in doc.get('term_vectors').iteritems():
                    temp = {}
                    for term, details in data.get('terms').iteritems():
                        temp[term] = int(details['term_freq'])
                    wordcloud.update(temp)

        c2 = time.time()
        elapsed_c = (c2 - c1) * 1000
        response_times.append(elapsed_c)

        self.stdout.write(str(elapsed_c) + ' ES: ' + str(sum(es_time)))
        self.stdout.flush()

    avg = float(sum(response_times)) / len(response_times)
    print 'Average response time for generating word clouds from {num} ' \
          'documents: {avg} milliseconds'.format(num=query_size, avg=avg)
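# Hypothetical invocation of the benchmark command above. The actual command
# name depends on the file name under management/commands/, which is not
# shown here, so <benchmark_command> is a placeholder:
#
#     python manage.py <benchmark_command> [query_size] [n_repetitions] [es_retrieve]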