def handle(self, *args, **options):
    n_document_ids = 100000
    if len(args) > 0:
        n_document_ids = int(args[0])

    # Cap the requested number of ids at the index size.
    match_all = {'query': {'match_all': {}}}
    total_docs = _es().count(settings.ES_INDEX, settings.ES_DOCTYPE,
                             match_all).get('count', 0)
    if n_document_ids > total_docs:
        n_document_ids = total_docs

    # Empty database
    DocID.objects.all().delete()

    self.stdout.write(
        'Retrieving {num} document ids...'.format(num=n_document_ids))

    fields = []
    get_more_docs = True
    start = 0
    num = 2500
    num_retrieved = 0

    # Page through the match_all results, storing the ids in bulk.
    while get_more_docs:
        doc_ids = []
        results = _es().search(index=settings.ES_INDEX,
                               doc_type=settings.ES_DOCTYPE,
                               body=match_all,
                               fields=fields,
                               from_=start,
                               size=num)
        for result in results['hits']['hits']:
            num_retrieved = num_retrieved + 1
            doc_ids.append(DocID(doc_id=result['_id']))

            if num_retrieved == n_document_ids:
                get_more_docs = False
                break  # stop mid-page; do not store more ids than requested

            if num_retrieved % 1000 == 0:
                self.stdout.write('. ', ending='')
                self.stdout.flush()

        DocID.objects.bulk_create(doc_ids)
        start = start + num

    self.stdout.write('')
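# All of these commands call a module-level _es() helper that is not part of
# this section. A minimal sketch of what it presumably looks like, assuming
# the elasticsearch-py client; the ES_HOSTS setting name is an assumption,
# not taken from the source:
from elasticsearch import Elasticsearch

def _es():
    # Client for the cluster configured in the Django settings (assumed).
    return Elasticsearch(settings.ES_HOSTS)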
def handle(self, *args, **options):
    query_size = 10000
    es_retrieve = 2500
    if len(args) > 0:
        query_size = int(args[0])

    if DocID.objects.all().count() == 0:
        print 'Document ids must be gathered before query terms can be ' \
              'extracted.\nPlease execute python manage.py gatherdocids'
        sys.exit(1)

    # Empty database
    QueryTerm.objects.all().delete()

    self.stdout.write('Retrieving {} documents...'.format(query_size))

    terms = set()

    # select random documents
    document_set = DocID.objects.order_by('?')[0:query_size]
    doc_ids = [doc.doc_id for doc in document_set]

    # Fetch term vectors for the sampled documents in chunks.
    for ids in utils.chunks(doc_ids, es_retrieve):
        bdy = {
            'ids': ids,
            'parameters': {
                'fields': ['article_dc_title'],
                'term_statistics': False,
                'field_statistics': False,
                'offsets': False,
                'payloads': False,
                'positions': False
            }
        }
        t_vectors = _es().mtermvectors(index=settings.ES_INDEX,
                                       doc_type=settings.ES_DOCTYPE,
                                       body=bdy)

        for doc in t_vectors.get('docs'):
            for field, data in doc.get('term_vectors').iteritems():
                for term, details in data.get('terms').iteritems():
                    t = term.encode('ascii', 'replace')
                    if len(t) <= 26:
                        # Collect plain strings; a set of unsaved model
                        # instances would not deduplicate terms.
                        terms.add(t)

    # save to database
    print 'Saving {} terms to the database.'.format(len(terms))
    QueryTerm.objects.bulk_create([QueryTerm(t) for t in terms])
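# utils.chunks is used here and in the term vector benchmark below, but its
# definition is not part of this section. A minimal sketch of the slicing
# generator it is assumed to be:
def chunks(lst, n):
    # Yield successive n-sized slices of lst.
    for i in range(0, len(lst), n):
        yield lst[i:i + n]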
def handle(self, *args, **options):
    query_size = 2500
    n_repetitions = 10
    if len(args) > 0:
        query_size = int(args[0])
    if len(args) > 1:
        n_repetitions = int(args[1])

    response_times = []

    for repetition in range(n_repetitions):
        c1 = time.time()
        es_time = []

        # select random documents
        document_set = DocID.objects.order_by('?')[0:query_size]
        doc_ids = [doc.doc_id for doc in document_set]

        if len(doc_ids) == 0:
            print 'No document ids found.\nPlease run the gatherdocids ' \
                  'command first.\n\n'
            return  # there is nothing to benchmark without document ids

        # Build the word cloud server-side: filter on the sampled ids and
        # aggregate the 100 most frequent terms in the tags field.
        query = {
            "query": {
                "filtered": {
                    "filter": {
                        "ids": {
                            "values": doc_ids
                        }
                    }
                }
            },
            "aggs": {
                "words": {
                    "terms": {
                        "field": "tags",
                        "size": 100
                    }
                }
            },
            "size": 0
        }

        c3 = time.time()
        result = _es().search(index='kb', doc_type='doc', body=query)
        wordcloud = result.get('aggregations').get('words').get('buckets')
        c4 = time.time()

        c2 = time.time()
        elapsed_c = (c2 - c1) * 1000
        response_times.append(elapsed_c)
        es_time.append((c4 - c3) * 1000)

        self.stdout.write(str(elapsed_c) + ' ES: ' + str(sum(es_time)) +
                          ' #results: ' + str(len(wordcloud)))
        self.stdout.flush()

    avg = float(sum(response_times) / len(response_times))
    print 'Average response time for generating word clouds from {num} ' \
          'documents: {avg} milliseconds'.format(num=query_size, avg=avg)
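# Each bucket in the terms aggregation response above holds the term under
# 'key' and its document frequency under 'doc_count', so the word cloud data
# can be reduced to (term, count) pairs with a small helper:
def buckets_to_pairs(buckets):
    # Convert terms-aggregation buckets into (term, doc_count) tuples.
    return [(b['key'], b['doc_count']) for b in buckets]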
def handle(self, *args, **options):
    query_size = 2500
    n_repetitions = 10
    es_retrieve = 2500
    if len(args) > 0:
        query_size = int(args[0])
    if len(args) > 1:
        n_repetitions = int(args[1])
    if len(args) > 2:
        es_retrieve = int(args[2])

    response_times = []

    for repetition in range(n_repetitions):
        c1 = time.time()
        es_time = []
        wordcloud = Counter()

        # select random documents
        document_set = DocID.objects.order_by('?')[0:query_size]
        doc_ids = [doc.doc_id for doc in document_set]

        # Build the word cloud client-side from the term vectors of the
        # sampled documents, fetched in chunks of es_retrieve ids.
        for ids in utils.chunks(doc_ids, es_retrieve):
            bdy = {
                'ids': ids,
                'parameters': {
                    'fields': ['article_dc_title', 'text_content'],
                    'term_statistics': False,
                    'field_statistics': False,
                    'offsets': False,
                    'payloads': False,
                    'positions': False
                }
            }

            c3 = time.time()
            t_vectors = _es().mtermvectors(index='kb', doc_type='doc',
                                           body=bdy)
            c4 = time.time()
            es_time.append((c4 - c3) * 1000)

            for doc in t_vectors.get('docs'):
                for field, data in doc.get('term_vectors').iteritems():
                    temp = {}
                    for term, details in data.get('terms').iteritems():
                        temp[term] = int(details['term_freq'])
                    wordcloud.update(temp)

        c2 = time.time()
        elapsed_c = (c2 - c1) * 1000
        response_times.append(elapsed_c)

        self.stdout.write(str(elapsed_c) + ' ES: ' + str(sum(es_time)))
        self.stdout.flush()

    avg = float(sum(response_times) / len(response_times))
    print 'Average response time for generating word clouds from {num} ' \
          'documents: {avg} milliseconds'.format(num=query_size, avg=avg)
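# The Counter accumulated above (collections.Counter; its import sits outside
# this section) makes trimming the client-side frequencies to a top-N word
# cloud, mirroring the size-100 terms aggregation, a single call:
def top_terms(wordcloud, n=100):
    # wordcloud is the Counter of term frequencies built in the loop above.
    return wordcloud.most_common(n)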