def handle(self, *args, **options):
        n_document_ids = 100000
        if len(args) > 0:
            n_document_ids = int(args[0])

        match_all = {'query': {'match_all': {}}}

        total_docs = _es().count(settings.ES_INDEX,
                                 settings.ES_DOCTYPE,
                                 match_all).get('count', 0)

        if n_document_ids > total_docs:
            n_document_ids = total_docs

        # Empty database
        DocID.objects.all().delete()

        self.stdout.write(
            'Retrieving {num} document ids...'.format(num=n_document_ids))

        fields = []
        get_more_docs = True
        start = 0
        num = 2500
        num_retrieved = 0

        while get_more_docs:
            doc_ids = []
            results = _es().search(index=settings.ES_INDEX,
                                   doc_type=settings.ES_DOCTYPE,
                                   body=match_all,
                                   fields=fields,
                                   from_=start,
                                   size=num)
            hits = results['hits']['hits']
            if not hits:
                # The index ran out of documents; stop to avoid looping forever.
                break

            for result in hits:
                num_retrieved += 1
                doc_ids.append(DocID(doc_id=result['_id']))

                if num_retrieved % 1000 == 0:
                    self.stdout.write('. ', ending='')
                    self.stdout.flush()

                if num_retrieved == n_document_ids:
                    # Stop mid-batch so no more than n_document_ids are saved.
                    get_more_docs = False
                    break

            DocID.objects.bulk_create(doc_ids)
            start += num
        self.stdout.write('')
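Note: paging with from_/size as above gets slower as start grows, and Elasticsearch 2.x and later reject pages beyond index.max_result_window (10,000 by default). A minimal alternative sketch using the scan/scroll helper from elasticsearch-py; the bare Elasticsearch() client and the 'kb'/'doc' names are assumptions standing in for _es() and the settings values:

from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan

es = Elasticsearch()  # assumed stand-in for _es()

# scan() drives the scroll API and yields every matching hit exactly
# once, so no from_/size bookkeeping or termination flag is needed.
ids = []
for hit in scan(es, query={'query': {'match_all': {}}},
                index='kb', doc_type='doc', _source=False):
    ids.append(hit['_id'])
    if len(ids) == 100000:
        break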
Example #2
    def handle(self, *args, **options):
        query_size = 10000
        es_retrieve = 2500

        if len(args) > 0:
            query_size = int(args[0])

        if DocID.objects.all().count() == 0:
            print 'Document ids must be gathered before query terms can be ' \
                  'extracted.\nPlease execute: python manage.py gatherdocids'
            sys.exit(1)

        # Empty database
        QueryTerm.objects.all().delete()

        self.stdout.write('Retrieving {} documents...'.format(query_size))

        terms = set()

        # select random documents
        document_set = DocID.objects.order_by('?')[0:query_size]
        doc_ids = [doc.doc_id for doc in document_set]

        for ids in utils.chunks(doc_ids, es_retrieve):
            bdy = {
                'ids': ids,
                'parameters': {
                    'fields': ['article_dc_title'],
                    'term_statistics': False,
                    'field_statistics': False,
                    'offsets': False,
                    'payloads': False,
                    'positions': False
                }
            }

            t_vectors = _es().mtermvectors(index=settings.ES_INDEX,
                                           doc_type=settings.ES_DOCTYPE,
                                           body=bdy)

            for doc in t_vectors.get('docs'):
                for field, data in doc.get('term_vectors').iteritems():
                    for term, details in data.get('terms').iteritems():
                        t = term.encode('ascii', 'replace')
                        # Skip overly long tokens and deduplicate on the term
                        # string itself; unsaved model instances do not
                        # compare equal by value, so a set of them would
                        # not remove duplicates.
                        if len(t) <= 26:
                            terms.add(t)

        # save to database
        print 'Saving {} terms to the database.'.format(len(terms))

        QueryTerm.objects.bulk_create([QueryTerm(t) for t in terms])
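utils.chunks is not shown in these examples. Assuming it simply splits a list into fixed-size batches, which is how it is used above, an equivalent helper would be:

def chunks(lst, n):
    # Yield successive slices of lst containing at most n items each.
    for i in range(0, len(lst), n):
        yield lst[i:i + n]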
Example #3
    def handle(self, *args, **options):
        query_size = 2500
        n_repetitions = 10

        if len(args) > 0:
            query_size = int(args[0])
        if len(args) > 1:
            n_repetitions = int(args[1])

        response_times = []

        for repetition in range(n_repetitions):
            c1 = time.time()
            es_time = []

            # select random documents
            document_set = DocID.objects.order_by('?')[0:query_size]
            doc_ids = [doc.doc_id for doc in document_set]

            if len(doc_ids) == 0:
                print 'No document ids found.\nPlease run the gatherdocids ' \
                      'command first.\n\n'
                return

            query = {
                "query": {
                    "filtered": {
                        "filter": {
                            "ids": {
                                "values": doc_ids
                            }
                        }
                    }
                },
                "aggs": {
                    "words": {
                        "terms": {
                            "field": "tags",
                            "size": 100
                        }
                    }
                },
                "size": 0
            }

            c3 = time.time()
            result = _es().search(index='kb', doc_type='doc', body=query)
            wordcloud = result.get('aggregations').get('words').get('buckets')
            c4 = time.time()

            c2 = time.time()

            elapsed_c = (c2 - c1) * 1000
            response_times.append(elapsed_c)
            es_time.append((c4 - c3) * 1000)

            self.stdout.write(
                str(elapsed_c) + ' ES: ' + str(sum(es_time)) + ' #results: ' +
                str(len(wordcloud)))
            self.stdout.flush()

        avg = sum(response_times) / len(response_times)
        print 'Average response time for generating word clouds from {num} ' \
              'documents: {avg} milliseconds'.format(num=query_size, avg=avg)
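The filtered query above is Elasticsearch 1.x syntax; it was deprecated in 2.0 and removed in 5.0. On newer clusters, the same ids filter plus terms aggregation would be written with a bool query, roughly:

query = {
    'query': {
        'bool': {
            'filter': {
                'ids': {'values': doc_ids}
            }
        }
    },
    'aggs': {
        'words': {
            'terms': {'field': 'tags', 'size': 100}
        }
    },
    'size': 0
}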
Example #4
    def handle(self, *args, **options):
        query_size = 2500
        n_repetitions = 10
        es_retrieve = 2500

        if len(args) > 0:
            query_size = int(args[0])
        if len(args) > 1:
            n_repetitions = int(args[1])
        if len(args) > 2:
            es_retrieve = int(args[2])

        response_times = []

        for repetition in range(n_repetitions):
            c1 = time.time()
            es_time = []

            wordcloud = Counter()

            # select random documents
            document_set = DocID.objects.order_by('?')[0:query_size]
            doc_ids = [doc.doc_id for doc in document_set]

            for ids in utils.chunks(doc_ids, es_retrieve):

                bdy = {
                    'ids': ids,
                    'parameters': {
                        'fields': ['article_dc_title', 'text_content'],
                        'term_statistics': False,
                        'field_statistics': False,
                        'offsets': False,
                        'payloads': False,
                        'positions': False
                    }
                }

                c3 = time.time()
                t_vectors = _es().mtermvectors(index='kb', doc_type='doc',
                                               body=bdy)
                c4 = time.time()

                es_time.append((c4 - c3) * 1000)

                for doc in t_vectors.get('docs'):
                    for field, data in doc.get('term_vectors').iteritems():
                        temp = {}
                        for term, details in data.get('terms').iteritems():
                            temp[term] = int(details['term_freq'])
                        wordcloud.update(temp)

            c2 = time.time()

            elapsed_c = (c2 - c1) * 1000
            response_times.append(elapsed_c)
            self.stdout.write(str(elapsed_c) + ' ES: ' + str(sum(es_time)))
            self.stdout.flush()

        avg = sum(response_times) / len(response_times)
        print 'Average response time for generating word clouds from {num} ' \
              'documents: {avg} milliseconds'.format(num=query_size, avg=avg)
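The Counter accumulates term frequencies across all batches; turning it into an actual word cloud usually means taking the top entries with most_common(). A self-contained illustration with made-up terms:

from collections import Counter

wordcloud = Counter()
wordcloud.update({'amsterdam': 3, 'krant': 5})  # first batch
wordcloud.update({'amsterdam': 1})              # later batch: counts add up

print wordcloud.most_common(2)  # [('krant', 5), ('amsterdam', 4)]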