Example #1
    def _index_img(self, img):
        """Index a single img and ensure that it's been propagated to the search engine"""
        image = search.db_image_to_index(img)
        image.save()
        index = Index(name=settings.ELASTICSEARCH_INDEX)
        index.flush(force=True)
        index.refresh()
Example #2
    def test_retrieve(self):
        """It should be possible to retrieve a database item by auto-id"""
        image = search.db_image_to_index(self.img1)
        image.save()
        id_ = image.meta.id
        image2 = search.Image.get(id=id_)
        assert image2.meta.id == id_
Example #3
def insert_image(chunk_size, max_results=5000, from_file=None):
    count = 0
    success_count = 0
    es = search.init()
    search.Image.init()
    mapping = search.Image._doc_type.mapping
    mapping.save(settings.ELASTICSEARCH_INDEX)

    for chunk in grouper_it(chunk_size, import_from_file(from_file)):
        if not from_file and count >= max_results:  # When loading from a file, ignore max_results and load everything
            break
        else:
            images = []
            for result in chunk:
                images.append(result)
            if len(images) > 0:
                try:
                    # Bulk update the search engine too
                    search_objs = [
                        search.db_image_to_index(img).to_dict(
                            include_meta=True) for img in images
                    ]
                    models.Image.objects.bulk_create(images)
                    helpers.bulk(es, search_objs)
                    log.debug("*** Committed set of %d images", len(images))
                    success_count += len(images)
                except IntegrityError as e:
                    log.warn("Got one or more integrity errors on batch: %s",
                             e)
                finally:
                    count += len(images)
    return success_count
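A hedged sketch of how this loader might be driven from a Django management command; the command class, option names, and output message are assumptions for illustration and are not taken from the example above.

# Hedged sketch: invoking insert_image() from a Django management command.
# The option names and help text are assumptions for illustration only.
from django.core.management.base import BaseCommand


class Command(BaseCommand):
    help = "Bulk-load images into the database and the search index"

    def add_arguments(self, parser):
        parser.add_argument("--chunk-size", type=int, default=1000)
        parser.add_argument("--max-results", type=int, default=5000)
        parser.add_argument("--from-file", type=str, default=None)

    def handle(self, *args, **options):
        loaded = insert_image(options["chunk_size"],
                              max_results=options["max_results"],
                              from_file=options["from_file"])
        self.stdout.write("Successfully indexed %d images" % loaded)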
Example #4
def _update_search_index(img):
    # FIXME This may result in a lot of concurrent requests during batch updates;
    # in those cases consider unregistering this signal and manually batching requests
    # (note that Django's bulk_create will not fire this signal, which is good)
    search_obj = search.db_image_to_index(img)
    if search_obj.removed_from_source:
        log.debug("Removing image %s from search index", img.identifier)
        search_obj.delete(ignore=404)
    else:
        log.debug("Indexing image %s", img.identifier)
        search_obj.save()
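The FIXME above refers to a Django signal driving this helper. A minimal sketch of how such a receiver could be registered, assuming models.Image is the sender; the receiver name is illustrative.

# Hedged sketch: registering _update_search_index() as a post_save receiver.
# bulk_create() does not fire post_save, which is why the batch importers
# index records explicitly. The receiver name is an assumption.
from django.db.models.signals import post_save
from django.dispatch import receiver


@receiver(post_save, sender=models.Image)
def handle_image_saved(sender, instance, **kwargs):
    _update_search_index(instance)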
Example #5
def do_index(start, chunk_size):
    end = start + chunk_size + 1
    batches = []
    retries = 0
    try:
        es = search.init(timeout=2000)
        if not settings.DEBUG:
            es.cluster.health(wait_for_status='green', request_timeout=2000)

    except (requests.exceptions.ReadTimeout,
            elasticsearch.exceptions.TransportError) as e:
        log.warn(e)
        log.warn("Skipping batch and retrying after wait")
        time.sleep(RETRY_WAIT)
        return

    log.debug("Starting index in range from %d to %d...", start, end)

    qs = models.Image.objects.filter(removed_from_source=False,
                                     id__gt=start).order_by('id')[0:chunk_size]

    for db_image in server_cursor_query(qs, chunk_size=chunk_size):
        log.debug("Indexing database record %s", db_image.identifier)
        image = search.db_image_to_index(db_image)
        try:
            # Queue the record, then flush once a full batch has accumulated
            batches.append(image.to_dict(include_meta=True))
            if len(batches) >= chunk_size:
                if not settings.DEBUG:
                    log.debug("Waiting for green status...")
                    es.cluster.health(wait_for_status='green',
                                      request_timeout=2000)
                helpers.bulk(es, batches)
                log.debug("Pushed batch of %d records to ES", len(batches))
                batches = []  # Clear the batch
        except (requests.exceptions.ReadTimeout,
                elasticsearch.exceptions.TransportError,
                elasticsearch.helpers.BulkIndexError) as e:
            if retries < MAX_CONNECTION_RETRIES:
                log.warn("Got timeout: retrying with %d retries remaining",
                         MAX_CONNECTION_RETRIES - retries)
                retries += 1
                time.sleep(RETRY_WAIT)
            else:
                raise
    helpers.bulk(es, batches)
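A hedged sketch of a driver loop that could call do_index over the whole table in fixed-size ID windows; the Max aggregation used for the upper bound, and the assumption of roughly contiguous IDs, are illustrative rather than taken from the example.

# Hedged sketch: walking the Image table in fixed-size ID windows and handing
# each window to do_index(). Assumes roughly contiguous primary keys; the Max
# aggregation for the upper bound is an illustrative choice.
from django.db.models import Max


def index_all_images(chunk_size=1000):
    max_id = models.Image.objects.aggregate(Max('id'))['id__max'] or 0
    start = 0
    while start < max_id:
        do_index(start, chunk_size)
        start += chunk_size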
Example #6
def insert_image(walk_func,
                 serialize_func,
                 chunk_size,
                 max_results=5000,
                 **kwargs):
    count = 0
    success_count = 0
    es = search.init()
    search.Image.init()
    mapping = search.Image._doc_type.mapping
    mapping.save(settings.ELASTICSEARCH_INDEX)

    for chunk in grouper_it(chunk_size, walk_func(**kwargs)):
        if max_results is not None and count >= max_results:
            break
        else:
            images = []
            for result in chunk:
                image = serialize_func(result)
                if image:
                    images.append(image)
            if len(images) > 0:
                try:
                    # Bulk update the search engine too
                    if not settings.DEBUG:
                        es.cluster.health(wait_for_status='green',
                                          request_timeout=2000)
                    search_objs = [
                        search.db_image_to_index(img).to_dict(
                            include_meta=True) for img in images
                    ]
                    elasticsearch.helpers.bulk(es, search_objs)
                    models.Image.objects.bulk_create(images)
                    log.debug("*** Committed set of %d images", len(images))
                    success_count += len(images)
                except (requests.exceptions.ReadTimeout,
                        elasticsearch.exceptions.TransportError,
                        elasticsearch.helpers.BulkIndexError,
                        IntegrityError) as e:
                    log.warn("Got one or more integrity errors on batch: %s",
                             e)
                finally:
                    count += len(images)
    return success_count
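A hedged sketch of the walk_func/serialize_func pair this generic loader expects: walk_func yields raw records for grouper_it to chunk, and serialize_func maps each record to an unsaved models.Image (or None to skip it). The CSV layout and field names below are assumptions.

# Hedged sketch of a walker/serializer pair for the loader above. The CSV
# columns and the Image field names are assumptions; only the interface shape
# (an iterable of raw rows plus a row -> models.Image mapper) comes from the
# example code.
import csv


def walk_csv(path, **kwargs):
    """Yield one raw row dict per record in the source file."""
    with open(path, newline='') as fh:
        yield from csv.DictReader(fh)


def serialize_csv_row(row):
    """Return an unsaved models.Image for a row, or None to skip it."""
    if not row.get('url'):
        return None
    return models.Image(identifier=row.get('identifier'),
                        url=row['url'],
                        title=row.get('title'))


# Illustrative call:
# insert_image(walk_csv, serialize_csv_row, chunk_size=1000,
#              max_results=None, path='images.csv')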
Example #7
    def test_store(self):
        """It should be possible to index a database item"""
        image = search.db_image_to_index(self.img1)
        image.save()
Example #8
    def _index_img(self, img):
        """Index a single img and ensure that it's been propagated to the search engine"""
        image = search.db_image_to_index(img)
        image.save()
        self.es.indices.refresh(force=True)