def sync(self, limit):
    """
    Synchronize a batch of annotations from Postgres to Elasticsearch.

    Called periodically by a Celery task (see h-periodic).

    Each time this method runs it considers a fixed number of sync
    annotation jobs from the queue and for each job:

    * If the annotation is already the same in Elastic as in the DB then
      remove the job from the queue

    * If the annotation is missing from Elastic or different in Elastic
      than in the DB then re-sync the annotation into Elastic. Leave the
      job on the queue to be re-checked and removed the next time the
      method runs.

    :param limit: maximum number of jobs to consider from the queue
    :return: dict mapping each result-counter name to the number of
        distinct annotation IDs (for SYNCED_* counters) or job IDs (for
        COMPLETED_* counters) recorded under it; empty dict if the queue
        yielded no jobs
    """
    jobs = self._get_jobs_from_queue(limit)

    if not jobs:
        return {}

    # Sets (not plain ints) so that the same annotation/job hit via
    # multiple jobs is only counted once per counter.
    counts = defaultdict(set)

    # "force" jobs skip the DB/ES comparison entirely, so only fetch
    # annotations for the non-forced jobs.
    annotation_ids = {
        URLSafeUUID.hex_to_url_safe(job.kwargs["annotation_id"])
        for job in jobs
        if not job.kwargs.get("force", False)
    }
    if annotation_ids:
        annotations_from_db = self._get_annotations_from_db(annotation_ids)
        annotations_from_es = self._get_annotations_from_es(annotation_ids)
    else:
        annotations_from_db = {}
        annotations_from_es = {}

    # Completed jobs that can be removed from the queue.
    job_complete = []

    # IDs of annotations to (re-)add to Elasticsearch because they're
    # either missing from Elasticsearch or are different in Elasticsearch
    # than in the DB.
    annotation_ids_to_sync = set()

    for job in jobs:
        annotation_id = URLSafeUUID.hex_to_url_safe(job.kwargs["annotation_id"])
        annotation_from_db = annotations_from_db.get(annotation_id)
        annotation_from_es = annotations_from_es.get(annotation_id)

        if job.kwargs.get("force", False):
            # Forced jobs are unconditionally re-synced AND completed.
            annotation_ids_to_sync.add(annotation_id)
            job_complete.append(job)
            self._count_synced(
                counts, job, annotation_id, Queue.Result.SYNCED_FORCED
            )
            self._count_completed(counts, job, Queue.Result.COMPLETED_FORCED)
        elif not annotation_from_db:
            # Annotation is gone from the DB: nothing to sync, job done.
            job_complete.append(job)
            self._count_completed(counts, job, Queue.Result.COMPLETED_DELETED)
        elif not annotation_from_es:
            annotation_ids_to_sync.add(annotation_id)
            self._count_synced(
                counts, job, annotation_id, Queue.Result.SYNCED_MISSING
            )
        elif not self._equal(annotation_from_es, annotation_from_db):
            annotation_ids_to_sync.add(annotation_id)
            self._count_synced(
                counts, job, annotation_id, Queue.Result.SYNCED_DIFFERENT
            )
        else:
            # ES already matches the DB: job done.
            job_complete.append(job)
            self._count_completed(counts, job, Queue.Result.COMPLETED_UP_TO_DATE)

    for job in job_complete:
        self._db.delete(job)

    if annotation_ids_to_sync:
        self._batch_indexer.index(list(annotation_ids_to_sync))

    return {key: len(value) for key, value in counts.items()}

@staticmethod
def _count_synced(counts, job, annotation_id, result):
    """Record `annotation_id` as (re-)synced under `result` plus the per-tag and global sync totals."""
    counts[result.format(tag=job.tag)].add(annotation_id)
    counts[Queue.Result.SYNCED_TAG_TOTAL.format(tag=job.tag)].add(annotation_id)
    counts[Queue.Result.SYNCED_TOTAL].add(annotation_id)

@staticmethod
def _count_completed(counts, job, result):
    """Record `job` as completed under `result` plus the per-tag and global completion totals."""
    counts[result.format(tag=job.tag)].add(job.id)
    counts[Queue.Result.COMPLETED_TAG_TOTAL.format(tag=job.tag)].add(job.id)
    counts[Queue.Result.COMPLETED_TOTAL].add(job.id)
def database_id(self, annotation):
    """Return `annotation.id` in the internal format used within the database."""
    hex_id = URLSafeUUID.url_safe_to_hex(annotation.id)
    return str(uuid.UUID(hex_id))
def url_safe_id(self, job):
    """Return the URL-safe version of the given job's annotation ID."""
    hex_annotation_id = job.kwargs["annotation_id"]
    return URLSafeUUID.hex_to_url_safe(hex_annotation_id)
def sync(self, limit):
    """
    Synchronize a batch of annotations from Postgres to Elasticsearch.

    Called periodically by a Celery task (see h-periodic).

    Each time this method runs it considers a fixed number of sync
    annotation jobs from the queue and for each job:

    * If the annotation is already the same in Elastic as in the DB then
      remove the job from the queue

    * If the annotation is missing from Elastic or different in Elastic
      than in the DB then re-sync the annotation into Elastic. Leave the
      job on the queue to be re-checked and removed the next time the
      method runs.
    """
    # NOTE(review): a method with the same name also appears earlier in
    # this file — confirm which class each belongs to.
    jobs = self._get_jobs_from_queue(limit)
    if not jobs:
        return

    # Only non-forced jobs need their annotations fetched for comparison.
    annotation_ids = {
        URLSafeUUID.hex_to_url_safe(job.kwargs["annotation_id"])
        for job in jobs
        if not job.kwargs.get("force", False)
    }
    annotations_from_db = {}
    annotations_from_es = {}
    if annotation_ids:
        annotations_from_db = self._get_annotations_from_db(annotation_ids)
        annotations_from_es = self._get_annotations_from_es(annotation_ids)

    # Jobs that are finished and can be deleted from the queue.
    completed_jobs = []
    # Annotation IDs to (re-)index into Elasticsearch because they're
    # missing there or stale compared to the DB.
    ids_to_sync = set()
    counts = Counter()

    for job in jobs:
        annotation_id = URLSafeUUID.hex_to_url_safe(job.kwargs["annotation_id"])
        db_annotation = annotations_from_db.get(annotation_id)
        es_annotation = annotations_from_es.get(annotation_id)

        if job.kwargs.get("force", False):
            # Forced jobs re-sync unconditionally and are complete.
            ids_to_sync.add(annotation_id)
            completed_jobs.append(job)
            counts[Queue.Result.FORCED] += 1
        elif not db_annotation:
            # Annotation no longer exists in the DB: nothing to sync.
            completed_jobs.append(job)
            counts[Queue.Result.DELETED_FROM_DB] += 1
        elif not es_annotation:
            ids_to_sync.add(annotation_id)
            counts[Queue.Result.MISSING] += 1
        elif es_annotation["updated"] != db_annotation.updated:
            ids_to_sync.add(annotation_id)
            counts[Queue.Result.OUT_OF_DATE] += 1
        else:
            # ES already matches the DB: job is done.
            completed_jobs.append(job)
            counts[Queue.Result.UP_TO_DATE] += 1

    for completed_job in completed_jobs:
        self._db.delete(completed_job)

    if ids_to_sync:
        self._batch_indexer.index(list(ids_to_sync))

    LOG.info(dict(counts))