def delete(self, endpoint_name, lookup):
    """Delete documents matching a mongo query.

    :param endpoint_name: name of the endpoint
    :param lookup: mongo query syntax, e.g. ``{'_id': 123}`` or ``{'item_id': {'$in': [123, 234]}}``
    :returns: the mongo remove command response, e.g. ``{'n': 12, 'ok': 1}``
    """
    backend = self._backend(endpoint_name)
    search_backend = self._lookup_backend(endpoint_name)
    docs = self.get_from_mongo(endpoint_name, lookup=lookup, req=ParsedRequest())
    ids = [doc[config.ID_FIELD] for doc in docs]
    removed_ids = ids
    logger.info("total documents to be removed {}".format(len(ids)))
    if search_backend and ids:
        removed_ids = []
        # remove from the search backend first so deleted items stop showing up;
        # once that is done, remove them from mongo
        for _id in ids:
            try:
                self.remove_from_search(endpoint_name, _id)
                removed_ids.append(_id)
            except NotFoundError:
                logger.warning('item missing from elastic _id=%s' % (_id, ))
                removed_ids.append(_id)
            except Exception:
                logger.exception('item can not be removed from elastic _id=%s' % (_id, ))
    backend.remove(endpoint_name, {config.ID_FIELD: {'$in': removed_ids}})
    logger.info("Removed {} documents from {}.".format(len(ids), endpoint_name))
    if not ids:
        logger.warning("No documents for {} resource were deleted using lookup {}".format(endpoint_name, lookup))
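# Standalone sketch (added for illustration, not from the original source): the two
# lookup shapes accepted by delete() above are plain dicts in mongo query syntax.
# The 'item_id' field name and the ids are illustrative only.
lookup_single = {'_id': 123}                     # match one document by id
lookup_many = {'item_id': {'$in': [123, 234]}}   # match any document whose item_id is in the list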
def delete_docs(self, endpoint_name, docs):
    """Delete using a list of documents."""
    backend = self._backend(endpoint_name)
    search_backend = self._lookup_backend(endpoint_name)
    ids = [doc[config.ID_FIELD] for doc in docs]
    removed_ids = ids
    logger.info("total documents to be removed {}".format(len(ids)))
    if search_backend and ids:
        removed_ids = []
        # remove from the search backend first so deleted items stop showing up;
        # once that is done, remove them from mongo
        for doc in docs:
            try:
                self.remove_from_search(endpoint_name, doc)
                removed_ids.append(doc[config.ID_FIELD])
            except NotFoundError:
                logger.warning('item missing from elastic _id=%s' % (doc[config.ID_FIELD], ))
                removed_ids.append(doc[config.ID_FIELD])
            except Exception:
                logger.exception('item can not be removed from elastic _id=%s' % (doc[config.ID_FIELD], ))
    if removed_ids:
        backend.remove(endpoint_name, {config.ID_FIELD: {'$in': removed_ids}})
        logger.info("Removed %d documents from %s.", len(removed_ids), endpoint_name)
    else:
        logger.warning("No documents for %s resource were deleted.", endpoint_name)
    return removed_ids
def remove_expired_from_elastic(ingest_collection):
    """Remove expired items from elastic which shouldn't be there anymore - expired before the previous run."""
    ingest = superdesk.get_resource_service(ingest_collection)
    # elastic date math: 'now-5m/m' means expired more than 5 minutes ago, rounded down to the minute
    items = ingest.search({'filter': {'range': {'expiry': {'lt': 'now-5m/m'}}}})
    if items.count():
        logger.warning('there are expired items in elastic (%d)' % (items.count(), ))
        for item in items:
            logger.debug('doc only in elastic item=%s' % (item, ))
            ingest.remove_from_search(item)
def remove_expired_from_elastic(ingest_collection):
    """Remove expired items from elastic which shouldn't be there anymore - expired before previous run."""
    ingest = superdesk.get_resource_service(ingest_collection)
    items = ingest.search({'filter': {'range': {'expiry': {'lt': 'now-5m/m'}}}})
    if items.count():
        logger.warning('there are expired items in elastic (%d)' % (items.count(), ))
        for item in items:
            logger.debug('doc only in elastic item=%s' % (item, ))
            ingest.remove_from_search(item.get('_id'))
def _try_get_lock(self, key, owner, expire):
    """Log warning in case lock is gained after expiry.

    This should not happen in general, locks should be released.
    Consider increasing lock time.
    """
    lock_info = self.get_lock_info(key)
    locked = super()._try_get_lock(key, owner, expire)
    if locked and lock_info and lock_info['locked']:
        logger.warning('Lock %s expired', key)
    return locked
def run(self, max_days=3, item_id=None, chunk_size=1000):
    now_utc = utcnow()

    # If we're generating stats for a single item, then don't set max_days,
    # as we want to process all history records for the provided item
    if item_id is not None:
        max_days = 0

    try:
        max_days = float(max_days)
    except (ValueError, TypeError):
        max_days = 3
    gte = None if max_days <= 0.0 else utcnow() - timedelta(days=max_days)

    try:
        chunk_size = int(chunk_size)
    except (ValueError, TypeError):
        chunk_size = 1000
    chunk_size = None if chunk_size <= 0 else chunk_size

    logger.info(
        'Starting to generate archive statistics: {}. gte={}. item_id={}. chunk_size={}'
        .format(now_utc, gte, item_id, chunk_size))

    lock_name = get_lock_id('analytics', 'gen_archive_statistics')
    if not lock(lock_name, expire=610):
        logger.info('Generate archive statistics task is already running.')
        return

    items_processed = 0
    failed_ids = []
    num_history_items = 0

    try:
        items_processed, failed_ids, num_history_items = self.generate_stats(item_id, gte, chunk_size)
    except Exception:
        logger.exception('Failed to generate archive stats')
    finally:
        unlock(lock_name)

    if len(failed_ids) > 0:
        logger.warning('Failed to generate stats for items {}'.format(', '.join(failed_ids)))

    duration = (utcnow() - now_utc).total_seconds()
    logger.info(
        'Finished generating stats for {} items ({} history entries). Duration: {} seconds'
        .format(items_processed, num_history_items, int(duration)))
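# Standalone sketch (added for illustration, not from the original source): the gte
# cutoff above supports fractional max_days because datetime.timedelta accepts float
# days; stdlib datetime is used here in place of superdesk's utcnow() helper.
from datetime import datetime, timedelta, timezone

max_days = 0.5  # half a day
gte = None if max_days <= 0.0 else datetime.now(timezone.utc) - timedelta(days=max_days)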
def _change_request(self, endpoint_name, id, updates, original):
    backend = self._backend(endpoint_name)
    search_backend = self._lookup_backend(endpoint_name)

    try:
        backend.update(endpoint_name, id, updates, original)
        push_notification("resource:updated", _id=str(id), resource=endpoint_name,
                          fields=get_diff_keys(updates, original))
    except eve.io.base.DataLayer.OriginalChangedError:
        if not backend.find_one(endpoint_name, req=None, _id=id) and search_backend:
            # item is in elastic, not in mongo - not good
            logger.warning("Item is missing in mongo resource={} id={}".format(endpoint_name, id))
            item = search_backend.find_one(endpoint_name, req=None, _id=id)
            if item:
                self.remove_from_search(endpoint_name, item)
            raise SuperdeskApiError.notFoundError()
        else:
            # item is there, but no change was done - ok
            logger.warning(
                "Item was not updated in mongo.",
                extra=dict(
                    id=id,
                    resource=endpoint_name,
                    updates=updates,
                ),
            )
            return updates

    if search_backend:
        doc = backend.find_one(endpoint_name, req=None, _id=id)
        if not doc:
            # there is no doc in mongo, remove it from elastic
            logger.warning("Item is missing in mongo resource={} id={}".format(endpoint_name, id))
            item = search_backend.find_one(endpoint_name, req=None, _id=id)
            if item:
                self.remove_from_search(endpoint_name, item)
            raise SuperdeskApiError.notFoundError()
        search_backend.update(endpoint_name, id, doc)

    return updates
def remove_expired_from_elastic(ingest_collection):
    """Remove expired items from elastic which shouldn't be there anymore - expired before previous run."""
    ingest = superdesk.get_resource_service(ingest_collection)
    items = ingest.search({"filter": {"range": {"expiry": {"lt": "now-5m/m"}}}})
    if items.count():
        logger.warning("there are expired items in elastic (%d)" % (items.count(), ))
        for item in items:
            logger.debug("doc only in elastic item=%s" % (item, ))
            ingest.remove_from_search(item)
def transtype_metadata(doc, original=None):
    """Change the type of metadata coming from the client to match the type expected in the database.

    Some metadata (e.g. custom fields) are sent as plain text while another type is
    expected in the database (e.g. datetime). This method checks those metadata and
    updates them.

    :param doc: document to be transtyped (will be modified in place)
    :param original: original document in case of update
    """
    # For now only fields of type "date" in the "extra" dict are handled.
    extra = doc.get("extra")
    if not extra:
        return

    if original is None:
        original = {}

    try:
        profile_id = doc.get("profile") or original["profile"]
    except KeyError:
        # profile may be missing with some items in tests
        logger.warning("`profile` is not available in doc")
        return

    ctypes_service = get_resource_service("content_types")
    profile = ctypes_service.find_one(None, _id=profile_id)
    if profile is None:
        return

    for key, value in extra.items():
        try:
            value_type = profile["schema"][key]["type"]
        except KeyError:
            continue
        if value_type == "date":
            if value and not isinstance(value, datetime):
                try:
                    extra[key] = date_parse(value)
                except Exception as e:
                    logger.warning("Can't parse {key}: {reason}".format(key=key, reason=e))
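# Standalone sketch (added for illustration, not from the original source): the date
# transtyping above boils down to parsing a string into a datetime, with date_parse
# assumed to be dateutil.parser.parse. The field name and value are illustrative.
from datetime import datetime
from dateutil.parser import parse as date_parse

extra = {"publish_date": "2023-05-01T10:00:00+0000"}
for key, value in extra.items():
    if value and not isinstance(value, datetime):
        extra[key] = date_parse(value)  # becomes a timezone-aware datetime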
def process_timelines(self, items, failed_ids):
    statistics_service = get_resource_service('archive_statistics')
    items_to_create = []
    rewrites = []

    for item_id, item in items.items():
        try:
            self.gen_stats_from_timeline(item)
        except Exception:
            logger.exception('Failed to generate stats for item {}'.format(item_id))
            failed_ids.append(item_id)
            continue

        if item['updates'].get('rewrite_of') and \
                (item['updates'].get('time_to_first_publish') or 0) > 0:
            rewrites.append(item_id)

        if not item['item'].get(config.ID_FIELD):
            item['updates'][config.ID_FIELD] = item_id
            item['updates']['stats_type'] = 'archive'
            items_to_create.append(item['updates'])
        else:
            try:
                statistics_service.patch(item_id, item['updates'])
            except Exception:
                logger.exception('Failed to update stats for item {}. updates={}'.format(
                    item_id, item.get('updates')))
                failed_ids.append(item_id)

    if len(items_to_create) > 0:
        try:
            statistics_service.post(items_to_create)
        except Exception:
            item_ids = [item.get(config.ID_FIELD) for item in items_to_create]
            logger.exception('Failed to create stat entries for items {}'.format(', '.join(item_ids)))
            failed_ids.extend(item_ids)

    for item_id in rewrites:
        item = items[item_id]

        updated_at = item['updates'].get('firstpublished')
        if not updated_at:
            logger.warning('Failed {}, updated_at not defined'.format(item_id))
            continue

        original_id = item['updates'].get('rewrite_of')
        if not original_id:
            logger.warning('Failed {}, original_id not defined'.format(item_id))
            continue

        original = statistics_service.find_one(req=None, _id=original_id)
        if not original:
            logger.warning('Failed {}, original not found'.format(item_id))
            continue

        published_at = original.get('firstpublished')
        if not published_at:
            logger.warning('Failed {}, published_at not defined'.format(original_id))
            continue

        statistics_service.patch(
            original_id,
            {'time_to_next_update_publish': (updated_at - published_at).total_seconds()}
        )
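# Standalone sketch (added for illustration, not from the original source): the
# time_to_next_update_publish value computed above is simply the gap between two
# datetimes expressed in seconds; the timestamps here are made up.
from datetime import datetime, timedelta

published_at = datetime(2023, 5, 1, 10, 0)
updated_at = published_at + timedelta(hours=2)
time_to_next_update_publish = (updated_at - published_at).total_seconds()  # 7200.0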