def test_transaction_objects_mapped_for_all_models(db, default_namespace):
    """
    Test that all subclasses of HasRevisions are mapped by the
    transaction_objects() function.

    """
    assert set(HasRevisions.__subclasses__()).issubset(
        transaction_objects().values())
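# For context, transaction_objects() is assumed to map API object-type names
# (the strings stored in Transaction.object_type) to their model classes. A
# minimal, hypothetical sketch of that shape -- not the actual
# implementation:
def transaction_objects_sketch():
    from inbox.models import Message, Thread  # assumed import path
    return {
        'message': Message,
        'thread': Thread,
        # ... one entry per HasRevisions subclass
    }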
def _run(self):
    """
    Index the threads and messages of all namespaces into Elasticsearch.

    """
    # Indexing is namespace-agnostic. Note that although this means we do
    # not restrict the Transaction table query (via the
    # format_transactions_after_pointer() call below) to a namespace, we
    # pass a `result_limit` (== chunk_size) argument, so the query should
    # still be performant.
    namespace_id = None

    # Only index messages, threads.
    object_types = transaction_objects()
    exclude_types = [api_name for model_name, api_name in
                     object_types.iteritems() if model_name not in
                     ['message', 'thread']]

    with session_scope() as db_session:
        pointer = db_session.query(SearchIndexCursor).first()
        self.transaction_pointer = pointer.transaction_id if pointer else 0

    self.log.info('Starting search-index service',
                  transaction_pointer=self.transaction_pointer)

    while True:
        with session_scope() as db_session:
            deltas, new_pointer = format_transactions_after_pointer(
                namespace_id, self.transaction_pointer, db_session,
                self.chunk_size, _format_transaction_for_search,
                exclude_types)

        # TODO[k]: We ideally want to index chunk_size at a time.
        # This currently indexes <= chunk_size, and it varies each time.
        if new_pointer is not None and \
                new_pointer != self.transaction_pointer:
            self.index(deltas)
            self.update_pointer(new_pointer)
        else:
            sleep(self.poll_interval)
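# `update_pointer` is used above but not shown here. A minimal sketch of how
# it might persist the cursor, assuming SearchIndexCursor holds a single row
# with a `transaction_id` column (an illustration, not the actual
# implementation):
def update_pointer(self, new_pointer):
    with session_scope() as db_session:
        cursor = db_session.query(SearchIndexCursor).first()
        if cursor is None:
            cursor = SearchIndexCursor()
            db_session.add(cursor)
        cursor.transaction_id = new_pointer
        db_session.commit()
    self.transaction_pointer = new_pointer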
def index(self, transactions, db_session):
    """
    Translate database operations to Elasticsearch index operations
    and perform them.

    """
    namespace_map = defaultdict(lambda: defaultdict(list))

    for trx in transactions:
        namespace_id = trx.namespace.public_id
        type_ = trx.object_type
        if trx.command == 'delete':
            operation = 'delete'
            api_repr = {'id': trx.object_public_id}
        else:
            operation = 'index'
            object_cls = transaction_objects()[trx.object_type]
            obj = db_session.query(object_cls).get(trx.record_id)
            if obj is None:
                continue
            api_repr = encode(obj, namespace_public_id=namespace_id)

        namespace_map[namespace_id][type_].append((operation, api_repr))

    self.log.info('namespaces to index count', count=len(namespace_map))

    for namespace_id in namespace_map:
        engine = NamespaceSearchEngine(namespace_id, create_index=True)

        messages = namespace_map[namespace_id]['message']
        message_count = engine.messages.bulk_index(messages) if messages \
            else 0

        threads = namespace_map[namespace_id]['thread']
        thread_count = engine.threads.bulk_index(threads) if threads \
            else 0

        self.log.info('per-namespace index counts',
                      namespace_id=namespace_id,
                      message_count=message_count,
                      thread_count=thread_count)
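# For illustration, each per-namespace list handed to bulk_index() above is a
# list of (operation, document) pairs. A hypothetical message batch (field
# contents depend entirely on encode()):
example_message_batch = [
    ('index', {'id': 'msg-public-id-1', 'object': 'message',
               'subject': 'Hello'}),
    ('delete', {'id': 'msg-public-id-2'}),
]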
def format_transactions_after_pointer(namespace, pointer, db_session,
                                      result_limit, exclude_types=None,
                                      include_types=None,
                                      exclude_folders=True, expand=False):
    """
    Return a pair (deltas, new_pointer), where deltas is a list of change
    events, represented as dictionaries:
    {
      "object": <API object type, e.g. "thread">,
      "event": <"create", "modify", or "delete">,
      "attributes": <API representation of the object for insert/update
                     events>,
      "cursor": <public_id of the transaction>
    }

    and new_pointer is the integer id of the last included transaction.

    Arguments
    ---------
    namespace: Namespace
        The namespace for which to get changes.
    pointer: int
        Process transactions starting after this id.
    db_session: new_session
        database session
    result_limit: int
        Maximum number of results to return. (Because we may roll up
        multiple changes to the same object, fewer results can be
        returned.)
    exclude_types: list, optional
        If given, don't include transactions for these types of objects.

    """
    exclude_types = set(exclude_types) if exclude_types else set()
    # Begin backwards-compatibility shim -- suppress new object types for
    # now, because clients may not be able to deal with them.
    exclude_types.add('account')
    if exclude_folders is True:
        exclude_types.update(('folder', 'label'))
    # End backwards-compatibility shim.

    last_trx = _get_last_trx_id_for_namespace(namespace.id, db_session)
    if last_trx == pointer:
        return ([], pointer)

    while True:
        # deleted_at condition included to allow this query to be satisfied
        # via the legacy index on (namespace_id, deleted_at) for
        # performance. Also need to explicitly specify the index hint
        # because the query planner is dumb as nails and otherwise would
        # make this super slow for some values of namespace_id and pointer.
        # TODO(emfree): Remove this hack and ensure that the right index
        # (on namespace_id only) exists.
        transactions = db_session.query(Transaction). \
            filter(
                Transaction.id > pointer,
                Transaction.namespace_id == namespace.id,
                Transaction.deleted_at.is_(None)). \
            with_hint(Transaction, 'USE INDEX (namespace_id_deleted_at)')

        if exclude_types is not None:
            transactions = transactions.filter(
                ~Transaction.object_type.in_(exclude_types))

        if include_types is not None:
            transactions = transactions.filter(
                Transaction.object_type.in_(include_types))

        transactions = transactions. \
            order_by(asc(Transaction.id)).limit(result_limit).all()

        if not transactions:
            return ([], pointer)

        results = []

        # Group deltas by object type.
        trxs_by_obj_type = collections.defaultdict(list)
        for trx in transactions:
            trxs_by_obj_type[trx.object_type].append(trx)

        for obj_type, trxs in trxs_by_obj_type.items():
            # Build a dictionary mapping pairs (record_id, command) to
            # transaction. If successive modifies for a given record id
            # appear in the list of transactions, this will only keep the
            # latest one (which is what we want).
            latest_trxs = {(trx.record_id, trx.command): trx for trx in
                           sorted(trxs, key=lambda t: t.id)}.values()

            # Load all referenced not-deleted objects.
            ids_to_query = [trx.record_id for trx in latest_trxs
                            if trx.command != 'delete']

            object_cls = transaction_objects()[obj_type]
            query = db_session.query(object_cls).filter(
                object_cls.id.in_(ids_to_query),
                object_cls.namespace_id == namespace.id)

            if object_cls == Thread:
                query = query.options(*Thread.api_loading_options(expand))
            elif object_cls == Message:
                query = query.options(*Message.api_loading_options(expand))

            objects = {obj.id: obj for obj in query}

            for trx in latest_trxs:
                delta = {
                    'object': trx.object_type,
                    'event': EVENT_NAME_FOR_COMMAND[trx.command],
                    'id': trx.object_public_id,
                    'cursor': trx.public_id
                }
                if trx.command != 'delete':
                    obj = objects.get(trx.record_id)
                    if obj is None:
                        continue
                    repr_ = encode(obj,
                                   namespace_public_id=namespace.public_id,
                                   expand=expand)
                    delta['attributes'] = repr_

                results.append((trx.id, delta))

        if results:
            # Sort deltas by id of the underlying transactions.
            results.sort()
            deltas = [d for _, d in results]
            return (deltas, results[-1][0])
        else:
            # It's possible that none of the referenced objects exist any
            # more, meaning the result list is empty. In that case, keep
            # traversing the log until we get actual results or reach the
            # end.
            pointer = transactions[-1].id
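# A minimal usage sketch for the function above, assuming an open `db_session`
# and a `namespace` object: feed the returned pointer back in to page through
# the whole transaction log.
pointer = 0
while True:
    deltas, pointer = format_transactions_after_pointer(
        namespace, pointer, db_session, result_limit=100)
    if not deltas:
        break  # no new (non-excluded) transactions past `pointer`
    for delta in deltas:
        print('%s %s %s' % (delta['cursor'], delta['object'], delta['event']))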
def format_transactions_after_pointer(namespace, pointer, db_session,
                                      result_limit, exclude_types=None,
                                      include_types=None,
                                      exclude_folders=True,
                                      exclude_metadata=True,
                                      exclude_account=True, expand=False):
    """
    Return a pair (deltas, new_pointer), where deltas is a list of change
    events, represented as dictionaries:
    {
      "object": <API object type, e.g. "thread">,
      "event": <"create", "modify", or "delete">,
      "attributes": <API representation of the object for insert/update
                     events>,
      "cursor": <public_id of the transaction>
    }

    and new_pointer is the integer id of the last included transaction.

    Arguments
    ---------
    namespace: Namespace
        The namespace for which to get changes.
    pointer: int
        Process transactions starting after this id.
    db_session: new_session
        database session
    result_limit: int
        Maximum number of results to return. (Because we may roll up
        multiple changes to the same object, fewer results can be
        returned.)
    exclude_types: list, optional
        If given, don't include transactions for these types of objects.

    """
    exclude_types = set(exclude_types) if exclude_types else set()
    # Begin backwards-compatibility shim -- suppress new object types for
    # now, because clients may not be able to deal with them.
    if exclude_folders is True:
        exclude_types.update(('folder', 'label'))

    if exclude_account is True:
        exclude_types.add('account')
    # End backwards-compatibility shim.

    # Metadata is excluded by default, and can only be included by setting
    # the exclude_metadata flag to False. If listed in include_types,
    # remove it.
    if exclude_metadata is True:
        exclude_types.add('metadata')
        if include_types is not None and 'metadata' in include_types:
            include_types.remove('metadata')

    last_trx = _get_last_trx_id_for_namespace(namespace.id, db_session)
    if last_trx == pointer:
        return ([], pointer)

    while True:
        # deleted_at condition included to allow this query to be satisfied
        # via the legacy index on (namespace_id, deleted_at) for
        # performance. Also need to explicitly specify the index hint
        # because the query planner is dumb as nails and otherwise would
        # make this super slow for some values of namespace_id and pointer.
        # TODO(emfree): Remove this hack and ensure that the right index
        # (on namespace_id only) exists.
        transactions = db_session.query(Transaction). \
            filter(
                Transaction.id > pointer,
                Transaction.namespace_id == namespace.id,
                Transaction.deleted_at.is_(None)). \
            with_hint(Transaction, 'USE INDEX (namespace_id_deleted_at)')

        if exclude_types is not None:
            transactions = transactions.filter(
                ~Transaction.object_type.in_(exclude_types))

        if include_types is not None:
            transactions = transactions.filter(
                Transaction.object_type.in_(include_types))

        transactions = transactions. \
            order_by(asc(Transaction.id)).limit(result_limit).all()

        if not transactions:
            return ([], pointer)

        results = []

        # Group deltas by object type.
        trxs_by_obj_type = collections.defaultdict(list)
        for trx in transactions:
            trxs_by_obj_type[trx.object_type].append(trx)

        for obj_type, trxs in trxs_by_obj_type.items():
            # Build a dictionary mapping pairs (record_id, command) to
            # transaction. If successive modifies for a given record id
            # appear in the list of transactions, this will only keep the
            # latest one (which is what we want).
            latest_trxs = {(trx.record_id, trx.command): trx for trx in
                           sorted(trxs, key=lambda t: t.id)}.values()

            # Load all referenced not-deleted objects.
            ids_to_query = [trx.record_id for trx in latest_trxs
                            if trx.command != 'delete']

            object_cls = transaction_objects()[obj_type]

            if object_cls == Account:
                # The base query for Account queries the /Namespace/ table
                # since the API-returned "`account`" is a `namespace`
                # under-the-hood.
                query = db_session.query(Namespace).join(Account).filter(
                    Account.id.in_(ids_to_query),
                    Namespace.id == namespace.id)

                # Key by /namespace.account_id/ --
                # namespace.id may not be equal to account.id
                # and trx.record_id == account.id for `account` trxs.
                objects = {obj.account_id: obj for obj in query}
            else:
                query = db_session.query(object_cls).filter(
                    object_cls.id.in_(ids_to_query),
                    object_cls.namespace_id == namespace.id)

                if object_cls == Thread:
                    query = query.options(
                        *Thread.api_loading_options(expand))
                elif object_cls == Message:
                    query = query.options(
                        *Message.api_loading_options(expand))

                objects = {obj.id: obj for obj in query}

            for trx in latest_trxs:
                delta = {
                    'object': trx.object_type,
                    'event': EVENT_NAME_FOR_COMMAND[trx.command],
                    'id': trx.object_public_id,
                    'cursor': trx.public_id
                }
                if trx.command != 'delete':
                    obj = objects.get(trx.record_id)
                    if obj is None:
                        continue
                    repr_ = encode(obj,
                                   namespace_public_id=namespace.public_id,
                                   expand=expand)
                    delta['attributes'] = repr_

                results.append((trx.id, delta))

        if results:
            # Sort deltas by id of the underlying transactions.
            results.sort()
            deltas = [d for _, d in results]
            return (deltas, results[-1][0])
        else:
            # It's possible that none of the referenced objects exist any
            # more, meaning the result list is empty. In that case, keep
            # traversing the log until we get actual results or reach the
            # end.
            pointer = transactions[-1].id
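# With this version, `account` and `metadata` deltas are suppressed by
# default; a caller must opt in explicitly. A sketch, under the same
# `namespace`/`db_session` assumptions as the earlier usage example:
deltas, new_pointer = format_transactions_after_pointer(
    namespace, 0, db_session, 100,
    exclude_account=False, exclude_metadata=False)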
def format_transactions_after_pointer(namespace, pointer, db_session,
                                      result_limit, exclude_types=None,
                                      include_types=None):
    """
    Return a pair (deltas, new_pointer), where deltas is a list of change
    events, represented as dictionaries:
    {
      "object": <API object type, e.g. "thread">,
      "event": <"create", "modify", or "delete">,
      "attributes": <API representation of the object for insert/update
                     events>,
      "cursor": <public_id of the transaction>
    }

    and new_pointer is the integer id of the last included transaction.

    Arguments
    ---------
    namespace: Namespace
        The namespace for which to get changes.
    pointer: int
        Process transactions starting after this id.
    db_session: InboxSession
        database session
    result_limit: int
        Maximum number of results to return. (Because we may roll up
        multiple changes to the same object, fewer results can be
        returned.)
    exclude_types: list, optional
        If given, don't include transactions for these types of objects.

    """
    while True:
        # deleted_at condition included to allow this query to be satisfied
        # via the legacy index on (namespace_id, deleted_at) for
        # performance. Also need to explicitly specify the index hint
        # because the query planner is dumb as nails and otherwise would
        # make this super slow for some values of namespace_id and pointer.
        # TODO(emfree): Remove this hack and ensure that the right index
        # (on namespace_id only) exists.
        transactions = db_session.query(Transaction). \
            filter(
                Transaction.id > pointer,
                Transaction.namespace_id == namespace.id,
                Transaction.deleted_at.is_(None)). \
            with_hint(Transaction, 'USE INDEX (namespace_id_deleted_at)')

        if exclude_types is not None:
            transactions = transactions.filter(
                ~Transaction.object_type.in_(exclude_types))

        if include_types is not None:
            transactions = transactions.filter(
                Transaction.object_type.in_(include_types))

        transactions = transactions. \
            order_by(asc(Transaction.id)).limit(result_limit).all()

        if not transactions:
            return ([], pointer)

        results = []

        # Group deltas by object type.
        trxs_by_obj_type = collections.defaultdict(list)
        for trx in transactions:
            trxs_by_obj_type[trx.object_type].append(trx)

        for obj_type, trxs in trxs_by_obj_type.items():
            # Build a dictionary mapping pairs (record_id, command) to
            # transaction. If successive modifies for a given record id
            # appear in the list of transactions, this will only keep the
            # latest one (which is what we want).
            latest_trxs = {(trx.record_id, trx.command): trx for trx in
                           sorted(trxs, key=lambda t: t.id)}.values()

            # Load all referenced not-deleted objects.
            ids_to_query = [trx.record_id for trx in latest_trxs
                            if trx.command != 'delete']

            object_cls = transaction_objects()[obj_type]
            query = db_session.query(object_cls).filter(
                object_cls.id.in_(ids_to_query),
                object_cls.namespace_id == namespace.id)

            if object_cls in QUERY_OPTIONS:
                query = query.options(*QUERY_OPTIONS[object_cls])

            objects = {obj.id: obj for obj in query}

            for trx in latest_trxs:
                delta = {
                    'object': trx.object_type,
                    'event': EVENT_NAME_FOR_COMMAND[trx.command],
                    'id': trx.object_public_id,
                    'cursor': trx.public_id
                }
                if trx.command != 'delete':
                    obj = objects.get(trx.record_id)
                    if obj is None:
                        continue
                    repr_ = encode(
                        obj, namespace_public_id=namespace.public_id)
                    delta['attributes'] = repr_

                results.append((trx.id, delta))

        if results:
            # Sort deltas by id of the underlying transactions.
            results.sort()
            deltas = [d for _, d in results]
            return (deltas, results[-1][0])
        else:
            # It's possible that none of the referenced objects exist any
            # more, meaning the result list is empty. In that case, keep
            # traversing the log until we get actual results or reach the
            # end.
            pointer = transactions[-1].id
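# EVENT_NAME_FOR_COMMAND, used above, translates transaction commands into
# the API event names promised by the docstring. A sketch consistent with
# that contract (assumed, not copied from the source):
EVENT_NAME_FOR_COMMAND = {
    'insert': 'create',
    'update': 'modify',
    'delete': 'delete',
}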
def format_transactions_after_pointer(namespace, pointer, db_session,
                                      result_limit, exclude_types=None,
                                      include_types=None,
                                      exclude_folders=True,
                                      exclude_metadata=True,
                                      exclude_account=True, expand=False,
                                      is_n1=False):
    """
    Return a pair (deltas, new_pointer), where deltas is a list of change
    events, represented as dictionaries:
    {
      "object": <API object type, e.g. "thread">,
      "event": <"create", "modify", or "delete">,
      "attributes": <API representation of the object for insert/update
                     events>,
      "cursor": <public_id of the transaction>
    }

    and new_pointer is the integer id of the last included transaction.

    Arguments
    ---------
    namespace: Namespace
        The namespace for which to get changes.
    pointer: int
        Process transactions starting after this id.
    db_session: new_session
        database session
    result_limit: int
        Maximum number of results to return. (Because we may roll up
        multiple changes to the same object, fewer results can be
        returned.)
    exclude_types: list, optional
        If given, don't include transactions for these types of objects.

    """
    exclude_types = set(exclude_types) if exclude_types else set()
    # Begin backwards-compatibility shim -- suppress new object types for
    # now, because clients may not be able to deal with them.
    if exclude_folders is True:
        exclude_types.update(('folder', 'label'))

    if exclude_account is True:
        exclude_types.add('account')
    # End backwards-compatibility shim.

    # Metadata is excluded by default, and can only be included by setting
    # the exclude_metadata flag to False. If listed in include_types,
    # remove it.
    if exclude_metadata is True:
        exclude_types.add('metadata')
        if include_types is not None and 'metadata' in include_types:
            include_types.remove('metadata')

    last_trx = _get_last_trx_id_for_namespace(namespace.id, db_session)
    if last_trx == pointer:
        return ([], pointer)

    while True:
        transactions = db_session.query(Transaction). \
            filter(
                Transaction.id > pointer,
                Transaction.namespace_id == namespace.id)

        if exclude_types is not None:
            transactions = transactions.filter(
                ~Transaction.object_type.in_(exclude_types))

        if include_types is not None:
            transactions = transactions.filter(
                Transaction.object_type.in_(include_types))

        transactions = transactions. \
            order_by(asc(Transaction.id)).limit(result_limit).all()

        if not transactions:
            return ([], pointer)

        results = []

        # Group deltas by object type.
        trxs_by_obj_type = collections.defaultdict(list)
        for trx in transactions:
            trxs_by_obj_type[trx.object_type].append(trx)

        for obj_type, trxs in trxs_by_obj_type.items():
            # Build a dictionary mapping pairs (record_id, command) to
            # transaction. If successive modifies for a given record id
            # appear in the list of transactions, this will only keep the
            # latest one (which is what we want).
            latest_trxs = {(trx.record_id, trx.command): trx for trx in
                           sorted(trxs, key=lambda t: t.id)}.values()

            # Load all referenced not-deleted objects.
            ids_to_query = [trx.record_id for trx in latest_trxs
                            if trx.command != 'delete']

            object_cls = transaction_objects()[obj_type]

            if object_cls == Account:
                # The base query for Account queries the /Namespace/ table
                # since the API-returned "`account`" is a `namespace`
                # under-the-hood.
                query = db_session.query(Namespace).join(Account).filter(
                    Account.id.in_(ids_to_query),
                    Namespace.id == namespace.id)

                # Key by /namespace.account_id/ --
                # namespace.id may not be equal to account.id
                # and trx.record_id == account.id for `account` trxs.
                objects = {obj.account_id: obj for obj in query}
            else:
                query = db_session.query(object_cls).filter(
                    object_cls.id.in_(ids_to_query),
                    object_cls.namespace_id == namespace.id)

                if object_cls == Thread:
                    query = query.options(
                        *Thread.api_loading_options(expand))
                elif object_cls == Message:
                    query = query.options(
                        *Message.api_loading_options(expand))

                objects = {obj.id: obj for obj in query}

            for trx in latest_trxs:
                delta = {
                    'object': trx.object_type,
                    'event': EVENT_NAME_FOR_COMMAND[trx.command],
                    'id': trx.object_public_id,
                    'cursor': trx.public_id
                }
                if trx.command != 'delete':
                    obj = objects.get(trx.record_id)
                    if obj is None:
                        continue
                    repr_ = encode(obj,
                                   namespace_public_id=namespace.public_id,
                                   expand=expand, is_n1=is_n1)
                    delta['attributes'] = repr_

                results.append((trx.id, delta))

        if results:
            # Sort deltas by id of the underlying transactions.
            results.sort()
            deltas = [d for _, d in results]
            return (deltas, results[-1][0])
        else:
            # It's possible that none of the referenced objects exist any
            # more, meaning the result list is empty. In that case, keep
            # traversing the log until we get actual results or reach the
            # end.
            pointer = transactions[-1].id
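# For illustration, a single delta emitted by the loop above might look like
# this (all values hypothetical; 'attributes' carries the full encode()
# output and is absent for deletes):
example_delta = {
    'object': 'message',
    'event': 'modify',
    'id': '2b3c-example',      # trx.object_public_id
    'cursor': '9a8f-example',  # trx.public_id
    'attributes': {'subject': 'Hello'},  # truncated for the example
}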