def test_transaction_objects_mapped_for_all_models(db, default_namespace):
    """
    Test that all subclasses of HasRevisions are mapped by the
    transaction_objects() function.

    """
    assert set(HasRevisions.__subclasses__()).issubset(transaction_objects().values())
def test_transaction_objects_mapped_for_all_models(db, default_namespace):
    """
    Test that all subclasses of HasRevisions are mapped by the
    transaction_objects() function.

    """
    assert set(HasRevisions.__subclasses__()).issubset(transaction_objects().values())
示例#3
0
文件: search.py 项目: GEverding/inbox
    def _run(self):
        """
        Index into Elasticsearch the threads, messages of all namespaces.

        """
        # Indexing is namespace agnostic.
        # Note that although this means we do not restrict the Transaction
        # table query (via the format_transactions_after_pointer() call below)
        # to a namespace, since we pass a `result_limit` (== chunk_size)
        # argument, the query should still be performant.
        namespace_id = None

        # Only index messages, threads.
        object_types = transaction_objects()
        exclude_types = [
            api_name for model_name, api_name in object_types.iteritems()
            if model_name not in ['message', 'thread']
        ]

        with session_scope() as db_session:
            pointer = db_session.query(SearchIndexCursor).first()
            self.transaction_pointer = pointer.transaction_id if pointer else 0

        self.log.info('Starting search-index service',
                      transaction_pointer=self.transaction_pointer)

        while True:
            with session_scope() as db_session:
                deltas, new_pointer = format_transactions_after_pointer(
                    namespace_id, self.transaction_pointer, db_session,
                    self.chunk_size, _format_transaction_for_search,
                    exclude_types)

            # TODO[k]: We ideally want to index chunk_size at a time.
            # This currently indexes <= chunk_size, and it varies each time.
            if new_pointer is not None and \
                    new_pointer != self.transaction_pointer:

                self.index(deltas)
                self.update_pointer(new_pointer)
            else:
                sleep(self.poll_interval)
示例#4
0
文件: search.py 项目: GEverding/inbox
    def _run(self):
        """
        Index into Elasticsearch the threads, messages of all namespaces.

        """
        # Indexing is namespace agnostic.
        # Note that although this means we do not restrict the Transaction
        # table query (via the format_transactions_after_pointer() call below)
        # to a namespace, since we pass a `result_limit` (== chunk_size)
        # argument, the query should still be performant.
        namespace_id = None

        # Only index messages, threads.
        object_types = transaction_objects()
        exclude_types = [api_name for model_name, api_name in
                         object_types.iteritems() if model_name not in
                         ['message', 'thread']]

        with session_scope() as db_session:
            pointer = db_session.query(SearchIndexCursor).first()
            self.transaction_pointer = pointer.transaction_id if pointer else 0

        self.log.info('Starting search-index service',
                      transaction_pointer=self.transaction_pointer)

        while True:
            with session_scope() as db_session:
                deltas, new_pointer = format_transactions_after_pointer(
                    namespace_id, self.transaction_pointer, db_session,
                    self.chunk_size, _format_transaction_for_search,
                    exclude_types)

            # TODO[k]: We ideally want to index chunk_size at a time.
            # This currently indexes <= chunk_size, and it varies each time.
            if new_pointer is not None and \
                    new_pointer != self.transaction_pointer:

                self.index(deltas)
                self.update_pointer(new_pointer)
            else:
                sleep(self.poll_interval)
示例#5
0
    def index(self, transactions, db_session):
        """
        Translate database operations to Elasticsearch index operations
        and perform them.

        """
        namespace_map = defaultdict(lambda: defaultdict(list))

        for trx in transactions:
            namespace_id = trx.namespace.public_id
            type_ = trx.object_type
            if trx.command == 'delete':
                operation = 'delete'
                api_repr = {'id': trx.object_public_id}
            else:
                operation = 'index'
                object_cls = transaction_objects()[trx.object_type]
                obj = db_session.query(object_cls).get(trx.record_id)
                if obj is None:
                    continue
                api_repr = encode(obj, namespace_public_id=namespace_id)

            namespace_map[namespace_id][type_].append((operation, api_repr))

        self.log.info('namespaces to index count', count=len(namespace_map))

        for namespace_id in namespace_map:
            engine = NamespaceSearchEngine(namespace_id, create_index=True)

            messages = namespace_map[namespace_id]['message']
            message_count = engine.messages.bulk_index(messages) if messages \
                else 0

            threads = namespace_map[namespace_id]['thread']
            thread_count = engine.threads.bulk_index(threads) if threads \
                else 0

            self.log.info('per-namespace index counts',
                          namespace_id=namespace_id,
                          message_count=message_count,
                          thread_count=thread_count)
示例#6
0
def format_transactions_after_pointer(namespace,
                                      pointer,
                                      db_session,
                                      result_limit,
                                      exclude_types=None,
                                      include_types=None,
                                      exclude_folders=True,
                                      expand=False):
    """
    Return a pair (deltas, new_pointer), where deltas is a list of change
    events, represented as dictionaries:
    {
      "object": <API object type, e.g. "thread">,
      "event": <"create", "modify", or "delete>,
      "attributes": <API representation of the object for insert/update events>
      "cursor": <public_id of the transaction>
    }

    and new_pointer is the integer id of the last included transaction

    Arguments
    ---------
    namespace_id: int
        Id of the namespace for which to get changes.
    pointer: int
        Process transactions starting after this id.
    db_session: new_session
        database session
    result_limit: int
        Maximum number of results to return. (Because we may roll up multiple
        changes to the same object, fewer results can be returned.)
    format_transaction_fn: function pointer
        Function that defines how to format the transactions.
    exclude_types: list, optional
        If given, don't include transactions for these types of objects.

    """
    exclude_types = set(exclude_types) if exclude_types else set()
    # Begin backwards-compatibility shim -- suppress new object types for now,
    # because clients may not be able to deal with them.
    exclude_types.add('account')
    if exclude_folders is True:
        exclude_types.update(('folder', 'label'))
    # End backwards-compatibility shim.

    last_trx = _get_last_trx_id_for_namespace(namespace.id, db_session)
    if last_trx == pointer:
        return ([], pointer)

    while True:
        # deleted_at condition included to allow this query to be satisfied via
        # the legacy index on (namespace_id, deleted_at) for performance.
        # Also need to explicitly specify the index hint because the query
        # planner is dumb as nails and otherwise would make this super slow for
        # some values of namespace_id and pointer.
        # TODO(emfree): Remove this hack and ensure that the right index (on
        # namespace_id only) exists.
        transactions = db_session.query(Transaction). \
            filter(
                Transaction.id > pointer,
                Transaction.namespace_id == namespace.id,
                Transaction.deleted_at.is_(None)). \
            with_hint(Transaction, 'USE INDEX (namespace_id_deleted_at)')

        if exclude_types is not None:
            transactions = transactions.filter(
                ~Transaction.object_type.in_(exclude_types))

        if include_types is not None:
            transactions = transactions.filter(
                Transaction.object_type.in_(include_types))

        transactions = transactions. \
            order_by(asc(Transaction.id)).limit(result_limit).all()

        if not transactions:
            return ([], pointer)

        results = []

        # Group deltas by object type.
        trxs_by_obj_type = collections.defaultdict(list)
        for trx in transactions:
            trxs_by_obj_type[trx.object_type].append(trx)

        for obj_type, trxs in trxs_by_obj_type.items():
            # Build a dictionary mapping pairs (record_id, command) to
            # transaction. If successive modifies for a given record id appear
            # in the list of transactions, this will only keep the latest
            # one (which is what we want).
            latest_trxs = {(trx.record_id, trx.command): trx
                           for trx in sorted(trxs, key=lambda t: t.id)
                           }.values()
            # Load all referenced not-deleted objects.
            ids_to_query = [
                trx.record_id for trx in latest_trxs if trx.command != 'delete'
            ]

            object_cls = transaction_objects()[obj_type]
            query = db_session.query(object_cls).filter(
                object_cls.id.in_(ids_to_query),
                object_cls.namespace_id == namespace.id)
            if object_cls == Thread:
                query = query.options(*Thread.api_loading_options(expand))
            elif object_cls == Message:
                query = query.options(*Message.api_loading_options(expand))
            objects = {obj.id: obj for obj in query}

            for trx in latest_trxs:
                delta = {
                    'object': trx.object_type,
                    'event': EVENT_NAME_FOR_COMMAND[trx.command],
                    'id': trx.object_public_id,
                    'cursor': trx.public_id
                }
                if trx.command != 'delete':
                    obj = objects.get(trx.record_id)
                    if obj is None:
                        continue
                    repr_ = encode(obj,
                                   namespace_public_id=namespace.public_id,
                                   expand=expand)
                    delta['attributes'] = repr_

                results.append((trx.id, delta))

        if results:
            # Sort deltas by id of the underlying transactions.
            results.sort()
            deltas = [d for _, d in results]
            return (deltas, results[-1][0])
        else:
            # It's possible that none of the referenced objects exist any more,
            # meaning the result list is empty. In that case, keep traversing
            # the log until we get actual results or reach the end.
            pointer = transactions[-1].id
示例#7
0
def format_transactions_after_pointer(namespace, pointer, db_session,
                                      result_limit, exclude_types=None,
                                      include_types=None, exclude_folders=True,
                                      exclude_metadata=True, exclude_account=True,
                                      expand=False):
    """
    Return a pair (deltas, new_pointer), where deltas is a list of change
    events, represented as dictionaries:
    {
      "object": <API object type, e.g. "thread">,
      "event": <"create", "modify", or "delete>,
      "attributes": <API representation of the object for insert/update events>
      "cursor": <public_id of the transaction>
    }

    and new_pointer is the integer id of the last included transaction

    Arguments
    ---------
    namespace_id: int
        Id of the namespace for which to get changes.
    pointer: int
        Process transactions starting after this id.
    db_session: new_session
        database session
    result_limit: int
        Maximum number of results to return. (Because we may roll up multiple
        changes to the same object, fewer results can be returned.)
    format_transaction_fn: function pointer
        Function that defines how to format the transactions.
    exclude_types: list, optional
        If given, don't include transactions for these types of objects.

    """
    exclude_types = set(exclude_types) if exclude_types else set()
    # Begin backwards-compatibility shim -- suppress new object types for now,
    # because clients may not be able to deal with them.
    if exclude_folders is True:
        exclude_types.update(('folder', 'label'))
    if exclude_account is True:
        exclude_types.add('account')
    # End backwards-compatibility shim.

    # Metadata is excluded by default, and can only be included by setting the
    # exclude_metadata flag to False. If listed in include_types, remove it.
    if exclude_metadata is True:
        exclude_types.add('metadata')
    if include_types is not None and 'metadata' in include_types:
        include_types.remove('metadata')

    last_trx = _get_last_trx_id_for_namespace(namespace.id, db_session)
    if last_trx == pointer:
        return ([], pointer)

    while True:
        # deleted_at condition included to allow this query to be satisfied via
        # the legacy index on (namespace_id, deleted_at) for performance.
        # Also need to explicitly specify the index hint because the query
        # planner is dumb as nails and otherwise would make this super slow for
        # some values of namespace_id and pointer.
        # TODO(emfree): Remove this hack and ensure that the right index (on
        # namespace_id only) exists.
        transactions = db_session.query(Transaction). \
            filter(
                Transaction.id > pointer,
                Transaction.namespace_id == namespace.id,
                Transaction.deleted_at.is_(None)). \
            with_hint(Transaction, 'USE INDEX (namespace_id_deleted_at)')

        if exclude_types is not None:
            transactions = transactions.filter(
                ~Transaction.object_type.in_(exclude_types))

        if include_types is not None:
            transactions = transactions.filter(
                Transaction.object_type.in_(include_types))

        transactions = transactions. \
            order_by(asc(Transaction.id)).limit(result_limit).all()

        if not transactions:
            return ([], pointer)

        results = []

        # Group deltas by object type.
        trxs_by_obj_type = collections.defaultdict(list)
        for trx in transactions:
            trxs_by_obj_type[trx.object_type].append(trx)

        for obj_type, trxs in trxs_by_obj_type.items():
            # Build a dictionary mapping pairs (record_id, command) to
            # transaction. If successive modifies for a given record id appear
            # in the list of transactions, this will only keep the latest
            # one (which is what we want).
            latest_trxs = {(trx.record_id, trx.command): trx for trx in
                           sorted(trxs, key=lambda t: t.id)}.values()
            # Load all referenced not-deleted objects.
            ids_to_query = [trx.record_id for trx in latest_trxs
                            if trx.command != 'delete']

            object_cls = transaction_objects()[obj_type]

            if object_cls == Account:
                # The base query for Account queries the /Namespace/ table
                # since the API-returned "`account`" is a `namespace`
                # under-the-hood.
                query = db_session.query(Namespace).join(Account).filter(
                    Account.id.in_(ids_to_query),
                    Namespace.id == namespace.id)

                # Key by /namespace.account_id/ --
                # namespace.id may not be equal to account.id
                # and trx.record_id == account.id for `account` trxs.
                objects = {obj.account_id: obj for obj in query}
            else:
                query = db_session.query(object_cls).filter(
                    object_cls.id.in_(ids_to_query),
                    object_cls.namespace_id == namespace.id)

                if object_cls == Thread:
                    query = query.options(*Thread.api_loading_options(expand))
                elif object_cls == Message:
                    query = query.options(*Message.api_loading_options(expand))

                objects = {obj.id: obj for obj in query}

            for trx in latest_trxs:
                delta = {
                    'object': trx.object_type,
                    'event': EVENT_NAME_FOR_COMMAND[trx.command],
                    'id': trx.object_public_id,
                    'cursor': trx.public_id
                }
                if trx.command != 'delete':
                    obj = objects.get(trx.record_id)
                    if obj is None:
                        continue
                    repr_ = encode(
                        obj, namespace_public_id=namespace.public_id,
                        expand=expand)
                    delta['attributes'] = repr_

                results.append((trx.id, delta))

        if results:
            # Sort deltas by id of the underlying transactions.
            results.sort()
            deltas = [d for _, d in results]
            return (deltas, results[-1][0])
        else:
            # It's possible that none of the referenced objects exist any more,
            # meaning the result list is empty. In that case, keep traversing
            # the log until we get actual results or reach the end.
            pointer = transactions[-1].id
def format_transactions_after_pointer(namespace, pointer, db_session,
                                      result_limit, exclude_types=None,
                                      include_types=None):
    """
    Return a pair (deltas, new_pointer), where deltas is a list of change
    events, represented as dictionaries:
    {
      "object": <API object type, e.g. "thread">,
      "event": <"create", "modify", or "delete>,
      "attributes": <API representation of the object for insert/update events>
      "cursor": <public_id of the transaction>
    }

    and new_pointer is the integer id of the last included transaction

    Arguments
    ---------
    namespace_id: int
        Id of the namespace for which to get changes.
    pointer: int
        Process transactions starting after this id.
    db_session: InboxSession
        database session
    result_limit: int
        Maximum number of results to return. (Because we may roll up multiple
        changes to the same object, fewer results can be returned.)
    format_transaction_fn: function pointer
        Function that defines how to format the transactions.
    exclude_types: list, optional
        If given, don't include transactions for these types of objects.

    """
    while True:
        # deleted_at condition included to allow this query to be satisfied via
        # the legacy index on (namespace_id, deleted_at) for performance.
        # Also need to explicitly specify the index hint because the query
        # planner is dumb as nails and otherwise would make this super slow for
        # some values of namespace_id and pointer.
        # TODO(emfree): Remove this hack and ensure that the right index (on
        # namespace_id only) exists.
        transactions = db_session.query(Transaction). \
            filter(
                Transaction.id > pointer,
                Transaction.namespace_id == namespace.id,
                Transaction.deleted_at.is_(None)). \
            with_hint(Transaction, 'USE INDEX (namespace_id_deleted_at)')

        if exclude_types is not None:
            transactions = transactions.filter(
                ~Transaction.object_type.in_(exclude_types))

        if include_types is not None:
            transactions = transactions.filter(
                Transaction.object_type.in_(include_types))

        transactions = transactions. \
            order_by(asc(Transaction.id)).limit(result_limit).all()

        if not transactions:
            return ([], pointer)

        results = []

        # Group deltas by object type.
        trxs_by_obj_type = collections.defaultdict(list)
        for trx in transactions:
            trxs_by_obj_type[trx.object_type].append(trx)

        for obj_type, trxs in trxs_by_obj_type.items():
            # Build a dictionary mapping pairs (record_id, command) to
            # transaction. If successive modifies for a given record id appear
            # in the list of transactions, this will only keep the latest
            # one (which is what we want).
            latest_trxs = {(trx.record_id, trx.command): trx for trx in
                           sorted(trxs, key=lambda t: t.id)}.values()
            # Load all referenced not-deleted objects.
            ids_to_query = [trx.record_id for trx in latest_trxs
                            if trx.command != 'delete']

            object_cls = transaction_objects()[obj_type]
            query = db_session.query(object_cls).filter(
                object_cls.id.in_(ids_to_query),
                object_cls.namespace_id == namespace.id)
            if object_cls in QUERY_OPTIONS:
                query = query.options(*QUERY_OPTIONS[object_cls])
            objects = {obj.id: obj for obj in query}

            for trx in latest_trxs:
                delta = {
                    'object': trx.object_type,
                    'event': EVENT_NAME_FOR_COMMAND[trx.command],
                    'id': trx.object_public_id,
                    'cursor': trx.public_id
                }
                if trx.command != 'delete':
                    obj = objects.get(trx.record_id)
                    if obj is None:
                        continue
                    repr_ = encode(
                        obj, namespace_public_id=namespace.public_id)
                    delta['attributes'] = repr_

                results.append((trx.id, delta))

        if results:
            # Sort deltas by id of the underlying transactions.
            results.sort()
            deltas = [d for _, d in results]
            return (deltas, results[-1][0])
        else:
            # It's possible that none of the referenced objects exist any more,
            # meaning the result list is empty. In that case, keep traversing
            # the log until we get actual results or reach the end.
            pointer = transactions[-1].id
示例#9
0
def format_transactions_after_pointer(namespace,
                                      pointer,
                                      db_session,
                                      result_limit,
                                      exclude_types=None,
                                      include_types=None,
                                      exclude_folders=True,
                                      exclude_metadata=True,
                                      exclude_account=True,
                                      expand=False,
                                      is_n1=False):
    """
    Return a pair (deltas, new_pointer), where deltas is a list of change
    events, represented as dictionaries:
    {
      "object": <API object type, e.g. "thread">,
      "event": <"create", "modify", or "delete>,
      "attributes": <API representation of the object for insert/update events>
      "cursor": <public_id of the transaction>
    }

    and new_pointer is the integer id of the last included transaction

    Arguments
    ---------
    namespace_id: int
        Id of the namespace for which to get changes.
    pointer: int
        Process transactions starting after this id.
    db_session: new_session
        database session
    result_limit: int
        Maximum number of results to return. (Because we may roll up multiple
        changes to the same object, fewer results can be returned.)
    format_transaction_fn: function pointer
        Function that defines how to format the transactions.
    exclude_types: list, optional
        If given, don't include transactions for these types of objects.

    """
    exclude_types = set(exclude_types) if exclude_types else set()
    # Begin backwards-compatibility shim -- suppress new object types for now,
    # because clients may not be able to deal with them.
    if exclude_folders is True:
        exclude_types.update(('folder', 'label'))
    if exclude_account is True:
        exclude_types.add('account')
    # End backwards-compatibility shim.

    # Metadata is excluded by default, and can only be included by setting the
    # exclude_metadata flag to False. If listed in include_types, remove it.
    if exclude_metadata is True:
        exclude_types.add('metadata')
    if include_types is not None and 'metadata' in include_types:
        include_types.remove('metadata')

    last_trx = _get_last_trx_id_for_namespace(namespace.id, db_session)
    if last_trx == pointer:
        return ([], pointer)

    while True:
        transactions = db_session.query(Transaction). \
            filter(
                Transaction.id > pointer,
                Transaction.namespace_id == namespace.id)

        if exclude_types is not None:
            transactions = transactions.filter(
                ~Transaction.object_type.in_(exclude_types))

        if include_types is not None:
            transactions = transactions.filter(
                Transaction.object_type.in_(include_types))

        transactions = transactions. \
            order_by(asc(Transaction.id)).limit(result_limit).all()

        if not transactions:
            return ([], pointer)

        results = []

        # Group deltas by object type.
        trxs_by_obj_type = collections.defaultdict(list)
        for trx in transactions:
            trxs_by_obj_type[trx.object_type].append(trx)

        for obj_type, trxs in trxs_by_obj_type.items():
            # Build a dictionary mapping pairs (record_id, command) to
            # transaction. If successive modifies for a given record id appear
            # in the list of transactions, this will only keep the latest
            # one (which is what we want).
            latest_trxs = {(trx.record_id, trx.command): trx
                           for trx in sorted(trxs, key=lambda t: t.id)
                           }.values()
            # Load all referenced not-deleted objects.
            ids_to_query = [
                trx.record_id for trx in latest_trxs if trx.command != 'delete'
            ]

            object_cls = transaction_objects()[obj_type]

            if object_cls == Account:
                # The base query for Account queries the /Namespace/ table
                # since the API-returned "`account`" is a `namespace`
                # under-the-hood.
                query = db_session.query(Namespace).join(Account).filter(
                    Account.id.in_(ids_to_query), Namespace.id == namespace.id)

                # Key by /namespace.account_id/ --
                # namespace.id may not be equal to account.id
                # and trx.record_id == account.id for `account` trxs.
                objects = {obj.account_id: obj for obj in query}
            else:
                query = db_session.query(object_cls).filter(
                    object_cls.id.in_(ids_to_query),
                    object_cls.namespace_id == namespace.id)

                if object_cls == Thread:
                    query = query.options(*Thread.api_loading_options(expand))
                elif object_cls == Message:
                    query = query.options(*Message.api_loading_options(expand))

                objects = {obj.id: obj for obj in query}

            for trx in latest_trxs:
                delta = {
                    'object': trx.object_type,
                    'event': EVENT_NAME_FOR_COMMAND[trx.command],
                    'id': trx.object_public_id,
                    'cursor': trx.public_id
                }
                if trx.command != 'delete':
                    obj = objects.get(trx.record_id)
                    if obj is None:
                        continue
                    repr_ = encode(obj,
                                   namespace_public_id=namespace.public_id,
                                   expand=expand,
                                   is_n1=is_n1)
                    delta['attributes'] = repr_

                results.append((trx.id, delta))

        if results:
            # Sort deltas by id of the underlying transactions.
            results.sort()
            deltas = [d for _, d in results]
            return (deltas, results[-1][0])
        else:
            # It's possible that none of the referenced objects exist any more,
            # meaning the result list is empty. In that case, keep traversing
            # the log until we get actual results or reach the end.
            pointer = transactions[-1].id