Example #1
    def __init__(self, index_id, doc_type, query_class, log):
        self.index_id = index_id
        self.doc_type = doc_type
        self.query_engine = DSLQueryEngine(query_class)

        self.log = log

        # TODO(emfree): probably want to try to keep persistent connections
        # around, instead of creating a new one each time.
        self._connection = new_connection()
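
For context, a minimal sketch of how such an adaptor might be constructed. The keyword-argument logging style used throughout these examples suggests a structlog-style logger; `MessageSearchQuery` is an illustrative stand-in for `query_class`, not a name confirmed by these examples:

import structlog

# Hypothetical construction: a 'message' doc_type with an assumed query class.
log = structlog.get_logger()
adaptor = BaseSearchAdaptor(index_id='some-namespace-id',
                            doc_type='message',
                            query_class=MessageSearchQuery,
                            log=log)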
Example #2
    def __init__(self, index_id, doc_type, query_class):
        # TODO(emfree): probably want to try to keep persistent connections
        # around, instead of creating a new one each time.
        self._connection = new_connection()
        self.index_id = index_id
        self.doc_type = doc_type

        self.query_engine = DSLQueryEngine(query_class)
Example #3
import copy

import elasticsearch
from elasticsearch.helpers import bulk

# Assumed available from the surrounding module: new_connection,
# DSLQueryEngine, wrap_es_errors, and a module-level `log`.


class BaseSearchAdaptor(object):
    """
    Adapter between the API and an Elasticsearch backend, for a single index
    and document type.

    """
    def __init__(self, index_id, doc_type, query_class):
        # TODO(emfree): probably want to try to keep persistent connections
        # around, instead of creating a new one each time.
        self._connection = new_connection()
        self.index_id = index_id
        self.doc_type = doc_type

        self.query_engine = DSLQueryEngine(query_class)

    @wrap_es_errors
    def _index_document(self, object_repr, **kwargs):
        """
        (Re)index a document for the object with API representation
        `object_repr`. Creates the actual index for the namespace if it doesn't
        already exist.

        """
        assert self.index_id == object_repr['namespace_id']

        index_args = dict(
            index=self.index_id,
            doc_type=self.doc_type,
            id=object_repr['id'],
            body=object_repr)
        index_args.update(**kwargs)
        try:
            self._connection.index(**index_args)
        except elasticsearch.exceptions.TransportError:
            log.error('Index failure',
                      index=self.index_id, doc_type=self.doc_type,
                      object_id=index_args['id'])
            raise

    @wrap_es_errors
    def _bulk_index(self, objects, parent=None):
        index_args = []

        for object_repr in objects:
            args = dict(_index=self.index_id,
                        _type=self.doc_type,
                        _id=object_repr['id'],
                        _source=object_repr)

            if parent is not None:
                args.update(dict(_parent=object_repr[parent]))

            index_args.append(args)

        try:
            count, failures = bulk(self._connection, index_args)
        except elasticsearch.exceptions.TransportError:
            # `failures` is unbound when bulk() itself raises, so log only
            # the ids of the documents we attempted to index.
            log.error('Bulk index failure',
                      index=self.index_id, doc_type=self.doc_type,
                      object_ids=[i['_id'] for i in index_args])
            raise

        return count

    @wrap_es_errors
    def search(self, query, max_results=100, offset=0, explain=True):
        """Perform a search and return the results."""
        dsl_query = self.query_engine.generate_query(query)

        log.debug('search query', query=query, dsl_query=dsl_query)

        raw_results = self._connection.search(
            index=self.index_id,
            doc_type=self.doc_type,
            body=dsl_query,
            size=max_results,
            from_=offset,
            explain=explain)

        self._log_query(query, raw_results)

        api_results = self.query_engine.process_results(raw_results)
        return api_results

    @wrap_es_errors
    def get_mapping(self):
        return self._connection.indices.get_mapping(index=self.index_id,
                                                    doc_type=self.doc_type)

    def _log_query(self, query, raw_results):
        """
        Log query and result info, stripping out actual result bodies but
        keeping ids and metadata.

        """
        log_results = copy.deepcopy(raw_results)
        for hit in log_results['hits']['hits']:
            del hit['_source']
        log.debug('search query results', query=query, results=log_results)
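
To make the flow concrete, a hedged sketch of the index-then-search round trip this class supports; the document body and query string are invented, and `adaptor` stands for an instance of a concrete subclass:

# Hypothetical round trip; document contents and query are invented.
object_repr = {
    'id': 'abc123',
    'namespace_id': 'some-namespace-id',  # must match adaptor.index_id
    'subject': 'Quarterly report',
}
adaptor._index_document(object_repr)   # (re)indexes the document by its id
results = adaptor.search('quarterly', max_results=10)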
Example #4
import copy

import elasticsearch
from elasticsearch.helpers import bulk

# Assumed available from the surrounding module: new_connection,
# DSLQueryEngine, wrap_es_errors, SearchEngineError.


class BaseSearchAdaptor(object):
    """
    Base adaptor between the Nilas API and Elasticsearch for a single index and
    document type. Subclasses implement the document-type-specific logic.

    """
    def __init__(self, index_id, doc_type, query_class, log):
        self.index_id = index_id
        self.doc_type = doc_type
        self.query_engine = DSLQueryEngine(query_class)

        self.log = log

        # TODO(emfree): probably want to try to keep persistent connections
        # around, instead of creating a new one each time.
        self._connection = new_connection()

    @wrap_es_errors
    def _index_document(self, object_repr, **kwargs):
        """
        (Re)index a document for the object with Nilas API representation
        `object_repr`.

        """
        assert self.index_id == object_repr['namespace_id']

        index_args = dict(
            index=self.index_id,
            doc_type=self.doc_type,
            id=object_repr['id'],
            body=object_repr)
        index_args.update(**kwargs)
        try:
            self._connection.index(**index_args)
        except elasticsearch.exceptions.TransportError as e:
            self.log.error('Index failure', error=e.error,
                           doc_type=self.doc_type, object_id=index_args['id'])
            raise

    @wrap_es_errors
    def _bulk(self, objects, parent=None):
        """
        Perform a batch of index operations rather than a single one.

        Arguments
        ---------
        objects:
            list of (op_type, object) tuples.

            op_type defines the index operation to perform
            ('index' for creates and updates, 'delete' for deletes).

            object is a dict of document attributes required for the operation.

        Returns
        -------
        Count of index operations on success, raises SearchEngineError on
        failure.

        """
        index_args = []

        def should_raise(failure):
            # A 404 status (e.g. deleting an already-missing document) is
            # benign; any other error status is a real failure.
            for op_type, info in failure.items():
                if info.get('status') not in [None, 404]:
                    return True
            return False

        for op, object_repr in objects:
            args = dict(_op_type=op,
                        _index=self.index_id,
                        _type=self.doc_type,
                        _id=object_repr['id'])

            if op != 'delete':
                args.update(dict(_source=object_repr))

                if parent is not None:
                    args.update(dict(_parent=object_repr[parent]))

            index_args.append(args)

        try:
            count, failures = bulk(self._connection, index_args)
        except elasticsearch.exceptions.TransportError as e:
            self.log.error('Bulk index failure', error=e.error,
                           doc_type=self.doc_type,
                           object_ids=[i['_id'] for i in index_args])
            raise SearchEngineError('Bulk index failure!')
        if count != len(objects):
            self.log.error('Bulk index failure',
                           error='Not all indices created',
                           doc_type=self.doc_type,
                           object_ids=[i['_id'] for i in index_args],
                           failures=failures)

            if any(should_raise(f) for f in failures):
                raise SearchEngineError('Bulk index failure!')

        return count

    @wrap_es_errors
    def search(self, query, sort, max_results=100, offset=0, explain=True):
        """ Perform a search and return the results. """
        dsl_query = self.query_engine.generate_query(query)

        self.log.debug('search query', query=query, dsl_query=dsl_query)

        search_kwargs = dict(index=self.index_id,
                             doc_type=self.doc_type,
                             body=dsl_query,
                             size=max_results,
                             from_=offset,
                             explain=explain)

        # Split this out to a Sort class with subclasses for
        # MessageSort/ThreadSort if we expand sorting to be more flexible.
        if sort != 'relevance':
            # Only the 'message' and 'thread' doc types define a timestamp
            # field to sort on.
            if self.doc_type == 'message':
                timestamp_field = 'date'
            elif self.doc_type == 'thread':
                timestamp_field = 'last_message_timestamp'
            search_kwargs['sort'] = '{}:desc'.format(timestamp_field)

        raw_results = self._connection.search(**search_kwargs)

        self._log_query(query, raw_results)

        total, api_results = self.query_engine.process_results(raw_results)
        return dict(total=total, results=api_results)

    @wrap_es_errors
    def get_mapping(self):
        return self._connection.indices.get_mapping(index=self.index_id,
                                                    doc_type=self.doc_type)

    def _log_query(self, query, raw_results):
        """
        Log query and result info, stripping out actual result bodies but
        keeping ids and metadata.

        """
        log_results = copy.deepcopy(raw_results)
        for hit in log_results['hits']['hits']:
            del hit['_source']
        self.log.debug('search query results', query=query,
                       results=log_results)
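
The `_bulk` docstring above defines its input as (op_type, object) tuples; here is a minimal sketch of a call under those rules, with invented document contents (for 'delete' operations only the 'id' key is read):

# Hypothetical _bulk call; ids and fields are invented for illustration.
objects = [
    ('index', {'id': '1', 'namespace_id': 'ns', 'subject': 'hello'}),
    ('index', {'id': '2', 'namespace_id': 'ns', 'subject': 'world'}),
    ('delete', {'id': '3'}),  # deletes need only the document id
]
count = adaptor._bulk(objects)  # number of successful operations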