예제 #1
0
    def iter_facet_query_types(self, association):
        """Iterate over query types paired with their associated facets.

        `association` selects which kind of association is reported, and must
        be either IndexerConnection.FacetQueryType_Preferred or
        IndexerConnection.FacetQueryType_Never.  Facets associated with a
        query type in any other manner are left out.

        Each item produced by the iterator is a 2-tuple of
        (query type, set of facets associated with it).

        The items can be fed straight to the dict() builtin, for example:

         >>> conn = IndexerConnection('db')
         >>> conn.add_field_action('foo', FieldActions.FACET)
         >>> conn.add_field_action('bar', FieldActions.FACET)
         >>> conn.add_field_action('baz', FieldActions.FACET)
         >>> conn.set_facet_for_query_type('type1', 'foo', conn.FacetQueryType_Preferred)
         >>> conn.set_facet_for_query_type('type1', 'bar', conn.FacetQueryType_Never)
         >>> conn.set_facet_for_query_type('type1', 'baz', conn.FacetQueryType_Never)
         >>> conn.set_facet_for_query_type('type2', 'bar', conn.FacetQueryType_Preferred)
         >>> dict(conn.iter_facet_query_types(conn.FacetQueryType_Preferred))
         {'type1': set(['foo']), 'type2': set(['bar'])}
         >>> dict(conn.iter_facet_query_types(conn.FacetQueryType_Never))
         {'type1': set(['bar', 'baz'])}

        Raises errors.IndexerError if the connection has been closed, or if
        facets are listed among the missing xapian features.

        """
        connection_closed = self._index is None
        if connection_closed:
            raise errors.IndexerError("IndexerConnection has been closed")

        facets_missing = 'facets' in _checkxapian.missing_features
        if facets_missing:
            raise errors.IndexerError(
                "Facets unsupported with this release of xapian")

        table = self._facet_query_table
        return FacetQueryTypeIter(table, association)
예제 #2
0
    def get_document(self, id):
        """Get the document with the specified unique ID.

        - `id` is the unique ID of the document; it is looked up in the index
          as the term 'Q' + id.

        Returns a ProcessedDocument wrapping the stored document.

        Raises a KeyError if there is no such document, and an
        errors.IndexerError if the connection has been closed or if more than
        one document is found with the same unique ID.

        """
        if self._index is None:
            raise errors.IndexerError("IndexerConnection has been closed")
        postlist = self._index.postlist('Q' + id)

        # Use the next() builtin rather than the .next() method, so that the
        # lookup works with both the Python 2 and Python 3 iterator protocols.
        try:
            plitem = next(postlist)
        except StopIteration:
            # Unique ID not found
            raise KeyError('Unique ID %r not found' % id)

        try:
            next(postlist)
        except StopIteration:
            # Only one instance of the unique ID found, as it should be.
            pass
        else:
            raise errors.IndexerError("Multiple documents "  #pragma: no cover
                                      "found with same unique ID")

        result = ProcessedDocument(self._field_mappings)
        result.id = id
        result._doc = self._index.get_document(plitem.docid)
        return result
예제 #3
0
    def replace(self, document):
        """Replace a document in the search engine index.

        The supplied document is processed first if it has no `_doc`
        attribute (ie, if it is not already a processed document).

        The document must have an id set: if it does not, an
        errors.IndexerError is raised.  If the id does not already exist in
        the database, this method has the same effect as add().

        When a memory limit is configured, the buffered-memory estimate is
        updated and a flush is triggered once the limit is exceeded.

        """
        if self._index is None:
            raise errors.IndexerError("IndexerConnection has been closed")

        already_processed = hasattr(document, '_doc')
        if not already_processed:
            document = self.process(document)

        # A replacement is addressed by unique id, so one must be present.
        doc_id = document.id
        if doc_id is None:
            raise errors.IndexerError(
                "No document ID set for document supplied to replace().")

        xapdoc = document.prepare()
        self._index.replace_document('Q' + doc_id, xapdoc)

        if self._max_mem is None:
            return
        self._mem_buffered += self._get_bytes_used_by_doc_terms(xapdoc)
        if self._mem_buffered > self._max_mem:
            self.flush()
예제 #4
0
    def set_metadata(self, key, value):
        """Store an item of metadata in the connection.

        Subsequent calls to get_metadata() with the same key will return the
        value supplied here.

        Keys with a leading underscore are reserved for internal use - you
        should not use such keys unless you really know what you are doing.

        The value is stored in the database, but will not be visible to
        readers (ie, search connections) until after the next flush.

        The key is limited to about 200 characters (the same length as a term
        is limited to).  The value can be several megabytes in size.

        To remove an item of metadata, simply call this with a `value`
        parameter containing an empty string.

        Raises errors.IndexerError if the connection has been closed, or if
        the underlying index object has no `set_metadata` method.

        """
        index = self._index
        if index is None:
            raise errors.IndexerError("IndexerConnection has been closed")
        metadata_supported = hasattr(index, 'set_metadata')
        if not metadata_supported:
            raise errors.IndexerError(
                "Version of xapian in use does not support metadata")
        log(index.set_metadata, key, value)
예제 #5
0
    def set_facet_for_query_type(self, query_type, facet, association):
        """Set the association between a query type and a facet.

        `association` must be one of
        IndexerConnection.FacetQueryType_Preferred,
        IndexerConnection.FacetQueryType_Never or None.  Passing None removes
        any previously set association for the (query type, facet) pair.

        Raises errors.IndexerError if the connection has been closed or if
        `query_type` is None; `facet` is checked with _assert_facet() before
        anything is changed.

        The configuration is marked as modified whenever the checks pass.

        """
        if self._index is None:
            raise errors.IndexerError("IndexerConnection has been closed")
        if query_type is None:
            raise errors.IndexerError(
                "Cannot set query type information for None")
        self._assert_facet(facet)

        table = self._facet_query_table
        facets_for_type = table.setdefault(query_type, {})
        if association is None:
            facets_for_type.pop(facet, None)
        else:
            facets_for_type[facet] = association

        # Don't leave empty entries behind for query types with no facets.
        if not facets_for_type:
            del table[query_type]
        self._config_modified = True
예제 #6
0
    def get_metadata(self, key):
        """Fetch an item of metadata stored in the connection.

        The value returned is one stored by a previous call to set_metadata.

        If the value is not found, this will return the empty string.

        Raises errors.IndexerError if the connection has been closed, or if
        the underlying index object has no `get_metadata` method.

        """
        index = self._index
        if index is None:
            raise errors.IndexerError("IndexerConnection has been closed")
        metadata_supported = hasattr(index, 'get_metadata')
        if not metadata_supported:
            raise errors.IndexerError(
                "Version of xapian in use does not support metadata")
        return log(index.get_metadata, key)
예제 #7
0
    def get_subfacets(self, facet):
        """Get a list of subfacets of a facet.

        - `facet` is the parent facet to look up.

        Returns a list of every subfacet whose recorded parent in the facet
        hierarchy equals `facet` (an empty list if there are none).

        Raises errors.IndexerError if the connection has been closed.

        """
        if self._index is None:
            raise errors.IndexerError("IndexerConnection has been closed")
        # .items() rather than .iteritems(): the latter only exists on
        # Python 2 dictionaries.
        return [subfacet
                for subfacet, parent in self._facet_hierarchy.items()
                if parent == facet]
예제 #8
0
    def iter_synonyms(self, prefix=""):
        """Iterate over the synonyms held in the index.

         - `prefix`: if specified, only synonym keys with this prefix will be
           returned.

        Each item produced is a 2-tuple.  Its first element is the key: itself
        a 2-tuple of the term or terms which will be synonym expanded,
        followed by the fieldname specified (or None if no fieldname).  Its
        second element is a tuple of strings holding the synonyms for that
        key.

        The items are suitable for the dict() builtin, so you can write
        things like:

         >>> conn = IndexerConnection('foo')
         >>> conn.add_synonym('foo', 'bar')
         >>> conn.add_synonym('foo bar', 'baz')
         >>> conn.add_synonym('foo bar', 'foo baz')
         >>> dict(conn.iter_synonyms())
         {('foo', None): ('bar',), ('foo bar', None): ('baz', 'foo baz')}

        Raises errors.IndexerError if the connection has been closed.

        """
        index = self._index
        if index is None:
            raise errors.IndexerError("IndexerConnection has been closed")
        mappings = self._field_mappings
        return SynonymIter(index, mappings, prefix)
예제 #9
0
    def add_synonym(self,
                    original,
                    synonym,
                    field=None,
                    original_field=None,
                    synonym_field=None):
        """Register a synonym in the index.

         - `original` is the word or words which will be synonym expanded in
           searches (if multiple words are specified, each word should be
           separated by a single space).
         - `synonym` is a synonym for `original`.
         - `field` is the field which the synonym is specific to.  If no field
           is specified, the synonym will be used for searches which are not
           specific to any particular field.
         - `original_field` is the field used when building the key for
           `original`; it falls back to `field` when None.
         - `synonym_field` is the field used when building the stored value
           for `synonym`; it falls back to `field` when None.

        Raises errors.IndexerError if the connection has been closed.

        """
        if self._index is None:
            raise errors.IndexerError("IndexerConnection has been closed")

        key_field = field if original_field is None else original_field
        value_field = field if synonym_field is None else synonym_field

        synonym_key = self._make_synonym_key(original, key_field)
        # FIXME - this only works for exact fields which have no upper case
        # characters, or single words
        synonym_value = self._make_synonym_key(synonym, value_field)
        self._index.add_synonym(synonym_key, synonym_value)
예제 #10
0
    def process(self, document):
        """Turn an UnprocessedDocument into a ProcessedDocument.

        The settings in this database (the actions configured per field) are
        applied to each field of `document`, and the resulting
        ProcessedDocument is returned.  Fields with no actions defined are
        ignored.

        Note that this processing will be automatically performed if an
        UnprocessedDocument is supplied to the add() or replace() methods of
        IndexerConnection.  This method is exposed to allow the processing to
        be performed separately, which may be desirable if you wish to manually
        modify the processed document before adding it to the database, or if
        you want to split processing of documents from adding documents to the
        database for performance reasons.

        Raises errors.IndexerError if the connection has been closed.

        """
        if self._index is None:
            raise errors.IndexerError("IndexerConnection has been closed")

        processed = ProcessedDocument(self._field_mappings)
        processed.id = document.id
        action_context = ActionContext(self._index)

        for field in document.fields:
            try:
                field_actions = self._field_actions[field.name]
            except KeyError:
                # No actions are defined for this field: skip it.
                continue
            else:
                field_actions.perform(processed, field.value, action_context)

        return processed
예제 #11
0
    def get_fields_with_actions(self):
        """Get a list of field names which have actions defined.

        The result is always a new list, so it is safe for the caller to
        modify it or to keep it while the field actions change.

        Raises errors.IndexerError if the connection has been closed.

        """
        if self._index is None:
            raise errors.IndexerError("IndexerConnection has been closed")
        # keys() is a live view on Python 3; copy it so that a real list is
        # returned on every Python version, as documented.
        return list(self._field_actions.keys())
예제 #12
0
    def _assert_facet(self, facet):
        """Check that `facet` is a declared facet field.

        Returns None if one of the actions registered for the field is
        FieldActions.FACET; otherwise raises errors.IndexerError.

        A KeyError propagates if `facet` has no entry in the field actions.

        """
        declared_actions = self._field_actions[facet]._actions
        if any(action == FieldActions.FACET for action in declared_actions):
            return
        raise errors.IndexerError("Field %r is not indexed as a facet" % facet)
예제 #13
0
    def remove_subfacet(self, subfacet):
        """Drop any facet hierarchy relationship recorded for a subfacet.

        Nothing happens (and the configuration is not marked as modified) if
        `subfacet` has no entry in the hierarchy.

        Raises errors.IndexerError if the connection has been closed.

        """
        if self._index is None:
            raise errors.IndexerError("IndexerConnection has been closed")
        hierarchy = self._facet_hierarchy
        if subfacet not in hierarchy:
            return
        del hierarchy[subfacet]
        self._config_modified = True
예제 #14
0
    def iterids(self):
        """Get an iterator which returns all the ids in the database.

        The unique ids are currently returned in binary lexicographical sort
        order, but this should not be relied on.

        Raises errors.IndexerError if the connection has been closed.

        """
        index = self._index
        if index is None:
            raise errors.IndexerError("IndexerConnection has been closed")
        # Unique ids are stored as terms carrying the 'Q' prefix.
        all_terms = index.allterms()
        return PrefixedTermIter('Q', all_terms)
예제 #15
0
    def get_doccount(self):
        """Return the number of documents in the database.

        The count includes documents which have been added or removed but
        not yet flushed().

        Raises errors.IndexerError if the connection has been closed.

        """
        index = self._index
        if index is None:
            raise errors.IndexerError("IndexerConnection has been closed")
        doccount = index.get_doccount()
        return doccount
예제 #16
0
    def add(self, document):
        """Add a new document to the search engine index.

        If the document has a id set, and the id already exists in
        the database, an exception will be raised.  Use the replace() method
        instead if you wish to overwrite documents.

        Returns the id of the newly added document (making up a new
        unique ID if no id was set).

        The supplied document may be an instance of UnprocessedDocument, or an
        instance of ProcessedDocument.

        If an id has to be made up, it is only set on the document while the
        document is being added: the document's original id (None) is put
        back afterwards, even if adding the document fails.

        Raises errors.IndexerError if the connection has been closed, or if
        the document's id is already present in the index.

        """
        if self._index is None:
            raise errors.IndexerError("IndexerConnection has been closed")
        if not hasattr(document, '_doc'):
            # It's not a processed document.
            document = self.process(document)

        # Ensure that we have a id
        orig_id = document.id
        if orig_id is None:
            id = self._allocate_id()
            document.id = id
        else:
            id = orig_id
            if self._index.term_exists('Q' + id):
                raise errors.IndexerError(
                    "Document ID of document supplied to add() is not unique.")

        try:
            # Add the document.
            xapdoc = document.prepare()
            self._index.add_document(xapdoc)

            if self._max_mem is not None:
                self._mem_buffered += self._get_bytes_used_by_doc_terms(xapdoc)
                if self._mem_buffered > self._max_mem:
                    self.flush()
        finally:
            # Restore the document's original id whether or not the add
            # succeeded, so a failure cannot leave the temporary id behind
            # on the caller's document.
            if id is not orig_id:
                document.id = orig_id
        return id
예제 #17
0
    def delete(self, id):
        """Remove a document from the search engine index.

        If the id does not already exist in the database, this method
        will have no effect (and will not report an error).

        Raises errors.IndexerError if the connection has been closed.

        """
        index = self._index
        if index is None:
            raise errors.IndexerError("IndexerConnection has been closed")
        # Documents are addressed by their unique id term ('Q' + id).
        unique_term = 'Q' + id
        index.delete_document(unique_term)
예제 #18
0
    def set_max_mem_use(self, max_mem=None, max_mem_proportion=None):
        """Set the maximum memory to use.

        This call allows the amount of memory to use to buffer changes to be
        set.  This will affect the speed of indexing, but should not result in
        other changes to the indexing.

        Note: this is an approximate measure - the actual amount of memory used
        may exceed the specified amount.  Also, note that future versions of
        xapian are likely to implement this differently, so this setting may be
        entirely ignored.

        The absolute amount of memory to use (in bytes) may be set by setting
        max_mem.  Alternatively, the proportion of the available memory may be
        set by setting max_mem_proportion (this should be a value between 0 and
        1).  Only one of the two may be given; if neither is given, or if a
        proportion is given but the amount of physical memory cannot be
        determined, the limit is cleared (set to None).

        Setting too low a value will result in excessive flushing, and very
        slow indexing.  Setting too high a value will result in excessive
        buffering, leading to swapping, and very slow indexing.

        A reasonable default for max_mem_proportion for a system which is
        dedicated to indexing is probably 0.5: if other tasks are also being
        performed on the system, the value should be lowered.

        Raises errors.IndexerError if the connection has been closed, or if
        both max_mem and max_mem_proportion are specified.

        """
        if self._index is None:
            raise errors.IndexerError("IndexerConnection has been closed")

        absolute_given = max_mem is not None
        proportion_given = max_mem_proportion is not None
        if absolute_given and proportion_given:
            raise errors.IndexerError("Only one of max_mem and "
                                      "max_mem_proportion may be specified")

        limit = max_mem
        if proportion_given:
            physmem = memutils.get_physical_memory()
            if physmem is not None:
                limit = int(physmem * max_mem_proportion)

        self._max_mem = limit
예제 #19
0
    def clear_synonyms(self, original, field=None):
        """Remove every synonym registered for a word (or phrase).

         - `original` is the word or phrase whose synonyms are removed.
         - `field` is the field which this synonym is specific to.  If no field
           is specified, the synonym will be used for searches which are not
           specific to any particular field.

        Raises errors.IndexerError if the connection has been closed.

        """
        index = self._index
        if index is None:
            raise errors.IndexerError("IndexerConnection has been closed")
        synonym_key = self._make_synonym_key(original, field)
        index.clear_synonyms(synonym_key)
예제 #20
0
    def add_term(self, field, term, wdfinc=1, positions=None):
        """Add a term to the document.

        Terms are the main unit of information used for performing searches.

        - `field` is the field to add the term to.
        - `term` is the term to add.
        - `wdfinc` is the value to increase the within-document-frequency
          measure for the term by.
        - `positions` is the positional information to add for the term.
          This may be None to indicate that there is no positional information,
          or may be an integer to specify one position, or may be a sequence of
          integers to specify several positions.  (Note that the wdf is not
          increased automatically for each position: if you add a term at 7
          positions, and the wdfinc value is 2, the total wdf for the term will
          only be increased by 2, not by 14.)

        Raises errors.IndexerError if the prefixed term is longer than 220
        characters.

        """
        prefix = self._fieldmappings.get_prefix(field)

        # A ':' separates the prefix from terms starting with an ASCII upper
        # case letter.  The explicit ordinal range test is used, rather than
        # "isupper()", to ensure that we match the check performed by the
        # queryparser, regardless of our locale.
        separator = ''
        if len(term) > 0 and ord('A') <= ord(term[0]) <= ord('Z'):
            separator = ':'
        full_term = prefix + separator + term
        full_length = len(full_term)

        # Note - xapian currently restricts term lengths to about 248
        # characters - except that zero bytes are encoded in two bytes, so
        # in practice a term of length 125 characters could be too long.
        # Xapian will give an error when commit() is called after such
        # documents have been added to the database.
        # As a simple workaround, we give an error here for terms over 220
        # characters, which will catch most occurrences of the error early.
        #
        # In future, it might be good to change to a hashing scheme in this
        # situation (or for terms over, say, 64 characters), where the
        # characters after position 64 are hashed (we obviously need to do this
        # hashing at search time, too).
        if full_length > 220:
            raise errors.IndexerError(
                "Field %r is too long: maximum length "
                "220 - was %d (%r)" %
                (field, full_length, full_term))

        xapdoc = self._doc
        if positions is None:
            xapdoc.add_term(full_term, wdfinc)
            return
        if isinstance(positions, int):
            xapdoc.add_posting(full_term, positions, wdfinc)
            return

        # Several positions: apply the wdf increment once, then record each
        # position without increasing the wdf any further.
        xapdoc.add_term(full_term, wdfinc)
        for position in positions:
            xapdoc.add_posting(full_term, position, 0)
예제 #21
0
    def flush(self):
        """Apply recent changes to the database.

        Any pending configuration changes are stored first; the index is then
        flushed and the buffered-memory counter is reset to zero.

        If an exception occurs, any changes since the last call to flush() may
        be lost.

        Raises errors.IndexerError if the connection has been closed.

        """
        index = self._index
        if index is None:
            raise errors.IndexerError("IndexerConnection has been closed")
        config_pending = self._config_modified
        if config_pending:
            self._store_config()
        index.flush()
        self._mem_buffered = 0
예제 #22
0
    def add_subfacet(self, subfacet, facet):
        """Record `facet` as the parent of `subfacet` in the facet hierarchy.

        Any existing relationship for that subfacet is replaced, and the
        configuration is marked as modified.

        Raises a KeyError if either facet or subfacet is not a field,
        and an IndexerError if either facet or subfacet is not a facet field
        (or if the connection has been closed).
        """
        if self._index is None:
            raise errors.IndexerError("IndexerConnection has been closed")
        # Validate the parent first, then the child, before recording anything.
        for fieldname in (facet, subfacet):
            self._assert_facet(fieldname)
        self._facet_hierarchy[subfacet] = facet
        self._config_modified = True
예제 #23
0
    def clear_field_actions(self, fieldname):
        """Remove every action defined for the specified field.

        This does not report an error if there are already no actions for the
        specified field (and in that case the configuration is not marked as
        modified).

        Note that this change to the configuration will not be preserved on
        disk until the next call to flush().

        Raises errors.IndexerError if the connection has been closed.

        """
        if self._index is None:
            raise errors.IndexerError("IndexerConnection has been closed")
        field_actions = self._field_actions
        if fieldname not in field_actions:
            return
        del field_actions[fieldname]
        self._config_modified = True
예제 #24
0
    def iter_subfacets(self):
        """Get an iterator over the facet hierarchy.

        The iterator returns 2-tuples, in which the first item is the
        subfacet and the second item is its parent facet.

        The return values are suitable for the dict() builtin, for example:

         >>> conn = IndexerConnection('db')
         >>> conn.add_field_action('foo', FieldActions.FACET)
         >>> conn.add_field_action('bar', FieldActions.FACET)
         >>> conn.add_field_action('baz', FieldActions.FACET)
         >>> conn.add_subfacet('foo', 'bar')
         >>> conn.add_subfacet('baz', 'bar')
         >>> dict(conn.iter_subfacets())
         {'foo': 'bar', 'baz': 'bar'}

        Raises errors.IndexerError if the connection has been closed, or if
        facets are listed among the missing xapian features.

        """
        if self._index is None:
            raise errors.IndexerError("IndexerConnection has been closed")
        if 'facets' in _checkxapian.missing_features:
            raise errors.IndexerError(
                "Facets unsupported with this release of xapian")
        # .iteritems() only exists on Python 2 dictionaries; wrapping .items()
        # in iter() yields an iterator over the same pairs on every version.
        return iter(self._facet_hierarchy.items())
예제 #25
0
    def remove_synonym(self,
                       original,
                       synonym,
                       field=None,
                       original_field=None,
                       synonym_field=None):
        """Remove a synonym from the index.

         - `original` is the word or words which will be synonym expanded in
           searches (if multiple words are specified, each word should be
           separated by a single space).
         - `synonym` is a synonym for `original`.
         - `field` is the field which this synonym is specific to.  If no field
           is specified, the synonym will be used for searches which are not
           specific to any particular field.
         - `original_field` is the field used when building the key for
           `original`; it falls back to `field` when None.
         - `synonym_field` is the field used when building the stored value
           for `synonym`; it falls back to `field` when None.

        The arguments mirror those of add_synonym(): passing the same values
        which were used to add a synonym removes that synonym.

        Raises errors.IndexerError if the connection has been closed.

        """
        if self._index is None:
            raise errors.IndexerError("IndexerConnection has been closed")
        if original_field is None:
            original_field = field
        if synonym_field is None:
            synonym_field = field
        key = self._make_synonym_key(original, original_field)
        # add_synonym() stores the synonym encoded with _make_synonym_key(),
        # so the value to remove must be encoded in exactly the same way for
        # it to match the stored entry.
        value = self._make_synonym_key(synonym, synonym_field)
        self._index.remove_synonym(key, value)
예제 #26
0
    def add_field_action(self, fieldname, fieldtype, **kwargs):
        """Register an action to be performed on a field.

        - `fieldname` is the name of the field the action applies to.
        - `fieldtype` is the action to add, and `kwargs` are its parameters;
          both are passed through to the field's FieldActions object, which
          is created first if the field has no actions yet.

        Note that this change to the configuration will not be preserved on
        disk until the next call to flush().

        Raises errors.IndexerError if the connection has been closed.

        """
        if self._index is None:
            raise errors.IndexerError("IndexerConnection has been closed")
        try:
            field_actions = self._field_actions[fieldname]
        except KeyError:
            # First action for this field: create its container.
            field_actions = FieldActions(fieldname)
            self._field_actions[fieldname] = field_actions
        field_actions.add(self._field_mappings, fieldtype, **kwargs)
        self._config_modified = True
예제 #27
0
    def get_facets_for_query_type(self, query_type, association):
        """Get the set of facets associated with a query type.

        Only those facets associated with the query type in the specified
        manner are returned; `association` must be one of
        IndexerConnection.FacetQueryType_Preferred or
        IndexerConnection.FacetQueryType_Never.

        If the query type has no facets associated with it at all, None is
        returned.  If it has associated facets, but none of them is
        associated in the specified manner, an empty set is returned.

        Raises errors.IndexerError if the connection has been closed.

        """
        if self._index is None:
            raise errors.IndexerError("IndexerConnection has been closed")
        if query_type not in self._facet_query_table:
            return None
        facet_dict = self._facet_query_table[query_type]
        # .items() rather than .iteritems(): the latter only exists on
        # Python 2 dictionaries.
        return set(facet for facet, assoc in facet_dict.items()
                   if assoc == association)
예제 #28
0
    def add(self, field_mappings, action, **kwargs):
        """Add an action to perform on a field.

        - `field_mappings` is the object on which the prefix and slot(s)
          needed by the action are registered for this field.
        - `action` is one of the FieldActions constants STORE_CONTENT,
          INDEX_EXACT, INDEX_FREETEXT, SORTABLE, COLLAPSE, TAG or FACET.
        - `kwargs` are the parameters of the action; each name must be one
          of the parameter names listed for the action in `_action_info`.

        SORTABLE and COLLAPSE are both recorded under the combined
        SORT_AND_COLLAPSE action, with a 'type' parameter (None for COLLAPSE;
        defaulting to 'string' for SORTABLE).

        Adding an action whose parameters are identical to an already
        recorded entry for that action has no further effect.

        Raises errors.IndexerError if the action is unsupported or unknown,
        if a parameter name is not recognised, if the field would be indexed
        both as exact and as free text, or if the field is already marked for
        sorting with a different sort type.

        """
        if action in self._unsupported_actions:
            raise errors.IndexerError(
                "Action unsupported with this release of xapian")

        if action not in (
                FieldActions.STORE_CONTENT,
                FieldActions.INDEX_EXACT,
                FieldActions.INDEX_FREETEXT,
                FieldActions.SORTABLE,
                FieldActions.COLLAPSE,
                FieldActions.TAG,
                FieldActions.FACET,
        ):
            raise errors.IndexerError("Unknown field action: %r" % action)

        # As used below: info[0] is the name shown in error messages, info[1]
        # holds the accepted parameter names, and info[3] says whether the
        # action needs a 'prefix' and/or a 'slot'.
        info = self._action_info[action]

        # Check parameter names
        for key in kwargs.keys():
            if key not in info[1]:
                raise errors.IndexerError(
                    "Unknown parameter name for action %r: %r" %
                    (info[0], key))

        # Fields cannot be indexed both with "EXACT" and "FREETEXT": whilst we
        # could implement this, the query parser wouldn't know what to do with
        # searches.
        if action == FieldActions.INDEX_EXACT:
            if FieldActions.INDEX_FREETEXT in self._actions:
                raise errors.IndexerError(
                    "Field %r is already marked for indexing "
                    "as free text: cannot mark for indexing "
                    "as exact text as well" % self._fieldname)
        if action == FieldActions.INDEX_FREETEXT:
            if FieldActions.INDEX_EXACT in self._actions:
                raise errors.IndexerError(
                    "Field %r is already marked for indexing "
                    "as exact text: cannot mark for indexing "
                    "as free text as well" % self._fieldname)

        # Fields cannot be indexed as more than one type for "SORTABLE": to
        # implement this, we'd need to use a different prefix for each sortable
        # type, but even then the search end wouldn't know what to sort on when
        # searching.  Also, if they're indexed as "COLLAPSE", the value must be
        # stored in the right format for the type "SORTABLE".
        if action == FieldActions.SORTABLE or action == FieldActions.COLLAPSE:
            if action == FieldActions.COLLAPSE:
                sorttype = None
            else:
                try:
                    sorttype = kwargs['type']
                except KeyError:
                    sorttype = 'string'
            kwargs['type'] = sorttype
            # From here on the action is recorded under the combined key;
            # note that `info` still refers to the action originally passed.
            action = FieldActions.SORT_AND_COLLAPSE

            try:
                oldsortactions = self._actions[FieldActions.SORT_AND_COLLAPSE]
            except KeyError:
                oldsortactions = ()

            if len(oldsortactions) > 0:
                # Leaves oldsorttype set to the type of the last existing
                # entry.
                for oldsortaction in oldsortactions:
                    oldsorttype = oldsortaction['type']

                if sorttype == oldsorttype or oldsorttype is None:
                    # Use new type
                    self._actions[action] = []
                elif sorttype is None:
                    # Use old type
                    return
                else:
                    raise errors.IndexerError("Field %r is already marked for "
                                              "sorting, with a different "
                                              "sort type" % self._fieldname)

        # Register the prefix and/or slot(s) which the action requires.
        if 'prefix' in info[3]:
            field_mappings.add_prefix(self._fieldname)
        if 'slot' in info[3]:
            purposes = info[3]['slot']
            if isinstance(purposes, basestring):
                # A single purpose, given as a string.
                field_mappings.add_slot(self._fieldname, purposes)
            else:
                # Several purposes: look for a slot number already assigned
                # to one of them, and pass it (or None if there is none) when
                # adding a slot for every purpose.
                slotnum = None
                for purpose in purposes:
                    slotnum = field_mappings.get_slot(self._fieldname, purpose)
                    if slotnum is not None:
                        break
                for purpose in purposes:
                    field_mappings.add_slot(self._fieldname,
                                            purpose,
                                            slotnum=slotnum)

        # Make an entry for the action
        if action not in self._actions:
            self._actions[action] = []

        # Check for repetitions of actions
        for old_action in self._actions[action]:
            if old_action == kwargs:
                return

        # Append the action to the list of actions
        self._actions[action].append(kwargs)