Exemplo n.º 1
0
    def store(self, uid, properties):
        document = Document()
        document.add_value(_VALUE_UID, uid)
        term_generator = TermGenerator()
        term_generator.index_document(document, properties)

        if not self.contains(uid):
            self._database.add_document(document)
        else:
            self._database.replace_document(_PREFIX_FULL_VALUE + \
                _PREFIX_UID + uid, document)

        self._flush(True)
Exemplo n.º 2
0
    def store(self, uid, properties):
        document = Document()
        document.add_value(_VALUE_UID, uid)
        term_generator = TermGenerator()
        term_generator.index_document(document, properties)

        if not self.contains(uid):
            self._database.add_document(document)
        else:
            self._database.replace_document(_PREFIX_FULL_VALUE + \
                _PREFIX_UID + uid, document)

        self._flush(True)
Exemplo n.º 3
0
def split_unicode(text, language='en'):
    xdoc = Document()
    _index_unicode(xdoc, text, '', language, 1)
    words = []
    for term_list_item in xdoc:
        term = unicode(term_list_item.term, 'utf-8')
        for termpos in term_list_item.positer:
            words.append((termpos, term))
    words.sort()
    return [word[1] for word in words]
Exemplo n.º 4
0
def _make_PhraseQuery(field_cls, value, prefix):
    # Get the words
    # XXX It's too complex (slow), we must use xapian
    #     Problem => _index_cjk
    xdoc = Document()
    # XXX Language = 'en' by default
    _index(xdoc, field_cls, value, prefix, 'en')
    words = []
    for term_list_item in xdoc:
        term = term_list_item.term
        for termpos in term_list_item.positer:
            words.append((termpos, term))
    words.sort()
    words = [ word[1] for word in words ]

    # Make the query
    return Query(OP_PHRASE, words)
Exemplo n.º 5
0
    def get_xdoc_from_document(self, doc_values):
        """Return (abspath, term, xdoc) from the document (resource or values as dict)
        """
        term = None
        metadata = self._metadata
        # Check the input
        if type(doc_values) is not dict:
            raise NotImplementedError('Deprecated: doc_values should be a dict')
        fields = self._fields
        abspath = doc_values['abspath']
        # Make the xapian document
        metadata_modified = False
        xdoc = Document()
        for name, value in doc_values.iteritems():
            if name not in fields:
                warn_not_indexed_nor_stored(name)
            field_cls = fields[name]

            # New field ?
            if name not in metadata:
                info = metadata[name] = self._get_info(field_cls, name)
                metadata_modified = True
            else:
                info = metadata[name]

            # XXX This comment is no longer valid, now the key field is
            #     always abspath with field_cls = String
            # Store the key field with the prefix 'Q'
            # Comment: the key field is indexed twice, but we must do it
            #          one => to index (as the others)
            #          two => to index without split
            #          the problem is that "_encode != _index"
            if name == 'abspath':
                key_value = _reduce_size(_encode(field_cls, value))
                term = 'Q' + key_value
                xdoc.add_term(term)

            # A multilingual value?
            if isinstance(value, dict):
                for language, lang_value in value.iteritems():
                    lang_name = name + '_' + language

                    # New field ?
                    if lang_name not in metadata:
                        lang_info = self._get_info(field_cls, lang_name)
                        lang_info['from'] = name
                        metadata[lang_name] = lang_info
                        metadata_modified = True
                    else:
                        lang_info = metadata[lang_name]

                    # The value can be None
                    if lang_value is not None:
                        # Is stored ?
                        if 'value' in lang_info:
                            xdoc.add_value(lang_info['value'],
                                           _encode(field_cls, lang_value))
                        # Is indexed ?
                        if 'prefix' in lang_info:
                            # Comment: Index twice
                            _index(xdoc, field_cls, lang_value,
                                   info['prefix'], language)
                            _index(xdoc, field_cls, lang_value,
                                   lang_info['prefix'], language)
            # The value can be None
            elif value is not None:
                # Is stored ?
                if 'value' in info:
                    xdoc.add_value(info['value'], _encode(field_cls, value))
                # Is indexed ?
                if 'prefix' in info:
                    # By default language='en'
                    _index(xdoc, field_cls, value, info['prefix'], 'en')
        # Store metadata ?
        if metadata_modified:
            metadata = self._metadata
            self._db.set_metadata('metadata', dumps(metadata))
        # Ok
        return abspath, term, xdoc
Exemplo n.º 6
0
    def index_document(self, document):
        """Add a new document.
        """
        db = self._db
        metadata = self._metadata
        fields = self._fields

        # Check the input
        if type(document) is dict:
            doc_values = document
        else:
            doc_values = document.get_catalog_values()

        # Make the xapian document
        metadata_modified = False
        xdoc = Document()
        for name, value in doc_values.iteritems():
            if name not in fields:
                warn_not_indexed_nor_stored(name)
            field_cls = fields[name]

            # New field ?
            if name not in metadata:
                info = metadata[name] = self._get_info(field_cls, name)
                metadata_modified = True
            else:
                info = metadata[name]

            # XXX This comment is no longer valid, now the key field is
            #     always abspath with field_cls = String
            # Store the key field with the prefix 'Q'
            # Comment: the key field is indexed twice, but we must do it
            #          one => to index (as the others)
            #          two => to index without split
            #          the problem is that "_encode != _index"
            if name == 'abspath':
                key_value = _reduce_size(_encode(field_cls, value))
                xdoc.add_term('Q' + key_value)

            # A multilingual value?
            if isinstance(value, dict):
                for language, lang_value in value.iteritems():
                    lang_name = name + '_' + language

                    # New field ?
                    if lang_name not in metadata:
                        lang_info = self._get_info(field_cls, lang_name)
                        lang_info['from'] = name
                        metadata[lang_name] = lang_info
                        metadata_modified = True
                    else:
                        lang_info = metadata[lang_name]

                    # The value can be None
                    if lang_value is not None:
                        # Is stored ?
                        if 'value' in lang_info:
                            xdoc.add_value(lang_info['value'],
                                           _encode(field_cls, lang_value))
                        # Is indexed ?
                        if 'prefix' in lang_info:
                            # Comment: Index twice
                            _index(xdoc, field_cls, lang_value,
                                   info['prefix'], language)
                            _index(xdoc, field_cls, lang_value,
                                   lang_info['prefix'], language)
            # The value can be None
            elif value is not None:
                # Is stored ?
                if 'value' in info:
                    xdoc.add_value(info['value'], _encode(field_cls, value))
                # Is indexed ?
                if 'prefix' in info:
                    # By default language='en'
                    _index(xdoc, field_cls, value, info['prefix'], 'en')

        # TODO: Don't store two documents with the same key field!

        # Save the doc
        db.add_document(xdoc)

        # Store metadata ?
        if metadata_modified:
            db.set_metadata('metadata', dumps(metadata))
Exemplo n.º 7
0
    def get_xdoc_from_document(self, doc_values):
        """Return (abspath, term, xdoc) from the document (resource or values as dict)
        """
        term = None
        metadata = self._metadata
        # Check the input
        if type(doc_values) is not dict:
            raise NotImplementedError('Deprecated: doc_values should be a dict')
        fields = self._fields
        abspath = doc_values['abspath']
        # Make the xapian document
        metadata_modified = False
        xdoc = Document()
        for name, value in doc_values.iteritems():
            if name not in fields:
                warn_not_indexed_nor_stored(name)
            field_cls = fields[name]

            # New field ?
            if name not in metadata:
                info = metadata[name] = self._get_info(field_cls, name)
                metadata_modified = True
            else:
                info = metadata[name]

            # XXX This comment is no longer valid, now the key field is
            #     always abspath with field_cls = String
            # Store the key field with the prefix 'Q'
            # Comment: the key field is indexed twice, but we must do it
            #          one => to index (as the others)
            #          two => to index without split
            #          the problem is that "_encode != _index"
            if name == 'abspath':
                key_value = _reduce_size(_encode(field_cls, value))
                term = 'Q' + key_value
                xdoc.add_term(term)

            # A multilingual value?
            if isinstance(value, dict):
                for language, lang_value in value.iteritems():
                    lang_name = name + '_' + language

                    # New field ?
                    if lang_name not in metadata:
                        lang_info = self._get_info(field_cls, lang_name)
                        lang_info['from'] = name
                        metadata[lang_name] = lang_info
                        metadata_modified = True
                    else:
                        lang_info = metadata[lang_name]

                    # The value can be None
                    if lang_value is not None:
                        # Is stored ?
                        if 'value' in lang_info:
                            xdoc.add_value(lang_info['value'],
                                           _encode(field_cls, lang_value))
                        # Is indexed ?
                        if 'prefix' in lang_info:
                            # Comment: Index twice
                            _index(xdoc, field_cls, lang_value,
                                   info['prefix'], language)
                            _index(xdoc, field_cls, lang_value,
                                   lang_info['prefix'], language)
            # The value can be None
            elif value is not None:
                # Is stored ?
                if 'value' in info:
                    xdoc.add_value(info['value'], _encode(field_cls, value))
                # Is indexed ?
                if 'prefix' in info:
                    # By default language='en'
                    _index(xdoc, field_cls, value, info['prefix'], 'en')
        # Store metadata ?
        if metadata_modified:
            metadata = self._metadata
            self._db.set_metadata('metadata', dumps(metadata))
        # Ok
        return abspath, term, xdoc
Exemplo n.º 8
0
    def index_document(self, document):
        """Add a new document.
        """
        db = self._db
        metadata = self._metadata
        fields = self._fields

        # Check the input
        if type(document) is dict:
            doc_values = document
        elif isinstance(document, CatalogAware):
            doc_values = document.get_catalog_values()
        else:
            raise ValueError, 'the document must be a CatalogAware object'

        # Make the xapian document
        metadata_modified = False
        xdoc = Document()
        for name, value in doc_values.iteritems():
            field_cls = fields[name]

            # New field ?
            if name not in metadata:
                info = metadata[name] = self._get_info(field_cls, name)
                metadata_modified = True
            else:
                info = metadata[name]

            # A multilingual value ?
            if isinstance(value, dict):
                for language, lang_value in value.iteritems():
                    lang_name = name + '_' + language

                    # New field ?
                    if lang_name not in metadata:
                        lang_info = self._get_info(field_cls, lang_name)
                        lang_info['from'] = name
                        metadata[lang_name] = lang_info
                        metadata_modified = True
                    else:
                        lang_info = metadata[lang_name]

                    # The value can be None
                    if lang_value is not None:
                        # Is stored ?
                        if 'value' in lang_info:
                            xdoc.add_value(lang_info['value'],
                                           _encode(field_cls, lang_value))
                        # Is indexed ?
                        if 'prefix' in lang_info:
                            # Comment: Index twice
                            _index(xdoc, field_cls, lang_value,
                                   info['prefix'], language)
                            _index(xdoc, field_cls, lang_value,
                                   lang_info['prefix'], language)
            # The value can be None
            elif value is not None:
                # Is stored ?
                if 'value' in info:
                    xdoc.add_value(info['value'], _encode(field_cls, value))
                # Is indexed ?
                if 'prefix' in info:
                    # By default language='en'
                    _index(xdoc, field_cls, value, info['prefix'], 'en')

        # Store the key field with the prefix 'Q'
        # Comment: the key field is indexed twice, but we must do it
        #          one => to index (as the others)
        #          two => to index without split
        #          the problem is that "_encode != _index"
        key_field = self._key_field
        if (key_field is None or key_field not in doc_values or
            doc_values[key_field] is None):
            raise ValueError, 'the "key_field" value is compulsory'
        data = _reduce_size(_encode(fields[key_field], doc_values[key_field]))
        xdoc.add_term('Q' + data)

        # TODO: Don't store two documents with the same key field!

        # Save the doc
        db.add_document(xdoc)

        # Store metadata ?
        if metadata_modified:
            db.set_metadata('metadata', dumps(metadata))