Example #1
    def build_index(self, remove_old=True):

        if remove_old:
            remove_directory(self.search_db_dir)

        self.__xappy = xappy.IndexerConnection(self.search_db_dir)

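        # store module_uid so it can be returned with search results; index
        # keyword_term as free text, without positional information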
        self.__xappy.add_field_action("module_uid",
                                      xappy.FieldActions.STORE_CONTENT)

        self.__xappy.add_field_action("keyword_term",
                                      xappy.FieldActions.INDEX_FREETEXT,
                                      nopos=True)

        for module_keyword in self.__keywords:
            for keyword in module_keyword[2]:
                module_doc = xappy.UnprocessedDocument()

                module_doc.fields.append(xappy.Field("module_uid", keyword[0]))

                terms = list(split_word(keyword[1], True))
                module_doc.fields.append(
                    xappy.Field("keyword_term", ' '.join(terms)))

                self.__xappy.add(module_doc)

        self.__xappy.close()
Example #2
    def document(self, connection, retry=False):
        """
        return a xapian index document from the context.

        we can introspect the connection to discover relevant fields available.
        """
        doc = xappy.UnprocessedDocument()

        if interfaces.ENABLE_LOGGING:
            log.debug("Indexing Document %r" % self.context)

        # object type
        doc.fields.append(
            xappy.Field("object_type", self.context.__class__.__name__))

        # object kind
        doc.fields.append(
            xappy.Field("object_kind",
                        domain.object_hierarchy_type(self.context)))

        try:
            #TODO: loop thru all available languages and index the translations
            self.index(doc)
        except (exceptions.OperationalError, exceptions.InvalidRequestError):
            # detach the dbapi connection from the pool, close it,
            # and retry the index operation (once)
            log.error("Indexing Connection Hosed, Discarding")
            db_connection = metadata.bind.contextual_connect()
            db_connection.begin().rollback()
            db_connection.detach()
            db_connection.close()
            if not retry:
                return self.document(connection, retry=True)
            raise

        return doc
Example #3
    def __buildDoc(self, article):
        if article.getTitle() is None:
            return None

        doc = xappy.UnprocessedDocument()
        doc.fields.append(xappy.Field("title", article.getTitle()))
        if article.getAbstract() is not None:
            doc.fields.append(xappy.Field("text", article.getAbstract()))

        # 'INDEX_EXACT' terms have a maximum length of 220, but xappy adds the
        # prefix "XA" to each term, leaving an effective maximum of 218 characters
        for chemical in article.getChemicals():
            if len(chemical) < 219:
                doc.fields.append(xappy.Field("chemical_exact", chemical))

        for keyword in article.getKeywords():
            doc.fields.append(xappy.Field("keyword", keyword))

        for mesh in article.getMeSH():
            doc.fields.append(xappy.Field("mesh", mesh))

        doc.id = str(article.getPMID())
        return doc
Example #4
    def document(self):    
        self.doc.id = self.resource.url
        self.doc.fields.append(xappy.Field('type', self.type))
        
        for field in self.fields:
            value = getattr(self.resource, field)
            if value:
                self.doc.fields.append(xappy.Field(field, value))

        self.add()
        return self.doc
Example #5
 def index(self, doc):    
     # index schema fields
     super(AttachedFileIndexer, self).index(doc)
     if self.context.type.attached_file_type_name == 'document':
         
         if self.context.file_mimetype == 'application/vnd.oasis.opendocument.text':
             doc.fields.append(xappy.Field('doc_text', readODT(self.context.file_data)))
         
         if self.context.file_mimetype == 'application/pdf':
             doc.fields.append(xappy.Field('doc_text', readPDF(self.context.file_data)))
         
         if self.context.file_mimetype == 'text/plain':
             doc.fields.append(xappy.Field('doc_text', self.context.file_data))
Example #6
    def index(self, doc):
        # index schema fields
        super(AttachmentIndexer, self).index(doc)
        if self.context.mimetype == 'application/vnd.oasis.opendocument.text':
            doc.fields.append(
                xappy.Field('doc_text', readODT(self.context.data)))

        if self.context.mimetype == 'application/pdf':
            doc.fields.append(
                xappy.Field('doc_text', readPDF(self.context.data)))

        if self.context.mimetype == 'text/plain':
            doc.fields.append(xappy.Field('doc_text', str(self.context.data)))
Example #7
    def indexPerson(context, doc):
        """Defined as a static method for reuse across multiple types of
        users and the different indexers of them."""
        # index first name / last name separately, keeping the unicode values
        # so the combined title can be formatted before encoding
        first_name, last_name = u"", u""
        if context.first_name:
            first_name = context.first_name
            doc.fields.append(
                xappy.Field('core.person-fname', first_name.encode('utf-8')))
        if context.last_name:
            last_name = context.last_name
            doc.fields.append(
                xappy.Field('core.person-lname', last_name.encode('utf-8')))

        # index the full name as the display name / title
        doc.fields.append(
            xappy.Field('title',
                        (u"%s %s" % (first_name, last_name)).encode('utf-8')))
Example #8
    def index(self, doc):
        
        # index schema fields
        super(UserIndexer, self).index(doc)

        # index person attributes
        self.indexPerson(self.context, doc)

        # store email in index
        if self.context.email:
            doc.fields.append(xappy.Field('core.person-email', self.context.email))

        # index active status
        value = 'False' if self.context.active_p in ("I", "D", None) else 'True'
        doc.fields.append(xappy.Field('core.active', value))
Example #9
def MakeIndex():

    connection = xappy.IndexerConnection('kis/lib/data')

    cursor = connections['default'].cursor()
    cursor.execute("SELECT rec_id,name FROM t_show_store_eisup_list;")
    data = cursor.fetchall()

    for item in data:
        doc = xappy.UnprocessedDocument()
        doc.fields.append(xappy.Field('kod', item[0].encode('utf-8')))
        doc.fields.append(xappy.Field('name', item[1].encode('utf-8')))
        connection.add(doc)

    connection.flush()
    connection.close()
Example #10
    def reindexAll(klass, connection, flush_threshold=500):
        instances = Session().query(klass.domain_model).all()
        resolver = ContentResolver()
        log.warning("Bulk Indexing %r"%klass)
        count = 0
        for i in instances:
            for lang in languages():
                count += 1
                doc_id = resolver.id(i, language=lang.value)
                translated = translation.translate_obj(i, lang.value)
                translated.language = lang.value
                indexer = klass(translated)
                create = False
                doc = indexer.document(connection)
                doc.id = doc_id
                doc.fields.append(xappy.Field('resolver', resolver.scheme))
                #print "*****************"
                #print doc.id
                #print translated.__class__.__name__
                #for field in doc.fields:
                #    print field.name, "=", field.value
                connection.replace(doc)
    
                if count % flush_threshold == 0:
                    log.warning("Flushing %s %s Records" % (flush_threshold, klass))
                    connection.flush()

        # flush the remainder
        connection.flush()
Example #11
    def index_files(self, doc, pkg_dict):
        yum_pkg = pkg_dict['pkg']
        if yum_pkg is not None:
            desktop_file_cache = RPMCache(yum_pkg, self.yum_base,
                                          self.cache_path)
            desktop_file_cache.open()
            for filename in yum_pkg.filelist:
                if filename.endswith('.desktop'):
                    # index apps
                    print "        indexing desktop file %s" % os.path.basename(
                        filename)
                    f = desktop_file_cache.open_file(
                        filename, decompress_filter='*.desktop')
                    if f is None:
                        print "could not open desktop file"
                        continue

                    self.index_desktop_file(doc, f, pkg_dict,
                                            desktop_file_cache)
                    f.close()
                if filename.startswith('/usr/bin'):
                    # index executables
                    print("        indexing exe file %s" %
                          os.path.basename(filename))
                    exe_name = filter_search_string(os.path.basename(filename))
                    doc.fields.append(
                        xappy.Field('cmd', "EX__%s__EX" % exe_name))

            desktop_file_cache.close()
Example #12
 def add_to_index(data):
     doc = xappy.UnprocessedDocument()
     doc.id = data.id
     for k, v in data.items():
         doc.fields.append(xappy.Field(k, v))
     doc = indexer.process(doc)
     doc.data = data
     indexer.replace(doc)
Example #13
 def process(self, connection):
     if interfaces.DEBUG_LOG: log.info("Adding %r" % self.document_id)
     instance = self.resolve()
     if not instance or instance == interfaces.OP_REQUEUE:
         return instance
     doc = interfaces.IIndexer(instance).document(connection)
     doc.id = self.document_id
     doc.fields.append(xappy.Field('resolver', self.resolver_id or ''))
     connection.add(doc)
Example #14
    def document(self, connection, retry=False):
        """
        return a xapian index document from the context.

        we can introspect the connection to discover relevant fields available.
        """
        doc = xappy.UnprocessedDocument()

        if interfaces.ENABLE_LOGGING:
            log.debug("Indexing Document %r"%self.context)

        # object type
        doc.fields.append(
            xappy.Field("object_type", self.context.__class__.__name__))

        # object kind
        doc.fields.append(
            xappy.Field("object_kind", domain.object_hierarchy_type(self.context)))
        
        # object language
        doc.fields.append(
            xappy.Field("language", self.context.language))
        
        doc.fields.append(xappy.Field("status", getattr(self.context, "status", "")))
        
        doc.fields.append(xappy.Field("owner", str(getattr(self.context, "owner_id", ""))))
        
        try:
            status_date = getattr(self.context, "status_date")
            if status_date:
                status_date = date_value(status_date)
                
            doc.fields.append(xappy.Field("status_date", status_date))
        except Exception:
            pass    
        
        title = ""
        try:
            title = bungeni.ui.search.ISearchResult(self.context).title
        except Exception:
            pass
        
        doc.fields.append(xappy.Field("title", title))
            
        try:
            #TODO: loop thru all available languages and index the translations
            self.index(doc)
            
        except (exceptions.OperationalError, exceptions.InvalidRequestError):
            # detach the dbapi connection from the pool, close it,
            # and retry the index operation (once)
            log.error("Indexing Connection Hosed, Discarding")
            db_connection = metadata.bind.contextual_connect()
            db_connection.begin().rollback()
            db_connection.detach()
            db_connection.close()
            if not retry:
                return self.document(connection, retry=True)
            raise

        return doc
Example #15
 def index_document(self, conn, d):
     if hasattr(self.bench, "process_document_xappy"):
         self.bench.process_document_xappy(d)
     doc = xappy.UnprocessedDocument()
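     # 'd' is an iterable of (field name, value or list of values) pairs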
     for key, values in d:
         if not isinstance(values, list):
             values = [values]
         for value in values:
             doc.fields.append(xappy.Field(key, value))
     conn.add(doc)
Example #16
 def process(self, connection):
     if interfaces.DEBUG_LOG: log.info("Modifying %r" % self.document_id)
     instance = self.resolve()
     #        if not instance or instance == interfaces.OP_REQUEUE:
     #            return instance
     if ITranslatable.providedBy(instance):
         for lang in languages():
             translated_instance = translation.translate_obj(
                 instance, lang=lang.value)
             translated_instance.language = lang.value
             doc = interfaces.IIndexer(translated_instance).document(
                 connection)
             doc.id = self.get_resolver().id(instance, language=lang.value)
             doc.fields.append(xappy.Field('resolver', self.resolver_id))
             print doc.id
             connection.replace(doc)
     else:
         doc = interfaces.IIndexer(instance).document(connection)
         doc.id = self.document_id
         doc.fields.append(xappy.Field('resolver', self.resolver_id))
         connection.replace(doc)
Example #17
    def _add_fields_to_document(self,
                                request,
                                document,
                                fields=None,
                                multivalued_fields=None):

        fields_to_stem = ['title', 'content']
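        # NOTE: StemmedField is assumed to be a project-specific Field subclass
        # that adds a stemmed copy of the value; the request presumably supplies
        # the stemming language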

        if fields is None:
            fields = {}
        if multivalued_fields is None:
            multivalued_fields = {}

        for field, value in fields.iteritems():
            document.fields.append(xappy.Field(field, value))
            if field in fields_to_stem:
                document.fields.append(StemmedField(field, value, request))

        for field, values in multivalued_fields.iteritems():
            for value in values:
                document.fields.append(xappy.Field(field, value))
Example #18
    def index_desktop_file(self, doc, desktop_file, pkg_dict,
                           desktop_file_cache):
        doc.fields.append(xappy.Field('tag', 'desktop'))

        dp = DesktopParser(desktop_file)
        category = dp.get('Categories', '')

        for c in category.split(';'):
            if c:
                c = filter_search_string(c)
                doc.fields.append(xappy.Field('category_tags', c))
                # add exact match also
                doc.fields.append(
                    xappy.Field('category_tags', "EX__%s__EX" % c))

        icon = dp.get('Icon', '')
        if icon:
            print "Icon %s" % icon
            generated_icon = self.icon_cache.generate_icon(
                icon, desktop_file_cache)
            if generated_icon is not None:
                pkg_dict['icon'] = icon
Example #19
 def add(self):
 
     hidden = '1' if self.resource.hidden else '0'
     self.doc.fields.append(xappy.Field('hidden', hidden))
                       
     if hidden == '0':
         title = self.resource.title
         self.append('title', title)
         self.append('sortable_title', sortable(title))
         self.append('searchable_text', self.resource.astext())
         self.append('author', self.resource.book.author)
         self.append('language', self.resource.book.language)
         self.append('genre', self.resource.genre)
Example #20
 def add_to_search_index(self, mission, id, chunk, weight, timestamp):
     """
     Take some text and a set of speakers (also text) and add a document
     to the search index, with the id stuffed in the document data.
     """
     lines = chunk['lines']
     doc = xappy.UnprocessedDocument()
     doc.fields.append(xappy.Field("mission", mission))
     doc.fields.append(xappy.Field("weight", weight))
     doc.fields.append(xappy.Field("transcript", self.transcript_name))
     for line in lines:
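         # replace link markup of the form [type:target|label] with just the label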
         text = re.sub(
             r"\[\w+:([^]]+)\|([^]]+)\]",
             lambda m: m.group(2),
             line['text'],
         )
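         # replace [type:target] markup with just the target text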
         text = re.sub(
             r"\[\w+:([^]]+)\]",
             lambda m: m.group(1),
             text,
         )
         # also strip tags from text, because they're lame lame lame
         text = strip_tags(text)
         doc.fields.append(xappy.Field("text", text))
         # grab the character to get some more text to index under speaker
         ch = self.characters.get(line['speaker'], None)
         if ch:
             ch2 = ch.current_shift(timestamp)
             doc.fields.append(
                 xappy.Field("speaker_identifier", ch2.identifier))
             doc.fields.append(xappy.Field("speaker", ch2.short_name))
             doc.fields.append(xappy.Field("speaker", ch.short_name))
         else:
             doc.fields.append(
                 xappy.Field("speaker_identifier", line['speaker']))
             doc.fields.append(xappy.Field("speaker", line['speaker']))
     doc.id = id
     try:
         search_db.replace(search_db.process(doc))
     except xappy.errors.IndexerError:
         print "umm, error"
         print id, lines
         raise
Example #21
    def index_tags(self, doc, pkg):
        if not self.tagger_cache:
            return

        name = pkg['name']
        tags = self.tagger_cache.get(name, [])
        for tag_info in tags:
            tag_name = tag_info['tag']
            total = tag_info['total']
            if total > 0:
                print "    adding '%s' tag (%d)" % (tag_name.encode('utf-8'),
                                                    total)
            for i in range(total):
                doc.fields.append(xappy.Field('tag', tag_name))
Example #22
 def index_tags(self, doc, package):
     name = package['name']
     response = local.http.get(self.tagger_url + '/api/v1/' + name)
     if not bool(response):
         log.warn("Failed to get tagger info for %r, %r" % (name, response))
         return
     tags = response.json()['tags']
     for tag_info in tags:
         tag_name = tag_info['tag']
         total = tag_info['total']
         if total > 0:
             log.debug("    adding '%s' tag (%d)" %
                       (tag_name.encode('utf-8'), total))
         for i in range(total):
             doc.fields.append(xappy.Field('tag', tag_name))
Example #23
 def update_timestamp(self, timestamp):
     doc = self.get_timestamp_doc()
     if doc:
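         # write the timestamp straight into the underlying xapian document's data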
         doc._doc.set_data(str(timestamp))
         self.iconn.replace(doc)
         self.iconn.flush()
     else:
         doc = xappy.UnprocessedDocument()
         doc.fields.append(xappy.Field('key', '_last_run_'))
         processed_doc = self.iconn.process(doc, False)
         processed_doc._doc.set_data(str(timestamp))
         # preempt xappy's processing of data
         processed_doc._data = None
         self.iconn.add(processed_doc)
         self.iconn.flush()
Example #24
 def _factory(db, doc):
     ixdoc = xappy.UnprocessedDocument()
     ixdoc.id = doc['_id']
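     # NOTE: 'i' is assumed to be the index-field definition list from the
     # enclosing scope; each D provides 'name', 'data' and an optional 'factory'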
     for D in i:
         for data in D['data']:
             data, num_items = expand(data, doc)
             for n in xrange(num_items):
                 if 'factory' in D:
                     out = import_func(D['factory'])(doc)
                     if isinstance(out, ListType):
                         for index_text in out:
                             print 'INDEX_TEXT', index_text
                             ixdoc.fields.append(
                                 xappy.Field(D['name'], index_text))
                     else:
                         index_text = out
                         print 'INDEX_TEXT', index_text
                         ixdoc.fields.append(
                             xappy.Field(D['name'], index_text))
                 else:
                     index_text = (data % {'n': n}) % api.dotted(doc)
                     print 'INDEX_TEXT', index_text
                     ixdoc.fields.append(xappy.Field(D['name'], index_text))
     return ixdoc
Example #25
    def index(self, doc):
        " populate a xapian document with fields to be indexed from context "
        # create index of all text fields for the document
        for field_index_name, field in self.fields():
            if not isinstance(field, (schema.Text, schema.ASCII)):
                continue
            value = field.query(self.context, '')
            if value is None:
                value = u''

            if not isinstance(value, basestring):
                value = unicode(value)

            #if interfaces.ENABLE_LOGGING:
            #    log.debug("  field %s as %s, %r"%(field.__name__, index_field_name, value))

            doc.fields.append(xappy.Field(field_index_name, value))
Example #26
 def reindexAll(klass, connection, flush_threshold=500):
     instances = Session().query(klass.domain_model).all()
     resolver = ContentResolver()
     log.warning("Bulk Indexing %r"%klass)
     count = 0
     for i in instances:
         count += 1
         doc_id = resolver.id(i)
         indexer = klass(i)
         create = False
         doc = indexer.document(connection)
         doc.id = doc_id
         doc.fields.append(xappy.Field('resolver', resolver.scheme))
         connection.replace(doc)
         if count % flush_threshold == 0:
             log.warning("Flushing %s %s Records" % (flush_threshold, klass))
             connection.flush()
     # flush the remainder
     connection.flush()
Example #27
    def index_files_of_interest(self, doc, package_dict):
        name = package_dict['name']
        branch = package_dict['branch']

        if branch == 'master':
            branch = 'rawhide'

        url = "/".join([self.mdapi_url, branch, "files", name])
        response = local.http.get(url)
        if not bool(response):
            log.warn("Failed to get file list for %r, %r" % (name, response))
            return
        data = response.json()
        for entry in data['files']:
            filenames = entry['filenames'].split('/')
            for filename in filenames:
                if filename.startswith('/usr/bin'):
                    # index executables
                    log.info("        indexing exe file %s" %
                             os.path.basename(filename))
                    exe_name = filter_search_string(os.path.basename(filename))
                    doc.fields.append(
                        xappy.Field('cmd', "EX__%s__EX" % exe_name))
Example #28
    def _create_document(self, package):
        doc = xappy.UnprocessedDocument()
        filtered_name = filter_search_string(package['name'])
        filtered_summary = filter_search_string(package['summary'])
        filtered_description = filter_search_string(package['description'])

        doc.fields.append(
            xappy.Field('exact_name',
                        'EX__' + filtered_name + '__EX',
                        weight=10.0))

        name_parts = filtered_name.split('_')
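        # append the name fields repeatedly to boost their term frequency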
        for i in range(20):
            if len(name_parts) > 1:
                for part in name_parts:
                    doc.fields.append(xappy.Field('name', part, weight=1.0))
            doc.fields.append(xappy.Field('name', filtered_name, weight=10.0))

        for i in range(4):
            doc.fields.append(
                xappy.Field('summary', filtered_summary, weight=1.0))
        doc.fields.append(
            xappy.Field('description', filtered_description, weight=0.2))

        self.index_files_of_interest(doc, package)
        self.index_tags(doc, package)

        for sub_package in package['sub_pkgs']:
            filtered_sub_package_name = filter_search_string(
                sub_package['name'])
            log.info("       indexing subpackage %s" % sub_package['name'])

            doc.fields.append(
                xappy.Field('subpackages',
                            filtered_sub_package_name,
                            weight=1.0))
            doc.fields.append(
                xappy.Field('exact_name',
                            'EX__' + filtered_sub_package_name + '__EX',
                            weight=10.0))

            self.index_files_of_interest(doc, sub_package)

            # fedora-tagger does not provide special tags for sub-packages...
            #self.index_tags(doc, sub_package)

            # Set special sub-package icon if appstream has one
            sub_package['icon'] = self.icon_cache.get(sub_package['name'],
                                                      self.default_icon)

            # If the parent has a dull icon, give it ours!
            if sub_package['icon'] != self.default_icon \
                and package['icon'] == self.default_icon:
                package['icon'] = sub_package['icon']

            # remove anything we don't want to store
            del sub_package['package']

        # @@: Right now we're only indexing the first part of the
        # provides/requires, and not boolean comparison or version
        #for requires in package.requires:
        #    print requires[0]
        #    doc.fields.append(xappy.Field('requires', requires[0]))
        #for provides in package.provides:
        #    doc.fields.append(xappy.Field('provides', provides[0]))

        # remove anything we don't want to store and then store data in
        # json format
        del package['package']

        return doc
Example #29
 def append(self, field, value):
     if value:
         self.doc.fields.append(xappy.Field(field, value))
Example #30
    def index_pkgs(self):
        yum_pkgs = self.index_yum_pkgs()
        pkg_count = 0

        for pkg in yum_pkgs.values():
            pkg_count += 1

            doc = xappy.UnprocessedDocument()
            filtered_name = filter_search_string(pkg['name'])
            filtered_summary = filter_search_string(pkg['summary'])
            filtered_description = filter_search_string(pkg['description'])

            if pkg['name'] != filtered_name:
                print("%d: indexing %s as %s" %
                      (pkg_count, pkg['name'], filtered_name))
            else:
                print("%d: indexing %s" % (pkg_count, pkg['name']))

            doc.fields.append(
                xappy.Field('exact_name',
                            'EX__' + filtered_name + '__EX',
                            weight=10.0))

            name_parts = filtered_name.split('_')
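            # append the name fields repeatedly to boost their term frequency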
            for i in range(20):
                if len(name_parts) > 1:
                    for part in name_parts:
                        doc.fields.append(xappy.Field('name', part,
                                                      weight=1.0))
                doc.fields.append(
                    xappy.Field('name', filtered_name, weight=10.0))

            for i in range(4):
                doc.fields.append(
                    xappy.Field('summary', filtered_summary, weight=1.0))
            doc.fields.append(
                xappy.Field('description', filtered_description, weight=0.2))

            self.index_files(doc, pkg)
            self.index_tags(doc, pkg)

            for sub_pkg in pkg['sub_pkgs']:
                pkg_count += 1
                filtered_sub_pkg_name = filter_search_string(sub_pkg['name'])
                if filtered_sub_pkg_name != sub_pkg['name']:
                    print("%d:    indexing subpkg %s as %s" %
                          (pkg_count, sub_pkg['name'], filtered_sub_pkg_name))
                else:
                    print("%d:    indexing subpkg %s" %
                          (pkg_count, sub_pkg['name']))

                doc.fields.append(
                    xappy.Field('subpackages',
                                filtered_sub_pkg_name,
                                weight=1.0))
                doc.fields.append(
                    xappy.Field('exact_name',
                                'EX__' + filtered_sub_pkg_name + '__EX',
                                weight=10.0))

                self.index_files(doc, sub_pkg)
                self.index_tags(doc, sub_pkg)
                if sub_pkg['icon'] != self.default_icon and \
                        pkg['icon'] == self.default_icon:
                    pkg['icon'] = sub_pkg['icon']

                # remove anything we don't want to store
                del sub_pkg['pkg']

            # @@: Right now we're only indexing the first part of the
            # provides/requires, and not boolean comparison or version
            #for requires in pkg.requires:
            #    print requires[0]
            #    doc.fields.append(xappy.Field('requires', requires[0]))
            #for provides in pkg.provides:
            #    doc.fields.append(xappy.Field('provides', provides[0]))

            # remove anything we don't want to store and then store data in
            # json format
            del pkg['pkg']
            del pkg['src_pkg']

            processed_doc = self.iconn.process(doc, False)
            processed_doc._doc.set_data(json.dumps(pkg))
            # preempt xappy's processing of data
            processed_doc._data = None
            self.iconn.add(processed_doc)

        self.icon_cache.close()

        return pkg_count