def build_index(self, remove_old=True):
    """Build the keyword search database.

    Optionally wipes any previous index directory first, then indexes
    every (module_uid, keyword-text) pair held in self.__keywords.
    """
    if remove_old:
        remove_directory(self.search_db_dir)
    indexer = xappy.IndexerConnection(self.search_db_dir)
    self.__xappy = indexer
    # module_uid is stored verbatim; keyword terms are free-text searchable
    indexer.add_field_action("module_uid", xappy.FieldActions.STORE_CONTENT)
    indexer.add_field_action("keyword_term", xappy.FieldActions.INDEX_FREETEXT,
                             nopos=True)
    for entry in self.__keywords:
        for uid_and_text in entry[2]:
            doc = xappy.UnprocessedDocument()
            doc.fields.append(xappy.Field("module_uid", uid_and_text[0]))
            split_terms = list(split_word(uid_and_text[1], True))
            doc.fields.append(
                xappy.Field("keyword_term", ' '.join(split_terms)))
            indexer.add(doc)
    indexer.close()
def document(self, connection, retry=False):
    """Return a xapian index document from the context.

    We can introspect the connection to discover relevant fields
    available.  Indexes the object's type and hierarchy kind, then
    delegates field indexing to self.index().  On a hosed SQLAlchemy
    connection the dbapi connection is discarded and the operation is
    retried once.
    """
    doc = xappy.UnprocessedDocument()
    if interfaces.ENABLE_LOGGING:
        log.debug("Indexing Document %r" % self.context)
    # object type
    doc.fields.append(
        xappy.Field("object_type", self.context.__class__.__name__))
    # object kind
    doc.fields.append(
        xappy.Field("object_kind", domain.object_hierarchy_type(self.context)))
    try:
        #TODO: loop thru all available languages and index the translations
        self.index(doc)
    # BUGFIX: the legacy form `except A, B:` caught only OperationalError
    # and REBOUND exceptions.InvalidRequestError to the caught instance;
    # a parenthesized tuple catches both exception types.
    except (exceptions.OperationalError, exceptions.InvalidRequestError):
        # detach the dbapi connection from the pool, close it,
        # and retry the index operation (once)
        log.error("Indexing Connection Hosed, Discarding")
        db_connection = metadata.bind.contextual_connect()
        db_connection.begin().rollback()
        db_connection.detach()
        db_connection.close()
        if not retry:
            return self.document(connection, retry=True)
        raise
    # BUGFIX: callers (process/reindexAll) assign doc.id to the returned
    # value; the original fell off the end and returned None on success.
    return doc
def __buildDoc(self, article):
    """Build a xappy UnprocessedDocument from *article*, or None.

    Articles without a title are not indexable and yield None.
    Returns a document keyed by the article's PMID.
    """
    # idiom fix: identity comparison with None (`is`), not `==`
    if article.getTitle() is None:
        return None
    doc = xappy.UnprocessedDocument()
    doc.fields.append(xappy.Field("title", article.getTitle()))
    # idiom fix: the original used `if ...: pass else: ...`
    abstract = article.getAbstract()
    if abstract is not None:
        doc.fields.append(xappy.Field("text", abstract))
    # 'INDEX_EXACT' - maximum length 220, but prefix "XA" is added to each
    # term in the document, so the effective maximum length is 218.
    for chemical in article.getChemicals():
        if len(chemical) < 219:
            doc.fields.append(xappy.Field("chemical_exact", chemical))
    for keyword in article.getKeywords():
        doc.fields.append(xappy.Field("keyword", keyword))
    for mesh in article.getMeSH():
        doc.fields.append(xappy.Field("mesh", mesh))
    doc.id = str(article.getPMID())
    return doc
def document(self):
    """Assemble and return the xappy document for this resource.

    The resource URL is the document id; each configured field is
    added only when it has a truthy value.
    """
    self.doc.id = self.resource.url
    self.doc.fields.append(xappy.Field('type', self.type))
    for name in self.fields:
        content = getattr(self.resource, name)
        if content:
            self.doc.fields.append(xappy.Field(name, content))
    self.add()
    return self.doc
def index(self, doc):
    """Populate *doc* with this attached file's extracted text.

    Extends the schema-field indexing from the base class with a
    'doc_text' field extracted according to the file's mimetype
    (ODT, PDF or plain text).
    """
    # index schema fields
    super(AttachedFileIndexer, self).index(doc)
    # NOTE(review): only attachments of the 'document' type appear to
    # carry indexable text — confirm against the attached_file_type table
    if self.context.type.attached_file_type_name == 'document':
        if self.context.file_mimetype == 'application/vnd.oasis.opendocument.text':
            doc.fields.append(xappy.Field('doc_text',
                readODT(self.context.file_data)))
        if self.context.file_mimetype == 'application/pdf':
            doc.fields.append(xappy.Field('doc_text',
                readPDF(self.context.file_data)))
        if self.context.file_mimetype == 'text/plain':
            doc.fields.append(xappy.Field('doc_text', self.context.file_data))
def index(self, doc):
    """Index schema fields plus extracted 'doc_text' for this attachment."""
    # base class handles the regular schema fields
    super(AttachmentIndexer, self).index(doc)
    mimetype = self.context.mimetype
    # mimetype is a single value, so at most one branch can fire
    if mimetype == 'application/vnd.oasis.opendocument.text':
        doc.fields.append(
            xappy.Field('doc_text', readODT(self.context.data)))
    if mimetype == 'application/pdf':
        doc.fields.append(
            xappy.Field('doc_text', readPDF(self.context.data)))
    if mimetype == 'text/plain':
        doc.fields.append(xappy.Field('doc_text', str(self.context.data)))
def indexPerson(context, doc):
    """Index a person's first/last name plus a combined display title.

    Defined as a standalone helper for reuse across multiple types of
    users and different indexers of them.
    """
    # index first name / last name separately
    fname = ""
    lname = ""
    if context.first_name:
        fname = context.first_name.encode('utf-8')
        doc.fields.append(xappy.Field('core.person-fname', fname))
    if context.last_name:
        lname = context.last_name.encode('utf-8')
        doc.fields.append(xappy.Field('core.person-lname', lname))
    # index "first last" as display name / title
    full_name = (u"%s %s" % (fname, lname)).encode('utf8')
    doc.fields.append(xappy.Field('title', full_name))
def index(self, doc):
    """Index schema fields, person attributes, email and active status."""
    # index schema fields via the base class
    super(UserIndexer, self).index(doc)
    # index person attributes (name fields + display title)
    self.indexPerson(self.context, doc)
    # store email in index when present
    if self.context.email:
        doc.fields.append(
            xappy.Field('core.person-email', self.context.email))
    # active status: Inactive ("I"), Deleted ("D") and None all count
    # as not active
    inactive = self.context.active_p in ("I", "D", None)
    doc.fields.append(
        xappy.Field('core.active', 'False' if inactive else 'True'))
def MakeIndex():
    """(Re)build the xappy index from the t_show_store_eisup_list table."""
    connection = xappy.IndexerConnection('kis/lib/data')
    cursor = connections['default'].cursor()
    cursor.execute("SELECT rec_id,name FROM t_show_store_eisup_list;")
    for rec_id, name in cursor.fetchall():
        doc = xappy.UnprocessedDocument()
        doc.fields.append(xappy.Field('kod', rec_id.encode('utf-8')))
        doc.fields.append(xappy.Field('name', name.encode('utf-8')))
        connection.add(doc)
    connection.flush()
    connection.close()
def reindexAll(klass, connection, flush_threshold=500):
    """Bulk (re)index every instance of klass.domain_model, per language.

    Each instance is translated into every configured language and
    indexed once per language (the resolver id embeds the language).
    The connection is flushed every `flush_threshold` documents and
    once more at the end.
    """
    instances = Session().query(klass.domain_model).all()
    resolver = ContentResolver()
    log.warning("Bulk Indexing %r" % klass)
    count = 0
    for i in instances:
        for lang in languages():
            count += 1
            doc_id = resolver.id(i, language=lang.value)
            translated = translation.translate_obj(i, lang.value)
            translated.language = lang.value
            indexer = klass(translated)
            doc = indexer.document(connection)
            doc.id = doc_id
            doc.fields.append(xappy.Field('resolver', resolver.scheme))
            connection.replace(doc)
            if count % flush_threshold == 0:
                log.warning("Flushing %s %s Records" % (flush_threshold, klass))
                # BUGFIX: the original only logged here and never flushed,
                # so nothing hit disk until the final flush
                connection.flush()
    # flush the remainder
    connection.flush()
def index_files(self, doc, pkg_dict):
    """Index desktop files and executable names shipped by a package.

    Walks the yum package's file list: '.desktop' entries are parsed
    and indexed as applications; '/usr/bin' entries are indexed as
    exact-match command terms ("EX__name__EX").
    """
    yum_pkg = pkg_dict['pkg']
    if yum_pkg != None:
        # the cache lets us pull individual files out of the RPM payload
        desktop_file_cache = RPMCache(yum_pkg, self.yum_base, self.cache_path)
        desktop_file_cache.open()
        for filename in yum_pkg.filelist:
            if filename.endswith('.desktop'):
                # index apps
                print " indexing desktop file %s" % os.path.basename(filename)
                f = desktop_file_cache.open_file(
                    filename, decompress_filter='*.desktop')
                if f == None:
                    # best-effort: skip unreadable desktop files
                    print "could not open desktop file"
                    continue
                self.index_desktop_file(doc, f, pkg_dict, desktop_file_cache)
                f.close()
            if filename.startswith('/usr/bin'):
                # index executables
                print(" indexing exe file %s" % os.path.basename(filename))
                exe_name = filter_search_string(os.path.basename(filename))
                doc.fields.append(
                    xappy.Field('cmd', "EX__%s__EX" % exe_name))
        desktop_file_cache.close()
def add_to_index(data):
    """Process *data* into a xappy document and upsert it in the index.

    The raw data object is attached as the document payload so it can
    be returned verbatim from search results.
    """
    unprocessed = xappy.UnprocessedDocument()
    unprocessed.id = data.id
    for field_name, field_value in data.items():
        unprocessed.fields.append(xappy.Field(field_name, field_value))
    processed = indexer.process(unprocessed)
    processed.data = data
    indexer.replace(processed)
def process(self, connection):
    """Resolve this queue entry and add its document to the index."""
    if interfaces.DEBUG_LOG:
        log.info("Adding %r" % self.document_id)
    instance = self.resolve()
    # unresolvable or requeued content: hand the marker back to the caller
    if not instance or instance == interfaces.OP_REQUEUE:
        return instance
    doc = interfaces.IIndexer(instance).document(connection)
    doc.id = self.document_id
    doc.fields.append(xappy.Field('resolver', self.resolver_id or ''))
    connection.add(doc)
def document(self, connection, retry=False):
    """Return a xapian index document from the context.

    We can introspect the connection to discover relevant fields
    available.  Indexes type, kind, language, status, owner,
    status date and title, then delegates field indexing to
    self.index().  On a hosed SQLAlchemy connection the dbapi
    connection is discarded and the operation is retried once.
    """
    doc = xappy.UnprocessedDocument()
    if interfaces.ENABLE_LOGGING:
        log.debug("Indexing Document %r" % self.context)
    # object type
    doc.fields.append(
        xappy.Field("object_type", self.context.__class__.__name__))
    # object kind
    doc.fields.append(
        xappy.Field("object_kind", domain.object_hierarchy_type(self.context)))
    # object language
    doc.fields.append(xappy.Field("language", self.context.language))
    doc.fields.append(
        xappy.Field("status", getattr(self.context, "status", "")))
    doc.fields.append(
        xappy.Field("owner", str(getattr(self.context, "owner_id", ""))))
    # status date: deliberate best-effort — objects without one are
    # indexed without the field
    try:
        status_date = getattr(self.context, "status_date")
        if status_date:
            status_date = date_value(status_date)
            doc.fields.append(xappy.Field("status_date", status_date))
    except Exception:
        pass
    # title: deliberate best-effort via the search-result adapter
    title = ""
    try:
        title = bungeni.ui.search.ISearchResult(self.context).title
    except Exception:
        pass
    doc.fields.append(xappy.Field("title", title))
    try:
        #TODO: loop thru all available languages and index the translations
        self.index(doc)
    # BUGFIX: the legacy form `except A, B:` caught only OperationalError
    # and REBOUND exceptions.InvalidRequestError to the caught instance;
    # a parenthesized tuple catches both exception types.
    except (exceptions.OperationalError, exceptions.InvalidRequestError):
        # detach the dbapi connection from the pool, close it,
        # and retry the index operation (once)
        log.error("Indexing Connection Hosed, Discarding")
        db_connection = metadata.bind.contextual_connect()
        db_connection.begin().rollback()
        db_connection.detach()
        db_connection.close()
        if not retry:
            return self.document(connection, retry=True)
        raise
    # BUGFIX: callers assign doc.id to the returned value; the original
    # fell off the end and returned None on success.
    return doc
def index_document(self, conn, d):
    """Add one benchmark document *d* (iterable of key/values pairs).

    Values may be scalars or lists; each value becomes its own field.
    """
    # give the benchmark a chance to pre-process the raw document
    if hasattr(self.bench, "process_document_xappy"):
        self.bench.process_document_xappy(d)
    doc = xappy.UnprocessedDocument()
    for key, values in d:
        # normalise scalars to a one-element list
        if not isinstance(values, list):
            values = [values]
        doc.fields.extend(xappy.Field(key, value) for value in values)
    conn.add(doc)
def process(self, connection):
    """Resolve this queue entry and replace its document(s) in the index.

    Translatable content is re-indexed once per configured language,
    each with a language-specific document id; other content is
    indexed once under self.document_id.
    """
    if interfaces.DEBUG_LOG:
        log.info("Modifying %r" % self.document_id)
    instance = self.resolve()
    # if not instance or instance == interfaces.OP_REQUEUE:
    #    return instance
    if ITranslatable.providedBy(instance):
        for lang in languages():
            translated_instance = translation.translate_obj(
                instance, lang=lang.value)
            translated_instance.language = lang.value
            doc = interfaces.IIndexer(translated_instance).document(
                connection)
            # the id embeds the language so each translation is its own doc
            doc.id = self.get_resolver().id(instance, language=lang.value)
            doc.fields.append(xappy.Field('resolver', self.resolver_id))
            # NOTE(review): leftover debug print — candidate for removal
            print doc.id
            connection.replace(doc)
    else:
        doc = interfaces.IIndexer(instance).document(connection)
        doc.id = self.document_id
        doc.fields.append(xappy.Field('resolver', self.resolver_id))
        connection.replace(doc)
def _add_fields_to_document(self, request, document, fields=None,
                            multivalued_fields=None):
    """Append single-valued and multivalued fields to *document*.

    Fields named in fields_to_stem additionally get a stemmed variant.
    """
    fields_to_stem = ['title', 'content']
    # avoid mutable default arguments
    if fields is None:
        fields = {}
    if multivalued_fields is None:
        multivalued_fields = {}
    for name, value in fields.iteritems():
        document.fields.append(xappy.Field(name, value))
        if name in fields_to_stem:
            document.fields.append(StemmedField(name, value, request))
    for name, values in multivalued_fields.iteritems():
        document.fields.extend(xappy.Field(name, v) for v in values)
def index_desktop_file(self, doc, desktop_file, pkg_dict,
                       desktop_file_cache):
    """Index one .desktop file: category tags and the application icon.

    Tags the document as a desktop app, indexes each ';'-separated
    category (free text plus an "EX__..__EX" exact-match term) and,
    when an icon can be generated, records the icon name on pkg_dict.
    """
    doc.fields.append(xappy.Field('tag', 'desktop'))
    dp = DesktopParser(desktop_file)
    category = dp.get('Categories', '')
    for c in category.split(';'):
        if c:
            c = filter_search_string(c)
            doc.fields.append(xappy.Field('category_tags', c))
            # add exact match also
            doc.fields.append(
                xappy.Field('category_tags', "EX__%s__EX" % c))
    icon = dp.get('Icon', '')
    if icon:
        print "Icon %s" % icon
        generated_icon = self.icon_cache.generate_icon(
            icon, desktop_file_cache)
        if generated_icon != None:
            pkg_dict['icon'] = icon
def add(self):
    """Index this resource, skipping the content fields when hidden."""
    hidden = '1' if self.resource.hidden else '0'
    self.doc.fields.append(xappy.Field('hidden', hidden))
    # hidden resources get no searchable content at all
    if hidden != '0':
        return
    title = self.resource.title
    self.append('title', title)
    self.append('sortable_title', sortable(title))
    self.append('searchable_text', self.resource.astext())
    self.append('author', self.resource.book.author)
    self.append('language', self.resource.book.language)
    self.append('genre', self.resource.genre)
def add_to_search_index(self, mission, id, chunk, weight, timestamp):
    """
    Take some text and a set of speakers (also text) and add a
    document to the search index, with the id stuffed in the document
    data.
    """
    lines = chunk['lines']
    doc = xappy.UnprocessedDocument()
    doc.fields.append(xappy.Field("mission", mission))
    doc.fields.append(xappy.Field("weight", weight))
    doc.fields.append(xappy.Field("transcript", self.transcript_name))
    for line in lines:
        # reduce [type:target|label] wiki-style links to their label...
        text = re.sub(
            r"\[\w+:([^]]+)\|([^]]+)\]",
            lambda m: m.group(2),
            line['text'],
        )
        # ...and [type:target] links to their target
        text = re.sub(
            r"\[\w+:([^]]+)\]",
            lambda m: m.group(1),
            text,
        )
        # also strip tags from text, because they're lame lame lame
        text = strip_tags(text)
        doc.fields.append(xappy.Field("text", text))
        # grab the character to get some more text to index under speaker
        ch = self.characters.get(line['speaker'], None)
        if ch:
            # index both the shift-specific identity and the base name
            ch2 = ch.current_shift(timestamp)
            doc.fields.append(
                xappy.Field("speaker_identifier", ch2.identifier))
            doc.fields.append(xappy.Field("speaker", ch2.short_name))
            doc.fields.append(xappy.Field("speaker", ch.short_name))
        else:
            # unknown speaker: fall back to the raw identifier
            doc.fields.append(
                xappy.Field("speaker_identifier", line['speaker']))
            doc.fields.append(xappy.Field("speaker", line['speaker']))
    doc.id = id
    try:
        search_db.replace(search_db.process(doc))
    except xappy.errors.IndexerError:
        print "umm, error"
        print id, lines
        raise
def index_tags(self, doc, pkg):
    """Weight the document by tags from the local tagger cache.

    Each tag is appended once per vote ('total'), so more-voted tags
    carry more weight in search results.
    """
    # no cache configured: tags are simply skipped
    if not self.tagger_cache:
        return
    name = pkg['name']
    tags = self.tagger_cache.get(name, [])
    for tag_info in tags:
        tag_name = tag_info['tag']
        total = tag_info['total']
        if total > 0:
            print " adding '%s' tag (%d)" % (tag_name.encode('utf-8'), total)
            # repetition is the weighting mechanism
            for i in range(total):
                doc.fields.append(xappy.Field('tag', tag_name))
def index_tags(self, doc, package):
    """Weight the document by tags fetched from the tagger web service.

    Each tag is appended once per vote ('total'), so more-voted tags
    carry more weight in search results.
    """
    name = package['name']
    response = local.http.get(self.tagger_url + '/api/v1/' + name)
    if not bool(response):
        log.warn("Failed to get tagger info for %r, %r" % (name, response))
        return
    for tag_info in response.json()['tags']:
        total = tag_info['total']
        if total <= 0:
            continue
        tag_name = tag_info['tag']
        log.debug(" adding '%s' tag (%d)" % (tag_name.encode('utf-8'), total))
        # repetition is the weighting mechanism
        for _ in range(total):
            doc.fields.append(xappy.Field('tag', tag_name))
def update_timestamp(self, timestamp):
    """Persist *timestamp* as the indexer's last-run marker.

    The value lives in the raw xapian document data (not a field) on a
    sentinel document keyed '_last_run_'; it is created on first use
    and overwritten in place afterwards.
    """
    doc = self.get_timestamp_doc()
    if doc:
        # sentinel exists: overwrite its raw data payload in place
        doc._doc.set_data(str(timestamp))
        self.iconn.replace(doc)
        self.iconn.flush()
    else:
        doc = xappy.UnprocessedDocument()
        doc.fields.append(xappy.Field('key', '_last_run_'))
        processed_doc = self.iconn.process(doc, False)
        processed_doc._doc.set_data(str(timestamp))
        # preempt xappy's processing of data
        processed_doc._data = None
        self.iconn.add(processed_doc)
        self.iconn.flush()
def _factory(db, doc):
    """Build a xappy document for couch document *doc*.

    NOTE(review): `i`, `expand`, `import_func` and `api` come from the
    enclosing scope — each spec entry D names a target field and either
    a factory callable (dotted path) or a %-template expanded over the
    doc's dotted keys; verify against the index spec definition.
    """
    ixdoc = xappy.UnprocessedDocument()
    ixdoc.id = doc['_id']
    for D in i:
        for data in D['data']:
            # expand may fan one template out over num_items repetitions
            data, num_items = expand(data, doc)
            for n in xrange(num_items):
                if 'factory' in D:
                    # factory path: callable produces the text (or a list)
                    out = import_func(D['factory'])(doc)
                    if isinstance(out, ListType):
                        for index_text in out:
                            print 'INDEX_TEXT', index_text
                            ixdoc.fields.append(
                                xappy.Field(D['name'], index_text))
                    else:
                        index_text = out
                        print 'INDEX_TEXT', index_text
                        ixdoc.fields.append(
                            xappy.Field(D['name'], index_text))
                else:
                    # template path: substitute the repetition counter,
                    # then the doc's dotted keys
                    index_text = (data % {'n': n}) % api.dotted(doc)
                    print 'INDEX_TEXT', index_text
                    ixdoc.fields.append(xappy.Field(D['name'], index_text))
    return ixdoc
def index(self, doc):
    """Populate a xapian document with fields to be indexed from context.

    Only Text/ASCII schema fields are indexed; missing values become
    the empty string and non-string values are coerced to unicode.
    """
    # create index of all text fields for the document
    for field_index_name, field in self.fields():
        if not isinstance(field, (schema.Text, schema.ASCII)):
            continue
        value = field.query(self.context, '')
        if value is None:
            value = u''
        elif not isinstance(value, basestring):
            value = unicode(value)
        doc.fields.append(xappy.Field(field_index_name, value))
def reindexAll(klass, connection, flush_threshold=500):
    """Bulk (re)index every instance of klass.domain_model.

    The connection is flushed every `flush_threshold` documents and
    once more at the end.
    """
    instances = Session().query(klass.domain_model).all()
    resolver = ContentResolver()
    log.warning("Bulk Indexing %r" % klass)
    count = 0
    for i in instances:
        count += 1
        doc_id = resolver.id(i)
        indexer = klass(i)
        doc = indexer.document(connection)
        doc.id = doc_id
        doc.fields.append(xappy.Field('resolver', resolver.scheme))
        connection.replace(doc)
        if count % flush_threshold == 0:
            log.warning("Flushing %s %s Records" % (flush_threshold, klass))
            # BUGFIX: the original only logged here and never flushed,
            # so nothing hit disk until the final flush
            connection.flush()
    # flush the remainder
    connection.flush()
def index_files_of_interest(self, doc, package_dict):
    """Index /usr/bin executables for a package via the mdapi service."""
    name = package_dict['name']
    branch = package_dict['branch']
    # mdapi calls the development branch 'rawhide', not 'master'
    if branch == 'master':
        branch = 'rawhide'
    url = "/".join([self.mdapi_url, branch, "files", name])
    response = local.http.get(url)
    if not bool(response):
        log.warn("Failed to get file list for %r, %r" % (name, response))
        return
    for entry in response.json()['files']:
        # NOTE(review): splitting on '/' yields path components, which can
        # never start with '/usr/bin' — verify the separator used by mdapi
        for filename in entry['filenames'].split('/'):
            if not filename.startswith('/usr/bin'):
                continue
            # index executables
            log.info(" indexing exe file %s" % os.path.basename(filename))
            exe_name = filter_search_string(os.path.basename(filename))
            doc.fields.append(
                xappy.Field('cmd', "EX__%s__EX" % exe_name))
def _create_document(self, package):
    """Build the xappy document for *package* and its sub-packages.

    Field repetition is a crude boost on top of the explicit weight=
    values (name terms x20, summary x4).  Also resolves icons and
    strips keys that should not be stored in the document payload.
    """
    doc = xappy.UnprocessedDocument()
    filtered_name = filter_search_string(package['name'])
    filtered_summary = filter_search_string(package['summary'])
    filtered_description = filter_search_string(package['description'])
    doc.fields.append(
        xappy.Field('exact_name', 'EX__' + filtered_name + '__EX',
                    weight=10.0))
    name_parts = filtered_name.split('_')
    # repeat name terms 20x to boost name matches over other fields
    for i in range(20):
        if len(name_parts) > 1:
            for part in name_parts:
                doc.fields.append(xappy.Field('name', part, weight=1.0))
        doc.fields.append(xappy.Field('name', filtered_name, weight=10.0))
    for i in range(4):
        doc.fields.append(
            xappy.Field('summary', filtered_summary, weight=1.0))
    doc.fields.append(
        xappy.Field('description', filtered_description, weight=0.2))
    self.index_files_of_interest(doc, package)
    self.index_tags(doc, package)
    for sub_package in package['sub_pkgs']:
        filtered_sub_package_name = filter_search_string(
            sub_package['name'])
        log.info(" indexing subpackage %s" % sub_package['name'])
        doc.fields.append(
            xappy.Field('subpackages', filtered_sub_package_name,
                        weight=1.0))
        doc.fields.append(
            xappy.Field('exact_name',
                        'EX__' + filtered_sub_package_name + '__EX',
                        weight=10.0))
        self.index_files_of_interest(doc, sub_package)
        # fedora-tagger does not provide special tags for sub-packages...
        #self.index_tags(doc, sub_package)
        # Set special sub-package icon if appstream has one
        sub_package['icon'] = self.icon_cache.get(sub_package['name'],
                                                  self.default_icon)
        # If the parent has a dull icon, give it ours!
        if sub_package['icon'] != self.default_icon \
                and package['icon'] == self.default_icon:
            package['icon'] = sub_package['icon']
        # remove anything we don't want to store
        del sub_package['package']
    # @@: Right now we're only indexing the first part of the
    # provides/requires, and not boolean comparison or version
    #for requires in package.requires:
    #    print requires[0]
    #    doc.fields.append(xappy.Field('requires', requires[0]))
    #for provides in package.provides:
    #    doc.fields.append(xappy.Field('provides', provides[0]))
    # remove anything we don't want to store and then store data in
    # json format
    del package['package']
    return doc
def append(self, field, value):
    """Add *field* to the document, skipping empty/falsy values."""
    if not value:
        return
    self.doc.fields.append(xappy.Field(field, value))
def index_pkgs(self):
    """Index every yum package and its sub-packages; return the count.

    Field repetition is a crude boost on top of the explicit weight=
    values (name terms x20, summary x4).  The package dict itself is
    stored as JSON in the document data.
    """
    yum_pkgs = self.index_yum_pkgs()
    pkg_count = 0
    for pkg in yum_pkgs.values():
        pkg_count += 1
        doc = xappy.UnprocessedDocument()
        filtered_name = filter_search_string(pkg['name'])
        filtered_summary = filter_search_string(pkg['summary'])
        filtered_description = filter_search_string(pkg['description'])
        if pkg['name'] != filtered_name:
            print("%d: indexing %s as %s" % (pkg_count, pkg['name'],
                                             filtered_name))
        else:
            print("%d: indexing %s" % (pkg_count, pkg['name']))
        doc.fields.append(
            xappy.Field('exact_name', 'EX__' + filtered_name + '__EX',
                        weight=10.0))
        name_parts = filtered_name.split('_')
        # repeat name terms 20x to boost name matches over other fields
        for i in range(20):
            if len(name_parts) > 1:
                for part in name_parts:
                    doc.fields.append(xappy.Field('name', part, weight=1.0))
            doc.fields.append(
                xappy.Field('name', filtered_name, weight=10.0))
        for i in range(4):
            doc.fields.append(
                xappy.Field('summary', filtered_summary, weight=1.0))
        doc.fields.append(
            xappy.Field('description', filtered_description, weight=0.2))
        self.index_files(doc, pkg)
        self.index_tags(doc, pkg)
        for sub_pkg in pkg['sub_pkgs']:
            pkg_count += 1
            filtered_sub_pkg_name = filter_search_string(sub_pkg['name'])
            if filtered_sub_pkg_name != sub_pkg['name']:
                print("%d: indexing subpkg %s as %s" %
                      (pkg_count, sub_pkg['name'], filtered_sub_pkg_name))
            else:
                print("%d: indexing subpkg %s" % (pkg_count,
                                                  sub_pkg['name']))
            doc.fields.append(
                xappy.Field('subpackages', filtered_sub_pkg_name,
                            weight=1.0))
            doc.fields.append(
                xappy.Field('exact_name',
                            'EX__' + filtered_sub_pkg_name + '__EX',
                            weight=10.0))
            self.index_files(doc, sub_pkg)
            self.index_tags(doc, sub_pkg)
            # if the parent has a dull icon, take the sub-package's
            if sub_pkg['icon'] != self.default_icon and pkg[
                    'icon'] == self.default_icon:
                pkg['icon'] = sub_pkg['icon']
            # remove anything we don't want to store
            del sub_pkg['pkg']
        # @@: Right now we're only indexing the first part of the
        # provides/requires, and not boolean comparison or version
        #for requires in pkg.requires:
        #    print requires[0]
        #    doc.fields.append(xappy.Field('requires', requires[0]))
        #for provides in pkg.provides:
        #    doc.fields.append(xappy.Field('provides', provides[0]))
        # remove anything we don't want to store and then store data in
        # json format
        del pkg['pkg']
        del pkg['src_pkg']
        processed_doc = self.iconn.process(doc, False)
        processed_doc._doc.set_data(json.dumps(pkg))
        # preempt xappy's processing of data
        processed_doc._data = None
        self.iconn.add(processed_doc)
    self.icon_cache.close()
    return pkg_count