def index(datapath):
    db = xapian.WritableDatabase("./newdb2/", xapian.DB_CREATE_OR_OPEN)
    # Set up a TermGenerator that we'll use in indexing.
    termgenerator = xapian.TermGenerator()
    termgenerator.set_stemmer(xapian.Stem("pt"))
    print(datapath)
    for fields in parse_csv_file(datapath):
        # 'fields' is a dictionary mapping from field name to value.
        # Pick out the fields we're going to index.
        docid = fields.get('DOCID', u'')
        date = fields.get('DATE', u'')
        text = fields.get('TEXT', u'')
        doc = xapian.Document()
        termgenerator.set_document(doc)
        termgenerator.index_text(text, 1, 'XD')
        termgenerator.index_text(text)
        doc.set_data(docid + ": " + text)
        idterm = u"Q" + docid
        doc.add_boolean_term(idterm)
        db.replace_document(idterm, doc)
def index(datapath, dbpath):
    db = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN)
    termgenerator = xapian.TermGenerator()
    termgenerator.set_stemmer(xapian.Stem("en"))
    for fields in parse_csv_file(datapath):
        title = fields.get('TITLE', u'')
        body = fields.get('BODY', u'')
        textClass = fields.get('CLASS', u'')
        identifier = fields.get('ID', u'')
        print '{}'.format(title)
        print '{}'.format(body)
        doc = xapian.Document()
        termgenerator.set_document(doc)
        termgenerator.index_text(textClass, 1, 'C')
        termgenerator.index_text(body, 1, 'B')
        termgenerator.index_text(identifier, 1, 'I')
        termgenerator.index_text(textClass)
        termgenerator.increase_termpos()
        termgenerator.index_text(body)
        termgenerator.increase_termpos()
        termgenerator.index_text(identifier)
        doc.set_data(json.dumps(fields, ensure_ascii=False, encoding="utf-8"))
        idterm = u"Q" + identifier
        doc.add_boolean_term(idterm)
        db.replace_document(idterm, doc)
def index():
    PATH = '/home/bone/data/thesis/data/mag'
    dbpath = PATH + '/authors.xpn'
    datapath = PATH + '/Authors.txt.gz'
    # Create or open the database we're going to be writing to.
    db = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN)
    # Set up a TermGenerator that we'll use in indexing.
    termgenerator = xapian.TermGenerator()
    termgenerator.set_stemmer(xapian.Stem("en"))
    for fields in generate_json_dict(Authors, read_gzip_lines(datapath, 'utf-8')):
        # 'fields' is a dictionary mapping from field name to value.
        # Pick out the fields we're going to index.
        title = fields.get('DisplayName', u'')
        identifier = fields.get('AuthorId', u'')
        # We make a document and tell the term generator to use this.
        doc = xapian.Document()
        # Store all the fields for display purposes.
        doc.set_data(title)
        # We use the identifier to ensure each object ends up in the
        # database only once no matter how many times we run the
        # indexer.
        idterm = identifier
        doc.add_boolean_term(idterm)
        db.replace_document(idterm, doc)
def test_userstem():
    mystem = MyStemmer()
    stem = xapian.Stem(mystem)
    expect(stem('test'), 'tst')
    stem2 = xapian.Stem(mystem)
    expect(stem2('toastie'), 'tst')
    indexer = xapian.TermGenerator()
    indexer.set_stemmer(xapian.Stem(MyStemmer()))
    doc = xapian.Document()
    indexer.set_document(doc)
    indexer.index_text('hello world')
    s = '/'
    for t in doc.termlist():
        s += t.term
        s += '/'
    expect(s, '/Zhll/Zwrld/hello/world/')
    parser = xapian.QueryParser()
    parser.set_stemmer(xapian.Stem(MyStemmer()))
    parser.set_stemming_strategy(xapian.QueryParser.STEM_ALL)
    expect_query(parser.parse_query('color television'),
                 '(clr:(pos=1) OR tlvsn:(pos=2))')
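MyStemmer is not defined in this snippet. A minimal sketch consistent with the expected terms above ('test' → 'tst', 'hello' → 'hll', 'television' → 'tlvsn') is a user stemmer that strips vowels; the exact bytes-vs-str handling varies between bindings versions, so treat this reconstruction as an assumption:

import re
import xapian

# Hypothetical stand-in for the MyStemmer referenced above: a user-defined
# stemmer (a xapian.StemImplementation subclass) that strips vowels, which
# matches 'test' -> 'tst' and 'color' -> 'clr' in the test expectations.
class MyStemmer(xapian.StemImplementation):
    def __call__(self, word):
        # Newer Python 3 bindings pass bytes; older bindings pass str.
        if isinstance(word, bytes):
            return re.sub(br'[aeiou]', b'', word)
        return re.sub(r'[aeiou]', '', word)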
def index_extension(extension):
    if extension.latest_version is None:
        return
    db = xapian.WritableDatabase(settings.XAPIAN_DB_PATH, xapian.DB_CREATE_OR_OPEN)
    termgen = xapian.TermGenerator()
    termgen.set_stemmer(xapian.Stem("en"))
    doc = xapian.Document()
    termgen.set_document(doc)
    termgen.index_text(extension.name, 10)
    termgen.index_text(extension.uuid)
    termgen.index_text(extension.description)
    doc.set_data(str(extension.pk))
    idterm = "Q%s" % (extension.pk, )
    doc.add_boolean_term(idterm)
    for shell_version in extension.visible_shell_version_map.iterkeys():
        doc.add_boolean_term("V%s" % (shell_version, ))
    db.replace_document(idterm, doc)
def index_model(dbs_root, model_name, model):
    db_path = os.path.join(dbs_root, model_name)
    db = xapian.WritableDatabase(db_path, xapian.DB_CREATE_OR_OPEN)
    indexer = xapian.TermGenerator()
    indexer.set_stemmer(xapian.Stem('english'))
    (schema, keyname) = (model.search_schema, model.search_key)
    for obj in model.objects.all():
        if hasattr(obj, 'active'):
            active = getattr(obj, 'active')
            if not active:
                continue
        key = str(getattr(obj, keyname))
        doc = xapian.Document()
        doc.set_data(key)
        indexer.set_document(doc)
        for field in schema:
            do_camel_case = False
            if field[0] == '^':
                do_camel_case = True
                field = field[1:]
            value = getattr(obj, field)
            if not value:
                continue
            if do_camel_case:
                index_camel_case(value, indexer)
            else:
                indexer.index_text(value)
        db.add_document(doc)
def init_indexing(self, changed=[]):
    ensuredir(self.db_path)
    self.database = xapian.WritableDatabase(self.db_path, xapian.DB_CREATE_OR_OPEN)
    self.indexer = xapian.TermGenerator()
    stemmer = xapian.Stem("english")
    self.indexer.set_stemmer(stemmer)
def begin_index_file(self, filepath):
    # Initialize indexer
    self.indexer = xapian.TermGenerator()
    # Set word stemmer to English
    self.indexer.set_stemmer(xapian.Stem('english'))
    self.indexer.set_database(self.index)
    self.indexer.set_flags(xapian.TermGenerator.FLAG_SPELLING)
def _build_index(self, filepath, recreate=False):
    """Build a Xapian full-text index over a txt file.

    Input:
        - filepath: txt file path; supports .gz, .bz2, and plain .txt files
        - recreate: bool, True forces recreating the db, default is False
    """
    cached_index = filepath + ".index"
    if os.path.exists(cached_index):
        if recreate:
            shutil.rmtree(cached_index)
    else:
        recreate = True
    stemmer = xapian.Stem("english")
    if not recreate:
        database = xapian.Database(cached_index)
    else:
        database = xapian.WritableDatabase(cached_index, xapian.DB_CREATE_OR_OPEN)
        indexer = xapian.TermGenerator()
        indexer.set_stemmer(stemmer)
        ext = os.path.splitext(filepath)[-1]
        if ext == ".bz2":
            import bz2
            open_func = bz2.open
        elif ext == ".gz":
            import gzip
            open_func = gzip.open
        else:
            open_func = open
        with open_func(filepath, mode="rt", encoding="utf-8") as f:
            totN, totP, totS = 0, 0, 0
            for l in tqdm(f, desc="Building index", unit=" lines"):
                l = l.strip()
                if len(l) < 1:
                    if totS > 0:
                        totP += 1
                    totS = 0
                    continue
                for sent in nltk.sent_tokenize(l):
                    # strip() returns a new string; the result must be kept
                    sent = sent.strip()
                    doc = xapian.Document()
                    doc.set_data(sent)
                    indexer.set_document(doc)
                    indexer.index_text(sent)
                    database.add_document(doc)
                    totN += 1
                    totS += 1
    self.parser = xapian.QueryParser()
    self.parser.set_stemmer(stemmer)
    self.parser.set_database(database)
    self.parser.set_stemming_strategy(xapian.QueryParser.STEM_SOME)
    self.enquire = xapian.Enquire(database)
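Since _build_index leaves self.parser and self.enquire ready for use, a companion search method might look like the sketch below; the method name and return shape are assumptions, not part of the original class:

    # Hypothetical companion method using the self.parser and
    # self.enquire objects that _build_index sets up above.
    def search(self, querystring, limit=10):
        query = self.parser.parse_query(querystring)
        self.enquire.set_query(query)
        # Each document's data is the raw sentence stored at index time.
        return [m.document.get_data().decode('utf-8')
                for m in self.enquire.get_mset(0, limit)]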
def reload_database(self):  # {{{
    '''reload the database.'''
    # create the xapian handlers
    self.database_handle = xapian.WritableDatabase(self.database,
                                                   xapian.DB_CREATE_OR_OPEN)
    self.query_parser = xapian.QueryParser()
    # needed for incremental search
    self.query_parser.set_database(self.database_handle)
    self.query_parser.set_stemmer(xapian.Stem(self.language))
    self.query_parser.set_stemming_strategy(self.query_parser.STEM_SOME)
    self.query_parser.add_prefix("title", "S")
    self.term_generator = xapian.TermGenerator()
    self.term_generator.set_stemmer(xapian.Stem(self.language))
    try:
        self.term_generator.set_stemming_strategy(self.term_generator.STEM_SOME)
    except AttributeError:
        pass
    self.enquire = xapian.Enquire(self.database_handle)
    self.sorted_e = xapian.Enquire(self.database_handle)
    # Value 2 is the lowercase form of the title
    self.sorted_e.set_sort_by_value(2, False)
def add_new_entry(db, sentence_info, title, docid):
    title_tokens = title.replace('_', ' ')
    # make a new document.
    x_doc = xapian.Document()
    # set up the indexer
    indexer = xapian.TermGenerator()
    indexer.set_stemmer(xapian.Stem("english"))
    indexer.set_document(x_doc)
    # Index each field with a suitable prefix.
    text = create_full_text(sentence_info)
    indexer.index_text(title_tokens, 1, 'S')
    indexer.index_text(title, 1, 'XS')
    indexer.index_text(text, 1, 'XT')
    # index terms
    indexer.index_text(title_tokens)
    indexer.increase_termpos()
    indexer.index_text(text)
    # store the title, text and sentence dictionary (id -> sentence)
    # in the data blob
    data_blob = {}
    data_blob['title'] = title
    data_blob['sentences'] = dict(sentence_info)
    x_doc.set_data(json.dumps(data_blob))
    x_doc.add_boolean_term(title)
    # save
    db.replace_document(docid, x_doc)
def __init__(self):
    self.db = xapian.WritableDatabase(SearchConfig.DB_PATH, xapian.DB_CREATE_OR_OPEN)
    stemmer = xapian.Stem('en')
    self.termgenerator = xapian.TermGenerator()
    self.termgenerator.set_stemmer(stemmer)
    self.sports = SportsData()
def index_voters(datapath):
    f = open(datapath)
    headers = csv.reader(f).next()
    voters = csv.DictReader(f, headers)
    db = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN)
    termgenerator = xapian.TermGenerator()
    for voternum, voter in enumerate(voters):
        if voternum % 1000 == 0:
            print voternum
        if voter['PARTY_AFFILIATION'] is None:
            # Must have reached end of file
            try:
                f.next()
            except StopIteration:
                break
            else:
                raise RuntimeError, 'Should never get here'
        doc = xapian.Document()
        termgenerator.set_document(doc)
        for field, prefix in fields.items():
            termgenerator.index_text(voter[field], 1, prefix)
            termgenerator.index_text(voter[field])
            termgenerator.increase_termpos()
        doc.set_data(json.dumps(voter))
        # Make sure this record is only in the DB once
        doc.add_boolean_term(voter['SOS_VOTERID'])
        db.replace_document(voter['SOS_VOTERID'], doc)
def extract_terms(self, text):
    """extract terms as if we were indexing"""
    doc = xapian.Document()
    tg = xapian.TermGenerator()
    tg.set_document(doc)
    # xapian sees '_' as a word character (to allow searching for
    # identifiers in source code), so split on it explicitly
    text = text.replace('_', ' ')
    tg.index_text(text)
    return [t.term.decode('utf-8') for t in doc.termlist()]
def create_index(self):
    """Create a new index, and set up its field structure."""
    log.warning("start create_index")
    self.db = xapian.WritableDatabase(self.dbpath, xapian.DB_CREATE_OR_OPEN)
    self.indexer = xapian.TermGenerator()
    self.indexer.set_stemmer(xapian.Stem("en"))
    log.warning("end create_index")
def index(datapath, dbpath):
    # Create or open the database we're going to be writing to.
    db = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN)
    # Set up a TermGenerator that we'll use in indexing.
    termgenerator = xapian.TermGenerator()
    termgenerator.set_stemmer(xapian.Stem("en"))
    for fields in parse_states(datapath):
        # 'fields' is a dictionary mapping from field name to value.
        # Pick out the fields we're going to index.
        name = fields.get('name', u'')
        description = fields.get('description', u'')
        motto = fields.get('motto', u'')
        admitted = fields.get('admitted', None)
        population = fields.get('population', None)
        order = fields.get('order', u'')
        # We make a document and tell the term generator to use this.
        doc = xapian.Document()
        termgenerator.set_document(doc)
        # Index each field with a suitable prefix.
        termgenerator.index_text(name, 1, 'S')
        termgenerator.index_text(description, 1, 'XD')
        termgenerator.index_text(motto, 1, 'XM')
        # Index fields without prefixes for general search.
        termgenerator.index_text(name)
        termgenerator.increase_termpos()
        termgenerator.index_text(description)
        termgenerator.increase_termpos()
        termgenerator.index_text(motto)
        # Add document values.
        if admitted is not None:
            doc.add_value(1, xapian.sortable_serialise(int(admitted[:4])))
            doc.add_value(2, admitted)  # YYYYMMDD
        if population is not None:
            doc.add_value(3, xapian.sortable_serialise(int(population)))
        ### Start of example code.
        midlat = fields['midlat']
        midlon = fields['midlon']
        if midlat and midlon:
            doc.add_value(4, "%f,%f" % (float(midlat), float(midlon)))
        ### End of example code.
        # Store all the fields for display purposes.
        doc.set_data(json.dumps(fields))
        # We use the order to ensure each object ends up in the
        # database only once no matter how many times we run the
        # indexer.
        idterm = u"Q" + order
        doc.add_boolean_term(idterm)
        db.replace_document(idterm, doc)
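A query-side sketch to go with this indexer: the QueryParser prefixes mirror the 'S', 'XD' and 'XM' prefixes used above. The user-visible field names ("name:", "description:", "motto:") are assumptions based on those prefixes:

import xapian

# Minimal search sketch for the index built above, assuming the same
# prefix conventions ('S' for name, 'XD' for description, 'XM' for motto).
def search(dbpath, querystring, offset=0, pagesize=10):
    db = xapian.Database(dbpath)
    queryparser = xapian.QueryParser()
    queryparser.set_stemmer(xapian.Stem("en"))
    queryparser.set_stemming_strategy(queryparser.STEM_SOME)
    # Map user-visible field names onto the prefixes used at index time.
    queryparser.add_prefix("name", "S")
    queryparser.add_prefix("description", "XD")
    queryparser.add_prefix("motto", "XM")
    enquire = xapian.Enquire(db)
    enquire.set_query(queryparser.parse_query(querystring))
    for match in enquire.get_mset(offset, pagesize):
        print(match.docid, match.document.get_data())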
def make_tg():
    termgen = xapian.TermGenerator()
    termgen.set_stemmer(xapian.Stem('en'))
    stopper = xapian.SimpleStopper()
    stopper.add('to')
    stopper.add('not')
    termgen.set_stopper(stopper)
    del stopper
    return termgen
def __init__(self, dbpath, schema_version, pid):
    self.schema = getattr(Schema, 'v%s' % schema_version)
    self.db_folder = '_%s_%s' % (dbpath, pid)
    self.s = load_scws()
    self.db = _database(self.db_folder, writable=True)
    self.termgen = xapian.TermGenerator()
    self.iter_keys = self.schema['origin_data_iter_keys']
    self.pre_func = self.schema.get('pre_func', {})
def Do_Index(self, callback=None):
    """loop through all feeds and entries and feed them to the beast"""
    def index_interrupt():
        self._indexing = False
        self._index_lock.release()
        if callback is not None:
            callback()
        self._interrupt()
        return
    if not self._index_lock.acquire(False):
        logging.info("already indexing, not trying to reindex again")
        return
    self._indexing = True
    db = self._get_db()
    c = db.cursor()
    # remove existing DB
    utils.deltree(self._storeDir)
    database = xapian.WritableDatabase(self._storeDir, xapian.DB_CREATE_OR_OPEN)
    indexer = xapian.TermGenerator()
    stemmer = xapian.Stem("english")
    indexer.set_stemmer(stemmer)
    c.execute(u"""SELECT id, title, description FROM feeds""")
    feeds = c.fetchall()
    c.execute(u"""SELECT id, feed_id, title, description, fakedate
                  FROM entries ORDER BY fakedate""")
    entries = c.fetchall()
    c.close()
    db.close()
    logging.info("indexing feeds")
    def feed_index_generator(feeds):
        for feed_id, title, description in feeds:
            try:
                doc = xapian.Document()
                forindex = title + " " + description
                # eh? we can only remove docs by term, but we can only
                # get values, so we need both it seems
                doc.add_term("f" + str(feed_id))
                doc.add_value(FEED_ID, str(feed_id))
                doc.add_value(DATE, "")
                doc.set_data(forindex)
                indexer.set_document(doc)
                indexer.index_text(forindex)
                #database.add_document(doc)
                yield doc
            except Exception, e:
                logging.error("Failed in indexDocs, feeds: %s" % str(e))
def index_aggregate(a):
    doc = xapian.Document()
    doc.add_value(VAL_URI, a.identifier)
    docid = u"URI" + a.identifier
    doc.add_term(docid)
    log.debug("Aggregate: %s" % a.identifier)

    def add_value(g, val_id, subject, predicate):
        val = []
        for s, p, o in g.triples((subject, predicate, None)):
            if not o.language or o.language == "en":  ### TODO: fix this
                val.append(o)
        if val:
            val = u", ".join(val)
            doc.add_value(val_id, val)
        return val

    ## create an abbreviated graph to store in the xapian database
    extract = Graph()
    add_value(a, VAL_LABEL, a.identifier, RDFS.label)
    for g in a.contexts():
        log.debug("Indexing: %s" % g.identifier)
        for pred in (RDF.type, RDFS.label, RDFS.comment,
                     DC.title, DC.description, FOAF.name):
            for statement in a.triples((g.identifier, pred, None)):
                extract.add(statement)
        title = add_value(g, VAL_TITLE, g.identifier, DC.title)
        if title:
            doc.add_term(u"ZT" + title[:160])
        name = add_value(g, VAL_NAME, g.identifier, FOAF.name)
        if name:
            doc.add_term(u"NA" + name[:160])
    doc.set_data(extract.serialize(format="n3"))

    ## take any fields that contain text, stem them according to their
    ## language (or english if unsupported or unspecified) and put them
    ## in the index
    termgen = xapian.TermGenerator()
    termgen.set_document(doc)
    for pred in (RDFS.label, RDFS.comment, DC.title, DC.description,
                 FOAF.name, FOAF.first_name, FOAF.last_name, FOAF.surname):
        for s, p, o in a.triples((None, pred, None)):
            termgen.increase_termpos()
            if o.language:
                try:
                    stemmer = xapian.Stem(o.language)
                except xapian.InvalidArgumentError:
                    stemmer = xapian.Stem("en")
            else:
                stemmer = xapian.Stem("en")
            termgen.set_stemmer(stemmer)
            termgen.index_text(o)
    return docid, doc
def __init__(self, dbpath, *, cjk=False):
    """Initialize indexer with dbpath."""
    self._db = None
    self.dbpath = dbpath
    self.term_generator = xapian.TermGenerator()
    self.term_generator.set_stemmer(xapian.Stem("en"))
    if cjk:
        self.term_generator.set_flags(self.term_generator.FLAG_CJK_NGRAM)
        logger.info("FLAG_CJK_NGRAM enabled")
    self.open()
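The CJK flag only helps if queries are n-grammed the same way. A hedged sketch of the query-side counterpart, assuming the matching QueryParser flag name from the same xapian release:

import xapian

# QueryParser has a matching FLAG_CJK_NGRAM; both sides must agree, or
# CJK text indexed as n-grams will not match at search time.
def parse_cjk_query(db, querystring):
    queryparser = xapian.QueryParser()
    queryparser.set_database(db)
    queryparser.set_stemmer(xapian.Stem("en"))
    return queryparser.parse_query(
        querystring,
        queryparser.FLAG_DEFAULT | queryparser.FLAG_CJK_NGRAM)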
def __init__(self, db_path):
    # Open the database for update, creating a new database if necessary.
    self.database = xapian.WritableDatabase(db_path, xapian.DB_CREATE_OR_OPEN)
    self.indexer = xapian.TermGenerator()
    self.stemmer = xapian.Stem("english")  # XXX
    self.indexer.set_stemmer(self.stemmer)
def __init__(self):
    try:
        self.extract = utilsXapian.Extract()
        self.indexer = xapian.TermGenerator()
        stemmer = xapian.Stem("french")
        self.indexer.set_stemmer(stemmer)
        self.fichier = ""
        self.database = xapian.WritableDatabase()
    except Exception, e:
        print >> sys.stderr, "Exception: %s" % str(e)
        sys.exit(1)
def index(datapath, dbpath):
    # Create or open the database we're going to be writing to.
    db = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN)
    # Set up a TermGenerator that we'll use in indexing.
    termgenerator = xapian.TermGenerator()
    termgenerator.set_stemmer(xapian.Stem("en"))
    for fields in parse_csv_file(datapath):
        # 'fields' is a dictionary mapping from field name to value.
        # Pick out the fields we're going to index.
        description = fields.get('DESCRIPTION', u'')
        title = fields.get('TITLE', u'')
        identifier = fields.get('id_NUMBER', u'')
        # We make a document and tell the term generator to use this.
        doc = xapian.Document()
        termgenerator.set_document(doc)
        # Index each field with a suitable prefix.
        termgenerator.index_text(title, 1, 'S')
        termgenerator.index_text(description, 1, 'XD')
        # Index fields without prefixes for general search.
        termgenerator.index_text(title)
        termgenerator.increase_termpos()
        termgenerator.index_text(description)
        # Store all the fields for display purposes.
        doc.set_data(json.dumps(fields, encoding='utf8'))
        ### Start of example code.
        # Parse the two values we need.
        measurements = fields.get('MEASUREMENTS', u'')
        if measurements != u'':
            numbers = numbers_from_string(measurements)
            if len(numbers) > 0:
                doc.add_value(0, xapian.sortable_serialise(max(numbers)))
        date_made = fields.get('DATE_MADE', u'')
        years = numbers_from_string(date_made)
        if len(years) > 0:
            doc.add_value(1, xapian.sortable_serialise(years[0]))
        ### End of example code.
        # We use the identifier to ensure each object ends up in the
        # database only once no matter how many times we run the
        # indexer.
        idterm = u"Q" + identifier
        doc.add_boolean_term(idterm)
        db.replace_document(idterm, doc)
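The two value slots set above (0: largest measurement, 1: year made) only become searchable if the query parser is given range processors for them. A sketch using the older ValueRangeProcessor API; newer xapian releases spell this NumberRangeProcessor and add_rangeprocessor():

import xapian

# Range-search sketch for the value slots set above.  Slot 1 holds
# sortable_serialise(year), so queries such as '1980..1989' can filter
# on it once a number range processor is registered.
def search_by_year(dbpath, querystring, offset=0, pagesize=10):
    db = xapian.Database(dbpath)
    queryparser = xapian.QueryParser()
    queryparser.set_stemmer(xapian.Stem("en"))
    queryparser.set_stemming_strategy(queryparser.STEM_SOME)
    queryparser.add_valuerangeprocessor(xapian.NumberValueRangeProcessor(1))
    enquire = xapian.Enquire(db)
    enquire.set_query(queryparser.parse_query(querystring))
    return [m.docid for m in enquire.get_mset(offset, pagesize)]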
def init(self, info, progress):
    """If needed, perform long initialisation tasks here.

    info is a dictionary with useful information.  Currently it
    contains the following values:

      "values": a dict mapping index mnemonics to index numbers

    The progress indicator can be used to report progress.
    """
    self.indexer = xapian.TermGenerator()
def xapian_init_databases():
    """Initializes all database objects."""
    xapian_ensure_db_dir(XAPIAN_DIR_NAME)
    for field in INDEXES:
        xapian_ensure_db_dir(XAPIAN_DIR_NAME + "/" + field)
        database = xapian.WritableDatabase(XAPIAN_DIR + "/" + field,
                                           xapian.DB_CREATE_OR_OPEN)
        indexer = xapian.TermGenerator()
        stemmer = xapian.Stem("english")
        indexer.set_stemmer(stemmer)
        DATABASES[field] = (database, indexer)
def index(self, document, replace, flush=True):
    """Index the document, which is a list of
    [metadata, attributes, contents].
    The first item of the 'attributes' {key: value} pairs must be XID.
    replace, if True, replaces the existing index for id with new data.
    """
    do_onetime(self.compmgr.config)
    metadata, attributes, contents = document
    xdb_w = xap.WritableDatabase(dburl, xap.DB_CREATE_OR_OPEN)
    indexer = xap.TermGenerator()
    doc = xap.Document()
    indexer.set_stemmer(stemmer)
    indexer.set_document(doc)
    if replace:
        # Identify the document and its docid
        eq = xap.Enquire(xdb_r)
        eq.set_query(xap.Query('XID%s' % attributes[0][4:]))
        matches = eq.get_mset(0, 10)
    else:
        matches = []
    # Metadata for documents
    doc.add_value(DOCTYPE, metadata['doctype'])
    doc.add_value(ID, str(metadata['id']))
    'projectname' in metadata and \
        doc.add_value(PROJECTNAME, metadata['projectname'])
    # Prefixed terms
    for attr in attributes:
        prefix, term = attr.split(':')
        indexer.index_text(term.encode('utf8'), 10, prefix)
    # Index the content
    doc.set_data(' ;;; '.join([content for content in contents]))
    for i in range(len(contents)):
        content = contents[i]
        indexer.index_text(content.encode('utf8'), i + 1)
    if replace and len(matches) >= 1:
        xdb_w.replace_document(matches[0].docid, doc)
        flush and xdb_w.flush()
    else:
        xdb_w.add_document(doc)
        flush and xdb_w.flush()
    return
def add_doc(self, doc, content, metadata):
    generator = xapian.TermGenerator()
    generator.index_text_without_positions(metadata, 1, self.METADATA_PREFIX)
    # Index the content of the document.
    generator.set_stemmer(xapian.Stem(doc.language_code))
    generator.set_stopper(self._stoppers[doc.language_code])
    generator.index_text(content, 1, self.CONTENT_PREFIX)
    xapian_doc = generator.get_document()
    for tag in doc.tags:
        xapian_doc.add_boolean_term(self.TAG_PREFIX + tag)
    xapian_doc.add_boolean_term(self.ID_PREFIX + doc.hash_md5)
    xapian_doc.set_data(doc.hash_md5)
    self._index.add_document(xapian_doc)
    self._index.flush()
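Retrieval against this index would filter on the same boolean prefixes; a hypothetical companion method (only the names already visible in add_doc are taken from the original, the rest is an assumption):

    # Hypothetical companion method: fetch documents carrying a given
    # tag, via the TAG_PREFIX boolean terms added in add_doc above.
    def find_by_tag(self, tag, limit=10):
        enquire = xapian.Enquire(self._index)
        enquire.set_query(xapian.Query(self.TAG_PREFIX + tag))
        # Each document's data is the md5 hash stored at index time.
        return [m.document.get_data() for m in enquire.get_mset(0, limit)]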
def __init__(self, dbname, writeable=False):
    if writeable:
        self.__db = xapian.WritableDatabase(dbname, xapian.DB_CREATE_OR_OPEN)
        self.__indexer = xapian.TermGenerator()
        self.__indexer.set_stemmer(xapian.Stem('english'))
    else:
        self.__db = xapian.Database(dbname)
        self.__queryparser = xapian.QueryParser()
        self.__queryparser.set_stemmer(xapian.Stem('english'))
        self.__queryparser.set_database(self.__db)
        self.__queryparser.set_stemming_strategy(xapian.QueryParser.STEM_SOME)
def __init__(self, lang, val_popcon, progress=None):
    self.val_popcon = val_popcon
    self.progress = progress
    if lang is None:
        lang = "en"
    self.lang = lang
    self.xlang = lang.split("_")[0]
    self.xdglangs = Locale.expand_languages(lang)
    self.indexer = xapian.TermGenerator()
    # Get a stemmer for this language, if available
    try:
        self.stemmer = xapian.Stem(self.xlang)
        self.indexer.set_stemmer(self.stemmer)
    except xapian.InvalidArgumentError:
        pass