def create_index(self, path, *args, **kwargs):
    """Create (or overwrite) a writable Xapian database at *path*.

    The location is remembered on the instance as ``self._path`` and the
    open database handle is stored as ``self.index``.  Extra positional
    and keyword arguments are accepted for interface compatibility and
    are ignored.
    """
    self._path = path
    self.index = xapian.WritableDatabase(path,
                                         xapian.DB_CREATE_OR_OVERWRITE)
def test_postingsource():
    """Simple test of the PostingSource class.

    Also checks that the Python-level PostingSource object stays alive as
    long as any query/enquire object still references it (i.e. is not
    garbage-collected too early by the bindings).
    """
    class OddPostingSource(xapian.PostingSource):
        def __init__(self, max):
            xapian.PostingSource.__init__(self)
            self.max = max

        def init(self, db):
            # Positioned before the first document; next() is called before
            # the first docid is read.
            self.current = -1

        def get_termfreq_min(self):
            return 0

        def get_termfreq_est(self):
            return int(self.max / 2)

        def get_termfreq_max(self):
            return self.max

        def next(self, minweight):
            # Step by two, so only odd docids are produced (1, 3, 5, ...).
            self.current += 2

        def at_end(self):
            return self.current > self.max

        def get_docid(self):
            return self.current

    dbpath = 'db_test_postingsource'
    db = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OVERWRITE)
    for id in range(10):
        doc = xapian.Document()
        db.add_document(doc)

    # Do a dance to check that the posting source doesn't get dereferenced too
    # soon in various cases.
    def mkenq(db):
        # First - check that it's kept when the source goes out of scope.
        def mkquery():
            source = OddPostingSource(10)
            return xapian.Query(xapian.Query.OP_OR, [xapian.Query(source)])

        # Check that it's kept when the query goes out of scope.
        def submkenq():
            query = mkquery()
            enquire = xapian.Enquire(db)
            enquire.set_query(query)
            return enquire

        # Check it's kept when the query is retrieved from enquire and put into
        # a new enquire.
        def submkenq2():
            enq1 = submkenq()
            enquire = xapian.Enquire(db)
            enquire.set_query(enq1.get_query())
            return enquire

        return submkenq2()

    enquire = mkenq(db)
    mset = enquire.get_mset(0, 10)

    # Only the odd docids 1..9 should match.
    expect([item.docid for item in mset], [1, 3, 5, 7, 9])

    db.close()
    shutil.rmtree(dbpath)
def rebuild_database(pathname, debian_sources=True, appstream_sources=False):
    """Rebuild the xapian database at *pathname* from the package cache.

    The new database is built in a sibling "<pathname>_rb" directory and
    then swapped into place: the previous database is moved aside to
    "<pathname>_old" and removed once the swap succeeds.

    :param pathname: directory of the live xapian database
    :param debian_sources: index data from the package cache
    :param appstream_sources: additionally index appstream XML data
    :returns: True on success, False on any (logged) failure
    """
    #cache = apt.Cache(memonly=True)
    cache = get_pkg_info()
    cache.open()
    old_path = pathname + "_old"
    rebuild_path = pathname + "_rb"
    if not os.path.exists(rebuild_path):
        try:
            os.makedirs(rebuild_path)
        except OSError:
            # Narrowed from a bare "except:" — only filesystem errors are
            # expected from os.makedirs().
            LOG.warn("Problem creating rebuild path '%s'." % rebuild_path)
            LOG.warn("Please check you have the relevant permissions.")
            return False
    # check permission
    if not os.access(pathname, os.W_OK):
        LOG.warn("Cannot write to '%s'." % pathname)
        LOG.warn("Please check you have the relevant permissions.")
        return False
    # check if old unrequired version of db still exists on filesystem
    if os.path.exists(old_path):
        LOG.warn("Existing xapian old db was not previously cleaned: '%s'." %
                 old_path)
        if os.access(old_path, os.W_OK):
            # remove old unrequired db before beginning
            shutil.rmtree(old_path)
        else:
            LOG.warn("Cannot write to '%s'." % old_path)
            LOG.warn("Please check you have the relevant permissions.")
            return False
    # write it
    db = xapian.WritableDatabase(rebuild_path, xapian.DB_CREATE_OR_OVERWRITE)
    if debian_sources:
        update(db, cache)
    if appstream_sources:
        update_from_appstream_xml(db, cache)
    # write the database version into the file
    db.set_metadata("db-schema-version", DB_SCHEMA_VERSION)
    # update the mo file stamp for the langpack checks
    mofile = gettext.find("app-install-data")
    if mofile:
        mo_time = os.path.getctime(mofile)
        db.set_metadata("app-install-mo-time", str(mo_time))
    db.flush()
    # use shutil.move() instead of os.rename() as this will automatically
    # figure out if it can use os.rename or needs to do the move "manually"
    try:
        shutil.move(pathname, old_path)
        shutil.move(rebuild_path, pathname)
        shutil.rmtree(old_path)
        return True
    except (OSError, shutil.Error):
        # Narrowed from a bare "except:" — move/rmtree can fail due to
        # permissions or concurrent access; anything else should propagate.
        LOG.warn("Cannot copy refreshed database to correct location: '%s'." %
                 pathname)
        return False
def __init__(self, root, writable=False, create=False, force=False):
    """Open (or initialize) a Xapers database rooted at *root*.

    :param root: path to the Xapers root directory ('~' is expanded)
    :param writable: open the underlying Xapian database read-write
    :param create: create the '.xapers' directory if it does not exist
    :param force: allow initialization inside a non-empty root directory
    :raises DatabaseInitializationError: root exists but holds no database
        (or is non-empty when creating without *force*)
    :raises DatabaseUninitializedError: root directory does not exist
    :raises DatabaseLockError: writable open hit the Xapian lock
    """
    # xapers root
    self.root = os.path.abspath(os.path.expanduser(root))

    # xapers db directory
    xapers_path = os.path.join(self.root, '.xapers')

    # xapers directory initialization
    if not os.path.exists(xapers_path):
        if create:
            if os.path.exists(self.root):
                if os.listdir(self.root) and not force:
                    raise DatabaseInitializationError(
                        'Uninitialized Xapers root directory exists but is not empty.'
                    )
            os.makedirs(xapers_path)
        else:
            if os.path.exists(self.root):
                raise DatabaseInitializationError(
                    "Xapers directory '%s' does not contain a database." % (self.root))
            else:
                raise DatabaseUninitializedError(
                    "Xapers directory '%s' not found." % (self.root))

    # the Xapian db
    xapian_path = os.path.join(xapers_path, 'xapian')
    if writable:
        try:
            self.xapian = xapian.WritableDatabase(xapian_path, xapian.DB_CREATE_OR_OPEN)
        except xapian.DatabaseLockError:
            raise DatabaseLockError("Xapers database locked.")
    else:
        self.xapian = xapian.Database(xapian_path)

    stemmer = xapian.Stem("english")

    # The Xapian TermGenerator
    # http://trac.xapian.org/wiki/FAQ/TermGenerator
    self.term_gen = xapian.TermGenerator()
    self.term_gen.set_stemmer(stemmer)

    # The Xapian QueryParser
    self.query_parser = xapian.QueryParser()
    self.query_parser.set_database(self.xapian)
    self.query_parser.set_stemmer(stemmer)
    self.query_parser.set_stemming_strategy(xapian.QueryParser.STEM_SOME)
    self.query_parser.set_default_op(xapian.Query.OP_AND)

    # add boolean internal prefixes
    for name, prefix in self.BOOLEAN_PREFIX.items():
        self.query_parser.add_boolean_prefix(name, prefix)

    # for prefixes that can be applied multiply to the same
    # document (like tags) set the filter grouping to use AND:
    # https://xapian.org/docs/apidoc/html/classXapian_1_1QueryParser.html#a67d25f9297bb98c2101a03ff3d60cf30
    for name, prefix in self.BOOLEAN_PREFIX_MULTI.items():
        self.query_parser.add_boolean_prefix(name, prefix, False)

    # add probabalistic prefixes
    for name, prefix in self.PROBABILISTIC_PREFIX.items():
        self.query_parser.add_prefix(name, prefix)

    # add value facets
    for name, facet in self.NUMBER_VALUE_FACET.items():
        self.query_parser.add_valuerangeprocessor(
            xapian.NumberValueRangeProcessor(facet, name + ':'))

    # register known source prefixes
    # FIXME: can we do this by just finding all XSOURCE terms in
    # db?  Would elliminate dependence on source modules at
    # search time.
    for source in Sources():
        name = source.name
        self.query_parser.add_boolean_prefix(
            name, self._make_source_prefix(name))
def test_all():
    """Smoke test of the Python 2 xapian bindings.

    Exercises version reporting, Document/Database basics, Query
    construction, iterators, MatchDecider/ExpandDecider/Stopper/
    FieldProcessor subclassing, the QueryParser, unicode handling and
    metadata — asserting exact expected values throughout.
    """
    # Test the version number reporting functions give plausible results.
    v = "%d.%d.%d" % (xapian.major_version(),
                      xapian.minor_version(),
                      xapian.revision())
    v2 = xapian.version_string()
    expect(v2, v, "Unexpected version output")

    # A regexp check would be better, but seems to create a bogus "leak" of -1
    # objects in Python 3.
    expect(len(xapian.__version__.split('.')), 3, 'xapian.__version__ not X.Y.Z')
    expect((xapian.__version__.split('.'))[0], '1', 'xapian.__version__ not "1.Y.Z"')

    def access_cvar():
        res = xapian.cvar
        print "Unhandled constants: ", res
        return res

    # Check that SWIG isn't generating cvar (regression test for ticket#297).
    expect_exception(AttributeError, "'module' object has no attribute 'cvar'",
                     access_cvar)

    stem = xapian.Stem("english")
    expect(str(stem), "Xapian::Stem(english)", "Unexpected str(stem)")

    doc = xapian.Document()
    doc.set_data("a\0b")
    if doc.get_data() == "a":
        raise TestFail("get_data+set_data truncates at a zero byte")
    expect(doc.get_data(), "a\0b", "get_data+set_data doesn't transparently handle a zero byte")
    doc.set_data("is there anybody out there?")
    doc.add_term("XYzzy")
    doc.add_posting(stem("is"), 1)
    doc.add_posting(stem("there"), 2)
    doc.add_posting(stem("anybody"), 3)
    doc.add_posting(stem("out"), 4)
    doc.add_posting(stem("there"), 5)

    db = xapian.WritableDatabase('', xapian.DB_BACKEND_INMEMORY)
    db.add_document(doc)
    expect(db.get_doccount(), 1, "Unexpected db.get_doccount()")
    terms = ["smoke", "test", "terms"]
    expect_query(xapian.Query(xapian.Query.OP_OR, terms),
                 "(smoke OR test OR terms)")
    query1 = xapian.Query(xapian.Query.OP_PHRASE, ("smoke", "test", "tuple"))
    query2 = xapian.Query(xapian.Query.OP_XOR, (xapian.Query("smoke"), query1, "string"))
    expect_query(query1, "(smoke PHRASE 3 test PHRASE 3 tuple)")
    expect_query(query2, "(smoke XOR (smoke PHRASE 3 test PHRASE 3 tuple) XOR string)")
    subqs = ["a", "b"]
    expect_query(xapian.Query(xapian.Query.OP_OR, subqs), "(a OR b)")
    expect_query(xapian.Query(xapian.Query.OP_VALUE_RANGE, 0, '1', '4'),
                 "VALUE_RANGE 0 1 4")

    # Check database factory functions are wrapped as expected:
    expect_exception(xapian.DatabaseNotFoundError, None,
                     xapian.Database, "nosuchdir/nosuchdb", xapian.DB_BACKEND_STUB)
    expect_exception(xapian.DatabaseNotFoundError, None,
                     xapian.WritableDatabase, "nosuchdir/nosuchdb",
                     xapian.DB_OPEN | xapian.DB_BACKEND_STUB)
    expect_exception(xapian.NetworkError, None,
                     xapian.remote_open, "/bin/false", "")
    expect_exception(xapian.NetworkError, None,
                     xapian.remote_open_writable, "/bin/false", "")
    expect_exception(xapian.NetworkError, None,
                     xapian.remote_open, "127.0.0.1", 0, 1)
    expect_exception(xapian.NetworkError, None,
                     xapian.remote_open_writable, "127.0.0.1", 0, 1)

    # Check wrapping of MatchAll and MatchNothing:
    expect_query(xapian.Query.MatchAll, "<alldocuments>")
    expect_query(xapian.Query.MatchNothing, "")

    # Feature test for Query.__iter__
    term_count = 0
    for term in query2:
        term_count += 1
    expect(term_count, 4, "Unexpected number of terms in query2")

    enq = xapian.Enquire(db)

    # Check Xapian::BAD_VALUENO is wrapped suitably.
    enq.set_collapse_key(xapian.BAD_VALUENO)

    enq.set_query(xapian.Query(xapian.Query.OP_OR, "there", "is"))
    mset = enq.get_mset(0, 10)
    expect(mset.size(), 1, "Unexpected mset.size()")
    expect(len(mset), 1, "Unexpected mset.size()")

    # Feature test for Enquire.matching_terms(docid)
    term_count = 0
    for term in enq.matching_terms(mset.get_hit(0)):
        term_count += 1
    expect(term_count, 2, "Unexpected number of matching terms")

    # Feature test for MSet.__iter__
    msize = 0
    for match in mset:
        msize += 1
    expect(msize, mset.size(), "Unexpected number of entries in mset")

    terms = " ".join(enq.matching_terms(mset.get_hit(0)))
    expect(terms, "is there", "Unexpected terms")

    # Feature test for ESet.__iter__
    rset = xapian.RSet()
    rset.add_document(1)
    eset = enq.get_eset(10, rset)
    term_count = 0
    for term in eset:
        term_count += 1
    expect(term_count, 3, "Unexpected number of expand terms")

    # Feature test for Database.__iter__
    term_count = 0
    for term in db:
        term_count += 1
    expect(term_count, 5, "Unexpected number of terms in db")

    # Feature test for Database.allterms
    term_count = 0
    for term in db.allterms():
        term_count += 1
    expect(term_count, 5, "Unexpected number of terms in db.allterms")

    # Feature test for Database.postlist
    count = 0
    for posting in db.postlist("there"):
        count += 1
    expect(count, 1, "Unexpected number of entries in db.postlist('there')")

    # Feature test for Database.postlist with empty term (alldocspostlist)
    count = 0
    for posting in db.postlist(""):
        count += 1
    expect(count, 1, "Unexpected number of entries in db.postlist('')")

    # Feature test for Database.termlist
    count = 0
    for term in db.termlist(1):
        count += 1
    expect(count, 5, "Unexpected number of entries in db.termlist(1)")

    # Feature test for Database.positionlist
    count = 0
    for term in db.positionlist(1, "there"):
        count += 1
    expect(count, 2, "Unexpected number of entries in db.positionlist(1, 'there')")

    # Feature test for Document.termlist
    count = 0
    for term in doc.termlist():
        count += 1
    expect(count, 5, "Unexpected number of entries in doc.termlist()")

    # Feature test for TermIter.skip_to
    term = doc.termlist()
    term.skip_to('n')
    while True:
        try:
            x = next(term)
        except StopIteration:
            break
        if x.term < 'n':
            raise TestFail("TermIter.skip_to didn't skip term '%s'" % x.term)

    # Feature test for Document.values
    count = 0
    for term in doc.values():
        count += 1
    expect(count, 0, "Unexpected number of entries in doc.values")

    # Check exception handling for Xapian::DocNotFoundError
    expect_exception(xapian.DocNotFoundError, "Docid 3 not found",
                     db.get_document, 3)

    # Check value of OP_ELITE_SET
    expect(xapian.Query.OP_ELITE_SET, 10, "Unexpected value for OP_ELITE_SET")

    # Feature test for MatchDecider
    doc = xapian.Document()
    doc.set_data("Two")
    doc.add_posting(stem("out"), 1)
    doc.add_posting(stem("outside"), 1)
    doc.add_posting(stem("source"), 2)
    doc.add_value(0, "yes")
    db.add_document(doc)

    class testmatchdecider(xapian.MatchDecider):
        def __call__(self, doc):
            return doc.get_value(0) == "yes"

    query = xapian.Query(stem("out"))
    enquire = xapian.Enquire(db)
    enquire.set_query(query)
    mset = enquire.get_mset(0, 10, None, testmatchdecider())
    expect(mset.size(), 1, "Unexpected number of documents returned by match decider")
    expect(mset.get_docid(0), 2, "MatchDecider mset has wrong docid in")

    # Feature test for ExpandDecider
    class testexpanddecider(xapian.ExpandDecider):
        def __call__(self, term):
            return (not term.startswith('a'))

    enquire = xapian.Enquire(db)
    rset = xapian.RSet()
    rset.add_document(1)
    eset = enquire.get_eset(10, rset, xapian.Enquire.USE_EXACT_TERMFREQ,
                            testexpanddecider())
    eset_terms = [item.term for item in eset]
    expect(len(eset_terms), eset.size(), "Unexpected number of terms returned by expand")
    if [t for t in eset_terms if t.startswith('a')]:
        raise TestFail("ExpandDecider was not used")

    # Check min_wt argument to get_eset() works (new in 1.2.5).
    eset = enquire.get_eset(100, rset, xapian.Enquire.USE_EXACT_TERMFREQ)
    expect([i.weight for i in eset][-1] < 1.9, True,
           "test get_eset() without min_wt")
    eset = enquire.get_eset(100, rset, xapian.Enquire.USE_EXACT_TERMFREQ,
                            None, 1.9)
    expect([i.weight for i in eset][-1] >= 1.9, True, "test get_eset() min_wt")

    # Check QueryParser parsing error.
    qp = xapian.QueryParser()
    expect_exception(xapian.QueryParserError,
                     "Syntax: <expression> AND <expression>",
                     qp.parse_query, "test AND")

    # Check QueryParser pure NOT option
    qp = xapian.QueryParser()
    expect_query(qp.parse_query("NOT test", qp.FLAG_BOOLEAN + qp.FLAG_PURE_NOT),
                 "(0 * <alldocuments> AND_NOT test@1)")

    # Check QueryParser partial option
    qp = xapian.QueryParser()
    qp.set_database(db)
    qp.set_default_op(xapian.Query.OP_AND)
    qp.set_stemming_strategy(qp.STEM_SOME)
    qp.set_stemmer(xapian.Stem('en'))
    expect_query(qp.parse_query("foo ox", qp.FLAG_PARTIAL),
                 "(Zfoo@1 AND (WILDCARD SYNONYM ox OR Zox@2))")

    expect_query(qp.parse_query("foo outside", qp.FLAG_PARTIAL),
                 "(Zfoo@1 AND (WILDCARD SYNONYM outside OR Zoutsid@2))")

    # Test supplying unicode strings
    expect_query(xapian.Query(xapian.Query.OP_OR, (u'foo', u'bar')),
                 '(foo OR bar)')
    expect_query(xapian.Query(xapian.Query.OP_OR, ('foo', u'bar\xa3')),
                 '(foo OR bar\xc2\xa3)')
    expect_query(xapian.Query(xapian.Query.OP_OR, ('foo', 'bar\xc2\xa3')),
                 '(foo OR bar\xc2\xa3)')
    expect_query(xapian.Query(xapian.Query.OP_OR, u'foo', u'bar'),
                 '(foo OR bar)')

    expect_query(qp.parse_query(u"NOT t\xe9st", qp.FLAG_BOOLEAN + qp.FLAG_PURE_NOT),
                 "(0 * <alldocuments> AND_NOT Zt\xc3\xa9st@1)")

    doc = xapian.Document()
    doc.set_data(u"Unicode with an acc\xe9nt")
    doc.add_posting(stem(u"out\xe9r"), 1)
    expect(doc.get_data(), u"Unicode with an acc\xe9nt".encode('utf-8'))
    term = doc.termlist().next().term
    expect(term, u"out\xe9r".encode('utf-8'))

    # Check simple stopper
    stop = xapian.SimpleStopper()
    qp.set_stopper(stop)
    expect(stop('a'), False)
    expect_query(qp.parse_query(u"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2 AND Za@3)")

    stop.add('a')
    expect(stop('a'), True)
    expect_query(qp.parse_query(u"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2)")

    # Feature test for custom Stopper
    class my_b_stopper(xapian.Stopper):
        def __call__(self, term):
            return term == "b"

        def get_description(self):
            return u"my_b_stopper"

    stop = my_b_stopper()
    expect(stop.get_description(), u"my_b_stopper")
    qp.set_stopper(stop)
    expect(stop('a'), False)
    expect_query(qp.parse_query(u"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2 AND Za@3)")

    expect(stop('b'), True)
    expect_query(qp.parse_query(u"foo bar b", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2)")

    # Test SimpleStopper initialised from a file.
    try:
        srcdir = os.environ['srcdir']
    except KeyError:
        srcdir = '.'
    stop = xapian.SimpleStopper(srcdir + '/../shortstop.list')
    expect(stop('a'), True)
    expect(stop('am'), False)
    expect(stop('an'), True)
    expect(stop('the'), True)
    expect_exception(xapian.InvalidArgumentError, None,
                     xapian.SimpleStopper, 'nosuchfile')

    # Test TermGenerator
    termgen = xapian.TermGenerator()
    doc = xapian.Document()
    termgen.set_document(doc)
    termgen.index_text('foo bar baz foo')
    expect([(item.term, item.wdf, [pos for pos in item.positer])
            for item in doc.termlist()],
           [('bar', 1, [2]), ('baz', 1, [3]), ('foo', 2, [1, 4])])

    # Check DateRangeProcessor works
    context("checking that DateRangeProcessor works")
    qp = xapian.QueryParser()
    rpdate = xapian.DateRangeProcessor(1, xapian.RP_DATE_PREFER_MDY, 1960)
    qp.add_rangeprocessor(rpdate)
    query = qp.parse_query('12/03/99..12/04/01')
    expect(str(query), 'Query(VALUE_RANGE 1 19991203 20011204)')

    # Feature test for xapian.FieldProcessor
    context("running feature test for xapian.FieldProcessor")

    class testfieldprocessor(xapian.FieldProcessor):
        def __call__(self, s):
            if s == 'spam':
                raise Exception('already spam')
            return xapian.Query("spam")

    qp.add_prefix('spam', testfieldprocessor())
    qp.add_boolean_prefix('boolspam', testfieldprocessor())
    qp.add_boolean_prefix('boolspam2', testfieldprocessor(), False) # Old-style
    qp.add_boolean_prefix('boolspam3', testfieldprocessor(), '')
    qp.add_boolean_prefix('boolspam4', testfieldprocessor(), 'group')
    qp.add_boolean_prefix('boolspam5', testfieldprocessor(), None)

    query = qp.parse_query('spam:ignored')
    expect(str(query), 'Query(spam)')

    expect_exception(Exception, 'already spam', qp.parse_query, 'spam:spam')

    # Regression tests copied from PHP (probably always worked in python, but
    # let's check...)
    context("running regression tests for issues which were found in PHP")

    # PHP overload resolution involving boolean types failed.
    enq.set_sort_by_value(1, True)

    # Regression test - fixed in 0.9.10.1.
    oqparser = xapian.QueryParser()
    oquery = oqparser.parse_query("I like tea")

    # Regression test for bug fixed in 1.4.4:
    # https://bugs.debian.org/849722
    oqparser.add_boolean_prefix('tag', 'K', '')
    # Make sure other cases also work:
    oqparser.add_boolean_prefix('zag', 'XR', False) # Old-style
    oqparser.add_boolean_prefix('rag', 'XR', None)
    oqparser.add_boolean_prefix('nag', 'XB', '')
    oqparser.add_boolean_prefix('bag', 'XB', 'blergh')
    oqparser.add_boolean_prefix('gag', 'XB', u'blergh')
    oqparser.add_boolean_prefix('jag', 'XB', b'blergh')

    # Regression test for bug#192 - fixed in 1.0.3.
    enq.set_cutoff(100)

    # Test setting and getting metadata
    expect(db.get_metadata('Foo'), '')
    db.set_metadata('Foo', 'Foo')
    expect(db.get_metadata('Foo'), 'Foo')
    expect_exception(xapian.InvalidArgumentError,
                     "Empty metadata keys are invalid",
                     db.get_metadata, '')
    expect_exception(xapian.InvalidArgumentError,
                     "Empty metadata keys are invalid",
                     db.set_metadata, '', 'Foo')
    expect_exception(xapian.InvalidArgumentError,
                     "Empty metadata keys are invalid",
                     db.get_metadata, '')

    # Test OP_SCALE_WEIGHT and corresponding constructor
    expect_query(xapian.Query(xapian.Query.OP_SCALE_WEIGHT, xapian.Query('foo'), 5),
                 "5 * foo")
def test_replication_concurrency():
    """Test concurrent replication and modification

    Spawns a xapian-replicate-server and a xapian-replicate client as
    subprocesses, swaps the master database back and forth between two
    pre-built databases while writing metadata changes, and checks the
    slave tracks the master.  Timing-sensitive: uses fixed sleeps.
    """
    builddir = os.environ['abs_builddir']
    dbsdir = os.path.join(builddir, 'dbs_replication')
    if not os.path.isdir(dbsdir):
        os.makedirs(dbsdir)

    masterpath = os.path.join(dbsdir, 'master')
    firstpath = os.path.join(dbsdir, 'first')
    secondpath = os.path.join(dbsdir, 'second')
    slavepath = os.path.join(dbsdir, 'slave')
    if os.path.isdir(masterpath):
        shutil.rmtree(masterpath)
    if os.path.isdir(slavepath):
        shutil.rmtree(slavepath)
    # NOTE(review): "port" is assigned but the subprocess args hardcode
    # --port=7876 separately — keep in sync if the port ever changes.
    port = 7876

    expect_exception(
        xapian.DatabaseOpeningError,
        "Couldn't stat '" + dbsdir + "/slave' (No such file or directory)",
        xapian.Database, slavepath)

    clientp = None
    serverp = subprocess.Popen((
        '../../xapian-core/bin/xapian-replicate-server',
        dbsdir,
        '--port=7876',
    ),)

    doccount1 = 10000
    doccount2 = 1000

    starttime = time.time()
    if not os.path.isdir(firstpath):
        firstdb = xapian.WritableDatabase(firstpath, xapian.DB_CREATE_OR_OVERWRITE)
        # Make an initial, large database
        print
        print "Building initial database ..."
        for num in xrange(1, doccount1):
            doc = xapian.Document()
            val = 'val%d' % num
            doc.add_value(1, val)
            firstdb.add_document(doc)
            if num % 100000 == 0:
                print "%d documents..." % num
        firstdb.set_metadata('dbname', '1')
        firstdb.commit()
        print "built"

    # The secondary database gets modified during the test, so needs to be
    # cleared now.
    shutil.rmtree(secondpath)
    if not os.path.isdir(secondpath):
        seconddb = xapian.WritableDatabase(secondpath, xapian.DB_CREATE_OR_OVERWRITE)
        # Make second, small database
        print
        print "Building secondary database ..."
        for num in xrange(1, doccount2):
            doc = xapian.Document()
            val = 'val%d' % num
            doc.add_value(1, val)
            seconddb.add_document(doc)
            if num % 100000 == 0:
                print "%d documents..." % num
        seconddb.set_metadata('dbname', '2')
        seconddb.commit()
        print "built"

    if time.time() - starttime < 1:
        time.sleep(1) # Give server time to start

    try:
        set_master(masterpath, firstpath)
        clientp = subprocess.Popen((
            '../../xapian-core/bin/xapian-replicate',
            '--host=127.0.0.1',
            '--master=master',
            os.path.join(dbsdir, 'slave'),
            '--interval=0',
            '--port=7876',
            '-r 0',
        ),)
        time.sleep(1) # Give client time to start
        expect(xapian.Database(slavepath).get_metadata('dbname'), '1')

        for count in xrange(10):
            # Test that swapping between databases doesn't confuse replication.
            for count2 in xrange(2):
                set_master(masterpath, secondpath)
                time.sleep(0.1)
                set_master(masterpath, firstpath)
                time.sleep(0.1)

            # Test making changes to the database.
            set_master(masterpath, secondpath)
            masterdb = xapian.WritableDatabase(masterpath, xapian.DB_OPEN)
            print "making 100 changes"
            for num in xrange(100):
                masterdb.set_metadata('num%d' % num, str(num + count))
                masterdb.commit()
            print "changes done"
            masterdb.close()

            # Allow time for the replication client to catch up with the
            # changes.
            time.sleep(2)
            expect(xapian.Database(slavepath).get_metadata('dbname'), '2')
            expect(
                xapian.Database(slavepath).get_metadata('num99'),
                str(99 + count))
    finally:
        # Always tear the subprocesses down, even on test failure.
        if clientp is not None:
            os.kill(clientp.pid, 9)
            clientp.wait()
        os.kill(serverp.pid, 9)
        serverp.wait()
db.authenticate('root','root') db = connection.weibo print 'pymongo success' #stopwords stopwords = set([line.strip('\r\n') for line in file('ext_stopword.dic')]) #emotionlist emotionlist = [unicode(line.strip('\r\n'),'utf-8') for line in file('emotionlist.txt')] if len(sys.argv) != 2: print >> sys.stderr, "Usage: %s PATH_TO_DATABASE" % sys.argv[0] sys.exit(1) try: # Open the database for update, creating a new database if necessary. database = xapian.WritableDatabase(sys.argv[1], xapian.DB_CREATE_OR_OPEN) print database,'open database weibo' emotionvi = 0 keywordsvi = 1 timestampvi = 2 loctvi = 3 reploctvi = 4 emotiononlyvi = 5 usernamevi = 6 hashtagsvi = 7 uidvi = 8 repnameslistvi = 9 widvi = 10 """ weibos = ''
def create_index(self):
    """Open the index at ``self.dbpath`` (creating it when absent) and
    prepare the English-stemming term generator used for indexing."""
    database = xapian.WritableDatabase(self.dbpath, xapian.DB_CREATE_OR_OPEN)
    term_generator = xapian.TermGenerator()
    term_generator.set_stemmer(xapian.Stem("en"))
    self.db = database
    self.indexer = term_generator
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. """ import xapian import sys if __name__ == '__main__': try: sample_file = sys.argv[1] popcon = xapian.WritableDatabase(sys.argv[2], xapian.DB_OPEN) except: print "Usage: extract-sample-db sample_file popcon_index" exit(1) enquire = xapian.Enquire(popcon) print sample_file.split("/") new_popcon = xapian.WritableDatabase( sys.argv[2] + "-" + sample_file.split("/")[-1], xapian.DB_CREATE_OR_OVERWRITE) print("Popcon repository size: %d" % popcon.get_doccount()) for submission in open(sample_file): print "ID" + submission.strip() query = xapian.Query("ID" + submission.strip()) enquire.set_query(query) mset = enquire.get_mset(0, 20) for m in mset:
def _read_write_db(self):
    """Return a read-write xapian Database object.

    NOTE(review): the settings key really is spelled ``SEARH_INDEX_PATH``
    in this code — confirm against the project settings before "fixing"
    the apparent typo.
    """
    index_path = settings.SEARH_INDEX_PATH
    return xapian.WritableDatabase(index_path, xapian.DB_CREATE_OR_OPEN)
def create_database(self):
    """Materialise the database on disk at ``self._path``.

    Opening with DB_CREATE_OR_OPEN creates the database if needed; the
    handle is dropped immediately, leaving only the on-disk side effect.
    """
    handle = xapian.WritableDatabase(
        self._path,
        xapian.DB_CREATE_OR_OPEN,
    )
    del handle
def _open_collection(path, rw=READ):
    """Open the xapian collection at *path*.

    :param path: filesystem path of the xapian database
    :param rw: READ for a read-only handle, WRITE for a writable handle
        (created on first open)
    :raises ValueError: if *rw* is neither READ nor WRITE.  The original
        silently fell through and returned None here, deferring a
        confusing AttributeError to the caller.
    """
    if rw == READ:
        return xapian.Database(path)
    elif rw == WRITE:
        return xapian.WritableDatabase(path, xapian.DB_CREATE_OR_OPEN)
    raise ValueError("rw must be READ or WRITE, got %r" % (rw,))
from config import SEARCH_DB_PATH from mmseg.search import seg_txt_search, seg_title_search, seg_txt_2_dict from os import makedirs from os.path import join, exists import xapian from collections import defaultdict PATH = join(SEARCH_DB_PATH, 'zsite') if not exists(PATH): makedirs(PATH) SEARCH_DB = xapian.WritableDatabase(PATH, xapian.DB_CREATE_OR_OPEN) def flush_db(): SEARCH_DB.flush() def index(keyword_iter): for id, cid, rank, kw in keyword_iter(): doc = xapian.Document() doc.add_value(0, id) doc.add_value(1, xapian.sortable_serialise(rank)) doc.add_value(2, cid) for word, value in kw: if word: if not word.startswith('>'):
# Import system modules import os import xapian import datetime # Import custom modules from query_process_simplified import TextMachine # Load the Xapian database databasePath = os.path.abspath('xapian-database') database = xapian.WritableDatabase(databasePath, xapian.DB_OPEN) # Set slot constants xapian_file_name, xapian_when, xapian_owner_id = xrange(3) def search(queryString, byDate=False, ownerID=None, extractLength=32): # Parse query string queryParser = xapian.QueryParser() queryParser.set_stemmer(xapian.Stem('english')) queryParser.set_database(database) queryParser.set_stemming_strategy(xapian.QueryParser.STEM_SOME) query = queryParser.parse_query(queryString) # Set offset and limit for pagination offset, limit = 0, database.get_doccount() # Start query session enquire = xapian.Enquire(database) enquire.set_query(query) # Sort by date if byDate: enquire.set_sort_by_value(xapian_when) if ownerID == None: matches = enquire.get_mset(offset, limit)
def _get_database(self):
    """Open (creating on first use) the writable 'text-index' database
    located under ``self.db_path``."""
    index_path = os.path.join(self.db_path, 'text-index')
    return xapian.WritableDatabase(index_path, xapian.DB_CREATE_OR_OPEN)
# -*- coding: utf-8 -*- from __future__ import unicode_literals import xapian kw = ["苹果", "成都"] db = xapian.WritableDatabase("db/test", xapian.DB_OPEN) parser = xapian.QueryParser() # for w in kw: # print w # query = parser.parse_query(w) # query_list.append(query) query = parser.parse_query(kw) enquire = xapian.Enquire(db) enquire.set_query(query) for m in enquire.get_mset(0, 30): print m.docid
def indexer(self, **kwargs):
    """Open (or create) the '<indexname>_xapian' database under the
    configured directory and set up a term generator for it."""
    db_dirname = "%s_xapian" % self.options.indexname
    db_path = os.path.join(self.options.dir, db_dirname)
    self.database = xapian.WritableDatabase(db_path, xapian.DB_CREATE_OR_OPEN)
    self.ixer = xapian.TermGenerator()
print(data_file, "is complete!") return curr_docid if __name__ == '__main__': CORPUS_DIR = './wiki-pages-text/' DATA_FILES = os.listdir(CORPUS_DIR) DB_PATH = './xdb/' DB_NAME = 'wiki.db' # try to make a db in pwd try: os.mkdir(DB_PATH) print("create dir", DB_PATH) except (OSError, IOError) as e: if e.errno != errno.EEXIST: raise START = time() with closing( xapian.WritableDatabase(join(DB_PATH, DB_NAME), xapian.DB_CREATE_OR_OPEN)) as x_db: curr_docid = 1 for data_file in tqdm(DATA_FILES): if not data_file.endswith('.txt'): continue curr_docid = save_2_db(x_db, CORPUS_DIR, data_file, curr_docid) print("took", time() - START, "seconds to finish")
if __name__ == "__main__":
    # Ad-hoc indexing scratch script.  Fixes over the original:
    #  - removed a duplicated "import linecache"
    #  - removed the dead "ti = xapian.inmemory_open()" call, whose result
    #    was immediately rebound to the on-disk database below (leaking the
    #    in-memory handle for nothing)
    import sys
    from time import time
    import linecache
    import glob
    import traceback
    import xapian
    import re

    stem = xapian.Stem("french")
    ti = xapian.WritableDatabase("test.ti", xapian.DB_CREATE_OR_OPEN)
    # ti = xapian.quartz_open('test.idx')

    # start = time()
    # lines = 0
    # for f in glob.glob('*.txt'):
    #     print f,
    #     for linenumber, line in enumerate(file(f,'rb')):
    #         lines += 1
    #         line = line.strip()
    #         doc = xapian.Document()
    #         doc.set_data('%12s:%04i'%(f,linenumber))
    #         for word_number, word in enumerate(re.findall(r'\w+',line.lower())):
    #             doc.add_posting(word,word_number)
    #         ti.add_document(doc)
    #         if linenumber % 100 == 0:
    #             sys.stdout.write('.')
    #     print 'OK'
    # print 'Indexing time : %.2fs for %i lines'%(time()-start,lines)
def spin(self): cursor = self._spinner[self._counter % len(self._spinner)] self._counter += 1 print('\b' + cursor, end='') sys.stdout.flush() spinner = Spinner() # http://stackoverflow.com/questions/898669/how-can-i-detect-if-a-file-is-binary-non-text-in-python textchars = bytearray({7, 8, 9, 10, 12, 13, 27} | set(range(0x20, 0x100)) - {0x7f}) is_binary_string = lambda bytes: bool(bytes.translate(None, textchars)) try: xdb = xapian.WritableDatabase("__xdb__", xapian.DB_CREATE_OR_OVERWRITE) indexer = xapian.TermGenerator() stemmer = xapian.Stem("english") indexer.set_stemmer(stemmer) # scan the project counter_indexed = 0 pathlist_indexed = [] for dirpath, _, filenames in os.walk("."): if ".git" in dirpath or "__xdb__" in dirpath: continue for filename in filenames: cursor = os.path.join(dirpath, filename) # skip non-plain files
def test_all():
    """Smoke-test the xapian Python bindings end-to-end.

    Exercises version reporting, Document/Query construction, an in-memory
    database, iterators, MatchDecider/ExpandDecider/Stopper subclassing,
    QueryParser behaviour, unicode handling and metadata — raising TestFail
    (via expect/expect_query/expect_exception helpers defined elsewhere in
    this file) on any mismatch.
    """
    # Test the version number reporting functions give plausible results.
    v = "%d.%d.%d" % (xapian.major_version(),
                      xapian.minor_version(),
                      xapian.revision())
    v2 = xapian.version_string()
    expect(v2, v, "Unexpected version output")
    # A regexp check would be better, but seems to create a bogus "leak" of -1
    # objects in Python 3.
    expect(len(xapian.__version__.split('.')), 3, 'xapian.__version__ not X.Y.Z')
    expect((xapian.__version__.split('.'))[0], '1', 'xapian.__version__ not "1.Y.Z"')

    def access_cvar():
        res = xapian.cvar
        print("Unhandled constants: ", res)
        return res

    # Check that SWIG isn't generating cvar (regression test for ticket#297).
    #
    # Python 3.5 generates a different exception message here to earlier
    # versions, so we need a check which matches both.
    expect_exception(AttributeError,
                     lambda msg: msg.find("has no attribute 'cvar'") != -1,
                     access_cvar)

    stem = xapian.Stem(b"english")
    expect(str(stem), "Xapian::Stem(english)", "Unexpected str(stem)")

    # Documents must round-trip data containing embedded zero bytes.
    doc = xapian.Document()
    doc.set_data(b"a\0b")
    if doc.get_data() == b"a":
        raise TestFail("get_data+set_data truncates at a zero byte")
    expect(doc.get_data(), b"a\0b", "get_data+set_data doesn't transparently handle a zero byte")
    doc.set_data(b"is there anybody out there?")
    doc.add_term(b"XYzzy")
    doc.add_posting(stem(b"is"), 1)
    doc.add_posting(stem(b"there"), 2)
    doc.add_posting(stem(b"anybody"), 3)
    doc.add_posting(stem(b"out"), 4)
    doc.add_posting(stem(b"there"), 5)

    db = xapian.WritableDatabase('', xapian.DB_BACKEND_INMEMORY)
    db.add_document(doc)
    expect(db.get_doccount(), 1, "Unexpected db.get_doccount()")
    terms = ["smoke", "test", "terms"]
    expect_query(xapian.Query(xapian.Query.OP_OR,
                              [t.encode('utf-8') for t in terms]),
                 "(smoke OR test OR terms)")
    query1 = xapian.Query(xapian.Query.OP_PHRASE, (b"smoke", b"test", b"tuple"))
    query2 = xapian.Query(xapian.Query.OP_XOR,
                          (xapian.Query(b"smoke"), query1, b"string"))
    expect_query(query1, "(smoke PHRASE 3 test PHRASE 3 tuple)")
    expect_query(query2,
                 "(smoke XOR (smoke PHRASE 3 test PHRASE 3 tuple) XOR string)")
    subqs = ["a", "b"]
    expect_query(xapian.Query(xapian.Query.OP_OR,
                              [s.encode('utf-8') for s in subqs]),
                 "(a OR b)")
    expect_query(xapian.Query(xapian.Query.OP_VALUE_RANGE, 0, b'1', b'4'),
                 "VALUE_RANGE 0 1 4")

    # Check database factory functions are wrapped as expected (or not wrapped
    # in the first cases):
    expect_exception(AttributeError,
                     lambda msg: msg.find("has no attribute 'open_stub'") != -1,
                     lambda: xapian.open_stub(b"nosuchdir/nosuchdb"))
    expect_exception(AttributeError,
                     lambda msg: msg.find("has no attribute 'open_stub'") != -1,
                     lambda: xapian.open_stub(b"nosuchdir/nosuchdb", xapian.DB_OPEN))
    expect_exception(xapian.DatabaseOpeningError, None,
                     lambda: xapian.Database(b"nosuchdir/nosuchdb",
                                             xapian.DB_BACKEND_STUB))
    expect_exception(xapian.DatabaseOpeningError, None,
                     lambda: xapian.WritableDatabase(
                         b"nosuchdir/nosuchdb",
                         xapian.DB_OPEN | xapian.DB_BACKEND_STUB))
    expect_exception(xapian.DatabaseOpeningError, None,
                     lambda: xapian.Database(b"nosuchdir/nosuchdb",
                                             xapian.DB_BACKEND_GLASS))
    expect_exception(xapian.DatabaseCreateError, None,
                     lambda: xapian.WritableDatabase(
                         b"nosuchdir/nosuchdb",
                         xapian.DB_CREATE | xapian.DB_BACKEND_GLASS))
    expect_exception(xapian.FeatureUnavailableError, None,
                     lambda: xapian.Database(b"nosuchdir/nosuchdb",
                                             xapian.DB_BACKEND_CHERT))
    expect_exception(xapian.FeatureUnavailableError, None,
                     lambda: xapian.WritableDatabase(
                         b"nosuchdir/nosuchdb",
                         xapian.DB_CREATE | xapian.DB_BACKEND_CHERT))
    expect_exception(xapian.NetworkError, None,
                     xapian.remote_open, b"/bin/false", b"")
    expect_exception(xapian.NetworkError, None,
                     xapian.remote_open_writable, b"/bin/false", b"")
    expect_exception(xapian.NetworkError, None,
                     xapian.remote_open, b"127.0.0.1", 0, 1)
    expect_exception(xapian.NetworkError, None,
                     xapian.remote_open_writable, b"127.0.0.1", 0, 1)

    # Check wrapping of MatchAll and MatchNothing:
    expect_query(xapian.Query.MatchAll, "<alldocuments>")
    expect_query(xapian.Query.MatchNothing, "")

    # Feature test for Query.__iter__
    term_count = 0
    for term in query2:
        term_count += 1
    expect(term_count, 4, "Unexpected number of terms in query2")

    enq = xapian.Enquire(db)
    enq.set_query(xapian.Query(xapian.Query.OP_OR, b"there", b"is"))
    mset = enq.get_mset(0, 10)
    expect(mset.size(), 1, "Unexpected mset.size()")
    expect(len(mset), 1, "Unexpected mset.size()")

    # Feature test for Enquire.matching_terms(docid)
    term_count = 0
    for term in enq.matching_terms(mset.get_hit(0)):
        term_count += 1
    expect(term_count, 2, "Unexpected number of matching terms")

    # Feature test for MSet.__iter__
    msize = 0
    for match in mset:
        msize += 1
    expect(msize, mset.size(), "Unexpected number of entries in mset")

    terms = b" ".join(enq.matching_terms(mset.get_hit(0)))
    expect(terms, b"is there", "Unexpected terms")

    # Feature test for ESet.__iter__
    rset = xapian.RSet()
    rset.add_document(1)
    eset = enq.get_eset(10, rset)
    term_count = 0
    for term in eset:
        term_count += 1
    expect(term_count, 3, "Unexpected number of expand terms")

    # Feature test for Database.__iter__
    term_count = 0
    for term in db:
        term_count += 1
    expect(term_count, 5, "Unexpected number of terms in db")

    # Feature test for Database.allterms
    term_count = 0
    for term in db.allterms():
        term_count += 1
    expect(term_count, 5, "Unexpected number of terms in db.allterms")

    # Feature test for Database.postlist
    count = 0
    for posting in db.postlist(b"there"):
        count += 1
    expect(count, 1, "Unexpected number of entries in db.postlist('there')")

    # Feature test for Database.postlist with empty term (alldocspostlist)
    count = 0
    for posting in db.postlist(b""):
        count += 1
    expect(count, 1, "Unexpected number of entries in db.postlist('')")

    # Feature test for Database.termlist
    count = 0
    for term in db.termlist(1):
        count += 1
    expect(count, 5, "Unexpected number of entries in db.termlist(1)")

    # Feature test for Database.positionlist
    count = 0
    for term in db.positionlist(1, b"there"):
        count += 1
    expect(count, 2, "Unexpected number of entries in db.positionlist(1, 'there')")

    # Feature test for Document.termlist
    count = 0
    for term in doc.termlist():
        count += 1
    expect(count, 5, "Unexpected number of entries in doc.termlist()")

    # Feature test for TermIter.skip_to
    term = doc.termlist()
    term.skip_to(b'n')
    while True:
        try:
            x = next(term)
        except StopIteration:
            break
        if x.term < b'n':
            raise TestFail("TermIter.skip_to didn't skip term '%s'" %
                           x.term.decode('utf-8'))

    # Feature test for Document.values
    count = 0
    for term in list(doc.values()):
        count += 1
    expect(count, 0, "Unexpected number of entries in doc.values")

    # Check exception handling for Xapian::DocNotFoundError
    expect_exception(xapian.DocNotFoundError, "Docid 3 not found",
                     db.get_document, 3)

    # Check value of OP_ELITE_SET
    expect(xapian.Query.OP_ELITE_SET, 10, "Unexpected value for OP_ELITE_SET")

    # Feature test for MatchDecider
    doc = xapian.Document()
    doc.set_data(b"Two")
    doc.add_posting(stem(b"out"), 1)
    doc.add_posting(stem(b"outside"), 1)
    doc.add_posting(stem(b"source"), 2)
    doc.add_value(0, b"yes")
    db.add_document(doc)

    class testmatchdecider(xapian.MatchDecider):
        def __call__(self, doc):
            return doc.get_value(0) == b"yes"

    query = xapian.Query(stem(b"out"))
    enquire = xapian.Enquire(db)
    enquire.set_query(query)
    mset = enquire.get_mset(0, 10, None, testmatchdecider())
    expect(mset.size(), 1,
           "Unexpected number of documents returned by match decider")
    expect(mset.get_docid(0), 2, "MatchDecider mset has wrong docid in")

    # Feature test for ExpandDecider
    class testexpanddecider(xapian.ExpandDecider):
        def __call__(self, term):
            return (not term.startswith(b'a'))

    enquire = xapian.Enquire(db)
    rset = xapian.RSet()
    rset.add_document(1)
    eset = enquire.get_eset(10, rset, xapian.Enquire.USE_EXACT_TERMFREQ, 1.0,
                            testexpanddecider())
    eset_terms = [item.term for item in eset]
    expect(len(eset_terms), eset.size(),
           "Unexpected number of terms returned by expand")
    if [t for t in eset_terms if t.startswith(b'a')]:
        raise TestFail("ExpandDecider was not used")

    # Check min_wt argument to get_eset() works (new in 1.2.5).
    eset = enquire.get_eset(100, rset, xapian.Enquire.USE_EXACT_TERMFREQ)
    expect([i.weight for i in eset][-1] < 1.9, True,
           "test get_eset() without min_wt")
    eset = enquire.get_eset(100, rset, xapian.Enquire.USE_EXACT_TERMFREQ, 1.0,
                            None, 1.9)
    expect([i.weight for i in eset][-1] >= 1.9, True, "test get_eset() min_wt")

    # Check QueryParser parsing error.
    qp = xapian.QueryParser()
    expect_exception(xapian.QueryParserError,
                     "Syntax: <expression> AND <expression>",
                     qp.parse_query, b"test AND")

    # Check QueryParser pure NOT option
    qp = xapian.QueryParser()
    expect_query(qp.parse_query(b"NOT test", qp.FLAG_BOOLEAN + qp.FLAG_PURE_NOT),
                 "(<alldocuments> AND_NOT test@1)")

    # Check QueryParser partial option
    qp = xapian.QueryParser()
    qp.set_database(db)
    qp.set_default_op(xapian.Query.OP_AND)
    qp.set_stemming_strategy(qp.STEM_SOME)
    qp.set_stemmer(xapian.Stem(b'en'))
    expect_query(qp.parse_query(b"foo o", qp.FLAG_PARTIAL),
                 "(Zfoo@1 AND ((SYNONYM WILDCARD OR o) OR Zo@2))")
    expect_query(qp.parse_query(b"foo outside", qp.FLAG_PARTIAL),
                 "(Zfoo@1 AND ((SYNONYM WILDCARD OR outside) OR Zoutsid@2))")

    # Test supplying unicode strings
    expect_query(xapian.Query(xapian.Query.OP_OR, (b'foo', b'bar')),
                 '(foo OR bar)')
    expect_query(xapian.Query(xapian.Query.OP_OR, (b'foo', b'bar\xa3')),
                 '(foo OR bar\\xa3)')
    expect_query(xapian.Query(xapian.Query.OP_OR, (b'foo', b'bar\xc2\xa3')),
                 '(foo OR bar\u00a3)')
    expect_query(xapian.Query(xapian.Query.OP_OR, b'foo', b'bar'),
                 '(foo OR bar)')

    expect_query(qp.parse_query(b"NOT t\xe9st", qp.FLAG_BOOLEAN + qp.FLAG_PURE_NOT),
                 "(<alldocuments> AND_NOT Zt\u00e9st@1)")

    doc = xapian.Document()
    doc.set_data(b"Unicode with an acc\xe9nt")
    doc.add_posting(stem(b"out\xe9r"), 1)
    expect(doc.get_data(), b"Unicode with an acc\xe9nt")
    term = next(doc.termlist()).term
    expect(term, b"out\xe9r")

    # Check simple stopper
    stop = xapian.SimpleStopper()
    qp.set_stopper(stop)
    expect(stop(b'a'), False)
    expect_query(qp.parse_query(b"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2 AND Za@3)")
    stop.add(b'a')
    expect(stop(b'a'), True)
    expect_query(qp.parse_query(b"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2)")

    # Feature test for custom Stopper
    class my_b_stopper(xapian.Stopper):
        def __call__(self, term):
            return term == b"b"

        def get_description(self):
            return "my_b_stopper"

    stop = my_b_stopper()
    expect(stop.get_description(), "my_b_stopper")
    qp.set_stopper(stop)
    expect(stop(b'a'), False)
    expect_query(qp.parse_query(b"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2 AND Za@3)")
    expect(stop(b'b'), True)
    expect_query(qp.parse_query(b"foo bar b", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2)")

    # Test TermGenerator
    termgen = xapian.TermGenerator()
    doc = xapian.Document()
    termgen.set_document(doc)
    termgen.index_text(b'foo bar baz foo')
    expect([(item.term, item.wdf, [pos for pos in item.positer])
            for item in doc.termlist()],
           [(b'bar', 1, [2]), (b'baz', 1, [3]), (b'foo', 2, [1, 4])])

    # Check DateValueRangeProcessor works
    context("checking that DateValueRangeProcessor works")
    qp = xapian.QueryParser()
    vrpdate = xapian.DateValueRangeProcessor(1, 1, 1960)
    qp.add_valuerangeprocessor(vrpdate)
    query = qp.parse_query(b'12/03/99..12/04/01')
    expect(str(query), 'Query(0 * VALUE_RANGE 1 19991203 20011204)')

    # Regression test for bug#193, fixed in 1.0.3.
    context("running regression test for bug#193")
    vrp = xapian.NumberValueRangeProcessor(0, b'$', True)
    a = '$10'
    b = '20'
    slot, a, b = vrp(a, b.encode('utf-8'))
    expect(slot, 0)
    expect(xapian.sortable_unserialise(a), 10)
    expect(xapian.sortable_unserialise(b), 20)

    # Feature test for xapian.FieldProcessor
    context("running feature test for xapian.FieldProcessor")

    class testfieldprocessor(xapian.FieldProcessor):
        def __call__(self, s):
            if s == 'spam':
                raise Exception('already spam')
            return xapian.Query("spam")

    qp.add_prefix('spam', testfieldprocessor())
    qp.add_boolean_prefix('boolspam', testfieldprocessor())
    query = qp.parse_query('spam:ignored')
    expect(str(query), 'Query(spam)')

    # FIXME: This doesn't currently work:
    # expect_exception(Exception, 'already spam', qp.parse_query, 'spam:spam')

    # Regression tests copied from PHP (probably always worked in python, but
    # let's check...)
    context("running regression tests for issues which were found in PHP")

    # PHP overload resolution involving boolean types failed.
    enq.set_sort_by_value(1, True)
    # Regression test - fixed in 0.9.10.1.
    oqparser = xapian.QueryParser()
    oquery = oqparser.parse_query(b"I like tea")

    # Regression test for bug#192 - fixed in 1.0.3.
    enq.set_cutoff(100)

    # Test setting and getting metadata
    expect(db.get_metadata(b'Foo'), b'')
    db.set_metadata(b'Foo', b'Foo')
    expect(db.get_metadata(b'Foo'), b'Foo')
    expect_exception(xapian.InvalidArgumentError,
                     "Empty metadata keys are invalid",
                     db.get_metadata, b'')
    expect_exception(xapian.InvalidArgumentError,
                     "Empty metadata keys are invalid",
                     db.set_metadata, b'', b'Foo')
    expect_exception(xapian.InvalidArgumentError,
                     "Empty metadata keys are invalid",
                     db.get_metadata, b'')

    # Test OP_SCALE_WEIGHT and corresponding constructor
    expect_query(xapian.Query(xapian.Query.OP_SCALE_WEIGHT,
                              xapian.Query(b'foo'), 5),
                 "5 * foo")
# NOTE(review): this is a unittest method; its enclosing TestCase class
# header lies outside this excerpt, so indentation is reconstructed.
    def test_application_details(self):
        """Index the desktop-file fixtures and verify Application/AppDetails.

        Builds a fresh test database from ./data/desktop, then checks the
        StoreDatabase lookups and every AppDetails field against the values
        baked into the test fixtures.
        """
        db = xapian.WritableDatabase("./data/test.db",
                                     xapian.DB_CREATE_OR_OVERWRITE)
        res = update_from_app_install_data(db, self.cache,
                                           datadir="./data/desktop")
        self.assertTrue(res)
        db = StoreDatabase("./data/test.db", self.cache)
        db.open(use_axi=False, use_agent=False)
        # Fixture directory contains exactly five desktop entries.
        self.assertEqual(len(db), 5)
        # test details
        app = Application("Ubuntu Software Center Test", "software-center")
        details = app.get_details(db)
        self.assertNotEqual(details, None)
        self.assertEqual(details.component, "main")
        self.assertEqual(details.pkgname, "software-center")
        # get the first document
        for doc in db:
            if doc.get_data() == "Ubuntu Software Center Test":
                appdetails = AppDetails(db, doc=doc)
                break
        # test get_appname and get_pkgname
        self.assertEqual(db.get_appname(doc), "Ubuntu Software Center Test")
        self.assertEqual(db.get_pkgname(doc), "software-center")
        # test appdetails
        self.assertEqual(appdetails.name, "Ubuntu Software Center Test")
        self.assertEqual(appdetails.pkgname, "software-center")
        # FIXME: add a desktop file with a real channel to test
        # and monkey-patch/modify the APP_INSTALL_CHANNELS_PATH
        self.assertEqual(appdetails.channelname, None)
        self.assertEqual(appdetails.channelfile, None)
        self.assertEqual(appdetails.component, "main")
        self.assertNotEqual(appdetails.pkg, None)
        # from the fake test/data/appdetails/var/lib/dpkg/status
        self.assertEqual(appdetails.pkg.is_installed, True)
        self.assertTrue(appdetails.pkg_state in (PkgStates.INSTALLED,
                                                 PkgStates.UPGRADABLE))
        # FIXME: test description for unavailable pkg
        self.assertTrue(
            appdetails.description.startswith(
                "Ubuntu Software Center lets you"))
        # FIXME: test appdetails.website
        self.assertEqual(appdetails.icon, "softwarecenter")
        # crude, crude
        self.assertTrue(len(appdetails.version) > 2)
        # FIXME: screenshots will only work on ubuntu
        self.assertTrue(
            re.match(
                "http://screenshots.ubuntu.com/screenshot-with-version/software-center/[\d.]+",
                appdetails.screenshot))
        self.assertTrue(
            re.match(
                "http://screenshots.ubuntu.com/thumbnail-with-version/software-center/[\d.]+",
                appdetails.thumbnail))
        # FIXME: add document that has a price
        self.assertEqual(appdetails.price, '')
        self.assertEqual(appdetails.license, "Open source")
        # test lazy history loading for installation date
        self.ensure_installation_date_and_lazy_history_loading(appdetails)
        # test apturl replacements
        # $kernel
        app = Application("", "linux-headers-$kernel", "channel=$distro-partner")
        self.assertEqual(app.pkgname, 'linux-headers-' + os.uname()[2])
        # $distro
        details = app.get_details(db)
        distro = get_distro().get_codename()
        self.assertEqual(app.request, 'channel=' + distro + '-partner')
# NOTE(review): method of a class whose header lies outside this excerpt;
# indentation reconstructed.  Two pre-existing smells are flagged below but
# deliberately left untouched: the loop variable `re` shadows the `re`
# module name, and the runtime string "Tatal" is a typo for "Total".
    def update_xapiandb(self, kwargs):
        """Refresh the client xapian app database.

        Two modes, selected by ``kwargs["pkgname"]``:
          * empty string — bulk mode: fetch apps newer than the timestamp
            stored in the special "XAPIANDB_VERSION" document (value slot 2)
            and add/replace their documents, then bump that timestamp;
          * non-empty — single-package mode: index one .deb (name +
            description from ``kwargs["path"]``) unless already present.
        """
        # Writable handle for updates plus a read-only handle for doccounts.
        database = xapian.WritableDatabase(XAPIAN_DB_PATH, xapian.DB_OPEN)
        DB = xapian.Database(XAPIAN_DB_PATH)
        enquire = xapian.Enquire(database)
        indexer = xapian.TermGenerator()
        if "" == kwargs["pkgname"]:
            modified_num = 0
            add_num = 0
            xapiandb_update = "No"
            # Locate the sentinel document holding the db's last-update time.
            query_xapiandb_version = xapian.Query("the_#ukxapiandb#_version")
            enquire.set_query(query_xapiandb_version)
            matches = enquire.get_mset(0, 1)
            for re in matches:  # NOTE(review): shadows the `re` module
                docid_for_xapiandb_version = re.document.get_docid()
                doc_for_xapiandb_version = re.document
                doc_data = doc_for_xapiandb_version.get_data()
                if (isinstance(doc_data, bytes)):
                    doc_data = doc_data.decode(encoding='utf-8')
                if ("XAPIANDB_VERSION" == doc_data):
                    # value slot 2: xapiandb update time
                    the_latest_update_time = doc_for_xapiandb_version.get_value(2)
                    if (isinstance(the_latest_update_time, bytes)):
                        the_latest_update_time = the_latest_update_time.decode(encoding='utf-8')
                else:
                    # Sentinel missing: fall back to "now" so only future
                    # changes are pulled.
                    the_latest_update_time = time.strftime('%Y-%m-%dT%H:%M:%S', time.localtime())
                    if (Globals.DEBUG_SWITCH):
                        print("Failed to get the latest update time from client xapiandb,use default time.localtime()")
            # Apps changed on the server since the recorded timestamp.
            reslist = self.premoter.newerapp_for_xapianupdate(the_latest_update_time)
            for app in reslist:
                app_name = str(app["app_name"])
                display_name_cn = str(app["display_name_cn"])
                keywords_for_search = str(app["keywords_for_search"])
                query = xapian.Query(app_name)
                enquire.set_query(query)
                doccount = DB.get_doccount()
                matches = enquire.get_mset(0, doccount)
                # flag == 1 means "not found yet" -> add as a new document.
                flag = 1
                if matches.size() != 0:
                    for re in matches:
                        get_name = re.document.get_data()
                        if (isinstance(get_name, bytes)):
                            get_name = get_name.decode(encoding='utf-8')
                        if get_name == app_name:
                            # Existing document: rebuild its terms in place.
                            flag = 0
                            docid = re.docid
                            doc = re.document
                            doc.clear_terms()
                            indexer.set_document(doc)
                            doc.add_term(app_name, 10)
                            if keywords_for_search != "None":
                                keywords = display_name_cn + ";" + keywords_for_search + ";" + app_name
                            else:
                                keywords = display_name_cn + ";" + app_name
                            indexer.index_text(keywords, 10)
                            try:
                                # Optional Chinese word segmentation for terms.
                                from mmseg.search import seg_txt_search, seg_txt_2_dict
                                for word, value in seg_txt_2_dict(keywords).items():
                                    if word != "none":
                                        doc.add_term(word, 10)
                                    else:
                                        pass
                            except:
                                if (Globals.DEBUG_SWITCH):
                                    print("----No mmseg model---")
                            database.replace_document(docid, doc)
                            xapiandb_update = "Yes"
                            modified_num = modified_num + 1
                        else:
                            continue
                if flag:
                    # Not present yet: build and add a brand-new document.
                    doc = xapian.Document()
                    doc.set_data(app_name)
                    doc.add_term(app_name, 10)
                    indexer.set_document(doc)
                    if keywords_for_search != "None":
                        keywords = display_name_cn + ";" + keywords_for_search + ";" + app_name
                    else:
                        keywords = display_name_cn + ";" + app_name
                    indexer.index_text(keywords, 10)
                    try:
                        # Relies on the `from mmseg.search import ...` above
                        # having succeeded earlier; otherwise this raises and
                        # is silently skipped.
                        for word, value in seg_txt_2_dict(keywords).items():
                            if word != "none":
                                doc.add_term(word, 10)
                            else:
                                pass
                    except:
                        pass
                    database.add_document(doc)
                    add_num = add_num + 1
                    if (Globals.DEBUG_SWITCH):
                        print("App:", doc.get_data(), " ", "terms:", end=' ')
                    for itr in doc.termlist():
                        if (Globals.DEBUG_SWITCH):
                            print(itr.term, end=' ')
                    xapiandb_update = "Yes"
                    if (Globals.DEBUG_SWITCH):
                        print(" ")
            try:
                if xapiandb_update == "Yes":
                    # Persist the new timestamp in the sentinel document.
                    now = time.strftime('%Y-%m-%dT%H:%M:%S', time.localtime())
                    doc_for_xapiandb_version.add_value(2, now)
                    database.replace_document(docid_for_xapiandb_version,
                                              doc_for_xapiandb_version)
                    database.commit()
                    if (Globals.DEBUG_SWITCH):
                        print("Xapiandb has updated . %d app modified, %d app add. Tatal: %d app updated" % (modified_num, add_num, len(reslist)))
            except:
                if (Globals.DEBUG_SWITCH):
                    print("The xapian database (/home/ice_bird/.cache/uksc/xapiandb) is crashed,please remove it and install a new one!")
            if (Globals.DEBUG_SWITCH):
                print("update uksc xapiandb over")
        else:
            # Single-package mode: skip if the package is already indexed.
            appinfo_query = xapian.Query(kwargs["pkgname"])
            enquire.set_query(appinfo_query)
            matches = enquire.get_mset(0, DB.get_doccount())
            for re in matches:
                doc_for_appinfo = re.document
                doc_data = doc_for_appinfo.get_data()
                if kwargs["pkgname"] == doc_data:
                    return
            doc = xapian.Document()
            doc.set_data(kwargs["pkgname"])
            doc.add_term(kwargs["pkgname"], 10)
            if (Globals.DEBUG_SWITCH):
                print("debfile path:", kwargs["path"])
            deb = DebFile(kwargs["path"])
            terms = kwargs["pkgname"]
            try:
                terms = terms + " " + deb.description
            except:
                if (Globals.DEBUG_SWITCH):
                    print("Failed to get app description")
            indexer.set_document(doc)
            indexer.index_text(terms)
            database.add_document(doc)
            database.commit()
            if (Globals.DEBUG_SWITCH):
                print("update xapiandb over: ", kwargs["pkgname"], "terms:", end=' ')
            for itr in doc.termlist():
                if (Globals.DEBUG_SWITCH):
                    print(itr.term, end=' ')
            if (Globals.DEBUG_SWITCH):
                print(" ")
# NOTE(review): Python 2 syntax (`except E, name`) preserved byte-identical.
# This excerpt starts inside a method whose `def` and matching `if` lie
# before this chunk; indentation reconstructed.
        else:
            # create a new database
            if not create_allowed:
                raise OSError("Indexer: skipping database creation")
            try:
                # create the parent directory if it does not exist
                parent_path = os.path.dirname(self.location)
                if not os.path.isdir(parent_path):
                    # recursively create all directories up to parent_path
                    os.makedirs(parent_path)
            except IOError, err_msg:
                # Surface directory-creation failures as OSError for callers.
                raise OSError("Indexer: failed to create the parent " \
                        + "directory (%s) of the indexing database: %s" \
                        % (parent_path, str(err_msg)))
            try:
                self.writer = xapian.WritableDatabase(self.location,
                        xapian.DB_CREATE_OR_OPEN)
                self.flush()
            except xapian.DatabaseOpeningError, err_msg:
                raise OSError("Indexer: failed to open or create a xapian " \
                        + "database (%s): %s" % (self.location, str(err_msg)))

    def __del__(self):
        # Drop the read handle and close the writer when collected.
        self.reader = None
        self._writer_close()

    def flush(self, optimize=False):
        """force to write the current changes to disk immediately

        @param optimize: ignored for xapian
        @type optimize: bool
        """
        # NOTE(review): the body of flush() continues beyond this excerpt.
##export XAPIAN_FLUSH_THRESHHOLD=200000; python index_alldoc.py import xapian import time DATA_FILEPATH = "/Users/neesergparajuli/Dropbox/Webtext/Data/wiki-pages-text/" DATABASE_FILEPATH = "/Users/neesergparajuli/Dropbox/Webtext/Data/XxapianDatabase" start = time.time() db = xapian.WritableDatabase(DATABASE_FILEPATH, xapian.DB_CREATE_OR_OPEN) termgenerator = xapian.TermGenerator() termgenerator.set_stemmer(xapian.Stem("en")) for i in range(1, 110): st = "wiki-{:03d}.txt".format(i) cyclestart = time.time() print(st) j = 0 with open(DATA_FILEPATH + st) as file: #Create the databse for line in file: words = line.split(' ') #extract the title from the id id1 = words[0] title = id1.split('_') title = ' '.join(title) title = title.split('-') title = ' '.join(title) #check fact number is given and create doc ID
def __init__(self, root):
    """Remember the index location and open a writable database there.

    DB_CREATE_OR_OPEN reuses an existing index at ``root`` or creates a
    fresh one on first use.
    """
    self.root = root
    self.db = xapian.WritableDatabase(root, xapian.DB_CREATE_OR_OPEN)