def __init__(self, path=None, name='master_timeline_weibo', stub=None, include_remote=False, schema=Schema, schema_version=SCHEMA_VERSION):
    """Open the database(s) to search and prepare a reusable Enquire object.

    Args:
        path: directory scanned for databases when ``stub`` is falsy; every
            entry whose name starts with ``'_' + name`` is opened.
        name: database name used to build that ``'_<name>'`` prefix.
        stub: optional stub source. May be a list of stub files, a single
            stub file, or a directory whose entries are all opened as stubs.
        include_remote: stored on the instance as-is; not used here.
        schema: object exposing one attribute per schema version, named
            ``'v<schema_version>'``.
        schema_version: selects which attribute of ``schema`` to use.

    Raises:
        ValueError: if ``stub`` is truthy but is neither a list, an existing
            file nor an existing directory.
        TypeError: raised by ``reduce`` when no database at all is found.
    """
    def merge(db1, db2):
        # Fold db2 into db1 so the reduce() below yields one combined database.
        db1.add_database(db2)
        return db1

    if stub:
        if isinstance(stub, list):
            # A list is assumed to contain only stub files.
            stub_files = stub
        elif os.path.isfile(stub):
            stub_files = [stub]
        elif os.path.isdir(stub):
            stub_files = [os.path.join(stub, p) for p in os.listdir(stub)]
        else:
            # Previously this case left self.database unset and the
            # constructor failed later with an unrelated AttributeError.
            raise ValueError(
                'stub %r is neither a list, a file nor a directory' % (stub,))
        # With a single element reduce() returns it without calling merge().
        self.database = reduce(merge, map(_stub_database, stub_files))
    else:
        prefix = '_%s' % name
        db_paths = [os.path.join(path, p) for p in os.listdir(path)
                    if p.startswith(prefix)]
        self.database = reduce(merge, map(_database, db_paths))

    self.schema = getattr(schema, 'v%s' % schema_version)

    enquire = xapian.Enquire(self.database)
    # Use the simplest weighting model to improve efficiency.
    enquire.set_weighting_scheme(xapian.BoolWeight())
    # The order of documents in the mset does not matter.
    enquire.set_docid_order(xapian.Enquire.DONT_CARE)
    if 'collapse_valueno' in self.schema:
        enquire.set_collapse_key(self.schema['collapse_valueno'])

    self.enquire = enquire
    self.include_remote = include_remote
def select_weight(option):
    """Return a freshly constructed Xapian weighting scheme for ``option``.

    Args:
        option: integer code in the range 0-14 selecting the scheme (see the
            table in the body for the code-to-class mapping).

    Returns:
        A new instance of the selected ``xapian`` weight class.

    Raises:
        ValueError: if ``option`` is not one of the supported codes.
    """
    # Factories are lazy: only the requested class is looked up on the xapian
    # module and instantiated, exactly as the former if/elif chain did.
    factories = {
        0: lambda: xapian.BB2Weight(1.0),
        1: lambda: xapian.BM25PlusWeight(1.0, 0, 1.0, 0.5, 0.5, 1.0),
        2: lambda: xapian.BM25Weight(1.0, 0.0, 1.0, 0.5, 0.3),
        3: lambda: xapian.BoolWeight(),
        4: lambda: xapian.CoordWeight(),
        5: lambda: xapian.DLHWeight(),  # maybe some problem
        6: lambda: xapian.DPHWeight(),
        7: lambda: xapian.IfB2Weight(1),
        8: lambda: xapian.IneB2Weight(1),
        9: lambda: xapian.InL2Weight(1),
        # the second parameter is TWO_STAGE_SMOOTHING
        10: lambda: xapian.LMWeight(0.0, 1, -1.0, -1.0),
        11: lambda: xapian.PL2PlusWeight(1, 0.8),
        12: lambda: xapian.PL2Weight(1),
        13: lambda: xapian.TfIdfWeight("ntn"),
        14: lambda: xapian.TradWeight(1.0),
    }
    try:
        factory = factories[option]
    except (KeyError, TypeError):
        # KeyError: unknown code; TypeError: unhashable option. Previously
        # both cases surfaced as an UnboundLocalError on ``return bm``.
        raise ValueError(
            "unknown weighting option %r (expected an integer 0-14)" % (option,))
    return factory()
def Search(self, command, blacklist=None, include=None, since=0):
    """Run ``command`` against the index and split hits into feeds and entries.

    Args:
        command: query string passed to ``xapian.QueryParser.parse_query``.
        blacklist: feed ids whose hits are discarded; ``None`` means none.
        include: kept for interface compatibility; ``None`` stands for
            ``['feeds', 'entries']``. It is not consulted by this method.
        since: entries whose DATE value divided by 1000 is not greater than
            this number are skipped.

    Returns:
        A tuple ``(feed_results, entry_results)``. ``feed_results`` is a list
        of integer feed ids; ``entry_results`` is a list of
        ``(entry_id, title, fakedate, feed_id)`` tuples. Both lists are empty
        when the index lock is currently held.

    NOTE(review): the previous docstring said title results come first and
    description results second. No such sort is applied here; hits are
    appended in the order the match set yields them.
    """
    if not self._index_lock.acquire(False):
        # if we are indexing, don't try to search
        return ([], [])
    # The lock was taken only to probe it; give it back straight away.
    self._index_lock.release()

    if blacklist is None:
        blacklist = []
    if include is None:
        include = ['feeds', 'entries']

    database = xapian.Database(self._storeDir)
    enquire = xapian.Enquire(database)

    qp = xapian.QueryParser()
    qp.set_stemmer(xapian.Stem("english"))
    qp.set_database(database)
    qp.set_stemming_strategy(xapian.QueryParser.STEM_SOME)

    enquire.set_docid_order(xapian.Enquire.DESCENDING)
    enquire.set_weighting_scheme(xapian.BoolWeight())

    feed_results = []
    entry_results = []

    enquire.set_query(qp.parse_query(command))
    matches = enquire.get_mset(0, 100)
    for m in matches:
        doc = m.document
        feed_id = int(doc.get_value(FEED_ID))
        try:
            if feed_id not in blacklist:
                entry_id = doc.get_value(ENTRY_ID)
                # An empty ENTRY_ID value means the document is a feed. The
                # old ``is ''`` test compared identity, which only worked by
                # accident of string interning.
                if not entry_id:
                    feed_results.append(feed_id)
                else:
                    title = doc.get_value(ENTRY_TITLE)
                    fakedate = float(doc.get_value(DATE)) / 1000.0
                    if fakedate > since:
                        entry_results.append(
                            (int(entry_id), title, fakedate, feed_id))
        except Exception as e:
            # Best effort: report the bad document and keep going.
            print(e)
            print(feed_id)
            print(blacklist)

    # The lists were previously built and then dropped (implicit None).
    return (feed_results, entry_results)
def search(self, query=None, sort_by=None, start_offset=0, max_offset=None, fields=None, count_only=False, **kwargs):
    """Execute a query and return the hit count plus a lazy result producer.

    Args:
        query: raw query, converted with ``self.parse_query``.
        sort_by: when truthy, forwarded to ``self._set_sort_by``.
        start_offset: passed to ``self._get_enquire_mset`` as the start.
        max_offset: passed to ``self._get_enquire_mset``; when falsy it
            becomes ``doccount - start_offset``.
        fields: ``None`` yields each unpacked document as-is; otherwise each
            yielded dict holds only the listed fields, where ``'terms'`` is
            built from the document's term list, not its stored data.
        count_only: when true, return ``self._get_hit_count(...)`` directly.
        **kwargs: accepted and ignored.

    Returns:
        ``(0, <callable returning []>)`` for an empty query, the value of
        ``self._get_hit_count`` when ``count_only`` is set, otherwise
        ``(mset.size(), result_generator)`` where ``result_generator`` is a
        generator function that has not been called yet.
    """
    parsed = self.parse_query(query)
    if xapian.Query.empty(parsed):
        return 0, lambda: []

    db = self.database
    enquire = xapian.Enquire(db)
    # Use the simplest weighting model to improve efficiency.
    enquire.set_weighting_scheme(xapian.BoolWeight())
    # The order of documents in the mset does not matter.
    enquire.set_docid_order(xapian.Enquire.DONT_CARE)
    enquire.set_query(parsed)

    if 'collapse_valueno' in self.schema:
        enquire.set_collapse_key(self.schema['collapse_valueno'])

    if count_only:
        return self._get_hit_count(db, enquire)

    if sort_by:
        self._set_sort_by(enquire, sort_by)

    limit = max_offset if max_offset else db.get_doccount() - start_offset

    mset = self._get_enquire_mset(db, enquire, start_offset, limit)
    # Fetch up front to speed up access to remote databases.
    mset.fetch()

    def text_terms(document):
        # Map each 'XTEXT'-prefixed term (prefix stripped) to its wdf.
        return {entry.term[5:]: entry.wdf
                for entry in document.termlist()
                if entry.term.startswith('XTEXT')}

    def result_generator():
        # True when nothing beyond 'terms' is requested (including fields
        # == []), in which case the stored document data is never unpacked.
        terms_only = fields is not None and set(fields) <= set(['terms'])
        for match in mset:
            if terms_only:
                # With fields == [] every yielded item is an empty dict.
                record = {}
                if 'terms' in fields:
                    record['terms'] = text_terms(match.document)
            else:
                stored = msgpack.unpackb(
                    self._get_document_data(db, match.document))
                if fields is None:
                    record = stored
                else:
                    record = {}
                    for field in fields:
                        if field == 'terms':
                            record['terms'] = text_terms(match.document)
                        else:
                            record[field] = stored.get(field)
            yield record

    return mset.size(), result_generator
def XapLookup(query, xapian_file="../../undata/xapdex.db/", max_results=500):
    """Look up ``query`` in a Xapian index and return the stored data of hits.

    Args:
        query: query string handed to ``xapian.QueryParser.parse_query``.
        xapian_file: path of the Xapian database to open. Defaults to the
            location that used to be hard-coded.
        max_results: upper bound on the number of hits fetched. Defaults to
            the former hard-coded limit of 500.

    Returns:
        A list with the ``get_data()`` payload of each matching document, in
        the order produced by sorting on value slot 0.
    """
    import xapian

    xapian_db = xapian.Database(xapian_file)
    xapian_enquire = xapian.Enquire(xapian_db)

    xapian_query = xapian.QueryParser()
    xapian_query.set_stemming_strategy(xapian.QueryParser.STEM_NONE)
    xapian_query.set_default_op(xapian.Query.OP_AND)

    # Field name -> term prefix, registered in this order.
    boolean_prefixes = (
        ("id", "I"),
        ("subid", "J"),
        ("class", "C"),
        ("name", "S"),
        ("nation", "N"),
        ("language", "L"),
        ("document", "D"),
        ("reference", "R"),
        ("date", "E"),
        ("agenda", "A"),
        ("vote", "V"),
        ("session", "Z"),
    )
    for field, prefix in boolean_prefixes:
        xapian_query.add_boolean_prefix(field, prefix)

    # Stop words in scraper/xapdex.py must match those here
    xapian_stopper = xapian.SimpleStopper()
    xapian_stopper.add('the')
    # NOTE: range() stops before ord('z'), so 'z' and every pair containing
    # 'z' are not stop words. Left as-is on purpose: changing it here alone
    # would break the required match with the indexer's list.
    for letter1 in range(ord('a'), ord('z')):
        xapian_stopper.add(chr(letter1))
        for letter2 in range(ord('a'), ord('z')):
            xapian_stopper.add(chr(letter1) + chr(letter2))
    xapian_query.set_stopper(xapian_stopper)

    # allows wildcards
    # NOTE(review): presumably FLAG_WILDCARD | FLAG_LOVEHATE | FLAG_PHRASE |
    # FLAG_BOOLEAN -- confirm against the QueryParser flag values.
    parsed_query = xapian_query.parse_query(query, 16 + 4 + 2 + 1)

    xapian_enquire.set_query(parsed_query)
    # NOTE(review): the second argument is presumably a boolean in Xapian's
    # API; passing Enquire.ASCENDING relies on its truthiness -- confirm the
    # resulting sort direction for the installed version.
    xapian_enquire.set_sort_by_value(0, xapian.Enquire.ASCENDING)
    xapian_enquire.set_weighting_scheme(xapian.BoolWeight())

    matches = xapian_enquire.get_mset(0, max_results)

    # Index 4 of a match is used as the document (it exposes get_data()).
    return [match[4].get_data() for match in matches]
def search(query, active_element, numresults=10):
    """Search the database at ``dbpath`` and return decoded document data.

    Args:
        query: mapping of field name to whitespace-separated search text.
        active_element: field name whose clauses are placed last in the
            generated query string.
        numresults: maximum number of hits to return.

    Returns:
        A list with the JSON-decoded ``get_data()`` payload of each hit.

    The database path and the field-to-prefix mapping are read from the
    enclosing scope (``dbpath`` and ``fields``).
    """
    # Put active_element at end of query: False sorts before True and
    # sorted() is stable, so all other fields keep their relative order.
    ordered_fields = sorted(query, key=lambda name: name == active_element)

    # XXX There should be a way to do this without going through an
    # intermediate string, and without adding prefixes.
    clauses = []
    for name in ordered_fields:
        for word in query[name].split():
            clauses.append('%s:%s' % (name, word))
    querystring = ' AND '.join(clauses)

    db = xapian.Database(dbpath)

    parser = xapian.QueryParser()
    parser.set_database(db)
    for name, abbrev in fields.items():
        parser.add_prefix(name, abbrev)

    flags = parser.FLAG_BOOLEAN | parser.FLAG_PARTIAL | parser.FLAG_WILDCARD
    parsed = parser.parse_query(querystring, flags)

    enquire = xapian.Enquire(db)
    enquire.set_weighting_scheme(xapian.BoolWeight())
    enquire.set_query(parsed)

    results = []
    for hit in enquire.get_mset(0, numresults):
        results.append(json.loads(hit.document.get_data()))
    return results
xapian_query.set_stemming_strategy(xapian.QueryParser.STEM_NONE) xapian_query.set_default_op(xapian.Query.OP_AND) xapian_query.add_boolean_prefix("id", "I") xapian_query.add_boolean_prefix("subid", "J") xapian_query.add_boolean_prefix("class", "C") xapian_query.add_boolean_prefix("name", "S") xapian_query.add_boolean_prefix("nation", "N") xapian_query.add_boolean_prefix("language", "L") xapian_query.add_boolean_prefix("document", "D") xapian_query.add_boolean_prefix("reference", "R") xapian_query.add_boolean_prefix("date", "E") xapian_query.add_boolean_prefix("agenda", "A") xapian_query.add_boolean_prefix("vote", "V") xapian_query.add_boolean_prefix("session", "Z") parsed_query = xapian_query.parse_query(query, 16 + 4 + 2 + 1) # allows wildcards print "desc:", parsed_query.get_description() xapian_enquire.set_query(parsed_query) xapian_enquire.set_sort_by_value(0, xapian.Enquire.ASCENDING) xapian_enquire.set_weighting_scheme(xapian.BoolWeight()) # do sorting etc. here matches = xapian_enquire.get_mset(0, 500) print matches.size() for match in matches: #print match print match[4].get_value(0), match[4].get_data()