def searchconn(self):
    """Open a search connection if there isn't one already open."""
    if self._sconn is None:
        self._sconn = xappy.SearchConnection(self.db_path)
    return self._sconn
def findPMIDsWithSynonyms(self, synonyms):
    if self.__searchConn is None:
        self.__searchConn = xappy.SearchConnection(self.__xapianPath)
    self.__searchConn.reopen()
    xapian_queries = []
    for querystring in synonyms:
        # Quote the synonym so each field is searched for the exact phrase.
        phrase = '"' + querystring + '"'
        for field in ('title', 'text', 'keyword', 'chemical_exact', 'mesh'):
            xapian_queries.append(self.__searchConn.query_field(field, phrase))
    merged_q = self.__searchConn.query_composite(self.__searchConn.OP_OR, xapian_queries)
    results = self.__searchConn.search(merged_q, 0, self.__searchConn.get_doccount())
    return [r.id for r in results]
def search_query(self, keywords):
    sconn = xappy.SearchConnection(self.search_db_dir)
    search = ' '.join(keywords)
    q = sconn.query_parse(search, default_op=sconn.OP_AND)
    results = sconn.search(q, 0, sconn.get_doccount())
    return map(lambda result: result.data["module_uid"][0], results)
def test():
    sconn = xappy.SearchConnection(DBPATH)
    print sconn.get_doccount(), 'documents loaded.'

    def query(qtext):
        q = sconn.query_parse(sconn.spell_correct(qtext), default_op=sconn.OP_AND)
        return [x.data['id'][0] for x in sconn.search(q, 0, 10)]

    assert query('biden joe') == ['joe_biden']
    assert query('barak obma') == ['barack_obama']
    return True
def _get_data(category):
    # load data from xapian categories
    db = datastore.corpus_db()
    sconn = xappy.SearchConnection(settings.XAPIAN_DB)
    q = sconn.query_field('category', category)
    offset = 0
    limit = 1000
    while True:
        results = sconn.search(q, offset, offset + limit)
        # Note: key_from/key_to and the loop's exit condition come from
        # surrounding code that is not part of this fragment.
        for key, doc in db.range(key_from, key_to):
            if not any(uri in doc['url'] for uri in DISCARD_URLS):
                yield doc
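# Hedged sketch (not from the original code): a self-contained variant of the
# generator above that paginates over the xappy results themselves and fetches
# each matching document from the corpus db by result id, as done elsewhere in
# this file.  The function name and page size are illustrative assumptions.
def _get_data_paged(category, page_size=1000):
    db = datastore.corpus_db()
    sconn = xappy.SearchConnection(settings.XAPIAN_DB)
    q = sconn.query_field('category', category)
    offset = 0
    while True:
        hits = list(sconn.search(q, offset, offset + page_size))
        if not hits:
            break
        for hit in hits:
            doc = db[hit.id]
            if not any(uri in doc['url'] for uri in DISCARD_URLS):
                yield doc
        offset += page_size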
def get_search_connection(self, names):
    """Get a SearchConnection for a list of collections.

    :Parameters:
     - `names`: The name of a collection, or a sequence of names of
       collections.

    Returns a SearchConnection object.

    """
    if isinstance(names, types.StringTypes):
        names = (names, )
    assert (len(names) > 0)

    # FIXME - handle the case of a collection not being found more gracefully.
    col = self._collections[names[0]]
    result = xappy.SearchConnection(col.dbpath())
    log.debug('Search connection to %r opened' % (names[0], ))

    # Add the remaining databases to the connection in `result`.
    # FIXME - this should really be done by xappy.  Currently, we have to
    # access the internal _index object and use it directly - which only
    # works properly if all the collections have the same index properties.
    for name in names[1:]:
        col = self._collections[name]
        result._index.add_database(xapian.Database(col.dbpath()))
        log.debug('Added %r to search connection' % (name, ))

    self._handle_count_condition.acquire()
    try:
        # Register the callback here.
        result.append_close_handler(self._search_connection_closed, names)
        # Now, increment the count of handles.
        for name in names:
            newcount = self._handle_count.get(name, 0) + 1
            self._handle_count[name] = newcount
            log.debug('New connection count for %r is %d' % (name, newcount))
    finally:
        self._handle_count_condition.release()
    return result
def search_query(self, keywords):
    '''Search the package index for the given keywords.'''
    # Init search connection.
    search_db_path = os.path.join(UPDATE_DATA_DIR, "search", "zh_CN", "search_db")
    sconn = xappy.SearchConnection(search_db_path)

    # Do search.
    search = ' '.join(keywords).lower()
    q = sconn.query_parse(search, default_op=sconn.OP_AND)
    results = sconn.search(q, 0, sconn.get_doccount(), sortby="have_desktop_file")
    all_results = map(lambda result: result.data["pkg_name"][0], results)

    # Also include packages whose names directly match the input keywords.
    for keyword in keywords:
        match_names = self.get_pkgs_match_input(keyword)
        for name in match_names:
            if name not in all_results:
                all_results.append(name)

    return all_results
def main(args):
    # search corpus index
    db = datastore.corpus_db()
    sconn = xappy.SearchConnection(settings.XAPIAN_DB)

    query = ' '.join(args.query)
    print "Search {} documents for '{}'".format(sconn.get_doccount(), args.query)
    q = sconn.query_parse(query, default_op=sconn.OP_AND)

    if args.category:
        qc = q.compose(q.OP_OR,
                       [sconn.query_field('category', c) for c in args.category])
        q = q & qc
    if args.date:
        qd = q.compose(q.OP_OR,
                       [sconn.query_field('date', d) for d in args.date])
        q = q & qd
    if args.date_start and args.date_end:
        qr = sconn.query_range('date', args.date_start, args.date_end)
        q = q.filter(qr)

    if args.sort:
        sortby = [tuple(args.sort.split(','))]
    else:
        sortby = None

    print 'Query: {!r}'.format(q)
    results = execute_query(sconn, q, args.offset, args.limit,
                            getfacets=args.facet,
                            allowfacets=('category', ),
                            sortby=sortby)

    if results.estimate_is_exact:
        print "Found {} results".format(results.matches_estimated)
    else:
        print "Found approximately {} results".format(results.matches_estimated)

    for i, result in enumerate(results, 1):
        doc = db[result.id]
        try:
            cat = result.get_terms('category').next()
        except StopIteration:
            cat = 'none'
        try:
            date = result.get_terms('date').next()
        except StopIteration:
            date = 'none'
        print "{:2}. {} -- {} -- {}\n\t{}\n\t{}\n".format(
            i, cat, doc['headline'], date, doc['url'], result.id)

    from IPython import embed
    embed()
#!/usr/bin/env python
# -*- coding: UTF-8 -*-

"""
Copyright (c) 2014, Kersten Doering <*****@*****.**>, Bjoern Gruening <*****@*****.**>
"""

# Kersten Doering 04.06.2014
# check https://github.com/miracle2k/xappy/blob/master/docs/introduction.rst for nice examples

import xappy

searchConn = xappy.SearchConnection("xapian/xapian2015")
searchConn.reopen()

#########################

querystring = "pancreatic"

q = searchConn.query_field('title', querystring)
print "search query: ", q

# save all matching documents in "results" (starting with rank 0 - check help
# documentation of function "search")
results = searchConn.search(q, 0, searchConn.get_doccount())
print "number of matches: ", results.matches_estimated

### debug: ###
# print first 5 titles with highlight function and save first 1000 titles in an HTML file
#print "### first 5 hits: ###"
#print "Rank\tPubMed-ID\tTitle (query term highlighted)"
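# A minimal sketch of the debug step described in the comments above (not part
# of the original script): print the first 5 hits with the query term
# highlighted.  It assumes the 'title' field was stored (STORE_CONTENT) when
# the index was built so that summarise() has text to work with; the highlight
# tags are arbitrary.
for hit in searchConn.search(q, 0, 5):
    print hit.rank, "\t", hit.id, "\t", hit.summarise('title', hl=('<b>', '</b>'))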
def main():
    tornado.options.parse_command_line()

    from apps.main.models import User
    from apps.questions.models import Question, Genre
    from mongokit import Connection
    con = Connection()
    con.register([Question, Genre, User])
    db = con.gkc

    if options.reindex_all:
        since = datetime.datetime(1979, 12, 13)
    else:
        since = options.since
        if not since:
            since = default_since
        try:
            since = datetime.datetime.strptime(since, '%Y-%m-%d %H-%M-%S')
        except ValueError:
            since = datetime.datetime.strptime(since, '%Y-%m-%d')

    if options.verbose:
        print 'since', since

    genres = {}
    authors = {}
    count = 0
    search = {'modify_date': {'$gt': since}}
    if not db.Question.find(search).count():
        if options.verbose:
            print "0 questions"
        if not options.test:
            return

    youngest = since
    indexer = xappy.IndexerConnection(settings.XAPIAN_LOCATION)
    if not indexer.get_fields_with_actions() or options.update_fields:
        indexer.add_field_action('question',
                                 xappy.FieldActions.INDEX_FREETEXT,
                                 weight=2,
                                 language='en',
                                 spell=True,
                                 stop=stopwords)
        indexer.add_field_action('answer',
                                 xappy.FieldActions.INDEX_FREETEXT,
                                 language='en',
                                 spell=True)
        indexer.add_field_action('accept',
                                 xappy.FieldActions.INDEX_FREETEXT,
                                 language='en',
                                 spell=True)
        indexer.add_field_action('alternatives',
                                 xappy.FieldActions.INDEX_FREETEXT,
                                 language='en',
                                 spell=True)
        indexer.add_field_action('author', xappy.FieldActions.INDEX_EXACT)
        indexer.add_field_action('genre', xappy.FieldActions.INDEX_EXACT)
        indexer.add_field_action('comment',
                                 xappy.FieldActions.INDEX_FREETEXT,
                                 language='en',
                                 spell=False,
                                 search_by_default=False,
                                 stop=stopwords)
        indexer.add_field_action('date', xappy.FieldActions.SORTABLE, type="date")
        indexer.add_field_action('state', xappy.FieldActions.SORTABLE)
        indexer.add_field_action('question', xappy.FieldActions.STORE_CONTENT)
        indexer.add_field_action('answer', xappy.FieldActions.STORE_CONTENT)
        indexer.add_field_action('genre', xappy.FieldActions.STORE_CONTENT)
        indexer.add_field_action('state', xappy.FieldActions.STORE_CONTENT)

    t0 = time.time()
    for question in db.Question.collection.find(search):
        if question['modify_date'] > youngest:
            youngest = question['modify_date']
        doc = xappy.UnprocessedDocument()
        doc.fields.append(xappy.Field('state', question['state']))
        doc.fields.append(xappy.Field('question', question['text']))
        doc.fields.append(xappy.Field('answer', question['answer']))
        if question['genre'].id in genres:
            genre = genres[question['genre'].id]
        else:
            genre = db.Genre.one({'_id': question['genre'].id})
            genre = genre.name
            genres[question['genre'].id] = genre
        doc.fields.append(xappy.Field('genre', genre))
        if question['author'].id in authors:
            author = authors[question['author'].id]
        else:
            author = db.User.one({'_id': question['author'].id})
            author = author.username
            authors[question['author'].id] = author
        doc.fields.append(xappy.Field('author', author))
        doc.fields.append(xappy.Field('comment', question['comment']))
        doc.fields.append(xappy.Field('accept', '\n'.join(question['accept'])))
        doc.fields.append(
            xappy.Field('alternatives', '\n'.join(question['alternatives'])))
        doc.id = str(question['_id'])
        pdoc = indexer.process(doc)
        indexer.replace(pdoc)
        count += 1
        #if count and not count % 100:
        #    indexer.flush()

    # add a second to avoid milliseconds causing the same doc to be indexed over and over
    youngest += datetime.timedelta(seconds=1)
    open(since_filename, 'w').write(youngest.strftime('%Y-%m-%d %H-%M-%S\n'))

    indexer.flush()
    t1 = time.time()
    indexer.close()
    if options.verbose:
        print round(t1 - t0, 3), "seconds to index", count, "questions"

    # test
    if options.test:
        print settings.XAPIAN_LOCATION
        searcher = xappy.SearchConnection(settings.XAPIAN_LOCATION)
        text = 'FRAMCEs capitalls'
        text = "Capitol STATE"
        print searcher.spell_correct(text)
        query = searcher.query_field('question', text, default_op=searcher.OP_OR)
        results = searcher.search(query, 0, 10)
        print results.matches_estimated
        #print results.estimate_is_exact
        for result in results:
            print result.rank, result.id
            print repr(result.summarise('question')), result.data['state'][0]
            #result.data['state']

        text = 'london'
        query = searcher.query_field('answer', text, default_op=searcher.OP_OR)
        results = searcher.search(query, 0, 10)
        print results.matches_estimated
        #print results.estimate_is_exact
        for result in results:
            print result.rank, result.id
            print repr(result.summarise('question')), result.data['state'][0]
# log start time
start = time.asctime()

# set options verbose, output and whether PostgreSQL should be used to display results
output = True
verbose = True
debug = False
use_psql = True

# get path to this script
root = os.getcwd()

# search connection to Xapian full text index
xapianPath = os.path.join(root, "xapian_PMC_complete")
searchConn = xappy.SearchConnection(xapianPath)
searchConn.reopen()

# get PMC texts from PostgreSQL
def get_text(pmcid):
    stmt = """
        SELECT text
        FROM public.tbl_pmcid_text
        WHERE pmcid = %s
        ;
        """
    cursor.execute(stmt, (pmcid, ))
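# Hedged note (not in the original fragment): `cursor` above is assumed to be a
# psycopg2 cursor created elsewhere in the script.  A complete get_text() would
# typically also fetch and return the result, roughly:
#
#     row = cursor.fetchone()
#     return row[0] if row else None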
def searcher(self):
    path = os.path.join(self.options.dir, "%s_xappy" % self.options.indexname)
    return xappy.SearchConnection(path)
def query(s):
    sconn = xappy.SearchConnection(DBPATH)
    q = sconn.query_parse(sconn.spell_correct(s), default_op=sconn.OP_AND)
    return [x.data['id'][0] for x in sconn.search(q, 0, 10)]
def create_index(self):
    self.iconn = xappy.IndexerConnection(self.dbpath)
    self.sconn = xappy.SearchConnection(self.dbpath)
    # keys are filtered package names or "_last_run_"
    self.iconn.add_field_action('key', xappy.FieldActions.INDEX_EXACT)
def search(s, page=0):
    conn = xappy.SearchConnection(config.search_db)
    q = conn.query_parse(conn.spell_correct(s))
    result = conn.search(q, page * 20, page * 20 + 20)
    return result.matches_estimated, [x.data for x in result]
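# Hedged usage sketch (not from the original): page numbers are zero-based, so
# page=1 returns hits 20-39; `config.search_db` comes from the helper above and
# the query string here is only illustrative.
if __name__ == '__main__':
    estimated, hits = search('example query', page=1)
    print estimated, 'matches (estimated)'
    for data in hits:
        print data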
def get_context_data(self):
    # Get the query text
    q = self.request.GET.get('q', '')
    # Get the offset value
    try:
        offset = int(self.request.GET.get('offset', '0'))
        if offset < 0:
            offset = 0
    except ValueError:
        offset = 0
    # Is it a special search?
    special_value = self.request.redis_conn.get("special_search:%s:%s" % (
        self.request.mission.name,
        q,
    ))
    if special_value:
        self.template_name = "search/special.html"
        return {
            "q": q,
            "text": special_value,
        }
    # Get the results from Xapian
    db = xappy.SearchConnection(
        os.path.join(
            settings.SITE_ROOT,
            '..',
            "xappydb",
        ),
    )
    query = db.query_parse(
        q,
        default_op=db.OP_OR,
        deny=["mission"],
    )
    query = db.query_filter(
        query,
        db.query_composite(db.OP_AND, [
            db.query_field("mission", self.request.mission.name),
            db.query_field("transcript", self.request.mission.main_transcript),
        ]))
    results = db.search(
        query=query,
        startrank=offset,
        endrank=offset + PAGESIZE,
        checkatleast=-1,  # everything (entire xapian db fits in memory, so this should be fine)
        sortby="-weight",
    )
    # Go through the results, building a list of LogLine objects
    redis_conn = self.request.redis_conn
    log_lines = []
    for result in results:
        transcript_name, timestamp = result.id.split(":", 1)
        log_line = LogLine(redis_conn, transcript_name, int(timestamp))
        log_line.speaker = Character(redis_conn,
                                     transcript_name.split('/')[0],
                                     result.data['speaker_identifier'][0])
        log_line.title = mark_safe(
            result.summarise("text",
                             maxlen=50,
                             ellipsis='…',
                             strict_length=True,
                             hl=None))
        log_line.summary = mark_safe(
            result.summarise("text",
                             maxlen=600,
                             ellipsis='…',
                             hl=('<mark>', '</mark>')))
        log_lines.append(log_line)

    def page_url(offset):
        return reverse("search") + '?' + urllib.urlencode({
            'q': q.encode('utf-8'),
            'offset': offset,
        })

    if offset == 0:
        previous_page = False
    else:
        previous_page = page_url(offset - PAGESIZE)

    if offset + PAGESIZE > results.matches_estimated:
        next_page = False
    else:
        next_page = page_url(offset + PAGESIZE)

    thispage = offset / PAGESIZE
    maxpage = results.matches_estimated / PAGESIZE

    pages_to_show = set([0]) | set([thispage - 1, thispage, thispage + 1]) | set([maxpage])
    if 0 == thispage:
        pages_to_show.remove(thispage - 1)
    if maxpage == thispage:
        pages_to_show.remove(thispage + 1)
    pages = []

    class Page(object):
        def __init__(self, number, url, selected=False):
            self.number = number
            self.url = url
            self.selected = selected

    pages_in_order = list(pages_to_show)
    pages_in_order.sort()
    for page in pages_in_order:
        if len(pages) > 0 and page != pages[-1].number:
            pages.append('...')
        pages.append(Page(page + 1, page_url(page * PAGESIZE), page == thispage))

    error_info = self.request.redis_conn.hgetall(
        "error_page:%s:%s" % (
            self.request.mission.name,
            'no_search_results',
        ),
    )
    if not error_info:
        error_info = {}
    if 'classic_moment_quote' in error_info:
        error_quote = LogLine(
            self.request.redis_conn,
            self.request.mission.main_transcript,
            timestamp_to_seconds(error_info['classic_moment_quote']))
    else:
        error_quote = None

    return {
        'log_lines': log_lines,
        'result': results,
        'q': q,
        'previous_page': previous_page,
        'next_page': next_page,
        'pages': pages,
        'debug': {
            'query': query,
        },
        'error': {
            'info': error_info,
            'quote': error_quote,
        },
    }
def testFileTypeSearch(self):
    conn = xappy.SearchConnection(self.col.dbpath())
    res = conn.search(conn.query_field('filetype', 'htm'), 0, 10)
    results = [r for r in res]
    self.assertEqual(len(results), 1)