def tearDown(self):
    """Tear down the fixture: remove files, drop the fulltext index and
    empty the triple store."""
    # Clearing this flag makes sure super.tearDown deletes all files.
    self.setupclass = False
    super(AdvancedAPI, self).tearDown()
    # Drop the fulltext index entirely...
    fti = FulltextIndex.connect(self.indextype, self.indexlocation,
                                [DocumentRepository()])
    fti.destroy()
    # ...and clear out the triple store.
    store = TripleStore.connect(self.storetype, self.storelocation,
                                self.storerepository)
    store.clear()
def query(self, environ):
    """Run a fulltext query described by the WSGI *environ* and return an
    OpenSearch-style result dict."""
    # this is needed -- but the connect call shouldn't neccesarily
    # have to call exists() (one HTTP call)
    index = FulltextIndex.connect(self.config.indextype,
                                  self.config.indexlocation,
                                  self.repos)
    querystring = environ['QUERY_STRING']
    q, param, pagenum, pagelen, stats = self.parse_parameters(querystring,
                                                              index)
    ac_query = querystring.endswith("_ac=true")
    hits, pager = index.query(q=q,
                              pagenum=pagenum,
                              pagelen=pagelen,
                              ac_query=ac_query,
                              exclude_types=environ.get('exclude_types', None),
                              boost_types=environ.get('boost_types', None),
                              **param)
    items = self.mangle_results(hits, ac_query)
    # 3.1 create container for results
    result = {"startIndex": pager['firstresult'] - 1,
              "itemsPerPage": int(param.get('_pageSize', '10')),
              "totalResults": pager['totalresults'],
              "duration": None,  # none
              "current": environ['PATH_INFO'] + "?" + querystring,
              "items": items}
    # 4. add stats, maybe
    if stats:
        result["statistics"] = self.stats(items)
    return result
def _search_run_query(self, queryparams, boost_repos=None):
    """Execute a fulltext search for *queryparams*, returning the tuple
    (results, pager)."""
    index = FulltextIndex.connect(self.config.indextype,
                                  self.config.indexlocation,
                                  self.repos)
    query = queryparams.get('q')
    if isinstance(query, bytes):  # happens on py26
        query = query.decode("utf-8")  # pragma: no cover
    # NOTE: appending "*" for a prefix query was considered (so that eg
    # "personuppgiftslag" would match a label field containing
    # "personuppgiftslag (1998:204)"), but it conflicts with
    # stemming/indexing -- "bulvanutredningen*" doesn't match the indexed
    # "bulvanutredningen" (stemmed to "bulvanutredning") -- so it's left out.
    pagenum = int(queryparams.get('p', '1'))
    params = dict(queryparams)
    # we've changed a parameter name in our internal API:s from "type" to
    # "repo" since ElasticSearch 7.x doesn't have types anymore (and the
    # corresponding data is now stored in a "repo" field), but we haven't
    # changed our URL parameters (yet). In the meantime, map the external
    # type parameter to the internal repo parameter.
    if 'type' in params:
        params["repo"] = params.pop("type")
    # 'q' and 'p' are passed explicitly, not as field filters
    params.pop('q', None)
    params.pop('p', None)
    res, pager = index.query(query, pagenum=pagenum,
                             boost_repos=boost_repos, **params)
    return res, pager
def _search_run_query(self, queryparams, boost_types=None):
    """Execute a fulltext search for *queryparams*, returning the tuple
    (results, pager)."""
    index = FulltextIndex.connect(self.config.indextype,
                                  self.config.indexlocation,
                                  self.repos)
    query = queryparams.get('q')
    if isinstance(query, bytes):  # happens on py26
        query = query.decode("utf-8")  # pragma: no cover
    # NOTE: appending "*" for a prefix query was considered (so that eg
    # "personuppgiftslag" would match a label field containing
    # "personuppgiftslag (1998:204)"), but it conflicts with
    # stemming/indexing -- "bulvanutredningen*" doesn't match the indexed
    # "bulvanutredningen" (stemmed to "bulvanutredning") -- so it's left out.
    pagenum = int(queryparams.get('p', '1'))
    params = dict(queryparams)
    # 'q' and 'p' are passed explicitly, not as field filters
    params.pop('q', None)
    params.pop('p', None)
    res, pager = index.query(query, pagenum=pagenum,
                             boost_types=boost_types, **params)
    return res, pager
def test_create(self):
    """Check low-level, Whoosh-specific details of index creation."""
    # run the generic checks first
    super(WhooshBasicIndex, self).test_create()
    # then do more low-level tests
    # 1. some files should have been created at the specified location
    self.assertNotEqual(os.listdir(self.location), [])
    # 2. the location should be recognizable as a whoosh index
    self.assertTrue(whoosh.index.exists_in(self.location))
    # 3. the actual schema with whoosh types should, in fact, be correct
    got = self.index.index.schema
    want = whoosh.fields.Schema(
        basefile=whoosh.fields.ID(stored=True),
        dcterms_identifier=whoosh.fields.ID(field_boost=16, stored=True),
        dcterms_issued=whoosh.fields.DATETIME(stored=True),
        dcterms_publisher=whoosh.fields.IDLIST(stored=True),
        dcterms_title=whoosh.fields.TEXT(field_boost=4, stored=True),
        # corresponds to URI not Label
        rdf_type=whoosh.fields.ID(stored=True, field_boost=1.1),
        repo=whoosh.fields.ID(stored=True),
        text=whoosh.fields.TEXT(stored=True),
        uri=whoosh.fields.ID(unique=True, stored=True))
    self.assertEqual(sorted(want.names()), sorted(got.names()))
    for fieldname in got.names():
        self.assertEqual((fieldname, want[fieldname]),
                         (fieldname, got[fieldname]))
    # finally, connect again: this should open the existing index
    # instead of creating one (needs a mock docrepo)
    self.index = FulltextIndex.connect("WHOOSH", self.location,
                                       [DocumentRepository()])
def test_create(self):
    """Verify Whoosh-specific on-disk details after index creation."""
    # the basic (backend-independent) tests first
    super(WhooshBasicIndex, self).test_create()
    # 1. files must exist at the configured location
    self.assertNotEqual(os.listdir(self.location), [])
    # 2. and they must constitute a real whoosh index
    self.assertTrue(whoosh.index.exists_in(self.location))
    # 3. the concrete whoosh schema must match what we expect
    expected = whoosh.fields.Schema(
        basefile=whoosh.fields.ID(stored=True),
        dcterms_identifier=whoosh.fields.ID(field_boost=16, stored=True),
        dcterms_issued=whoosh.fields.DATETIME(stored=True),
        dcterms_publisher=whoosh.fields.IDLIST(stored=True),
        dcterms_title=whoosh.fields.TEXT(field_boost=4, stored=True),
        rdf_type=whoosh.fields.ID(
            stored=True, field_boost=1.1),  # corresponds to URI not Label
        repo=whoosh.fields.ID(stored=True),
        text=whoosh.fields.TEXT(stored=True),
        uri=whoosh.fields.ID(unique=True, stored=True))
    actual = self.index.index.schema
    self.assertEqual(sorted(expected.names()), sorted(actual.names()))
    for name in actual.names():
        self.assertEqual((name, expected[name]), (name, actual[name]))
    # finally, connecting once more should open the existing index
    # rather than create a fresh one (needs a mock docrepo)
    self.index = FulltextIndex.connect("WHOOSH", self.location,
                                       [DocumentRepository()])
def query(self, environ):
    """Run a fulltext search from the WSGI *environ* and package the hits
    into an OpenSearch-style result container."""
    # this is needed -- but the connect call shouldn't neccesarily
    # have to call exists() (one HTTP call)
    index = FulltextIndex.connect(self.config.indextype,
                                  self.config.indexlocation,
                                  self.repos)
    qs = environ['QUERY_STRING']
    q, param, pagenum, pagelen, stats = self.parse_parameters(qs, index)
    ac_query = qs.endswith("_ac=true")
    exclude_types = environ.get('exclude_types', None)
    boost_types = environ.get('boost_types', None)
    hits, pager = index.query(q=q, pagenum=pagenum, pagelen=pagelen,
                              ac_query=ac_query,
                              exclude_types=exclude_types,
                              boost_types=boost_types, **param)
    items = self.mangle_results(hits, ac_query)
    # 3.1 create container for results
    result = {
        "startIndex": pager['firstresult'] - 1,
        "itemsPerPage": int(param.get('_pageSize', '10')),
        "totalResults": pager['totalresults'],
        "duration": None,  # none
        "current": environ['PATH_INFO'] + "?" + qs,
        "items": items
    }
    # 4. add stats, maybe
    if stats:
        result["statistics"] = self.stats(items)
    return result
def setUp(self, mock_requests):
    """Wire canned HTTP responses so that connecting creates a new
    (not-yet-existing) ElasticSearch index.

    :param mock_requests: mocked ``requests`` module whose ``get``/``put``
                          side effects serve the canned responses
    """
    # the existence check (GET) answers 404: the index does not exist yet
    can = canned((404, "exists-not.json"), create=CREATE_CANNED, method="get")
    mock_requests.get.side_effect = can
    # FIX: this canned response is assigned to mock_requests.put, but was
    # created with method="post" -- inconsistent with the sibling setUp
    # which uses method="put" here (ES index creation is an HTTP PUT).
    can = canned((200, "create.json"), create=CREATE_CANNED, method="put")
    mock_requests.put.side_effect = can
    self.location = "http://localhost:9200/ferenda/"
    self.index = FulltextIndex.connect("ELASTICSEARCH", self.location, [])
def setUp(self, mock_requests):
    """Wire canned HTTP responses for creating a fresh ElasticSearch index."""
    # the existence check (GET) comes back 404 ...
    mock_requests.get.side_effect = canned(
        (404, "exists-not.json"), create=CREATE_CANNED, method="get")
    # ... so the index is then created with a PUT, which succeeds
    mock_requests.put.side_effect = canned(
        (200, "create.json"), create=CREATE_CANNED, method="put")
    self.location = "http://localhost:9200/ferenda/"
    self.index = FulltextIndex.connect("ELASTICSEARCH", self.location,
                                       [DocumentRepository()])
def queryindex(self, querystring):
    """Query the system fulltext index and return the IDs/URIs for
    matching documents.

    :param querystring: The query
    :type querystring: str
    """
    index = FulltextIndex.connect(self.config.indextype,
                                  self.config.indexlocation)
    for hit in index.query(querystring):
        print("%s (%s): %s" % (hit['identifier'], hit['about'], hit['text']))
def query(self, request, options=None):
    """Run a fulltext query for *request* and return an OpenSearch-style
    result dict (or, for autocomplete queries, just the item list).

    :param request: the incoming HTTP request object
    :param options: optional pre-seeded query options; parsed request
                    parameters are merged on top of it
    """
    # this is needed -- but the connect call shouldn't neccesarily
    # have to call exists() (one HTTP call)
    index = FulltextIndex.connect(self.config.indextype,
                                  self.config.indexlocation,
                                  self.repos)
    # parse_parameters -> {
    #     "q": "freetext",
    #     "fields": {"dcterms_publisher": ".../org/di",
    #                "dcterms_issued": "2018"}
    #     "pagenum": 1,
    #     "pagelen": 10,
    #     "autocomplete": False,
    #     "exclude_repos": ["mediawiki"],
    #     "boost_repos": [("sfs", 10)],
    #     "include_fragments": False
    # }
    if options is None:
        options = {}
    options.update(self.parse_parameters(request, index))
    hits, pager = index.query(
        q=options.get("q"),
        pagenum=options.get("pagenum"),
        pagelen=options.get("pagelen"),
        ac_query=options.get("autocomplete"),
        exclude_repos=options.get("exclude_repos"),
        boost_repos=options.get("boost_repos"),
        include_fragments=options.get("include_fragments"),
        **options.get("fields"))
    items = self.mangle_results(hits, options.get("autocomplete"))
    # 3.1 create container for results
    result = {
        "startIndex": pager['firstresult'] - 1,
        "itemsPerPage": options["pagelen"],
        "totalResults": pager['totalresults'],
        "duration": None,  # none
        "current": request.path + "?" + request.query_string.decode("utf-8"),
        "items": items
    }
    # 4. add stats, maybe
    if options["stats"]:
        result["statistics"] = self.stats(items)
    # 5. possibly trim results for easier json consumption
    if options["autocomplete"]:
        result = result["items"]
    return result
def test_setup(self):
    """Verify that the index exposes the expected (derived) schema.

    Introspecting the schema (particularly if it's derived directly from
    our definitions, not reverse-engineered from a Whoosh index on-disk)
    is useful for eg creating dynamic search forms.
    """
    self.location = mkdtemp()
    self.index = FulltextIndex.connect("WHOOSH", self.location,
                                       [DocRepo1(), DocRepo2()])
    # FIX: the expected dict literally contained 'category': Keywords()
    # twice; Python dict literals silently keep only the last occurrence,
    # so the duplicate is removed (behavior unchanged).
    self.assertEqual(self.index.schema(),
                     {'uri': Identifier(),
                      'repo': Label(),
                      'basefile': Label(),
                      'title': Text(boost=4),
                      'identifier': Label(boost=16),
                      'text': Text(),
                      'issued': Datetime(),
                      'publisher': Label(),
                      'abstract': Text(boost=2),
                      'category': Keywords(),
                      'secret': Boolean(),
                      'references': URI()})
    shutil.rmtree(self.location)
def setUp(self):
    """Create a fresh on-disk Whoosh index in a scratch directory."""
    scratch = mkdtemp()
    self.location = scratch
    self.index = FulltextIndex.connect("WHOOSH", scratch, self.repos)
def setUp(self):
    """Point the fixture at a local ElasticSearch index."""
    self.maxDiff = None  # show full diffs on assertion failure
    endpoint = "http://localhost:9200/ferenda/"
    self.location = endpoint
    self.index = FulltextIndex.connect("ELASTICSEARCH", endpoint, self.repos)
def tearDown(self):
    """Tear down the fixture and destroy the fulltext index."""
    # Clearing this flag makes sure super.tearDown deletes all files.
    self.setupclass = False
    super(BasicAPI, self).tearDown()
    index = FulltextIndex.connect(self.indextype, self.indexlocation,
                                  [DocumentRepository()])
    index.destroy()
def setUp(self):
    """Create a scratch Whoosh index over two mock repos and load the
    custom dataset into it."""
    self.location = mkdtemp()
    repos = [DocRepo1(), DocRepo2()]
    self.index = FulltextIndex.connect("WHOOSH", self.location, repos)
    self.load(custom_dataset)