def depatech_published_data_crawl_handler(request):
    """Crawl published-data at MTC depa.tech"""

    # Get hold of query expression and filter
    query = SmartBunch({
        'expression': request.params.get('expression', ''),
        'filter': request.params.get('filter', ''),
    })
    log.info('query: {}'.format(query))

    if should_be_quoted(query.expression):
        query.expression = '"%s"' % query.expression

    # constituents: abstract, biblio and/or full-cycle
    constituents = request.matchdict.get('constituents', 'full-cycle')
    #print 'constituents:', constituents

    chunksize = int(request.params.get('chunksize', '5000'))

    try:
        result = depatech_crawl(constituents, query, chunksize)
        return result

    except Exception as ex:
        request.errors.add('depatech-crawl', 'crawl', str(ex))
        log.error(request.errors)
        log.error('query="{0}", exception:\n{1}'.format(query, _exception_traceback()))
def issue_reporter_handler(request):

    targets = request.params.get('targets')

    report_data = request.json
    report_data.setdefault('application', {})
    report = SmartBunch.bunchify(report_data)

    # Add user information to issue report
    user = request.user
    if user:

        # Anonymize sensitive user data
        user.password = None
        user.upstream_credentials = None

        # Serialize user object and attach to report
        report.application.user = SmartBunch(json.loads(user.to_json()))

    # Send the whole beast to the standard application log
    log.error('Issue report [{targets}]:\n{report}'.format(report=report.pretty(), targets=targets))

    # TODO: Store the issue report into database
    # TODO: What about other targets like "log:error", "log:warning", "human:support", "human:user"?

    # Send email report
    for target in read_list(targets):
        if target.startswith('email:'):
            recipient = target.replace('email:', '')
            email_issue_report(report, recipient)
def get_email_settings(self, vendor):
    """
    Read default/global email settings and update with per-vendor email settings.
    """

    # Container for email settings
    email_settings = SmartBunch({
        'addressbook': [],
        'content': SmartBunch(),
    })

    for setting_name in ['addressbook', 'content']:

        setting_key = 'email_{}'.format(setting_name)

        defaults = self.application_settings.get(setting_key)
        specific = self.application_settings.get(setting_key + ':' + vendor)

        thing = deepcopy(defaults)
        if defaults and specific:
            thing.update(deepcopy(specific))

        for key, value in thing.items():
            thing[key] = value.decode('utf-8')

        email_settings[setting_name] = thing

    return email_settings
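# A minimal, self-contained sketch of the merge performed by get_email_settings():
# global defaults are deep-copied and then updated with per-vendor overrides.
# The setting keys and values below are hypothetical examples, not real configuration.
from copy import deepcopy

application_settings = {
    'email_content': {'subject': u'Report', 'signature': u'The team'},
    'email_content:acme': {'signature': u'ACME support'},
}

defaults = application_settings.get('email_content')
specific = application_settings.get('email_content' + ':' + 'acme')

merged = deepcopy(defaults)
if defaults and specific:
    merged.update(deepcopy(specific))

# merged == {'subject': u'Report', 'signature': u'ACME support'}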
def read(self):

    # Read metadata
    """
    out:
    "meta": {
        "status": "success",
        "params": {
            "sort": "pd desc, ucid asc",
            "rows": "250",
            "indent": "true",
            "qt": "premium",
            "timeAllowed": "300000",
            "q": "text:vibrat* AND (ic:G01F000184 OR cpc:G01F000184)",
            "start": "0",
            "wt": "json",
            "fl": "ucid,fam"
        },
        "pager": {
            "totalEntries": 6872,
            "entriesOnThisPage": 250,
            "firstPage": 1,
            "lastPage": 28,
            "previousPage": null,
            "currentPage": 1,
            "entriesPerPage": "250",
            "nextPage": 2
        },
        "name": "ifi",
        "time": "4.836163"
    }
    """
    self.meta.upstream.update({
        'name': 'ifi',
        'time': self.input['time'],
        'status': self.input['status'],
        'params': SmartBunch.bunchify(self.input['content']['responseHeader']['params']),
        'pager': SmartBunch.bunchify(self.input['content']['responseHeader'].get('pager', {})),
    })

    self.meta.navigator.count_total = int(self.meta.upstream.pager.totalEntries)
    self.meta.navigator.count_page = int(self.meta.upstream.pager.entriesOnThisPage)
    self.meta.navigator.offset = int(self.meta.upstream.params.start)
    self.meta.navigator.limit = int(self.meta.upstream.params.rows)
    self.meta.navigator.postprocess = SmartBunch()

    # Read content
    self.documents = self.input['content']['response']['docs']
    self.read_documents()
def get_datasource_settings(self):

    # Container for datasource settings
    datasource_settings = SmartBunch({
        'datasources': [],
        'datasource': SmartBunch(),
        'total': SmartBunch.bunchify({'fulltext_countries': [], 'details_countries': []}),
    })

    # Read datasource settings from configuration
    datasource_settings.datasources = read_list(self.application_settings.get('ip_navigator', {}).get('datasources'))
    datasource_settings.protected_fields = read_list(self.application_settings.get('ip_navigator', {}).get('datasources_protected_fields'))

    for datasource in datasource_settings.datasources:

        settings_key = 'datasource:{name}'.format(name=datasource)
        datasource_info = self.application_settings.get(settings_key, {})

        datasource_info['fulltext_enabled'] = asbool(datasource_info.get('fulltext_enabled', False))
        datasource_info['fulltext_countries'] = read_list(datasource_info.get('fulltext_countries', ''))
        datasource_info['details_enabled'] = asbool(datasource_info.get('details_enabled', False))
        datasource_info['details_countries'] = read_list(datasource_info.get('details_countries', ''))

        datasource_settings.datasource[datasource] = SmartBunch.bunchify(datasource_info)

        # Aggregate data for all countries
        datasource_settings.total.fulltext_countries += datasource_info['fulltext_countries']

    return datasource_settings
def __init__(self, input, options=None):

    # Input data and options
    self.input = input
    self.options = options and SmartBunch.bunchify(options) or SmartBunch()

    # Setup data structures
    self.setup()

    # Read input information
    self.read()

    # Run data munging actions
    if 'feature_family_remove' in self.options and self.options.feature_family_remove:
        self.remove_family_members()
def ificlaims_client(options=None):
    options = options or SmartBunch()
    if 'vendor' in options and options.vendor == 'serviva':
        client = get_serviva_client()
    else:
        client = get_ificlaims_client()
    return client
def status_upstream_ificlaims_handler(request):
    client = ificlaims_client()
    query = SmartBunch({
        'expression': 'pn:EP0666666',
    })
    data = client.search_real(query)
    assert data, 'Empty response from IFI CLAIMS'
    return "OK"
def status_upstream_depatech_handler(request):
    client = get_depatech_client()
    query = SmartBunch({
        'expression': '(PC:DE AND DE:212016000074 AND KI:U1) OR AN:DE212016000074U1 OR NP:DE212016000074U1',
    })
    data = client.search_real(query)
    assert data, 'Empty response from MTC depa.tech'
    return "OK"
def setup(self):

    # Documents from upstream data source
    self.documents = []

    # Metadata information, upstream (raw) and downstream (unified)
    self.meta = SmartBunch.bunchify({
        'navigator': {},
        'upstream': {},
    })

    # Output information, upstream (raw) and downstream (unified)
    self.output = SmartBunch.bunchify({
        'meta': {},
        'numbers': [],
        'details': [],
        'navigator': {},
    })
def make_request(client):

    #results = client.search('*:*')
    #pprint(results)

    #results = client.search('pa:siemens', 0, 10)
    #results = client.search('pa:siemens OR pa:bosch', 0, 10)
    #results = client.search('pa:(siemens OR bosch)', 0, 10)
    #results = client.search('text:"solar energy"', 0, 10)
    #results = client.search(SmartBunch({'expression': 'text:solar energy'}), SmartBunch({'offset': 0, 'limit': 10}))

    results = client.search(
        SmartBunch({'expression': '{!complexphrase inOrder=true}"siemen* *haus"'}),
        SmartBunch({'offset': 0, 'limit': 10}))

    #results = client.search(u'text:抑血管生成素的药物用途', 0, 10)
    #results = client.search(u'text:放射線を照射する放射線源と', 0, 10)
    #results = client.search(SmartBunch({'expression': 'pnctry:(de OR ep OR wo OR cn OR jp OR tw) AND pa:"taiwan paiho" AND pd:[20170101 TO 20170731]'}), SmartBunch({'offset': 0, 'limit': 50}))

    print json.dumps(results)
def get_vendor_settings(self):

    # Container for vendor settings
    vendor_settings = SmartBunch({
        'vendors': [],
        'vendor': SmartBunch(),
    })

    # Read vendor settings from configuration
    try:
        vendor_settings.vendors = read_list(self.application_settings.ip_navigator.vendors)
        assert vendor_settings.vendors
    except:
        raise ConfigurationError('No vendor configured in "{configfile}"'.format(configfile=self.configfile))

    for vendor in vendor_settings.vendors:

        settings_key = 'vendor:{name}'.format(name=vendor)
        if settings_key not in self.application_settings:
            raise ConfigurationError('Vendor "{vendor}" not configured in "{configfile}"'.format(
                vendor=vendor, configfile=self.configfile))

        vendor_info = self.application_settings.get(settings_key, {})
        for key, value in vendor_info.iteritems():
            vendor_info[key] = value.decode('utf-8')

        if 'hostname_matches' in vendor_info:
            vendor_info.hostname_matches = read_list(vendor_info.hostname_matches)

        vendor_info.email = self.get_email_settings(vendor)

        vendor_settings.vendor[vendor] = SmartBunch.bunchify(vendor_info)

    return vendor_settings
def datasource_settings(self):
    """
    Return datasource settings while accounting for sensitive settings like API URI and credentials.
    """
    request = get_current_request()
    datasource_settings = SmartBunch.bunchify(request.registry.datasource_settings)
    if 'protected_fields' in datasource_settings:

        # Strip protected fields (e.g. credentials) before settings leave the server
        for fieldname in datasource_settings.protected_fields:
            for name, settings in datasource_settings.datasource.iteritems():
                if fieldname in settings:
                    del settings[fieldname]

        del datasource_settings['protected_fields']

    return datasource_settings
def depatech_search(query, options=None):

    options = options or SmartBunch()

    client = get_depatech_client()
    try:
        data = client.search(query, options)

        # Raise an exception on empty results to skip caching this response
        if data.meta.navigator.count_total == 0:
            raise NoResultsException('No results', data=data)

        return data

    except SearchException as ex:
        client.stale = True
        raise
def remove_family_members(self):

    # Filtering mechanics: Deduplicate by family id
    seen = {}
    removed = []
    removed_map = defaultdict(list)
    stats = SmartBunch(removed=0)

    def family_remover(item):

        fam = self.document_to_family_id(item)

        # Sanity checks on family id
        # Do not remove documents without valid family id
        if not fam or fam in ['0', '-1']:
            return True

        # "Seen" filtering logic
        if fam in seen:
            stats.removed += 1
            removed.append(item)
            removed_map[fam].append(item)
            return False
        else:
            seen[fam] = True
            #print 'representative: {rep} [{fam}]'.format(rep=item['publication_number'], fam=fam)
            return True

    # Update metadata and content

    # 1. Apply family cleansing filter to main documents response
    self.documents = list(filter(family_remover, self.documents))
    #print 'removed_map:'; pprint(removed_map)

    # 2. Add list of removed family members to output
    self.output.navigator.family_members = {'removed': removed}
    #self.output['family-members-removed'] = removed

    # 3. Update metadata
    self.meta.navigator.postprocess.action = 'feature_family_remove'
    self.meta.navigator.postprocess.info = stats
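# A self-contained sketch of the "deduplicate by family id" filter implemented above:
# the first document per family is kept as representative, later family members are
# removed, and documents without a valid family id always pass through. The document
# structure (plain dicts with a "family_id" key) is a hypothetical simplification.
def remove_family_members_sketch(documents):
    seen = {}
    removed = []

    def family_remover(item):
        fam = item.get('family_id')
        if not fam or fam in ['0', '-1']:
            return True
        if fam in seen:
            removed.append(item)
            return False
        seen[fam] = True
        return True

    kept = list(filter(family_remover, documents))
    return kept, removed

documents = [
    {'publication_number': 'EP1000001A1', 'family_id': '4711'},
    {'publication_number': 'US2018000001A1', 'family_id': '4711'},
    {'publication_number': 'DE102017000001A1', 'family_id': '0'},
]
kept, removed = remove_family_members_sketch(documents)
# kept    -> EP1000001A1 (family representative) and DE102017000001A1 (no valid family id)
# removed -> US2018000001A1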
def read(self):

    # Read metadata
    """
    in:
    "info": {
        "Info": "Search processed in 2905",
        "Success": "true",
        "ResultLength": 250,
        "FamCount": "1200",
        "DocCount": "5432",
        "MemCount": "3599",
        "Limit": 250,
        "Offset": 0,
        "ResultSetId": "4153687"
    },
    """
    self.meta.upstream.update(self.input['info'])
    self.meta.upstream.update({
        'name': 'sip',
        # TODO: Reference from IFI CLAIMS, fill up/unify.
        #'time': self.input['time'],
        #'status': self.input['status'],
        #'params': SmartBunch.bunchify(self.input['content']['responseHeader']['params']),
        #'pager': SmartBunch.bunchify(self.input['content']['responseHeader'].get('pager', {})),
    })

    self.meta.navigator.count_total = int(self.meta.upstream.MemCount)
    self.meta.navigator.count_page = len(self.input['results'])
    self.meta.navigator.offset = int(self.meta.upstream.Offset)
    self.meta.navigator.limit = int(self.meta.upstream.Limit)
    self.meta.navigator.postprocess = SmartBunch()

    # Read content
    """
    in:
    "results": [{
    }],
    """
    self.documents = self.input['results']
    self.read_documents()
def read(self):

    #print 'input:', self.input

    # Read metadata
    """
    input:
    {
        "_shards": {
            "failed": 0,
            "successful": 5,
            "total": 5
        },
        "hits": {
            "hits": [
                {
                    "_id": "DE.000202013003344.U1",
                    "_index": "deparom",
                    "_score": 13.234067,
                    "_source": {
                        "AB": "<p num=\"0000\">Rettungsensemble (1) mit Seilklemmen (2, 3), dadurch gekennzeichnet, dass es folgende, miteinander über einen Seilzug (4) verbundene Komponenten umfasst: <br/>– eine erste Seilklemme (2) mit wenigstens einer Umlenkrolle (21), zum verschieblichen Fixieren des Körpers des Benutzers an einem Seil; <br/>– eine zweite Seilklemme (3) mit wenigstens einer Umlenkrolle (31), zur verschieblichen Befestigung an dem Seil oberhalb der Position der ersten Seilklemme (2); wobei <br/>– der Seilzug (4) mit seinem einen Ende an der zweiten Seilklemme (3) oder einer ihrer Umlenkrollen (31) befestigt und in die Umlenkrollen der ersten und zweiten Seilklemme eingelegt ist, diese miteinander verbindet und zusammen mit diesen einen Flaschenzug bildet, und dessen anderes Ende frei hängt und zur Bedienung des Flaschenzugs vorgesehen ist.</p><p num=\"\"><de-figure num=\"0\"></de-figure></p>",
                        "AD": "20130410",
                        "AN": "DE202013003344",
                        "DE": "202013003344",
                        "DP": "20131205",
                        "GT": "Rettungsensemble zur Bergung aus Gletscherspalten",
                        "IC": ["A63B", "A63B0029", "A63B002900"],
                        "KI": "U1",
                        "MC": ["A63B", "A63B0029", "A63B002900"],
                        "NP": "CH64912",
                        "PA": "Mammut Sports Group AG, Seon, CH",
                        "PC": "DE",
                        "PD": "20120509",
                        "RN": "Bogensberger Patent- & Markenbüro, Eschen, LI"
                    },
                    "_type": "DEP"
                }
            ],
            "max_score": 13.234067,
            "total": 1
        },
        "timed_out": false,
        "took": 7
    }
    """
    self.meta.upstream.update({
        'name': 'depatech',
        'time': self.input['took'],
        'status': 'success',
        #'params': SmartBunch.bunchify(self.input['content']['responseHeader']['params']),
        #'pager': SmartBunch.bunchify(self.input['content']['responseHeader'].get('pager', {})),
    })

    self.meta.navigator.count_total = int(self.input['hits']['total'])
    #self.meta.navigator.count_page = int(self.meta.upstream.pager.entriesOnThisPage)
    self.meta.navigator.offset = int(self.options.offset)
    self.meta.navigator.limit = int(self.options.limit)
    self.meta.navigator.max_hits = int(self.options.max_hits)
    self.meta.navigator.postprocess = SmartBunch()

    # Read content
    self.documents = self.input['hits']['hits']
    self.read_documents()
def search_real(self, query, options=None):

    query.setdefault('filter', '')

    options = options or SmartBunch()
    options.setdefault('offset', 0)
    options.setdefault('limit', self.pagesize)

    offset = options.offset
    limit = options.limit

    log.info(u"{backend_name}: searching documents, expression='{0}', offset={1}, limit={2}; user={username}".format(
        query.expression, offset, limit, **self.__dict__))

    if not self.token or self.stale:
        self.login()

    starttime = timeit.default_timer()

    # Define search request URI
    # https://cdws.ificlaims.com/search/query?q=pa:facebook
    # https://cdws.ificlaims.com/search/query?q=*:*&fl=ucid&rows=1
    uri = self.uri + self.path_search

    # Define search request parameters
    # 'family.simple': True,
    params = {
        'q': query.expression,
        'fq': query.filter,
        'sort': 'pd desc, ucid asc',
        'fl': 'ucid,fam',
        'start': offset,
        'rows': limit,
    }

    log.info(u'IFI CLAIMS search. query={query}, uri={uri}, params={params}, options={options}'.format(
        query=query, uri=uri, params=params, options=options.dump()))

    # Perform search request
    headers = self.get_authentication_headers()
    headers.update({'Accept': 'application/json'})
    try:
        response = requests.get(uri, params=params, headers=headers, verify=self.tls_verify)
    except RequestException as ex:
        self.logout()
        raise self.search_failed(
            ex=ex,
            user_info='Error or timeout while connecting to upstream database. Database might be offline.',
            meta={'username': self.username, 'uri': uri})
    duration = timeit.default_timer() - starttime

    #print "response:", response.content        # debugging

    # Process search response
    if response.status_code == 200:
        #print "response:", response.content    # debugging
        response_data = json.loads(response.content)
        if response_data['status'] == 'success':

            # Debugging: Simulate error
            #response_data['content']['error'] = {'code': 503, 'msg': 'no servers hosting shard'}

            # Handle search expression errors
            if 'error' in response_data['content']:

                upstream_error = response_data['content']['error']
                if 'msg' not in upstream_error:
                    upstream_error['msg'] = 'Reason unknown'
                message = u'Response status code: {code}\n\n{msg}'.format(**upstream_error)

                # Enrich "maxClauseCount" message, e.g. raised by {!complexphrase}text:"auto* AND leucht*"~5
                if upstream_error["code"] == 500 and u'maxClauseCount is set to' in upstream_error["msg"]:
                    raise self.search_failed(
                        user_info=u'Too many terms in phrase expression, wildcard term prefixes might be too short.',
                        message=message,
                        response=response)

                # Enrich "no servers hosting shard" message
                elif upstream_error["code"] == 503 and \
                    (u'no servers hosting shard' in upstream_error["msg"] or
                     u'No server is available' in upstream_error["msg"]):
                    raise self.search_failed(
                        user_info=u'Error while connecting to upstream database. Database might be offline.',
                        message=message,
                        response=response)

                # Regular traceback
                elif upstream_error["code"] == 500 and 'trace' in upstream_error:
                    message = u'Response status code: {code}\n\n{trace}'.format(**upstream_error)
                    raise self.search_failed(
                        user_info=u'Unknown exception at search backend',
                        message=message,
                        response=response)

                # Enrich "SyntaxError" exception
                elif upstream_error["code"] == 400 and u'ParseException' in upstream_error["msg"]:
                    user_info = re.sub(
                        r'.*(Encountered.*at line.*?\.).*',
                        r'SyntaxError, can not parse query expression: \1',
                        upstream_error["msg"], flags=re.DOTALL)
                    raise self.search_failed(user_info=user_info, message=message, response=response)

                else:
                    raise self.search_failed(user_info=message, response=response)

            # Mogrify search response
            # TODO: Generalize between all search backends
            sr = IFIClaimsSearchResponse(response_data, options=options)
            result = sr.render()
            duration = round(duration, 1)

            # TODO: Unify between IFI CLAIMS and SIP
            log.info('{backend_name}: Search succeeded. duration={duration}s, meta=\n{meta}'.format(
                duration=duration, meta=result['meta'], **self.__dict__))

            if not result['numbers']:
                log.warn('{backend_name}: Search had empty results. duration={duration}s, meta=\n{meta}'.format(
                    duration=duration, meta=result['meta'], **self.__dict__))

            return result

        elif response_data['status'] == 'error':

            user_info = None
            if response_data['message'] == 'JSON error: failed to read response object':
                user_info = u'Error while connecting to upstream database. Database might be offline.'

            raise self.search_failed(user_info=user_info, message=response_data['message'], response=response)

        else:
            raise self.search_failed('Search response could not be parsed', response=response)

    else:
        #print "response:", response.content    # debugging
        self.logout()

        # Strip HTML from response body
        response_content = response.content
        if response.headers['Content-Type'].startswith('text/html'):
            response_content = re.sub('<[^<]+?>', '', response_content).strip().replace('\r\n', ', ')

        # Build alternative basic error structure
        upstream_error = {
            'code': response.status_code,
            'reason': response.reason,
            'content': response_content,
        }
        message = json.dumps(upstream_error)

        raise self.search_failed(
            user_info=u'Error while connecting to upstream database. Database might be offline.',
            message=message,
            response=response)

    raise self.search_failed(response=response)
def get_datasource_settings(self, vendor=None):

    # Container for datasource settings.
    datasource_settings = SmartBunch({
        'datasources': [],
        'datasource': SmartBunch(),
        'total': SmartBunch.bunchify({'fulltext_countries': [], 'details_countries': []}),
    })

    # Read datasource settings from configuration.
    datasource_settings.datasources = read_list(self.application_settings.get('ip_navigator', {}).get('datasources'))
    datasource_settings.protected_fields = read_list(self.application_settings.get('ip_navigator', {}).get('datasources_protected_fields'))

    for datasource in datasource_settings.datasources:

        datasource_info = SmartBunch()

        if vendor is None:
            settings_key = 'datasource:{name}'.format(name=datasource)
        else:
            settings_key = 'datasource:{name}:{vendor}'.format(name=datasource, vendor=vendor)

        ds_settings = self.application_settings.get(settings_key, {})

        datasource_info.setdefault('fulltext_enabled', asbool(ds_settings.get('fulltext_enabled', False)))
        datasource_info.setdefault('fulltext_countries', read_list(ds_settings.get('fulltext_countries', '')))
        datasource_info.setdefault('details_enabled', asbool(ds_settings.get('details_enabled', False)))
        datasource_info.setdefault('details_countries', read_list(ds_settings.get('details_countries', '')))

        for key, value in ds_settings.iteritems():
            datasource_info.setdefault(key, value)

        datasource_settings.datasource[datasource] = SmartBunch.bunchify(datasource_info)

        # Aggregate data for all countries.
        datasource_settings.total.fulltext_countries += datasource_info['fulltext_countries']

    return datasource_settings
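# Sketch of the settings-key resolution used by get_datasource_settings() above: without
# a vendor, the generic "datasource:<name>" section applies; with a vendor, the
# vendor-scoped "datasource:<name>:<vendor>" section is read instead. The section names
# and values below are hypothetical examples, not real configuration.
application_settings = {
    'datasource:depatech': {'fulltext_enabled': 'true', 'fulltext_countries': 'DE, EP'},
    'datasource:depatech:acme': {'fulltext_enabled': 'false'},
}

def settings_key(datasource, vendor=None):
    if vendor is None:
        return 'datasource:{name}'.format(name=datasource)
    else:
        return 'datasource:{name}:{vendor}'.format(name=datasource, vendor=vendor)

generic = application_settings.get(settings_key('depatech'), {})                 # {'fulltext_enabled': 'true', 'fulltext_countries': 'DE, EP'}
vendor_scoped = application_settings.get(settings_key('depatech', 'acme'), {})   # {'fulltext_enabled': 'false'}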
def ificlaims_published_data_search_handler(request):
    """Search for published-data at IFI CLAIMS Direct"""

    # Get hold of query expression and filter
    query = SmartBunch({
        'expression': request.params.get('expression', ''),
        'filter': request.params.get('filter', ''),
    })
    log.info('Query: {}'.format(query))

    # Parse expression, extract and propagate keywords to user interface
    parser = IFIClaimsParser(query.expression)
    propagate_keywords(request, parser)

    # Fixup query: wrap into quotes if cql string is a) unspecific, b) contains spaces and c) is still unquoted
    if should_be_quoted(query.expression):
        query.expression = '"%s"' % query.expression

    # Lazy-fetch more entries
    # TODO: get from patzilla.access.ificlaims
    limit = 250
    offset_local = int(request.params.get('range_begin', 0))
    offset_remote = int(offset_local / limit) * limit

    # Compute query options, like
    # - limit
    # - sorting
    # - whether to remove family members
    options = SmartBunch()
    options.update({
        'limit': limit,
        'offset': offset_remote,
    })

    # Propagate request parameters to search options parameters
    request_to_options(request, options)

    try:
        data = ificlaims_search(query, options)
        #print data.prettify()      # debugging
        return data

    except LoginException as ex:
        request.errors.add('ificlaims-search', 'login', ex.details)
        log.warn(request.errors)

    except SyntaxError as ex:
        request.errors.add('ificlaims-search', 'expression', unicode(ex.msg))
        log.warn(request.errors)

    except SearchException as ex:
        message = ex.get_message()
        request.errors.add('ificlaims-search', 'search', message)
        log.warn(request.errors)

    except NoResultsException as ex:
        # Forward response to let the frontend recognize zero hits
        request.response.status = HTTPNotFound.code
        return ex.data

    except OperationFailure as ex:
        message = unicode(ex)
        request.errors.add('ificlaims-search', 'internals', message)
        log.error(request.errors)

    except Exception as ex:
        message = handle_generic_exception(request, ex, 'ificlaims-search', query)
        request.errors.add('ificlaims-search', 'search', message)
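# Worked example of the paging alignment used by the search handlers above and below:
# the local offset requested by the user interface ("range_begin") is snapped down to
# the nearest upstream page boundary, so the upstream request always starts on a
# multiple of "limit". Values are illustrative.
limit = 250
offsets_local = [0, 100, 249, 250, 700]
offsets_remote = [int(offset_local / limit) * limit for offset_local in offsets_local]
# offsets_remote == [0, 0, 0, 250, 500]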
def depatech_published_data_search_handler(request):
    """Search for published-data at MTC depa.tech"""

    # Get hold of query expression and filter
    expression = request.params.get('expression', '')
    filter = request.params.get('filter', '')
    query = SmartBunch({
        'syntax': 'lucene',
        'expression': expression,
        'filter': filter,
    })
    if expression.startswith('DEPAROM V1.0') or expression.startswith('deparom:'):
        query.syntax = 'deparom'

    log.info('Query: {}'.format(query))

    # Parse expression, extract and propagate keywords to user interface
    if query.syntax == 'lucene':
        parser = DepaTechParser(query.expression)
        keywords_to_response(request, parser)

    # TODO: Parse DEPAROM query expression and extract keywords

    # Fixup query: wrap into quotes if cql string is a) unspecific, b) contains spaces and c) is still unquoted
    if should_be_quoted(query.expression):
        query.expression = '"%s"' % query.expression

    # Lazy-fetch more entries
    # TODO: get from patzilla.access.depatech
    limit = 250
    offset_local = int(request.params.get('range_begin', 0))
    offset_remote = int(offset_local / limit) * limit

    # Compute query options, like
    # - limit
    # - sorting
    # - whether to remove family members
    options = SmartBunch()
    options.update({
        'limit': limit,
        'offset': offset_remote,
    })

    # Propagate request parameters to search options parameters
    request_to_options(request, options)

    try:
        data = depatech_search(query, options)
        #print data.prettify()      # debugging
        return data

    except LoginException as ex:
        request.errors.add('depatech-search', 'login', ex.details)
        log.warn(request.errors)

    except SyntaxError as ex:
        request.errors.add('depatech-search', 'expression', str(ex.msg))
        log.warn(request.errors)

    except SearchException as ex:
        message = ex.get_message()
        request.errors.add('depatech-search', 'search', message)
        log.warn(request.errors)

    except NoResultsException as ex:
        # Forward response to let the frontend recognize zero hits
        request.response.status = HTTPNotFound.code
        return ex.data

    except OperationFailure as ex:
        message = str(ex)
        request.errors.add('depatech-search', 'internals', message)
        log.error(request.errors)

    except Exception as ex:
        message = handle_generic_exception(request, ex, 'depatech-search', query)
        request.errors.add('depatech-search', 'search', message)
def search(self, expression, options=None):

    options = options or SmartBunch()
    options.setdefault('offset', 0)
    options.setdefault('limit', self.pagesize)

    offset = options.offset
    limit = options.limit

    log.info("{backend_name}: searching documents, expression='{0}', offset={1}, limit={2}".format(
        expression, offset, limit, **self.__dict__))

    if not self.sessionid or self.stale:
        self.login()

    starttime = timeit.default_timer()
    try:
        response = requests.post(self.uri + '/search/new', data={'session': self.sessionid, 'searchtree': expression})
    except (ConnectionError, ConnectTimeout) as ex:
        log.error('SIP search for user "{username}" at "{uri}" failed. Reason: {0} {1}.'.format(
            ex.__class__, ex.message, username=self.username, uri=self.uri))
        self.logout()
        raise SearchException(
            ex.message,
            sip_info='Error or timeout while connecting to upstream database. Database might be offline.')

    # Process search response
    if response.status_code == 200:
        #print "SIP search response (raw)"; print response.content     # debugging
        try:
            search_response = self._search_parse_xml(response.content)
            if search_response['success'] == 'false':
                raise SearchException('Search failed', sip_response=search_response['response'])

            if 'ResultSetId' in search_response['data']:

                search_info = search_response['data']
                ResultSetId = search_info['ResultSetId']

                # Inject offset and limit into metadata, pretend it comes from server
                search_info['Offset'] = offset
                search_info['Limit'] = limit

                # Perform second request to actually retrieve the results by ResultSetId
                search_results = self.getresults(ResultSetId, options)
                #print "SIP search results:", search_results

                duration = timeit.default_timer() - starttime
                log.info('Search succeeded. duration={0}s, search_info={1}'.format(round(duration, 1), search_info))

                upstream_response = {
                    'info': search_info,
                    'results': search_results or [],
                }

                # Mogrify search response
                # TODO: Generalize between all search backends
                sr = SipSearchResponse(upstream_response, options=options)
                result = sr.render()
                duration = round(duration, 1)

                # TODO: Unify between SIP and IFI CLAIMS
                log.info('{backend_name}: Search succeeded. duration={duration}s, meta=\n{meta}'.format(
                    duration=duration, meta=result['meta'].prettify(), **self.__dict__))

                if not result['numbers']:
                    log.warn('{backend_name} search from "{user}" for "{expression}" had empty results.'.format(
                        user=self.username, expression=expression, **self.__dict__))

                return result

            else:
                message = 'Search failed. Reason: Upstream response lacks valid ResultSetId. content={0}'.format(response.text)
                raise SearchException(message, sip_info='Search failed. Search response could not be parsed.')

        except Exception as ex:
            log.error('Search failed. {name}: {message}. expression={expression}, response={response}'.format(
                name=ex.__class__.__name__, message=ex.message, response=response.text, expression=expression))
            raise

    else:
        response_status = str(response.status_code) + ' ' + response.reason
        message = 'SIP search failed. Reason: response status != 200. status={0}, content={1}'.format(
            response_status, response.text)
        log.error(message)
        raise SearchException(
            message,
            sip_info='HTTP error "{status}" while searching upstream database'.format(status=response_status))
def search_real(self, query, options=None):

    options = options or SmartBunch()
    options.setdefault('offset', 0)
    options.setdefault('limit', self.pagesize)
    options.setdefault('max_hits', self.search_max_hits)

    offset = options.offset
    limit = options.limit

    transport = 'querystring'

    # Use DEPAROM Query Translator
    # https://depa.tech/api/manual/dqt-translator/
    # https://api.depa.tech/dqt/query/es
    if query.expression and query.syntax == 'deparom':
        transport = 'json'
        query.expression = self.translate_deparom_query(query.expression)

    log.info(u"{backend_name}: searching documents, expression='{0}', offset={1}, limit={2}; user={username}".format(
        query.expression, offset, limit, **self.__dict__))

    starttime = timeit.default_timer()

    # Define search request URI
    # https://api.depa.tech/es/deparom/_search?q=AB:cloud-computing
    uri = self.uri + self.path_search

    # Define search request parameters
    # 'family.simple': True,
    params = {
        'q': query.expression,
        #'fq': query.filter,
        #'sort': 'pd desc, ucid asc',
        #'fl': 'ucid,fam',
        'from': offset,
        'size': limit,
    }

    log.info(u'{backend_name}: query={query}, uri={uri}, params={params}, options={options}'.format(
        query=query, uri=uri, params=params, options=options.dump(), backend_name=self.backend_name))

    # Perform search request
    headers = {}
    headers.update({'Accept': 'application/json'})
    try:
        if transport == 'querystring':
            response = requests.get(
                uri, params=params, headers=headers,
                auth=(self.username, self.password), verify=self.tls_verify)
        else:
            response = requests.post(
                uri, data=query.expression, headers=headers,
                auth=(self.username, self.password), verify=self.tls_verify)
    except RequestException as ex:
        raise self.search_failed(
            ex=ex,
            user_info='Error or timeout while connecting to upstream database. Database might be offline.',
            meta={'username': self.username, 'uri': uri})
    duration = timeit.default_timer() - starttime

    # Process search response
    if response.status_code == 200:
        #print "response:", response.content    # debugging
        response_data = json.loads(response.content)
        if True:

            # Debugging: Simulate error
            #response_data['content']['error'] = {'code': 503, 'msg': 'no servers hosting shard'}

            # Mogrify search response
            # TODO: Generalize between all search backends
            sr = DepaTechSearchResponse(response_data, options=options)
            result = sr.render()
            duration = round(duration, 1)

            # TODO: Unify between IFI CLAIMS and depa.tech
            log.info('{backend_name}: Search succeeded. duration={duration}s, meta=\n{meta}'.format(
                duration=duration, meta=result['meta'].prettify(), **self.__dict__))

            if not result['numbers']:
                log.warn('{backend_name}: Search had empty results. duration={duration}s, meta=\n{meta}'.format(
                    duration=duration, meta=result['meta'].prettify(), **self.__dict__))

            return result

        #elif response_data['status'] == 'error':
        #    raise self.search_failed(response_data['message'], response=response)

        else:
            raise self.search_failed('Search response could not be parsed', response=response)

    elif response.status_code in [400, 500] and response.headers.get('Content-Type', '').startswith('application/json'):

        response_data = json.loads(response.content)

        # Handle search expression errors
        if 'error' in response_data:

            upstream_error = response_data['error']['caused_by']
            upstream_error['code'] = response_data['status']
            if 'reason' not in upstream_error:
                upstream_error['reason'] = 'Reason unknown'
            message = u'Response status code: {code}\n\n{reason}'.format(**upstream_error)

            raise self.search_failed(
                user_info=u'Error searching depa.tech.',
                message=message,
                response=response)

    raise self.search_failed(response=response)
def sip_published_data_search_handler(request):
    """Search for published-data at SIP"""

    #request.errors.add('sip-search', 'login', "SIP data source disabled, please use alternative data source.")
    #return

    # XML query expression
    query = request.params.get('expression', '')
    log.info('Raw query: ' + query)

    # Fixup query: wrap into quotes if cql string is a) unspecific, b) contains spaces and c) is still unquoted
    if should_be_quoted(query):
        query = '"%s"' % query

    #propagate_keywords(request, query_object)

    # Lazy-fetch more entries up to maximum of SIP
    # TODO: get from patzilla.access.sip
    limit = 250
    offset_local = int(request.params.get('range_begin', 1))
    offset_remote = int(offset_local / limit) * limit

    # Compute query options, like
    # - limit
    # - sorting
    # - whether to remove family members
    # - whether to return all family members
    options = SmartBunch()
    options.update({
        'limit': limit,
        'offset': offset_remote,
    })

    # Propagate request parameters to search options parameters
    request_to_options(request, options)

    # Currently not handled by search handler, it's already handled on xml expression builder level
    #if asbool(request.params.get('query_data[modifiers][family-full]')):
    #    options.update({'feature_family_full': True})

    try:
        data = sip_published_data_search(query, options)
        #print 'SIP search response:'; print data.prettify()       # debugging
        return data

    except LoginException as ex:
        request.errors.add('sip-search', 'login', ex.sip_info)

    except SyntaxError as ex:
        request.errors.add('sip-search', 'expression', str(ex.msg))
        log.warn(request.errors)

    except SearchException as ex:
        message = ex.get_message()
        request.errors.add('sip-search', 'search', message)
        log.error(request.errors)

    except NoResultsException as ex:
        # Forward response to let the frontend recognize zero hits
        request.response.status = HTTPNotFound.code
        return ex.data

    except OperationFailure as ex:
        message = unicode(ex)
        message = re.sub(u'namespace: .*', u'', message)
        request.errors.add('sip-search', 'internals', message)
        log.error(request.errors)
def crawl(self, constituents, expression, chunksize):

    if constituents not in ['pub-number', 'biblio']:
        raise ValueError('constituents "{0}" invalid or not implemented yet'.format(constituents))

    real_constituents = constituents
    if constituents == 'pub-number':
        constituents = ''

    # Fetch first chunk (1-chunksize) from upstream
    #first_chunk = self.search(expression, 0, chunksize)
    first_chunk = self.search_method(expression, SmartBunch({'offset': 0, 'limit': chunksize}))
    #print first_chunk

    #total_count = int(first_chunk['meta'].get('pager', {}).get('totalEntries', 0))
    count_total = first_chunk.meta.navigator.count_total
    log.info(self.lm('Crawl count_total: {}'.format(count_total)))

    # Limit maximum size
    count_total = min(count_total, self.crawl_max_count)

    """
    # SIP:
    pointer_total_count = JsonPointer('/meta/MemCount')
    total_count = int(pointer_total_count.resolve(first_chunk))
    log.info('SipClient.crawl total_count: %s', total_count)

    # Limit maximum size
    # TODO: make configurable, put into instance variable
    count_total = min(count_total, 5000)
    """

    # Collect upstream results
    begin_second_chunk = chunksize
    chunks = [first_chunk]
    log.info(self.lm('Crawling {count_total} items with {chunksize} per request'.format(
        count_total=count_total, chunksize=chunksize)))
    for offset in range(begin_second_chunk, count_total, chunksize):

        # Don't hammer the upstream data source
        time.sleep(1)

        log.info(self.lm('Crawling from offset {offset}'.format(offset=offset)))
        chunk = self.search_method(expression, SmartBunch({'offset': offset, 'limit': chunksize}))
        chunks.append(chunk)

    # Merge chunks into single result
    all_numbers = []
    all_details = []
    # TODO: summarize elapsed_time
    for chunk in chunks:
        #print 'chunk:', chunk
        all_numbers += chunk['numbers']
        all_details += chunk['details']

    # Report about result
    result_count = len(all_details)
    log.info(self.lm('Crawling finished. result count: {result_count}'.format(result_count=result_count)))

    # Bulk response
    response = None
    if real_constituents == 'pub-number':
        response = first_chunk
        response['meta'] = {'Success': 'true', 'MemCount': str(len(all_numbers))}
        response['numbers'] = all_numbers
        del response['details']

    elif real_constituents == 'biblio':
        response = first_chunk
        #print 'all_details:', all_details
        response['meta'] = {'Success': 'true', 'MemCount': str(len(all_numbers))}
        response['details'] = all_details
        #del response['details']

    if not response:
        raise ValueError('constituents "{0}" invalid or not implemented yet'.format(constituents))

    return response
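# Sketch of the chunking schedule used by crawl() above: the first request covers
# offsets 0..chunksize-1, and follow-up requests start at every further multiple of
# chunksize below count_total. The numbers are illustrative only.
chunksize = 5000
count_total = 12500
begin_second_chunk = chunksize
offsets = [0] + list(range(begin_second_chunk, count_total, chunksize))
# offsets == [0, 5000, 10000] -> three upstream requests of up to 5000 documents each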