Example No. 1
def depatech_published_data_crawl_handler(request):
    """Crawl published-data at MTC depa.tech"""

    # Get hold of query expression and filter
    query = SmartBunch({
        'expression': request.params.get('expression', ''),
        'filter':     request.params.get('filter', ''),
        })
    log.info('query: {}'.format(query))

    if should_be_quoted(query.expression):
        query.expression = '"%s"' % query.expression

    # constituents: abstract, biblio and/or full-cycle
    constituents = request.matchdict.get('constituents', 'full-cycle')
    #print 'constituents:', constituents

    chunksize = int(request.params.get('chunksize', '5000'))

    try:
        result = depatech_crawl(constituents, query, chunksize)
        return result

    except Exception as ex:
        request.errors.add('depatech-crawl', 'crawl', str(ex))
        log.error(request.errors)
        log.error('query="{0}", exception:\n{1}'.format(query, _exception_traceback()))
Example No. 2
def issue_reporter_handler(request):

    targets = request.params.get('targets')

    report_data = request.json
    report_data.setdefault('application', {})
    report = SmartBunch.bunchify(report_data)

    # Add user information to issue report
    user = request.user
    if user:

        # Anonymize sensitive user data
        user.password = None
        user.upstream_credentials = None

        # Serialize user object and attach to report
        report.application.user = SmartBunch(json.loads(user.to_json()))

    # Send the whole beast to the standard application log
    log.error('Issue report [{targets}]:\n{report}'.format(
        report=report.pretty(),
        targets=targets
    ))

    # TODO: Store the issue report into database
    # TODO: What about other targets like "log:error", "log:warning", "human:support", "human:user"?

    # Send email report
    for target in read_list(targets):
        if target.startswith('email:'):
            recipient = target.replace('email:', '')
            email_issue_report(report, recipient)
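
A minimal sketch of how the "targets" parameter is expected to fan out, assuming read_list() splits a comma-separated string into items (the real helper lives in the application's utility module; the stand-in below is hypothetical):

targets = 'email:support@example.org,log:error'

def read_list_sketch(value):
    # Hypothetical stand-in for the real read_list() helper.
    return [item.strip() for item in value.split(',') if item.strip()]

for target in read_list_sketch(targets):
    if target.startswith('email:'):
        recipient = target.replace('email:', '')
        print(recipient)    # -> support@example.org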
Example No. 3
    def get_email_settings(self, vendor):
        """
        Read default/global email settings and
        update with per-vendor email settings.
        """

        # Container for email settings
        email_settings = SmartBunch({
            'addressbook': [],
            'content': SmartBunch(),
        })

        for setting_name in ['addressbook', 'content']:
            setting_key = 'email_{}'.format(setting_name)
            defaults = self.application_settings.get(setting_key)
            specific = self.application_settings.get(setting_key + ':' +
                                                     vendor)

            thing = deepcopy(defaults)
            if defaults and specific:
                thing.update(deepcopy(specific))

            for key, value in thing.items():
                thing[key] = value.decode('utf-8')

            email_settings[setting_name] = thing

        return email_settings
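
Sketch of the defaults-plus-override merge used above, with plain dicts standing in for the configuration values (keys are illustrative only):

from copy import deepcopy

defaults = {'from': 'navigator@example.org', 'subject': 'Report'}
specific = {'subject': 'Report for vendor X'}

thing = deepcopy(defaults)
if defaults and specific:
    thing.update(deepcopy(specific))

print(thing['subject'])    # -> Report for vendor X (vendor-specific value wins)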
Example No. 4
    def read(self):

        # Read metadata
        """
        out:
        "meta": {
            "status": "success",
            "params": {
                "sort": "pd desc, ucid asc",
                "rows": "250",
                "indent": "true",
                "qt": "premium",
                "timeAllowed": "300000",
                "q": "text:vibrat* AND (ic:G01F000184 OR cpc:G01F000184)",
                "start": "0",
                "wt": "json",
                "fl": "ucid,fam"
            },
            "pager": {
                "totalEntries": 6872,
                "entriesOnThisPage": 250,
                "firstPage": 1,
                "lastPage": 28,
                "previousPage": null,
                "currentPage": 1,
                "entriesPerPage": "250",
                "nextPage": 2
            },
            "name": "ifi",
            "time": "4.836163"
        }
        """
        self.meta.upstream.update({
            'name': 'ifi',
            'time': self.input['time'],
            'status': self.input['status'],
            'params': SmartBunch.bunchify(self.input['content']['responseHeader']['params']),
            'pager': SmartBunch.bunchify(self.input['content']['responseHeader'].get('pager', {})),
        })

        self.meta.navigator.count_total = int(self.meta.upstream.pager.totalEntries)
        self.meta.navigator.count_page = int(self.meta.upstream.pager.entriesOnThisPage)
        self.meta.navigator.offset = int(self.meta.upstream.params.start)
        self.meta.navigator.limit = int(self.meta.upstream.params.rows)
        self.meta.navigator.postprocess = SmartBunch()

        # Read content
        self.documents = self.input['content']['response']['docs']
        self.read_documents()
Example No. 5
    def get_datasource_settings(self):

        # Container for datasource settings
        datasource_settings = SmartBunch({
            'datasources': [],
            'datasource': SmartBunch(),
            'total': SmartBunch.bunchify({'fulltext_countries': [], 'details_countries': []}),
        })

        # Read datasource settings from configuration
        datasource_settings.datasources = read_list(self.application_settings.get('ip_navigator', {}).get('datasources'))
        datasource_settings.protected_fields = read_list(self.application_settings.get('ip_navigator', {}).get('datasources_protected_fields'))
        for datasource in datasource_settings.datasources:
            settings_key = 'datasource:{name}'.format(name=datasource)
            datasource_info = self.application_settings.get(settings_key, {})
            datasource_info['fulltext_enabled'] = asbool(datasource_info.get('fulltext_enabled', False))
            datasource_info['fulltext_countries'] = read_list(datasource_info.get('fulltext_countries', ''))
            datasource_info['details_enabled'] = asbool(datasource_info.get('details_enabled', False))
            datasource_info['details_countries'] = read_list(datasource_info.get('details_countries', ''))
            datasource_settings.datasource[datasource] = SmartBunch.bunchify(datasource_info)

            # Aggregate data for all countries
            datasource_settings.total.fulltext_countries += datasource_info['fulltext_countries']

        return datasource_settings
Example No. 6
    def __init__(self, input, options=None):

        # Input data and options
        self.input = input
        self.options = options and SmartBunch.bunchify(options) or SmartBunch()

        # Setup data structures
        self.setup()

        # Read input information
        self.read()

        # Run data munging actions
        if 'feature_family_remove' in self.options and self.options.feature_family_remove:
            self.remove_family_members()
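
The "options and X or Y" construct above is the classic and/or idiom from before conditional expressions; a minimal equivalent sketch using a plain dict in place of SmartBunch:

def normalize_options(options=None):
    # Falls back to an empty container when no options are passed.
    return dict(options) if options else dict()

print(normalize_options())                 # {}
print(normalize_options({'limit': 10}))    # {'limit': 10}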
Example No. 7
def ificlaims_client(options=None):
    options = options or SmartBunch()
    if 'vendor' in options and options.vendor == 'serviva':
        client = get_serviva_client()
    else:
        client = get_ificlaims_client()
    return client
Example No. 8
def status_upstream_ificlaims_handler(request):
    client = ificlaims_client()
    query = SmartBunch({
        'expression': 'pn:EP0666666',
    })
    data = client.search_real(query)
    assert data, 'Empty response from IFI CLAIMS'
    return "OK"
Example No. 9
def status_upstream_depatech_handler(request):
    client = get_depatech_client()
    query = SmartBunch({
        'expression': '(PC:DE AND DE:212016000074 AND KI:U1) OR AN:DE212016000074U1 OR NP:DE212016000074U1',
    })
    data = client.search_real(query)
    assert data, 'Empty response from MTC depa.tech'
    return "OK"
Example No. 10
    def setup(self):

        # Documents from upstream data source
        self.documents = []

        # Metadata information, upstream (raw) and downstream (unified)
        self.meta = SmartBunch.bunchify({
            'navigator': {},
            'upstream': {},
        })

        # Output information, upstream (raw) and downstream (unified)
        self.output = SmartBunch.bunchify({
            'meta': {},
            'numbers': [],
            'details': [],
            'navigator': {},
        })
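
Why bunchify: the nested dicts above become attribute-accessible, which is what the read() snippets elsewhere on this page rely on (e.g. self.meta.navigator.count_total). A minimal stand-in sketch, assuming SmartBunch behaves like the Bunch family of dict subclasses with attribute access:

class AttrDict(dict):
    # Hypothetical stand-in for SmartBunch's attribute access.
    __getattr__ = dict.__getitem__
    __setattr__ = dict.__setitem__

meta = AttrDict(navigator=AttrDict(), upstream=AttrDict())
meta.navigator.count_total = 6872
print(meta.navigator.count_total)    # 6872
print(meta['navigator'])             # {'count_total': 6872}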
Example No. 11
def make_request(client):

    #results = client.search('*:*')
    #pprint(results)

    #results = client.search('pa:siemens', 0, 10)
    #results = client.search('pa:siemens OR pa:bosch', 0, 10)
    #results = client.search('pa:(siemens OR bosch)', 0, 10)
    #results = client.search('text:"solar energy"', 0, 10)
    #results = client.search(SmartBunch({'expression': 'text:solar energy'}), SmartBunch({'offset': 0, 'limit': 10}))
    results = client.search(
        SmartBunch(
            {'expression': '{!complexphrase inOrder=true}"siemen* *haus"'}),
        SmartBunch({
            'offset': 0,
            'limit': 10
        }))
    #results = client.search(u'text:抑血管生成素的药物用途', 0, 10)
    #results = client.search(u'text:放射線を照射する放射線源と', 0, 10)
    #results = client.search(SmartBunch({'expression': 'pnctry:(de OR ep OR wo OR cn OR jp OR tw) AND pa:"taiwan paiho" AND pd:[20170101 TO 20170731]'}), SmartBunch({'offset': 0, 'limit': 50}))
    print json.dumps(results)
Example No. 12
    def get_vendor_settings(self):

        # Container for vendor settings
        vendor_settings = SmartBunch({
            'vendors': [],
            'vendor': SmartBunch(),
        })

        # Read vendor settings from configuration
        try:
            vendor_settings.vendors = read_list(self.application_settings.ip_navigator.vendors)
            assert vendor_settings.vendors
        except:
            raise ConfigurationError('No vendor configured in "{configfile}"'.format(configfile=self.configfile))

        for vendor in vendor_settings.vendors:

            settings_key = 'vendor:{name}'.format(name=vendor)
            if settings_key not in self.application_settings:
                raise ConfigurationError('Vendor "{vendor}" not configured in "{configfile}"'.format(
                    vendor=vendor, configfile=self.configfile))

            vendor_info = self.application_settings.get(settings_key, {})
            for key, value in vendor_info.iteritems():
                vendor_info[key] = value.decode('utf-8')

            if 'hostname_matches' in vendor_info:
                vendor_info.hostname_matches = read_list(vendor_info.hostname_matches)

            vendor_info.email = self.get_email_settings(vendor)

            vendor_settings.vendor[vendor] = SmartBunch.bunchify(vendor_info)

        return vendor_settings
Example No. 13
    def datasource_settings(self):
        """
        Return datasource settings while accounting for sensitive settings like API URI and credentials.
        """
        request = get_current_request()
        datasource_settings = SmartBunch.bunchify(request.registry.datasource_settings)
        if 'protected_fields' in datasource_settings:
            for fieldname in datasource_settings.protected_fields:
                for name, settings in datasource_settings.datasource.iteritems():
                    if fieldname in settings:
                        del settings[fieldname]
            del datasource_settings['protected_fields']
        return datasource_settings
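
Sketch of the protected-fields scrub above with plain dicts: fields listed under "protected_fields" are removed from every datasource entry before the settings leave the backend (field names below are illustrative only):

settings = {
    'protected_fields': ['api_password'],
    'datasource': {
        'ificlaims': {'fulltext_enabled': True, 'api_password': 'secret'},
    },
}
for fieldname in settings['protected_fields']:
    for name, ds in settings['datasource'].items():
        ds.pop(fieldname, None)
del settings['protected_fields']
print(settings)    # {'datasource': {'ificlaims': {'fulltext_enabled': True}}}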
Example No. 14
def depatech_search(query, options=None):

    options = options or SmartBunch()

    client = get_depatech_client()
    try:
        data = client.search(query, options)
        # Raise an exception on empty results to skip caching this response
        if data.meta.navigator.count_total == 0:
            raise NoResultsException('No results', data=data)
        return data

    except SearchException as ex:
        client.stale = True
        raise
Example No. 15
    def remove_family_members(self):

        # Filtering mechanics: Deduplicate by family id
        seen = {}
        removed = []
        removed_map = defaultdict(list)
        stats = SmartBunch(removed=0)

        def family_remover(item):

            fam = self.document_to_family_id(item)

            # Sanity checks on family id
            # Do not remove documents without valid family id
            if not fam or fam in ['0', '-1']:
                return True

            # "Seen" filtering logic
            if fam in seen:
                stats.removed += 1
                removed.append(item)
                removed_map[fam].append(item)
                return False
            else:
                seen[fam] = True
                #print 'representative: {rep} [{fam}]'.format(rep=item['publication_number'], fam=fam)
                return True

        # Update metadata and content

        # 1. Apply family cleansing filter to main documents response
        self.documents = list(filter(family_remover, self.documents))
        #print 'removed_map:'; pprint(removed_map)

        # 2. Add list of removed family members to output
        self.output.navigator.family_members = {'removed': removed}
        #self.output['family-members-removed'] = removed

        # 3. Update metadata
        self.meta.navigator.postprocess.action = 'feature_family_remove'
        self.meta.navigator.postprocess.info = stats
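
The family filter above is a seen-set deduplication keyed on family id; a minimal standalone sketch of the same mechanics with plain data:

seen = {}

def keep_first_of_family(item):
    fam = item['fam']
    # Keep documents without a valid family id.
    if not fam or fam in ['0', '-1']:
        return True
    # Later family member: drop it.
    if fam in seen:
        return False
    # Family representative: keep it.
    seen[fam] = True
    return True

docs = [{'pn': 'EP1', 'fam': '7'}, {'pn': 'EP2', 'fam': '7'}, {'pn': 'EP3', 'fam': '9'}]
print([doc['pn'] for doc in filter(keep_first_of_family, docs)])    # ['EP1', 'EP3']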
Example No. 16
    def read(self):

        # Read metadata
        """
        in:
        "info": {
            "Info": "Search processed in 2905",
            "Success": "true",
            "ResultLength": 250,
            "FamCount": "1200",
            "DocCount": "5432",
            "MemCount": "3599",
            "Limit": 250,
            "Offset": 0,
            "ResultSetId": "4153687"
        },
        """
        self.meta.upstream.update(self.input['info'])
        self.meta.upstream.update({
            'name': 'sip',
            # TODO: Reference from IFI CLAIMS, fill up/unify.
            #'time': self.input['time'],
            #'status': self.input['status'],
            #'params': SmartBunch.bunchify(self.input['content']['responseHeader']['params']),
            #'pager': SmartBunch.bunchify(self.input['content']['responseHeader'].get('pager', {})),
        })

        self.meta.navigator.count_total = int(self.meta.upstream.MemCount)
        self.meta.navigator.count_page = len(self.input['results'])
        self.meta.navigator.offset = int(self.meta.upstream.Offset)
        self.meta.navigator.limit = int(self.meta.upstream.Limit)
        self.meta.navigator.postprocess = SmartBunch()

        # Read content
        """
        in:
        "results": [{
        }],
        """
        self.documents = self.input['results']
        self.read_documents()
Example No. 17
    def read(self):

        #print 'input:', self.input

        # Read metadata
        """
        input:
        {
            "_shards": {
                "failed": 0,
                "successful": 5,
                "total": 5
            },
            "hits": {
                "hits": [
                    {
                        "_id": "DE.000202013003344.U1",
                        "_index": "deparom",
                        "_score": 13.234067,
                        "_source": {
                            "AB": "<p num=\"0000\">Rettungsensemble (1) mit Seilklemmen (2, 3), dadurch gekennzeichnet, dass es folgende, miteinander über einen Seilzug (4) verbundene Komponenten umfasst: <br/>– eine erste Seilklemme (2) mit wenigstens einer Umlenkrolle (21), zum verschieblichen Fixieren des Körpers des Benutzers an einem Seil; <br/>– eine zweite Seilklemme (3) mit wenigstens einer Umlenkrolle (31), zur verschieblichen Befestigung an dem Seil oberhalb der Position der ersten Seilklemme (2); wobei <br/>– der Seilzug (4) mit seinem einen Ende an der zweiten Seilklemme (3) oder einer ihrer Umlenkrollen (31) befestigt und in die Umlenkrollen der ersten und zweiten Seilklemme eingelegt ist, diese miteinander verbindet und zusammen mit diesen einen Flaschenzug bildet, und dessen anderes Ende frei hängt und zur Bedienung des Flaschenzugs vorgesehen ist.</p><p num=\"\"><de-figure num=\"0\"></de-figure></p>",
                            "AD": "20130410",
                            "AN": "DE202013003344",
                            "DE": "202013003344",
                            "DP": "20131205",
                            "GT": "Rettungsensemble zur Bergung aus Gletscherspalten",
                            "IC": [
                                "A63B",
                                "A63B0029",
                                "A63B002900"
                            ],
                            "KI": "U1",
                            "MC": [
                                "A63B",
                                "A63B0029",
                                "A63B002900"
                            ],
                            "NP": "CH64912",
                            "PA": "Mammut Sports Group AG, Seon, CH",
                            "PC": "DE",
                            "PD": "20120509",
                            "RN": "Bogensberger Patent- & Markenbüro, Eschen, LI"
                        },
                        "_type": "DEP"
                    }
                ],
                "max_score": 13.234067,
                "total": 1
            },
            "timed_out": false,
            "took": 7
        }
        """
        self.meta.upstream.update({
            'name': 'depatech',
            'time': self.input['took'],
            'status': 'success',
            #'params': SmartBunch.bunchify(self.input['content']['responseHeader']['params']),
            #'pager': SmartBunch.bunchify(self.input['content']['responseHeader'].get('pager', {})),
        })

        self.meta.navigator.count_total = int(self.input['hits']['total'])
        #self.meta.navigator.count_page  = int(self.meta.upstream.pager.entriesOnThisPage)
        self.meta.navigator.offset = int(self.options.offset)
        self.meta.navigator.limit = int(self.options.limit)
        self.meta.navigator.max_hits = int(self.options.max_hits)
        self.meta.navigator.postprocess = SmartBunch()

        # Read content
        self.documents = self.input['hits']['hits']
        self.read_documents()
Example No. 18
    def search_real(self, query, options=None):

        query.setdefault('filter', '')

        options = options or SmartBunch()
        options.setdefault('offset', 0)
        options.setdefault('limit', self.pagesize)

        offset = options.offset
        limit = options.limit

        log.info(
            u"{backend_name}: searching documents, expression='{0}', offset={1}, limit={2}; user={username}"
            .format(query.expression, offset, limit, **self.__dict__))

        if not self.token or self.stale:
            self.login()

        starttime = timeit.default_timer()

        # Define search request URI
        # https://cdws.ificlaims.com/search/query?q=pa:facebook
        # https://cdws.ificlaims.com/search/query?q=*:*&fl=ucid&rows=1
        uri = self.uri + self.path_search

        # Define search request parameters
        # 'family.simple': True,
        params = {
            'q': query.expression,
            'fq': query.filter,
            'sort': 'pd desc, ucid asc',
            'fl': 'ucid,fam',
            'start': offset,
            'rows': limit,
        }

        log.info(
            u'IFI CLAIMS search. query={query}, uri={uri}, params={params}, options={options}'
            .format(query=query,
                    uri=uri,
                    params=params,
                    options=options.dump()))

        # Perform search request
        headers = self.get_authentication_headers()
        headers.update({'Accept': 'application/json'})
        try:
            response = requests.get(uri,
                                    params=params,
                                    headers=headers,
                                    verify=self.tls_verify)
        except RequestException as ex:
            self.logout()
            raise self.search_failed(
                ex=ex,
                user_info=
                'Error or timeout while connecting to upstream database. Database might be offline.',
                meta={
                    'username': self.username,
                    'uri': uri
                })
        duration = timeit.default_timer() - starttime

        #print "response:", response.content        # debugging

        # Process search response
        if response.status_code == 200:
            #print "response:", response.content        # debugging

            response_data = json.loads(response.content)
            if response_data['status'] == 'success':

                # Debugging: Simulate error
                #response_data['content']['error'] = {'code': 503, 'msg': 'no servers hosting shard'}

                # Handle search expression errors
                if 'error' in response_data['content']:
                    upstream_error = response_data['content']['error']

                    if 'msg' not in upstream_error:
                        upstream_error['msg'] = 'Reason unknown'

                    message = u'Response status code: {code}\n\n{msg}'.format(
                        **upstream_error)

                    # Enrich "maxClauseCount" message, e.g. raised by {!complexphrase}text:"auto* AND leucht*"~5
                    if upstream_error["code"] == 500 and u'maxClauseCount is set to' in upstream_error["msg"]:
                        raise self.search_failed(
                            user_info=
                            u'Too many terms in phrase expression, wildcard term prefixes might by too short.',
                            message=message,
                            response=response)

                    # Enrich "no servers hosting shard" message
                    elif upstream_error["code"] == 503 and \
                        (
                            u'no servers hosting shard' in upstream_error["msg"] or \
                            u'No server is available' in upstream_error["msg"]
                        ):
                        raise self.search_failed(
                            user_info=
                            u'Error while connecting to upstream database. Database might be offline.',
                            message=message,
                            response=response)

                    # Regular traceback
                    elif upstream_error["code"] == 500 and 'trace' in upstream_error:
                        message = u'Response status code: {code}\n\n{trace}'.format(
                            **upstream_error)
                        raise self.search_failed(
                            user_info=u'Unknown exception at search backend',
                            message=message,
                            response=response)

                    # Enrich "SyntaxError" exception
                    elif upstream_error["code"] == 400 and u'ParseException' in upstream_error["msg"]:
                        user_info = re.sub(
                            r'.*(Encountered.*at line.*?\.).*',
                            r'SyntaxError, can not parse query expression: \1',
                            upstream_error["msg"],
                            flags=re.DOTALL)
                        raise self.search_failed(user_info=user_info,
                                                 message=message,
                                                 response=response)

                    else:
                        raise self.search_failed(user_info=message,
                                                 response=response)

                # Mogrify search response
                # TODO: Generalize between all search backends
                sr = IFIClaimsSearchResponse(response_data, options=options)
                result = sr.render()
                duration = round(duration, 1)

                # TODO: Unify between IFI CLAIMS and SIP
                log.info(
                    '{backend_name}: Search succeeded. duration={duration}s, meta=\n{meta}'
                    .format(duration=duration,
                            meta=result['meta'],
                            **self.__dict__))

                if not result['numbers']:
                    log.warn(
                        '{backend_name}: Search had empty results. duration={duration}s, meta=\n{meta}'
                        .format(duration=duration,
                                meta=result['meta'],
                                **self.__dict__))

                return result

            elif response_data['status'] == 'error':

                user_info = None
                if response_data['message'] == 'JSON error: failed to read response object':
                    user_info = u'Error while connecting to upstream database. Database might be offline.'

                raise self.search_failed(user_info=user_info,
                                         message=response_data['message'],
                                         response=response)

            else:
                raise self.search_failed('Search response could not be parsed',
                                         response=response)

        else:
            # print "response:", response.content        # debugging

            self.logout()

            # Strip HTML from response body
            response_content = response.content
            if response.headers['Content-Type'].startswith('text/html'):
                response_content = re.sub('<[^<]+?>', '',
                                          response_content).strip().replace(
                                              '\r\n', ', ')

            # Build alternative basic error structure
            upstream_error = {
                'code': response.status_code,
                'reason': response.reason,
                'content': response_content,
            }

            message = json.dumps(upstream_error)

            raise self.search_failed(
                user_info=
                u'Error while connecting to upstream database. Database might be offline.',
                message=message,
                response=response)

        raise self.search_failed(response=response)
Example No. 19
    def get_datasource_settings(self, vendor=None):

        # Container for datasource settings.
        datasource_settings = SmartBunch({
            'datasources': [],
            'datasource': SmartBunch(),
            'total': SmartBunch.bunchify({'fulltext_countries': [], 'details_countries': []}),
        })

        # Read datasource settings from configuration.
        datasource_settings.datasources = read_list(self.application_settings.get('ip_navigator', {}).get('datasources'))
        datasource_settings.protected_fields = read_list(self.application_settings.get('ip_navigator', {}).get('datasources_protected_fields'))

        for datasource in datasource_settings.datasources:
            datasource_info = SmartBunch()
            if vendor is None:
                settings_key = 'datasource:{name}'.format(name=datasource)
            else:
                settings_key = 'datasource:{name}:{vendor}'.format(name=datasource, vendor=vendor)

            ds_settings = self.application_settings.get(settings_key, {})
            datasource_info.setdefault('fulltext_enabled', asbool(ds_settings.get('fulltext_enabled', False)))
            datasource_info.setdefault('fulltext_countries', read_list(ds_settings.get('fulltext_countries', '')))
            datasource_info.setdefault('details_enabled', asbool(ds_settings.get('details_enabled', False)))
            datasource_info.setdefault('details_countries', read_list(ds_settings.get('details_countries', '')))
            for key, value in ds_settings.iteritems():
                datasource_info.setdefault(key, value)

            datasource_settings.datasource[datasource] = SmartBunch.bunchify(datasource_info)

            # Aggregate data for all countries.
            datasource_settings.total.fulltext_countries += datasource_info['fulltext_countries']

        return datasource_settings
Example No. 20
def ificlaims_published_data_search_handler(request):
    """Search for published-data at IFI CLAIMS Direct"""

    # Get hold of query expression and filter
    query = SmartBunch({
        'expression': request.params.get('expression', ''),
        'filter': request.params.get('filter', ''),
    })
    log.info('Query: {}'.format(query))

    # Parse expression, extract and propagate keywords to user interface
    parser = IFIClaimsParser(query.expression)
    propagate_keywords(request, parser)

    # Fixup query: wrap into quotes if cql string is a) unspecific, b) contains spaces and c) is still unquoted
    if should_be_quoted(query.expression):
        query.expression = '"%s"' % query.expression

    # Lazy-fetch more entries
    # TODO: get from patzilla.access.ificlaims
    limit = 250
    offset_local = int(request.params.get('range_begin', 0))
    offset_remote = int(offset_local / limit) * limit

    # Compute query options, like
    # - limit
    # - sorting
    # - whether to remove family members
    options = SmartBunch()
    options.update({
        'limit': limit,
        'offset': offset_remote,
    })

    # Propagate request parameters to search options parameters
    request_to_options(request, options)

    try:
        data = ificlaims_search(query, options)
        #print data.prettify()      # debugging
        return data

    except LoginException as ex:
        request.errors.add('ificlaims-search', 'login', ex.details)
        log.warn(request.errors)

    except SyntaxError as ex:
        request.errors.add('ificlaims-search', 'expression', unicode(ex.msg))
        log.warn(request.errors)

    except SearchException as ex:
        message = ex.get_message()
        request.errors.add('ificlaims-search', 'search', message)
        log.warn(request.errors)

    except NoResultsException as ex:
        # Forward response to let the frontend recognize zero hits
        request.response.status = HTTPNotFound.code
        return ex.data

    except OperationFailure as ex:
        message = unicode(ex)
        request.errors.add('ificlaims-search', 'internals', message)
        log.error(request.errors)

    except Exception as ex:
        message = handle_generic_exception(request, ex, 'ificlaims-search',
                                           query)
        request.errors.add('ificlaims-search', 'search', message)
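
Worked example for the offset arithmetic above: the locally requested offset is snapped down to the nearest multiple of the upstream page size before querying, e.g.:

limit = 250
offset_local = 620                                    # range_begin requested by the UI
offset_remote = int(offset_local / limit) * limit
print(offset_remote)                                  # 500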
Example No. 21
def depatech_published_data_search_handler(request):
    """Search for published-data at MTC depa.tech"""

    # Get hold of query expression and filter
    expression = request.params.get('expression', '')
    filter = request.params.get('filter', '')
    query = SmartBunch({
        'syntax':     'lucene',
        'expression': expression,
        'filter':     filter,
    })
    if expression.startswith('DEPAROM V1.0') or expression.startswith('deparom:'):
        query.syntax = 'deparom'

    log.info('Query: {}'.format(query))

    # Parse expression, extract and propagate keywords to user interface
    if query.syntax == 'lucene':
        parser = DepaTechParser(query.expression)
        keywords_to_response(request, parser)

    # TODO: Parse DEPAROM query expression and extract keywords

    # Fixup query: wrap into quotes if cql string is a) unspecific, b) contains spaces and c) is still unquoted
    if should_be_quoted(query.expression):
        query.expression = '"%s"' % query.expression

    # Lazy-fetch more entries
    # TODO: get from patzilla.access.depatech
    limit = 250
    offset_local = int(request.params.get('range_begin', 0))
    offset_remote = int(offset_local / limit) * limit

    # Compute query options, like
    # - limit
    # - sorting
    # - whether to remove family members
    options = SmartBunch()
    options.update({
        'limit': limit,
        'offset': offset_remote,
    })

    # Propagate request parameters to search options parameters
    request_to_options(request, options)

    try:
        data = depatech_search(query, options)
        #print data.prettify()      # debugging
        return data

    except LoginException as ex:
        request.errors.add('depatech-search', 'login', ex.details)
        log.warn(request.errors)

    except SyntaxError as ex:
        request.errors.add('depatech-search', 'expression', str(ex.msg))
        log.warn(request.errors)

    except SearchException as ex:
        message = ex.get_message()
        request.errors.add('depatech-search', 'search', message)
        log.warn(request.errors)

    except NoResultsException as ex:
        # Forward response to let the frontend recognize zero hits
        request.response.status = HTTPNotFound.code
        return ex.data

    except OperationFailure as ex:
        message = str(ex)
        request.errors.add('depatech-search', 'internals', message)
        log.error(request.errors)

    except Exception as ex:
        message = handle_generic_exception(request, ex, 'depatech-search', query)
        request.errors.add('depatech-search', 'search', message)
Example No. 22
    def search(self, expression, options=None):

        options = options or SmartBunch()

        options.setdefault('offset', 0)
        options.setdefault('limit', self.pagesize)

        offset = options.offset
        limit = options.limit

        log.info(
            "{backend_name}: searching documents, expression='{0}', offset={1}, limit={2}"
            .format(expression, offset, limit, **self.__dict__))

        if not self.sessionid or self.stale:
            self.login()

        starttime = timeit.default_timer()
        try:
            response = requests.post(self.uri + '/search/new',
                                     data={
                                         'session': self.sessionid,
                                         'searchtree': expression
                                     })
        except (ConnectionError, ConnectTimeout) as ex:
            log.error(
                'SIP search for user "{username}" at "{uri}" failed. Reason: {0} {1}.'
                .format(ex.__class__,
                        ex.message,
                        username=self.username,
                        uri=self.uri))
            self.logout()
            raise SearchException(
                ex.message,
                sip_info=
                'Error or timeout while connecting to upstream database. Database might be offline.'
            )

        # Process search response
        if response.status_code == 200:
            #print "SIP search response (raw)"; print response.content        # debugging
            try:
                search_response = self._search_parse_xml(response.content)

                if search_response['success'] == 'false':
                    raise SearchException(
                        'Search failed',
                        sip_response=search_response['response'])

                if 'ResultSetId' in search_response['data']:

                    search_info = search_response['data']
                    ResultSetId = search_info['ResultSetId']

                    # Inject offset and limit into metadata, pretend it comes from server
                    search_info['Offset'] = offset
                    search_info['Limit'] = limit

                    # perform second request to actually retrieve the results by ResultSetId
                    search_results = self.getresults(ResultSetId, options)
                    #print "SIP search results:", search_results

                    duration = timeit.default_timer() - starttime
                    log.info(
                        'Search succeeded. duration={0}s, search_info={1}'.
                        format(round(duration, 1), search_info))

                    upstream_response = {
                        'info': search_info,
                        'results': search_results or [],
                    }

                    # Mogrify search response
                    # TODO: Generalize between all search backends
                    sr = SipSearchResponse(upstream_response, options=options)
                    result = sr.render()
                    duration = round(duration, 1)

                    # TODO: Unify between SIP and IFI CLAIMS
                    log.info(
                        '{backend_name}: Search succeeded. duration={duration}s, meta=\n{meta}'
                        .format(duration=duration,
                                meta=result['meta'].prettify(),
                                **self.__dict__))

                    if not result['numbers']:
                        log.warn(
                            '{backend_name} search from "{user}" for "{expression}" had empty results.'
                            .format(user=self.username,
                                    expression=expression,
                                    **self.__dict__))

                    return result

                else:
                    message = 'Search failed. Reason: Upstream response lacks valid ResultSetId. content={0}'.format(
                        response.text)
                    raise SearchException(
                        message,
                        sip_info=
                        'Search failed. Search response could not be parsed.')

            except Exception as ex:
                log.error(
                    'Search failed. {name}: {message}. expression={expression}, response={response}'
                    .format(name=ex.__class__.__name__,
                            message=ex.message,
                            response=response.text,
                            expression=expression))
                raise

        else:
            response_status = str(response.status_code) + ' ' + response.reason
            message = 'SIP search failed. Reason: response status != 200. status={0}, content={1}'.format(
                response_status, response.text)
            log.error(message)
            raise SearchException(
                message,
                sip_info=
                'HTTP error "{status}" while searching upstream database'.
                format(status=response_status))
Example No. 23
    def search_real(self, query, options=None):
        options = options or SmartBunch()

        options.setdefault('offset', 0)
        options.setdefault('limit', self.pagesize)
        options.setdefault('max_hits', self.search_max_hits)

        offset = options.offset
        limit = options.limit
        transport = 'querystring'

        # Use DEPAROM Query Translator
        # https://depa.tech/api/manual/dqt-translator/
        # https://api.depa.tech/dqt/query/es
        if query.expression and query.syntax == 'deparom':
            transport = 'json'
            query.expression = self.translate_deparom_query(query.expression)

        log.info(
            u"{backend_name}: searching documents, expression='{0}', offset={1}, limit={2}; user={username}"
            .format(query.expression, offset, limit, **self.__dict__))

        starttime = timeit.default_timer()

        # Define search request URI
        # https://api.depa.tech/es/deparom/_search?q=AB:cloud-computing
        uri = self.uri + self.path_search

        # Define search request parameters
        # 'family.simple': True,
        params = {
            'q': query.expression,
            #'fq': query.filter,
            #'sort': 'pd desc, ucid asc',
            #'fl': 'ucid,fam',
            'from': offset,
            'size': limit,
        }

        log.info(
            u'{backend_name}: query={query}, uri={uri}, params={params}, options={options}'
            .format(query=query,
                    uri=uri,
                    params=params,
                    options=options.dump(),
                    backend_name=self.backend_name))

        # Perform search request
        headers = {}
        headers.update({'Accept': 'application/json'})
        try:
            if transport == 'querystring':
                response = requests.get(uri,
                                        params=params,
                                        headers=headers,
                                        auth=(self.username, self.password),
                                        verify=self.tls_verify)
            else:
                response = requests.post(uri,
                                         data=query.expression,
                                         headers=headers,
                                         auth=(self.username, self.password),
                                         verify=self.tls_verify)
        except RequestException as ex:
            raise self.search_failed(
                ex=ex,
                user_info=
                'Error or timeout while connecting to upstream database. Database might be offline.',
                meta={
                    'username': self.username,
                    'uri': uri
                })
        duration = timeit.default_timer() - starttime

        # Process search response
        if response.status_code == 200:
            #print "response:", response.content        # debugging

            response_data = json.loads(response.content)
            if True:

                # Debugging: Simulate error
                #response_data['content']['error'] = {'code': 503, 'msg': 'no servers hosting shard'}

                # Mogrify search response
                # TODO: Generalize between all search backends
                sr = DepaTechSearchResponse(response_data, options=options)
                result = sr.render()
                duration = round(duration, 1)

                # TODO: Unify between IFI CLAIMS and depa.tech
                log.info(
                    '{backend_name}: Search succeeded. duration={duration}s, meta=\n{meta}'
                    .format(duration=duration,
                            meta=result['meta'].prettify(),
                            **self.__dict__))

                if not result['numbers']:
                    log.warn(
                        '{backend_name}: Search had empty results. duration={duration}s, meta=\n{meta}'
                        .format(duration=duration,
                                meta=result['meta'].prettify(),
                                **self.__dict__))

                return result

            #elif response_data['status'] == 'error':
            #    raise self.search_failed(response_data['message'], response=response)

            else:
                raise self.search_failed('Search response could not be parsed',
                                         response=response)

        elif response.status_code in [400, 500] and response.headers.get(
                'Content-Type', '').startswith('application/json'):

            response_data = json.loads(response.content)

            # Handle search expression errors
            if 'error' in response_data:
                upstream_error = response_data['error']['caused_by']
                upstream_error['code'] = response_data['status']

                if 'reason' not in upstream_error:
                    upstream_error['reason'] = 'Reason unknown'

                message = u'Response status code: {code}\n\n{reason}'.format(
                    **upstream_error)

                raise self.search_failed(
                    user_info=u'Error searching depa.tech.',
                    message=message,
                    response=response)

        raise self.search_failed(response=response)
Example No. 24
def sip_published_data_search_handler(request):
    """Search for published-data at SIP"""

    #request.errors.add('sip-search', 'login', "SIP data source disabled, please use alternative data source.")
    #return

    # XML query expression
    query = request.params.get('expression', '')
    log.info('Raw query: ' + query)

    # fixup query: wrap into quotes if cql string is a) unspecific, b) contains spaces and c) is still unquoted
    if should_be_quoted(query):
        query = '"%s"' % query

    #propagate_keywords(request, query_object)

    # lazy-fetch more entries up to maximum of SIP
    # TODO: get from patzilla.access.sip
    limit = 250
    offset_local = int(request.params.get('range_begin', 1))
    offset_remote = int(offset_local / limit) * limit

    # Compute query options, like
    # - limit
    # - sorting
    # - whether to remove family members
    # - whether to return all family members
    options = SmartBunch()
    options.update({
        'limit':  limit,
        'offset': offset_remote,
    })

    # Propagate request parameters to search options parameters
    request_to_options(request, options)

    # currently not handled by search handler, it's already handled on xml expression builder level
    #if asbool(request.params.get('query_data[modifiers][family-full]')):
    #    options.update({'feature_family_full': True})

    try:
        data = sip_published_data_search(query, options)
        #print ' SIPsearch response:'; print data.prettify()      # debugging
        return data

    except LoginException as ex:
        request.errors.add('sip-search', 'login', ex.sip_info)

    except SyntaxError as ex:
        request.errors.add('sip-search', 'expression', str(ex.msg))
        log.warn(request.errors)

    except SearchException as ex:
        message = ex.get_message()
        request.errors.add('sip-search', 'search', message)
        log.error(request.errors)

    except NoResultsException as ex:
        # Forward response to let the frontend recognize zero hits
        request.response.status = HTTPNotFound.code
        return ex.data

    except OperationFailure as ex:
        message = unicode(ex)
        message = re.sub(u'namespace: .*', u'', message)
        request.errors.add('sip-search', 'internals', message)
        log.error(request.errors)
Example No. 25
    def crawl(self, constituents, expression, chunksize):

        if constituents not in ['pub-number', 'biblio']:
            raise ValueError(
                'constituents "{0}" invalid or not implemented yet'.format(
                    constituents))

        real_constituents = constituents
        if constituents == 'pub-number':
            constituents = ''

        # fetch first chunk (1-chunksize) from upstream
        #first_chunk = self.search(expression, 0, chunksize)
        first_chunk = self.search_method(
            expression, SmartBunch({
                'offset': 0,
                'limit': chunksize
            }))
        #print first_chunk

        #total_count = int(first_chunk['meta'].get('pager', {}).get('totalEntries', 0))
        count_total = first_chunk.meta.navigator.count_total
        log.info(self.lm('Crawl count_total: {}'.format(count_total)))

        # Limit maximum size
        count_total = min(count_total, self.crawl_max_count)
        """
        # SIP:
        pointer_total_count = JsonPointer('/meta/MemCount')
        total_count = int(pointer_total_count.resolve(first_chunk))
        log.info('SipClient.crawl total_count: %s', total_count)

        # Limit maximum size
        # TODO: make configurable, put into instance variable
        count_total = min(count_total, 5000)
        """

        # collect upstream results
        begin_second_chunk = chunksize
        chunks = [first_chunk]
        log.info(
            self.lm(
                'Crawling {count_total} items with {chunksize} per request'.
                format(count_total=count_total, chunksize=chunksize)))
        for offset in range(begin_second_chunk, count_total, chunksize):

            # Don't hammer the upstream data source
            time.sleep(1)

            log.info(
                self.lm('Crawling from offset {offset}'.format(offset=offset)))
            chunk = self.search_method(
                expression, SmartBunch({
                    'offset': offset,
                    'limit': chunksize
                }))
            chunks.append(chunk)

        # Merge chunks into single result
        all_numbers = []
        all_details = []
        # TODO: summarize elapsed_time
        for chunk in chunks:
            #print 'chunk:', chunk
            all_numbers += chunk['numbers']
            all_details += chunk['details']

        # Report about result
        result_count = len(all_details)
        log.info(
            self.lm('Crawling finished. result count: {result_count}'.format(
                result_count=result_count)))

        # Bulk response
        response = None
        if real_constituents == 'pub-number':
            response = first_chunk
            response['meta'] = {
                'Success': 'true',
                'MemCount': str(len(all_numbers))
            }
            response['numbers'] = all_numbers
            del response['details']

        elif real_constituents == 'biblio':
            response = first_chunk
            #print 'all_details:', all_details
            response['meta'] = {
                'Success': 'true',
                'MemCount': str(len(all_numbers))
            }
            response['details'] = all_details
            #del response['details']

        if not response:
            raise ValueError(
                'constituents "{0}" invalid or not implemented yet'.format(
                    constituents))

        return response
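
Worked example for the chunked crawl loop above: the first chunk is fetched separately, and the remaining offsets are multiples of chunksize up to count_total (numbers below are illustrative):

chunksize = 2500
count_total = 6872
begin_second_chunk = chunksize
print(list(range(begin_second_chunk, count_total, chunksize)))    # [2500, 5000]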