예제 #1
0
    def pair_to_solr(cls, key, value, modifiers=None):

        try:
            fieldname = cls.datasource_indexnames[key]
        except KeyError:
            return

        expression = None
        format = u'{0}:{1}'


        # ------------------------------------------
        #   value mogrifiers
        # ------------------------------------------
        if key == 'patentnumber':
            # TODO: parse more sophisticated to make things like "EP666666 or EP666667" or "?query=pn%3AEP666666&datasource=ifi" possible
            # TODO: use different normalization flavor for IFI, e.g. JP01153210A will not work as JPH01153210A, which is required by OPS
            value = normalize_patent(value, for_ops=False)

        elif key == 'pubdate':

            """
            - pd:[19800101 TO 19851231]
            - pd:[* TO 19601231]
            - pdyear:[1980 TO 1985]
            - pdyear:[* TO 1960]
            """

            try:

                parsed = False

                # e.g. 1991
                if len(value) == 4 and value.isdigit():
                    fieldname = 'pdyear'
                    parsed = True

                # e.g. 1990-2014, 1990 - 2014
                value = year_range_to_within(value)

                # e.g.
                # within 1978,1986
                # within 1900,2009-08-20
                # within 2009-08-20,2011-03-03
                if 'within' in value:
                    within_dates = parse_date_within(value)
                    elements_are_years = all([len(value) == 4 and value.isdigit() for value in within_dates.values()])
                    if elements_are_years:
                        fieldname = 'pdyear'

                    else:
                        if within_dates['startdate']:
                            within_dates['startdate'] = parse_date_universal(within_dates['startdate']).format('YYYYMMDD')

                        if within_dates['enddate']:
                            within_dates['enddate'] = parse_date_universal(within_dates['enddate']).format('YYYYMMDD')

                    if not within_dates['startdate']:
                        within_dates['startdate'] = '*'

                    if not within_dates['enddate']:
                        within_dates['enddate'] = '*'

                    expression = '{fieldname}:[{startdate} TO {enddate}]'.format(fieldname=fieldname, **within_dates)

                elif not parsed:
                    value_date = parse_date_universal(value)
                    if value_date:
                        value = value_date.format('YYYYMMDD')
                    else:
                        raise ValueError(value)

            except Exception as ex:
                message = 'IFI CLAIMS query: Invalid date or range expression "{0}". Reason: {1}.'.format(value, ex)
                logger.warn(message + '\nException was:\n{0}'.format(_exception_traceback()))
                return {'error': True, 'message': message}

        elif key == 'inventor' or key == 'applicant':
            if not has_booleans(value) and should_be_quoted(value):
                value = u'"{0}"'.format(value)

        elif key == 'class':

            # v1: Naive implementation can only handle single values
            #value = ifi_convert_class(value)

            # v2: Advanced implementation can handle expressions on field "class"
            # Translate class expression from "H04L12/433 or H04L12/24"
            # to "(ic:H04L0012433 OR cpc:H04L0012433) OR (ic:H04L001224 OR cpc:H04L001224)"
            try:

                # Put value into parenthesis, to properly capture expressions
                if value:
                    value = u'({value})'.format(value=value)

                # Parse value as simple query expression
                query_object = CQL(cql=value)

                # Rewrite all patent classifications in query expression ast from OPS format to IFI format
                rewrite_classes_ifi(query_object, format, fieldname)

                # Serialize into appropriate upstream datasource query expression syntax
                expression = query_object.dumps()

            except pyparsing.ParseException as ex:
                return {'error': True, 'message': '<pre>' + str(ex.explanation) + '</pre>'}


        # ------------------------------------------
        #   surround with parentheses
        # ------------------------------------------
        if key in ['fulltext', 'inventor', 'applicant', 'country', 'citation']:
            if has_booleans(value) and not should_be_quoted(value) and not '{!complexphrase' in value:
                value = u'({0})'.format(value)

        # ------------------------------------------
        #   expression formatter
        # ------------------------------------------
        # Serialize into appropriate upstream datasource query expression syntax
        if not expression:
            if key == 'fulltext' and '{!complexphrase' in value:
                expression = value
            else:
                expression = format_expression(format, fieldname, value)
            #print 'expression:', expression

        # ------------------------------------------
        #   final polishing
        # ------------------------------------------
        # Solr(?) syntax: boolean operators must be uppercase
        if has_booleans(expression):
            boolis = [' or ', ' and ', ' not ']
            for booli in boolis:
                expression = expression.replace(booli, booli.upper())

        return {'query': expression}
예제 #2
0
    def pair_to_elasticsearch(cls, key, value, modifiers=None):

        try:
            fieldname = cls.datasource_indexnames[key]
        except KeyError:
            return

        expression = None
        format = u'{0}:{1}'

        # ------------------------------------------
        #   value mogrifiers
        # ------------------------------------------
        if key == 'patentnumber':

            # Transform into distinct fields PC, DE, KI

            #if has_booleans(value):
            #    value = '({})'.format(value)

            expression_parts = []

            # Publication number
            patent = split_patent_number(value)

            patent_normalized = normalize_patent(patent, for_ops=False)
            if patent_normalized:
                patent = patent_normalized

            if patent:
                subexpression = u'PC:{country} AND DE:{number}'.format(
                    **patent)
                if patent['kind']:
                    subexpression += u' AND KI:{kind}'.format(**patent)
                expression_parts.append(u'({})'.format(subexpression))

            # Application number
            subexpression = u'AN:{}'.format(value)
            expression_parts.append(subexpression)
            expression = u' OR '.join(expression_parts)

            # Priority number
            subexpression = u'NP:{}'.format(value)
            expression_parts.append(subexpression)
            expression = u' OR '.join(expression_parts)

        elif key == 'pubdate':
            """
            - DP:[19800101 TO 19851231]
            - DP:[* TO 19601231]
            """

            try:

                parsed = False

                # e.g. 1991
                if len(value) == 4 and value.isdigit():
                    value = u'within {}0101,{}1231'.format(value, value)

                # e.g. 1990-2014, 1990 - 2014
                value = year_range_to_within(value)

                # e.g.
                # within 1978,1986
                # within 1900,2009-08-20
                # within 2009-08-20,2011-03-03
                if 'within' in value:
                    within_dates = parse_date_within(value)

                    if within_dates['startdate']:
                        if len(within_dates['startdate']) == 4:
                            within_dates['startdate'] += '0101'
                        within_dates['startdate'] = parse_date_universal(
                            within_dates['startdate']).format('YYYYMMDD')
                    else:
                        within_dates['startdate'] = '*'

                    if within_dates['enddate']:
                        if len(within_dates['enddate']) == 4:
                            within_dates['enddate'] += '1231'
                        within_dates['enddate'] = parse_date_universal(
                            within_dates['enddate']).format('YYYYMMDD')
                    else:
                        within_dates['enddate'] = '*'

                    expression = '{fieldname}:[{startdate} TO {enddate}]'.format(
                        fieldname=fieldname, **within_dates)

                elif not parsed:
                    value_date = parse_date_universal(value)
                    if value_date:
                        value = value_date.format('YYYYMMDD')
                    else:
                        raise ValueError(value)

            except Exception as ex:
                message = 'depatech query: Invalid date or range expression "{0}". Reason: {1}.'.format(
                    value, ex)
                logger.warn(
                    message +
                    ' Exception was: {0}'.format(_exception_traceback()))
                return {'error': True, 'message': message}

        elif key == 'inventor' or key == 'applicant':
            if not has_booleans(value) and should_be_quoted(value):
                value = u'"{0}"'.format(value)

        elif key == 'class':

            # v1: Naive implementation can only handle single values
            #value = lucene_convert_class(value)

            # v2: Advanced implementation can handle expressions on field "class"
            # Translate class expression from "H04L12/433 or H04L12/24"
            # to "(ic:H04L0012433 OR cpc:H04L0012433) OR (ic:H04L001224 OR cpc:H04L001224)"
            try:

                # Put value into parenthesis, to properly capture expressions
                if value:
                    value = u'({value})'.format(value=value)

                # Parse value as simple query expression
                query_object = CQL(cql=value)

                # Rewrite all patent classifications in query expression ast from OPS format to Lucene format
                rewrite_classes_lucene(query_object, format, fieldname)

                # Serialize into appropriate upstream datasource query expression syntax
                expression = query_object.dumps()

            except pyparsing.ParseException as ex:
                return {
                    'error': True,
                    'message': '<pre>' + str(ex.explanation) + '</pre>'
                }

        elif key == 'country':
            value = value.upper()

        # ------------------------------------------
        #   surround with parentheses
        # ------------------------------------------
        if key in ['fulltext', 'inventor', 'applicant', 'country', 'citation']:
            if has_booleans(value) and not should_be_quoted(value):
                value = u'({0})'.format(value)

        # ------------------------------------------
        #   expression formatter
        # ------------------------------------------
        # Serialize into appropriate upstream datasource query expression syntax
        if not expression:
            expression = format_expression(format, fieldname, value)
            #print 'expression:', expression

        # ------------------------------------------
        #   final polishing
        # ------------------------------------------
        # Solr(?) syntax: boolean operators must be uppercase
        if has_booleans(expression):
            boolis = [' or ', ' and ', ' not ']
            for booli in boolis:
                expression = expression.replace(booli, booli.upper())

        return {'query': expression}