Exemplo n.º 1
0
def results_swap_family_members(response):
    """Swap search results for a preferred member of their patent family.

    Each entry found under ``exchange-documents`` of the search response is
    processed as follows:

    1. The first document cycle becomes the initial representation.
    2. The priority rules (DE except kind D1, EP..B, WO, EP..A, EP, US) are
       tried in order. If the current representation satisfies a rule, it is
       kept. Otherwise the family members are scanned for one satisfying
       that rule; the first one for which bibliographic data can be fetched
       replaces the representation.
    3. A swapped representation gets a ``__meta__['swapped']`` marker which
       carries the original document numbers.

    Afterwards, entries whose first cycle has the same country, doc-number,
    kind and family-id as an earlier entry are dropped and the reduced list
    is written back into ``response``.

    :param response: Search response document; entries are modified and the
        result list is overwritten via ``JsonPointer.set``.
    :return: A list which is currently always empty, as the code collecting
        publication numbers is disabled. Kept for interface compatibility.
    """

    # Never populated at the moment, see docstring.
    publication_numbers = []

    # Priority order: DE, EP..B, WO, EP..A, EP, US.
    # A rule is either a predicate on the split patent number
    # or a plain prefix of the document number string.
    priorities = [
        {'filter': lambda patent: patent.country.startswith('DE') and not patent.kind.startswith('D1')},
        {'filter': lambda patent: patent.country.startswith('EP') and patent.kind.startswith('B')},
        {'filter': 'WO'},
        {'filter': lambda patent: patent.country.startswith('EP') and patent.kind.startswith('A')},
        {'filter': 'EP'},
        {'filter': 'US'},
    ]

    def match_filter(item, criterion):
        """Check document number string ``item`` against a single priority rule."""
        if callable(criterion):
            return criterion(split_patent_number(item))
        return item.startswith(criterion)

    def discard_request_errors():
        """Empty the error list of the current request.

        NOTE(review): Presumably the failed upstream call recorded its error
        on the request and clearing it keeps the failure from surfacing in
        the response. Please confirm against the helper implementations.
        """
        request = get_current_request()
        del request.errors[:]

    def fetch_replacement(family_info, criterion):
        """Return bibliographic data of the first matching family member, or None."""
        for member in family_info.items:
            member_pubnum = member['publication']['number-docdb']
            if not match_filter(member_pubnum, criterion):
                continue
            try:
                bibdata = ops_biblio_documents(member_pubnum)
            except Exception:
                # Best effort: failures are skipped without logging
                # (the original warning was disabled); try the next member.
                discard_request_errors()
                continue
            if bibdata:
                return bibdata
        return None

    pointer_results = JsonPointer('/ops:world-patent-data/ops:biblio-search/ops:search-result/exchange-documents')
    pointer_publication_reference = JsonPointer('/bibliographic-data/publication-reference/document-id')

    # A. Choose a representation for each result entry.
    chunks = to_list(pointer_results.resolve(response))
    for chunk in chunks:

        # Prepare list of document cycles.
        cycles = to_list(chunk['exchange-document'])

        # Publication number of first cycle in EPODOC format.
        representation = cycles[0]
        pubref = pointer_publication_reference.resolve(representation)
        representation_pubref_epodoc, _ = _get_document_number_date(pubref, 'epodoc')

        # All publication numbers in DOCDB format.
        representation_pubrefs_docdb = []
        for cycle in cycles:
            pubref = pointer_publication_reference.resolve(cycle)
            representation_pubref_docdb, _ = _get_document_number_date(pubref, 'docdb')
            representation_pubrefs_docdb.append(representation_pubref_docdb)

        # Fetch family members. When failing, use first cycle as representation.
        # Catching ``Exception`` instead of everything lets KeyboardInterrupt
        # and SystemExit propagate.
        try:
            family_info = ops_family_members(representation_pubref_epodoc)
        except Exception:
            log.warning('Failed to fetch family information for %s', representation_pubref_epodoc)
            chunk['exchange-document'] = representation
            discard_request_errors()
            continue

        # Find replacement from list of family members controlled by priority list.
        for prio in priorities:

            criterion = prio['filter']

            # The current representation already satisfies this rule: keep it.
            if match_filter(representation_pubref_epodoc, criterion):
                break

            replacement = fetch_replacement(family_info, criterion)
            if replacement:

                # Swap representation of document by appropriate family member
                # and set a marker in the data structure containing the original
                # document number(s).
                # TODO: Display swapped documents appropriately.
                replacement[0].setdefault('__meta__', {})
                replacement[0]['__meta__']['swapped'] = {
                    'canonical': representation_pubrefs_docdb[0],
                    'list': [representation_pubref_epodoc] + representation_pubrefs_docdb,
                }
                representation = replacement
                break

        # TODO: When choosing german family members (e.g. for EP666666), abstract is often missing.
        # TODO: => Carry along from original representation.

        chunk['exchange-document'] = representation

    # B. Filter duplicates, which may have been introduced by swapping.
    seen = []
    results = []
    fields = ['@country', '@doc-number', '@kind', '@family-id']
    for chunk in chunks:

        # Only look at first cycle slot.
        doc = to_list(chunk['exchange-document'])[0]

        # Compute unique document identifier. Kept in a list rather than
        # a set in order not to require the values to be hashable.
        ident = tuple(doc[key] for key in fields)

        # Collect chunk if not seen yet.
        if ident in seen:
            continue
        seen.append(ident)
        results.append(chunk)

    # Overwrite reduced list of chunks in original DOM.
    pointer_results.set(response, results)

    return publication_numbers
Exemplo n.º 2
0
def createIndexData(offset, stepindex, valueindex):
    """Register ``offset`` in both index documents.

    The node which the JSON pointer ``offset`` refers to within ``stepindex``
    gets the pointer string stored under its ``_offset`` key. Within
    ``valueindex``, the same location is set to a fresh empty dictionary.

    :param offset: JSON pointer string addressing a location in both documents.
    :param stepindex: Document whose addressed node is tagged with ``_offset``.
    :param valueindex: Document receiving an empty dictionary at ``offset``.
    """
    offset_pointer = JsonPointer(offset)

    # Tag the addressed node with its own location.
    step_node = offset_pointer.resolve(stepindex)
    step_node['_offset'] = offset

    # Start with an empty entry at the same location of the value index.
    offset_pointer.set(valueindex, {})
Exemplo n.º 3
0
def ops_published_data_crawl(constituents, query, chunksize):
    """Crawl all accessible search results and merge them into one response.

    The result set for ``query`` is fetched from upstream in consecutive
    ranges of ``chunksize`` hits, capped at the first 2000 hits. The first
    response serves as the skeleton of the merged response: its list of
    ``ops:publication-reference`` entries is removed and replaced by a
    ``publication-numbers`` list containing the DOCDB numbers of all
    collected hits, while total count and range metadata are amended
    accordingly.

    :param constituents: Only ``'pub-number'`` is implemented.
    :param query: Search expression, passed on to ``ops_published_data_search``.
    :param chunksize: Number of hits to request per upstream call.
    :return: The first upstream response, modified as described above.
    :raises ValueError: If ``constituents`` is not implemented.
    """

    if constituents != 'pub-number':
        raise ValueError('constituents "{0}" invalid or not implemented yet'.format(constituents))

    # 'pub-number' is requested from upstream using empty constituents.
    real_constituents = constituents
    if constituents == 'pub-number':
        constituents = ''

    # Fetch first chunk (1-chunksize) from upstream.
    first_chunk = ops_published_data_search(constituents, query, '1-{0}'.format(chunksize))

    pointer_total_count = JsonPointer('/ops:world-patent-data/ops:biblio-search/@total-result-count')
    total_count = int(pointer_total_count.resolve(first_chunk))
    log.info('ops_published_data_crawl total_count: %s', total_count)

    # The first 2000 hits are accessible from OPS.
    total_count = min(total_count, 2000)

    # Collect remaining upstream results.
    chunks = [first_chunk]
    for range_begin in range(chunksize + 1, total_count + 1, chunksize):

        # Countermeasure to robot flagging
        # <code>CLIENT.RobotDetected</code>
        # <message>Recent behaviour implies you are a robot. The server is at the moment busy to serve robots. Please try again later</message>
        time.sleep(5)

        # Clamp the end of the last range in order not to request hits
        # beyond the accessible total when chunksize does not divide it evenly.
        range_end = min(range_begin + chunksize - 1, total_count)
        range_string = '{0}-{1}'.format(range_begin, range_end)
        log.info('ops_published_data_crawl range: %s', range_string)
        chunks.append(ops_published_data_search(constituents, query, range_string))

    # Merge chunks into single result.
    #
    # Location of results below "ops:search-result", by constituents:
    #
    #   <empty>:    "ops:publication-reference": [
    #   biblio:     "exchange-documents": [ > "exchange-document": {
    #   abstract:   "exchange-documents": [ > "exchange-document": {
    #   full-cycle: "exchange-documents": [ > "exchange-document": [
    #   pub-number: "ops:publication-reference": [
    #
    # Example "ops:publication-reference" entry:
    #
    #   {
    #       "@family-id": "6321653",
    #       "@system": "ops.epo.org",
    #       "document-id": {
    #           "@document-id-type": "docdb",
    #           "country": {"$": "DE"},
    #           "doc-number": {"$": "3705908"},
    #           "kind": {"$": "A1"}
    #       }
    #   }
    #
    # FIXME: This pointer is valid for "real_constituents == 'pub-number'" only.
    # FIXME: Implement other constituents.
    pointer_results = JsonPointer('/ops:world-patent-data/ops:biblio-search/ops:search-result/ops:publication-reference')
    all_results = []
    for chunk in chunks:
        all_results += to_list(pointer_results.resolve(chunk))

    response = None
    if real_constituents == 'pub-number':

        response = first_chunk

        # Delete upstream data.
        del resolve_pointer(response, '/ops:world-patent-data/ops:biblio-search/ops:search-result')['ops:publication-reference']

        # Compute own representation.
        publication_numbers = []
        pointer_document_id = JsonPointer('/document-id')
        for entry in all_results:
            pubref = pointer_document_id.resolve(entry)
            pubref_number, _ = _get_document_number_date(pubref, 'docdb')
            publication_numbers.append(pubref_number)

        # Add own representation.
        set_pointer(response, '/ops:world-patent-data/ops:biblio-search/ops:search-result/publication-numbers', publication_numbers, inplace=True)

        # Amend metadata.
        new_total_count = str(len(publication_numbers))
        pointer_total_count.set(response, new_total_count)
        set_pointer(response, '/ops:world-patent-data/ops:biblio-search/ops:range', {'@begin': '1', '@end': new_total_count})

    # Safeguard for constituents passing the check at the top without
    # being handled above. Report the value the caller passed in, as
    # "constituents" has been rewritten for the upstream request.
    if not response:
        raise ValueError('constituents "{0}" invalid or not implemented yet'.format(real_constituents))

    return response
Exemplo n.º 4
0
def createIndexData(offset, stepindex, valueindex):
    """Register ``offset`` in both index documents.

    The node which the JSON pointer ``offset`` refers to within ``stepindex``
    gets the pointer string stored under its ``_offset`` key. Within
    ``valueindex``, the same location is set to a fresh empty dictionary.

    :param offset: JSON pointer string addressing a location in both documents.
    :param stepindex: Document whose addressed node is tagged with ``_offset``.
    :param valueindex: Document receiving an empty dictionary at ``offset``.
    """
    location = JsonPointer(offset)

    # Tag the addressed node with its own location.
    tagged_node = location.resolve(stepindex)
    tagged_node['_offset'] = offset

    # Start with an empty entry at the same location of the value index.
    location.set(valueindex, {})