Exemplo n.º 1
0
def get_pdf_page(country, patnum, kindcode, page, folder, appendix=""):
    """Download one page of a patent document as PDF and store it in `folder`.

    The page is requested from the EPO OPS "published-data/images" service
    and written to ``<folder>/<country>.<patnum>-<appendix>.pdf``.

    country, patnum, kindcode -- path components of the request URL; each
        one is passed through str().
    page -- value sent as the ``Range`` query parameter.
    folder -- directory the file is written to (it is not created here).
    appendix -- text placed between the "-" and ".pdf" of the file name.

    Returns `kindcode` on success. Returns False when the request fails with
    urllib2.HTTPError; the error and the requested URL are printed in that
    case. Any other exception (e.g. a failing file write) propagates.
    """
    # Build the URL once so the request and the error report cannot diverge.
    # NOTE(review): the bearer token is sent over plain http here -- confirm
    # whether the https endpoint should be used instead.
    url = ("http://ops.epo.org/3.1/rest-services/"
           "published-data/images/" +
           str(country) + "/" +
           str(patnum) + "/" +
           str(kindcode) + "/" +
           "fullimage.pdf?Range=" + str(page))
    request = urllib2.Request(url)
    request.add_header('Authorization', 'Bearer ' + auth.get_token())
    name = folder + "/" + str(country) + "." + str(patnum) + \
        "-" + appendix + ".pdf"

    try:
        opener = urllib2.build_opener()
        response = opener.open(request)
        try:
            data = response.read()
        finally:
            # Release the connection even when reading the body fails.
            response.close()
            opener.close()
    except urllib2.HTTPError as e:
        print(e)
        print(url)
        print("An exception has occured.")
        return False

    # The context manager closes the file even if the write raises.
    with open(name, "wb") as out:
        out.write(data)
    return kindcode
Exemplo n.º 2
0
def download_from_epo(base_patent, cited_patent):
    """Download the XML document for `cited_patent` and store it on disk.

    The XML is fetched from `config.xml_url` (its ``{patnum}`` placeholder is
    replaced by the two dot-separated parts of `cited_patent` joined
    together), parsed, and written to
    ``<config.data_dir><base_patent>/<cited_patent>/<cited_patent>.xml``.

    base_patent -- name of the directory (below `config.data_dir`) that
        groups the downloads.
    cited_patent -- identifier of the form ``"<part1>.<part2>"``; a value
        without exactly one "." raises ValueError.

    Returns None in every case. If the target file already exists nothing is
    requested. If downloading, parsing or writing fails, a message is printed
    and the error is swallowed (best effort).
    """
    # NOTE(review): the parts are presumably country code and number (the
    # original locals were called kindcode/patnum) -- verify against callers.
    prefix, number = cited_patent.split('.')

    # Filename (including path) to store xml.
    filename = config.data_dir + base_patent + '/' + cited_patent + '/' + cited_patent + '.xml'

    # Check the cache first so no access token is requested for files that
    # are already present.
    if os.path.isfile(filename):
        print('XML for ' + cited_patent + ' already downloaded.')
        return

    request = urllib2.Request(config.xml_url.replace("{patnum}", prefix + number))
    request.add_header('Authorization', 'Bearer ' + auth.get_token())
    print("Downloading " + cited_patent + "...")

    try:
        opener = urllib2.build_opener()
        response = opener.open(request)
        try:
            xml = response.read()
        finally:
            # Close the connection even if reading the body fails.
            response.close()
            opener.close()
        # Parse before writing so that only well-formed XML reaches the disk.
        et.ElementTree(et.fromstring(xml)).write(filename)
    except Exception:
        # Deliberately best effort, but no longer hides KeyboardInterrupt or
        # SystemExit the way a bare ``except:`` did.
        print('Could not retrieve XML for ' + cited_patent)
Exemplo n.º 3
0
def get_searchreport_pdf(meta, output_filename):
    """Download every search-report page of a document as a separate PDF.

    meta -- dict with the keys ``"link"`` (path appended to
        `config.epo_rest_url`), ``"SEARCH_REPORT"`` (first page to fetch) and
        ``"total_pages"`` (last page to fetch, inclusive).
    output_filename -- file name prefix; page N is written to
        ``<output_filename>-Page-<N>.pdf``.

    Pages whose request does not succeed (HTTP status >= 400) are reported on
    stdout and skipped, so no error body is stored under a ``.pdf`` name.
    Returns None.
    """
    url = config.epo_rest_url + meta["link"]
    page = meta["SEARCH_REPORT"]
    while page <= meta["total_pages"]:
        download_url = url + "?Range=" + str(page)
        name = output_filename + "-Page-" + str(page) + ".pdf"
        # The token is requested per page, as before, in case it is renewed
        # between requests.
        r = requests.get(download_url, headers={
            'Authorization': 'Bearer ' + auth.get_token()
        })
        if r.ok:
            # The context manager closes the file even if the write raises.
            with open(name, "wb") as out:
                out.write(r.content)
        else:
            print("Could not download " + download_url +
                  " (HTTP " + str(r.status_code) + ")")
        page += 1
Exemplo n.º 4
0
def _as_list(value):
    """Return `value` itself if it is a list, otherwise wrap it in a list."""
    return value if isinstance(value, list) else [value]


def get_meta_data(country, patnum, section_name='SEARCH_REPORT', kindcode=False, skip=False):
    """Look up where a document section starts inside the full document.

    The inquiry result is requested from `config.meta_url` (placeholders
    ``{country}`` and ``{patnum}`` are substituted) and searched for the
    first ``FullDocument`` instance that contains a section named
    `section_name` (e.g. DESCRIPTION, SEARCH_REPORT). The result makes it
    possible to download only the pages of that section.

    country, patnum -- strings substituted into the meta URL.
    section_name -- value of the section's ``@name`` attribute to look for.
    kindcode, skip -- when both are truthy, entries whose kind code differs
        from `kindcode` are ignored; otherwise every entry is considered.

    Returns a dict ``{section_name: <start page>, "link": <instance link>,
    "total_pages": <number of pages>}`` for the first match, or False when
    the request/JSON decoding fails or no matching section exists.
    Raises KeyError if the decoded response lacks the expected structure.
    """
    url = config.meta_url.replace("{country}", country)
    url = url.replace("{patnum}", patnum)
    r = urllib2.Request(url)
    r.add_header('Authorization', 'Bearer ' + auth.get_token())

    try:
        opener = urllib2.build_opener()
        response = opener.open(r)
        try:
            data = json.loads(response.read())
        finally:
            # Release the connection even if reading or decoding fails.
            response.close()
    except Exception:
        # Any network or decoding problem means "no meta data available".
        return False

    inquiry_result = data["ops:world-patent-data"]["ops:document-inquiry"]["ops:inquiry-result"]
    # Fields that may hold one or several items are normalised to lists.
    result = _as_list(inquiry_result)
    # Diagnostic dump of the inquiry result, kept from the original code.
    print(result)
    for entry in result:
        instances = _as_list(entry['ops:document-instance'])
        kind = entry['publication-reference']['document-id']['kind']['$']
        if skip and kindcode and kind != kindcode:
            continue
        for instance in instances:
            if instance['@desc'] != 'FullDocument':
                continue
            # A missing section list is treated as "no sections"; a single
            # section object is wrapped so it is not iterated key by key.
            sections = _as_list(instance.get('ops:document-section', []))
            for section in sections:
                if section['@name'] == section_name:
                    # First match wins; stop searching immediately.
                    return {
                        section_name: int(section["@start-page"]),
                        "link": instance["@link"],
                        'total_pages': int(instance["@number-of-pages"]),
                    }

    return False