import json
import os
import urllib2
# "et" is assumed to be the standard-library ElementTree; lxml.etree would
# also work with the calls used below.
import xml.etree.ElementTree as et

import requests

# Project-local helpers: auth provides the OAuth token, config the EPO OPS
# URLs and the data directory.
import auth
import config


def get_pdf_page(country, patnum, kindcode, page, folder, appendix=""):
    """
    Retrieve the given page from a patent document and store it in the given
    folder. The appendix is added to the stored filename.
    """
    r = urllib2.Request("http://ops.epo.org/3.1/rest-services/"
                        "published-data/images/" + str(country) + "/" +
                        str(patnum) + "/" + str(kindcode) + "/" +
                        "fullimage.pdf?Range=" + str(page))
    r.add_header('Authorization', 'Bearer ' + auth.get_token())
    name = folder + "/" + str(country) + "." + str(patnum) + \
        "-" + appendix + ".pdf"
    try:
        opener = urllib2.build_opener()
        f = opener.open(r)
        data = f.read()
        f.close()
        opener.close()
        f = open(name, "wb")
        f.write(data)
        f.close()
        return kindcode
    except urllib2.HTTPError as e:
        print e
        print "http://ops.epo.org/3.1/rest-services/published-data/images/" + \
            str(country) + "/" + \
            str(patnum) + "/" + \
            str(kindcode) + "/" + \
            "fullimage.pdf?Range=" + str(page)
        print "An exception has occurred."
        return False
def download_from_epo(base_patent, cited_patent):
    """
    Download the XML document of a cited patent from the EPO OPS service and
    store it under the data directory of the given base patent.
    """
    kindcode, patnum = cited_patent.split('.')
    # Filename (including path) to store the XML.
    filename = config.data_dir + base_patent + '/' + cited_patent + '/' + \
        cited_patent + '.xml'
    if os.path.isfile(filename):
        print 'XML for ' + cited_patent + ' already downloaded.'
        return
    r = urllib2.Request(config.xml_url.replace("{patnum}", kindcode + patnum))
    r.add_header('Authorization', 'Bearer ' + auth.get_token())
    print "Downloading " + cited_patent + "..."
    try:
        opener = urllib2.build_opener()
        f = opener.open(r)
        xml = f.read()
        # Parse the response and re-serialize it to the target file.
        root = et.fromstring(xml)
        tree = et.ElementTree(root)
        tree.write(filename)
        f.close()
        opener.close()
    except:
        print 'Could not retrieve XML for ' + cited_patent
def get_searchreport_pdf(meta, output_filename):
    """
    Try to retrieve all search report pages as PDF files. The meta object
    stores the link as well as the start page of the search report section
    of the given patent document.
    """
    url = config.epo_rest_url + meta["link"]
    page = meta["SEARCH_REPORT"]
    while page <= meta["total_pages"]:
        download_url = url + "?Range=" + str(page)
        name = output_filename + "-Page-" + str(page) + ".pdf"
        r = requests.get(download_url, headers={
            'Authorization': 'Bearer ' + auth.get_token()
        })
        f = open(name, "wb")
        f.write(r.content)
        f.close()
        page += 1
def get_meta_data(country, patnum, section_name='SEARCH_REPORT',
                  kindcode=False, skip=False):
    """
    Get the meta data for a certain document section (e.g. DESCRIPTION,
    SEARCH_REPORT). The meta data includes information on the sections of
    the document and enables us to download only the given sections/pages.

    kindcode - together with skip, only consider results with this kind code.
    """
    url = config.meta_url.replace("{country}", country)
    url = url.replace("{patnum}", patnum)
    r = urllib2.Request(url)
    r.add_header('Authorization', 'Bearer ' + auth.get_token())
    try:
        opener = urllib2.build_opener()
        data = json.loads(opener.open(r).read())
    except:
        return False
    meta = False
    inquiry_result = data["ops:world-patent-data"]["ops:document-inquiry"]["ops:inquiry-result"]
    # OPS returns a single object or a list depending on the number of
    # results; normalize both cases to a list.
    result = inquiry_result if type(inquiry_result) == list else [inquiry_result]
    print result
    for entry in result:
        instances = entry['ops:document-instance']
        instances = instances if type(instances) == list else [instances]
        kind = entry['publication-reference']['document-id']['kind']['$']
        if skip and kindcode and kind != kindcode:
            continue
        for instance in instances:
            if meta:
                break
            if instance['@desc'] == 'FullDocument':
                try:
                    sections = instance['ops:document-section']
                except KeyError:
                    sections = []
                for section in sections:
                    if section['@name'] == section_name:
                        meta = {}
                        meta[section_name] = int(section["@start-page"])
                        meta["link"] = instance["@link"]
                        meta['total_pages'] = int(instance["@number-of-pages"])
                        break
    return meta
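
# A minimal usage sketch (the publication "EP"/"1000000" is a hypothetical
# placeholder, not taken from the project): fetch the search report meta data
# for a publication and, if found, download its pages as PDFs via the
# functions above. config.* and auth.get_token() are the project's own
# helpers and must be set up beforehand.
if __name__ == "__main__":
    meta = get_meta_data("EP", "1000000", section_name="SEARCH_REPORT")
    if meta:
        # Writes e.g. "EP.1000000-Page-3.pdf", "EP.1000000-Page-4.pdf", ...
        get_searchreport_pdf(meta, "EP.1000000")
    else:
        print "No search report meta data found for EP.1000000."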