def find_patent_by_id(patent_id):
    """Fetch a single Google patent page and return its parsed contents.

    @param patent_id: String with patent id
    @return patent dictionary
    """
    scraper = scraper_class()
    # Status code and resolved URL are not needed here; only the soup is parsed.
    _, soup, _ = scraper.request_single_patent(patent_id)
    return scraper.process_patent_html(soup)
def single_process_scraper(patent, path_to_data_file, data_column_order):
    """Scrape a single Google patent and append it as one row to a csv file.

    Function does not return any values; instead it writes the scraped data
    into the csv file specified by the path_to_data_file parameter.

    Inputs:
        patent (str)          : patent number including country prefix
        path_to_data_file     : absolute path to csv file to write data to
        data_column_order     : name of columns in order they will be saved
                                in the csv file
    """
    # Local import: this snippet's file header is not visible, so the csv
    # dependency is brought into scope here.
    import csv

    # ~ Initialize scraper class ~ #
    scraper = scraper_class()

    # ~ Scrape single patent ~ #
    err, soup, url = scraper.request_single_patent(patent)

    # If the scrape succeeded, parse the page and append one csv row;
    # otherwise report the error code.
    if err == 'Success':
        patent_parsed = scraper.get_scraped_data(soup, url, patent)
        # Fix: the original concatenated field values with "," by hand, which
        # (a) raised TypeError when a field was missing (.get() -> None) and
        # (b) produced broken csv whenever a value contained a comma or quote.
        # csv.writer handles quoting/escaping; missing keys become ''.
        row = [patent_parsed.get(key, '') for key in data_column_order]
        # newline='' is required by the csv module to avoid blank lines on
        # Windows.
        with open(path_to_data_file, 'a', newline='') as ofile:
            csv.writer(ofile).writerow(row)
    else:
        print('Patent {0} has error code {1}'.format(patent, err))
def single_process_scraper(patent, path_to_data_file, data_column_order, lock=None):
    """Scrape a single Google patent and append it as one row to a csv file.

    Function does not return any values; instead it writes the scraped data
    into the csv file specified by the path_to_data_file parameter.

    Inputs:
        patent (str)          : patent number including country prefix
        lock (obj, optional)  : lock used to prevent collisions when several
                                processes write to the same file. You can pass
                                whichever lock you want to this parameter; if
                                omitted, no locking is performed.
        path_to_data_file     : absolute path to csv file to write data to
        data_column_order     : name of columns in order they will be saved
                                in the csv file
    """
    # Local import: needed for the no-lock fallback below.
    from contextlib import nullcontext

    # ~ Initialize scraper class ~ #
    scraper = scraper_class()

    # ~ Scrape single patent ~ #
    err, soup, url = scraper.request_single_patent(patent)

    # If the scrape succeeded, parse the page and append one csv row;
    # otherwise report the error code.
    if err == 'Success':
        patent_parsed = scraper.get_scraped_data(soup, url, patent)
        # Fix: the original used `with lock:` but `lock` was neither a
        # parameter nor defined in the function, so it raised NameError
        # unless a module-level global happened to exist (even though the
        # docstring documented it as an input). It is now an explicit,
        # optional parameter; nullcontext() makes the unlocked path a no-op.
        guard = lock if lock is not None else nullcontext()
        with guard:
            with open(path_to_data_file, 'a', newline='') as ofile:
                writer = csv.DictWriter(ofile, fieldnames=data_column_order)
                writer.writerow(patent_parsed)
    else:
        print('Patent {0} has error code {1}'.format(patent, err))
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
#   Example 1: Scrape a single patent
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #

# ~ Import packages ~ #
from google_patent_scraper import scraper_class

# ~ Initialize scraper class ~ #
scraper = scraper_class()

# ~~ Scrape each patent individually ~~ #
patent_1 = 'US2668287A'
patent_2 = 'US266827A'

# Request both pages, unpacking (error, soup, url) per patent.
(err_1, soup_1, url_1), (err_2, soup_2, url_2) = (
    scraper.request_single_patent(number) for number in (patent_1, patent_2)
)

# ~ Parse results of scrape ~ #
patent_1_parsed, patent_2_parsed = (
    scraper.get_scraped_data(soup, number, url)
    for soup, number, url in (
        (soup_1, patent_1, url_1),
        (soup_2, patent_2, url_2),
    )
)

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
#   Example 2: Scrape a list of patents
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #

# ~ Import packages ~ #
from google_patent_scraper import scraper_class
import json

# ~ Initialize scraper class ~ #
scraper = scraper_class()
import csv
import sys
import re
import datetime
import google_patent_scraper
import json

# Fields in the source file can exceed csv's default field-size limit.
csv.field_size_limit(sys.maxsize)

# Compiled once; matches US patent numbers such as "US2668287".
PATENT_RE = re.compile(r"(US[0-9]+)")

# The export is UTF-16-LE tab-separated; the first column header carries the
# BOM, hence the '\ufeffPN ' key.
with open('combined.txt', encoding='utf-16-le') as tsvfile:
    reader = csv.DictReader(tsvfile, dialect='excel-tab')
    scraper = google_patent_scraper.scraper_class()
    for row in reader:
        patent_numbers = PATENT_RE.findall(row['\ufeffPN '])
        # Fix: the original indexed patent_numbers[0] unconditionally and
        # raised IndexError on any row with no matching patent number.
        if not patent_numbers:
            continue
        # Pick the shortest candidate (the original's manual loop did the
        # same selection).
        patent_number = min(patent_numbers, key=len)
        # Skip implausibly long numbers.
        if len(patent_number) > 10:
            continue
        scraper.add_patents(patent_number)

scraper.scrape_all_patents()
# ~ Import packages ~ #
from google_patent_scraper import scraper_class
import json

# ~ Initialize scraper class ~ #
# return_abstract=True turns on abstract text in the parsed output.
scraper = scraper_class(return_abstract=True)

# ~~ Scrape one patent and parse the result ~~ #
patent_1 = 'US20160180719A1'
err_1, soup_1, url_1 = scraper.request_single_patent(patent_1)
patent_1_parsed = scraper.get_scraped_data(soup_1, patent_1, url_1)

# Inventor names are stored as a JSON-encoded list of dicts.
inventors = json.loads(patent_1_parsed['inventor_name'])
for inventor in inventors:
    print(inventor['inventor_name'])

# Print the scalar fields of interest.
for field in ("grant_date", "patent", "abstract_text"):
    print(patent_1_parsed[field])

# Full-size image links, e.g.:
# https://patentimages.storage.googleapis.com/ec/41/7b/76047542d0f945/US20160180719A1-20160623-D00003.png
for link in patent_1_parsed['image_urls']:
    print(link)