Example #1
from google_patent_scraper import scraper_class


def find_patent_by_id(patent_id):
    """
    @param patent_id: String with patent id
    @return: patent dictionary
    """
    scraper = scraper_class()

    # Request the patent page; status is 'Success' when the fetch worked
    status, soup, url = scraper.request_single_patent(patent_id)

    # Parse the returned HTML into a dictionary of patent fields
    patent_parsed = scraper.process_patent_html(soup)
    return patent_parsed
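
A minimal usage sketch for this helper, assuming the google_patent_scraper package is installed; the patent number is the sample from Example #4, and exactly which keys appear in the result depends on what process_patent_html extracts:

# Hypothetical usage: fetch one patent and inspect the parsed fields.
parsed = find_patent_by_id('US2668287A')
print(parsed)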
Example #2
from google_patent_scraper import scraper_class


def single_process_scraper(patent, path_to_data_file, data_column_order):
    """Scrapes a single Google patent using the google scraper class

       The function does not return any values; instead it appends the
         scraped data to the csv file specified in the path_to_data_file
         parameter

       Inputs:
         patent (str) : patent number including country prefix
         path_to_data_file (str) : absolute path to csv file to write data to
         data_column_order (list) : column names in the order they will be saved in the csv file

    """
    # ~ Initialize scraper class ~ #
    scraper = scraper_class()

    # ~ Scrape single patent ~ #
    err, soup, url = scraper.request_single_patent(patent)

    # Check whether the scrape succeeded:
    # If successful -> parse text and append to the csv file
    # Else          -> print an error statement

    if err == 'Success':
        patent_parsed = scraper.get_scraped_data(soup, patent, url)

        # Build the csv row by hand; default missing keys to an empty
        # string so the concatenation cannot fail on None
        out_row = ""
        for key in data_column_order:
            out_row = out_row + str(patent_parsed.get(key, '')) + ","

        out_row = out_row[:-1] + "\n"

        with open(path_to_data_file, 'a') as file:
            file.write(out_row)

    else:
        print('Patent {0} has error code {1}'.format(patent, err))
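
Hand-built rows like the above break as soon as a field contains a comma or a newline. A safer variant of the write step, sketched with the standard library's csv.writer (the same module Example #3 uses); the rest of the function would stay unchanged:

import csv

# Sketch of the write step only: csv.writer handles quoting and
# escaping, so fields containing commas or newlines stay intact.
with open(path_to_data_file, 'a', newline='') as ofile:
    writer = csv.writer(ofile)
    writer.writerow([patent_parsed.get(key, '') for key in data_column_order])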
Example #3
import csv

from google_patent_scraper import scraper_class


def single_process_scraper(patent, path_to_data_file, data_column_order):
    """Scrapes a single Google patent using the google scraper class

       The function does not return any values; instead it appends the
         scraped data to the csv file specified in the path_to_data_file
         parameter

       Inputs:
         patent (str) : patent number including country prefix
         path_to_data_file (str) : absolute path to csv file to write data to
         data_column_order (list) : column names in the order they will be saved in the csv file

       Note: the function relies on a module-level `lock` (shared with the
         worker processes, e.g. via a pool initializer) to prevent
         collisions when several processes append to the same file

    """
    # ~ Initialize scraper class ~ #
    scraper = scraper_class()

    # ~ Scrape single patent ~ #
    err, soup, url = scraper.request_single_patent(patent)

    # Check whether the scrape succeeded:
    # If successful -> parse text and append to the csv file
    # Else          -> print an error statement

    if err == 'Success':
        patent_parsed = scraper.get_scraped_data(soup, patent, url)

        # Save the parsed data to the csv file, holding the
        #  multiprocessing lock while writing to prevent collisions;
        #  extrasaction='ignore' drops any parsed keys that are not
        #  listed in data_column_order
        with lock:
            with open(path_to_data_file, 'a', newline='') as ofile:
                writer = csv.DictWriter(ofile, fieldnames=data_column_order,
                                        extrasaction='ignore')
                writer.writerow(patent_parsed)
    else:
        print('Patent {0} has error code {1}'.format(patent, err))
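
The `lock` used above is never defined inside the function, so it has to reach each worker some other way. A minimal sketch of one common pattern, sharing a multiprocessing.Lock through a pool initializer; the init_lock helper, the results.csv path, and the column list are illustrative assumptions, not part of the original:

import multiprocessing as mp
from functools import partial


def init_lock(shared_lock):
    # Runs once in each worker process; stores the lock as a
    # module-level global so single_process_scraper can use it.
    global lock
    lock = shared_lock


if __name__ == '__main__':
    patents = ['US2668287A', 'US266827A']      # sample numbers from Example #4
    columns = ['patent', 'url', 'abstract_text']  # illustrative column order
    shared_lock = mp.Lock()
    worker = partial(single_process_scraper,
                     path_to_data_file='results.csv',
                     data_column_order=columns)
    with mp.Pool(processes=2, initializer=init_lock,
                 initargs=(shared_lock,)) as pool:
        pool.map(worker, patents)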
Example #4
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
# Example 1: Scrape a single patent
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #

# ~ Import packages ~ #
from google_patent_scraper import scraper_class

# ~ Initialize scraper class ~ #
scraper = scraper_class()

# ~~ Scrape patents individually ~~ #
patent_1 = 'US2668287A'
patent_2 = 'US266827A'
err_1, soup_1, url_1 = scraper.request_single_patent(patent_1)
err_2, soup_2, url_2 = scraper.request_single_patent(patent_2)

# ~ Parse results of scrape ~ #
patent_1_parsed = scraper.get_scraped_data(soup_1, patent_1, url_1)
patent_2_parsed = scraper.get_scraped_data(soup_2, patent_2, url_2)

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
# Example 2: Scrape a list of patents
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #

# ~ Import packages ~ #
from google_patent_scraper import scraper_class
import json

# ~ Initialize scraper class ~ #
scraper = scraper_class()
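
The snippet stops right after initializing the scraper, so the list-scrape flow itself is missing. A sketch of how it presumably continues, using the same add_patents/scrape_all_patents calls that appear in Example #5; the parsed_patents attribute and the json.loads decoding step are assumptions suggested by the json import above:

# ~ Add patents to the scraper's internal list ~ #
scraper.add_patents(patent_1)
scraper.add_patents(patent_2)

# ~ Scrape all added patents in one pass ~ #
scraper.scrape_all_patents()

# ~ Retrieve the parsed results (assumed attribute name) ~ #
patent_1_parsed = json.loads(scraper.parsed_patents[patent_1])
patent_2_parsed = json.loads(scraper.parsed_patents[patent_2])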
Example #5
import csv
import sys
import re
import google_patent_scraper

csv.field_size_limit(sys.maxsize)

with open('combined.txt', encoding='utf-16-le') as tsvfile:
    reader = csv.DictReader(tsvfile, dialect='excel-tab')

    scraper = google_patent_scraper.scraper_class()
    i = 0
    rows = []
    for row in reader:
        rows.append(row)
        # The column name keeps the BOM ('\ufeff') because utf-16-le
        # does not strip it from the start of the file
        patent_numbers = re.findall("(US[0-9]+)", row['\ufeffPN '])
        if not patent_numbers:
            continue
        # Keep the shortest matching number in the cell
        patent_number = patent_numbers[0]
        for s in patent_numbers[1:]:
            if len(s) < len(patent_number):
                patent_number = s
        # Skip implausibly long numbers
        if len(patent_number) > 10:
            continue
        scraper.add_patents(patent_number)
        i += 1
        # Uncomment to limit the run to the first 5 patents while testing
        #if i == 5:
        #    break

    scraper.scrape_all_patents()
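
The '\ufeff' in the column name comes from the file's byte-order mark. A small sketch of an alternative that avoids it, assuming combined.txt really does start with a BOM:

# Opening with encoding='utf-16' consumes the BOM automatically,
# so the first header is plain 'PN ' instead of '\ufeffPN '.
with open('combined.txt', encoding='utf-16') as tsvfile:
    reader = csv.DictReader(tsvfile, dialect='excel-tab')
    for row in reader:
        print(row['PN '])
        break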
Example #6
# ~ Import packages ~ #
from google_patent_scraper import scraper_class
import json

# ~ Initialize scraper class ~ #
scraper = scraper_class(return_abstract=True)  # <- TURN ON ABSTRACT TEXT

# ~~ Scrape patents individually ~~ #
patent_1 = 'US20160180719A1'
err_1, soup_1, url_1 = scraper.request_single_patent(patent_1)

# ~ Parse results of scrape ~ #
patent_1_parsed = scraper.get_scraped_data(soup_1, patent_1, url_1)

# inventor_name is stored as a JSON-encoded string, so decode it first
for inventor in json.loads(patent_1_parsed['inventor_name']):
    print(inventor['inventor_name'])

print(patent_1_parsed["grant_date"])

print(patent_1_parsed["patent"])
print(patent_1_parsed["abstract_text"])

# print(patent_1_parsed["image_urls"])

for link in patent_1_parsed['image_urls']:
    print(link)

#print(patent_1_parsed)

# Example of a full-size image URL returned in image_urls:
#   https://patentimages.storage.googleapis.com/ec/41/7b/76047542d0f945/US20160180719A1-20160623-D00003.png
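
To actually save one of those full-size images, a short sketch using the third-party requests library (an assumption; any HTTP client works), iterating image_urls the same way the loop above does:

import requests

# Download the first image in the list; the local filename is
# taken from the tail of the URL.
urls = list(patent_1_parsed['image_urls'])
if urls:
    resp = requests.get(urls[0], timeout=30)
    resp.raise_for_status()
    with open(urls[0].rsplit('/', 1)[-1], 'wb') as img:
        img.write(resp.content)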