Example #1
    def test_train_store_load_scrape(self):
        url1 = "http://www.icone.co.uk/lighting-suspension/copper-shade-by-tom-dixon/tom-dixon/tom-dixon/MSS45UKC/"
        data = {"name": "Copper Shade by Tom Dixon", "designer": "Tom Dixon", "price": "320"}
        s = Scraper()
        s.train(url1, data, encoding="latin1")

        f = StringIO()
        s.tofile(f)

        f.seek(0)
        s = Scraper.fromfile(f)

        url2 = "http://www.icone.co.uk/lighting-wall-and-ceiling/mesmeri-halo-chrome/artemide/eric-sole/0916024A/"
        data = s.scrape(url2, encoding="latin1")
        self.assertEqual(sorted(data[0].keys()), ["designer", "name", "price"])
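The snippet above appears to exercise scrapely's Scraper round trip: train on one page, serialize with tofile, restore with fromfile, then scrape a similar page. Run standalone it would need imports along these lines (a sketch, assuming the scrapely library, whose API these names match; StringIO is the Python 2 module, io.StringIO on Python 3):

from StringIO import StringIO  # io.StringIO on Python 3; holds the serialized scraper in memory
from scrapely import Scraper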
Example #2
    def test_extraction(self):

        samples_encoding = 'latin1'
        [(html1, data1), (html2, data2)] = list(iter_samples(
            'scraper_loadstore', html_encoding=samples_encoding))
        sc = Scraper()
        page1 = HtmlPage(body=html1, encoding=samples_encoding)
        sc.train_from_htmlpage(page1, data1)

        page2 = HtmlPage(body=html2, encoding=samples_encoding)
        extracted_data = sc.scrape_page(page2)
        self._assert_extracted(extracted_data, data2)

        # check it still works after serialize/deserialize
        f = StringIO()
        sc.tofile(f)
        f.seek(0)
        sc = Scraper.fromfile(f)
        extracted_data = sc.scrape_page(page2)
        self._assert_extracted(extracted_data, data2)
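Example #2 trains and scrapes from pre-fetched HTML rather than live URLs, wrapping each body in an HtmlPage. The imports below are what the snippet would need (a sketch, assuming scrapely); iter_samples and _assert_extracted are helpers from the surrounding test module, not library API:

from StringIO import StringIO  # io.StringIO on Python 3
from scrapely import Scraper
from scrapely.htmlpage import HtmlPage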
Example #3
def update_scrapers_file(url):
    # Pull the domain out of the URL, e.g. "www.icone.co.uk"; the lookbehind/
    # lookahead assume the URL has a scheme ("//") and a path ("/") around it.
    domain = re.search(r'(?<=//)[\w.-]+(?=/)', url).group()

    with open('scrapers.json', 'r') as scrapers_file:
        scrapers_json = json.load(scrapers_file)

    # Register the per-domain scraper file in the domain -> filename index.
    scraper_file_name = domain + ".json"
    scrapers_json[domain] = scraper_file_name
    with open('scrapers.json', 'w') as scrapers_file:
        json.dump(scrapers_json, scrapers_file)

    return scraper_file_name
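Example #3 also calls an open_training_file helper that is not shown. Below is a minimal sketch under assumptions (the training spec is a JSON document with "url" and "params" keys, its path passed as sys.argv[1]); the helper body and these top-of-script imports are reconstructions, not the original code:

import json
import re
import sys

from scrapely import Scraper


def open_training_file():
    # Hypothetical helper: load the training spec ({"url": ..., "params": ...})
    # from the JSON file named on the command line.
    with open(sys.argv[1], 'r') as training_file:
        return json.load(training_file)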


# TODO add help and verbose modes
# TODO add arg validation and error feedback
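# Train a fresh scraper on the url/params pair read from the training file.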
scraper = Scraper()
training_params = open_training_file()
assert training_params, "no training parameters found in {}".format(
    sys.argv[1])
url = training_params['url']
params = training_params['params']
scraper.train(url, params)
# TODO replace this with a database action, and maybe compare checksums to avoid writing the same scraper more than once?
scraper_file_name = update_scrapers_file(url)

with open(scraper_file_name, 'w') as scraper_file:
    scraper.tofile(scraper_file)
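Given the sys.argv[1] reference, the script is presumably invoked with the training file as its only argument, e.g. python train_scraper.py params.json (both names hypothetical). scrapely's Scraper.tofile serializes to JSON text, which is why the output file is opened in text mode ('w').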