def get_scraper(
        url: str,
        scrapers_file_name: str = 'scrapers.json') -> Union[None, Scraper]:
    domain = get_domain(url)
    with open(get_file_path(scrapers_file_name), 'r') as scrapers_file:
        scrapers_json = json.load(scrapers_file)
        if domain in scrapers_json:
            scraper_file_name = scrapers_json[domain]
        else:
            return None

    with open(get_file_path(scraper_file_name), 'r') as scraper_file:
        return Scraper.fromfile(scraper_file)
Пример #2
0
    def test_train_store_load_scrape(self):
        url1 = "http://www.icone.co.uk/lighting-suspension/copper-shade-by-tom-dixon/tom-dixon/tom-dixon/MSS45UKC/"
        data = {"name": "Copper Shade by Tom Dixon", "designer": "Tom Dixon", "price": "320"}
        s = Scraper()
        s.train(url1, data, encoding="latin1")

        f = StringIO()
        s.tofile(f)

        f.seek(0)
        s = Scraper.fromfile(f)

        url2 = "http://www.icone.co.uk/lighting-wall-and-ceiling/mesmeri-halo-chrome/artemide/eric-sole/0916024A/"
        data = s.scrape(url2, encoding="latin1")
        self.assertEqual(sorted(data[0].keys()), ["designer", "name", "price"])
Пример #3
0
    def test_extraction(self):

        samples_encoding = 'latin1'
        [(html1, data1), (html2, data2)] = list(iter_samples(
            'scraper_loadstore', html_encoding=samples_encoding))
        sc = Scraper()
        page1 = HtmlPage(body=html1, encoding=samples_encoding)
        sc.train_from_htmlpage(page1, data1)

        page2 = HtmlPage(body=html2, encoding=samples_encoding)
        extracted_data = sc.scrape_page(page2)
        self._assert_extracted(extracted_data, data2)

        # check still works after serialize/deserialize 
        f = StringIO()
        sc.tofile(f)
        f.seek(0)
        sc = Scraper.fromfile(f)
        extracted_data = sc.scrape_page(page2)
        self._assert_extracted(extracted_data, data2)