示例#1
0
def test_getterTypes():
    """Check the Python type of the HTML returned by each getter backend.

    The 'urlopen' and 'requests' getters are expected to return ``bytes``,
    while the 'chromedriver' getter is expected to return ``str``.
    """
    page = 'https://news.ycombinator.com/news'
    # (getter type, expected type of the fetched HTML), in fetch order.
    expectations = (
        ('urlopen', bytes),
        ('chromedriver', str),
        ('requests', bytes),
    )
    for backend, payload_type in expectations:
        getter = Getter(backend)
        fetched = getter.get_html(page)
        assert isinstance(fetched, payload_type)
示例#2
0
class WebStash:
    """Fetch the HTML and a screenshot for a URL, caching the result.

    Lookups go through ``self.cacher`` (indexed by URL); on a miss the page
    is fetched with ``self.getter`` and the resulting ``WebData`` is stored
    in the cache before being returned.
    """

    def __init__(self, getterType='urlopen', waitTimeBeforeScraping=0):
        """Create the cache, the config and the page getter.

        Args:
            getterType: Name of the getter backend, forwarded to ``Getter``.
            waitTimeBeforeScraping: Forwarded unchanged to ``Getter``
                (presumably a delay applied before each fetch -- confirm
                against ``Getter``).
        """
        self.cacher = Cacher()
        self.config = Config()
        self.getter = Getter(getterType,
                             waitTimeBeforeScraping=waitTimeBeforeScraping)

    def get_web_data(self, url):
        """Return the cached entry for ``url``, fetching it on a cache miss.

        Args:
            url: Address of the page to look up or fetch.

        Returns:
            Whatever ``self.cacher[url]`` yields: the existing entry on a
            hit, or the entry read back after storing a fresh ``WebData``.
        """
        try:
            return self.cacher[url]
        except KeyError:
            self.config.debugPrint('Getting webData...')
            filename = self.cacher.getFilename(url)
            html = self.getter.get_html(url)
            # The screenshot is saved next to the cache file, with '.png'
            # appended to the cache filename.
            screenshotLocation = self.getter.get_screenshot(
                url, filename + '.png')
            webData = WebData(filename,
                              url,
                              html,
                              screenshotLocation=screenshotLocation)
            self.cacher[url] = webData
            # Read back through the cacher so hits and misses return the
            # same kind of object.
            return self.cacher[url]

    def delete(self, url):
        """Remove the cache entry for ``url``.

        Propagates whatever ``del self.cacher[url]`` raises when the URL is
        not cached (``KeyError`` for a dict-like cacher).
        """
        del self.cacher[url]

    def clean(self):
        """Delegate to ``self.cacher.clean()``."""
        self.cacher.clean()
示例#3
0
def test_getter_wait_before_scraping():
    """Check the getter's wait setting and its unsupported-type error.

    First, three fetches with ``waitTimeBeforeScraping=1`` must take more
    than three seconds in total. Second, constructing a ``Getter`` with an
    unknown getter type must raise ``GetterImplementationError`` with the
    expected message.
    """
    import datetime
    waitTimeBeforeScraping = 1
    testSleep = Getter('urlopen',
                       waitTimeBeforeScraping=waitTimeBeforeScraping)
    startTime = datetime.datetime.now()
    for _ in range(3):
        testSleep.get_html('https://news.ycombinator.com/news')
    endTime = datetime.datetime.now()

    # total_seconds() gives the full elapsed time as a float; `.seconds` is
    # only the truncated seconds component, so 3.x s would read as 3 and
    # fail the strict comparison.
    assert (endTime - startTime).total_seconds() > 3 * waitTimeBeforeScraping

    try:
        Getter('this is not a getter type')
    except GetterImplementationError as e:
        assert str(
            e) == 'this is not a getter type is not a supported getter type'
    else:
        # Without this branch the test would pass silently when no error
        # is raised at all.
        raise AssertionError(
            'Getter accepted an unsupported getter type without raising')