def test_fetch_bad_status_code(self):
    ''' Fetch a URL that returns a bad status code '''
    responses.add(responses.GET, 'http://example.com',
                  status=404, content_type='text/html')
    with self.assertRaises(HTTPError):
        fetch('http://example.com')
def save(self, *args, **kwargs):
    ''' Check whether the title or description is empty, and scrape it if necessary '''
    # TODO: make this async
    try:
        f = fetch(URL=self.url)
    except HTTPError:
        return super(Link, self).save(*args, **kwargs)
    if self.title == '':
        self.title = f.title
    if self.description == '':
        self.description = f.description
    return super(Link, self).save(*args, **kwargs)
def test_fetch_without_data(self):
    ''' Fetch a page that has neither a title nor a description '''
    responses.add(responses.GET, 'http://example.com', body='',
                  status=200, content_type='text/html')
    f = fetch('http://example.com')
    self.assertEqual('', f.title)
    self.assertEqual('', f.description)
def test_fetch_recieves_data(self):
    ''' Fetch a page that has both a title and a description '''
    body = '''
        <title>Example</title>
        <meta content="An example description" name="description">
    '''
    responses.add(responses.GET, 'http://example.com', body=body,
                  status=200, content_type='text/html')
    f = fetch('http://example.com')
    self.assertEqual('Example', f.title)
    self.assertEqual('An example description', f.description)
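# Not part of the original code: a minimal sketch of the fetch() helper these
# tests and Link.save() appear to rely on, assuming it is built on requests and
# BeautifulSoup. The FetchResult container and the parsing details are
# assumptions, not the project's actual implementation.
import requests
from bs4 import BeautifulSoup
from requests.exceptions import HTTPError  # imported so callers can catch the same exception


class FetchResult(object):
    def __init__(self, title='', description=''):
        self.title = title
        self.description = description


def fetch(URL):
    response = requests.get(URL)
    response.raise_for_status()  # raises HTTPError on 4xx/5xx status codes
    soup = BeautifulSoup(response.text, 'html.parser')
    title = soup.title.string.strip() if soup.title and soup.title.string else ''
    meta = soup.find('meta', attrs={'name': 'description'})
    description = meta.get('content', '') if meta else ''
    return FetchResult(title, description)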
def stew(self, _url):
    if _url is None:
        return None
    response = fetcher.fetch(self.addSchema(_url), self.proxy)
    self.htmltext = response.text
    self.content = response.content
    self.status = response.status_code
    self.code_reason = response.reason
    self.time_cost = response.elapsed
    self.url = response.url
    self.ok = response.ok
    self.encoding = response.encoding
    self.cookie = response.cookies
    self.header = response.headers
    self.is_redirect = response.is_redirect
    self.history = response.history
    self.soup = BeautifulSoup(self.content, 'html.parser')
    return self.cleanMe(self.soup)
def __init__(self, _url=None, proxy=None):
    super(webparser, self).__init__()
    self.url = _url
    self.proxy = proxy
    if _url is None:
        return
    response = fetcher.fetch(self.addSchema(_url), self.proxy)
    self.htmltext = response.text
    self.content = response.content
    self.status = response.status_code
    self.code_reason = response.reason
    self.time_cost = response.elapsed
    self.url = response.url
    # print self.url
    self.ok = response.ok
    self.encoding = response.encoding
    self.cookie = response.cookies
    self.header = response.headers
    self.is_redirect = response.is_redirect
    self.history = response.history
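# Hypothetical usage of webparser (the URL is illustrative only): constructing
# it with a URL fetches the page immediately, while stew() lets an existing
# instance fetch and clean a different page later.
parser = webparser('example.com', proxy=None)
if parser.ok:
    print(parser.status, parser.encoding, parser.time_cost)
    cleaned_text = parser.stew('example.com/about')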
def test_fetcher(self, session):
    for i in self.input_data():
        url, content, status = i.split("\t")
        status = int(status)
        dm = DocumentMetadata(url)
        mr = MockResponse(content, status)
        session.return_value = mr
        new_dm = fetcher.fetch({}, dm, 1)
        self.assertEqual(dm.url, url)
        if status == 200:
            self.assertEqual(dm.status, 0)
            self.assertEqual(new_dm.status, 0)
            self.assertEqual(new_dm.response.content, content)
        elif status == 301:
            self.assertEqual(dm.status, fetcher.Status.SkipUrl)
        elif status == 0:
            self.assertEqual(dm.status, fetcher.Status.ConnectionError)
        elif 500 <= status < 510:
            self.assertEqual(dm.status, fetcher.Status.SkipUrl)
        else:
            self.assertEqual(dm.status, fetcher.Status.GenericError)
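# One possible shape for the MockResponse test double used above; the real
# class may differ. It only needs the handful of attributes fetcher.fetch()
# reads from a requests response, here assumed to be content, status_code and
# url.
class MockResponse(object):
    def __init__(self, content, status_code, url=''):
        self.content = content
        self.status_code = status_code
        self.url = url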
def main():
    query, db = get_arguments()
    if db == "psql":
        conn = psycopg2.connect(
            database=env("DATABASE_NAME"),
            user=env("DATABASE_USER"),
            host=env("DATABASE_URL"),
            password=env("DATABASE_PASSWORD"))
    elif db == "sqlite":
        sqlite_path = base_path + "/" + env("SQLITE_NAME")
        conn = sqlite3.connect(sqlite_path)
    else:
        return

    total = 0
    count_fetched = 0
    total_count_book_inserted = 0
    total_count_book_updated = 0
    total_count_category_inserted = 0

    while count_fetched <= total:
        data = fetcher.fetch(query, count_fetched, env('GOOGLE_BOOKS_API_KEY'))
        books = data.get('items', None)
        if books is None:
            break
        count_book_inserted, count_book_updated, count_category_inserted = save(db, conn, books)
        total_count_book_inserted += count_book_inserted
        total_count_book_updated += count_book_updated
        total_count_category_inserted += count_category_inserted
        total = data.get('totalItems', 0)
        count_fetched += len(books)
        print(str(count_fetched) + ' / ' + str(total))

    conn.close()
    print(str(total_count_book_inserted) + " books inserted!")
    print(str(total_count_book_updated) + " books updated!")
    print(str(total_count_category_inserted) + " categories inserted!")
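# A minimal sketch of what fetcher.fetch() might do here, assuming the standard
# Google Books volumes endpoint: one paginated query whose JSON response
# carries the 'items' and 'totalItems' fields read by main(). The real fetcher
# may add maxResults, retries or error handling.
import requests

GOOGLE_BOOKS_URL = 'https://www.googleapis.com/books/v1/volumes'


def fetch(query, start_index, api_key):
    params = {'q': query, 'startIndex': start_index, 'key': api_key}
    response = requests.get(GOOGLE_BOOKS_URL, params=params)
    response.raise_for_status()
    return response.json()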
def start(self):
    """
    Start the crawling phase.
    The job continues until a SIGTERM is caught.
    """
    # Starting urls end up in the priority list.
    self.queue.init_priority_list(self.spider.start_urls)
    self.queue.add_bootstrap_urls(self.spider.urllist)
    while not GracefulKiller.kill_now:
        dmeta = self.queue.pop()
        dmeta.spider = self.spider.name
        if dmeta.url:
            # In case the spider filters have changed, it is better to recheck.
            nurl, toremove = self.spider.check_and_normalize(dmeta.url)
            if toremove:
                self.queue.remove_seen(dmeta.url)
                # INFO: if the url was only normalized we still want to fetch it,
                # but we want to discard the other cases.
                if nurl == dmeta.url:
                    continue
            dmeta.url = nurl
            dmeta.alternatives = [nurl]
            dmeta = fetcher.fetch(self.spider.headers, dmeta)
            if dmeta.status == fetcher.Status.ConnectionError:
                # CHECK: check if this is still correct
                self.queue.add_seen_and_reschedule(dmeta)
            elif dmeta.response:
                r_url = self.spider.normalize_url(dmeta.response.url)
                dmeta.alternatives.append(r_url)
                document, dmeta = self.spider.parse(dmeta)
                self.queue.add_normal_urls(dmeta)
                # INFO: if status != 200, previous data will not be overwritten.
                self.documentStore.store(document)
                self.queue.add_seen_and_reschedule(dmeta)
        self.sleep.wait(self.spider.delay)
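# A common shape for the GracefulKiller flag checked by the crawl loop; the
# project's own implementation and the install() helper name are assumptions.
# The idea is to flip kill_now on SIGTERM/SIGINT so the while loop finishes the
# current iteration and exits cleanly.
import signal


class GracefulKiller(object):
    kill_now = False

    @classmethod
    def _handler(cls, signum, frame):
        cls.kill_now = True

    @classmethod
    def install(cls):
        signal.signal(signal.SIGTERM, cls._handler)
        signal.signal(signal.SIGINT, cls._handler)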