def work(self):
    """ :inheritdoc: """
    job = self.jobs.reserve_job(self.queue)

    if job is False:
        return False

    try:
        url = Url.find(job.payload['url_id'])

        if not url or not can_crawl_url(url):
            self.jobs.clear_job(job)
            return False

        response = self.fetch(url)

        doc = Document.from_response(response, url)
        doc.purge_docs_for_url(url)
        doc.insert()

        if doc.can_index:
            doc.discover_urls()
            doc.discover_excerpts()
            doc.discover_images()

        # Schedule the job to be repeated after some period of time
        recrawl_at = datetime.now() + self.repeat_delta
        self.jobs.reschedule_job(job, recrawl_at)
    except Exception:
        # Release the job back onto the queue so another worker can retry it
        self.jobs.release_job(job)
        print("Releasing job %d because an exception occurred" % job.id)
        raise
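work() delegates the politeness check to can_crawl_url(), whose body isn't shown. A minimal sketch of what such a check could look like, assuming it consults the site's robots.txt (the fail-closed behavior and the Url.geturl() call are assumptions, not the post's actual implementation):

from urllib.parse import urljoin
from urllib.robotparser import RobotFileParser


def can_crawl_url(url, user_agent='*'):
    # Fetch and parse the site's robots.txt for this URL's host.
    parser = RobotFileParser()
    parser.set_url(urljoin(url.geturl(), '/robots.txt'))

    try:
        parser.read()
    except OSError:
        # If robots.txt is unreachable, err on the side of not crawling.
        return False

    return parser.can_fetch(user_agent, url.geturl())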
def discover_urls(self):
    """ Discover URLs in the document and save them in the database. """
    allowed_domains = get_allowed_domains()

    def is_allowed(u):
        return u.domain() in allowed_domains or u.domain() == ''

    insert_count = 0
    cursor = self.db.cursor()

    for link in self.soup.find_all('a'):
        href = link.get('href')

        # Skip anchors that have no href attribute
        if not href:
            continue

        url = Url(url=href, base=self.url)

        if is_allowed(url):
            url.insert_bare(cursor)
            insert_count += 1

    if insert_count > 0:
        print("Discovered %d new URLs" % insert_count)

    self.db.commit()
    cursor.close()
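Each accepted URL is handed an open cursor via insert_bare(), which isn't shown. A minimal sketch of what it could do, assuming a SQLite backend with a unique constraint on the url column (the table and column names are placeholders, not the post's actual schema):

def insert_bare(self, cursor):
    # Assumed schema: CREATE TABLE urls (url TEXT UNIQUE, ...).
    # INSERT OR IGNORE silently skips URLs discovered earlier, so
    # re-crawling a page doesn't pile up duplicates.
    cursor.execute(
        'INSERT OR IGNORE INTO urls (url) VALUES (?)',
        (self.geturl(),)
    )

Reusing one cursor for the whole loop and committing once at the end of discover_urls() keeps the writes cheap compared to a commit per link.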
def test_default_url_parsing():
    location = 'https://syntaxleiden.nl/foo'
    u = Url(url=location)

    assert_equal(u.geturl(), location)
def test_relative_url_parsing_with_scheme():
    location = '/foo'
    u = Url(url=location, base='https://syntaxleiden.nl')

    assert_equal(u.geturl(), 'https://syntaxleiden.nl/foo')
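Both tests hinge on the Url constructor resolving a relative path against an optional base. A minimal sketch of the parsing side of that class, built on urllib.parse; the real class also persists itself to the database, so only the behavior the tests pin down is reconstructed here:

from urllib.parse import urljoin, urlparse


class Url:
    def __init__(self, url, base=None):
        # Resolve relative URLs against the base, as in the second test.
        self.parsed = urlparse(urljoin(base, url) if base else url)

    def geturl(self):
        return self.parsed.geturl()

    def domain(self):
        # An unresolved relative URL has an empty netloc, which is what
        # the u.domain() == '' branch in is_allowed() above relies on.
        return self.parsed.netloc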
def add_url():
    if request.method == 'POST':
        url = Url(url=request.form['url'])

        return render_template('add_url.html', added=url.insert(), url=url)
    else:
        return render_template('add_url.html', added=None)
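The snippet leaves out the route decorator. Assuming the handler is registered at a hypothetical '/add-url' path accepting GET and POST, it could be exercised with Flask's built-in test client (the module name and path below are placeholders):

from app import app  # hypothetical module exposing the Flask app


def test_add_url_form():
    client = app.test_client()

    # GET renders the empty form (added=None).
    assert client.get('/add-url').status_code == 200

    # POST submits a URL; the template reports whether the insert succeeded.
    response = client.post('/add-url', data={'url': 'https://syntaxleiden.nl/'})
    assert response.status_code == 200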