def test_remove_dupes(url):
    params = extract_params(url)
    validated_url = validate_url(url)
    validated_params = extract_params(validated_url)
    assert len(params) == len(validated_params)
    for p in params:
        assert (str(p) + "=" + str(params[p])) in validated_url
def test_remove_params(url, params_to_remove):
    validated_url = validate_url(url, params_to_remove)
    url_params = extract_params(url)
    for p in params_to_remove:
        if p in url_params:
            del url_params[p]
    validated_params = extract_params(validated_url)
    assert len(url_params) == len(validated_params)
    for u in url_params:
        assert u in validated_params
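# The two tests above assume an extract_params helper that maps a URL's query
# string to a dict of parameter names and values. The sketch below is a
# minimal stand-in for that assumed behavior (not necessarily the project's
# real implementation), built on the standard library.
from urllib.parse import urlparse, parse_qsl


def extract_params(url):
    """Return the URL's query parameters as a dict of name -> value."""
    # urlparse still isolates the query string even for scheme-less URLs such
    # as 'www.austintexas.gov?a=1&b=2'; parse_qsl splits it into pairs.
    return dict(parse_qsl(urlparse(url).query))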
def _deploy(self, job_id):
    """
    Deploy a spider to crawl the web.

    Use the DeploymentManager's enqueue method to specify which URLs to
    crawl. Depth should be assigned to each submitted URL prior to
    deployment.

    Arguments:
        job_id: integer job id.

    Returns:
        None
    """
    if data.job_is_aborted(job_id):
        self._active = False
        self._queue = []
        return
    self._active = True
    queue_copy = self._queue[:]
    for index, url in enumerate(queue_copy):
        if data.job_is_aborted(job_id):
            break
        self._queue.remove(url)
        validated_url = validate_url(url)
        url = validated_url['url']
        webpage_info = data.get_webpage_info(url)
        if not claim(url):
            continue
        if not validated_url['valid']:
            continue
        # Ignore webpages crawled less than 15 min ago.
        if self._less_than_15_min_ago(webpage_info['completion_datetime']):
            continue
        # Database latency means depth is occasionally still unavailable.
        if not webpage_info['depth']:
            # Child URLs with no job_id and no depth have been deleted.
            if bool(data.redis.llen('reg:' + url)):
                data.redis.set(url, 'ready')
                self._queue.append(url)
            continue
        depth = webpage_info['depth'] - 1
        self._set_job_status(job_id, depth, index, len(queue_copy))
        self._fetch_and_parse(job_id, url, depth)
        time.sleep(self.delay)
    if data.job_is_aborted(job_id):
        self._active = False
        self._queue = []
    else:
        if len(self._queue):
            time.sleep(self.delay)
            self._deploy(job_id)
        else:
            self._set_job_status(job_id, -1, -1, 0, 'Complete')
            self._active = False
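# _deploy relies on a _less_than_15_min_ago helper to skip pages that were
# crawled recently. A minimal sketch of that assumed check, written here as a
# method and assuming completion_datetime is a datetime.datetime (the
# project's real method may differ, e.g. in how it treats naive vs.
# timezone-aware values):
from datetime import datetime, timedelta


def _less_than_15_min_ago(self, completion_datetime):
    """Return True if the page finished crawling within the last 15 minutes."""
    if not completion_datetime:
        # No recorded completion time means the page has never been crawled.
        return False
    return datetime.now() - completion_datetime < timedelta(minutes=15)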
def test_convert_domain_suffix(url, expected):
    assert validate_url(url) == expected
def test_remove_non_existent_params():
    url = 'www.austintexas.gov?a=1&b=2&foo=bar&3=5&4=cats'
    validated_url = validate_url(url, ['cookie'])
    url_params = extract_params(url)
    validated_params = extract_params(validated_url)
    assert len(url_params) == len(validated_params)
def test_identity():
    assert validate_url("www.austintexas.gov") == "www.austintexas.gov"
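# The argument-taking tests above (test_remove_dupes, test_remove_params,
# test_convert_domain_suffix) are presumably fed data through fixtures or
# parametrization. A sketch of one possible pytest hookup, reusing the
# identity case from test_identity as the only grounded example pair; the
# project's actual test data and wiring are not shown here.
import pytest


@pytest.mark.parametrize('url, expected', [
    ('www.austintexas.gov', 'www.austintexas.gov'),
])
def test_convert_domain_suffix_parametrized(url, expected):
    assert validate_url(url) == expected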
def handle_a(self, href):
    if href:
        validated_href = validate_url(href, self.url)
        if validated_href['valid']:
            target = urldefrag(urljoin(self.url, validated_href['url']))[0]
            self.hyperlinks.append(target)
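# handle_a looks like a per-tag callback on an HTML parsing class. The class
# sketch below shows one way such a callback could be driven from Python's
# html.parser.HTMLParser; it is an assumption about the surrounding class,
# not the project's actual wiring, and it expects handle_a (defined above)
# and validate_url to be available on the class / module.
from html.parser import HTMLParser
from urllib.parse import urljoin, urldefrag


class LinkCollector(HTMLParser):
    def __init__(self, url):
        super().__init__()
        self.url = url
        self.hyperlinks = []

    def handle_starttag(self, tag, attrs):
        # Dispatch <a> tags to handle_a with their href attribute, if any.
        if tag == 'a':
            self.handle_a(dict(attrs).get('href'))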