def __init__(self, start_url, sitemap_url=None):
    """Initialise the URL bookkeeping state derived from ``start_url``.

    Args:
        start_url: URL from which the base domain and base site are
            extracted and from which the first pending page is built.
        sitemap_url: Sitemap location to use. Any falsy value (the
            default ``None`` included) selects ``<base_site>/sitemap.xml``.
    """
    # URL bookkeeping: both sets start out empty.
    self.visited_urls = set()
    self.intermediate_urls = set()

    self.logger = logging.getLogger(__name__)

    # Identity of the site, as computed by the extract_* helpers.
    self.base_domain = extract_domain(start_url)
    self.base_site = extract_base_site(start_url)

    # The pending set is seeded with a single page built from start_url.
    seed_page = _get_client_page(
        start_url,
        None,
        start_url,
        self.base_domain,
        DOMAINS_TO_BE_SKIPPED,
    )
    self.non_visited_urls = {seed_page}

    # Counters; added_count presumably accounts for the seed page above
    # (TODO confirm against the code that increments it).
    self.added_count = 1
    self.idle_ping = 0
    # NOTE(review): a `self.coop = task.Cooperator()` assignment was
    # commented out at this point in the original; it stays disabled.
    self.start_idle_counter = False

    # An explicit (truthy) sitemap_url wins; otherwise fall back to the
    # sitemap.xml path under the base site.
    self.sitemap_url = sitemap_url or '{}/sitemap.xml'.format(self.base_site)
def __init__(self, start_url, sitemap_url=None,
             max_concurrent_connections=MAX_CONCURRENT_REQUESTS_PER_SERVER):
    """Initialise URL bookkeeping, the page queue and the semaphore.

    Args:
        start_url: URL from which the base domain and base site are
            extracted and from which the base page is built.
        sitemap_url: Sitemap location to use. Any falsy value (the
            default ``None`` included) selects ``<base_site>/sitemap.xml``.
        max_concurrent_connections: Value stored on the instance and
            passed to ``BoundedSemaphore``; defaults to
            ``MAX_CONCURRENT_REQUESTS_PER_SERVER``.
    """
    # URL bookkeeping: both sets start out empty.
    self.visited_urls = set()
    self.intermediate_urls = set()

    # Identity of the site, as computed by the extract_* helpers.
    self.base_domain = extract_domain(start_url)
    self.base_site = extract_base_site(start_url)

    # The base page is kept on the instance and is also the only initial
    # member of the pending set.
    self.base_page = _get_client_page(
        start_url,
        None,
        start_url,
        self.base_domain,
        DOMAINS_TO_BE_SKIPPED,
    )
    self.non_visited_urls = {self.base_page}

    # Counters; added_count presumably accounts for the base page above
    # (TODO confirm against the code that increments it).
    self.added_count = 1
    self.idle_ping = 0
    self.start_idle_counter = False

    # An explicit (truthy) sitemap_url wins; otherwise fall back to the
    # sitemap.xml path under the base site.
    self.sitemap_url = sitemap_url or u'{}/sitemap.xml'.format(self.base_site)

    # Queue plus a semaphore initialised with the configured connection
    # count.
    self.max_concurrent_connections = max_concurrent_connections
    self.page_queue = JoinableQueue()
    self.semaphore = BoundedSemaphore(self.max_concurrent_connections)

    # Wall-clock timestamp taken at construction, and the skip counter.
    self.start = time.time()
    self.skip_count = 0