def spider(init, max=-1, ignore_qs=False, post_func=None, hosts=None): """ Spider a request by following some links. init - The initial request(s) max - The maximum of request to execute post_func - A hook to be executed after each new page fetched hosts - A lists of authorised hosts to spider on. By default only the hostname of r_init is allowed. """ nb = 0 checked = [] if isinstance(init, Request): q = deque([ init, ]) hs = [ init.hostname, ] elif isinstance(init, RequestSet): q = deque(init) hs = list(set(init.extract("hostname"))) else: raise TypeError("init must be a Request or a RequestSet") if hosts: hs += hosts try: while nb != max and q: to_add = [] r = q.popleft() print str(len(checked)) + "/" + str(len(q)), clear_line() if not r.response: r() if r.response.content_type: if re.match(r'text/html', r.response.content_type): to_add += _follow_redirect(r) to_add += _get_links(r) else: print "\nIgnoring", r.response.content_type checked.append(r) if post_func: post_func(r) for nr in to_add: if nr.hostname not in hs: continue if not ignore_qs and any(nr == rc for rc in checked + list(q)): continue if ignore_qs and any( nr.similar(rc) for rc in checked + list(q)): continue q.append(nr) nb += 1 except KeyboardInterrupt: print str(len(checked)) + "/" + str(len(q)) return RequestSet(checked)
def __call__(self, force=False, randomised=False, verbose=1, post_func=None, post_args=[]): if not self.reqs: raise Exception("No request to proceed") hostnames = set([r.hostname for r in self.reqs]) ports = set([r.port for r in self.reqs]) use_ssls = set([r.use_ssl for r in self.reqs]) if len(hostnames) > 1 or len(ports) > 1 or len(use_ssls) > 1: raise Exception("Only one host per request set to run it") self.hostname = hostnames.pop() self.port = ports.pop() self.use_ssl = use_ssls.pop() if force and verbose: print "Clearing previous responses..." self.clear() conn = self._init_connection() if verbose: print "Running {} requests...".format(len(self.reqs)), clear_line() indices = range(len(self.reqs)) if randomised: random.shuffle(indices) done = 0 todo = len(self.reqs) for i in indices: r = self.reqs[i] if verbose: print "Running {} requests...{:.2f}%".format( todo, done * 100. / todo), clear_line() next = False if r.response and not force: todo -= 1 next = True while not next: try: if verbose == 2: print repr(r) r(conn=conn) if post_func: post_func(r, *post_args) if verbose == 2: print repr(r.response) if r.response.closed: conn = self._init_connection() done += 1 next = True except (socket.error, BadStatusLine): conn = self._init_connection() next = False if conf.delay: time.sleep(conf.delay) if verbose: print "Running {} requests...done.".format(len(self.reqs)) conn.close()
def spider(init, max=-1, ignore_qs=False, post_func=None, hosts=None): """ Spider a request by following some links. init - The initial request(s) max - The maximum of request to execute post_func - A hook to be executed after each new page fetched hosts - A lists of authorised hosts to spider on. By default only the hostname of r_init is allowed. """ nb = 0 checked = [] if isinstance(init, Request): q = deque([init, ]) hs = [ init.hostname, ] elif isinstance(init, RequestSet): q = deque(init) hs = list(set(init.extract("hostname"))) else: raise TypeError("init must be a Request or a RequestSet") if hosts: hs += hosts try: while nb != max and q: to_add = [] r = q.popleft() print str(len(checked)) + "/" + str(len(q)), clear_line() if not r.response: r() if r.response.content_type: if re.match(r'text/html', r.response.content_type): to_add += _follow_redirect(r) to_add += _get_links(r) else: print "\nIgnoring", r.response.content_type checked.append(r) if post_func: post_func(r) for nr in to_add: if nr.hostname not in hs: continue if not ignore_qs and any(nr == rc for rc in checked + list(q)): continue if ignore_qs and any(nr.similar(rc) for rc in checked + list(q)): continue q.append(nr) nb += 1 except KeyboardInterrupt: print str(len(checked)) + "/" + str(len(q)) return RequestSet(checked)
def __call__(self, force=False, randomised=False, verbose=1, post_func=None, post_args=[]): if not self.reqs: raise Exception("No request to proceed") hostnames = set([r.hostname for r in self.reqs]) ports = set([r.port for r in self.reqs]) use_ssls = set([r.use_ssl for r in self.reqs]) if len(hostnames) > 1 or len(ports) > 1 or len(use_ssls) > 1: raise Exception("Only one host per request set to run it") self.hostname = hostnames.pop() self.port = ports.pop() self.use_ssl = use_ssls.pop() if force and verbose: print "Clearing previous responses..." self.clear() conn = self._init_connection() if verbose: print "Running {} requests...".format(len(self.reqs)), clear_line() indices = range(len(self.reqs)) if randomised: random.shuffle(indices) done = 0 todo = len(self.reqs) for i in indices: r = self.reqs[i] if verbose: print "Running {} requests...{:.2f}%".format(todo, done * 100. / todo), clear_line() next = False if r.response and not force: todo -= 1 next = True while not next: try: if verbose == 2: print repr(r) r(conn=conn) if post_func: post_func(r, *post_args) if verbose == 2: print repr(r.response) if r.response.closed: conn = self._init_connection() done += 1 next = True except (socket.error, BadStatusLine): conn = self._init_connection() next = False if conf.delay: time.sleep(conf.delay) if verbose: print "Running {} requests...done.".format(len(self.reqs)) conn.close()
def spider(r_init, max=-1, post_func=None, hosts=None): """ Spider a request by following some links. r_init - The initial request max - The maximum of request to execute post_func - A hook to be executed after each new page fetched hosts - A lists of authorised hosts to spider on. By default, only the hostname of r_init is allowed. """ q = deque([r_init, ]) checked = [] nb = 0 hs = [r_init.hostname, ] if hosts: hs += hosts try: while nb != max and q: to_add = [] r = q.popleft() print str(len(checked)) + "/" + str(len(q)), clear_line() r() if r.response.content_type: if re.match(r'text/html', r.response.content_type): to_add += _follow_redirect(r) to_add += _get_links(r) else: print "\nIgnoring", r.response.content_type checked.append(r) if post_func: post_func(r) for nr in to_add: if nr.hostname not in hs: continue if any(nr == rc for rc in checked + list(q)): continue q.append(nr) nb += 1 except KeyboardInterrupt: pass return RequestSet(checked)