示例#1
0
def spider(init, max=-1, ignore_qs=False, post_func=None, hosts=None):
    """
  Spider a request by following some links.

  init    - The initial request(s)
  max       - The maximum of request to execute
  post_func - A hook to be executed after each new page fetched
  hosts     - A lists of authorised hosts to spider on. By default
              only the hostname of r_init is allowed.
  """
    nb = 0
    checked = []
    if isinstance(init, Request):
        q = deque([
            init,
        ])
        hs = [
            init.hostname,
        ]
    elif isinstance(init, RequestSet):
        q = deque(init)
        hs = list(set(init.extract("hostname")))
    else:
        raise TypeError("init must be a Request or a RequestSet")
    if hosts:
        hs += hosts
    try:
        while nb != max and q:
            to_add = []
            r = q.popleft()
            print str(len(checked)) + "/" + str(len(q)),
            clear_line()
            if not r.response:
                r()
            if r.response.content_type:
                if re.match(r'text/html', r.response.content_type):
                    to_add += _follow_redirect(r)
                    to_add += _get_links(r)
                else:
                    print "\nIgnoring", r.response.content_type
            checked.append(r)
            if post_func:
                post_func(r)
            for nr in to_add:
                if nr.hostname not in hs:
                    continue
                if not ignore_qs and any(nr == rc for rc in checked + list(q)):
                    continue
                if ignore_qs and any(
                        nr.similar(rc) for rc in checked + list(q)):
                    continue
                q.append(nr)
            nb += 1
    except KeyboardInterrupt:
        print str(len(checked)) + "/" + str(len(q))
    return RequestSet(checked)
示例#2
0
 def __call__(self,
              force=False,
              randomised=False,
              verbose=1,
              post_func=None,
              post_args=[]):
     if not self.reqs:
         raise Exception("No request to proceed")
     hostnames = set([r.hostname for r in self.reqs])
     ports = set([r.port for r in self.reqs])
     use_ssls = set([r.use_ssl for r in self.reqs])
     if len(hostnames) > 1 or len(ports) > 1 or len(use_ssls) > 1:
         raise Exception("Only one host per request set to run it")
     self.hostname = hostnames.pop()
     self.port = ports.pop()
     self.use_ssl = use_ssls.pop()
     if force and verbose:
         print "Clearing previous responses..."
         self.clear()
     conn = self._init_connection()
     if verbose:
         print "Running {} requests...".format(len(self.reqs)),
         clear_line()
     indices = range(len(self.reqs))
     if randomised: random.shuffle(indices)
     done = 0
     todo = len(self.reqs)
     for i in indices:
         r = self.reqs[i]
         if verbose:
             print "Running {} requests...{:.2f}%".format(
                 todo, done * 100. / todo),
             clear_line()
         next = False
         if r.response and not force:
             todo -= 1
             next = True
         while not next:
             try:
                 if verbose == 2: print repr(r)
                 r(conn=conn)
                 if post_func: post_func(r, *post_args)
                 if verbose == 2: print repr(r.response)
                 if r.response.closed:
                     conn = self._init_connection()
                 done += 1
                 next = True
             except (socket.error, BadStatusLine):
                 conn = self._init_connection()
                 next = False
             if conf.delay:
                 time.sleep(conf.delay)
     if verbose:
         print "Running {} requests...done.".format(len(self.reqs))
     conn.close()
示例#3
0
def spider(init, max=-1, ignore_qs=False, post_func=None, hosts=None):
  """
  Spider a request by following some links.

  init    - The initial request(s)
  max       - The maximum of request to execute
  post_func - A hook to be executed after each new page fetched
  hosts     - A lists of authorised hosts to spider on. By default
              only the hostname of r_init is allowed.
  """
  nb = 0
  checked = []
  if isinstance(init, Request):
    q = deque([init, ])
    hs = [ init.hostname, ]
  elif isinstance(init, RequestSet):
    q = deque(init)
    hs = list(set(init.extract("hostname")))
  else:
    raise TypeError("init must be a Request or a RequestSet")
  if hosts:
    hs += hosts
  try:
    while nb != max and q:
      to_add = []
      r = q.popleft()
      print str(len(checked)) + "/" + str(len(q)),
      clear_line()
      if not r.response:
        r()
      if r.response.content_type:
        if re.match(r'text/html', r.response.content_type):
          to_add += _follow_redirect(r)
          to_add += _get_links(r)
        else:
          print "\nIgnoring", r.response.content_type
      checked.append(r)
      if post_func:
        post_func(r)
      for nr in to_add:
        if nr.hostname not in hs:
          continue
        if not ignore_qs and any(nr == rc for rc in checked + list(q)):
          continue
        if ignore_qs and any(nr.similar(rc) for rc in checked + list(q)):
          continue
        q.append(nr)
      nb += 1
  except KeyboardInterrupt:
    print str(len(checked)) + "/" + str(len(q))
  return RequestSet(checked)
示例#4
0
 def __call__(self, force=False, randomised=False, verbose=1,
              post_func=None, post_args=[]):
   if not self.reqs:
     raise Exception("No request to proceed")
   hostnames = set([r.hostname for r in self.reqs])
   ports = set([r.port for r in self.reqs])
   use_ssls = set([r.use_ssl for r in self.reqs])
   if len(hostnames) > 1 or len(ports) > 1 or len(use_ssls) > 1:
     raise Exception("Only one host per request set to run it")
   self.hostname = hostnames.pop()
   self.port = ports.pop()
   self.use_ssl = use_ssls.pop()
   if force and verbose:
     print "Clearing previous responses..."
     self.clear()
   conn = self._init_connection()
   if verbose:
     print "Running {} requests...".format(len(self.reqs)),
     clear_line()
   indices = range(len(self.reqs))
   if randomised: random.shuffle(indices)
   done = 0
   todo = len(self.reqs)
   for i in indices:
     r = self.reqs[i]
     if verbose:
       print "Running {} requests...{:.2f}%".format(todo, done * 100. / todo),
       clear_line()
     next = False
     if r.response and not force:
       todo -= 1
       next = True
     while not next:
       try:
         if verbose == 2: print repr(r)
         r(conn=conn)
         if post_func: post_func(r, *post_args)
         if verbose == 2: print repr(r.response)
         if r.response.closed:
           conn = self._init_connection()
         done += 1
         next = True
       except (socket.error, BadStatusLine):
         conn = self._init_connection()
         next = False
       if conf.delay:
         time.sleep(conf.delay)
   if verbose:
     print "Running {} requests...done.".format(len(self.reqs))
   conn.close()
示例#5
0
def spider(r_init, max=-1, post_func=None, hosts=None):
  """
  Spider a request by following some links.

  r_init    - The initial request
  max       - The maximum of request to execute
  post_func - A hook to be executed after each new page fetched
  hosts     - A lists of authorised hosts to spider on. By default,
              only the hostname of r_init is allowed.
  """
  q = deque([r_init, ])
  checked = []
  nb = 0
  hs = [r_init.hostname, ]
  if hosts:
    hs += hosts
  try:
    while nb != max and q:
      to_add = []
      r = q.popleft()
      print str(len(checked)) + "/" + str(len(q)),
      clear_line()
      r()
      if r.response.content_type:
        if re.match(r'text/html', r.response.content_type):
          to_add += _follow_redirect(r)
          to_add += _get_links(r)
        else:
          print "\nIgnoring", r.response.content_type
      checked.append(r)
      if post_func:
        post_func(r)
      for nr in to_add:
        if nr.hostname not in hs:
          continue
        if any(nr == rc for rc in checked + list(q)):
          continue
        q.append(nr)
      nb += 1
  except KeyboardInterrupt:
    pass
  return RequestSet(checked)