def setUp(self):
    """Build the shared HTML fixture and parse it into a Form instance."""
    # Fixture: a form holding text, file, select and radio inputs.
    self.html = ''' <form> <input name="vocals" /> <input name="guitar" type="file" /> <select name="drums"> <option value="roger">Roger<br /> <option value="john">John<br /> </select> <input type="radio" name="bass" value="Roger">Roger<br /> <input type="radio" name="bass" value="John">John<br /> </form> '''
    self.form = Form(self.html)
class TestForm(unittest.TestCase):
    """Checks that Form discovers every named field of a small HTML form."""

    def setUp(self):
        """Parse the HTML fixture into the Form under test."""
        # Fixture: a form holding text, file, select and radio inputs.
        self.html = ''' <form> <input name="vocals" /> <input name="guitar" type="file" /> <select name="drums"> <option value="roger">Roger<br /> <option value="john">John<br /> </select> <input type="radio" name="bass" value="Roger">Roger<br /> <input type="radio" name="bass" value="John">John<br /> </form> '''
        self.form = Form(self.html)

    def test_fields(self):
        """All named inputs are exposed both via .fields and key iteration."""
        keys = {'vocals', 'guitar', 'drums', 'bass'}
        assert_equal(set(self.form.fields.keys()), keys)
        assert_equal(set(self.form.keys()), keys)
def parse_html(self, body_html, base_url):
    """Parse every <form> of an HTML page into Form objects.

    :param body_html: HTML document (str or bytes) to scan for forms
    :param base_url: URL used to absolutize relative form actions
    :returns: list of Form objects with an absolute `action` attribute
    """
    soup = BeautifulSoup(body_html, 'html.parser')
    result = list()
    for form_node in soup.findAll('form'):
        parsed_form = Form(form_node)
        # RoboBrowser does not handle submit buttons: add named ones manually.
        for button in form_node.find_all('button'):
            # Nameless buttons cannot be submitted as fields; skip them.
            if not button.attrs.get('name'):
                continue
            if button.attrs.get('type') == "submit":
                parsed_form.add_field(BaseField(button))
        # Make the form action absolute against the page URL.
        if not parsed_form.action:
            parsed_form.action = base_url
        elif not re_search("^https?://", parsed_form.action):
            parsed_form.action = urljoin(base_url, parsed_form.action)
        result.append(parsed_form)
    return result
def _populate_session_headers(session, ua, headers_in, headers):
    """Apply the User-Agent, application-configured headers and extra headers
    to a requests.Session.

    Best-effort on headers_in: a None or malformed iterable is ignored rather
    than aborting the fetch (preserves the original silent-skip behavior).
    """
    session.headers.update({'User-Agent': ua})
    try:
        for header in headers_in:
            if header.action in ('set', 'add'):
                session.headers.update({header.name: header.value})
    except Exception:
        # headers_in is optional; deliberately best-effort
        pass
    for k, v in headers.items():
        session.headers.update({k: v})


def fetch_forms(logger, uris, req, sso_vulture_agent, headers=None, ssl_context=None, headers_in=None, proxy_client_side_certificate=None):
    """
    Fetch forms inside an html page.

    Tries each URI of `uris` in order and keeps the first one that answers,
    then follows an optional HTML <meta> redirect before returning.

    :param logger: logger instance
    :param uris: iterable of candidate URIs to fetch (first success wins)
    :param req: the request from the user browser, used to get referer,
                user-agent...
    :param sso_vulture_agent: a Boolean telling if we have to use the Vulture
                              User-Agent or the browser User-Agent
    :param headers: optional dict that contains headers to send in the request
    :param ssl_context: optional ssl.SSLContext used for the outgoing request
    :param headers_in: optional iterable of header objects (name/value/action)
                       defined in the application config
    :param proxy_client_side_certificate: optional client certificate; only
                                          used together with ssl_context
    :returns: (list of Form, final URI string, requests.Response,
               response body bytes), or (None, None, None, None) on failure
    """
    if headers is None:
        headers = {}
    # Pick the User-Agent: Vulture's own, or the browser's when available.
    if sso_vulture_agent:
        ua = vulture_custom_agent
    else:
        try:
            ua = req.META['HTTP_USER_AGENT']
        except (AttributeError, KeyError, TypeError):
            ua = vulture_custom_agent

    verify_certificate = False
    session = requests.Session()
    if ssl_context is not None:
        # requests version 2.18.1 needed for the following line
        session.mount("https://", SSLAdapter(ssl_context.protocol))
        if ssl_context.verify_mode != ssl.CERT_NONE:
            verify_certificate = "/home/vlt-sys/Engine/conf/certs/"
    # A client certificate only makes sense together with an SSL context.
    if not proxy_client_side_certificate or not ssl_context:
        proxy_client_side_certificate = None

    _populate_session_headers(session, ua, headers_in, headers)

    try:
        response = None
        response_body = ""
        for uri in uris:
            try:
                response = session.get(uri, verify=verify_certificate,
                                       cert=proxy_client_side_certificate)
                response_body = response.content
                break
            except Exception as e:
                logger.error("FETCH_FORMS::Exception while getting uri '{}' : {}".format(uri, e))
        if response is None:
            logger.error("FETCH_FORMS::No url could be fetched among the following list : {}".format(uris))
            return None, None, None, None
        if not response_body:
            # Empty body (e.g. 401 Unauthorized): nothing to parse.
            return None, None, None, None
    except Exception as e:
        logger.error("FETCH_FORMS::Exception requesting {} : {}".format(str(uri), str(e)))
        return None, None, None, None

    try:
        # Follow a <meta ... url=...> redirect if present (HTTP 301/302 are
        # already followed by requests itself). Raw bytes pattern: response
        # content is bytes in Python 3.
        redirect_re = re.compile(rb'<meta[^>]*?url=\s*(.*?)["\']', re.IGNORECASE)
        match = redirect_re.search(response_body)
        if match:
            # The captured target is bytes; decode so `uri` stays a str.
            uri = match.group(1).strip().decode('utf-8', errors='replace')
            session = requests.Session()
            # FIX: the original wrote {header.name: headers.value} here (dict
            # attribute lookup), which always raised and silently dropped the
            # application-configured headers on the redirect request.
            _populate_session_headers(session, ua, headers_in, headers)
            response = session.get(uri)
            # response.content is already bytes: the previous
            # response.content.encode('utf-8') could only raise (bytes has no
            # .encode), which failed the whole fetch on non-UTF-8 pages.
            response_body = response.content
    except Exception as e:
        logger.debug("fetch_forms Exception: " + str(e))
        return None, None, None, None

    # Parse response with BeautifulSoup and robobrowser => PYTHON 3
    parsed = BeautifulSoup(response_body, 'html.parser')
    resp = []
    for form in parsed.findAll('form'):
        f = Form(form)
        # RoboBrowser does not handle submit buttons: add named ones manually.
        for field in form.find_all('button'):
            # Nameless buttons cannot be submitted as fields; skip them.
            if not field.attrs.get('name'):
                continue
            if field.attrs.get('type') == "submit":
                f.add_field(BaseField(field))
        resp.append(f)
    return (resp, uri, response, response_body)