Example #1
def setUp(self):
    self.html = '''
        <form>
            <input name="vocals" />
            <input name="guitar" type="file" />
            <select name="drums">
                <option value="roger">Roger<br />
                <option value="john">John<br />
            </select>
            <input type="radio" name="bass" value="Roger">Roger<br />
            <input type="radio" name="bass" value="John">John<br />
        </form>
    '''
    self.form = Form(self.html)
Example #2
# Imports assumed for this snippet (Form ships with robobrowser)
import unittest

from nose.tools import assert_equal
from robobrowser.forms.form import Form


class TestForm(unittest.TestCase):

    def setUp(self):
        self.html = '''
            <form>
                <input name="vocals" />
                <input name="guitar" type="file" />
                <select name="drums">
                    <option value="roger">Roger<br />
                    <option value="john">John<br />
                </select>
                <input type="radio" name="bass" value="Roger">Roger<br />
                <input type="radio" name="bass" value="John">John<br />
            </form>
        '''
        self.form = Form(self.html)

    def test_fields(self):
        keys = {'vocals', 'guitar', 'drums', 'bass'}
        assert_equal(set(self.form.fields.keys()), keys)
        assert_equal(set(self.form.keys()), keys)
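
For reference, a minimal standalone sketch of the same API, assuming robobrowser is installed; the markup and the 'Freddie' value are illustrative only:

from robobrowser.forms.form import Form

html = '''
    <form>
        <input name="vocals" />
        <input type="radio" name="bass" value="Roger">Roger<br />
        <input type="radio" name="bass" value="John">John<br />
    </form>
'''
form = Form(html)                    # Form accepts a raw HTML string
print(sorted(form.fields.keys()))    # ['bass', 'vocals']
form['vocals'].value = 'Freddie'     # set a plain input's value
print(form['vocals'].value)          # 'Freddie'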
Example #3
# Assumed imports for this snippet:
#   from re import search as re_search
#   from urllib.parse import urljoin
#   from bs4 import BeautifulSoup
#   from robobrowser.forms.form import Form
#   from robobrowser.forms.fields import BaseField
def parse_html(self, body_html, base_url):
    parsed = BeautifulSoup(body_html, 'html.parser')
    forms = []
    for form in parsed.findAll('form'):
        f = Form(form)
        # RoboBrowser does not handle submit buttons, so add them manually
        for field in form.find_all('button'):
            # Skip buttons without a name attribute
            if not field.attrs.get('name'):
                continue
            if field.attrs.get('type') == "submit":
                f.add_field(BaseField(field))
        # Resolve the form action: default to the page URL, join relative paths
        if not f.action:
            f.action = base_url
        elif not re_search("^https?://", f.action):
            f.action = urljoin(base_url, f.action)
        forms.append(f)
    return forms
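
The action-resolution rule at the end is the part worth isolating: an empty action falls back to the page URL, while a relative action is joined against it. A minimal sketch of just that rule, with made-up URLs:

from re import search as re_search
from urllib.parse import urljoin

base_url = "https://example.com/app/login"
for action in ("", "submit.php", "https://sso.example.com/post"):
    resolved = action
    if not resolved:
        resolved = base_url                      # empty action: post back to the page
    elif not re_search("^https?://", resolved):
        resolved = urljoin(base_url, resolved)   # relative action: join with the page URL
    print(resolved)
# https://example.com/app/login
# https://example.com/app/submit.php
# https://sso.example.com/post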
Example #4
# Assumed module-level imports for this snippet:
#   import re, ssl, requests
#   from bs4 import BeautifulSoup
#   from robobrowser.forms.form import Form
#   from robobrowser.forms.fields import BaseField
# SSLAdapter and vulture_custom_agent are project-specific (Vulture) helpers.
def fetch_forms(logger, uris, req, sso_vulture_agent, headers=dict(), ssl_context=None, headers_in=None, proxy_client_side_certificate=None):
    """ Fetch the forms inside an HTML page.
    :param logger: logger instance
    :param uris: list of candidate URIs to fetch; the first one that answers is used
    :param req: the request from the user's browser, used to get the User-Agent
    :param sso_vulture_agent: Boolean telling whether to use the Vulture User-Agent instead of the browser's
    :param headers: optional dict of headers to send with the request
    :param ssl_context: optional SSL context used to mount an HTTPS adapter
    :param headers_in: optional request headers defined in the application config
    :param proxy_client_side_certificate: optional client-side certificate for the backend
    :returns: (list of robobrowser Form objects, final URI, requests Response, response body), or (None, None, None, None) on failure
    """

    if sso_vulture_agent:
        ua = vulture_custom_agent
    else:
        # Fall back to the Vulture agent if the browser did not send one
        ua = req.META.get('HTTP_USER_AGENT', vulture_custom_agent)

    verify_certificate = False
    session = requests.Session()
    if ssl_context is not None:
        # requests >= 2.18.1 is needed to mount an adapter this way
        session.mount("https://", SSLAdapter(ssl_context.protocol))
        if ssl_context.verify_mode != ssl.CERT_NONE:
            verify_certificate = "/home/vlt-sys/Engine/conf/certs/"

    # A client-side certificate only makes sense together with an SSL context
    if not proxy_client_side_certificate or not ssl_context:
        proxy_client_side_certificate = None

    session.headers.update({'User-Agent': ua})

    """ Add Request Header, if any defined in Application config """
    try:
        for header in headers_in:
            if header.action in ('set', 'add'):
                #request.add_header (header.name, header.value)
                session.headers.update({header.name: header.value})
    except:
        pass

    for k, v in headers.items():
        #request.add_header(k,v)
        session.headers.update({k: v})

    try:
        response = None
        response_body = ""
        # Try each candidate URI until one answers
        for uri in uris:
            try:
                response = session.get(uri, verify=verify_certificate, cert=proxy_client_side_certificate)
                response_body = response.content
                break
            except Exception as e:
                logger.error("FETCH_FORMS::Exception while getting uri '{}' : {}".format(uri, e))

        if response is None:
            logger.error("FETCH_FORMS::No url could be fetched among the following list : {}".format(uris))
            return None, None, None, None

        # An empty body (e.g. a 401 page) is treated as a failure
        if not response_body:
            return None, None, None, None
    except Exception as e:
        logger.error("FETCH_FORMS::Exception requesting {} : {}".format(str(uri), str(e)))
        return None, None, None, None

    try:
        # Follow a <meta> redirect if present (HTTP 301/302 redirects are
        # already followed by requests)
        redirect_re = re.compile(rb'<meta[^>]*?url=\s*(.*?)["\']', re.IGNORECASE)
        match = redirect_re.search(response_body)
        if match:
            uri = match.group(1).strip().decode()
            session = requests.Session()
            session.headers.update({'User-Agent': ua})

            # Add request headers, if any are defined in the application config
            try:
                for header in headers_in:
                    if header.action in ('set', 'add'):
                        session.headers.update({header.name: header.value})
            except Exception:
                # headers_in may be None
                pass

            for k, v in headers.items():
                session.headers.update({k: v})

            response = session.get(uri)
            response_body = response.content
            # Normalize the body to UTF-8 when the response uses another encoding
            if response.encoding and response.encoding.lower() != "utf-8":
                response_body = response.text.encode('utf-8')
    except Exception as e:
        logger.debug("fetch_forms Exception: " + str(e))
        return None, None, None, None

    # Parse the response with BeautifulSoup and extract the forms with robobrowser
    parsed = BeautifulSoup(response_body, 'html.parser')
    resp = []
    for form in parsed.findAll('form'):
        f = Form(form)
        # RoboBrowser does not handle submit buttons, so add them manually
        for field in form.find_all('button'):
            # Skip buttons without a name attribute
            if not field.attrs.get('name'):
                continue
            if field.attrs.get('type') == "submit":
                f.add_field(BaseField(field))
        resp.append(f)

    return (resp, uri, response, response_body)
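
A hedged sketch of how a caller might drive fetch_forms and fill the first returned form; the logger, the stand-in request object, the URL, and the field names are all assumptions for illustration:

import logging
import requests

logger = logging.getLogger("fetch_forms_demo")

# Stand-in for the Django request normally passed in; only META is read.
class FakeRequest:
    META = {'HTTP_USER_AGENT': 'Mozilla/5.0 (demo)'}

forms, uri, response, body = fetch_forms(
    logger,
    ["https://sso.example.com/login"],  # placeholder URL
    FakeRequest(),
    sso_vulture_agent=False,
)
if forms:
    login = forms[0]
    login['username'].value = 'alice'   # assumed field names
    login['password'].value = 's3cret'
    data = {name: field.value for name, field in login.fields.items()}
    requests.post(login.action or uri, data=data)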