Exemplo n.º 1
0
 def handle_data(self, data):
     """Collect link candidates from text seen while inside a script tag.

     Nothing happens unless self.inscript is truthy.  Otherwise the links
     returned by lamejs.lamejs(data).getLinks() are added to self.liens,
     followed by every double-quoted and then single-quoted literal that
     contains '/', '.' or '?' and is not listed in self.common_js_strings.
     """
     if not self.inscript:
         return

     self.liens.extend(lamejs.lamejs(data).getLinks())

     # Only literals made entirely of URL-like characters are matched.
     double_quoted = re.findall(r'"([A-Za-z0-9_=#&%\.\+\?/-]*)"', data)
     single_quoted = re.findall(r"'([A-Za-z0-9_=#&%\.\+\?/-]*)'", data)

     for literal in double_quoted + single_quoted:
         if literal in self.common_js_strings:
             continue
         if '/' in literal or '.' in literal or '?' in literal:
             self.liens.append(literal)
Exemplo n.º 2
0
 def handle_data(self, data):
     """Gather possible URLs from the text content of a script element.

     Returns immediately when self.inscript is falsy.  Otherwise
     self.liens receives the result of lamejs.lamejs(data).getLinks(),
     then each quoted literal (double-quoted matches first, single-quoted
     second) that holds a '/', '.' or '?' and does not appear in
     self.common_js_strings.
     """
     if not self.inscript:
         return

     self.liens.extend(lamejs.lamejs(data).getLinks())

     # Same character class for both quoting styles; order matters because
     # matches are appended in the order the patterns are tried.
     quoted_patterns = (
         r'"([A-Za-z0-9_=#&%\.\+\?/-]*)"',
         r"'([A-Za-z0-9_=#&%\.\+\?/-]*)'",
     )
     for pattern in quoted_patterns:
         for token in re.findall(pattern, data):
             looks_like_path = '/' in token or '.' in token or '?' in token
             if looks_like_path and token not in self.common_js_strings:
                 self.liens.append(token)
Exemplo n.º 3
0
 def handle_data(self, data):
     """Collect link candidates from text seen while inside a script tag.

     Nothing happens unless self.inscript is truthy.  Otherwise the links
     returned by lamejs.lamejs(data).getLinks() are added to self.liens,
     followed by every quoted literal (double-quoted first, then
     single-quoted) that contains one of the allowed extensions and is not
     listed in self.common_js_strings.  Each matching literal is appended
     exactly once per occurrence in the data.
     """
     if not self.inscript:
         return

     allowed_ext = (".php", ".asp", ".xml", ".js", ".json", ".jsp")
     self.liens.extend(lamejs.lamejs(data).getLinks())
     candidates = re.findall(r'"([A-Za-z0-9_=#&%\.\+\?/-]*)"', data)
     candidates += re.findall(r"'([A-Za-z0-9_=#&%\.\+\?/-]*)'", data)
     for jstr in candidates:
         if jstr in self.common_js_strings:
             continue
         # The extensions overlap (".js" is contained in ".json" and
         # ".jsp"), so test them together instead of appending once per
         # matching extension.
         if any(ext in jstr for ext in allowed_ext):
             self.liens.append(jstr)
Exemplo n.º 4
0
 def handle_data(self, data):
     """Gather possible URLs from the text content of a script element.

     Returns immediately when self.inscript is falsy.  Otherwise
     self.liens receives the result of lamejs.lamejs(data).getLinks(),
     then each quoted literal (double-quoted matches first, single-quoted
     second) that contains one of the allowed extensions and does not
     appear in self.common_js_strings.  A literal is appended once per
     occurrence in the data, however many extensions it matches.
     """
     if not self.inscript:
         return

     allowed_ext = (".php", ".asp", ".xml", ".js", ".json", ".jsp")
     self.liens.extend(lamejs.lamejs(data).getLinks())
     candidates = re.findall(r'"([A-Za-z0-9_=#&%\.\+\?/-]*)"', data)
     candidates += re.findall(r"'([A-Za-z0-9_=#&%\.\+\?/-]*)'", data)
     for jstr in candidates:
         if jstr in self.common_js_strings:
             continue
         # ".js" is a substring of ".json" and ".jsp"; checking each
         # extension separately would append such strings several times.
         if any(ext in jstr for ext in allowed_ext):
             self.liens.append(jstr)
Exemplo n.º 5
0
    def handle_starttag(self, tag, attrs):
        """Harvest links and form fields from an opening tag.

        tag   -- tag name, matched case-insensitively.
        attrs -- iterable of (name, value) pairs.  Pairs whose value is None
                 are skipped; names are lower-cased and only the first
                 occurrence of a name is kept.

        Links are appended to self.liens.  A form tag resets the form state
        (self.inform, self.form_values, self.current_form_url,
        self.current_form_method); input, textarea and select tags add
        [name, value] pairs to self.form_values while self.inform is 1.  A
        script tag sets self.inscript to 1.
        """
        tag_name = tag.lower()

        tmpdict = {}
        for k, v in attrs:
            if v is None:
                continue
            lk = k.lower()
            if lk not in tmpdict:
                tmpdict[lk] = v
                # Event-handler attributes hold script code; let lamejs
                # report the links it finds in them.
                if lk in self.js_events:
                    self.liens.extend(lamejs.lamejs(v).getLinks())

        if tag_name in ('a', 'link'):
            if "href" in tmpdict:
                href = tmpdict['href']
                if href.lower().startswith("javascript:"):
                    self.liens.extend(lamejs.lamejs(href.split(':', 1)[1]).getLinks())
                else:
                    self.liens.append(href)

        if tag_name == 'form':
            self.inform = 1
            self.form_values = []
            self.current_form_url = self.url
            if "action" in tmpdict:
                action = tmpdict['action']
                # The colon is part of the test so that an ordinary URL that
                # merely begins with "javascript" (e.g. "javascripts/x.php")
                # is not split on a ':' it does not contain.
                if action.lower().startswith("javascript:"):
                    self.liens.extend(lamejs.lamejs(action.split(':', 1)[1]).getLinks())
                self.liens.append(action)
                self.current_form_url = action

            # Forms use GET method by default
            self.current_form_method = "get"
            if tmpdict.get("method", "").lower() == "post":
                self.current_form_method = "post"

        if tag_name == 'input':
            if self.inform == 1 and "name" in tmpdict:
                # An input without a type attribute is handled as "text".
                input_type = tmpdict.get("type", "text").lower()
                if input_type in self.__defaults:
                    # use the value from the form or use our default value
                    if "value" in tmpdict and tmpdict["value"] != "":
                        val = tmpdict["value"]
                    else:
                        val = self.__defaults[input_type]
                    self.form_values.append([tmpdict['name'], val])

                if input_type == "image":
                    # Image inputs contribute a pair of coordinate fields.
                    self.form_values.append([tmpdict['name'] + ".x", "1"])
                    self.form_values.append([tmpdict['name'] + ".y", "1"])

            if "formaction" in tmpdict:
                self.liens.append(tmpdict['formaction'])

        if tag_name in ("textarea", "select"):
            if self.inform == 1 and "name" in tmpdict:
                self.form_values.append([tmpdict['name'], u'on'])

        if tag_name in ("frame", "iframe"):
            if "src" in tmpdict:
                self.liens.append(tmpdict['src'])

        if tag_name in ("img", "embed", "track", "source"):
            if "src" in tmpdict:
                # Only sources with a query string or Flash files are kept.
                src = tmpdict['src']
                if "?" in src or src.endswith(".swf"):
                    self.liens.append(src)

        if tag_name == "script":
            self.inscript = 1
            # Script sources are recorded with or without a query string.
            if "src" in tmpdict:
                self.liens.append(tmpdict['src'])

        if tag_name == "meta":
            if "http-equiv" in tmpdict and "content" in tmpdict:
                if tmpdict["http-equiv"].lower() == "refresh":
                    content = tmpdict["content"]
                    url_eq_idx = content.lower().find("url=")
                    if url_eq_idx >= 0:
                        target = content[url_eq_idx + 4:].strip()
                        # The refresh URL may be wrapped in quotes
                        # (content="0; url='/next'"); keep only the part
                        # between the opening quote and its match.
                        if target[:1] in ("'", '"'):
                            target = target[1:].split(target[0], 1)[0]
                        self.liens.append(target)
Exemplo n.º 6
0
                    # Mismatch ! Convert the response text to the encoding detected by BeautifulSoup
                    resp.setEncoding(page_encoding)
            else:
                page_encoding = resp_encoding
            data = resp.getPage()
        else:
            # Can't find an encoding... beware of non-html content
            data = resp.getRawPage()
            if "application/x-shockwave-flash" in mime_type or web_resource.file_ext == "swf":
                try:
                    flash_parser = swf_parser.swf_parser(data)
                    swf_links = flash_parser.getLinks()
                except Exception, err_data:
                    swf_links = err_data[1]
            elif "/x-javascript" in mime_type or "/x-js" in mime_type or "/javascript" in mime_type:
                js_links = lamejs.lamejs(data).getLinks()
            data = ""

        # Manage redirections
        if "location" in info:
            redir = self.correctlink(info["location"], current, current_full_url, currentdir, proto, None)
            if redir is not None:
                if self.__inzone(redir) == 0:
                    self.link_encoding[redir] = self.link_encoding[url]
                    redir = HTTP.HTTPResource(redir, link_depth=current_depth+1)
                    # Is the document not visited yet and not forbidden ?
                    if (redir not in self.browsed_links and
                        redir not in self.tobrowse and
                            not self.isExcluded(redir)):
                        self.tobrowse.append(redir)
Exemplo n.º 7
0
    def handle_starttag(self, tag, attrs):
        """Extract links and form data from a start tag.

        tag   -- tag name; compared in lower case.
        attrs -- iterable of (name, value) pairs.  Pairs with a None value
                 are ignored, names are lower-cased, and when a name is
                 repeated only its first value is used.

        Discovered links go to self.liens.  A form tag sets self.inform to 1
        and resets self.form_values, self.current_form_url and
        self.current_form_method; while self.inform is 1, named input,
        textarea and select tags append [name, value] pairs to
        self.form_values.  A script tag sets self.inscript to 1.
        """
        tag_name = tag.lower()

        tmpdict = {}
        for k, v in attrs:
            if v is None:
                continue
            lk = k.lower()
            if lk not in tmpdict:
                tmpdict[lk] = v
                # Attributes listed in self.js_events carry script code that
                # is handed to lamejs for link extraction.
                if lk in self.js_events:
                    self.liens.extend(lamejs.lamejs(v).getLinks())

        if tag_name in ('a', 'link'):
            if "href" in tmpdict:
                href = tmpdict['href']
                if href.lower().startswith("javascript:"):
                    self.liens.extend(lamejs.lamejs(href.split(':', 1)[1]).getLinks())
                else:
                    self.liens.append(href)

        if tag_name == 'form':
            self.inform = 1
            self.form_values = []
            self.current_form_url = self.url
            if "action" in tmpdict:
                action = tmpdict['action']
                # Test for the full "javascript:" scheme: a plain URL that
                # only starts with the word "javascript" has no ':' to split
                # on and must be handled as a regular action.
                if action.lower().startswith("javascript:"):
                    self.liens.extend(lamejs.lamejs(action.split(':', 1)[1]).getLinks())
                self.liens.append(action)
                self.current_form_url = action

            # Forms use GET method by default
            self.current_form_method = "get"
            if tmpdict.get("method", "").lower() == "post":
                self.current_form_method = "post"

        if tag_name == 'input':
            if self.inform == 1 and "name" in tmpdict:
                # A missing type attribute is treated as "text".
                input_type = tmpdict.get("type", "text").lower()
                if input_type in self.__defaults:
                    # use the value from the form or use our default value
                    if "value" in tmpdict and tmpdict["value"] != "":
                        val = tmpdict["value"]
                    else:
                        val = self.__defaults[input_type]
                    self.form_values.append([tmpdict['name'], val])

                if input_type == "image":
                    # Image inputs add ".x" and ".y" coordinate fields.
                    self.form_values.append([tmpdict['name'] + ".x", "1"])
                    self.form_values.append([tmpdict['name'] + ".y", "1"])

            if "formaction" in tmpdict:
                self.liens.append(tmpdict['formaction'])

        if tag_name in ("textarea", "select"):
            if self.inform == 1 and "name" in tmpdict:
                self.form_values.append([tmpdict['name'], u'on'])

        if tag_name in ("frame", "iframe"):
            if "src" in tmpdict:
                self.liens.append(tmpdict['src'])

        if tag_name in ("img", "embed", "track", "source"):
            if "src" in tmpdict:
                # Keep only sources carrying a query string or ending in .swf.
                src = tmpdict['src']
                if "?" in src or src.endswith(".swf"):
                    self.liens.append(src)

        if tag_name == "script":
            self.inscript = 1
            # Every script source is recorded, query string or not.
            if "src" in tmpdict:
                self.liens.append(tmpdict['src'])

        if tag_name == "meta":
            if "http-equiv" in tmpdict and "content" in tmpdict:
                if tmpdict["http-equiv"].lower() == "refresh":
                    content = tmpdict["content"]
                    url_eq_idx = content.lower().find("url=")
                    if url_eq_idx >= 0:
                        target = content[url_eq_idx + 4:].strip()
                        # A quoted refresh URL (content="0; url='/next'")
                        # ends at the quote matching its opening one.
                        if target[:1] in ("'", '"'):
                            target = target[1:].split(target[0], 1)[0]
                        self.liens.append(target)
Exemplo n.º 8
0
                    # Mismatch ! Convert the response text to the encoding detected by BeautifulSoup
                    resp.setEncoding(page_encoding)
            else:
                page_encoding = resp_encoding
            data = resp.getPage()
        else:
            # Can't find an encoding... beware of non-html content
            data = resp.getRawPage()
            if "application/x-shockwave-flash" in mime_type or web_resource.file_ext == "swf":
                try:
                    flash_parser = swf_parser.swf_parser(data)
                    swf_links = flash_parser.getLinks()
                except Exception, err_data:
                    swf_links = err_data[1]
            elif "/x-javascript" in mime_type or "/x-js" in mime_type or "/javascript" in mime_type:
                js_links = lamejs.lamejs(data).getLinks()
            data = ""

        # Manage redirections
        if "location" in info:
            redir = self.correctlink(info["location"], current, current_full_url, currentdir, proto, None)
            if redir is not None:
                if self.__inzone(redir) == 0:
                    self.link_encoding[redir] = self.link_encoding[url]
                    redir = HTTP.HTTPResource(redir, link_depth=current_depth+1)
                    # Is the document not visited yet and not forbidden ?
                    if (redir not in self.browsed_links and
                        redir not in self.tobrowse and
                            not self.isExcluded(redir)):
                        self.tobrowse.append(redir)