def _get_xmlns_values(self) -> Set[str]:
    values = {
        helpers.fix_possible_url(tag.attrib["xmlns"])
        for tag in self._tree.iterfind(".[@xmlns]")
    }
    values |= {
        helpers.fix_possible_url(tag.attrib["xmlns"])
        for tag in self._tree.iterfind(".//*[@xmlns]")
    }
    return values
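# A standalone sketch of the xmlns extraction above, assuming self._tree is an lxml-parsed
# document (the ".[@xmlns]" / ".//*[@xmlns]" iterfind patterns suggest lxml). The sample
# markup is fabricated for illustration and is not from the library's test suite.
def _demo_xmlns_extraction() -> None:
    from lxml import etree

    html = (
        b'<html xmlns="http://www.w3.org/1999/xhtml">'
        b'<body><svg xmlns="http://www.w3.org/2000/svg"></svg></body></html>'
    )
    root = etree.fromstring(html, parser=etree.HTMLParser())

    # Check the root tag itself, then every descendant tag, mirroring the two passes above.
    values = {tag.attrib["xmlns"] for tag in root.iterfind(".[@xmlns]")}
    values |= {tag.attrib["xmlns"] for tag in root.iterfind(".//*[@xmlns]")}
    assert values == {"http://www.w3.org/1999/xhtml", "http://www.w3.org/2000/svg"}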
def _get_base_url_from_html(self) -> str:
    tag = self._tree.find(".//base[@href]")
    if tag is not None:
        base_url = helpers.fix_possible_url(tag.attrib["href"])
        return base_url if is_url(base_url) else ""
    return ""
def decode_proofpoint_v2(self) -> str:
    try:
        query_url = self.query_dict["u"][0]

        # When Proofpoint encodes a URL multiple times, "-2D" must be replaced first, since it
        # represents the "-" character that every other character encoding relies on. After that,
        # the order of the remaining replacements only matters in one spot: "_" -> "/" must run
        # before "-5F" -> "_", or the decoded underscores would themselves be turned into slashes.
        possible_url = query_url.replace("-2D", "-")

        replacements = {
            "_": "/",
            "-26": "&",
            "-3A": ":",
            "-3D": "=",
            "-3F": "?",
            "-5F": "_",  # 0x5F is "_"; literal underscores are hex-encoded because "_" itself encodes "/"
        }
        for replace_encoded, replace_decoded in replacements.items():
            possible_url = possible_url.replace(replace_encoded, replace_decoded)

        possible_url = helpers.fix_possible_url(possible_url)
        return possible_url if URL(possible_url).is_url else ""
    except KeyError:
        return ""
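# A minimal, self-contained sketch of the v2 substitution scheme above, using a hypothetical
# encoded value rather than a real Proofpoint URL. It shows the hex-style escapes unwinding
# back into an ordinary URL once "-2D" has been handled first.
def _demo_proofpoint_v2_decode() -> None:
    encoded = "https-3A__www.example.com_path-3Fa-3Db-26c-3Dd"

    decoded = encoded.replace("-2D", "-")
    for old, new in {"_": "/", "-26": "&", "-3A": ":", "-3D": "=", "-3F": "?", "-5F": "_"}.items():
        decoded = decoded.replace(old, new)

    assert decoded == "https://www.example.com/path?a=b&c=d"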
def _get_src_values(self) -> Set[str]:
    values = set()
    for tag in self._tree.iterfind(".//*[@src]"):
        if self.base_url:
            # A base URL exists, so keep the raw value; relative paths get joined against it later.
            values.add(helpers.fix_possible_value(tag.attrib["src"]))
        else:
            values.add(helpers.fix_possible_url(tag.attrib["src"]))

        # Blank out the attribute so later passes over the serialized tree do not extract it again.
        tag.attrib["src"] = ""
    return values
def _get_srcset_values(self) -> Set[str]:
    values = set()
    for tag in self._tree.iterfind(".//*[@srcset]"):
        value = helpers.fix_possible_url(tag.attrib["srcset"])

        # A srcset is a comma-separated list of candidates, each a URL optionally followed by a
        # width or density descriptor (e.g. "img.png 2x"). Keep only the URL portion of each.
        splits = value.split(",")
        values |= {s.strip().split(" ")[0] for s in splits}

        tag.attrib["srcset"] = ""
    return values
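# A small standalone check of the srcset parsing above: descriptors ("1x", "480w") are dropped
# and only the URL portion of each candidate survives. The sample values are illustrative.
def _demo_srcset_split() -> None:
    value = "small.png 1x, large.png 2x, https://cdn.example.com/huge.png 480w"
    candidates = {s.strip().split(" ")[0] for s in value.split(",")}
    assert candidates == {"small.png", "large.png", "https://cdn.example.com/huge.png"}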
def find_urls(self, strict: bool = True, domain_as_url: bool = False) -> Set[str]:
    tok = tokenizer.UTF8Tokenizer(self.blob)

    token_iter = chain(
        tok.get_line_tokens(),
        tok.get_tokens_between_angle_brackets(strict=strict),
        tok.get_tokens_between_backticks(),
        tok.get_tokens_between_brackets(strict=strict),
        tok.get_tokens_between_curly_brackets(strict=strict),
        tok.get_tokens_between_double_quotes(),
        tok.get_tokens_between_parentheses(strict=strict),
        tok.get_tokens_between_single_quotes(),
        tok.get_sentences(),
    )

    split_token_iter = tok.get_split_tokens_after_replace(
        ["<", ">", "`", "[", "]", "{", "}", '"', "'", "(", ")"])

    if domain_as_url:
        tokens = set()
        for token in token_iter:
            if "." in token and "/" in token:
                tokens.add(token)
                continue
            if validators.domain(token):
                tokens.add(token)

        for token in split_token_iter:
            if "." in token and "/" in token:
                tokens.add(token)
                continue
            if validators.domain(token):
                tokens.add(token)
    else:
        tokens = {t for t in token_iter if "." in t and "/" in t}
        tokens |= {t for t in split_token_iter if "." in t and "/" in t}

    valid_urls = URLList()
    for token in tokens:
        # It is common for text files such as email plaintext bodies to encode URLs in the form:
        # http://domain.com<http://actualdomain.com>
        # where the text at the beginning is what gets displayed, and the text inside the <> is
        # the actual URL you are taken to if you click it. In these cases, we don't want the
        # entire string to be treated as one valid URL, but rather each part as a separate URL.
        if "<" in token and token.endswith(">"):
            continue

        valid_urls.append(helpers.fix_possible_url(token, domain_as_url=domain_as_url))

    return set(valid_urls)
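# A minimal illustration of why tokens shaped like "display<actual>" can simply be skipped
# above: the split-token pass (which replaces bracket characters before re-splitting) should
# already yield both URLs as separate tokens, so keeping the combined token would only add a
# bogus URL. This sketch mimics that replacement step with plain str methods; it is not the
# tokenizer's actual implementation.
def _demo_angle_bracket_split() -> None:
    token = "http://domain.com<http://actualdomain.com>"
    for ch in "<>":
        token = token.replace(ch, " ")
    assert token.split() == ["http://domain.com", "http://actualdomain.com"]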
def decode_mandrillapp(self) -> str:
    # The "p" query parameter is URL-safe base64 ("_" in place of "/") containing JSON whose
    # "p" key holds another JSON-encoded string with the embedded URL. The extra "===" ensures
    # the string is sufficiently padded; surplus padding is ignored.
    base64_string = self.query_dict["p"][0].replace("_", "/")
    decoded = base64.b64decode(f"{base64_string}===")

    try:
        outer_json = json.loads(decoded)
        inner_json = json.loads(outer_json["p"])
        possible_url = helpers.fix_possible_url(inner_json["url"])
        return possible_url if URL(possible_url).is_url else ""
    except (json.JSONDecodeError, UnicodeDecodeError):
        return ""
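# A self-contained round trip of the nested payload layout the decoder above expects. The
# sample URL and payload are hypothetical; this only demonstrates the base64-of-JSON-of-JSON
# nesting, not Mandrill's real parameters.
def _demo_mandrillapp_roundtrip() -> None:
    inner = json.dumps({"url": "https://example.com/landing"})
    outer = json.dumps({"p": inner}).encode("utf-8")
    p_param = base64.b64encode(outer).decode("ascii").replace("/", "_")

    # Reverse the substitution and unwrap both JSON layers, as decode_mandrillapp() does.
    decoded = base64.b64decode(f'{p_param.replace("_", "/")}===')
    assert json.loads(json.loads(decoded)["p"])["url"] == "https://example.com/landing"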
def decode_proofpoint_v3(self) -> str:
    try:
        # v3 URLs embed the original URL between "__" markers, with special characters replaced
        # by "*" placeholders. The replaced characters themselves are stored, URL-safe base64
        # encoded, between the ";" and the "!" that follow.
        match = re.search(r"v3/__(.+?)__;(.*?)!", self.value, re.IGNORECASE)
        embedded_url = match.group(1)
        base64_characters = match.group(2)

        decoded_characters = base64.urlsafe_b64decode(f"{base64_characters}===").decode("utf-8")

        # Substitute each decoded character back into the URL, one placeholder at a time.
        for character in decoded_characters:
            embedded_url = embedded_url.replace("*", character, 1)

        embedded_url = helpers.fix_possible_url(embedded_url)
        return embedded_url if URL(embedded_url).is_url else ""
    except AttributeError:
        return ""
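# A worked, self-contained example of the placeholder substitution above. "w6Q" is the
# URL-safe base64 encoding of "ä" (0xC3 0xA4 in UTF-8); the urldefense value is fabricated
# for illustration and is not a real Proofpoint link.
def _demo_proofpoint_v3_decode() -> None:
    value = "https://urldefense.com/v3/__https://example.com/p*ge__;w6Q!!fabricated-token$"
    match = re.search(r"v3/__(.+?)__;(.*?)!", value, re.IGNORECASE)
    embedded_url, base64_characters = match.group(1), match.group(2)

    for character in base64.urlsafe_b64decode(f"{base64_characters}===").decode("utf-8"):
        embedded_url = embedded_url.replace("*", character, 1)

    assert embedded_url == "https://example.com/päge"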
def find_urls(self) -> Set[str]:
    valid_urls = URLList()

    for document_write_url in self._find_document_write_urls():
        valid_urls.append(document_write_url)

    for window_location_url in self._get_window_location_href():
        valid_urls.append(helpers.fix_possible_url(window_location_url))

    for visible_url in self._find_visible_urls():
        valid_urls.append(visible_url)

    for meta_refresh_value in self._get_meta_refresh_values():
        valid_urls.append(meta_refresh_value)

    possible_urls = set()
    possible_urls |= {urljoin(self.base_url, u) for u in self._get_base_url_eligible_values()}

    # Drop any candidate that merely contains a srcset URL as a substring, then add the srcset
    # URLs themselves, joined against the base URL.
    srcset_values = self._get_srcset_values()
    possible_urls = {u for u in possible_urls if not any(srcset_value in u for srcset_value in srcset_values)}
    possible_urls |= {urljoin(self.base_url, u) for u in srcset_values}

    possible_urls |= self._get_tag_attribute_values()

    for possible_url in possible_urls:
        valid_urls.append(helpers.fix_possible_url(possible_url))

    tok = tokenizer.UTF8Tokenizer(self.tree_string)

    # TODO: itertools.product(*zip(string.lower(), string.upper()))
    token_iter = chain(
        tok.get_tokens_between_open_and_close_sequence('"http', '"', strict=True),
        tok.get_tokens_between_open_and_close_sequence('"ftp', '"', strict=True),
        tok.get_tokens_between_open_and_close_sequence("'http", "'", strict=True),
        tok.get_tokens_between_open_and_close_sequence("'ftp", "'", strict=True),
        tok.get_tokens_between_open_and_close_sequence('"HTTP', '"', strict=True),
        tok.get_tokens_between_open_and_close_sequence('"FTP', '"', strict=True),
        tok.get_tokens_between_open_and_close_sequence("'HTTP", "'", strict=True),
        tok.get_tokens_between_open_and_close_sequence("'FTP", "'", strict=True),
    )

    for token in token_iter:
        valid_urls.append(token)

    return set(valid_urls)
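# A sketch of what the TODO above suggests: itertools.product over the paired lower/upper
# characters generates every case permutation of a scheme prefix, so the eight hardcoded
# tokenizer calls could be derived instead of listed. Purely illustrative; the tokenizer
# calls themselves are unchanged.
def _demo_case_permutations() -> None:
    import itertools

    permutations = {"".join(p) for p in itertools.product(*zip("ftp".lower(), "ftp".upper()))}
    assert permutations == {"ftp", "ftP", "fTp", "fTP", "Ftp", "FtP", "FTp", "FTP"}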
def test_fix_possible_url():
    # A scheme-relative prefix, backslashes, and embedded null bytes should all be normalized.
    assert helpers.fix_possible_url("//domain.com\\index\u0000.html") == "https://domain.com/index.html"