Example #1
# minimal imports this snippet relies on; URL_REGEX is assumed to be a
# compiled pattern provided by the surrounding project
import re
from urllib import unquote  # Python 2; on Python 3 use urllib.parse.unquote

def extract_webcache_url(webcache_url, splitter="+"):
    """Pull the original URL back out of a Google webcache URL."""
    webcache_url = unquote(webcache_url)
    # webcache URLs look like '...cache:<digest>:<original-url>+<search terms>'
    webcache_regex = re.compile(r"cache:(.{,16})?:")
    data = webcache_regex.split(webcache_url)
    # data[2] is everything after the 'cache:<digest>:' prefix
    to_extract = data[2].split(splitter)
    extracted_to_test = to_extract[0]
    if URL_REGEX.match(extracted_to_test):
        return extracted_to_test
    return None
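A quick sketch of how this helper might be exercised; the sample webcache URL and the URL_REGEX stand-in below are illustrative assumptions, not values from the project:

import re
from urllib import unquote  # Python 2; use urllib.parse.unquote on Python 3

# stand-in for the project's URL_REGEX (an assumption for this sketch)
URL_REGEX = re.compile(r"(?:https?://)?[\w.-]+\.[a-zA-Z]{2,}\S*")

sample = ("https://webcache.googleusercontent.com/search?"
          "q=cache:AbCdEfGh1234:example.com/page.php%3Fid%3D1+some+terms")
print(extract_webcache_url(sample))  # -> example.com/page.php?id=1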
Example #2
                    tor=opt.useTor, batch=opt.runInBatch
                )
                __run_attacks_main()

            except Exception as e:
                logger.exception(set_color(
                    "ran into exception '{}' and cannot continue, saved to current log file".format(e),
                    level=50
                ))
                request_issue_creation()

        # spider a given webpage for all available URLs
        elif opt.spiderWebSite:
            problem_identifiers = ["http://", "https://"]
            if not URL_REGEX.match(opt.spiderWebSite):
                err_msg = "URL did not match a true URL{}"
                if not any(m in opt.spiderWebSite for m in problem_identifiers):
                    err_msg = err_msg.format(" issue seems to be that http:// "
                                             "or https:// is not present in the URL")
                else:
                    err_msg = err_msg.format("")
                raise InvalidInputProvided(err_msg)
            else:
                if URL_QUERY_REGEX.match(opt.spiderWebSite):
                    question_msg = (
                        "it is recommended to not use a URL that has a GET(query) parameter in it, "
                        "would you like to continue"
                    )
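The scheme check in this fragment is easy to isolate; a minimal sketch under assumed stand-ins for URL_REGEX, URL_QUERY_REGEX, and the InvalidInputProvided exception:

import re

# illustrative stand-ins for the project's patterns (assumptions)
URL_REGEX = re.compile(r"https?://[\w.-]+\.[a-zA-Z]{2,}\S*")
URL_QUERY_REGEX = re.compile(r"https?://\S+\?\S+=\S*")

class InvalidInputProvided(Exception):
    """Stand-in for the project's exception type."""

def validate_spider_target(url):
    # mirror the fragment: explain a failed match when the scheme is missing
    if not URL_REGEX.match(url):
        hint = ("" if url.startswith(("http://", "https://"))
                else " issue seems to be that http:// or https:// is not present in the URL")
        raise InvalidInputProvided("URL did not match a valid URL{}".format(hint))
    return bool(URL_QUERY_REGEX.match(url))  # True -> URL carries a GET(query) parameter

print(validate_spider_target("https://example.com/page.php?id=1"))  # True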
Example #3
def search_multiple_pages(query, link_amount, verbose=False, **kwargs):
    """search multiple pages of results using Google's API client"""

    def __config_proxy(proxy_string):
        proxy_type_schema = {
            "http": httplib2.socks.PROXY_TYPE_HTTP,
            "socks4": httplib2.socks.PROXY_TYPE_SOCKS4,
            "socks5": httplib2.socks.PROXY_TYPE_SOCKS5
        }
        proxy_type = get_proxy_type(proxy_string)[0]
        proxy_dict = proxy_string_to_dict(proxy_string)
        proxy_config = httplib2.ProxyInfo(
            proxy_type=proxy_type_schema[proxy_type],
            proxy_host="".join(proxy_dict.keys()),
            proxy_port="".join(proxy_dict.values())
        )
        return proxy_config

    proxy, agent = kwargs.get("proxy", None), kwargs.get("agent", None)

    if proxy is not None:
        if verbose:
            logger.debug(set_color(
                "configuring to use proxy '{}'...".format(proxy), level=10
            ))
        # note: the ProxyInfo built here is returned but never wired into an
        # HTTP client (e.g. httplib2.Http(proxy_info=...)), so it currently
        # has no effect on the search below
        __config_proxy(proxy)

    if agent is not None:
        if verbose:
            logger.debug(set_color(
                "settings user-agent to '{}'...".format(agent), level=10
            ))

    logger.warning(set_color(
        "multiple pages will be searched using Google's API client, searches may be blocked after a certain "
        "amount of time...", level=30
    ))
    results, found = set(), 0
    limit = link_amount
    index = google_api.search(query, user_agent=agent, safe="on")
    try:
        while limit > 0:
            results.add(next(index))
            limit -= 1
            found += 1
    except Exception as e:
        if "Error 503" in str(e):
            logger.fatal(set_color(
                "Google is blocking the current IP address, dumping already found URLs...", level=50
            ))
        # any other exception (including the result generator running out)
        # simply ends the loop with whatever has been collected so far

    retval = set()
    for url in results:
        if URL_REGEX.match(url) and URL_QUERY_REGEX.match(url):
            if verbose:
                logger.debug(set_color(
                    "found '{}'...".format(url), level=10
                ))
            retval.add(url)

    if len(retval) != 0:
        logger.info(set_color(
            "a total of {} links found out of requested {}...".format(
                len(retval), link_amount
            )
        ))
        write_to_log_file(list(retval), URL_LOG_PATH, "url-log-{}.log")
    else:
        logger.error(set_color(
            "unable to extract URL's from results...", level=40
        ))
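As noted in __config_proxy above, an httplib2.ProxyInfo object only takes effect once it is handed to an Http instance; a minimal sketch of that wiring (host and port are placeholder values):

import httplib2

# build a SOCKS5 proxy configuration and attach it to an HTTP client
proxy_info = httplib2.ProxyInfo(
    proxy_type=httplib2.socks.PROXY_TYPE_SOCKS5,
    proxy_host="127.0.0.1",
    proxy_port=9050
)
http_client = httplib2.Http(proxy_info=proxy_info)
response, content = http_client.request("http://example.com", "GET")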
Example #4
def parse_search_results(
        query, url_to_search, verbose=False, **kwargs):
    """
      Parse a webpage from Google for URL's with a GET(query) parameter
    """
    splitter = "&"
    retval = set()
    query_url = None

    proxy_string, user_agent = kwargs.get("proxy", None), kwargs.get("agent", None)

    if verbose:
        logger.debug(set_color(
            "checking for user-agent and proxy configuration...", level=10
        ))

    user_agent_info = "adjusting user-agent header to {}..."
    if user_agent != DEFAULT_USER_AGENT:
        user_agent_info = user_agent_info.format(user_agent.strip())
    else:
        user_agent_info = user_agent_info.format("default user agent '{}'".format(DEFAULT_USER_AGENT))

    proxy_string_info = "setting proxy to {}..."
    if proxy_string is not None:
        # convert the raw proxy string into a dict so requests can consume it
        proxy_string = proxy_string_to_dict(proxy_string)
        proxy_string_info = proxy_string_info.format(
            ''.join(proxy_string.keys()) + "://" + ''.join(proxy_string.values()))
    else:
        proxy_string_info = "no proxy configuration detected..."

    headers = {
        "Connection": "close",
        "user-agent": user_agent
    }
    logger.info(set_color(
        "attempting to gather query URL..."
    ))
    try:
        query_url = get_urls(query, url_to_search, verbose=verbose, user_agent=user_agent, proxy=proxy_string)
    except Exception as e:
        if "WebDriverException" in str(e):
            logger.exception(set_color(
                "it seems that you exited the browser, please allow the browser "
                "to complete it's run so that Zeus can bypass captchas and API "
                "calls", level=50
            ))
        elif "'/usr/lib/firefoxdriver/webdriver.xpi'" in str(e):
            logger.fatal(set_color(
                "firefox was not found in the default location on your system, "
                "check your installation and make sure it is in /usr/lib, if you "
                "find it there, restart your system and try again...", level=50
            ))
        else:
            logger.exception(set_color(
                "{} failed to gather the URL from search engine, caught exception '{}' "
                "exception has been logged to current log file...".format(
                    os.path.basename(__file__), str(e).strip()), level=50)
            )
            request_issue_creation()
        shutdown()
    logger.info(set_color(
        "URL successfully gathered, searching for GET parameters..."
    ))

    logger.info(set_color(proxy_string_info))
    logger.info(set_color(user_agent_info))
    # send the headers with the request itself; updating req.headers after the
    # call only mutates the response object and has no effect
    req = requests.get(query_url, proxies=proxy_string, headers=headers)
    found_urls = URL_REGEX.findall(req.text)
    url_skip_schema = ("maps.google", "play.google", "youtube")
    for urls in list(found_urls):
        for url in list(urls):
            url = unquote(url)
            if not any(u in url for u in url_skip_schema):
                if URL_QUERY_REGEX.match(url) and not any(l in url for l in URL_EXCLUDES):
                    if isinstance(url, unicode):
                        url = str(url).encode("utf-8")
                    if "webcache" in url:
                        logger.info(set_color(
                            "received webcache URL, extracting URL from webcache..."
                        ))
                        webcache_url = url
                        url = extract_webcache_url(webcache_url)
                        if url is None:
                            logger.warning(set_color(
                                "unable to extract URL from given webcache URL '{}'...".format(
                                    webcache_url
                                ), level=30
                            ))
                            continue
                    if verbose:
                        try:
                            logger.debug(set_color(
                                "found '{}'...".format(url.split(splitter)[0]), level=10
                            ))
                        except TypeError:
                            logger.debug(set_color(
                                "found '{}'...".format(str(url).split(splitter)[0]), level=10
                            ))
                        except AttributeError:
                            logger.debug(set_color(
                                "found '{}'...".format(str(url)), level=10
                            ))
                    retval.add(url.split(splitter)[0])
    logger.info(set_color(
        "found a total of {} URLs with a GET parameter...".format(len(retval))
    ))
    if len(retval) != 0:
        write_to_log_file(retval, URL_LOG_PATH, "url-log-{}.log")
    else:
        logger.critical(set_color(
            "did not find any usable URLs with the given query '{}' "
            "using search engine '{}'...".format(query, url_to_search), level=50
        ))
        shutdown()
    return list(retval) if len(retval) != 0 else None
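The nested loop over found_urls exists because re.findall returns a tuple per match when the pattern contains capturing groups; a small sketch with an assumed two-group pattern:

import re

# assumed stand-in for URL_REGEX: two alternatives, each in its own group
pattern = re.compile(r"(https?://[\w./?=&%-]+)|(www\.[\w./?=&%-]+)")
text = 'see <a href="http://example.com/page.php?id=1">and</a> www.example.org/x'

for groups in pattern.findall(text):   # each hit is a tuple, one slot per group
    for url in groups:                 # only one slot in each tuple is non-empty
        if url:
            print(url)
# -> http://example.com/page.php?id=1
# -> www.example.org/x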
Example #5
def search_multiple_pages(query, link_amount, verbose=False, **kwargs):
    """
    search multiple pages of results for links; this search is not done through Google
    """
    proxy = kwargs.get("proxy", None)
    agent = kwargs.get("agent", None)
    xforward = kwargs.get("xforward", False)
    batch = kwargs.get("batch", False)
    show_success = kwargs.get("show_success", False)
    attrib, desc = "a", "href"
    retval = set()
    search_engine = AUTHORIZED_SEARCH_ENGINES["search-results"]

    logger.warning(
        set_color("searching multiple pages will not be done on Google",
                  level=30))

    if not parse_blacklist(query, BLACKLIST_FILE_PATH, batch=batch):
        shutdown()

    if not xforward:
        headers = {"Connection": "close", "user-agent": agent}
    else:
        ip_list = (create_random_ip(), create_random_ip(), create_random_ip())
        headers = {
            "Connection": "close",
            "user-agent": agent,
            "X-Forwarded-For": "{}, {}, {}".format(ip_list[0], ip_list[1], ip_list[2])
        }

    page_number = 1
    try:
        while len(retval) <= link_amount:
            if verbose:
                logger.debug(
                    set_color("searching page number {}".format(page_number),
                              level=10))
            if page_number % 10 == 0:
                logger.info(
                    set_color("currently on page {} of search results".format(
                        page_number)))
            # these are request headers, not query parameters, so they belong
            # in the headers argument
            page_request = requests.get(
                search_engine.format(page_number, query, page_number),
                headers=headers,
                proxies=proxy_string_to_dict(proxy))
            if page_request.status_code == 200:
                html_page = page_request.content
                soup = BeautifulSoup(html_page, "html.parser")
                if not NO_RESULTS_REGEX.findall(str(soup)):
                    for link in soup.find_all(attrib):
                        redirect = link.get(desc)
                        if redirect is not None:
                            if not any(ex in redirect for ex in URL_EXCLUDES):
                                if URL_REGEX.match(redirect):
                                    retval.add(redirect)
                    if page_number < MAX_PAGE_NUMBER:
                        page_number += 1
                    else:
                        logger.warning(
                            set_color("hit max page number {}".format(
                                MAX_PAGE_NUMBER),
                                      level=30))
                        break
                else:
                    logger.warning(
                        set_color("no more results found for given query '{}'".
                                  format(query),
                                  level=30))
                    break
    except KeyboardInterrupt:
        logger.error(
            set_color("user aborted, dumping already found URL(s)", level=40))
        write_to_log_file(retval, URL_LOG_PATH, URL_FILENAME)
        logger.info(
            set_color("found a total of {} URL(s)".format(len(retval)),
                      level=25))
        shutdown()
    except Exception as e:
        logger.exception(
            set_color("Zeus ran into an unexpected error '{}'".format(e),
                      level=50))
        request_issue_creation()
        shutdown()

    if len(retval) > 0:
        logger.info(
            set_color(
                "a total of {} URL(s) found out of the requested {}".format(
                    len(retval), link_amount),
                level=25))
        file_path = write_to_log_file(retval, URL_LOG_PATH, URL_FILENAME)
        if show_success:
            with open(file_path) as log_file:
                amount_of_urls = len(log_file.readlines())
            success_rate = calculate_success(amount_of_urls)
            logger.info(
                set_color("provided query has a {} success rate".format(
                    success_rate)))
        return list(retval)
    else:
        logger.warning(
            set_color(
                "did not find any links with given query '{}', writing to blacklist"
                .format(query),
                level=30))
        write_to_log_file(query, BLACKLIST_FILE_PATH, BLACKLIST_FILENAME)
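The xforward branch spoofs an X-Forwarded-For chain of three random addresses; a minimal sketch of what create_random_ip presumably does (the helper below is a hypothetical stand-in, not the project's implementation):

import random

def create_random_ip():
    """Hypothetical stand-in: four random octets joined with dots."""
    return ".".join(str(random.randint(1, 254)) for _ in range(4))

ip_list = tuple(create_random_ip() for _ in range(3))
headers = {
    "Connection": "close",
    "user-agent": "Mozilla/5.0",
    "X-Forwarded-For": "{}, {}, {}".format(*ip_list)
}
print(headers["X-Forwarded-For"])  # e.g. '10.23.41.7, 172.9.88.120, 5.66.2.14'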
Example #6
def parse_search_results(query, url_to_search, verbose=False, **kwargs):
    """
      Parse a webpage from Google for URL's with a GET(query) parameter
    """
    possible_leftovers = URLParser(None).possible_leftovers
    splitter = "&amp;"
    retval = set()
    query_url = None

    parse_webcache = kwargs.get("parse_webcache", False)
    pull_all = kwargs.get("pull_all", False)
    proxy_string = kwargs.get("proxy", None)
    user_agent = kwargs.get("agent", None)
    forward_for = kwargs.get("forward_for", False)
    tor = kwargs.get("tor", False)
    batch = kwargs.get("batch", False)
    show_success = kwargs.get("show_success", False)

    if verbose:
        logger.debug(set_color("parsing blacklist", level=10))
    parse_blacklist(query, BLACKLIST_FILE_PATH, batch=batch)

    if verbose:
        logger.debug(
            set_color("checking for user-agent and proxy configuration",
                      level=10))

    if not parse_webcache and "google" in url_to_search:
        logger.warning(
            set_color(
                "will not parse webcache URL's (to parse webcache pass -W)",
                level=30))
    if not pull_all:
        logger.warning(
            set_color(
                "only pulling URLs with GET(query) parameters (to pull all URL's pass -E)",
                level=30))

    user_agent_info = "adjusting user-agent header to {}"
    if user_agent != DEFAULT_USER_AGENT:
        user_agent_info = user_agent_info.format(user_agent.strip())
    else:
        user_agent_info = user_agent_info.format(
            "default user agent '{}'".format(DEFAULT_USER_AGENT))

    proxy_string_info = "setting proxy to {}"
    if proxy_string is not None:
        proxy_string = proxy_string_to_dict(proxy_string)
        proxy_string_info = proxy_string_info.format(
            ''.join(proxy_string.keys()) + "://" +
            ''.join(proxy_string.values()))
    elif tor:
        proxy_string = proxy_string_to_dict("socks5://127.0.0.1:9050")
        proxy_string_info = proxy_string_info.format("tor proxy settings")
    else:
        proxy_string_info = "no proxy configuration detected"

    if forward_for:
        ip_to_use = (create_random_ip(), create_random_ip(),
                     create_random_ip())
        if verbose:
            logger.debug(
                set_color(
                    "random IP addresses generated for headers '{}'".format(
                        ip_to_use),
                    level=10))

        headers = {
            HTTP_HEADER.CONNECTION: "close",
            HTTP_HEADER.USER_AGENT: user_agent,
            HTTP_HEADER.X_FORWARDED_FOR: "{}, {}, {}".format(
                ip_to_use[0], ip_to_use[1], ip_to_use[2])
        }
    else:
        headers = {
            HTTP_HEADER.CONNECTION: "close",
            HTTP_HEADER.USER_AGENT: user_agent
        }
    logger.info(set_color("attempting to gather query URL"))
    try:
        query_url = get_urls(query,
                             url_to_search,
                             verbose=verbose,
                             user_agent=user_agent,
                             proxy=proxy_string,
                             tor=tor,
                             batch=batch,
                             xforward=forward_for)
    except Exception as e:
        if "'/usr/lib/firefoxdriver/webdriver.xpi'" in str(e):
            logger.fatal(
                set_color(
                    "firefox was not found in the default location on your system, "
                    "check your installation and make sure it is in /usr/lib, if you "
                    "find it there, restart your system and try again",
                    level=50))
        elif "connection refused" in str(e).lower():
            logger.fatal(
                set_color(
                    "there are to many sessions of firefox opened and selenium cannot "
                    "create a new one",
                    level=50))
            run_fix(
                "would you like to attempt to auto clean the open sessions",
                "sudo sh {}".format(CLEANUP_TOOL_PATH),
                "kill off the open sessions of firefox and re-run Zeus",
                exit_process=True)
        elif "Program install error!" in str(e):
            logger.error(
                set_color(
                    "seems the program is having some trouble installing would you like "
                    "to try and automatically fix this issue",
                    level=40))
            run_fix(
                "would you like to attempt to fix this issue automatically",
                "sudo sh {}".format(FIX_PROGRAM_INSTALL_PATH),
                "you can manually try and re-install Xvfb to fix the problem",
                exit_process=True)
        elif "Message: Reached error page:" in str(e):
            logger.fatal(
                set_color(
                    "geckodriver has hit an error that usually means it needs to be reinstalled",
                    level=50))
            question = prompt(
                "would you like to attempt a reinstallation of the geckodriver",
                opts="yN")
            if question.lower().startswith("y"):
                logger.warning(
                    set_color(
                        "rewriting all executed information, path information, and removing geckodriver",
                        level=30))
                rewrite_all_paths()
                logger.info(
                    set_color(
                        "all paths rewritten, you will be forced to re-install everything next run of Zeus"
                    ))
            else:
                logger.fatal(
                    set_color(
                        "you will need to remove the geckodriver from /usr/bin and reinstall it",
                        level=50))
                shutdown()
        elif "Unable to find a matching set of capabilities" in str(e):
            logger.fatal(
                set_color(
                    "it appears that firefox, selenium, and geckodriver are not playing nice with one another",
                    level=50))
            run_fix(
                "would you like to attempt to resolve this issue automatically",
                "sudo sh {}".format(REINSTALL_TOOL),
                ("you will need to reinstall firefox to a later version, update selenium, and reinstall the "
                 "geckodriver to continue using Zeus"),
                exit_process=True)
        else:
            logger.exception(
                set_color(
                    "{} failed to gather the URL from search engine, caught exception '{}' "
                    "exception has been logged to current log file".format(
                        os.path.basename(__file__),
                        str(e).strip()),
                    level=50))
            request_issue_creation()
        shutdown()
    logger.info(
        set_color("URL successfully gathered, searching for GET parameters"))

    logger.info(set_color(proxy_string_info))

    try:
        # headers must go through the headers argument, not params
        req = requests.get(query_url, proxies=proxy_string, headers=headers)
    except ConnectionError:
        logger.warning(
            set_color(
                "target machine refused connection, delaying and trying again",
                level=30))
        time.sleep(3)
        req = requests.get(query_url, proxies=proxy_string, headers=headers)

    logger.info(set_color(user_agent_info))
    found_urls = URL_REGEX.findall(req.text)
    for urls in list(found_urls):
        for url in list(urls):
            url = unquote(url)
            if not any(u in url for u in URL_EXCLUDES):
                if not url == "http://" and not url == "https://":
                    if URL_REGEX.match(url):
                        if isinstance(url, unicode):
                            url = str(url).encode("utf-8")
                        if pull_all:
                            retval.add(url.split(splitter)[0])
                        else:
                            if URL_QUERY_REGEX.match(url.split(splitter)[0]):
                                retval.add(url.split(splitter)[0])
                        if verbose:
                            try:
                                logger.debug(
                                    set_color("found '{}'".format(
                                        url.split(splitter)[0]),
                                              level=10))
                            except TypeError:
                                logger.debug(
                                    set_color("found '{}'".format(
                                        str(url).split(splitter)[0]),
                                              level=10))
                            except AttributeError:
                                logger.debug(
                                    set_color("found '{}'".format(str(url)),
                                              level=10))
    true_retval = set()
    for url in list(retval):
        if any(l in url for l in possible_leftovers):
            url = URLParser(url).strip_url_leftovers()
        if parse_webcache:
            if "webcache" in url:
                logger.info(set_color("found a webcache URL, extracting"))
                url = URLParser(url).extract_webcache_url()
                if verbose:
                    logger.debug(set_color("found '{}'".format(url), level=15))
                true_retval.add(url)
            else:
                true_retval.add(url)
        else:
            true_retval.add(url)

    if len(true_retval) != 0:
        file_path = write_to_log_file(true_retval, URL_LOG_PATH, URL_FILENAME)
        if show_success:
            with open(file_path) as log_file:
                amount_of_urls = len(log_file.readlines())
            success_rate = calculate_success(amount_of_urls)
            logger.info(
                set_color("provided query has a {} success rate".format(
                    success_rate)))
    else:
        logger.fatal(
            set_color(
                "did not find any URLs with given query '{}', writing query to blacklist"
                .format(query),
                level=50))
        write_to_log_file(query,
                          BLACKLIST_FILE_PATH,
                          BLACKLIST_FILENAME,
                          blacklist=True)
        shutdown()
    logger.info(
        set_color("found a total of {} URLs with given query '{}'".format(
            len(true_retval), query)))
    return list(true_retval) if len(true_retval) != 0 else None
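The post-processing loop above leans on URLParser to strip trailing markup and unwrap webcache URLs; a minimal sketch of that clean-up pass, with hypothetical leftover fragments and a simplified extractor (both are assumptions, not the project's URLParser):

import re
from urllib import unquote  # Python 2; urllib.parse.unquote on Python 3

# hypothetical examples of trailing markup that survives scraping
POSSIBLE_LEFTOVERS = ('"', "'", "</a>", "<strong>")

def strip_url_leftovers(url):
    """Chop the URL at the first leftover fragment found in it."""
    for leftover in POSSIBLE_LEFTOVERS:
        if leftover in url:
            url = url.split(leftover)[0]
    return url

def extract_webcache_url(url):
    """Simplified webcache unwrap (see Example #1 for the full version)."""
    match = re.search(r"cache:(?:.{,16})?:([^+]+)", unquote(url))
    return match.group(1) if match else None

cleaned = set()
for url in ('http://example.com/page.php?id=1"</a>',
            "https://webcache.googleusercontent.com/search?q=cache:AbCd1234:example.com/x+y"):
    if any(l in url for l in POSSIBLE_LEFTOVERS):
        url = strip_url_leftovers(url)
    if "webcache" in url:
        url = extract_webcache_url(url) or url
    cleaned.add(url)
print(sorted(cleaned))  # ['example.com/x', 'http://example.com/page.php?id=1']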