예제 #1
0
def run():
    """Set up and run a crawl from command-line arguments, then exit.

    Parses the arguments and configuration, prepares the crawl directories,
    the Tor controller, the browser wrapper and the crawl job, starts a
    virtual display and runs the crawl. ``post_crawl`` and ``stop_xvfb`` are
    always executed after the crawl attempt. The process exits with status 0
    on completion and -1 on a keyboard interrupt.
    """
    args, config = parse_arguments()

    # Crawl directories, then the URL slice to visit.
    build_crawl_dirs(args.url_file)
    urls = parse_url_list(args.url_file, args.start, args.stop)

    # Send crawl logging to the default crawl log file as well.
    add_log_file_handler(wl_log, cm.DEFAULT_CRAWL_LOG)

    # Tor controller built from the "torrc" section of the configuration.
    torrc = ut.get_dict_subconfig(config, args.config, "torrc")
    tor_controller = TorController(cm.TBB_DIR,
                                   torrc_dict=torrc,
                                   pollute=False)

    # Browser built from the "ffpref" section; it reuses the SOCKS port
    # declared in the torrc section.
    browser_prefs = ut.get_dict_subconfig(config, args.config, "ffpref")
    browser = TorBrowserWrapper(cm.TBB_DIR,
                                tbb_logfile_path=cm.DEFAULT_FF_LOG,
                                tor_cfg=USE_RUNNING_TOR,
                                pref_dict=browser_prefs,
                                socks_port=int(torrc['socksport']))

    crawler = crawler_mod.Crawler(browser, tor_controller, args.screenshots,
                                  args.device)

    # Crawl job built from the "job" section and the URL list.
    job_settings = ut.get_dict_subconfig(config, args.config, "job")
    crawl_job = crawler_mod.CrawlJob(job_settings, urls)

    # Virtual display size: taken from the "<a>x<b>" argument when given,
    # otherwise from the defaults.
    # NOTE(review): the first field is used as the height and the second as
    # the width -- confirm this matches the documented argument format.
    if args.virtual_display:
        dimensions = args.virtual_display.split('x')
        xvfb_h = int(dimensions[0])
        xvfb_w = int(dimensions[1])
    else:
        xvfb_h = cm.DEFAULT_XVFB_WIN_H
        xvfb_w = cm.DEFAULT_XVFB_WIN_W
    xvfb_display = start_xvfb(xvfb_w, xvfb_h)

    chdir(cm.CRAWL_DIR)
    try:
        crawler.crawl(crawl_job)
    except KeyboardInterrupt:
        wl_log.warning("Keyboard interrupt! Quitting...")
        sys.exit(-1)
    finally:
        # Runs on success, on interrupt and on any other error.
        post_crawl()
        stop_xvfb(xvfb_display)

    sys.exit(0)
예제 #2
0
    def __init__(self,
                 take_ownership=True, # Tor dies when the Crawler does
                 torrc_config={"CookieAuth": "1"},
                 tor_log="/var/log/tor/tor.log",
                 tor_cell_log="/var/log/tor/tor_cell_seq.log",
                 control_port=9051,
                 socks_port=9050,
                 run_in_xvfb=True,
                 tbb_path=join("/opt","tbb","tor-browser_en-US"),
                 tb_log_path=join(_log_dir,"firefox.log"),
                 tb_tor_cfg=USE_RUNNING_TOR,
                 page_load_timeout=20,
                 wait_on_page=5,
                 wait_after_closing_circuits=0,
                 restart_on_sketchy_exception=True,
                 additional_control_fields={},
                 db_handler=None):
        """Start Tor, optionally Xvfb, and Tor Browser for a crawl.

        :param take_ownership: forwarded to ``launch_tor_with_config``.
        :param torrc_config: base Tor options. The mapping is copied; the
            copy gets ``SocksPort``, ``ControlPort`` and ``Log`` entries and
            is stored as ``self.torrc_config``.
        :param tor_log: file path written into the ``Log`` Tor option.
        :param tor_cell_log: file opened in binary read mode and kept as
            ``self.cell_log``.
        :param control_port: passed to ``find_free_port`` to select
            ``self.control_port``.
        :param socks_port: passed to ``find_free_port`` to select
            ``self.socks_port``.
        :param run_in_xvfb: when truthy, a virtual framebuffer is started
            and kept as ``self.virtual_framebuffer``.
        :param tbb_path: forwarded to ``TorBrowserDriver``.
        :param tb_log_path: forwarded to ``TorBrowserDriver`` as
            ``tbb_logfile_path``.
        :param tb_tor_cfg: forwarded to ``TorBrowserDriver`` as ``tor_cfg``.
        :param page_load_timeout: stored and applied to the browser driver
            via ``set_page_load_timeout``.
        :param wait_on_page: stored on the instance.
        :param wait_after_closing_circuits: stored on the instance.
        :param restart_on_sketchy_exception: stored on the instance.
        :param additional_control_fields: forwarded to
            ``self.get_control_data``.
        :param db_handler: when truthy, ``add_crawl`` is called with the
            control data and its result is stored as ``self.crawlid``.
        """

        self.logger = setup_logging(_log_dir, "crawler")

        # Copy the mapping before adding to it: the default dict is created
        # once at definition time, so mutating it in place would carry one
        # instance's ports and log path into every later instance (and would
        # also modify a dict owned by the caller).
        self.torrc_config = dict(torrc_config)
        self.socks_port = find_free_port(socks_port, control_port)
        self.torrc_config.update({"SocksPort": str(self.socks_port)})
        self.control_port = find_free_port(control_port, self.socks_port)
        self.torrc_config.update({"ControlPort": str(self.control_port)})
        self.torrc_config.update({"Log": "INFO file {}".format(tor_log)})
        self.logger.info("Starting tor process with config "
                         "{torrc_config}.".format(
                             torrc_config=self.torrc_config))
        self.tor_process = launch_tor_with_config(config=self.torrc_config,
                                                  take_ownership=take_ownership)
        self.authenticate_to_tor_controlport()

        self.logger.info("Opening cell log stream...")
        # Kept open on the instance; it is not closed in this method.
        self.cell_log = open(tor_cell_log, "rb")

        # NOTE(review): run_in_xvfb / virtual_framebuffer are only set when
        # Xvfb is requested; this is left unchanged in case other methods
        # test for the attributes' presence -- confirm.
        if run_in_xvfb:
            self.logger.info("Starting Xvfb...")
            self.run_in_xvfb = True
            self.virtual_framebuffer = start_xvfb()

        self.logger.info("Starting Tor Browser...")
        self.tb_driver = TorBrowserDriver(tbb_path=tbb_path,
                                          tor_cfg=tb_tor_cfg,
                                          tbb_logfile_path=tb_log_path,
                                          socks_port=self.socks_port,
                                          control_port=self.control_port)

        self.wait_after_closing_circuits = wait_after_closing_circuits
        self.page_load_timeout = page_load_timeout
        self.tb_driver.set_page_load_timeout(page_load_timeout)
        self.wait_on_page = wait_on_page
        self.restart_on_sketchy_exception = restart_on_sketchy_exception

        # NOTE(review): additional_control_fields also has a mutable default
        # and is passed through as-is -- confirm get_control_data does not
        # modify it.
        self.control_data = self.get_control_data(page_load_timeout,
                                                  wait_on_page,
                                                  wait_after_closing_circuits,
                                                  additional_control_fields)
        self.db_handler = db_handler
        if db_handler:
            self.crawlid = self.db_handler.add_crawl(self.control_data)
예제 #3
0
def launch_tb_with_custom_stem(tbb_dir):
    """Visit check.torproject.org through a Tor process started with Stem.

    Starts a virtual display and a Tor process configured with freshly
    chosen SOCKS/control ports and a temporary data directory, drives Tor
    Browser against that process, prints two page elements and the Tor
    circuits, and finally stops the display and kills the Tor process.

    :param tbb_dir: Tor Browser bundle directory, used to locate the Tor
        binary and passed to ``TorBrowserDriver``.
    """
    xvfb_display = start_xvfb()
    tor_process = None
    try:
        socks_port = free_port()
        control_port = free_port()
        # NOTE(review): the temporary data directory is not removed here.
        tor_data_dir = tempfile.mkdtemp()
        tor_binary = join(tbb_dir, cm.DEFAULT_TOR_BINARY_PATH)
        print("SOCKS port: %s, Control port: %s" % (socks_port, control_port))

        torrc = {
            'ControlPort': str(control_port),
            'SOCKSPort': str(socks_port),
            'DataDirectory': tor_data_dir
        }
        tor_process = launch_tbb_tor_with_stem(tbb_path=tbb_dir,
                                               torrc=torrc,
                                               tor_binary=tor_binary)
        with Controller.from_port(port=control_port) as controller:
            controller.authenticate()
            with TorBrowserDriver(tbb_dir,
                                  socks_port=socks_port,
                                  control_port=control_port,
                                  tor_cfg=cm.USE_STEM) as driver:
                driver.load_url("https://check.torproject.org",
                                wait_on_page=3)
                print(driver.find_element_by("h1.on").text)
                print(driver.find_element_by(".content > p").text)
            print_tor_circuits(controller)
    finally:
        # Release both external processes even when a step above fails;
        # the Tor process is killed even if stopping the display raises.
        try:
            stop_xvfb(xvfb_display)
        finally:
            if tor_process is not None:
                tor_process.kill()
예제 #4
0
def headless_visit(tbb_dir):
    """Load check.torproject.org in a virtual display and save a screenshot.

    The screenshot is written as ``headless_screenshot.png`` next to this
    file. The virtual display is stopped even if the visit fails.

    :param tbb_dir: directory passed to ``TorBrowserDriver``.
    """
    out_img = join(dirname(realpath(__file__)), "headless_screenshot.png")
    # start a virtual display
    xvfb_display = start_xvfb()
    try:
        with TorBrowserDriver(tbb_dir) as driver:
            driver.load_url("https://check.torproject.org")
            driver.get_screenshot_as_file(out_img)
            print("Screenshot is saved as %s" % out_img)
    finally:
        # Do not leave the Xvfb process behind when the browser raises.
        stop_xvfb(xvfb_display)
예제 #5
0
def _record_load_times(driver, out_img, col):
    """Load each URL of ``load_table`` with *driver* and time the load.

    For every row the URL in column ``URLS`` is loaded, a screenshot is
    written to *out_img*, and the elapsed load time in seconds is stored in
    column *col* of that row.
    """
    for row in load_table:
        start_time = time.clock_gettime_ns(time.CLOCK_REALTIME)
        driver.load_url(row[URLS])
        end_time = time.clock_gettime_ns(time.CLOCK_REALTIME)

        driver.get_screenshot_as_file(out_img)
        print("Screenshot is saved as %s" % out_img)

        # Nanoseconds to seconds.
        elapsed_time = (end_time - start_time) / 1000000000
        print("Load time: ", str(elapsed_time) + "s")
        row[col] = elapsed_time


def headless_visit(tbb_dir):
    """Time page loads without a bridge and with each known bridge type.

    Runs inside a virtual display. Load times are first recorded in the
    ``VANILLA`` column of ``load_table`` using a plain Tor Browser, then in
    column 2 for ``obfs4`` and column 3 for ``meek-azure`` bridges. The
    results are written with ``write_csv`` and the display is stopped, also
    when an error occurs.

    :param tbb_dir: directory passed to ``TorBrowserDriver``.
    """
    out_img = join(dirname(realpath(__file__)), "headless_screenshot.png")
    # Result column for each supported bridge type.
    bridge_columns = {"obfs4": 2, "meek-azure": 3}

    # start a virtual display
    xvfb_display = start_xvfb()
    try:
        with TorBrowserDriver(tbb_dir) as driver:
            _record_load_times(driver, out_img, VANILLA)

        for bridge in BRIDGE_TYPE:
            # Resolve the column for *this* bridge before launching a
            # browser, so an unknown bridge can neither start a browser nor
            # reuse (and overwrite) the column of the previous bridge.
            col = bridge_columns.get(bridge)
            if col is None:
                break
            print(bridge + "..........")
            with TorBrowserDriver(tbb_dir,
                                  default_bridge_type=bridge) as bdriver:
                _record_load_times(bdriver, out_img, col)

        print("About to print..........")
        write_csv()
    finally:
        stop_xvfb(xvfb_display)
예제 #6
0
파일: test.py 프로젝트: day0x0000/pytortest
def main():
    """Take a screenshot of a URL with Tor Browser inside a virtual display.

    Command-line arguments: ``tbb_path`` (required), ``output_dir``
    (optional, defaults to the module-level ``workdir``) and ``url``
    (optional, defaults to https://check.torproject.org). The image is
    saved as ``screenshot.png`` inside ``output_dir``.

    :return: 1 when no URL is available, otherwise ``None``.
    """
    desc = "Take a screenshot using TorBrowserDriver"
    default_url = "https://check.torproject.org"
    parser = ArgumentParser(description=desc)
    parser.add_argument('tbb_path')
    # nargs='?' is required for a positional's default to ever be used.
    parser.add_argument('output_dir', nargs='?', default=workdir)
    parser.add_argument('url', nargs='?', default=default_url)
    args = parser.parse_args()
    out_img = realpath(join(args.output_dir, "screenshot.png"))

    # Validate the URL actually selected by the user, not the constant.
    if not args.url:
        print("ERROR: cannot detect main URL")
        return 1

    xvfb_display = start_xvfb()
    try:
        with TorBrowserDriver(args.tbb_path, headless=True) as driver:
            # Visit the requested URL; it falls back to the default above.
            visit_and_screenshot(driver, args.url, out_img)
    finally:
        # Stop the virtual display even if the visit fails.
        stop_xvfb(xvfb_display)
def pullpage():
    """Fetch a page with Firefox in a virtual display and return its source.

    Loads the target URL, refreshes it, saves a screenshot to a fixed path
    and returns the page source. The browser is quit and the virtual display
    stopped even when one of these steps raises.

    :return: the page source as a ``str``.
    """
    target = 'https://whatismyipaddress.com/'

    xvfb_display = start_xvfb()
    try:
        driver = webdriver.Firefox()
        try:
            driver.get(target)
            driver.refresh()
            driver.get_screenshot_as_file(
                '/GMDelight/GMDelight/static/headless_screenshot.png')
            return str(driver.page_source)
        finally:
            # Runs before the return value is handed back, so the browser
            # is closed before the display it is attached to.
            driver.quit()
    finally:
        stop_xvfb(xvfb_display)
예제 #8
0
    def __init__(
            self,
            take_ownership=True,  # Tor dies when the Crawler does
            torrc_config={"CookieAuth": "1"},
            tor_log="/var/log/tor/tor.log",
            tor_cell_log="/var/log/tor/tor_cell_seq.log",
            control_port=9051,
            socks_port=9050,
            run_in_xvfb=True,
            tbb_path=join("/opt", "tbb", "tor-browser_en-US"),
            tb_log_path=join(_log_dir, "firefox.log"),
            tb_tor_cfg=USE_RUNNING_TOR,
            page_load_timeout=20,
            wait_on_page=5,
            wait_after_closing_circuits=0,
            restart_on_sketchy_exception=True,
            additional_control_fields={},
            db_handler=None):
        """Start Tor, optionally Xvfb, and Tor Browser for a crawl.

        :param take_ownership: forwarded to ``launch_tor_with_config``.
        :param torrc_config: base Tor options. The mapping is copied; the
            copy gets ``SocksPort``, ``ControlPort`` and ``Log`` entries and
            is stored as ``self.torrc_config``.
        :param tor_log: file path written into the ``Log`` Tor option.
        :param tor_cell_log: file opened in binary read mode and kept as
            ``self.cell_log``.
        :param control_port: passed to ``find_free_port`` to select
            ``self.control_port``.
        :param socks_port: passed to ``find_free_port`` to select
            ``self.socks_port``.
        :param run_in_xvfb: when truthy, a virtual framebuffer is started
            and kept as ``self.virtual_framebuffer``.
        :param tbb_path: forwarded to ``TorBrowserDriver``.
        :param tb_log_path: forwarded to ``TorBrowserDriver`` as
            ``tbb_logfile_path``.
        :param tb_tor_cfg: forwarded to ``TorBrowserDriver`` as ``tor_cfg``.
        :param page_load_timeout: stored and applied to the browser driver
            via ``set_page_load_timeout``.
        :param wait_on_page: stored on the instance.
        :param wait_after_closing_circuits: stored on the instance.
        :param restart_on_sketchy_exception: stored on the instance.
        :param additional_control_fields: forwarded to
            ``self.get_control_data``.
        :param db_handler: when truthy, ``add_crawl`` is called with the
            control data and its result is stored as ``self.crawlid``.
        """

        self.logger = setup_logging(_log_dir, "crawler")
        # Set stem logging level to INFO - "high level library activity"
        stem.util.log.get_logger().setLevel(stem.util.log.Runlevel.INFO)

        # Copy the mapping before adding to it: the default dict is created
        # once at definition time, so mutating it in place would carry one
        # instance's ports and log path into every later instance (and would
        # also modify a dict owned by the caller).
        self.torrc_config = dict(torrc_config)
        self.socks_port = find_free_port(socks_port, control_port)
        self.torrc_config.update({"SocksPort": str(self.socks_port)})
        self.control_port = find_free_port(control_port, self.socks_port)
        self.torrc_config.update({"ControlPort": str(self.control_port)})
        self.torrc_config.update({"Log": "INFO file {}".format(tor_log)})
        self.logger.info("Starting tor process with config "
                         "{torrc_config}.".format(
                             torrc_config=self.torrc_config))
        self.tor_process = launch_tor_with_config(
            config=self.torrc_config, take_ownership=take_ownership)
        self.authenticate_to_tor_controlport()

        self.logger.info("Opening cell log stream...")
        # Kept open on the instance; it is not closed in this method.
        self.cell_log = open(tor_cell_log, "rb")

        # NOTE(review): run_in_xvfb / virtual_framebuffer are only set when
        # Xvfb is requested; this is left unchanged in case other methods
        # test for the attributes' presence -- confirm.
        if run_in_xvfb:
            self.logger.info("Starting Xvfb...")
            self.run_in_xvfb = True
            self.virtual_framebuffer = start_xvfb()

        self.logger.info("Starting Tor Browser...")
        self.tb_driver = TorBrowserDriver(tbb_path=tbb_path,
                                          tor_cfg=tb_tor_cfg,
                                          tbb_logfile_path=tb_log_path,
                                          socks_port=self.socks_port,
                                          control_port=self.control_port)

        self.wait_after_closing_circuits = wait_after_closing_circuits
        self.page_load_timeout = page_load_timeout
        self.tb_driver.set_page_load_timeout(page_load_timeout)
        self.wait_on_page = wait_on_page
        self.restart_on_sketchy_exception = restart_on_sketchy_exception

        # NOTE(review): additional_control_fields also has a mutable default
        # and is passed through as-is -- confirm get_control_data does not
        # modify it.
        self.control_data = self.get_control_data(page_load_timeout,
                                                  wait_on_page,
                                                  wait_after_closing_circuits,
                                                  additional_control_fields)
        self.db_handler = db_handler
        if db_handler:
            self.crawlid = self.db_handler.add_crawl(self.control_data)
예제 #9
0
def pytest_sessionstart(session):
    """Prepare session-wide test resources in ``test_conf``.

    When ``TRAVIS`` is set in the environment a Tor process is launched.
    Otherwise a virtual display is started, unless ``NO_XVFB`` is set.
    """
    if "TRAVIS" in environ:
        test_conf["tor_process"] = launch_tor()
    elif "NO_XVFB" not in environ:
        test_conf["xvfb_display"] = start_xvfb()
예제 #10
0
from tbselenium.utils import start_xvfb, stop_xvfb
from tbselenium.tbdriver import TorBrowserDriver
from os.path import join, dirname, realpath

# Save a screenshot of check.torproject.org next to this script, rendering
# Tor Browser inside a virtual display.
out_img = join(dirname(realpath(__file__)), "headless_screenshot.png")
xvfb_display = start_xvfb()
try:
    with TorBrowserDriver(
            '/home/manivannan/pythonexamle/selenium_example/tor-browser_en-US'
    ) as driver:
        driver.load_url("https://check.torproject.org")
        driver.get_screenshot_as_file(out_img)
        print("Screenshot is saved as %s" % out_img)
finally:
    # Stop the virtual display even if the browser fails to start or load.
    stop_xvfb(xvfb_display)
예제 #11
0
def pytest_sessionstart(session):
    """Prepare session-wide test resources in ``test_conf``.

    A virtual display is started unless ``TRAVIS`` is set in the environment
    or ``NO_XVFB`` is set to ``"1"``. A Tor process is always launched; its
    temporary data directory and process handle are stored.
    """
    xvfb_disabled = environ.get("NO_XVFB") == "1"
    if "TRAVIS" not in environ and not xvfb_disabled:
        test_conf["xvfb_display"] = start_xvfb()
    test_conf["temp_data_dir"], test_conf["tor_process"] = launch_tor()
def pytest_sessionstart(session):
    """Set up the virtual display and Tor process used by the test session.

    The display is skipped when ``TRAVIS`` is present in the environment or
    when ``NO_XVFB`` equals ``"1"``. The values returned by ``launch_tor``
    are stored in ``test_conf`` as ``temp_data_dir`` and ``tor_process``.
    """
    on_travis = "TRAVIS" in environ
    xvfb_opted_out = "NO_XVFB" in environ and environ["NO_XVFB"] == "1"
    if not (on_travis or xvfb_opted_out):
        test_conf["xvfb_display"] = start_xvfb()

    temp_data_dir, tor_process = launch_tor()
    test_conf["temp_data_dir"] = temp_data_dir
    test_conf["tor_process"] = tor_process
예제 #13
0
    def prepare_driver(disable_cookies=False,
                       tor=False,
                       v=False,
                       headless=False):
        """Build a Selenium webdriver according to the given switches.

        Parameters
        ----------
        disable_cookies : Boolean
            True to get a Firefox driver whose profile autostarts private
            browsing and sets the cookie behavior preference to 2. Takes
            precedence over ``tor``.
        tor : Boolean
            True to get a Tor Browser driver backed by a Tor process
            launched through Stem.
        v : Boolean
            True to print progress messages.
        headless : Boolean
            Assigned to the Firefox options' ``headless`` attribute; for
            the Tor driver a virtual display is started instead.

        Returns
        -------
        WebDriver
            A selenium or tbselenium webdriver.
        xvfb_display
            The value returned by ``start_xvfb`` for a headless Tor driver,
            otherwise None.
        tor_process
            The value returned by ``launch_tbb_tor_with_stem`` for a Tor
            driver, otherwise None.

        All three values are None when launching Tor fails with an OSError
        whose message contains 'timeout'.

        """
        options = Options()
        if v and headless:
            print("Setting headless mode...")
        options.headless = headless

        if disable_cookies:
            profile = webdriver.FirefoxProfile()
            # set incognito mode
            profile.set_preference("browser.privatebrowsing.autostart", True)
            # disable cookies
            profile.set_preference("network.cookie.cookieBehavior", 2)
            plain_driver = webdriver.Firefox(options=options,
                                             firefox_profile=profile)
            return plain_driver, None, None

        if not tor:
            return webdriver.Firefox(options=options), None, None

        # Tor Browser path from here on.
        if v:
            print("Configuring tor browser...")
        tbb_dir = Driver.TOR_PATH
        xvfb_display = start_xvfb() if headless else None
        try:
            tor_process = launch_tbb_tor_with_stem(tbb_path=tbb_dir)
        except OSError as e:
            if 'timeout' not in str(e):
                raise e
            print(
                'Error: Tor connection timeout. Check URL or Internet connection'
            )
            # NOTE(review): a display started above is not stopped on this
            # path -- confirm whether the caller is expected to handle it.
            return None, None, None

        # Tor driver constructor
        driver = TorBrowserDriver(tbb_dir, tor_cfg=cm.USE_STEM)
        return driver, xvfb_display, tor_process
예제 #14
0
def launch_tor_with_custom_stem(datalist, browser):
    """Visit every entry of *datalist* while capturing traffic with tcpdump.

    Sets up a Tor process (retrying up to ``cm.TRYCNT`` times), starts a
    virtual display, and for each entry sets up a browser, starts tcpdump,
    loads ``https://<entry[0]>``, waits, optionally takes a screenshot and
    logs the result. Per-entry failures are logged and the crawl continues;
    the Tor process and the virtual display are released at the end.

    :param datalist: sequence of entries. Index 0 is used as the host name
        to visit and index 2 as a numeric flag that selects screenshots
        (0 or 2) and the pause length (3 vs. anything else).
        NOTE(review): the meaning of the flag values is not visible here.
    :param browser: one of ``'TBB'``, ``'FF'`` or ``'CR'``, selecting the
        browser setup function.
    """
    print("length of data: ", len(datalist))
    tor_binary = join(cm.TorProxypath, cm.DEFAULT_TOR_BINARY_PATH)
    tor_process, controller = 0, 0
    # Bound before the try block so the outer ``finally`` can test it: when
    # Tor setup fails the display is never started, and referencing an
    # unassigned name there would raise UnboundLocalError during cleanup.
    xvfb_display = None
    try:
        TRYTOR_CNT = cm.TRYCNT
        while TRYTOR_CNT > 0 and tor_process == 0 and controller == 0:
            print("try to setup tor:", str(TRYTOR_CNT))
            tor_process, controller = TorSetup(tor_binary)
            TRYTOR_CNT -= 1
        if tor_process == 0:
            raise TorSetupError
        print("finish tor proxy setup...")
        xvfb_display = start_xvfb()  # virtual display
        for ele in datalist:
            t = getTime()
            savepath, out_img = SetOutputPath(ele, t)
            # 0 marks "not created yet" for both the tcpdump process and the
            # browser driver; reset per entry so the ``finally`` below never
            # sees an unbound name or a driver left over from a previous
            # entry.
            p, driver = 0, 0
            try:
                TRYCNT = cm.TRYCNT
                while driver == 0 and TRYCNT != 0:
                    print("try to setup tbb:", str(TRYCNT))
                    # Only the Tor Browser setup function takes arguments.
                    args = (cm.driverpath, controller,
                            ele[2]) if browser == 'TBB' else ()
                    options = {
                        'TBB': TBBSetup,
                        'FF': FFSetup,
                        'CR': ChromeSetup
                    }
                    driver = options[browser](*args)
                    TRYCNT -= 1
                if driver == 0:
                    raise TBBSetupError

                cmd = "tcpdump -i %s tcp and not port ssh -w %s" % (
                    cm.netInterface, savepath)
                print('cmd = ', cmd)
                cmd = cmd.split(' ')
                p = subprocess.Popen(cmd)
                try:
                    # Bound the page load; the alarm is cancelled on success
                    # and again in ``finally`` for the failure paths.
                    timeout(cm.VISITPAGE_TIMEOUT)
                    driver.get('https://' + ele[0])
                    cancel_timeout()
                    time.sleep(cm.DURATION_VISIT_PAGE)
                    p.terminate()
                    if (ele[2] == 0 or ele[2] == 2):
                        driver.get_screenshot_as_file(out_img)
                    writeLog(str(t) + "," + ele[0] + "," + str(ele[2]))
                    print("Finish tcpdump sleep...")
                except TimeExceededError:
                    writeLog("Error crawling," + ele[0] + "," + str(ele[2]) +
                             "\n" + str("Page visit Timeout"))
                finally:
                    cancel_timeout()
            except TBBSetupError:
                print("[crawl.py error]: unable to setup TBB")
                writeLog("[crawl.py error]: unable to setup TBB")
            except Exception as e:
                with open(cm.ErrorFilePath, 'a+') as fw:
                    fw.write(ele[0] + "," + str(e) + "\n")
                writeLog("Error crawling," + ele[0] + "," + str(ele[2]) +
                         "\n" + str(e))
            finally:
                # Per-entry cleanup: tcpdump, Tor streams, browser, pause,
                # temporary files and leftover processes.
                if p != 0 and p.returncode != 0:
                    try:
                        p.terminate()
                    except Exception as e:
                        writeLog("[crawl.py] tcpdump terminate error: " +
                                 str(e))
                if controller != 0:
                    cleanupStream(controller, str(ele[2]), ele[0])
                if driver != 0:
                    try:
                        # Guard against a browser that hangs on quit.
                        timeout(30)
                        driver.quit()
                        cancel_timeout()
                    except Exception as e:
                        cancel_timeout()
                        writeLog("[crawl.py] driver quit error: " + str(e))
                if ele[2] != 3:
                    time.sleep(cm.PAUSE_BETWEEN_INSTANCES)
                else:
                    time.sleep(cm.PAUSE_BETWEEN_SITES)
                RemoveTmpFile()
                RemoveProcess()
    except TorSetupError:
        print("[crawl.py] unable to set up tor proxy")
        writeLog("[crawl.py] unable to set up tor proxy")
    except Exception as e:
        print("[crawl.py]launch_tor_with_custom_stem Error")
        print("Error:", str(e))
        writeLog("[crawl.py]launch_tor_with_custom_stem Error : " + str(e))
    finally:
        if tor_process != 0:
            tor_process.kill()
        if xvfb_display is not None:
            stop_xvfb(xvfb_display)