Пример #1
0
def watch_crawl(crawl_id):
    """ Watch crawling browsers in local browser

        For each crawl id, the crawl info is fetched and its browsers are
        opened via open_browsers(). A crawl is skipped (with a message,
        unless in quiet mode) when it is headless, is not running, or has
        no browsers.

        :param crawl_id: list of crawl ids to watch
    """
    for id_ in crawl_id:
        res = sesh_get('/crawl/{0}'.format(id_))

        # The skip must happen regardless of quiet mode; only the message
        # is conditional on is_quiet()
        if res.get('headless'):
            if not is_quiet():
                print("Can not watch, crawl is running in headless mode")
            continue

        if res.get('status') != 'running':
            if not is_quiet():
                print('Crawl not running: {0}'.format(id_))
            continue

        browsers = res['browsers']

        if not browsers:
            if not is_quiet():
                print('No Browsers')
            continue

        # Count the 'tabs_done' entries per browser id, so that
        # open_browsers() can compare the count against num_tabs
        done_count = defaultdict(int)

        # 'tabs_done' may be absent from the response: treat as empty
        for info in res.get('tabs_done') or ():
            done_count[info['id']] += 1

        open_browsers(browsers, id_, done_count, res['num_tabs'])
Пример #2
0
def create_profile(browser):
    """ Interactively create one or more browser profile images

        Requests a browser opened to about:blank (request sent with
        settings.shepherd_prefix), opens it in the local web browser, then
        repeatedly prompts for a profile name and commits the browser
        container as a new image tagged with that name, until the user
        declines to continue.

        :param browser: base browser id; used in the request url and stored
                        in the LABEL_BASEBROWSER label of each created image
    """
    res = sesh_get('/api/request/{0}/about:blank'.format(browser),
                   prefix=settings.shepherd_prefix)

    # NOTE(review): if the response has no 'reqid', this is None and the
    # string concatenations below will fail -- confirm the API always sets it
    reqid = res.get('reqid')

    # Container handle; looked up lazily on the first save (see loop below)
    curr_browser = None

    webbrowser.open(settings.view_browsers_prefix + reqid)

    print('A new browser window should have been opened')
    print(
        'You can use the browser to log-in to accounts or otherwise prepare the browser profile'
    )
    print('(The content will not be recorded to WARC)')

    # Each iteration saves one profile image from the same browser container
    while True:
        profile_name = click.prompt(
            'When done, please enter a new name to save the browser profile',
            type=str)

        # Fetch the container only once; it is reused for later iterations
        if not curr_browser:
            curr_browser = docker_api.containers.get('browser-' + reqid)

        # exit_code, output = curr_browser.exec_run('/app/prep-commit.sh')
        # Kill Chrome inside the container before committing the image
        exit_code, output = curr_browser.exec_run(
            'pkill -f "/usr/bin/google-chrome"')
        if not is_quiet():
            print('Killed Chrome to Save Profile for Commit')
            print('Result: {0}'.format(exit_code))
            print(output.decode('utf-8'))

        # Short pause before committing -- presumably to let Chrome finish
        # shutting down and flush the profile to disk; TODO confirm
        time.sleep(1.5)

        # Labels stored on the image: list_profiles() filters on
        # LABEL_BROWSERPROFILE and displays LABEL_BASEBROWSER
        conf = {
            'Labels': {
                LABEL_BROWSERPROFILE: profile_name,
                LABEL_BASEBROWSER: browser
            }
        }

        # PROFILE_PREFIX[:-1] drops the last character of the prefix --
        # presumably the ':' separating repository and tag; confirm
        res = curr_browser.commit(
            repository=PROFILE_PREFIX[:-1],
            tag=profile_name,
            message='Browser Profile',
            conf=conf,
        )

        if not is_quiet():
            print('Created Image: {0} ({1})'.format(res.tags[0], res.short_id))

        print('The browser should have restarted to about:blank')
        if not click.confirm('Continue browsing to create another profile?'):
            break
Пример #3
0
def remove_profile(profile):
    """ Remove the image of a browser profile

        The image is looked up as PROFILE_PREFIX + profile and removed with
        force. If no such image exists, a message is printed (unless in
        quiet mode) and the process exits with status 1.

        :param profile: name of the profile to remove
    """
    image_ref = PROFILE_PREFIX + profile

    try:
        docker_api.images.remove(image_ref, force=True, noprune=False)
    except docker.errors.ImageNotFound:
        if not is_quiet():
            print('Profile "{0}" not found'.format(profile))
        sys.exit(1)
    else:
        # Only reached when the removal succeeded
        if not is_quiet():
            print('Removed profile "{0}"!'.format(profile))
Пример #4
0
def list_crawls():
    """ List all available crawls, most recently started first

        In quiet mode only the crawl ids are printed, one per line.
        Otherwise a table is printed with one column per COLUMNS entry
        (field, header text, width).
    """
    crawls = list(sesh_get('/crawls')['crawls'])
    crawls.sort(key=lambda entry: entry['start_time'], reverse=True)

    if is_quiet():
        for entry in crawls:
            print(entry['id'])

        return

    cell = '{value: <{size}}  '

    def write_cell(value, size):
        # Left-align the value, padded to the column width
        sys.stdout.write(cell.format(value=value, size=size))

    # Header row
    for _, title, width in COLUMNS:
        write_cell(title, width)
    print()

    # One row per crawl
    for entry in crawls:
        for name, _, width in COLUMNS:
            shown = entry[name]
            if name == 'start_time':
                # Time elapsed since the crawl started
                shown = format_duration(shown, None) + ' ago'
            elif name == 'finish_time':
                # Duration from start to finish
                shown = format_duration(entry['start_time'], shown)

            write_cell(shown, width)
        print()
    print()
Пример #5
0
def remove_all():
    """ Stop and remove all crawls

        Fetches the list of crawls and issues a delete request for each,
        printing the removed id unless in quiet mode. The response of each
        delete request is not inspected.
    """
    listing = sesh_get('/crawls')

    # Generator keeps id lookup interleaved with the deletes
    for crawl_ident in (entry['id'] for entry in listing['crawls']):
        sesh_delete('/crawl/{0}'.format(crawl_ident))
        if not is_quiet():
            print('Removed Crawl: {0}'.format(crawl_ident))
Пример #6
0
def start_crawl(crawl_id, browser, headless, behavior_time):
    """ Start one or more existing crawls

        :param crawl_id: list of crawl ids to start
        :param browser: not used by this function
        :param headless: not used by this function
        :param behavior_time: not used by this function
    """
    for id_ in crawl_id:
        started = sesh_post('/crawl/{0}/start'.format(id_))

        # Quiet mode prints the bare id only
        if not is_quiet():
            print('Started Crawl: {0}'.format(started['id']))
        else:
            print(started['id'])
Пример #7
0
def list_profiles():
    """ List all browser profiles

        Profiles are the images carrying the LABEL_BROWSERPROFILE label
        whose first tag starts with PROFILE_PREFIX. In quiet mode only the
        profile names are printed; otherwise a two-column table of profile
        name and base browser is printed.
    """
    images = docker_api.images.list(filters={'label': LABEL_BROWSERPROFILE})

    row = '{profile: <16}  {base}'
    if not is_quiet():
        print(row.format(profile='PROFILE', base='BASE BROWSER'))

    for image in images:
        tags = image.tags
        # Skip untagged images and images not tagged as a profile
        if not tags or not tags[0].startswith(PROFILE_PREFIX):
            continue

        name = tags[0][len(PROFILE_PREFIX):]
        base = image.labels.get(LABEL_BASEBROWSER, '(unknown)')

        if is_quiet():
            print(name)
        else:
            print(row.format(profile=name, base=base))

    if not is_quiet():
        print()
Пример #8
0
def get_profile_image(profile):
    """ Resolve a profile name to the browser id used for crawling

        Verifies that an image named PROFILE_PREFIX + profile exists and
        that its LABEL_BROWSERPROFILE label matches the profile name. The
        module-level docker client is created on first use.

        If the image is missing or the label does not match, a message is
        printed (unless in quiet mode) and the process exits with status 1.

        :param profile: name of the browser profile
        :returns: the string 'profile:' + profile
    """
    global docker_api

    try:
        if not docker_api:
            docker_api = docker.from_env(version='auto')

        image = docker_api.images.get(PROFILE_PREFIX + profile)

        # Explicit comparison instead of `assert`: assert statements are
        # removed under `python -O`, which would skip this validation
        found = image.labels.get(LABEL_BROWSERPROFILE) == profile

    except docker.errors.ImageNotFound:
        found = False

    if not found:
        if not is_quiet():
            print('Profile "{0}" not found'.format(profile))
        sys.exit(1)

    return 'profile:' + profile
Пример #9
0
def remove_crawl(crawl_id):
    """ Remove one or more existing crawls

        Processing stops at the first crawl whose removal does not report
        success; remaining ids are not attempted.

        :param crawl_id: list of crawl ids to remove
    """
    for id_ in crawl_id:
        res = sesh_delete('/crawl/{0}'.format(id_))

        if not res.get('success'):
            # res is the response object (it has .get), not a str, so it
            # must be formatted rather than concatenated to the message
            print('Error removing: {0}'.format(res))
            return

        if is_quiet():
            print(id_)
        else:
            print('Removed Crawl: {0}'.format(id_))
Пример #10
0
def stop_crawl(crawl_id):
    """ Stop one or more existing crawls

        Processing stops at the first crawl whose stop request does not
        report success; remaining ids are not attempted.

        :param crawl_id: list of crawl ids to stop
    """
    for id_ in crawl_id:
        res = sesh_post('/crawl/{0}/stop'.format(id_))

        if not res.get('success'):
            # res is the response object (it has .get), not a str, so it
            # must be formatted rather than concatenated to the message
            print('Error stopping: {0}'.format(res))
            return

        if is_quiet():
            print(id_)
        else:
            print('Stopped Crawl: {0}'.format(id_))
Пример #11
0
def open_browsers(browsers, crawl_id, tabs_done=None, num_tabs=-1):
    """ Open each browser of a crawl in the local web browser

        A browser is skipped when tabs_done is non-empty and its entry for
        that browser equals num_tabs. A progress message is printed for
        every browser unless in quiet mode.

        :param browsers: sequence of browser request ids
        :param crawl_id: id of the crawl, used in the messages only
        :param tabs_done: optional mapping of browser id to a tab count
        :param num_tabs: tab count at which a browser counts as finished
    """
    for position, reqid in enumerate(browsers, 1):
        finished = bool(tabs_done) and tabs_done.get(reqid) == num_tabs

        if finished:
            template = 'Skipping Finished Browser {0} of {1}, ({2}) for crawl {3}'
        else:
            template = 'Opening Browser {0} of {1} ({2}) for crawl {3}'

        if not is_quiet():
            print(template.format(position, len(browsers), reqid, crawl_id))

        if not finished:
            webbrowser.open(settings.view_browsers_prefix + reqid)
Пример #12
0
def create_crawl(
    crawl_spec_file,
    start,
    browser,
    profile,
    coll,
    mode,
    screenshot_coll,
    headless,
    behavior_time,
    watch,
    log,
):
    """ Create a new crawl!

        Each entry under the 'crawls' key of the spec is posted as a
        separate crawl, after applying any command-line overrides.

        :param crawl_spec_file: YAML file with one or more crawls in 'crawls' key
        :param start: If true, start crawl immediately after creation
        :param browser: Browser Docker image to use for crawling (overrides setting in spec)
        :param profile: Browser Profile Docker image to use for crawling (overrides "browser" setting)
        :param coll: Set the collection (overrides setting in spec)
        :param mode: Set the capture mode (overrides setting in spec)
        :param screenshot_coll: Set the collection to save screenshots (overrides setting in spec)
        :param headless: Use headless mode. Browsers can not be opened for watching the crawl
        :param behavior_time: Max duration (in seconds) to run each in-page behavior
        :param watch: Watch all started browsers in a local browser (only if starting crawl)
        :param log: If true, print and follow the logs of the crawl's browsers

    """
    # NOTE(review): yaml.Loader can construct arbitrary Python objects. If
    # spec files can come from untrusted sources, consider yaml.SafeLoader
    root = yaml.load(crawl_spec_file, Loader=yaml.Loader)

    # NOTE(review): within this function `start` only selects the message
    # and gates watching; it is not added to the posted crawl spec
    msg = 'Created and Started' if start else 'Created'

    for crawl_spec in root['crawls']:
        # Command-line values, when given, override the spec
        if headless is not None:
            crawl_spec['headless'] = headless

        if behavior_time is not None:
            crawl_spec['behavior_time'] = behavior_time

        # A profile takes precedence over an explicit browser
        if profile is not None:
            browser = get_profile_image(profile)

        if browser is not None:
            crawl_spec['browser'] = browser

        if coll is not None:
            crawl_spec['coll'] = coll

        if mode is not None:
            crawl_spec['mode'] = mode

        if screenshot_coll is not None:
            crawl_spec['screenshot_coll'] = screenshot_coll

        if not is_quiet():
            print('Creating New Crawl, Please Wait...')

        res = sesh_post('/crawls', json=crawl_spec)

        if is_quiet():
            print(res['id'])
        else:
            print('Crawl {0}: {1}'.format(msg, res['id']))
            print('Status: {0}'.format(res['status']))

        if watch:
            if not start:
                if not is_quiet():
                    print("Can't watch, crawl not started")

            # Use the effective setting: the spec itself may request
            # headless mode even when no command-line override was given
            elif crawl_spec.get('headless'):
                if not is_quiet():
                    print("Can't watch, crawl is running in headless mode")

            else:
                open_browsers(res['browsers'], res['id'])

        if log:
            print_logs(res['browsers'], follow=True, wait=True)