Exemplo n.º 1
0
def brozzle_page():
    '''
    Command line utility entry point for brozzling a single page. Opens url in
    a browser, running some javascript behaviors, and prints outlinks.
    '''
    arg_parser = argparse.ArgumentParser(
            prog=os.path.basename(sys.argv[0]),
            description='brozzle-page - brozzle a single page',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    arg_parser.add_argument('url', metavar='URL', help='page url')
    arg_parser.add_argument(
            '-e', '--chrome-exe', dest='chrome_exe',
            default=suggest_default_chome_exe(),
            help='executable to use to invoke chrome')
    arg_parser.add_argument(
            '--proxy', dest='proxy', default=None,
            help='http proxy')
    arg_parser.add_argument(
            '--enable-warcprox-features', dest='enable_warcprox_features',
            action='store_true', help=(
                'enable special features that assume the configured proxy '
                'is warcprox'))
    _add_common_options(arg_parser)

    args = arg_parser.parse_args(args=sys.argv[1:])
    _configure_logging(args)

    site = brozzler.Site(
            id=-1, seed=args.url, proxy=args.proxy,
            enable_warcprox_features=args.enable_warcprox_features)
    page = brozzler.Page(url=args.url, site_id=site.id)
    worker = brozzler.BrozzlerWorker(frontier=None)

    def on_screenshot(screenshot_png):
        OK_CHARS = (string.ascii_letters + string.digits)
        filename = '/tmp/{}-{:%Y%m%d%H%M%S}.png'.format(
                ''.join(ch if ch in OK_CHARS else '_' for ch in args.url),
                datetime.datetime.now())
        # logging.info('len(screenshot_png)=%s', len(screenshot_png))
        with open(filename, 'wb') as f:
            f.write(screenshot_png)
        logging.info('wrote screenshot to %s', filename)

    browser = brozzler.Browser(chrome_exe=args.chrome_exe)
    browser.start(proxy=site.proxy)
    try:
        outlinks = worker.brozzle_page(
                browser, site, page, on_screenshot=on_screenshot)
        logging.info('outlinks: \n\t%s', '\n\t'.join(sorted(outlinks)))
    except brozzler.ReachedLimit as e:
        logging.error('reached limit %s', e)
    finally:
        browser.stop()
Exemplo n.º 2
0
def brozzle_page(argv=None):
    '''
    Command line utility entry point for brozzling a single page. Opens url in
    a browser, running some javascript behaviors, and prints outlinks.
    '''
    argv = argv or sys.argv
    arg_parser = argparse.ArgumentParser(
        prog=os.path.basename(argv[0]),
        description='brozzle-page - brozzle a single page',
        formatter_class=BetterArgumentDefaultsHelpFormatter)
    arg_parser.add_argument('url', metavar='URL', help='page url')
    arg_parser.add_argument('-e',
                            '--chrome-exe',
                            dest='chrome_exe',
                            default=suggest_default_chrome_exe(),
                            help='executable to use to invoke chrome')
    arg_parser.add_argument(
        '--behavior-parameters',
        dest='behavior_parameters',
        default=None,
        help=('json blob of parameters to populate the javascript behavior '
              'template, e.g. {"parameter_username":"******",'
              '"parameter_password":"******"}'))
    arg_parser.add_argument(
        '--username',
        dest='username',
        default=None,
        help='use this username to try to log in if a login form is found')
    arg_parser.add_argument(
        '--password',
        dest='password',
        default=None,
        help='use this password to try to log in if a login form is found')
    arg_parser.add_argument('--proxy',
                            dest='proxy',
                            default=None,
                            help='http proxy')
    arg_parser.add_argument('--skip-extract-outlinks',
                            dest='skip_extract_outlinks',
                            action='store_true',
                            help=argparse.SUPPRESS)
    arg_parser.add_argument('--skip-visit-hashtags',
                            dest='skip_visit_hashtags',
                            action='store_true',
                            help=argparse.SUPPRESS)
    arg_parser.add_argument('--skip-youtube-dl',
                            dest='skip_youtube_dl',
                            action='store_true',
                            help=argparse.SUPPRESS)
    add_common_options(arg_parser, argv)

    args = arg_parser.parse_args(args=argv[1:])
    configure_logging(args)

    behavior_parameters = {}
    if args.behavior_parameters:
        behavior_parameters = json.loads(args.behavior_parameters)
    site = brozzler.Site(
        None, {
            'id': -1,
            'seed': args.url,
            'behavior_parameters': behavior_parameters,
            'username': args.username,
            'password': args.password
        })
    page = brozzler.Page(None, {'url': args.url, 'site_id': site.id})
    worker = brozzler.BrozzlerWorker(
        frontier=None,
        proxy=args.proxy,
        skip_extract_outlinks=args.skip_extract_outlinks,
        skip_visit_hashtags=args.skip_visit_hashtags,
        skip_youtube_dl=args.skip_youtube_dl)

    def on_screenshot(screenshot_png):
        OK_CHARS = (string.ascii_letters + string.digits)
        filename = '/tmp/{}-{:%Y%m%d%H%M%S}.png'.format(
            ''.join(ch if ch in OK_CHARS else '_' for ch in args.url),
            datetime.datetime.now())
        # logging.info('len(screenshot_png)=%s', len(screenshot_png))
        with open(filename, 'wb') as f:
            f.write(screenshot_png)
        logging.info('wrote screenshot to %s', filename)

    browser = brozzler.Browser(chrome_exe=args.chrome_exe)
    try:
        browser.start(proxy=args.proxy)
        outlinks = worker.brozzle_page(
            browser,
            site,
            page,
            on_screenshot=on_screenshot,
            enable_youtube_dl=not args.skip_youtube_dl)
        logging.info('outlinks: \n\t%s', '\n\t'.join(sorted(outlinks)))
    except brozzler.ReachedLimit as e:
        logging.error('reached limit %s', e)
    finally:
        browser.stop()
Exemplo n.º 3
0
def brozzle_page(argv=None):
    '''
    Command line utility entry point for brozzling a single page. Opens url in
    a browser, running some javascript behaviors, and prints outlinks.
    '''
    argv = argv or sys.argv
    arg_parser = argparse.ArgumentParser(
            prog=os.path.basename(argv[0]),
            description='brozzle-page - brozzle a single page',
            formatter_class=BetterArgumentDefaultsHelpFormatter)
    arg_parser.add_argument('url', metavar='URL', help='page url')
    arg_parser.add_argument(
            '-e', '--chrome-exe', dest='chrome_exe',
            default=suggest_default_chrome_exe(),
            help='executable to use to invoke chrome')
    arg_parser.add_argument(
            '--behavior-parameters', dest='behavior_parameters',
            default=None, help=(
                'json blob of parameters to populate the javascript behavior '
                'template, e.g. {"parameter_username":"******",'
                '"parameter_password":"******"}'))
    arg_parser.add_argument(
            '--username', dest='username', default=None,
            help='use this username to try to log in if a login form is found')
    arg_parser.add_argument(
            '--password', dest='password', default=None,
            help='use this password to try to log in if a login form is found')
    arg_parser.add_argument(
            '--proxy', dest='proxy', default=None, help='http proxy')
    arg_parser.add_argument(
            '--skip-extract-outlinks', dest='skip_extract_outlinks',
            action='store_true')
    arg_parser.add_argument(
            '--skip-visit-hashtags', dest='skip_visit_hashtags',
            action='store_true')
    arg_parser.add_argument(
            '--skip-youtube-dl', dest='skip_youtube_dl', action='store_true')
    add_common_options(arg_parser, argv)

    args = arg_parser.parse_args(args=argv[1:])
    configure_logging(args)
    brozzler.chrome.check_version(args.chrome_exe)

    behavior_parameters = {}
    if args.behavior_parameters:
        behavior_parameters = json.loads(args.behavior_parameters)
    site = brozzler.Site(None, {
        'id': -1, 'seed': args.url, 'behavior_parameters': behavior_parameters,
        'username': args.username, 'password': args.password})
    page = brozzler.Page(None, {'url': args.url, 'site_id': site.id})
    worker = brozzler.BrozzlerWorker(frontier=None, proxy=args.proxy,
        skip_extract_outlinks=args.skip_extract_outlinks,
        skip_visit_hashtags=args.skip_visit_hashtags,
        skip_youtube_dl=args.skip_youtube_dl)

    def on_screenshot(screenshot_png):
        OK_CHARS = (string.ascii_letters + string.digits)
        filename = '/tmp/{}-{:%Y%m%d%H%M%S}.png'.format(
                ''.join(ch if ch in OK_CHARS else '_' for ch in args.url),
                datetime.datetime.now())
        # logging.info('len(screenshot_png)=%s', len(screenshot_png))
        with open(filename, 'wb') as f:
            f.write(screenshot_png)
        logging.info('wrote screenshot to %s', filename)

    browser = brozzler.Browser(chrome_exe=args.chrome_exe)
    try:
        browser.start(proxy=args.proxy)
        outlinks = worker.brozzle_page(
                browser, site, page, on_screenshot=on_screenshot,
                enable_youtube_dl=not args.skip_youtube_dl)
        logging.info('outlinks: \n\t%s', '\n\t'.join(sorted(outlinks)))
    except brozzler.ReachedLimit as e:
        logging.error('reached limit %s', e)
    except brozzler.PageInterstitialShown as e:
        logging.error('page interstitial shown %s', e)
    finally:
        browser.stop()
Exemplo n.º 4
0
def brozzle_page():
    '''
    Command line utility entry point for brozzling a single page. Opens url in
    a browser, running some javascript behaviors, and prints outlinks.
    '''
    arg_parser = argparse.ArgumentParser(
            prog=os.path.basename(sys.argv[0]),
            description='brozzle-page - brozzle a single page',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    arg_parser.add_argument('url', metavar='URL', help='page url')
    arg_parser.add_argument(
            '-e', '--chrome-exe', dest='chrome_exe',
            default=suggest_default_chrome_exe(),
            help='executable to use to invoke chrome')
    arg_parser.add_argument(
            '--behavior-parameters', dest='behavior_parameters',
            default=None, help=(
                'json blob of parameters to populate the javascript behavior '
                'template, e.g. {"parameter_username":"******",'
                '"parameter_password":"******"}'))
    arg_parser.add_argument(
            '--proxy', dest='proxy', default=None,
            help='http proxy')
    arg_parser.add_argument(
            '--enable-warcprox-features', dest='enable_warcprox_features',
            action='store_true', help=(
                'enable special features that assume the configured proxy '
                'is warcprox'))
    _add_common_options(arg_parser)

    args = arg_parser.parse_args(args=sys.argv[1:])
    _configure_logging(args)

    behavior_parameters = {}
    if args.behavior_parameters:
        behavior_parameters = json.loads(args.behavior_parameters)
    site = brozzler.Site(
            id=-1, seed=args.url, proxy=args.proxy,
            enable_warcprox_features=args.enable_warcprox_features,
            behavior_parameters=behavior_parameters)
    page = brozzler.Page(url=args.url, site_id=site.id)
    worker = brozzler.BrozzlerWorker(frontier=None)

    def on_screenshot(screenshot_png):
        OK_CHARS = (string.ascii_letters + string.digits)
        filename = '/tmp/{}-{:%Y%m%d%H%M%S}.png'.format(
                ''.join(ch if ch in OK_CHARS else '_' for ch in args.url),
                datetime.datetime.now())
        # logging.info('len(screenshot_png)=%s', len(screenshot_png))
        with open(filename, 'wb') as f:
            f.write(screenshot_png)
        logging.info('wrote screenshot to %s', filename)

    browser = brozzler.Browser(chrome_exe=args.chrome_exe)
    browser.start(proxy=site.proxy)
    try:
        outlinks = worker.brozzle_page(
                browser, site, page, on_screenshot=on_screenshot)
        logging.info('outlinks: \n\t%s', '\n\t'.join(sorted(outlinks)))
    except brozzler.ReachedLimit as e:
        logging.error('reached limit %s', e)
    finally:
        browser.stop()