示例#1
0
    else:
        browser_params[i]['save_content'] = SAVE_CONTENT
    browser_params[i]['headless'] = True

# Manager configuration
# Data and logs both go to the same Desktop folder named after the crawl.
manager_params['data_directory'] = manager_params['log_directory'] = (
    '~/Desktop/%s/' % CRAWL_DIRECTORY
)
# Output format is set to 's3', using S3_BUCKET as the bucket and the crawl
# directory name as the s3 directory.
manager_params['output_format'] = 's3'
manager_params['s3_bucket'] = S3_BUCKET
manager_params['s3_directory'] = CRAWL_DIRECTORY

# Allow the use of localstack's mock s3 service.
# When the S3_ENDPOINT environment variable holds a non-empty value, boto3's
# default session is replaced by a LocalS3Session bound to that endpoint, and
# the configured bucket becomes whatever local_s3_bucket returns for S3_BUCKET.
S3_ENDPOINT = os.environ.get('S3_ENDPOINT')
if S3_ENDPOINT:
    boto3.DEFAULT_SESSION = LocalS3Session(endpoint_url=S3_ENDPOINT)
    manager_params['s3_bucket'] = local_s3_bucket(
        boto3.resource('s3'), name=S3_BUCKET
    )

# Instantiates the measurement platform from the manager and browser
# parameters configured above, passing LOGGER_SETTINGS as logger kwargs.
# Commands time out by default after 60 seconds
manager = TaskManager.TaskManager(
    manager_params,
    browser_params,
    logger_kwargs=LOGGER_SETTINGS,
)

# At this point, Sentry should be initiated
# NOTE(review): the initialisation itself is not visible in this snippet;
# presumably it happens during TaskManager setup -- confirm.
# The tagging below only runs when SENTRY_DSN is truthy.
if SENTRY_DSN:
    # Add crawler.py-specific context
    with sentry_sdk.configure_scope() as scope:
        # tags generate breakdown charts and search filters
        # Each tag records one crawl setting: browser count, crawl directory
        # name, and target S3 bucket.
        scope.set_tag('NUM_BROWSERS', NUM_BROWSERS)
        scope.set_tag('CRAWL_DIRECTORY', CRAWL_DIRECTORY)
        scope.set_tag('S3_BUCKET', S3_BUCKET)
示例#2
0
        browser_params[i]["save_content"] = SAVE_CONTENT
    if PREFS:
        browser_params[i]["prefs"] = json.loads(PREFS)

# Manager configuration
# Data and logs both go to the same Desktop folder named after the crawl.
manager_params["data_directory"] = manager_params["log_directory"] = (
    "~/Desktop/%s/" % CRAWL_DIRECTORY
)
# Output format is set to "s3", using S3_BUCKET as the bucket and the crawl
# directory name as the s3 directory.
manager_params["output_format"] = "s3"
manager_params["s3_bucket"] = S3_BUCKET
manager_params["s3_directory"] = CRAWL_DIRECTORY

# Allow the use of localstack's mock s3 service.
# When the S3_ENDPOINT environment variable holds a non-empty value, boto3's
# default session is replaced by a LocalS3Session bound to that endpoint, and
# the configured bucket becomes whatever local_s3_bucket returns for S3_BUCKET.
S3_ENDPOINT = os.environ.get("S3_ENDPOINT")
if S3_ENDPOINT:
    boto3.DEFAULT_SESSION = LocalS3Session(endpoint_url=S3_ENDPOINT)
    manager_params["s3_bucket"] = local_s3_bucket(
        boto3.resource("s3"), name=S3_BUCKET
    )

# Instantiates the measurement platform from the manager and browser
# parameters configured above, passing LOGGER_SETTINGS as logger kwargs.
# Commands time out by default after 60 seconds
manager = TaskManager.TaskManager(
    manager_params,
    browser_params,
    logger_kwargs=LOGGER_SETTINGS,
)

# At this point, Sentry should be initiated
# NOTE(review): the initialisation itself is not visible in this snippet;
# presumably it happens during TaskManager setup -- confirm.
# The tagging below only runs when SENTRY_DSN is truthy.
if SENTRY_DSN:
    # Add crawler.py-specific context
    with sentry_sdk.configure_scope() as scope:
        # tags generate breakdown charts and search filters
        # Each tag records one crawl setting: crawl directory name, target
        # S3 bucket, and the value of DISPLAY_MODE.
        scope.set_tag("CRAWL_DIRECTORY", CRAWL_DIRECTORY)
        scope.set_tag("S3_BUCKET", S3_BUCKET)
        scope.set_tag("DISPLAY_MODE", DISPLAY_MODE)