def _get_store(db: DatabaseHandler, object_type: str) -> 'mediawords.key_value_store.KeyValueStore':
    """Return the S3 store or the PostgreSQL store, depending on MC_PUBLIC_STORE_TYPE.

    Production systems should use s3, because urls generated by this module
    will only work for the s3 store. The postgresql store is only for testing.

    Arguments:
    db - database handle, used only to resolve the S3 directory name
    object_type - name of the kind of object stored; selects the table / directory

    Returns the store object for the configured store type.

    Raises McPublicStoreUnknownType if MC_PUBLIC_STORE_TYPE is neither
    'postgresql' nor 's3'.
    """
    # NOTE(review): the return annotation assumes both store classes derive from
    # mediawords.key_value_store.KeyValueStore -- confirm.
    store_type = env_value("MC_PUBLIC_STORE_TYPE")

    if store_type == 'postgresql':
        return PostgreSQLStore(table='public_store.%s' % object_type)

    if store_type == 's3':
        access_key_id = env_value("MC_PUBLIC_AMAZON_S3_ACCESS_KEY_ID")
        secret_access_key = env_value("MC_PUBLIC_AMAZON_S3_SECRET_ACCESS_KEY")
        bucket_name = env_value("MC_PUBLIC_AMAZON_S3_BUCKET_NAME")
        directory_name = _get_directory_name(db, object_type)

        return AmazonS3Store(
            access_key_id=access_key_id,
            secret_access_key=secret_access_key,
            bucket_name=bucket_name,
            directory_name=directory_name,
            compression_method=mediawords.key_value_store.KeyValueStore.Compression.GZIP,
        )

    # The exception has to be raised; returning it would hand callers an
    # exception object where they expect a store.
    raise McPublicStoreUnknownType(f'unknown value for MC_PUBLIC_STORE_TYPE: {store_type}')
def test_env_value_required():
    """An unset variable raises by default and yields None when not required."""
    missing_name = random_string(length=16)

    # Default lookup is "required", so a missing variable must raise
    with pytest.raises(McConfigEnvironmentVariableUnsetException):
        env_value(name=missing_name)

    # Optional lookup of the same missing variable gives None instead
    optional_value = env_value(name=missing_name, required=False)
    assert optional_value is None
def test_env_value_empty_string():
    """A variable set to '' raises unless empty strings are explicitly allowed."""
    blank_name = random_string(length=16)
    os.environ[blank_name] = ''

    # By default an empty value is treated like an unset one
    with pytest.raises(McConfigEnvironmentVariableUnsetException):
        env_value(name=blank_name)

    # With allow_empty_string=True the empty value is returned as is
    allowed_value = env_value(name=blank_name, allow_empty_string=True)
    assert allowed_value == ''
def get_object_hash(object_id: str) -> int:
    """Return a salted MD5 hash of object_id as an integer, so that it is not discoverable.

    The salt comes from MC_PUBLIC_STORE_SALT. For the 'postgresql' store type
    (MC_PUBLIC_STORE_TYPE) the hash is truncated; otherwise the full 128-bit
    MD5 value is returned.
    """
    salt = env_value('MC_PUBLIC_STORE_SALT')
    store_type = env_value('MC_PUBLIC_STORE_TYPE')

    salted_id = f"{salt}-{object_id}"
    digest = hashlib.md5(salted_id.encode('utf-8')).hexdigest()
    full_hash = int(digest, 16)

    if store_type != 'postgresql':
        return full_hash

    # Keep only the low-order bits, because the postgresql store cannot handle
    # the full hash. NOTE(review): the mask has 15 hex digits, i.e. 60 bits,
    # not 64; it is left untouched so that existing hashes stay valid.
    return full_hash & 0xFFFFFFFFFFFFFFF
def _get_s3_store() -> 'AmazonS3Store':
    """Build the Amazon S3 store from the MC_PUBLIC_AMAZON_S3_* environment variables.

    Reads the access key id, secret access key, bucket name and directory
    name from the environment and returns an AmazonS3Store.
    """
    # Keyword arguments are evaluated top to bottom, so the environment is
    # read in the same order as before.
    return AmazonS3Store(
        access_key_id=env_value("MC_PUBLIC_AMAZON_S3_ACCESS_KEY_ID"),
        secret_access_key=env_value("MC_PUBLIC_AMAZON_S3_SECRET_ACCESS_KEY"),
        bucket_name=env_value("MC_PUBLIC_AMAZON_S3_BUCKET_NAME"),
        directory_name=env_value("MC_PUBLIC_AMAZON_S3_DIRECTORY_NAME"),
        # NOTE(review): GZIP is assumed here to match what _get_store() passes -- confirm.
        compression_method=mediawords.key_value_store.KeyValueStore.Compression.GZIP,
    )
def parallel_get_num_parallel() -> int:
    """Return the parallel connection count.

    Read from MC_USERAGENT_PARALLEL_GET_NUM_PARALLEL; falls back to 10 when
    the lookup returns None.
    """
    configured = env_value('MC_USERAGENT_PARALLEL_GET_NUM_PARALLEL', required=False)
    return 10 if configured is None else int(configured)
def parallel_get_per_domain_timeout() -> int:
    """Return the per-domain timeout, in seconds.

    Read from MC_USERAGENT_PARALLEL_GET_PER_DOMAIN_TIMEOUT; any falsy value
    (None or an empty string) falls back to 1.
    """
    configured = env_value('MC_USERAGENT_PARALLEL_GET_PER_DOMAIN_TIMEOUT', required=False)
    return int(configured) if configured else 1
def test_env_value():
    """A variable that is set in the environment is returned verbatim."""
    name = random_string(length=16)
    expected = random_string(length=16)

    os.environ[name] = expected

    actual = env_value(name=name)
    assert actual == expected
def unsubscribe_address() -> str:
    """Return the email to which unsubscribe / account deletion requests should be sent.

    Read from MC_EMAIL_UNSUBSCRIBE. If the value is missing or does not
    contain an '@', the built-in default address is returned instead.
    """
    configured = env_value('MC_EMAIL_UNSUBSCRIBE', required=False, allow_empty_string=True)

    if configured is not None and '@' in configured:
        return configured

    return '*****@*****.**'
def read_all_from_s3() -> bool:
    """Return whether or not to read all non-inline downloads from S3.

    Read from MC_DOWNLOADS_READ_ALL_FROM_S3 and interpreted as an integer
    flag. An unset, empty or whitespace-only value means False.

    Raises ValueError if the value is set to something that is not an integer.
    """
    value = env_value('MC_DOWNLOADS_READ_ALL_FROM_S3', required=False, allow_empty_string=True)

    # Empty strings are explicitly allowed by the lookup above, but int('')
    # raises ValueError, so treat them the same as "unset".
    if value is None or not value.strip():
        return False

    return bool(int(value))
def cache_s3() -> bool:
    """Return whether to enable local Amazon S3 download cache.

    Read from MC_DOWNLOADS_CACHE_S3 and interpreted as an integer flag.
    An unset, empty or whitespace-only value means False.

    Raises ValueError if the value is set to something that is not an integer.
    """
    value = env_value('MC_DOWNLOADS_CACHE_S3', required=False, allow_empty_string=True)

    # Empty strings are explicitly allowed by the lookup above, but int('')
    # raises ValueError, so treat them the same as "unset".
    if value is None or not value.strip():
        return False

    return bool(int(value))
def blacklist_url_pattern() -> Optional[Pattern]:
    """Return the URL pattern for which we should fail all of the HTTP(s) requests.

    Read from MC_USERAGENT_BLACKLIST_URL_PATTERN. Returns None when the
    variable is unset or empty; otherwise the value compiled as a
    case-insensitive, Unicode-aware regular expression.
    """
    raw_pattern = env_value('MC_USERAGENT_BLACKLIST_URL_PATTERN', required=False, allow_empty_string=True)

    if not raw_pattern:
        return None

    return re.compile(raw_pattern, flags=re.IGNORECASE | re.UNICODE)
def fallback_postgresql_to_s3() -> bool:
    """Return whether to fallback PostgreSQL downloads to Amazon S3.

    If the download doesn't exist in PostgreSQL storage, S3 will be tried
    instead.

    Read from MC_DOWNLOADS_FALLBACK_POSTGRESQL_TO_S3 and interpreted as an
    integer flag. An unset, empty or whitespace-only value means False.

    Raises ValueError if the value is set to something that is not an integer.
    """
    value = env_value('MC_DOWNLOADS_FALLBACK_POSTGRESQL_TO_S3', required=False, allow_empty_string=True)

    # Empty strings are explicitly allowed by the lookup above, but int('')
    # raises ValueError, so treat them the same as "unset".
    if value is None or not value.strip():
        return False

    return bool(int(value))
def _get_api_key() -> str:
    """Fetch the bw api key or use the cached one.

    To get a bw api key, you have to make an api call with the user and
    password, but the api key only lasts for a year, so we just get it and
    then cache it in a static variable (an attribute on this function),
    assuming that each run time will restart at least once a year.

    Credentials are read from MC_BRANDWATCH_USER and MC_BRANDWATCH_PASSWORD.

    Raises McPostsBWTwitterDataException if the token request fails or the
    response has no 'access_token' key.
    """
    if hasattr(_get_api_key, "api_key"):
        return _get_api_key.api_key

    user = env_value('MC_BRANDWATCH_USER')
    password = env_value('MC_BRANDWATCH_PASSWORD')

    # Only the user name is logged; the password must never reach the logs.
    log.debug(f"user: {user}")

    ua = _get_user_agent()

    url = (
        "https://api.brandwatch.com/oauth/token?username=%s&grant_type=api-password&client_id=brandwatch-api-client"
        % (quote(user)))

    # The password travels in the POST body rather than in the URL
    request = Request(method='POST', url=url)
    request.set_content_type('application/x-www-form-urlencoded; charset=utf-8')
    request.set_content({'password': password})

    response = ua.request(request)

    if not response.is_success():
        raise McPostsBWTwitterDataException("error fetching api key: " + response.decoded_content())

    # Named raw_json so the stdlib 'json' module name is not shadowed
    raw_json = response.decoded_content()
    data = dict(decode_json(raw_json))

    try:
        _get_api_key.api_key = data['access_token']
    except KeyError as ex:
        raise McPostsBWTwitterDataException("error parsing oauth response: '%s'" % raw_json) from ex

    return _get_api_key.api_key
def storage_locations() -> List[str]:
    """Return the download storage locations.

    Read from MC_DOWNLOADS_STORAGE_LOCATIONS as a ';'-separated list, with
    whitespace around each entry stripped. Defaults to ['postgresql'] when
    the lookup returns None. A value that is empty after stripping yields an
    empty list.
    """
    value = env_value('MC_DOWNLOADS_STORAGE_LOCATIONS', required=False)
    if value is None:
        value = 'postgresql'

    locations = [location.strip() for location in value.split(';')]

    # str.split() with a separator never returns an empty list: an empty
    # value comes back as [''], which is the case that has to map to [].
    if locations == ['']:
        locations = []

    return locations
def topic_alert_emails() -> List[str]:
    """Return the list of emails to which to send all topic alerts.

    Read from MC_TOPICS_BASE_TOPIC_ALERT_EMAILS as a ','-separated list, with
    whitespace around each entry stripped. Falls back to the built-in default
    addresses when the lookup returns None. A value that is empty after
    stripping yields an empty list.
    """
    emails = env_value('MC_TOPICS_BASE_TOPIC_ALERT_EMAILS', required=False, allow_empty_string=True)
    if emails is None:
        emails = "[email protected], [email protected]"

    emails = [email.strip() for email in emails.split(',')]

    # str.split() with a separator never returns an empty list: an empty
    # value comes back as [''], which is the case that has to map to [].
    if emails == ['']:
        emails = []

    return emails
def _get_directory_name(db, object_type: str) -> str:
    """Return '<directory>/<object_type>', taking the directory from the env var or the database.

    The directory is MC_PUBLIC_AMAZON_S3_DIRECTORY_NAME when that variable is
    set; if the lookup raises McConfigEnvironmentVariableUnsetException, the
    test directory name is fetched from the database instead.
    """
    # MC_PUBLIC_AMAZON_S3_DIRECTORY_NAME should be unique for production to prevent overwriting
    try:
        base_directory = env_value("MC_PUBLIC_AMAZON_S3_DIRECTORY_NAME")
    except McConfigEnvironmentVariableUnsetException:
        base_directory = _get_test_directory_name_from_db(db)

    return '{}/{}'.format(base_directory, object_type)
def _default_path_prefix(self) -> str:
    """Return the path prefix read from MC_PODCAST_TRANSCRIPTS_PATH_PREFIX."""
    path_prefix = env_value(name='MC_PODCAST_TRANSCRIPTS_PATH_PREFIX')
    return path_prefix
def directory_name() -> str:
    """Return the directory name (prefix).

    Read from MC_DOWNLOADS_AMAZON_S3_DIRECTORY_NAME; an empty string is an
    accepted value.
    """
    prefix = env_value('MC_DOWNLOADS_AMAZON_S3_DIRECTORY_NAME', allow_empty_string=True)
    return prefix
def bucket_name() -> str:
    """Return the bucket name, read from MC_DOWNLOADS_AMAZON_S3_BUCKET_NAME."""
    bucket = env_value('MC_DOWNLOADS_AMAZON_S3_BUCKET_NAME')
    return bucket
def secret_access_key() -> str:
    """Return the secret access key, read from MC_DOWNLOADS_AMAZON_S3_SECRET_ACCESS_KEY."""
    secret = env_value('MC_DOWNLOADS_AMAZON_S3_SECRET_ACCESS_KEY')
    return secret
def access_key_id() -> str:
    """Return the access key ID, read from MC_DOWNLOADS_AMAZON_S3_ACCESS_KEY_ID."""
    key_id = env_value('MC_DOWNLOADS_AMAZON_S3_ACCESS_KEY_ID')
    return key_id
def email_from_address() -> str:
    """Return the 'From:' email address to use when sending emails.

    Read from MC_EMAIL_FROM_ADDRESS; falls back to the built-in default
    address when the lookup returns None.
    """
    configured = env_value('MC_EMAIL_FROM_ADDRESS', required=False)
    return '*****@*****.**' if configured is None else configured
def parallel_get_timeout() -> int:
    """Return the connection timeout, in seconds.

    Read from MC_USERAGENT_PARALLEL_GET_TIMEOUT; falls back to 90 when the
    lookup returns None.
    """
    configured = env_value('MC_USERAGENT_PARALLEL_GET_TIMEOUT', required=False)
    return 90 if configured is None else int(configured)
def authenticated_domains() -> List[AuthenticatedDomain]:
    """Return the list of authenticated domains.

    The raw value of MC_USERAGENT_AUTHENTICATED_DOMAINS (optional, may be
    empty) is handed to _authenticated_domains_from_json() for parsing.
    """
    raw_domains = env_value('MC_USERAGENT_AUTHENTICATED_DOMAINS', required=False, allow_empty_string=True)
    domains = _authenticated_domains_from_json(raw_domains)
    return domains
def univision_client_id() -> Optional[str]:
    """Return the Univision API client ID.

    Read from MC_UNIVISION_CLIENT_ID; the variable is optional and may be
    an empty string.
    """
    client_id = env_value(name='MC_UNIVISION_CLIENT_ID', required=False, allow_empty_string=True)
    return client_id
def univision_client_secret() -> Optional[str]:
    """Return the Univision API client secret (secret key).

    Read from MC_UNIVISION_CLIENT_SECRET; the variable is optional and may
    be an empty string.
    """
    client_secret = env_value(name='MC_UNIVISION_CLIENT_SECRET', required=False, allow_empty_string=True)
    return client_secret
def tag_set() -> str:
    """Return the value of the required MC_NYTLABELS_TAG_SET variable.

    NOTE(review): presumably the name of the tag set that NYTLabels tags are
    stored under -- confirm. The previous docstring described the version
    tag rather than this setting.
    """
    tag_set_name = env_value('MC_NYTLABELS_TAG_SET')
    return tag_set_name
def _default_bucket_name(self) -> str:
    """Return the bucket name read from MC_PODCAST_TRANSCRIPTS_BUCKET_NAME."""
    bucket = env_value(name='MC_PODCAST_TRANSCRIPTS_BUCKET_NAME')
    return bucket
def version_tag() -> str:
    """Return the NYTLabels version tag, e.g. "nyt_labeller_v1.0.0".

    Read from the required MC_NYTLABELS_VERSION_TAG variable.

    NOTE(review): the previous docstring said the tag is added under the
    "geocoder_version" tag set -- confirm that name applies to NYTLabels.
    """
    tag = env_value('MC_NYTLABELS_VERSION_TAG')
    return tag