def _get_default_args_list(self):
    """ Return the base list of args for sherlock that remains unchanged
    across multiple invocations of this worker's process_msg.
    """
    args = [
        # First arg, does not matter
        "ingest_multiple_dates.py",
        # We do not want to record status in redshift
        "--skip-progress-in-redshift",
        # run parallelism in the worker instead
        "--serial-stepper",
    ]
    cur_base_dir = os.getcwd()
    if self._should_run_local:
        private_file = os.path.join(
            cur_base_dir,
            staticconf.read_string("run_local.private", "private.yaml")
        )
        args.extend(["-r"])  # For run-local in sherlock
    else:
        private_file = staticconf.read_string("run_service.private")
    if self._config_override_loc is not None:
        args.extend(["--config-override", self._config_override_loc])
    args.extend(["--private", private_file])
    args.extend(["--config", self._config_loc])
    return args

def create_emr_args(date_with_slashes, cores, infile_prefix, local):
    """creates a string containing arguments for mr job

    inputs:
        date_with_slashes -- a date string of the form 'YYYY/MM/DD'
        cores -- the number of cores to use for a conversion
        infile_prefix -- the prefix to the search bucket
        local -- True to use the run_local mrjob arg template,
            False to use the run_service template

    outputs: string containing arguments used by ET mr job"""
    input_file = (infile_prefix + date_with_slashes +
                  read_string('pipeline.et_step.s3_input_suffix'))
    user_prefix = get_s3_output_user_prefix()
    output_file = os.path.join(user_prefix, date_with_slashes)
    if int(cores) > MAX_CORES:
        cores = MAX_CORES

    extractions = pipeline_yaml_schema_file_path()
    delimiter = read_string('redshift_column_delimiter')
    if local:
        template = read_string('run_local.mrjob_arg_template')
    else:
        template = read_string('run_service.mrjob_arg_template')

    return template.format(
        input_file, output_file, cores, extractions, delimiter
    )

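# Hedged sketch, not from the source: 'run_service.mrjob_arg_template' is read
# from config and is not shown here, so the template below is a hypothetical
# stand-in that only illustrates how create_emr_args fills the five positional
# slots (input_file, output_file, cores, extractions, delimiter).
def _example_mrjob_arg_template_expansion():
    template = ('--input {0} --output {1} --num-cores {2} '
                '--extractions {3} --delimiter {4}')  # hypothetical template
    return template.format(
        's3://bucket/logs/2016/05/01/part-*.gz',  # input_file
        's3://bucket/out/testuser/2016/05/01',    # output_file
        4,                                        # cores (capped at MAX_CORES)
        'schemas/pipeline.yaml',                  # extractions
        '|',                                      # delimiter
    )
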
def test_create_emr_args(input_date, dev, cores, pipeline_yaml):
    print "just starting"
    load_package_config('config.yaml')
    YamlConfiguration(pipeline_yaml)
    input_prefix = read_list('pipeline.et_step.s3_prefixes')[0]
    input_file = input_prefix + input_date + '/part-*.gz'
    expected_args = EXPECTED_DEV_ARGS if dev else EXPECTED_AWS_ARGS
    expected_out_file = read_string('pipeline.s3_output_prefix')
    delimiter = read_string('redshift_column_delimiter')
    with mock.patch.dict(os.environ, {'LOGNAME': 'testuser', 'YELPCODE': '.'}):
        logname = os.environ['LOGNAME']
        expected_out_file = os.path.join(
            expected_out_file.format(logname=logname), input_date
        )
        extractions = pipeline_yaml_schema_file_path()
        formatted_args = expected_args.format(
            input_file, expected_out_file, cores, extractions, delimiter
        )
        output_under_test = create_emr_args(input_date, 10, input_prefix, dev)
        assert output_under_test == formatted_args

def test_setup_config_cluster(cluster, pool, scheduler, tag, mock_config_files):
    args = argparse.Namespace(
        env_config_path='/nail/etc/config.yaml',
        cluster=cluster,
        pool=pool,
        scheduler=scheduler,
        signals_branch_or_tag=tag,
    )
    with mock.patch(
        'clusterman.config.load_cluster_pool_config',
        autospec=True,
    ) as mock_pool_load, mock.patch(
        'clusterman.config._load_module_configs',
    ) as mock_load_module_configs:
        config.setup_config(args)
        assert mock_load_module_configs.call_args == mock.call('/nail/etc/config.yaml')
        assert staticconf.read_string('aws.region') == 'us-test-3'
        if pool:
            assert mock_pool_load.call_args == mock.call(cluster, pool, scheduler, tag)
        else:
            assert mock_pool_load.call_count == 0
        if tag:
            assert staticconf.read_string('autoscale_signal.branch_or_tag') == tag

def main():
    staticconf.YamlConfiguration(CONFIG_FILE)
    auth = OAuthHandler(
        staticconf.read_string('twitter.consumer_key'),
        staticconf.read_string('twitter.consumer_secret'),
    )
    auth.set_access_token(
        staticconf.read_string('twitter.access_token'),
        staticconf.read_string('twitter.access_token_secret'),
    )
    api = API(auth)

    big_ben_ids = [
        723360691457945600, 723375789467553793, 723390890664824834,
        723405988221489154, 723421087703261186, 723436186644025344,
        723451541563138052, 723466386304057344, 723481486737985536,
        723497089410457600, 723511939465392128, 723528048931430400,
        723541884208091137, 723556981991202816, 723572081485615104,
        723587184276721665, 723602282374414338, 723617381017374720,
        723632480964759553, 723647581516124160, 723662932664524800,
        723678284538589184, 723693384272121857, 723709493939453952,
        723723076614164480,
    ]

    for startid, endid in zip(big_ben_ids, big_ben_ids[1:]):
        for tweet in limit_handled(
            Cursor(
                api.search,
                q=' OR '.join(
                    'qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM1234567890'
                ),
                since_id=str(startid),
                max_id=str(endid),
                lang="en",
            ).items(2500)
        ):
            print(json.dumps(tweet._json))

def main():
    staticconf.YamlConfiguration(CONFIG_FILE)
    auth = OAuthHandler(
        staticconf.read_string('twitter.consumer_key'),
        staticconf.read_string('twitter.consumer_secret'),
    )
    auth.set_access_token(
        staticconf.read_string('twitter.access_token'),
        staticconf.read_string('twitter.access_token_secret'),
    )
    api = API(auth)

    big_ben_ids = [
        727256357607464960, 727271714187522048, 727287317912817664,
        727302414039158785, 727317768509480960, 727332108876705794,
        727347714380419072, 727362055750176768, 727377660742123520,
        727393264060534784, 727407354162122753, 727422705876762624,
        727437555210293248, 727452651210809344, 727468761842876416,
        727483610119413760, 727498961741856768, 727513051440762881,
        727528910452305921, 727543248458149888, 727559107612422144,
        727574712830857221, 727588550133288961, 727603646221914113,
        727619000348348416,
    ]

    for startid, endid in zip(big_ben_ids[13:], big_ben_ids[14:]):
        for tweet in limit_handled(
            Cursor(
                api.search,
                q=' OR '.join(
                    'qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM1234567890'
                ),
                since_id=str(startid),
                max_id=str(endid),
                lang="en",
            ).items(2500)
        ):
            print(json.dumps(tweet._json))

def init():
    # Read the configuration values up front and cache them in module globals
    # to avoid repeated lookups
    global S3_BUCKET
    global S3_LOG_PREFIX
    global PATH_RE
    S3_BUCKET = staticconf.read_string('s3_bucket')
    S3_LOG_PREFIX = staticconf.read_string('s3_log_prefix')
    PATH_RE = re.compile(PATH_RE_PREFIX.format(S3_LOG_PREFIX))

def __init__(self, cluster: str, pool: str) -> None:
    super().__init__(cluster, pool)
    kubernetes.config.load_kube_config(
        staticconf.read_string(f'clusters.{cluster}.kubeconfig_path')
    )
    self._core_api = kubernetes.client.CoreV1Api()
    self._safe_to_evict_annotation = staticconf.read_string(
        f'clusters.{cluster}.pod_safe_to_evict_annotation',
        default='cluster-autoscaler.kubernetes.io/safe-to-evict',
    )

def __init__(self, cluster: str, pool: Optional[str]) -> None:
    super().__init__(cluster, pool)
    self.kubeconfig_path = staticconf.read_string(
        f'clusters.{cluster}.kubeconfig_path'
    )
    self._safe_to_evict_annotation = staticconf.read_string(
        f'clusters.{cluster}.pod_safe_to_evict_annotation',
        default='cluster-autoscaler.kubernetes.io/safe-to-evict',
    )
    self._nodes_by_ip = {}

def et_scanner_main(args):
    """ Create an instance of ETScanner and run it once. """
    setup_config(args, 'ETScanner')
    sqs_scanner_queue = SQSWrapper(read_string("sqs.et_scanner_queue_name"))
    sqs_worker_queue = SQSWrapper(read_string("sqs.et_queue_name"))
    scanner = ETScanner(
        TableConnection.get_connection('ScheduledJobs'),
        sqs_scanner_queue,
        sqs_worker_queue,
        Mailer(args.run_local),
    )
    scanner.run()

def _init_session():
    global _session
    if not _session:
        _session = boto3.session.Session(
            staticconf.read_string('accessKeyId', namespace=CREDENTIALS_NAMESPACE),
            staticconf.read_string('secretAccessKey', namespace=CREDENTIALS_NAMESPACE),
            region_name=staticconf.read_string('aws.region'),
        )

def __init__(self, cluster_name: str) -> None:
    self.client = sqs
    self.cluster = cluster_name
    self.drain_queue_url = staticconf.read_string(
        f'clusters.{cluster_name}.drain_queue_url'
    )
    self.termination_queue_url = staticconf.read_string(
        f'clusters.{cluster_name}.termination_queue_url'
    )
    self.draining_host_ttl_cache: Dict[str, arrow.Arrow] = {}
    self.warning_queue_url = staticconf.read_string(
        f'clusters.{cluster_name}.warning_queue_url',
        default=None,
    )

def fetch_creds():
    ''' Return a dictionary holding temporary credentials from the metadata
    server. This function will block up to the timeout specified in the config
    file. Do not call this method unless config.yaml is loaded.
    '''
    url = '{url_root}/{name}'.format(
        url_root=staticconf.read_string('instance_profile_creds_url'),
        name=staticconf.read_string('instance_profile_name'),
    )
    in_stream = urllib2.urlopen(
        url,
        timeout=staticconf.read_int(
            'instance_profile_creds_timeout_in_seconds', default=4
        ),
    )
    return simplejson.load(in_stream)

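# Hedged sketch (an assumption, not from this repo): when the URL above points
# at the standard EC2 instance-profile credentials endpoint, the JSON document
# returned by fetch_creds() typically looks like the dict below. Only the
# 'Token', 'Expiration', and 'LastUpdated' keys are referenced elsewhere in
# this code; the rest follow the usual AWS metadata format.
EXAMPLE_INSTANCE_PROFILE_CREDS = {
    'Code': 'Success',
    'LastUpdated': '2016-05-01T00:00:00Z',
    'Type': 'AWS-HMAC',
    'AccessKeyId': 'AKIA...EXAMPLE',
    'SecretAccessKey': '<secret>',
    'Token': '<session token>',
    'Expiration': '2016-05-01T06:00:00Z',
}
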
def rs_check_schema(rs_mgmt, args):
    yaml_data = load_from_file(args.schema)
    tables = RedShiftLogSchema(safe_load(yaml_data)).tables()
    db = read_string('pipeline.redshift_database')
    log_stream = read_string('pipeline.load_step.s3_to_redshift_stream')
    pipe_strm_lgr = PipelineStreamLogger(
        log_stream,
        True,
        'rs_check_schema',
    )
    psql = RedshiftPostgres(pipe_strm_lgr, args.credentials, run_local=True)
    rs_check_table_def(psql, db, tables, args.redshift_schema)
    rs_check_table_rows(psql, db, tables, args.redshift_schema)

def main():
    """Connects to the stream and starts threads to write them to a file."""
    staticconf.YamlConfiguration(CONFIG_FILE)
    listener = QueueListener()
    auth = OAuthHandler(
        staticconf.read_string('twitter.consumer_key'),
        staticconf.read_string('twitter.consumer_secret'),
    )
    auth.set_access_token(
        staticconf.read_string('twitter.access_token'),
        staticconf.read_string('twitter.access_token_secret'),
    )

    writer_thread = threading.Thread(target=worker, args=(listener,))
    writer_thread.start()

    stream = Stream(auth, listener)
    print_status(listener)

    try:
        while True:
            try:
                stream.sample(languages=['en'])  # blocking!
            except KeyboardInterrupt:
                print('KEYBOARD INTERRUPT', file=sys.stderr)
                return
            except (socket.error, httplib.HTTPException):
                global tcpip_delay
                print(
                    'TCP/IP Error: Restarting after {delay} seconds.'.format(
                        delay=tcpip_delay,
                    ),
                    file=sys.stderr,
                )
                time.sleep(min(tcpip_delay, MAX_TCPIP_TIMEOUT))
                tcpip_delay += 0.25
    finally:
        print('Disconnecting stream', file=sys.stderr)
        stream.disconnect()
        print('Waiting for last tweets to finish processing', file=sys.stderr)
        # Send poison pill to writer thread and wait for it to exit
        listener.queue.put(None)
        listener.queue.join()
        print('Waiting for writer thread to finish', file=sys.stderr)
        writer_thread.join()
        print('Exit successful', file=sys.stderr)

def main():
    """Connects to the stream and starts threads to write them to a file."""
    staticconf.YamlConfiguration(CONFIG_FILE)
    listener = QueueListener()
    auth = OAuthHandler(
        staticconf.read_string('twitter.consumer_key'),
        staticconf.read_string('twitter.consumer_secret'),
    )
    auth.set_access_token(
        staticconf.read_string('twitter.access_token'),
        staticconf.read_string('twitter.access_token_secret'),
    )

    writer_thread = threading.Thread(target=worker, args=(listener,))
    writer_thread.start()

    stream = Stream(auth, listener)
    print_status(listener)

    try:
        while True:
            try:
                # stream.sample()  # blocking!
                stream.filter(track=["#airpodsmax"])
            except KeyboardInterrupt:
                print('KEYBOARD INTERRUPT', file=sys.stderr)
                return
            except socket.error:
                global tcpip_delay
                print(
                    'TCP/IP Error: Restarting after {delay} seconds.'.format(
                        delay=tcpip_delay,
                    ),
                    file=sys.stderr,
                )
                time.sleep(min(tcpip_delay, MAX_TCPIP_TIMEOUT))
                tcpip_delay += 0.25
    finally:
        print('Disconnecting stream', file=sys.stderr)
        stream.disconnect()
        print('Waiting for last tweets to finish processing', file=sys.stderr)
        # Send poison pill to writer thread and wait for it to exit
        listener.queue.put(None)
        # listener.queue.join()
        print('Waiting for writer thread to finish', file=sys.stderr)
        writer_thread.join()
        print('Exit successful', file=sys.stderr)

def test_setup_config_with_env_vars(self):
    args = parse_cmd_args([
        'program',
        '--config=./config.yaml',
        '--config-override=config-env-dev.yaml',
        '-r',
    ])
    with staticconf.testing.MockConfiguration(MOCK_CONFIG):
        setup_config(args, 'test_worker')
        # pick some key and ensure it was loaded from config
        assert read_string('log_stream_name', 'default') != 'default'

def process_queues(cluster_name: str) -> None:
    draining_client = DrainingClient(cluster_name)
    mesos_master_url = staticconf.read_string(
        f'clusters.{cluster_name}.mesos_master_fqdn'
    )
    mesos_secret_path = staticconf.read_string(
        'mesos.mesos_agent_secret_path', default=None
    )
    operator_client = operator_api(mesos_master_url, mesos_secret_path)
    logger.info('Polling SQS for messages every 5s')
    while True:
        draining_client.clean_processing_hosts_cache()
        draining_client.process_warning_queue()
        draining_client.process_drain_queue(
            mesos_operator_client=operator_client,
        )
        draining_client.process_termination_queue(
            mesos_operator_client=operator_client,
        )
        time.sleep(5)

def __init__(self, config_loc, config_override_loc, run_local, emailer, dummy_run=False):
    super(ImdWorker, self).__init__(
        config_loc,
        config_override_loc,
        emailer,
        num_processes=3,
    )
    for key in self.KEYS_TO_LOAD:
        self.__setattr__(key, staticconf.read_string(key))
    if dummy_run:
        log("Dummy worker! Skip the real etl process. Just for test.")
        import mycroft.backend.worker.fake_ingest_multiple_dates as ingest_multiple_dates
    else:
        import sherlock.batch.ingest_multiple_dates as ingest_multiple_dates
    self._should_run_local = run_local
    self.dummy_run = dummy_run
    self.ingest_multiple_dates = ingest_multiple_dates.ingest_multiple_dates_main
    self.queue_name = staticconf.get_string("sqs.et_queue_name")
    self.scanner_queue_name = staticconf.get_string("sqs.et_scanner_queue_name")
    log("ImdWorker initialization")
    log(dict((k, str(v)) for k, v in vars(self).iteritems()))

def run(self):
    while self.running:
        time.sleep(splay_event_time(
            self.run_interval,
            self.get_name() + staticconf.read_string('aws.region'),
        ))
        now = arrow.utcnow()
        with self.metrics_client.get_writer(METADATA) as writer:
            try:
                with suppress_request_limit_exceeded():
                    self.write_prices(now, writer)
            except socket.timeout:
                # We don't really care if we miss a few spot price changes,
                # so just continue here
                logger.warn(f'Timed out getting spot prices:\n\n{format_exc()}')
                continue

        # Report successful run to Sensu.
        sensu_args = dict(
            check_name='check_clusterman_spot_prices_running',
            output='OK: clusterman spot_prices was successful',
            check_every='1m',
            source=self.options.aws_region,
            ttl='10m',
            noop=self.options.disable_sensu,
        )
        sensu_checkin(**sensu_args)

def test__get_key_name():
    log_name = 'x'
    log_version = 'y'
    return_value = _get_key_name(log_name, log_version)
    s3_log_prefix = staticconf.read_string('s3_log_prefix')
    path_re = re.compile(PATH_RE_PREFIX.format(s3_log_prefix))
    assert path_re.match(return_value) is not None

def configure_initial(self) -> None:
    setup_config(self.options)

    # Since we want to collect metrics for all the pools, we need to call setup_config
    # first to load the cluster config path, and then read all the entries in that directory
    self.pools: MutableMapping[str, List[str]] = {}
    for scheduler in {'mesos', 'kubernetes'}:
        self.pools[scheduler] = get_pool_name_list(self.options.cluster, scheduler)
    for scheduler, pools in self.pools.items():
        for pool in pools:
            self.config.watchers.append({
                f'{pool}.{scheduler}': get_pool_config_path(self.options.cluster, pool, scheduler),
            })
            load_cluster_pool_config(self.options.cluster, pool, scheduler, None)

    self.region = staticconf.read_string('aws.region')
    self.run_interval = staticconf.read_int('batches.cluster_metrics.run_interval_seconds')
    self.logger = logger
    self.metrics_client = ClustermanMetricsBotoClient(region_name=self.region)

def _get_logger(run_local, tag):
    try:
        return PipelineStreamLogger(
            staticconf.read_string("log_stream_name"),
            run_local,
            tag,
        )
    except:
        logger.write_msg("Error creating a pipeline stream logger!")
        return logger  # Return existing logger instance in case of errors

def parse_config(self, config_file_path):
    super(S3Feeder, self).parse_config(config_file_path)
    self.s3_event_notifications_queue_name = staticconf.read(
        's3_event_notifications_queue_name'
    )
    self.number_messages = staticconf.read('number_messages', default=1)
    self.aws_region = staticconf.read('aws_region', default=None)
    self.owner_account_id = staticconf.read_string('owner_account_id')
    self.role_arn = staticconf.read('role_arn', default=None)

def get_scanner_queue(etl_type):
    """ Return the scanner SQS queue that jobs post a message to in order to
    wake up the scanner.

    :param etl_type: et or load
    :type etl_type: string in ['et', 'load']
    """
    return SQSWrapper(read_string("sqs.{0}_scanner_queue_name".format(etl_type)))

def get_sqs_connection():
    '''
    :returns: sqs connection
    '''
    return boto.sqs.connect_to_region(
        read_string('aws_config.region'),
        **get_boto_creds()
    )

def dynamodb_table_names():
    '''
    :returns: iterable of strings, each of which is a DynamoDB table name used
        in mycroft
    '''
    table_names = []
    # append other table resources required by mycroft
    table_names.append(staticconf.read_string('aws_config.scheduled_jobs_table'))
    return table_names

def get_dynamodb_connection():
    '''
    :returns: dynamodb2 connection
    '''
    return boto.dynamodb2.connect_to_region(
        read_string('aws_config.region'),
        **get_boto_creds()
    )

def setup_config(args: argparse.Namespace) -> None:
    # load_default_config merges the 'module_config' key from the first file
    # and the 'module_env_config' key from the second file to configure packages.
    # This allows us to configure packages differently in different hiera envs by
    # changing 'module_env_config'. We use the same file for both keys.
    _load_module_configs(args.env_config_path)

    signals_branch_or_tag = getattr(args, 'signals_branch_or_tag', None)
    cluster_config_directory = getattr(args, 'cluster_config_directory', None) or DEFAULT_CLUSTER_DIRECTORY
    staticconf.DictConfiguration({'cluster_config_directory': cluster_config_directory})

    aws_region = getattr(args, 'aws_region', None)
    cluster = getattr(args, 'cluster', None)
    pool = getattr(args, 'pool', None)
    scheduler = getattr(args, 'scheduler', None)
    if aws_region and cluster:
        raise argparse.ArgumentError(None, 'Cannot specify both cluster and aws_region')

    # If there is a cluster specified via --cluster, load cluster-specific attributes
    # into staticconf. These values are not specified using hiera in srv-configs because
    # we might want to be operating on a cluster in one region while running from a
    # different region.
    elif cluster:
        aws_region = staticconf.read_string(f'clusters.{cluster}.aws_region', default=None)
        if pool:
            load_cluster_pool_config(cluster, pool, scheduler, signals_branch_or_tag)

    staticconf.DictConfiguration({'aws': {'region': aws_region}})

    boto_creds_file = staticconf.read_string('aws.access_key_file', default=None)
    if boto_creds_file:
        staticconf.JSONConfiguration(boto_creds_file, namespace=CREDENTIALS_NAMESPACE)

    if signals_branch_or_tag:
        staticconf.DictConfiguration({'autoscale_signal': {'branch_or_tag': signals_branch_or_tag}})

def __init__(self, cluster: str, pool: str) -> None:
    super().__init__(cluster, pool)
    mesos_master_fqdn = staticconf.read_string(f'clusters.{self.cluster}.mesos_master_fqdn')
    self.non_batch_framework_prefixes = self.pool_config.read_list(
        'non_batch_framework_prefixes',
        default=['marathon'],
    )
    self.api_endpoint = f'http://{mesos_master_fqdn}:5050/'
    logger.info(f'Connecting to Mesos masters at {self.api_endpoint}')

def rs_cluster_restore(rs_mgmt, args):
    """ restore cluster from snapshot
    Output can be appended to a YAML config file
    """
    if not args.subnet_group_name:
        args.subnet_group_name = read_string('redshift_cluster_subnet_group_name')
    if not args.vpc_security_group:
        args.vpc_security_group = read_string('security_group_id')
    rs_mgmt.restore_from_cluster_snapshot(
        args.cluster_name,
        args.snapshot,
        args.parameter_group,
        args.vpc_security_group,
        args.subnet_group_name,
    )
    cluster_info = rs_mgmt.get_cluster_info(args.cluster_name)
    return cluster_info['Endpoint']['Address'], cluster_info['Endpoint']['Port']

def test_load_cluster_pool_config(cluster, pool, pool_other_config, mock_config_files):
    config.load_cluster_pool_config(cluster, pool, 'mesos', None)

    pool_namespace = POOL_NAMESPACE.format(pool=pool, scheduler='mesos')
    assert staticconf.read_int('other_config', namespace=pool_namespace) == pool_other_config
    assert staticconf.read_string('resource_groups', namespace=pool_namespace) == cluster

def test_setup_config_region(mock_load_module_configs, mock_config_files):
    args = argparse.Namespace(
        env_config_path='/nail/etc/config.yaml',
        aws_region='fake-region-A',
    )
    config.setup_config(args)
    assert staticconf.read_string('aws.region') == 'fake-region-A'
    assert mock_load_module_configs.call_args == mock.call('/nail/etc/config.yaml')

def ensure_account_id(cluster) -> None:
    current_account_id = sts.get_caller_identity()['Account']
    cluster_account_id = staticconf.read_string(f'clusters.{cluster}.aws_account_number')
    if current_account_id != cluster_account_id:
        raise AccountNumberMistmatchError(
            f'ACCOUNT ID MISMATCH! Current account id: {current_account_id}. '
            f'Cluster account id: {cluster_account_id}'
        )

def setup_private(input_args):
    """
    setup_private loads the aws credentials required to run on the server into
    the appropriate environment variables

    Args:
    input_args -- input yaml file with aws access_key_id and secret_access_key

    Returns nothing; the credentials are exported via os.environ
    """
    YamlConfiguration(input_args, optional=True)
    os.environ['AWS_ACCESS_KEY_ID'] = read_string('emr_aws_access_key_id')
    os.environ['AWS_SECRET_ACCESS_KEY'] = \
        read_string('emr_aws_secret_access_key')

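# Hedged sketch of the private yaml consumed by setup_private (only the two
# key names come from the reads above; the values are placeholders):
#
#   emr_aws_access_key_id: AKIA...EXAMPLE
#   emr_aws_secret_access_key: <secret>
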
def s3_to_psv_main(args):
    mrjob = read_string('pipeline.et_step.mrjob')
    stream_name = read_string('pipeline.et_step.s3_to_s3_stream')
    DATABASE = read_string('pipeline.redshift_database')

    LOG_STREAM = PipelineStreamLogger(
        stream_name,
        args.run_local,
        mrjob,
        input_date=args.date
    )
    day_to_run = setup_dates_to_check(args.date, args.run_local, LOG_STREAM)

    try:
        if not args.run_local:
            setup_private(args.private)

        # Create a psql instance based on args
        if args.skip_progress_in_redshift:
            status_table = DynamoDbStatusTable(
                LOG_STREAM, run_local=args.run_local
            )
        else:
            status_table = RedshiftStatusTable(
                RedshiftPostgres(
                    LOG_STREAM, args.private, run_local=args.run_local
                )
            )

        load_msg = __load_data_from_s3(
            status_table,
            read_list('pipeline.et_step.s3_prefixes'),
            day_to_run,
            mrjob,
            args.run_local,
            DATABASE,
            LOG_STREAM,
            force_et=args.force_et
        )
        LOG_STREAM.write_msg("complete", extra_msg=load_msg)
    finally:
        clear_env(args.run_local)

def pipeline_yaml_schema_file_path():
    """Return the full path of the yaml schema file for the pipeline.
    Return the path unchanged if it is already an S3 path.
    """
    yaml_schema_file_path = read_string('pipeline.yaml_schema_file')
    if is_s3_path(yaml_schema_file_path):
        return yaml_schema_file_path
    return '{directory}/{filename}'.format(
        directory=os.environ['YELPCODE'],
        filename=yaml_schema_file_path,
    )

def fetch_creds_from_file():
    ''' Return a dictionary holding credentials from a file defined in
    config.yaml
    '''
    with open(staticconf.read_string('run_local.session_file'), 'r') as creds:
        if os.fstat(creds.fileno()).st_size == 0:
            raise Exception("session file is empty")
        creds_dict = simplejson.load(creds)
        creds_dict['Expiration'] = creds_dict.get('Expiration', MAX_UNIX_TIME)
        for optional_key in ['Token', 'LastUpdated']:
            creds_dict[optional_key] = creds_dict.get(optional_key)
        return creds_dict

def mail_result(self, final_status, msg, additional_info=None):
    link = self.link_temp.format(msg['uuid'])
    content = self.template.format(
        msg['uuid'],
        final_status,
        msg['log_name'],
        msg['log_schema_version'],
        msg['s3_path'],
        msg['redshift_id'],
        msg['start_date'],
        msg['end_date'],
        link,
        additional_info,
    )
    new_msg = MIMEText(content)
    new_msg['Subject'] = self.subject.format(msg['uuid'])
    new_msg['From'] = self.address
    new_msg['To'] = ','.join(msg['contact_emails'])

    smtp_host = staticconf.read_string('smtp_host', 'localhost')
    smtp_port = staticconf.read_string('smtp_port', None)
    smtp_login = staticconf.read_string('smtp_login', None)
    smtp_password = staticconf.read_string('smtp_password', None)
    smtp_security = staticconf.read_string('smtp_security', None)

    if smtp_port is not None:
        smtp_host = "{0}:{1}".format(smtp_host, smtp_port)
    if smtp_security is not None:
        smtp_security = smtp_security.upper()

    if smtp_security == 'SSL':
        s = smtplib.SMTP_SSL(smtp_host)
        s.login(smtp_login, smtp_password)
    elif smtp_security == 'TLS':
        s = smtplib.SMTP(smtp_host)
        s.ehlo()
        s.starttls()
        s.login(smtp_login, smtp_password)
    else:
        s = smtplib.SMTP(smtp_host)

    s.sendmail(self.address, msg['contact_emails'], new_msg.as_string())
    s.quit()

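# Hedged sketch (values are placeholders): the SMTP-related keys read by
# mail_result above, as they might appear in config. 'smtp_security' may be
# SSL, TLS, or omitted for a plain connection; everything except smtp_host
# defaults to None.
#
#   smtp_host: smtp.example.com
#   smtp_port: 465
#   smtp_security: SSL
#   smtp_login: mycroft-mailer
#   smtp_password: <secret>
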
def search_log_source_by_keyword(request_body):
    disabled_logfinder = staticconf.read_bool('disable_logfinder_service')
    if disabled_logfinder:
        return {'logs': []}

    # send HTTP request
    search_endpoint = staticconf.read_string('log_finder_search_end_point')
    response = requests.post(search_endpoint, request_body)

    # if we get a bad HTTP status, raise an exception
    response.raise_for_status()

    content = response.json()
    return content

def __init__(self, logstrm, psql_auth_file, run_local=False):
    self.run_local = run_local
    self.host = staticconf.read_string('redshift_host')
    self.port = staticconf.read_int('redshift_port')
    private_dict = YamlConfiguration(psql_auth_file)
    self.user = private_dict['redshift_user']
    self.password = private_dict['redshift_password']
    self.log_stream = logstrm
    self._aws_key = ''
    self._aws_secret = ''
    self._aws_token = ''
    self._aws_token_expiry = datetime.utcnow()
    self._whitelist = ['select', 'create', 'insert', 'update']
    self._set_aws_auth()
    psycopg2.extensions.set_wait_callback(wait_select_inter)

def get_log_meta_data(bucket_name, log_name):
    if bucket_name is None or log_name is None:
        return None
    if staticconf.read_bool('disable_logfinder_service'):
        return None

    # send HTTP request
    endpoint = staticconf.read_string('log_finder_buckets_end_point') \
        + '/' + bucket_name + '/' + log_name
    response = requests.get(endpoint)

    # if we get a bad HTTP status, raise an exception
    response.raise_for_status()
    return response.json()

def copy_table(psql_helper, db_name, ddate, log_tuple, ttl_days, logstream):
    s3_log, rs_table = log_tuple
    namespaced_table_name = get_namespaced_tablename(rs_table)
    table_start = time.time()
    extra_msg = "from s3 log: {0}".format(s3_log)
    logstream.write_msg('starting', extra_msg=extra_msg)

    # about to load new day, remove oldest
    rows_deleted = None
    if ttl_days is not None:
        rows_deleted = \
            delete_old_data(psql_helper, db_name, rs_table, ttl_days - 1)
    if rows_deleted:
        logstream.write_msg('delete_ok', extra_msg="{0} rows".format(rows_deleted))

    # Try to reclaim disk space. If not needed, it will be fast.
    # Calling here and not in the 'if rows_deleted' code to prevent
    # scenario where rows were deleted but compact failed. Then on retry
    # there will be nothing to delete but since space is not reclaimed
    # there may not be enough for a new load, resulting in failure forever.
    if ttl_days is not None:
        compact_table(psql_helper, db_name, namespaced_table_name)

    delimiter = read_string('redshift_column_delimiter')
    delimiter = delimiter.decode("string_escape")
    if delimiter not in string.printable:
        delimiter = '\\' + oct(ord(delimiter))
    copy_sql = LOAD % (namespaced_table_name, s3_log, delimiter)
    result = psql_helper.run_sql(
        copy_sql,
        db_name,
        " copying from " + s3_log,
        s3_needed=True,
        time_est_secs=read_int('pipeline.load_step.copy_time_est_secs')
    )
    if result is not False:
        logstream.write_msg('complete', job_start_secs=table_start, extra_msg=extra_msg)
    return result

def get_connection(cls, table_object_name):
    if table_object_name not in cls._connection_dict:
        if cls._region_conn is None:
            cls._region_conn = get_dynamodb_connection()
        table_properties = cls._TABLE_NAME_TO_PROPERTIES[table_object_name]
        avro_schema = get_avro_schema(table_properties['avro_schema'])
        table_name = read_string(table_properties['physical_id_key'])
        table = Table(
            table_name,
            connection=cls._region_conn
        )
        try:
            results = table.describe()
            raw_indexes = results['Table'].get('GlobalSecondaryIndexes', [])
            table.global_indexes = introspect_global_indexes(raw_indexes)
        except Exception:
            log_exception("Table Connection Failed")
        cls._connection_dict[table_object_name] = table_properties['class'](
            table, avro_schema
        )
    return cls._connection_dict[table_object_name]

def get_redshift_schema():
    # note we do lower for backward compatibility
    return read_string('redshift_schema', DEFAULT_NAMESPACE).lower()

def s3_to_redshift_main(args):
    db = read_string('pipeline.redshift_database')
    s3_log_prefix = read_string('pipeline.s3_output_prefix').format(
        logname=os.environ.get('LOGNAME', 'unknown')
    )

    # setup logging
    stream_name = read_string('pipeline.load_step.s3_to_redshift_stream')
    LOG_STREAM = PipelineStreamLogger(
        stream_name, args.run_local, 's3_to_redshift', job_name='load'
    )

    # handle to redshift db
    loader_psql = RedshiftPostgres(
        LOG_STREAM, args.private, run_local=args.run_local
    )

    if args.skip_progress_in_redshift:
        status_table = DynamoDbStatusTable(
            LOG_STREAM, run_local=args.run_local
        )
    else:
        status_table = RedshiftStatusTable(loader_psql)

    create_tuples = get_table_creates(args.db_file, LOG_STREAM)

    data_candidates = dates_from_rs_status(
        status_table,
        db,
        LOG_STREAM,
        args.retry_errors,
        args.date,
    )
    if data_candidates:
        try:
            update_database_schema(
                loader_psql,
                db,
                data_candidates[0],
                s3_log_prefix,
                args.db_file,
                LOG_STREAM
            )
        except Exception as e:
            status_table.update_status(
                db,
                data_candidates[0],
                get_yaml_table_versions(pipeline_yaml_schema_file_path()),
                "error",
                start_time_secs=time.time(),
                error_msg=repr(e)
            )
            raise
    elif args.date is not None:
        raise IOError(
            "{0} data is either already loaded "
            "or has not yet completed ET step".format(args.date)
        )

    logs_to_copy = []
    for input_date in data_candidates:
        LOG_STREAM = PipelineStreamLogger(
            stream_name,
            args.run_local,
            's3_to_redshift',
            job_name='load',
            input_date=input_date
        )
        logs_to_copy = [
            (join(s3_log_prefix, input_date, table), table)
            for (table, _) in create_tuples
        ]
        copy_tables(
            loader_psql,
            status_table,
            db,
            input_date,
            logs_to_copy,
            args.ttl_days,
            LOG_STREAM
        )

def analyze_tables(psql, db, tables, schemaname=DEFAULT_NAMESPACE):
    num_failures = 0
    for tbl_name in tables:
        tbl_name = get_namespaced_tablename(tbl_name, schemaname)
        try:
            analyze_table(psql, db, tbl_name)
        except:
            num_failures += 1
    if num_failures:
        raise RuntimeError(
            'failed to analyze {0} tables, see log'.format(num_failures)
        )


if __name__ == "__main__":
    args = get_cmd_line_args()
    run_local = args.run_local
    merge_configs(args.config)
    db = read_string('pipeline.redshift_database')
    log_stream = read_string('pipeline.load_step.s3_to_redshift_stream')
    logstream = PipelineStreamLogger(log_stream, run_local, 'redshift_maint')
    psql = RedshiftPostgres(logstream, args.credentials, run_local=run_local)

    yaml = load_from_file(args.schema)
    schema = RedShiftLogSchema(safe_load(yaml))
    if args.compact:
        compact_tables(psql, db, schema.tables(), args.redshift_schema)
    analyze_tables(psql, db, schema.tables(), args.redshift_schema)