def test_create_emr_args(input_date, dev, cores, pipeline_yaml):
    print('just starting')
    load_package_config('config.yaml')
    YamlConfiguration(pipeline_yaml)
    input_prefix = read_list('pipeline.et_step.s3_prefixes')[0]
    input_file = input_prefix + input_date + '/part-*.gz'
    expected_args = EXPECTED_DEV_ARGS if dev else EXPECTED_AWS_ARGS
    expected_out_file = read_string('pipeline.s3_output_prefix')
    delimiter = read_string('redshift_column_delimiter')
    with mock.patch.dict(os.environ, {'LOGNAME': 'testuser', 'YELPCODE': '.'}):
        logname = os.environ['LOGNAME']
        expected_out_file = os.path.join(
            expected_out_file.format(logname=logname),
            input_date
        )
        extractions = pipeline_yaml_schema_file_path()
        formatted_args = expected_args.format(
            input_file, expected_out_file, cores, extractions, delimiter
        )
        # Use the parametrized core count so the expected string and the
        # actual call stay in sync.
        output_under_test = create_emr_args(input_date, cores,
                                            input_prefix, dev)
        assert output_under_test == formatted_args
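The four test arguments are supplied by the harness rather than defined in the test itself. A minimal sketch of how they might be wired up with pytest's parametrize decorator; the dates, core counts, and YAML filename below are hypothetical, not the project's real fixtures:

import pytest

@pytest.mark.parametrize('input_date, dev, cores, pipeline_yaml', [
    # Hypothetical cases; the real values belong to the project's suite.
    ('2014-04-01', True, 10, 'pipeline_io.yaml'),
    ('2014-04-01', False, 10, 'pipeline_io.yaml'),
])
def test_create_emr_args(input_date, dev, cores, pipeline_yaml):
    pass  # body as in the test above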
elif args.date is not None:
    # An explicitly requested date that is not a candidate was either
    # loaded already or its ET step has not finished yet.
    raise IOError(
        "{0} data is either already loaded "
        "or has not yet completed ET step".format(args.date)
    )

logs_to_copy = []
for input_date in data_candidates:
    LOG_STREAM = PipelineStreamLogger(
        stream_name,
        args.run_local,
        's3_to_redshift',
        job_name='load',
        input_date=input_date
    )
    # One (s3_source, table) pair per table to copy for this date.
    logs_to_copy = [
        (join(s3_log_prefix, input_date, table), table)
        for (table, _) in create_tuples
    ]
    copy_tables(loader_psql, status_table, db, input_date,
                logs_to_copy, args.ttl_days, LOG_STREAM)


if __name__ == '__main__':
    args_namespace = parse_command_line(sys.argv)
    # Layered configuration: base package config first, then the
    # required io_yaml, then an optional override that shadows both.
    load_package_config(args_namespace.config)
    YamlConfiguration(args_namespace.io_yaml, optional=False)
    if args_namespace.config_override:
        YamlConfiguration(args_namespace.config_override, optional=False)
    s3_to_redshift_main(args_namespace)
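The config helpers used throughout (`YamlConfiguration`, `read_string`, `read_list`) match PyStaticConfiguration's API. Assuming that library, values loaded later shadow values loaded earlier, which is what lets the override file win over io_yaml above. A self-contained sketch of that layering semantics, using `DictConfiguration` in place of YAML files so nothing needs to exist on disk (the key is taken from the test above):

import staticconf

# Later loads shadow earlier ones within the same namespace.
staticconf.DictConfiguration({'redshift_column_delimiter': '|'})
staticconf.DictConfiguration({'redshift_column_delimiter': '\x01'})
assert staticconf.read_string('redshift_column_delimiter') == '\x01'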
def merge_configs(fname_list):
    """Load the first file as the base package config, then layer the
    remaining files on top as optional overrides.

    Note: pops the first element, so the caller's list is mutated.
    """
    base_fname = fname_list.pop(0)
    config = load_package_config(base_fname)
    for fname in fname_list:
        YamlConfiguration(fname, optional=True)
    return config
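A minimal usage sketch for `merge_configs`; both filenames are hypothetical. The first entry is the required base config, and everything after it is layered on top with `optional=True`, so an override file that fails to load is logged and skipped rather than fatal:

config = merge_configs([
    'config.yaml',           # required base package config
    'config-env-dev.yaml',   # optional override; skipped if it fails to load
])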