def test_create_emr_args(input_date, dev, cores, pipeline_yaml):
    print('just starting')
    load_package_config('config.yaml')
    YamlConfiguration(pipeline_yaml)
    input_prefix = read_list('pipeline.et_step.s3_prefixes')[0]
    input_file = input_prefix + input_date + '/part-*.gz'
    expected_args = EXPECTED_DEV_ARGS if dev else EXPECTED_AWS_ARGS
    expected_out_file = read_string('pipeline.s3_output_prefix')
    delimiter = read_string('redshift_column_delimiter')
    with mock.patch.dict(os.environ, {'LOGNAME': 'testuser', 'YELPCODE': '.'}):
        logname = os.environ['LOGNAME']
        expected_out_file = os.path.join(
            expected_out_file.format(logname=logname), input_date
        )
        extractions = pipeline_yaml_schema_file_path()
        formatted_args = expected_args.format(input_file, expected_out_file,
                                              cores, extractions, delimiter)
        # pass the parametrized cores value, so the output matches the
        # expected args built with the same value above
        output_under_test = create_emr_args(input_date, cores, input_prefix,
                                            dev)
        assert output_under_test == formatted_args
def test_pipeline_yaml_schema_file_path_with_s3():
    filename = 's3://bucket/key/filename.yaml'
    config = {'pipeline.yaml_schema_file': filename}
    with staticconf.testing.MockConfiguration(config):
        assert pipeline_yaml_schema_file_path() == filename
def create_emr_args(date_with_slashes, cores, infile_prefix, local):
    """creates a string containing arguments for mr job

    inputs:
    date_with_slashes -- a date string of the form 'YYYY/MM/DD'
    cores -- the number of cores to use for a conversion
    infile_prefix -- the prefix of the search bucket
    local -- True to use the local arg template, False to use the
        service template

    outputs:
    string containing arguments used by ET mr job
    """
    input_file = (infile_prefix + date_with_slashes +
                  read_string('pipeline.et_step.s3_input_suffix'))
    user_prefix = get_s3_output_user_prefix()
    output_file = os.path.join(user_prefix, date_with_slashes)
    if int(cores) > MAX_CORES:
        cores = MAX_CORES

    extractions = pipeline_yaml_schema_file_path()
    delimiter = read_string('redshift_column_delimiter')
    if local:
        template = read_string('run_local.mrjob_arg_template')
    else:
        template = read_string('run_service.mrjob_arg_template')

    return template.format(
        input_file, output_file, cores, extractions, delimiter
    )
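# A minimal sketch of how the arg template above expands. The template string
# here is hypothetical (real templates come from run_local.mrjob_arg_template
# / run_service.mrjob_arg_template in the YAML config), but the positional
# fields line up with the template.format(...) call in create_emr_args.
def _example_emr_args():
    template = ('--input {0} --output {1} --num-cores {2} '
                '--extractions {3} --column-delimiter {4}')
    return template.format(
        's3://bucket/logs/2014/06/01/part-*.gz',  # input_file
        's3://bucket/out/testuser/2014/06/01',    # output_file
        10,                                       # cores (capped at MAX_CORES)
        'schema/db.yaml',                         # extractions yaml path
        '|',                                      # redshift column delimiter
    )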
def dates_from_rs_status(status_helper, db, logstream, retry_on_err,
                         single_date=None):
    """
    dates_from_rs_status gets the jobs that have completed the et step,
    but have not started the load step, and have no jobs before them
    running or in error

    Args:
    status_helper -- a wrapper around a backing store to aid in CRUD
    db -- the database we query
    logstream -- a PipelineStreamLogger
    retry_on_err -- a boolean, True if we're retrying on errors
    single_date -- date string of the form YYYY-MM-DD if we're only
        looking for one date

    Returns:
    a list of dates to catch up on, formatted as strings YYYY/MM/DD
    """
    versions = get_yaml_table_versions(pipeline_yaml_schema_file_path())

    if single_date is not None:
        data_date = get_formatted_date(single_date)
        if data_date is None:
            handle_error("bad input date: {0}".format(single_date), logstream)
        start_datetime = datetime.strptime(data_date, "%Y/%m/%d")
        status_tuples = status_helper.query_et_complete_job(
            db, versions, data_date
        )
    else:
        days_back = read_int('pipeline.load_step.days_to_check') + 1
        start_datetime = datetime.utcnow() - timedelta(days=days_back)
        status_tuples = status_helper.query_et_complete_jobs(
            db, versions, start_datetime
        )

    if status_tuples is False:
        handle_error(
            "query for complete et job failed, version={0}, date={1}".format(
                versions,
                data_date if single_date is not None else start_datetime
            ),
            logstream
        )

    candidates = []
    last_date = (start_datetime - timedelta(days=1)).strftime("%Y/%m/%d")
    for ddate, ld_status in status_tuples:
        if not one_day_greater(ddate, last_date):
            break
        elif ld_status is None or (ld_status == 'error' and retry_on_err):
            candidates.append(ddate)
        elif ld_status == 'error':
            break
        last_date = ddate

    candidate_string = "candidate dates for load: {0}".format(candidates)
    logstream.write_msg(status='running', extra_msg=candidate_string)
    return candidates
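# one_day_greater is used by the loop above but not defined in this file. A
# plausible sketch of the contract the loop depends on (hypothetical; the
# real helper lives elsewhere): True only when ddate is exactly one calendar
# day after last_date, which is what makes the scan stop at the first gap in
# the sequence of completed dates.
def _one_day_greater_sketch(ddate, last_date):
    from datetime import datetime, timedelta
    fmt = "%Y/%m/%d"
    delta = datetime.strptime(ddate, fmt) - datetime.strptime(last_date, fmt)
    return delta == timedelta(days=1)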
def test_pipeline_yaml_schema_file_path():
    directory = 'directory'
    filename = 'filename.yaml'
    config = {'pipeline.yaml_schema_file': filename}
    with mock.patch.dict('os.environ', {'YELPCODE': directory}):
        with staticconf.testing.MockConfiguration(config):
            assert pipeline_yaml_schema_file_path() == \
                '{directory}/{filename}'.format(
                    directory=directory, filename=filename
                )
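# What the two path tests in this file pin down about
# pipeline_yaml_schema_file_path, written out as a sketch (inferred from the
# tests, not copied from the implementation): an s3:// value is returned
# verbatim, while a relative value is anchored under the $YELPCODE checkout.
def _pipeline_yaml_schema_file_path_sketch():
    import os
    filename = read_string('pipeline.yaml_schema_file')
    if filename.startswith('s3://'):
        return filename
    return '{0}/{1}'.format(os.environ['YELPCODE'], filename)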
def s3_to_redshift_main(args):
    db = read_string('pipeline.redshift_database')
    s3_log_prefix = read_string('pipeline.s3_output_prefix').format(
        logname=os.environ.get('LOGNAME', 'unknown')
    )

    # set up logging
    stream_name = read_string('pipeline.load_step.s3_to_redshift_stream')
    LOG_STREAM = PipelineStreamLogger(
        stream_name,
        args.run_local,
        's3_to_redshift',
        job_name='load'
    )

    # handle to the redshift db
    loader_psql = RedshiftPostgres(
        LOG_STREAM, args.private, run_local=args.run_local
    )

    if args.skip_progress_in_redshift:
        status_table = DynamoDbStatusTable(
            LOG_STREAM, run_local=args.run_local
        )
    else:
        status_table = RedshiftStatusTable(loader_psql)

    create_tuples = get_table_creates(args.db_file, LOG_STREAM)
    data_candidates = dates_from_rs_status(
        status_table,
        db,
        LOG_STREAM,
        args.retry_errors,
        args.date,
    )
    if data_candidates:
        try:
            update_database_schema(
                loader_psql,
                db,
                data_candidates[0],
                s3_log_prefix,
                args.db_file,
                LOG_STREAM
            )
        except Exception as e:
            status_table.update_status(
                db,
                data_candidates[0],
                get_yaml_table_versions(pipeline_yaml_schema_file_path()),
                "error",
                start_time_secs=time.time(),
                error_msg=repr(e)
            )
            raise
    elif args.date is not None:
        raise IOError(
            "{0} data is either already loaded "
            "or has not yet completed ET step".format(args.date)
        )

    logs_to_copy = []
    for input_date in data_candidates:
        LOG_STREAM = PipelineStreamLogger(
            stream_name,
            args.run_local,
            's3_to_redshift',
            job_name='load',
            input_date=input_date
        )
        logs_to_copy = [
            (join(s3_log_prefix, input_date, table), table)
            for (table, _) in create_tuples
        ]
        copy_tables(loader_psql, status_table, db, input_date,
                    logs_to_copy, args.ttl_days, LOG_STREAM)
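# Shape of the work list handed to copy_tables, with made-up bucket and table
# names: each entry pairs the S3 location of one table's extracted data for
# the date being loaded with the Redshift table it is copied into.
_example_logs_to_copy = [
    ('s3://bucket/out/testuser/2014/06/01/ad_clicks', 'ad_clicks'),
    ('s3://bucket/out/testuser/2014/06/01/reviews', 'reviews'),
]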
def copy_tables(psql_helper, status_helper, db_name, ddate, log_tuples,
                ttl_days, logstream):
    """
    copy_tables takes a list of input (log, table) pairs and copies each
    input log to its corresponding table

    Args:
    psql_helper -- a RedshiftPostgres object to help perform the copy
    status_helper -- an object handle to interact with the status table
    db_name -- the name of the db to which we're copying
    ddate -- the date string of the data to be copied, formatted YYYY/MM/DD
    log_tuples -- a list of (log, table) pairs
    ttl_days -- how many days to retain loaded data
    logstream -- a PipelineStreamLogger

    Returns:
    ---
    """
    start = time.time()
    yaml_versions = get_yaml_table_versions(pipeline_yaml_schema_file_path())
    status_helper.update_status(db_name, ddate, yaml_versions, "running")
    err_tbl_name, _ = RedShiftLogSchema().get_error_table()
    for log_tuple in log_tuples:
        result = False
        error_msg = None
        try:
            result = copy_table(psql_helper, db_name, ddate, log_tuple,
                                ttl_days, logstream)
        except KeyboardInterrupt:
            result = None
            raise
        except Exception:
            exc_type, exc_value, exc_tb = sys.exc_info()
            error_msg = "{0}".format({
                'crash_tb': ''.join(traceback.format_tb(exc_tb)),
                'crash_exc': traceback.format_exception_only(
                    exc_type, exc_value
                )[0].strip()
            })
            # ignore the copy error if the error table does not exist
            _, rs_table = log_tuple
            if rs_table == err_tbl_name and \
                    exc_value.args[0].find('The specified S3 prefix') != -1 and \
                    exc_value.args[0].find('does not exist') != -1:
                result = None
        finally:
            if result is False:
                _, rs_table = log_tuple
                if error_msg is None:
                    error_msg = "failed copy {0} for date: {1}".format(
                        get_namespaced_tablename(rs_table), ddate
                    )
                status_helper.update_status(
                    db_name, ddate, yaml_versions, "error",
                    start_time_secs=start, error_msg=error_msg
                )
                handle_error(error_msg, logstream)
    status_helper.update_status(
        db_name, ddate, yaml_versions, "complete", start_time_secs=start
    )
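# A hypothetical in-memory stand-in for RedshiftStatusTable /
# DynamoDbStatusTable, sketched only to make the status lifecycle above
# concrete: copy_tables records "running" before the loop, "error" for each
# failed table, and "complete" once every (log, table) pair has been copied.
# The update_status signature mirrors the calls copy_tables actually makes.
class _InMemoryStatusHelperSketch(object):
    def __init__(self):
        self.transitions = []

    def update_status(self, db_name, ddate, yaml_versions, status,
                      start_time_secs=None, error_msg=None):
        # append a (date, status, error) record instead of writing to a
        # backing store
        self.transitions.append((ddate, status, error_msg))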
def __load_data_from_s3(status_helper, prefixes, date_with_slashes,
                        mrjob_path, local, db_name, logstream,
                        force_et=False):
    """
    __load_data_from_s3 iterates over prefixes and loads data for a
    particular date for the first prefix where the data exists.  It also
    checks whether data has already been loaded for a date and if so,
    skips the load

    Args:
    status_helper -- an object handle to interact with the status table
    prefixes -- a list of s3 prefixes for input data
    date_with_slashes -- a date string of the form 'YYYY/MM/DD'
    mrjob_path -- module.entry_point of the job to extract and
        transform the data
    local -- True if we're running locally (i.e., devc), False for an
        aws instance
    db_name -- the name of the db whose status table we update
    logstream -- a PipelineStreamLogger
    force_et -- if True, force the et step (passed through to
        get_next_dir_to_load)

    Returns:
    ---
    """
    start_time = time.time()
    table_versions = get_yaml_table_versions(pipeline_yaml_schema_file_path())
    conditions = {
        'table_versions': table_versions,
        'data_date': date_with_slashes
    }
    if status_helper.et_started(conditions, db_name):
        logstream.write_msg(
            "complete", extra_msg="skipping: et_step already started"
        )
        return

    prefix_for_this_data = get_next_dir_to_load(
        prefixes, date_with_slashes, local, logstream, force_et
    )
    if not prefix_for_this_data:
        jobtime = 0
        err_msg = "no prefix available date={0}, prefixes={1}".format(
            date_with_slashes, prefixes
        )
        logstream.write_msg("error", error_msg=err_msg)
        status_helper.log_status_result(
            conditions, jobtime, db_name, failed=True, err_msg=err_msg
        )
        raise Exception(err_msg)

    # check if the mrjob is already done
    data_we_check = "{0} {1} {2}".format(
        get_s3_output_user_prefix(), date_with_slashes, local
    )
    logstream.write_msg("running", extra_msg=data_we_check)
    if data_available(
        get_s3_output_user_prefix(),
        date_with_slashes,
        local,
        done_file_name='_SUCCESS'
    ):
        logstream.write_msg(
            "complete", extra_msg="skipping: et_step already done"
        )
        return

    jobtime = time.time()
    mrjob_args = create_emr_args(
        date_with_slashes,
        read_int('pipeline.et_step.cores'),
        prefix_for_this_data,
        local
    )
    status_helper.insert_et(conditions, db_name)
    logstream.write_msg("running", extra_msg=mrjob_args)
    result, err_reason = __run_mr_job(mrjob_path, mrjob_args, logstream)
    failed = not result
    jobtime = time.time() - start_time
    status_helper.log_status_result(
        conditions, jobtime, db_name, failed=failed, err_msg=err_reason
    )
    if failed:
        raise Exception(err_reason)
    return