def _get_logger(run_local, tag):
    try:
        return PipelineStreamLogger(
            staticconf.read_string("log_stream_name"),
            run_local,
            tag,
        )
    except Exception:
        logger.write_msg("Error creating a pipeline stream logger!")
        return logger  # Return the existing module-level logger in case of errors
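# A minimal usage sketch, not from the original module: _get_logger() returns a
# PipelineStreamLogger when the "log_stream_name" key is configured and falls back
# to the module-level logger otherwise. The tag and message values are placeholders.
stream_logger = _get_logger(run_local=True, tag='example_tag')
stream_logger.write_msg('starting', extra_msg='example run')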
def test_pipeline_json():
    logger = PipelineStreamLogger(None, False, 'test')
    now = time.time()
    js = logger._pipeline_json(
        'complete',
        error_msg='error!',
        extra_msg='extra',
        job_start_secs=now,
    )
    json_dict = simplejson.loads(js)
    assert json_dict['msg']['status'] == 'complete'
    assert json_dict['msg']['additional_info'] == 'extra'
    # if the lines above take longer than 10 seconds, something's wrong
    assert json_dict['msg']['job_time'] < 10
    assert json_dict['tag'] == 'test'
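# Approximate payload shape implied by the assertions above; the field names come
# from the test itself, but any additional keys (e.g. one carrying error_msg) are
# assumptions, not confirmed by the test.
expected_shape = {
    'tag': 'test',
    'msg': {
        'status': 'complete',
        'additional_info': 'extra',
        'job_time': 0.01,  # seconds elapsed since job_start_secs
    },
}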
def rs_check_schema(rs_mgmt, args):
    yaml_data = load_from_file(args.schema)
    tables = RedShiftLogSchema(safe_load(yaml_data)).tables()
    db = read_string('pipeline.redshift_database')
    log_stream = read_string('pipeline.load_step.s3_to_redshift_stream')
    pipe_strm_lgr = PipelineStreamLogger(
        log_stream,
        True,
        'rs_check_schema',
    )
    psql = RedshiftPostgres(pipe_strm_lgr, args.credentials, run_local=True)
    rs_check_table_def(psql, db, tables, args.redshift_schema)
    rs_check_table_rows(psql, db, tables, args.redshift_schema)
def s3_to_psv_main(args):
    mrjob = read_string('pipeline.et_step.mrjob')
    stream_name = read_string('pipeline.et_step.s3_to_s3_stream')
    DATABASE = read_string('pipeline.redshift_database')
    LOG_STREAM = PipelineStreamLogger(
        stream_name, args.run_local, mrjob, input_date=args.date
    )
    day_to_run = setup_dates_to_check(args.date, args.run_local, LOG_STREAM)
    try:
        if not args.run_local:
            setup_private(args.private)
        # Choose the status-table backend based on args
        if args.skip_progress_in_redshift:
            status_table = DynamoDbStatusTable(
                LOG_STREAM, run_local=args.run_local
            )
        else:
            status_table = RedshiftStatusTable(
                RedshiftPostgres(LOG_STREAM, args.private, run_local=args.run_local)
            )
        load_msg = __load_data_from_s3(
            status_table,
            read_list('pipeline.et_step.s3_prefixes'),
            day_to_run,
            mrjob,
            args.run_local,
            DATABASE,
            LOG_STREAM,
            force_et=args.force_et,
        )
        LOG_STREAM.write_msg("complete", extra_msg=load_msg)
    finally:
        clear_env(args.run_local)
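# A sketch for local experimentation, assuming the configuration keys that
# s3_to_psv_main() reads via read_string()/read_list() can be registered straight
# from a dict with staticconf; the values below are placeholders, not the real
# pipeline settings.
import staticconf

staticconf.DictConfiguration({
    'pipeline.et_step.mrjob': 'example_mrjob',
    'pipeline.et_step.s3_to_s3_stream': 'example-et-stream',
    'pipeline.et_step.s3_prefixes': ['s3://example-bucket/logs/'],
    'pipeline.redshift_database': 'example_db',
})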
def test_handle_error(input_value):
    test_logger = PipelineStreamLogger("test_stream", False, "test_tag")
    with pytest.raises(Exception):
        handle_error(input_value, test_logger)
def s3_to_redshift_main(args):
    db = read_string('pipeline.redshift_database')
    s3_log_prefix = read_string('pipeline.s3_output_prefix').format(
        logname=os.environ.get('LOGNAME', 'unknown')
    )

    # setup logging
    stream_name = read_string('pipeline.load_step.s3_to_redshift_stream')
    LOG_STREAM = PipelineStreamLogger(
        stream_name, args.run_local, 's3_to_redshift', job_name='load'
    )

    # handle to redshift db
    loader_psql = RedshiftPostgres(
        LOG_STREAM, args.private, run_local=args.run_local
    )

    if args.skip_progress_in_redshift:
        status_table = DynamoDbStatusTable(
            LOG_STREAM, run_local=args.run_local
        )
    else:
        status_table = RedshiftStatusTable(loader_psql)

    create_tuples = get_table_creates(args.db_file, LOG_STREAM)
    data_candidates = dates_from_rs_status(
        status_table,
        db,
        LOG_STREAM,
        args.retry_errors,
        args.date,
    )
    if data_candidates:
        try:
            update_database_schema(
                loader_psql,
                db,
                data_candidates[0],
                s3_log_prefix,
                args.db_file,
                LOG_STREAM,
            )
        except Exception as e:
            status_table.update_status(
                db,
                data_candidates[0],
                get_yaml_table_versions(pipeline_yaml_schema_file_path()),
                "error",
                start_time_secs=time.time(),
                error_msg=repr(e),
            )
            raise
    elif args.date is not None:
        raise IOError(
            "{0} data is either already loaded "
            "or has not yet completed ET step".format(args.date)
        )

    logs_to_copy = []
    for input_date in data_candidates:
        LOG_STREAM = PipelineStreamLogger(
            stream_name,
            args.run_local,
            's3_to_redshift',
            job_name='load',
            input_date=input_date,
        )
        logs_to_copy = [
            (join(s3_log_prefix, input_date, table), table)
            for (table, _) in create_tuples
        ]
        copy_tables(
            loader_psql,
            status_table,
            db,
            input_date,
            logs_to_copy,
            args.ttl_days,
            LOG_STREAM,
        )
def analyze_tables(psql, db, tables, schemaname=DEFAULT_NAMESPACE):
    num_failures = 0
    for tbl_name in tables:
        tbl_name = get_namespaced_tablename(tbl_name, schemaname)
        try:
            analyze_table(psql, db, tbl_name)
        except Exception:
            num_failures += 1
    if num_failures:
        raise RuntimeError(
            'failed to analyze {0} tables, see log'.format(num_failures)
        )


if __name__ == "__main__":
    args = get_cmd_line_args()
    run_local = args.run_local
    merge_configs(args.config)
    db = read_string('pipeline.redshift_database')
    log_stream = read_string('pipeline.load_step.s3_to_redshift_stream')
    logstream = PipelineStreamLogger(log_stream, run_local, 'redshift_maint')
    psql = RedshiftPostgres(logstream, args.credentials, run_local=run_local)
    yaml = load_from_file(args.schema)
    schema = RedShiftLogSchema(safe_load(yaml))
    if args.compact:
        compact_tables(psql, db, schema.tables(), args.redshift_schema)
    analyze_tables(psql, db, schema.tables(), args.redshift_schema)