Example #1
def test_create_emr_args(input_date, dev, cores, pipeline_yaml):
    print "just starting"
    load_package_config('config.yaml')
    YamlConfiguration(pipeline_yaml)

    input_prefix = read_list('pipeline.et_step.s3_prefixes')[0]
    input_file = input_prefix + input_date + '/part-*.gz'

    expected_args = EXPECTED_DEV_ARGS if dev else EXPECTED_AWS_ARGS
    expected_out_file = read_string('pipeline.s3_output_prefix')
    delimiter = read_string('redshift_column_delimiter')
    with mock.patch.dict(os.environ, {'LOGNAME': 'testuser', 'YELPCODE': '.'}):
        logname = os.environ['LOGNAME']
        expected_out_file = os.path.join(
            expected_out_file.format(logname=logname),
            input_date
        )
        extractions = pipeline_yaml_schema_file_path()
        formatted_args = expected_args.format(input_file,
                                              expected_out_file,
                                              cores,
                                              extractions,
                                              delimiter)
        output_under_test = create_emr_args(input_date, 10,
                                            input_prefix, dev)
        assert output_under_test == formatted_args
Example #2
def test_pipeline_yaml_schema_file_path_with_s3():
    filename = 's3://bucket/key/filename.yaml'
    config = {
        'pipeline.yaml_schema_file': filename
    }
    with staticconf.testing.MockConfiguration(config):
        assert pipeline_yaml_schema_file_path() == filename
Example #3
File: s3_to_psv.py  Project: Yelp/mycroft
def create_emr_args(date_with_slashes, cores, infile_prefix, local):
    """creates a string containing arguments for mr job

    inputs:
        date_with_slashes -- a date string of the form 'YYYY/MM/DD'
        cores -- the number of cores to use for a conversion
        infile_prefix -- the prefix to the search bucket
        local -- if True, use the run_local mrjob arg template; otherwise run_service

    outputs:
        string containing arguments used by ET mr job"""

    input_file = infile_prefix + date_with_slashes +\
        read_string('pipeline.et_step.s3_input_suffix')
    user_prefix = get_s3_output_user_prefix()
    output_file = os.path.join(user_prefix, date_with_slashes)

    if int(cores) > MAX_CORES:
        cores = MAX_CORES

    extractions = pipeline_yaml_schema_file_path()
    delimiter = read_string('redshift_column_delimiter')
    if local:
        template = read_string('run_local.mrjob_arg_template')
    else:
        template = read_string('run_service.mrjob_arg_template')

    return template.format(
        input_file, output_file, cores, extractions, delimiter
    )
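
The returned string is just the configured template with five positional fields filled in. A minimal sketch of that formatting step, using a hypothetical run_local.mrjob_arg_template value (the real template lives in the project's YAML config and will differ):

# Hypothetical template; the real value comes from
# read_string('run_local.mrjob_arg_template') in the YAML config.
template = "{0} --output-dir={1} --num-cores={2} --extractions={3} --column-delimiter={4}"

print(template.format(
    's3://bucket/logs/2014/05/01/part-*.gz',   # input_file
    's3://bucket/out/testuser/2014/05/01',     # output_file
    4,                                         # cores (capped at MAX_CORES)
    'schema/pipeline.yaml',                    # extractions
    '|',                                       # delimiter
))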
Example #4
def create_emr_args(date_with_slashes, cores, infile_prefix, local):
    """creates a string containing arguments for mr job

    inputs:
        date_with_slashes -- a date string of the form 'YYYY/MM/DD'
        cores -- the number of cores to use for a conversion
        infile_prefix -- the prefix to the search bucket
        local -- if True, use the run_local mrjob arg template; otherwise run_service

    outputs:
        string containing arguments used by ET mr job"""

    input_file = infile_prefix + date_with_slashes +\
        read_string('pipeline.et_step.s3_input_suffix')
    user_prefix = get_s3_output_user_prefix()
    output_file = os.path.join(user_prefix, date_with_slashes)

    if int(cores) > MAX_CORES:
        cores = MAX_CORES

    extractions = pipeline_yaml_schema_file_path()
    delimiter = read_string('redshift_column_delimiter')
    if local:
        template = read_string('run_local.mrjob_arg_template')
    else:
        template = read_string('run_service.mrjob_arg_template')

    return template.format(input_file, output_file, cores, extractions,
                           delimiter)
Example #5
def dates_from_rs_status(status_helper,
                         db,
                         logstream,
                         retry_on_err,
                         single_date=None):
    """
    dates_from_rs_status gets the jobs that have completed the et step, but
    have not started the load step, and have no jobs before them running or
    in error

    Args:
    status_helper -- a wrapper around a backing store to aid in CRUD
    db -- is the database we query
    logstream -- a PipelineStreamLogger
    retry_on_err -- a boolean, True if we're retrying on errors
    single_date -- date string of the form YYYY-MM-DD if we're \
        only looking for one

    Returns:
    a list of dates to catch up on formatted as strings YYYY/MM/DD
    """
    versions = get_yaml_table_versions(pipeline_yaml_schema_file_path())

    if single_date is not None:
        data_date = get_formatted_date(single_date)
        if data_date is None:
            handle_error("bad input date: {0}".format(single_date), logstream)
        start_datetime = datetime.strptime(data_date, "%Y/%m/%d")
        status_tuples = \
            status_helper.query_et_complete_job(db, versions, data_date)
    else:
        days_back = read_int('pipeline.load_step.days_to_check') + 1
        start_datetime = datetime.utcnow() - timedelta(days=days_back)
        status_tuples = \
            status_helper.query_et_complete_jobs(db, versions, start_datetime)

    if status_tuples is False:
        handle_error(
            "query for complete et job failed, version={0}, date={1}".format(
                versions,
                data_date if single_date is not None else start_datetime),
            logstream)

    candidates = []
    last_date = (start_datetime - timedelta(days=1)).strftime("%Y/%m/%d")
    for ddate, ld_status in status_tuples:
        if not one_day_greater(ddate, last_date):
            break
        elif ld_status is None or (ld_status == 'error' and retry_on_err):
            candidates.append(ddate)
        elif ld_status == 'error':
            break
        last_date = ddate
    candidate_string = "candidates dates for load: {0}".format(candidates)
    logstream.write_msg(status='running', extra_msg=candidate_string)
    return candidates
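
The loop admits only an unbroken run of consecutive dates and stops at the first gap or hard error. one_day_greater is not part of this listing; a minimal sketch of what such a helper could look like, assuming both arguments are 'YYYY/MM/DD' strings (an assumption, not the project's implementation):

from datetime import datetime, timedelta

def one_day_greater(ddate, last_date, fmt='%Y/%m/%d'):
    # True if ddate falls exactly one day after last_date.
    delta = datetime.strptime(ddate, fmt) - datetime.strptime(last_date, fmt)
    return delta == timedelta(days=1)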
Example #6
def test_pipeline_yaml_schema_file_path():
    directory = 'directory'
    filename = 'filename.yaml'
    config = {'pipeline.yaml_schema_file': filename}
    with mock.patch.dict('os.environ', {'YELPCODE': directory}):
        with staticconf.testing.MockConfiguration(config):
            assert pipeline_yaml_schema_file_path() == \
                '{directory}/{filename}'.format(
                    directory=directory,
                    filename=filename)
Example #7
def dates_from_rs_status(status_helper, db, logstream,
                         retry_on_err, single_date=None):
    """
    dates_from_rs_status gets the jobs that have completed the et step, but
    have not started the load step, and have no jobs before them running or
    in error

    Args:
    status_helper -- a wrapper around a backing store to aid in CRUD
    db -- is the database we query
    logstream -- a PipelineStreamLogger
    retry_on_err -- a boolean, True if we're retrying on errors
    single_date -- date string of the form YYYY-MM-DD if we're \
        only looking for one

    Returns:
    a list of dates to catch up on formatted as strings YYYY/MM/DD
    """
    versions = get_yaml_table_versions(pipeline_yaml_schema_file_path())

    if single_date is not None:
        data_date = get_formatted_date(single_date)
        if data_date is None:
            handle_error("bad input date: {0}".format(single_date), logstream)
        start_datetime = datetime.strptime(data_date, "%Y/%m/%d")
        status_tuples = \
            status_helper.query_et_complete_job(db, versions, data_date)
    else:
        days_back = read_int('pipeline.load_step.days_to_check') + 1
        start_datetime = datetime.utcnow() - timedelta(days=days_back)
        status_tuples = \
            status_helper.query_et_complete_jobs(db, versions, start_datetime)

    if status_tuples is False:
        handle_error(
            "query for complete et job failed, version={0}, date={1}".format(
                versions,
                data_date if single_date is not None else start_datetime
            ),
            logstream
        )

    candidates = []
    last_date = (start_datetime - timedelta(days=1)).strftime("%Y/%m/%d")
    for ddate, ld_status in status_tuples:
        if not one_day_greater(ddate, last_date):
            break
        elif ld_status is None or (ld_status == 'error' and retry_on_err):
            candidates.append(ddate)
        elif ld_status == 'error':
            break
        last_date = ddate
    candidate_string = "candidate dates for load: {0}".format(candidates)
    logstream.write_msg(status='running', extra_msg=candidate_string)
    return candidates
Example #8
def test_pipeline_yaml_schema_file_path():
    directory = 'directory'
    filename = 'filename.yaml'
    config = {
        'pipeline.yaml_schema_file': filename
    }
    with mock.patch.dict('os.environ', {'YELPCODE': directory}):
        with staticconf.testing.MockConfiguration(config):
            assert pipeline_yaml_schema_file_path() == \
                '{directory}/{filename}'.format(
                    directory=directory,
                    filename=filename)
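
Taken together, these tests pin down the behavior of the function under test: an s3:// value passes through untouched (Example #2), while a bare filename is joined under the YELPCODE directory (Examples #6 and #8). A minimal sketch consistent with both tests, assuming read_string comes from staticconf as the MockConfiguration usage suggests; this is an inference, not the project's actual implementation:

import os
from staticconf import read_string

def pipeline_yaml_schema_file_path():
    # s3 URIs are already absolute; anything else is resolved under $YELPCODE.
    filename = read_string('pipeline.yaml_schema_file')
    if filename.startswith('s3://'):
        return filename
    return '{directory}/{filename}'.format(
        directory=os.environ['YELPCODE'], filename=filename)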
Example #9
def test_create_emr_args(input_date, dev, cores, pipeline_yaml):
    print "just starting"
    load_package_config('config.yaml')
    YamlConfiguration(pipeline_yaml)

    input_prefix = read_list('pipeline.et_step.s3_prefixes')[0]
    input_file = input_prefix + input_date + '/part-*.gz'

    expected_args = EXPECTED_DEV_ARGS if dev else EXPECTED_AWS_ARGS
    expected_out_file = read_string('pipeline.s3_output_prefix')
    delimiter = read_string('redshift_column_delimiter')
    with mock.patch.dict(os.environ, {'LOGNAME': 'testuser', 'YELPCODE': '.'}):
        logname = os.environ['LOGNAME']
        expected_out_file = os.path.join(
            expected_out_file.format(logname=logname), input_date)
        extractions = pipeline_yaml_schema_file_path()
        formatted_args = expected_args.format(input_file, expected_out_file,
                                              cores, extractions, delimiter)
        output_under_test = create_emr_args(input_date, 10, input_prefix, dev)
        assert output_under_test == formatted_args
Example #10
def s3_to_redshift_main(args):

    db = read_string('pipeline.redshift_database')
    s3_log_prefix = read_string('pipeline.s3_output_prefix').format(
        logname=os.environ.get('LOGNAME', 'unknown')
    )

    # setup logging
    stream_name = read_string('pipeline.load_step.s3_to_redshift_stream')
    LOG_STREAM = PipelineStreamLogger(
        stream_name,
        args.run_local,
        's3_to_redshift',
        job_name='load'
    )

    # handle to redshift db
    loader_psql = RedshiftPostgres(
        LOG_STREAM, args.private, run_local=args.run_local
    )

    if args.skip_progress_in_redshift:
        status_table = DynamoDbStatusTable(
            LOG_STREAM, run_local=args.run_local
        )
    else:
        status_table = RedshiftStatusTable(loader_psql)

    create_tuples = get_table_creates(args.db_file, LOG_STREAM)

    data_candidates = dates_from_rs_status(
        status_table,
        db,
        LOG_STREAM,
        args.retry_errors,
        args.date,
    )
    if data_candidates:
        try:
            update_database_schema(
                loader_psql,
                db,
                data_candidates[0],
                s3_log_prefix,
                args.db_file,
                LOG_STREAM
            )
        except Exception as e:
            status_table.update_status(
                db,
                data_candidates[0],
                get_yaml_table_versions(pipeline_yaml_schema_file_path()),
                "error",
                start_time_secs=time.time(), error_msg=repr(e)
            )
            raise
    elif args.date is not None:
        raise IOError("{0} data is either already loaded \
or has not yet completed ET step".format(args.date))

    logs_to_copy = []
    for input_date in data_candidates:
        LOG_STREAM = PipelineStreamLogger(
            stream_name,
            args.run_local,
            's3_to_redshift',
            job_name='load',
            input_date=input_date
        )
        logs_to_copy = [
            (join(s3_log_prefix, input_date, table), table)
            for (table, _) in create_tuples
        ]
        copy_tables(loader_psql, status_table, db, input_date, logs_to_copy,
                    args.ttl_days, LOG_STREAM)
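
s3_to_redshift_main only touches attributes of args; the parser itself is not part of this listing. A hypothetical argparse setup covering exactly the attributes referenced above (option names, help strings, and defaults are assumptions for illustration):

import argparse

def parse_s3_to_redshift_args(argv=None):
    # Attribute names mirror what s3_to_redshift_main reads off `args`.
    parser = argparse.ArgumentParser(description='load ET output from S3 into Redshift')
    parser.add_argument('--run-local', dest='run_local', action='store_true')
    parser.add_argument('--private', help='credentials file for RedshiftPostgres (assumed)')
    parser.add_argument('--skip-progress-in-redshift', dest='skip_progress_in_redshift',
                        action='store_true')
    parser.add_argument('--db-file', dest='db_file', help='schema file passed to get_table_creates')
    parser.add_argument('--retry-errors', dest='retry_errors', action='store_true')
    parser.add_argument('--date', default=None, help='single date, YYYY-MM-DD')
    parser.add_argument('--ttl-days', dest='ttl_days', type=int, default=30)
    return parser.parse_args(argv)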
Example #11
def copy_tables(psql_helper, status_helper,
                db_name, ddate, log_tuples, ttl_days, logstream):
    """
    copy_tables takes a list of input log, table pairs and copies each
    input log to its corresponding input table

    Args:
    psql_helper -- a RedshiftPostgres object to help perform the copy
    status_helper -- An object handle to interact with status table
    db_name -- the name of the db to which we're copying
    ddate -- the date string of the data to be copied formatted YYYY/MM/DD
    log_tuples -- a list of (log, table) pairs
    ttl_days -- how many days to retain loaded data
    logstream -- a PipelineStreamLogger

    Returns:
    ---
    """
    start = time.time()
    yaml_versions = get_yaml_table_versions(pipeline_yaml_schema_file_path())
    status_helper.update_status(db_name, ddate, yaml_versions, "running")
    err_tbl_name, _ = RedShiftLogSchema().get_error_table()
    for log_tuple in log_tuples:
        result = False
        error_msg = None
        try:
            result = copy_table(psql_helper, db_name, ddate,
                                log_tuple, ttl_days, logstream)
        except KeyboardInterrupt:
            result = None
            raise
        except Exception:
            exc_type, exc_value, exc_tb = sys.exc_info()
            error_msg = "{0}".format({
                'crash_tb': ''.join(traceback.format_tb(exc_tb)),
                'crash_exc': traceback.format_exception_only(
                    exc_type, exc_value
                )[0].strip()
            })

            # ignore copy error if error table does not exist
            s3_log, rs_table = log_tuple
            if rs_table == err_tbl_name and \
               exc_value.args[0].find('The specified S3 prefix') != -1 and \
               exc_value.args[0].find('does not exist') != -1:
                result = None
        finally:
            if result is False:
                _, rs_table = log_tuple
                if error_msg is None:
                    error_msg = "failed copy {0} for date: {1}".format(
                        get_namespaced_tablename(rs_table), ddate
                    )
                status_helper.update_status(
                    db_name, ddate, yaml_versions,
                    "error", start_time_secs=start, error_msg=error_msg
                )
                handle_error(error_msg, logstream)
    status_helper.update_status(
        db_name, ddate, yaml_versions, "complete", start_time_secs=start
    )
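
copy_table itself is not included in this listing. At its core, loading one (s3_log, rs_table) pair into Redshift comes down to a COPY statement; the sketch below shows what such a statement could look like (credentials, ttl_days handling, and the namespaced table name are omitted, and the real helper may differ):

def build_copy_statement(rs_table, s3_log, delimiter='|'):
    # Basic Redshift COPY from an S3 prefix (GZIP shown for illustration).
    return (
        "COPY {table} FROM '{path}' DELIMITER '{delim}' GZIP"
        .format(table=rs_table, path=s3_log, delim=delimiter)
    )

# e.g. build_copy_statement('search_log', 's3://bucket/out/user/2014/05/01/search_log')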
Example #12
def test_pipeline_yaml_schema_file_path_with_s3():
    filename = 's3://bucket/key/filename.yaml'
    config = {'pipeline.yaml_schema_file': filename}
    with staticconf.testing.MockConfiguration(config):
        assert pipeline_yaml_schema_file_path() == filename
Example #13
def s3_to_redshift_main(args):

    db = read_string('pipeline.redshift_database')
    s3_log_prefix = read_string('pipeline.s3_output_prefix').format(
        logname=os.environ.get('LOGNAME', 'unknown'))

    # setup logging
    stream_name = read_string('pipeline.load_step.s3_to_redshift_stream')
    LOG_STREAM = PipelineStreamLogger(stream_name,
                                      args.run_local,
                                      's3_to_redshift',
                                      job_name='load')

    # handle to redshift db
    loader_psql = RedshiftPostgres(LOG_STREAM,
                                   args.private,
                                   run_local=args.run_local)

    if args.skip_progress_in_redshift:
        status_table = DynamoDbStatusTable(LOG_STREAM,
                                           run_local=args.run_local)
    else:
        status_table = RedshiftStatusTable(loader_psql)

    create_tuples = get_table_creates(args.db_file, LOG_STREAM)

    data_candidates = dates_from_rs_status(
        status_table,
        db,
        LOG_STREAM,
        args.retry_errors,
        args.date,
    )
    if data_candidates:
        try:
            update_database_schema(loader_psql, db, data_candidates[0],
                                   s3_log_prefix, args.db_file, LOG_STREAM)
        except Exception as e:
            status_table.update_status(db,
                                       data_candidates[0],
                                       get_yaml_table_versions(
                                           pipeline_yaml_schema_file_path()),
                                       "error",
                                       start_time_secs=time.time(),
                                       error_msg=repr(e))
            raise
    elif args.date is not None:
        raise IOError("{0} data is either already loaded \
or has not yet completed ET step".format(args.date))

    logs_to_copy = []
    for input_date in data_candidates:
        LOG_STREAM = PipelineStreamLogger(stream_name,
                                          args.run_local,
                                          's3_to_redshift',
                                          job_name='load',
                                          input_date=input_date)
        logs_to_copy = [(join(s3_log_prefix, input_date, table), table)
                        for (table, _) in create_tuples]
        copy_tables(loader_psql, status_table, db, input_date, logs_to_copy,
                    args.ttl_days, LOG_STREAM)
Example #14
def copy_tables(psql_helper, status_helper, db_name, ddate, log_tuples,
                ttl_days, logstream):
    """
    copy_tables takes a list of input log, table pairs and copies each
    input log to its corresponding input table

    Args:
    psql_helper -- a RedshiftPostgres object to help perform the copy
    status_helper -- An object handle to interact with status table
    db_name -- the name of the db to which we're copying
    ddate -- the date string of the data to be copied formatted YYYY/MM/DD
    log_tuples -- a list of (log, table) pairs
    ttl_days -- how many days to retain loaded data
    logstream -- a PipelineStreamLogger

    Returns:
    ---
    """
    start = time.time()
    yaml_versions = get_yaml_table_versions(pipeline_yaml_schema_file_path())
    status_helper.update_status(db_name, ddate, yaml_versions, "running")
    err_tbl_name, _ = RedShiftLogSchema().get_error_table()
    for log_tuple in log_tuples:
        result = False
        error_msg = None
        try:
            result = copy_table(psql_helper, db_name, ddate, log_tuple,
                                ttl_days, logstream)
        except KeyboardInterrupt:
            result = None
            raise
        except Exception:
            exc_type, exc_value, exc_tb = sys.exc_info()
            error_msg = "{0}".format({
                'crash_tb':
                ''.join(traceback.format_tb(exc_tb)),
                'crash_exc':
                traceback.format_exception_only(exc_type,
                                                exc_value)[0].strip()
            })

            # ignore copy error if error table does not exist
            s3_log, rs_table = log_tuple
            if rs_table == err_tbl_name and \
               exc_value.args[0].find('The specified S3 prefix') != -1 and \
               exc_value.args[0].find('does not exist') != -1:
                result = None
        finally:
            if result is False:
                _, rs_table = log_tuple
                if error_msg is None:
                    error_msg = "failed copy {0} for date: {1}".format(
                        get_namespaced_tablename(rs_table), ddate)
                status_helper.update_status(db_name,
                                            ddate,
                                            yaml_versions,
                                            "error",
                                            start_time_secs=start,
                                            error_msg=error_msg)
                handle_error(error_msg, logstream)
    status_helper.update_status(db_name,
                                ddate,
                                yaml_versions,
                                "complete",
                                start_time_secs=start)
Example #15
File: s3_to_psv.py  Project: Yelp/mycroft
def __load_data_from_s3(
        status_helper, prefixes, date_with_slashes,
        mrjob_path, local, db_name, logstream, force_et=False
        ):
    """
    __load_data_from_s3 iterates over prefixes and loads data for a
    particular date for the first prefix where the data exists.  It also
    checks whether data has already been loaded for a date and if so, skips
    the load

    Args:
    status_helper -- An object handle to interact with status table
    prefixes -- a list of s3 prefixes for input data
    date_with_slashes -- a date string of the form 'YYYY/MM/DD'
    mrjob_path -- module.entry_point of the job to extract and \
        transform the data
    local -- True if we're running locally (i.e., devc), False for aws instance
    db_name -- the database used for status bookkeeping
    logstream -- a PipelineStreamLogger
    force_et -- passed through to get_next_dir_to_load

    Returns:
    ---
    """
    start_time = time.time()

    table_versions = get_yaml_table_versions(pipeline_yaml_schema_file_path())
    conditions = {
        'table_versions': table_versions,
        'data_date': date_with_slashes
    }
    if status_helper.et_started(conditions, db_name):
        logstream.write_msg(
            "complete",
            extra_msg="skipping: et_step already started"
        )
        return

    prefix_for_this_data = get_next_dir_to_load(
        prefixes, date_with_slashes, local, logstream, force_et
    )
    if not prefix_for_this_data:
        jobtime = 0
        err_msg = "no prefix available date={0}, prefixes={1}".format(
            date_with_slashes, prefixes
        )
        logstream.write_msg("error", error_msg=err_msg)
        status_helper.log_status_result(
            conditions,
            jobtime,
            db_name,
            failed=True, err_msg=err_msg
        )
        raise Exception(err_msg)

    # check if mrjob is already done
    data_we_check = "{0} {1} {2}".format(
        get_s3_output_user_prefix(),
        date_with_slashes,
        local
    )
    logstream.write_msg("running", extra_msg=data_we_check)
    if data_available(
        get_s3_output_user_prefix(),
        date_with_slashes,
        local,
        done_file_name='_SUCCESS'
    ):
        logstream.write_msg(
            "complete",
            extra_msg="skipping: et_step already done"
        )
        return

    jobtime = time.time()
    mrjob_args = create_emr_args(
        date_with_slashes,
        read_int('pipeline.et_step.cores'),
        prefix_for_this_data, local
    )
    status_helper.insert_et(conditions, db_name)
    logstream.write_msg("running", extra_msg=mrjob_args)

    result, err_reason = __run_mr_job(mrjob_path, mrjob_args, logstream)
    failed = not result

    jobtime = time.time() - start_time
    status_helper.log_status_result(
        conditions, jobtime, db_name,
        failed=failed, err_msg=err_reason
    )
    if failed:
        raise Exception(err_reason)
    return
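
A hypothetical driver in the same module (s3_to_psv.py) would call this helper once per candidate date; the loop below is a usage sketch, assuming it sits alongside __load_data_from_s3 so read_list and the helper are in scope:

def run_et_for_dates(status_helper, dates, mrjob_path, local, db_name, logstream):
    # Each date is formatted 'YYYY/MM/DD'; prefixes come from the pipeline config.
    prefixes = read_list('pipeline.et_step.s3_prefixes')
    for date_with_slashes in dates:
        __load_data_from_s3(status_helper, prefixes, date_with_slashes,
                            mrjob_path, local, db_name, logstream)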
Example #16
def __load_data_from_s3(status_helper,
                        prefixes,
                        date_with_slashes,
                        mrjob_path,
                        local,
                        db_name,
                        logstream,
                        force_et=False):
    """
    __load_data_from_s3 iterates over prefixes and loads data for a
    particular date for the first prefix where the data exists.  It also
    checks whether data has already been loaded for a date and if so, skips
    the load

    Args:
    status_helper -- An object handle to interact with status table
    prefixes -- a list of s3 prefixes for input data
    date_with_slashes -- a date string of the form 'YYYY/MM/DD'
    mrjob_path -- module.entry_point of the job to extract and \
        transform the data
    local -- True if we're running locally (i.e., devc), False for aws instance
    db_name -- the database used for status bookkeeping
    logstream -- a PipelineStreamLogger
    force_et -- passed through to get_next_dir_to_load

    Returns:
    ---
    """
    start_time = time.time()

    table_versions = get_yaml_table_versions(pipeline_yaml_schema_file_path())
    conditions = {
        'table_versions': table_versions,
        'data_date': date_with_slashes
    }
    if status_helper.et_started(conditions, db_name):
        logstream.write_msg("complete",
                            extra_msg="skipping: et_step already started")
        return

    prefix_for_this_data = get_next_dir_to_load(prefixes, date_with_slashes,
                                                local, logstream, force_et)
    if not prefix_for_this_data:
        jobtime = 0
        err_msg = "no prefix available date={0}, prefixes={1}".format(
            date_with_slashes, prefixes)
        logstream.write_msg("error", error_msg=err_msg)
        status_helper.log_status_result(conditions,
                                        jobtime,
                                        db_name,
                                        failed=True,
                                        err_msg=err_msg)
        raise Exception(err_msg)

    # check if mrjob is already done
    data_we_check = "{0} {1} {2}".format(get_s3_output_user_prefix(),
                                         date_with_slashes, local)
    logstream.write_msg("running", extra_msg=data_we_check)
    if data_available(get_s3_output_user_prefix(),
                      date_with_slashes,
                      local,
                      done_file_name='_SUCCESS'):
        logstream.write_msg("complete",
                            extra_msg="skipping: et_step already done")
        return

    jobtime = time.time()
    mrjob_args = create_emr_args(date_with_slashes,
                                 read_int('pipeline.et_step.cores'),
                                 prefix_for_this_data, local)
    status_helper.insert_et(conditions, db_name)
    logstream.write_msg("running", extra_msg=mrjob_args)

    result, err_reason = __run_mr_job(mrjob_path, mrjob_args, logstream)
    failed = not result

    jobtime = time.time() - start_time
    status_helper.log_status_result(conditions,
                                    jobtime,
                                    db_name,
                                    failed=failed,
                                    err_msg=err_reason)
    if failed:
        raise Exception(err_reason)
    return