Example #1
def pgsql(request):
    YamlConfiguration('tests/batch/dummy_config.yaml')
    from sherlock.common.redshift_psql import RedshiftPostgres
    logstream = 'dummy_stream'
    psql_auth_file = 'tests/batch/dummy_user_file.txt'
    with mock.patch('sherlock.common.aws.fetch_creds',
                    autospec=True) as fetch_creds:
        fetch_creds.return_value = {
            "Code": "Success",
            "LastUpdated": "2014-03-12T17:17:07Z",
            "Type": "AWS-HMAC",
            "AccessKeyId": "My Access Key Id",
            "SecretAccessKey": "My Secret Access Key",
            "Token": "My Token",
            "Expiration": (datetime.utcnow() +
                           timedelta(seconds=60)).strftime("%Y-%m-%dT%H:%M:%SZ")
        }
        run_local = request.param
        if run_local is True:
            # local file contains expired creds, test assertion code
            with pytest.raises(Exception):
                RedshiftPostgres(logstream, psql_auth_file, run_local)
        return RedshiftPostgres(logstream, psql_auth_file, run_local=False)
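
Because the fixture reads request.param, it is meant to be parametrized; below is a minimal sketch of how it might be declared and consumed in a test. The params list, the decorator, and the consuming test are assumptions for illustration, not taken from the source.

import pytest

@pytest.fixture(params=[True, False])
def pgsql(request):
    ...  # body as in Example #1 above

def test_redshift_creds_are_mocked(pgsql):
    # the fixture returns a RedshiftPostgres built with run_local=False
    assert pgsql.run_local is False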
Example #2
def load_io_yaml_from_args(io_yaml_arg):
    # Load either a YAML file or a dictionary string for io_yaml
    # A plain file path is not a valid Python literal, so fall back to the
    # raw string when literal_eval rejects it.
    try:
        arg = ast.literal_eval(io_yaml_arg)
    except (ValueError, SyntaxError):
        arg = io_yaml_arg

    if isinstance(arg, str):
        YamlConfiguration(arg, optional=False)
    elif isinstance(arg, dict):
        DictConfiguration(arg, optional=False)
    else:
        raise ValueError(
            "Arg io_yaml is not a file path or a dictionary, was: " +
            str(type(arg)))
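
Since the loader accepts either a file path or the repr of a dictionary, the two call forms below exercise the two branches. The argument values are hypothetical and only illustrate the dispatch; YamlConfiguration would still need the named file to exist.

load_io_yaml_from_args('pipeline_io.yaml')                    # falls back to the raw string, loads a YAML file
load_io_yaml_from_args("{'pipeline': {'load_step': 's3'}}")   # parsed by ast.literal_eval into a dict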
Example #3
    def __init__(self, logstrm, psql_auth_file, run_local=False):

        self.run_local = run_local
        self.host = staticconf.read_string('redshift_host')
        self.port = staticconf.read_int('redshift_port')
        private_dict = YamlConfiguration(psql_auth_file)
        self.user = private_dict['redshift_user']
        self.password = private_dict['redshift_password']
        self.log_stream = logstrm
        self._aws_key = ''
        self._aws_secret = ''
        self._aws_token = ''
        self._aws_token_expiry = datetime.utcnow()
        self._whitelist = ['select', 'create', 'insert', 'update']
        self._set_aws_auth()
        psycopg2.extensions.set_wait_callback(wait_select_inter)
Example #4
def setup_private(input_args):
    """
    setup_private sets up the aws credentials required to run on the server
    in the appropriate environment variables

    Args:
    local -- True if we're on dev, False if on stageb
    input_args -- input yaml file with aws access_key_id and secret_access_key

    Returns
    a yaml file with the private information in it
    ---
    """

    YamlConfiguration(input_args, optional=True)
    os.environ['AWS_ACCESS_KEY_ID'] = read_string('emr_aws_access_key_id')
    os.environ['AWS_SECRET_ACCESS_KEY'] = \
        read_string('emr_aws_secret_access_key')
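
A hedged usage sketch, assuming the module-level imports (os, read_string) are in scope and 'private.yaml' stands in for the real credentials file:

setup_private('private.yaml')
# after the call, the AWS tooling can pick the credentials up from the
# standard environment variables
assert 'AWS_ACCESS_KEY_ID' in os.environ
assert 'AWS_SECRET_ACCESS_KEY' in os.environ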
Example #5
def test_create_emr_args(input_date, dev, cores, pipeline_yaml):
    print "just starting"
    load_package_config('config.yaml')
    YamlConfiguration(pipeline_yaml)

    input_prefix = read_list('pipeline.et_step.s3_prefixes')[0]
    input_file = input_prefix + input_date + '/part-*.gz'

    expected_args = EXPECTED_DEV_ARGS if dev else EXPECTED_AWS_ARGS
    expected_out_file = read_string('pipeline.s3_output_prefix')
    delimiter = read_string('redshift_column_delimiter')
    with mock.patch.dict(os.environ, {'LOGNAME': 'testuser', 'YELPCODE': '.'}):
        logname = os.environ['LOGNAME']
        expected_out_file = os.path.join(
            expected_out_file.format(logname=logname), input_date)
        extractions = pipeline_yaml_schema_file_path()
        formatted_args = expected_args.format(input_file, expected_out_file,
                                              cores, extractions, delimiter)
        output_under_test = create_emr_args(input_date, 10, input_prefix, dev)
        assert output_under_test == formatted_args
Example #6
def test_get_namespaced_tablename_config(input_config, expected_out):
    filepath = os.path.join('tests', 'common', input_config)
    YamlConfiguration(filepath)
    output_under_test = get_namespaced_tablename("table_name_blah")
    assert output_under_test == expected_out
Example #7
                                       start_time_secs=time.time(),
                                       error_msg=repr(e))
            raise
    elif args.date is not None:
        raise IOError("{0} data is either already loaded \
or has not yet completed ET step".format(args.date))

    logs_to_copy = []
    for input_date in data_candidates:
        LOG_STREAM = PipelineStreamLogger(stream_name,
                                          args.run_local,
                                          's3_to_redshift',
                                          job_name='load',
                                          input_date=input_date)
        logs_to_copy = [(join(s3_log_prefix, input_date, table), table)
                        for (table, _) in create_tuples]
        copy_tables(loader_psql, status_table, db, input_date, logs_to_copy,
                    args.ttl_days, LOG_STREAM)


if __name__ == '__main__':

    args_namespace = parse_command_line(sys.argv)

    load_package_config(args_namespace.config)
    YamlConfiguration(args_namespace.io_yaml, optional=False)
    if args_namespace.config_override:
        YamlConfiguration(args_namespace.config_override, optional=False)

    s3_to_redshift_main(args_namespace)
Example #8
def merge_configs(fname_list):
    # The first file becomes the base package config; any remaining files are
    # layered on top of it and may be absent (optional=True).
    base_fname = fname_list.pop(0)
    config = load_package_config(base_fname)
    for fname in fname_list:
        YamlConfiguration(fname, optional=True)
    return config
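
A minimal usage sketch with hypothetical filenames: the first entry becomes the package config and the remaining files are loaded on top of it in order. Note that merge_configs pops the first element, so the caller's list is mutated.

config = merge_configs([
    'config.yaml',            # base package config (must exist)
    'config-env-dev.yaml',    # optional overlay
    'config-user.yaml',       # optional overlay
])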