Пример #1
0
    def construct_job(self, input_dict):
        """Validate the job input, store the job settings and connect to EMR.

        Despite the name, nothing is submitted here: the method checks the
        input, records the settings on the instance and opens the EMR
        connection.

        :param input_dict: mapping with the job parameters. The keys
            'environment', 'data_version', 'bucket_name' and 'github_repo'
            are required; 'hyper_params', 'aws_access_key',
            'aws_secret_key' and 'github_token' are optional.
        :raises ValueError: when required fields are missing, when the
            github_repo URL is not reported alive by check_url_alive, or
            when the EMR connection cannot be established.
        """
        required_fields = [
            'environment', 'data_version', 'bucket_name', 'github_repo'
        ]

        missing_fields = check_field_exists(input_dict, required_fields)

        if missing_fields:
            logger.error("Missing the parameters in input_dict",
                         extra={"missing_fields": missing_fields})
            raise ValueError(
                "Required fields are missing in the input {}".format(
                    missing_fields))

        self.env = input_dict.get('environment')
        self.data_version = input_dict.get('data_version')
        self.bucket_name = input_dict.get('bucket_name')

        github_repo = input_dict.get('github_repo')
        if not check_url_alive(github_repo):
            repo_error = "Unable to find the github_repo {}".format(
                github_repo)
            logger.error(repo_error)
            raise ValueError(repo_error)
        self.training_repo_url = github_repo

        # Environment variables win over values supplied in the input; an
        # unset or empty variable falls back to the input value. The same
        # rule is applied to all three credentials.
        aws_access_key = os.getenv("AWS_S3_ACCESS_KEY_ID") \
            or input_dict.get('aws_access_key')
        aws_secret_key = os.getenv("AWS_S3_SECRET_ACCESS_KEY") \
            or input_dict.get('aws_secret_key')
        github_token = os.getenv("GITHUB_TOKEN") \
            or input_dict.get('github_token')

        # Only serialise hyper params that were actually supplied. When the
        # key is absent the '{}' default is kept as is; serialising the
        # missing value would store the string 'null' instead.
        raw_hyper_params = input_dict.get('hyper_params')
        self.hyper_params = input_dict.get('hyper_params', '{}')
        if raw_hyper_params:
            try:
                self.hyper_params = json.dumps(raw_hyper_params,
                                               separators=(',', ':'))
            except Exception:
                # Best effort: keep the raw value and report the problem
                # instead of aborting the job construction.
                logger.error(
                    "Invalid hyper params",
                    extra={"hyper_params": raw_hyper_params})

        self.properties = {
            'AWS_S3_ACCESS_KEY_ID': aws_access_key,
            'AWS_S3_SECRET_ACCESS_KEY': aws_secret_key,
            'AWS_S3_BUCKET_NAME': self.bucket_name,
            'MODEL_VERSION': self.data_version,
            'DEPLOYMENT_PREFIX': self.env,
            'GITHUB_TOKEN': github_token
        }

        self.aws_emr = AmazonEmr(aws_access_key_id=aws_access_key,
                                 aws_secret_access_key=aws_secret_key)

        self.aws_emr_client = self.aws_emr.connect()

        if not self.aws_emr.is_connected():
            connect_error = "Unable to connect to emr instance."
            logger.error(connect_error)
            raise ValueError(connect_error)

        logger.info("Successfully connected to emr instance.")
def emr(request):
    """Return a connected AmazonEmr client that is disconnected on teardown.

    The client is built from the AWS_KEY / AWS_SECRET values in scope, the
    connection is asserted, and a finalizer calling ``disconnect()`` is
    registered on ``request``.

    NOTE(review): presumably used as a pytest fixture; the decorator is not
    visible here.
    """
    client = AmazonEmr(
        aws_access_key_id=AWS_KEY,
        aws_secret_access_key=AWS_SECRET,
    )
    client.connect()
    assert client.is_connected()

    def _disconnect():
        # Runs when the requesting test scope is torn down.
        client.disconnect()

    request.addfinalizer(_disconnect)
    return client
 def test_disconnect_with_creds(self):
     """Check that a connected client reports disconnected after disconnect()."""
     client = AmazonEmr(
         aws_access_key_id=AWS_KEY,
         aws_secret_access_key=AWS_SECRET,
         bucket_name=BUCKET,
     )
     client.connect()
     client.disconnect()
     still_connected = client.is_connected()
     assert not still_connected
 def test_connect_with_creds(self):
     """Check that connecting with explicit credentials succeeds."""
     client = AmazonEmr(
         aws_access_key_id=AWS_KEY,
         aws_secret_access_key=AWS_SECRET,
     )
     client.connect()
     connected = client.is_connected()
     assert connected
 def test_connect_without_creds(self):
     """Check that connecting with no credentials raises NotFoundAccessKeySecret."""
     # Construction and connect() both stay inside the context manager, so
     # the expected error may come from either call.
     with pytest.raises(NotFoundAccessKeySecret):
         AmazonEmr().connect()
class EMRScriptBuilder(AbstractEMR):
    """EMR Script implementation."""

    def __init__(self):
        """Initialize the EMRScriptBuilder instance."""
        # UTC creation timestamp, formatted as YYYY_MM_DD_HH_MM_SS.
        self.current_time = strftime("%Y_%m_%d_%H_%M_%S", gmtime())

    def construct_job(self, input_dict):
        """Validate the job input, store the job settings and connect to EMR.

        Despite the name, nothing is submitted here: the method checks the
        input, records the settings on the instance and opens the EMR
        connection.

        :param input_dict: mapping with the job parameters. The keys
            'environment', 'data_version', 'bucket_name' and 'github_repo'
            are required; 'hyper_params', 'aws_access_key' and
            'aws_secret_key' are optional.
        :raises ValueError: when required fields are missing, when the
            github_repo URL is not reported alive by check_url_alive, or
            when the EMR connection cannot be established.
        """
        required_fields = ['environment', 'data_version',
                           'bucket_name', 'github_repo']

        missing_fields = check_field_exists(input_dict, required_fields)

        if missing_fields:
            logger.error("Missing the parameters in input_dict",
                         extra={"missing_fields": missing_fields})
            raise ValueError("Required fields are missing in the input {}"
                             .format(missing_fields))

        self.env = input_dict.get('environment')
        self.data_version = input_dict.get('data_version')
        self.bucket_name = input_dict.get('bucket_name')

        github_repo = input_dict.get('github_repo')
        if not check_url_alive(github_repo):
            repo_error = "Unable to find the github_repo {}".format(
                github_repo)
            logger.error(repo_error)
            raise ValueError(repo_error)
        self.training_repo_url = github_repo

        # Environment variables win over values supplied in the input; an
        # unset or empty variable falls back to the input value.
        aws_access_key = os.getenv("AWS_S3_ACCESS_KEY_ID") \
            or input_dict.get('aws_access_key')
        aws_secret_key = os.getenv("AWS_S3_SECRET_ACCESS_KEY") \
            or input_dict.get('aws_secret_key')

        # Only serialise hyper params that were actually supplied. When the
        # key is absent the '{}' default is kept as is; serialising the
        # missing value would store the string 'null' instead.
        raw_hyper_params = input_dict.get('hyper_params')
        self.hyper_params = input_dict.get('hyper_params', '{}')
        if raw_hyper_params:
            try:
                self.hyper_params = json.dumps(raw_hyper_params,
                                               separators=(',', ':'))
            except Exception:
                # Best effort: keep the raw value and report the problem
                # instead of aborting the job construction.
                logger.error("Invalid hyper params",
                             extra={"hyper_params": raw_hyper_params})

        self.properties = {
            'AWS_S3_ACCESS_KEY_ID': aws_access_key,
            'AWS_S3_SECRET_ACCESS_KEY': aws_secret_key,
            'AWS_S3_BUCKET_NAME': self.bucket_name,
            'MODEL_VERSION': self.data_version,
            'DEPLOYMENT_PREFIX': self.env
        }

        self.aws_emr = AmazonEmr(aws_access_key_id=aws_access_key,
                                 aws_secret_access_key=aws_secret_key)

        self.aws_emr_client = self.aws_emr.connect()

        if not self.aws_emr.is_connected():
            connect_error = "Unable to connect to emr instance."
            logger.error(connect_error)
            raise ValueError(connect_error)

        logger.info("Successfully connected to emr instance.")

    def run_job(self, input_dict):
        """Run the emr job.

        Not implemented in this class.

        :raises NotImplementedError: always.
        """
        raise NotImplementedError