def create_with_emr_cluster_id(config_path, cluster_mode,
                               destination_database,
                               destination_environment, algorithm_instance,
                               emr_cluster_id):
        """
        Build an algorithm configuration from an acon file and store an EMR cluster id in it.

        The acon file path is obtained from a ConfigService created with config_path, using
        cluster_mode, destination_database, destination_environment and algorithm_instance.
        The loaded dictionary gets emr_cluster_id written into its environment section before
        the configuration object is constructed.

        :param config_path: value handed to ConfigService
        :param cluster_mode: first argument of the acon path lookup
        :param destination_database: second argument of the acon path lookup
        :param destination_environment: third argument of the acon path lookup
        :param algorithm_instance: last argument of the acon path lookup, also passed to the result
        :param emr_cluster_id: value stored under the EMR_CLUSTER_ID key of the environment section
        :return: AlgorithmConfigurationHadoop created from algorithm_instance and the updated dictionary
        """
        # Resolve the location of the acon file and load its content.
        acon_file = ConfigService(config_path).get_acon_path(
            cluster_mode,
            destination_database,
            destination_environment,
            algorithm_instance
        )
        configuration = Util.load_dict(acon_file)

        # The environment section must already exist; a missing section raises here.
        env_section = configuration[AlgorithmConfigurationHadoop.Sections.ENVIRONMENT]
        env_section[AlgorithmConfigurationHadoop.Keys.EMR_CLUSTER_ID] = emr_cluster_id

        return AlgorithmConfigurationHadoop(algorithm_instance, configuration)
Exemplo n.º 2
0
    def read_acon_params(execution_system, table_name):
        """
        Read the parameters entry of the acon file that belongs to a table.

        The acon path is looked up through a ConfigService created from execution_system.config,
        using the execution system's database and environment together with table_name.

        :param execution_system: object providing the config, database and environment attributes
        :param table_name: name passed as the last argument of the acon path lookup
        :return: value stored under LoadHadoop.PARAMETERS_KEY in the loaded content,
                 or an empty dictionary when that key is absent
        """
        acon_file = ConfigService(execution_system.config).get_acon_path(
            execution_system.database,
            execution_system.environment,
            table_name
        )
        acon_content = Util.load_dict(acon_file)

        return acon_content.get(LoadHadoop.PARAMETERS_KEY, {})
Exemplo n.º 3
0
    def _create_s3_table(self,
                         s3_resource,
                         hql_validation_function,
                         hql_validation_error=None):
        """
        Populate the lake and landing buckets and assemble an S3Table object by hand.

        The S3Table instance is obtained through __new__, so its __init__ is not executed;
        every attribute it carries afterwards is assigned explicitly in this method.

        :param s3_resource: resource used to create the buckets and objects; also stored on the table
        :param hql_validation_function: handed to FakeStorageSystem
        :param hql_validation_error: handed to FakeStorageSystem, None by default
        :return: the assembled S3Table instance
        """
        lake_spec = self.LAKE_SPEC
        landing_spec = self.LANDING_SPEC

        # Lake first, landing second: create the bucket and one empty object per key.
        for spec in (lake_spec, landing_spec):
            s3_resource.create_bucket(Bucket=spec.bucket)
            for object_key in spec.keys:
                logging.info("Creating object s3://{}/{}".format(spec.bucket, object_key))
                s3_resource.Bucket(spec.bucket).put_object(Key=object_key, Body="")

        table = S3Table.__new__(S3Table)

        # landing
        table.db_table_landing = landing_spec.table

        landing_data_dir = "s3://{}/{}".format(landing_spec.bucket, landing_spec.data_dir)
        table.dir_landing_data = landing_data_dir
        # Work and archive locations are derived by substituting "data" in the data location.
        table.dir_landing_work = landing_data_dir.replace("data", "work")
        table.dir_landing_archive = landing_data_dir.replace("data", "archive")
        table.dir_landing_final = table.dir_landing_data

        # lake
        table.db_table_lake = lake_spec.table
        table.dir_lake_final = "s3://{}/{}".format(lake_spec.bucket, lake_spec.data_dir)

        table.emr_system = FakeStorageSystem(hql_validation_function, hql_validation_error)
        table.s3_resource = s3_resource

        table.dir_landing_table = "s3://" + landing_spec.bucket + "/" + landing_spec.data_dir
        table.dir_lake_table = "s3://" + lake_spec.bucket + "/" + lake_spec.data_dir

        table.config_service = ConfigService(TestS3Table.DEFAULT_CONFIG_PATH)

        # Fixed layout of the table.
        table.partitioned_by = "month"
        table.header_lines = 0
        table.delimiter = "|"
        table.columns_lake = [
            ("name1", "varchar(21)"),
            ("name2", "varchar(6)"),
            ("name3", "varchar(4)"),
        ]

        return table
Exemplo n.º 4
0
    def env_setup(self, local_run_dir, destination_system,
                  destination_database, destination_environment):
        """
        Run the parent environment setup and copy the test acon file to its expected location.

        NOTE: the four arguments are accepted but not read; the values of the equally named
        attributes on self are used for the parent call and for the acon path lookup.

        :param local_run_dir: not read by this method
        :param destination_system: not read by this method
        :param destination_database: not read by this method
        :param destination_environment: not read by this method
        :return: tuple of (m3d_config_file, scon_emr_file, acon_path, m3d_config_dict, scon_emr_dict)
        """
        parent_setup = super(TestAlgorithmNestedFlattenerEMR, self).env_setup(
            self.local_run_dir,
            self.destination_system,
            self.destination_database,
            self.destination_environment
        )
        m3d_config_file, scon_emr_file, m3d_config_dict, scon_emr_dict = parent_setup

        acon_path = ConfigService(m3d_config_file).get_acon_path(
            self.destination_database,
            self.destination_environment,
            self.algorithm_instance
        )

        # Create the directory that will hold the acon file; fails if it already exists.
        acon_dir = os.path.dirname(acon_path)
        os.makedirs(acon_dir)

        # Copy the content of the test acon into the resolved location.
        source_acon = py.path.local(self.test_acon)
        target_acon = py.path.local(acon_path)
        target_acon.write(source_acon.read())

        return m3d_config_file, scon_emr_file, acon_path, m3d_config_dict, scon_emr_dict
    def create_with_ext_params(config_path, cluster_mode, destination_database,
                               destination_environment, algorithm_instance,
                               ext_params_str):
        """
        Build an algorithm configuration from an acon file, optionally merged with extra parameters.

        The acon file path is obtained from a ConfigService created with config_path, using
        cluster_mode, destination_database, destination_environment and algorithm_instance.
        When ext_params_str is truthy it is parsed as JSON and merged into the loaded dictionary
        via Util.merge_nested_dicts.

        :param config_path: value handed to ConfigService
        :param cluster_mode: first argument of the acon path lookup
        :param destination_database: second argument of the acon path lookup
        :param destination_environment: third argument of the acon path lookup
        :param algorithm_instance: last argument of the acon path lookup, also passed to the result
        :param ext_params_str: JSON text with additional parameters; skipped when empty or None
        :return: AlgorithmConfigurationHadoop created from algorithm_instance and the resulting dictionary
        """
        # Resolve the location of the acon file and load its content.
        acon_file = ConfigService(config_path).get_acon_path(
            cluster_mode,
            destination_database,
            destination_environment,
            algorithm_instance
        )
        configuration = Util.load_dict(acon_file)

        # Nothing to merge: use the acon content unchanged.
        if not ext_params_str:
            return AlgorithmConfigurationHadoop(algorithm_instance, configuration)

        extra_params = json.loads(ext_params_str)
        merged_configuration = Util.merge_nested_dicts(configuration, extra_params)
        return AlgorithmConfigurationHadoop(algorithm_instance, merged_configuration)
Exemplo n.º 6
0
    def test_parse_config_file(self):
        """
        Check the attributes ConfigService exposes after reading a JSON configuration.

        builtins.open is patched so that ConfigService reads the JSON text defined below
        instead of a real file; each expected attribute is then compared with its value.
        """
        test_config_json = """
            {
              "emails": [
                "*****@*****.**",
                "*****@*****.**",
                "*****@*****.**"
              ],
              "dir_exec": "/tmp/",
              "python": {
                "main": "m3d_main.pyc",
                "base_package": "m3d"
              },
              "subdir_projects": {
                "m3d_engine": "m3d-engine/target/scala-2.10/",
                "m3d_api": "m3d-api/"
              },
              "tags": {
                "table_suffix_stage": "_stg1",
                "table_suffix_swap": "_swap",

                "full_load": "full_load",
                "delta_load": "delta_load",
                "append_load": "append_load",
                "oozie": "oozie",
                "decom_gzip": "gzip_decompressor",

                "false": "false",

                "config": "config",
                "system": "system",
                "algorithm": "algorithm",
                "table": "table",
                "view": "view",
                "upload": "upload",
                "pushdown": "pushdown",

                "aws": "aws",
                "hdfs": "hdfs",
                "file": "file"
              },
              "data_dict_delimiter": "|"
            }
            """

        # The path argument is irrelevant because open() is replaced by the mock.
        with patch("builtins.open", new_callable=mock_open, read_data=test_config_json):
            config_service = ConfigService("_")

        expected_params = {
            "emails": ["*****@*****.**", "*****@*****.**", "*****@*****.**"],
            "python_main": "m3d_main.pyc",
            "python_base_package": "m3d",
            "dir_exec": "/tmp/",
            "subdir_projects_m3d_engine": "m3d-engine/target/scala-2.10/",
            "subdir_projects_m3d_api": "m3d-api/",
            "data_dict_delimiter": "|",
            "tag_table_suffix_stage": "_stg1",
            "tag_table_suffix_swap": "_swap",
            "tag_full_load": "full_load",
            "tag_delta_load": "delta_load",
            "tag_append_load": "append_load",
            "tag_system": "system",
            "tag_table": "table",
            "tag_algorithm": "algorithm",
            "tag_config": "config",
            "tag_pushdown": "pushdown",
            "tag_upload": "upload",
            "tag_aws": "aws",
            "tag_file": "file",
        }

        for attribute_name, expected_value in expected_params.items():
            assert getattr(config_service, attribute_name) == expected_value