Example #1
    def _generate_version_file(self):
        def read_deployment_history():
            if os.path.exists(self.version_filename):
                with open(self.version_filename, 'r') as version_file:
                    return version_file.read().strip()
            else:
                return None

        def update_deployment_history(current_line, history=None):
            with open(self.version_filename, 'w') as version_file:
                if history is None:
                    version_file.writelines([current_line + "\n"])
                else:
                    version_file.writelines([current_line + "\n", history + "\n"])

        deployment_history = read_deployment_history()
        current_time = datetime.datetime.now()
        branch_name = Util.execute_subprocess("git status | grep -E 'On branch .*' | tail -c +11")
        last_commit = Util.execute_subprocess("git log -1 | grep -E 'commit .*' | tail -c +8")

        current_deployment = "{date} {branch} ({commit})".format(
            date=current_time.strftime("%Y-%m-%d %H:%M:%S"),
            branch=branch_name.strip(),
            commit=last_commit.strip()
        )
        update_deployment_history(current_deployment, deployment_history)
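For illustration, a minimal standalone sketch of the same prepend-to-history pattern used in _generate_version_file above; the file name and the deployment line are hypothetical, and the newest entry ends up at the top of the file with the previous contents kept below it.

    import os

    VERSION_FILE = "version.txt"  # hypothetical path, not part of the original class

    def prepend_deployment_line(line):
        # Read the existing history (if any), then rewrite the file with the new line on top.
        history = None
        if os.path.exists(VERSION_FILE):
            with open(VERSION_FILE) as version_file:
                history = version_file.read().strip()
        with open(VERSION_FILE, "w") as version_file:
            version_file.write(line + "\n" + (history + "\n" if history else ""))

    prepend_deployment_line("2021-03-02 14:05:11 master (9f3b2c1)")  # illustrative values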
Example #2
    def test_send_mail(self, os_system_patch):
        Util.send_email(["*****@*****.**"], "hello", "hello")
        os_system_patch.assert_has_calls([
            call('echo "hello" | mailx -s "hello" [email protected]')
        ])
Example #3
    def _report_error(self, name):
        error_subject = "Error for " + name
        exec_tb = traceback.format_exc()
        message = "Error in executing {}. \n Stacktrace: \n {}".format(
            name, exec_tb)

        logging.error(error_subject)
        Util.send_email(self._execution_system.config_service.emails,
                        error_subject, message)
Example #4
    def test_get_target_partitions_string(self):
        """
        This method tests the correct functionality of get_target_partitions_string of Util class
        :return:
        """
        assert Util.get_target_partitions_string("year") == "year"
        assert Util.get_target_partitions_string("month") == "year,month"
        assert Util.get_target_partitions_string("day") == "year,month,day"
        assert Util.get_target_partitions_string("") == ""

        assert Util.get_target_partitions_string("country") == "country"
Example #5
    def test_get_defined_partition_columns_hive(self):
        """
        This method tests the correct functionality of get_defined_target_partitions_hive of Util class
        :return:
        """
        assert Util.get_defined_target_partitions_hive("year") == "year smallint"
        assert Util.get_defined_target_partitions_hive("month") == "year smallint,month smallint"
        assert Util.get_defined_target_partitions_hive("day") == "year smallint,month smallint,day smallint"
        assert Util.get_defined_target_partitions_hive("") == ""
Example #6
    def test_get_target_partitions_list(self):
        """
        This method tests the correct functionality of get_target_partitions_list of Util class
        :return:
        """
        assert Util.get_target_partitions_list("year") == ["year"]
        assert Util.get_target_partitions_list("month") == ["year", "month"]
        assert Util.get_target_partitions_list("day") == [
            "year", "month", "day"
        ]
        assert Util.get_target_partitions_list("") == []

        assert Util.get_target_partitions_list("country") == ["country"]
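Example #7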
    def create_with_emr_cluster_id(config_path, cluster_mode,
                                   destination_database,
                                   destination_environment, algorithm_instance,
                                   emr_cluster_id):
        """
        Create algorithm configuration object from acon file. Method will discover acon file based on the
        parameters passed to it.

        :return: Returns algorithm configuration object of the type that is used for calling the method.
        """

        # Create config service to get acon file path.
        config_service = ConfigService(config_path)
        acon_path = config_service.get_acon_path(cluster_mode,
                                                 destination_database,
                                                 destination_environment,
                                                 algorithm_instance)
        acon_dict = Util.load_dict(acon_path)

        environment = acon_dict[
            AlgorithmConfigurationHadoop.Sections.ENVIRONMENT]
        environment[
            AlgorithmConfigurationHadoop.Keys.EMR_CLUSTER_ID] = emr_cluster_id

        return AlgorithmConfigurationHadoop(algorithm_instance, acon_dict)
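A hedged usage sketch of this factory, assuming it is exposed as a static method on AlgorithmConfigurationHadoop; every argument value below is hypothetical.

    # Hypothetical call; argument values are illustrative only.
    configuration = AlgorithmConfigurationHadoop.create_with_emr_cluster_id(
        config_path="config/m3d/config.json",       # hypothetical path
        cluster_mode=False,
        destination_database="emr_test",
        destination_environment="dev",
        algorithm_instance="gzip_decompressor",     # hypothetical algorithm name
        emr_cluster_id="j-1ABCDEFGHIJKL"            # hypothetical EMR cluster id
    )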
Example #8
    def __init__(self, execution_system, algorithm_instance, algorithm_params):
        """
        Initialize generic Algorithm class

        :param execution_system: an instance of execution system
        :param algorithm_instance: name of the algorithm instance
        :param algorithm_params: algorithm configuration
        """

        self._execution_system = execution_system
        self._parameters = algorithm_params.get(AlgorithmConfigurationHadoop.Keys.PARAMETERS, {})

        param_file_basename = "{system}-{database}-{environment}.{algorithm}.{time}{extension}".format(
            system=self._execution_system.source_system,
            database=self._execution_system.database,
            environment=self._execution_system.environment,
            algorithm=algorithm_instance,
            time=Util.get_formatted_utc_now(EMRSystem.DATETIME_FORMAT),
            extension=ConfigService.Extensions.JSON
        )

        # derived
        dir_apps_algorithm_instance = os.path.join(
            self._execution_system.dir_apps_algorithm,
            algorithm_instance
        )

        self._params_uri_cluster = os.path.join(dir_apps_algorithm_instance, param_file_basename)
        self._params_uri_local = os.path.join(self._execution_system.config_service.dir_exec, param_file_basename)
Example #9
    def _get_create_lake_statement(self, table_location):
        def create_statement(_columns, _target_partitions=None):
            return HQLGenerator.CreateParquetTableStatementBuilder(self.db_table_lake, table_location, _columns) \
                .partitioned_by(_target_partitions) \
                .with_properties({"serialization.encoding": "UTF-8"}) \
                .build(is_external=True)

        if self.partitioned_by in Util.defined_partitions:
            return create_statement(
                self.columns_lake,
                Util.get_typed_target_partitions_hive(self.partitioned_by))
        elif len(self.partitioned_by) > 0:
            matched_columns = list(
                filter(lambda x: x[0] == self.partitioned_by,
                       self.columns_lake))
            if len(matched_columns) > 0:
                # when the table is partitioned by one of its own columns,
                # the partition column should be excluded from the list of regular columns
                columns = filter(lambda x: x[0] != self.partitioned_by,
                                 self.columns_lake)
                target_partitions = [(matched_columns[0][0],
                                      matched_columns[0][1])]
                return create_statement(columns, target_partitions)
            else:
                raise Exception(
                    "Partitioned field {} doesn't match any column".format(
                        self.partitioned_by))
        else:
            return create_statement(self.columns_lake)
Example #10
    def get_projection_columns(self, src_column_names, destination_column_names):
        columns = list(filter(lambda x: x[1], zip(src_column_names, destination_column_names)))
        if self.partitioned_by in Util.defined_partitions:
            partition_columns = list(map(lambda x: (x, x), Util.get_partition_columns_list(self.partitioned_by)))
            return columns + partition_columns
        else:
            return columns
Example #11
    def read_acon_params(execution_system, table_name):
        config_service = ConfigService(execution_system.config)

        acon_path = config_service.get_acon_path(execution_system.database,
                                                 execution_system.environment,
                                                 table_name)

        acon_dict = Util.load_dict(acon_path)
        return acon_dict.get(LoadHadoop.PARAMETERS_KEY, {})
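Example #12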
    def create_with_ext_params(config_path, cluster_mode, destination_database,
                               destination_environment, algorithm_instance,
                               ext_params_str):
        """
        Create algorithm configuration object from acon file. Method will discover acon file based on the
        parameters passed to it.

        :return: Returns algorithm configuration object of the type that is used for calling the method.
        """

        # Create config service to get acon file path.
        config_service = ConfigService(config_path)
        acon_path = config_service.get_acon_path(cluster_mode,
                                                 destination_database,
                                                 destination_environment,
                                                 algorithm_instance)
        acon_dict = Util.load_dict(acon_path)

        if ext_params_str:
            ext_params_dict = json.loads(ext_params_str)
            acon_dict = Util.merge_nested_dicts(acon_dict, ext_params_dict)

        return AlgorithmConfigurationHadoop(algorithm_instance, acon_dict)
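A hedged usage sketch of the variant above; the key names inside ext_params_str are assumptions about the acon layout, and all other argument values are hypothetical.

    # Hypothetical call; the JSON string is merged into the acon dictionary via Util.merge_nested_dicts.
    ext_params_str = '{"environment": {"spark": {"spark.executor.instances": "5"}}}'  # assumed keys
    configuration = AlgorithmConfigurationHadoop.create_with_ext_params(
        "config/m3d/config.json",   # config_path (hypothetical)
        False,                      # cluster_mode
        "emr_test",                 # destination_database
        "dev",                      # destination_environment
        "gzip_decompressor",        # algorithm_instance (hypothetical)
        ext_params_str
    )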
Example #13
    def setup_acon_from_file(config_dir_path, destination_database,
                             destination_environment, algorithm_instance,
                             base_acon_path):
        acon_file_path = AconHelper.get_acon_file_path(
            config_dir_path, destination_database, destination_environment,
            algorithm_instance)

        if not os.path.isdir(os.path.dirname(acon_file_path)):
            os.makedirs(os.path.dirname(acon_file_path))

        py.path.local(acon_file_path).write(
            py.path.local(base_acon_path).read())
        acon_dict = Util.load_dict(base_acon_path)

        return acon_file_path, acon_dict
Example #14
    def test_get_target_partitions_string(self):
        """
        This method tests the correct functionality of get_target_partitions_string of Util class
        :return:
        """
        assert Util.get_target_partitions_string("year") == "year"
        assert Util.get_target_partitions_string("month") == "year,month"
        assert Util.get_target_partitions_string("day") == "year,month,day"
        assert Util.get_target_partitions_string("") == ""

        with pytest.raises(Exception) as exc_info:
            Util.get_target_partitions_list("country")
        assert "Partition type country not supported" in str(exc_info.value)
Example #15
    def execute_hive(self, hql, return_output=False):
        # Put HQL statement to a file since it can be longer than allowed length of EMR step parameter.
        datetime_str = Util.get_formatted_utc_now(EMRSystem.DATETIME_FORMAT)
        id_str = EMRSystem._generate_random_id()

        hql_filename = "{}.{}{}".format(datetime_str, id_str,
                                        ConfigService.Extensions.HQL)
        hql_path_local = os.path.join(self.dir_tmp_local, hql_filename)
        hql_path_s3 = os.path.join(self.dir_tmp_s3, hql_filename)

        with open(hql_path_local, "w") as hql_file:
            hql_file.write(hql)

        self.s3_util.upload_object(hql_path_local, hql_path_s3)

        # Create hive command line.
        hive_cmd = "hive --silent -f {}".format(hql_path_s3)

        # Add step to EMR cluster.
        step_name = "Hive EMR Step: datetime=\"{}\", id=\"{}\"".format(
            datetime_str, id_str)
        emr_step_id = self.emr_cluster_client.add_step(step_name, hive_cmd)

        self.emr_cluster_client.wait_for_step_completion(emr_step_id)

        if return_output:
            output_file = self.emr_cluster_client.get_step_output_path(
                emr_step_id)
            logging.info(
                "Waiting for availability of output file: '{}'.".format(
                    output_file))

            self.s3_util.wait_for_file_availability(
                output_file, self.emr_cluster_client.polling_interval_seconds,
                EMRClusterClient.AWSConstants.
                S3_FILE_AVAILABILITY_TIMEOUT_SECONDS)
            file_content = self.s3_util.read_gzip_file_content(output_file)

            return file_content

        return None
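A hedged usage sketch of execute_hive, assuming an already constructed EMRSystem instance named emr_system; the HQL statement is illustrative.

    # Hypothetical call: the statement is uploaded to S3, run as an EMR step,
    # and the gzipped step output is read back when return_output=True.
    output = emr_system.execute_hive("SHOW TABLES IN lake;", return_output=True)
    print(output)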
Example #16
    def test_oracle_view_to_hive_view(self):
        oracle_view_ddl = \
            "\n  CREATE OR REPLACE FORCE EDITIONABLE VIEW \"MART_MOD\".\"TEST_VIEW\" (" + \
            "" + \
            "\"GENDER\", \"GROUP_ARTICLE\", \"BRAND\", \"GROUP_MODEL\", " + \
            "\"RMH_PRODUCT_DIVISION\", \"RMH_GENDER\", \"RMH_CATEGORY\", \"RMH_PRODUCT_TYPE\", " + \
            "\"BUSINESS_SEGMENT\", \"BUSINESS_UNIT\", \"COLORWAY_NAME\", \"SEASON_ACTIVE\", " + \
            "\"SEASON_CREATE\", \"SIZE_PAGE\", \"KEY_CATEGORY\", \"SUB_BRAND\", " + \
            "\"CORPORATE_MARKETING_LINE\", \"PRODUCT_DIVISION\", \"ORDER_LOCKED\", \"PRODUCT_GROUP\", " + \
            "\"PRODUCT_TYPE\", \"SPORTS_CATEGORY\", \"SOURCING_SIZE_SCALE\", \"RMH_RETAIL_CLASS\", " + \
            "\"RMH_RETAIL_DEPARTMENT\", \"RMH_RETAIL_SUB_CLASS\", \"RMH_RETAIL_SUB_DEPT\", \"RMH_RETAIL_SECTION\", " + \
            "\"AGE_GROUP\", \"ALTERNATE_ARTICLE\", \"ARTICLE_TYPE\", \"COLORWAY_LONG_DESCR\", " + \
            "\"COLORWAY_SHORT_DESCR\", \"LIFECYCLE_STATUS_DATE\", \"ORIGINAL_ARTICLE\", \"ARTICLE_DESCR\", " + \
            "\"VENDOR_ARTICLE\", \"SALES_LINE\", \"CATEGORY_MARKETING_LINE\"" + \
            "" + \
            ") AS \n  SELECT \n" + \
            "" + \
            "gender,\ngroup_article,\nbrand,\ngroup_model,\n" + \
            "rmh_product_division,\nrmh_gender,\nrmh_category,\nrmh_product_type,\n" + \
            "business_segment,\nbusiness_unit,\ncolorway_name,\nseason_active,\n" + \
            "season_create,\nsize_page,\nkey_category,\nsub_brand,\n" + \
            "corporate_marketing_line,\nproduct_division,\norder_locked,\nproduct_group,\n" + \
            "product_type,\nsports_category,\nsourcing_size_scale,\nrmh_retail_class,\n" + \
            "rmh_retail_department,\nrmh_retail_sub_class,\nrmh_retail_sub_dept,\nrmh_retail_section,\n" + \
            "age_group,\nalternate_article,\narticle_type,\ncolorway_long_descr,\n" + \
            "colorway_short_descr,\nlifecycle_status_date,\noriginal_article,\narticle_descr,\n" + \
            "vendor_article,\nSALES_LINE,\n" + \
            "category_marketing_line\n" + \
            "" + \
            "FROM \n" + \
            "lake_out.bi_test_view"

        hive_view_ddl = Util.oracle_view_to_hive_view(oracle_view_ddl)

        expected_hive_ddl = \
            "CREATE VIEW `MART_MOD`.`TEST_VIEW` " + \
            "(" + \
            "`GENDER`, `GROUP_ARTICLE`, `BRAND`, `GROUP_MODEL`, " + \
            "`RMH_PRODUCT_DIVISION`, `RMH_GENDER`, `RMH_CATEGORY`, `RMH_PRODUCT_TYPE`, " + \
            "`BUSINESS_SEGMENT`, `BUSINESS_UNIT`, `COLORWAY_NAME`, `SEASON_ACTIVE`, " + \
            "`SEASON_CREATE`, `SIZE_PAGE`, `KEY_CATEGORY`, `SUB_BRAND`, " + \
            "`CORPORATE_MARKETING_LINE`, `PRODUCT_DIVISION`, `ORDER_LOCKED`, `PRODUCT_GROUP`, " + \
            "`PRODUCT_TYPE`, `SPORTS_CATEGORY`, `SOURCING_SIZE_SCALE`, `RMH_RETAIL_CLASS`, " + \
            "`RMH_RETAIL_DEPARTMENT`, `RMH_RETAIL_SUB_CLASS`, `RMH_RETAIL_SUB_DEPT`, `RMH_RETAIL_SECTION`, " + \
            "`AGE_GROUP`, `ALTERNATE_ARTICLE`, `ARTICLE_TYPE`, `COLORWAY_LONG_DESCR`, " + \
            "`COLORWAY_SHORT_DESCR`, `LIFECYCLE_STATUS_DATE`, `ORIGINAL_ARTICLE`, `ARTICLE_DESCR`, " + \
            "`VENDOR_ARTICLE`, `SALES_LINE`, `CATEGORY_MARKETING_LINE`" + \
            ") " + \
            "AS SELECT " + \
            "gender, group_article, brand, group_model, " + \
            "rmh_product_division, rmh_gender, rmh_category, rmh_product_type, " + \
            "business_segment, business_unit, colorway_name, season_active, " + \
            "season_create, size_page, key_category, sub_brand, " + \
            "corporate_marketing_line, product_division, order_locked, product_group, " + \
            "product_type, sports_category, sourcing_size_scale, rmh_retail_class, " + \
            "rmh_retail_department, rmh_retail_sub_class, rmh_retail_sub_dept, rmh_retail_section, " + \
            "age_group, alternate_article, article_type, colorway_long_descr, " + \
            "colorway_short_descr, lifecycle_status_date, original_article, article_descr, " + \
            "vendor_article, SALES_LINE, category_marketing_line " + \
            "FROM lake_out.bi_test_view"

        assert hive_view_ddl == expected_hive_ddl
Example #17
    def _report_success(self, name):
        success_subject = "Success for " + name
        logging.info(success_subject)
        Util.send_email(self._execution_system.config_service.emails,
                        success_subject, success_subject)
Example #18
    def test_load_table_delta(self, remove_json_patch, add_tags_patch, _0, _1):
        # responses.add_passthru(self.default_server_url)

        destination_system = "bdp"
        destination_database = "emr_test"
        destination_environment = "dev"
        destination_active_table = "bi_test101"
        destination_changelog_table = "bi_test101_cl"

        load_type = "DeltaLoad"

        src_tconx_path = "test/resources/test_load_table_delta_s3/tconx-bdp-emr_test-dev-bi_test101.json"
        src_tconx_cl_table = "test/resources/test_load_table_delta_s3/tconx-bdp-emr_test-dev-bi_test101_cl.json"

        spark_external_parameters = '''{
                    "spark.driver.memory": "99G",
                    "spark.executor.instances": "99",
                    "spark.executor.memory": "90G"
                }
                '''

        # pass desired content of tconx files for active and changelog tables to self.env_setup()
        src_tconx_content = py.path.local(src_tconx_path).read()
        src_tconx_cl_content = py.path.local(src_tconx_cl_table).read()

        m3d_config_file, scon_emr_file, tconx_file, tconx_cl_file, m3d_config_dict, scon_emr_dict = \
            self.env_setup(
                self.local_run_dir,
                destination_system,
                destination_database,
                destination_environment,
                destination_active_table,
                src_tconx_content,
                src_tconx_cl_content
            )

        emr_system = EMRSystem(m3d_config_file, destination_system,
                               destination_database, destination_environment)
        s3_table_active = S3Table(emr_system, destination_active_table)
        s3_table_changelog = S3Table(emr_system, destination_changelog_table)

        # Extract bucket names
        bucket_application = scon_emr_dict["environments"][
            destination_environment]["s3_buckets"]["application"]

        # Put lake data for changelog table, this should be archived
        self.dump_data_to_s3(
            os.path.join(s3_table_changelog.dir_lake_final,
                         "changelog.parquet"),
            "t|e|s|t|a|d|i|d|a|s|m|3|d|",
        )

        M3D.load_table(m3d_config_file,
                       destination_system,
                       destination_database,
                       destination_environment,
                       destination_active_table,
                       load_type,
                       self.emr_cluster_id,
                       spark_params=spark_external_parameters)

        filename_json = "delta_load-{environment}-{table}.json".format(
            environment=destination_environment,
            table=destination_active_table)

        # Checking configuration file for m3d-engine
        app_files = self.get_child_objects(s3_table_active.dir_apps_delta_load)

        assert len(app_files) == 1

        assert app_files[0] == s3_table_active.dir_apps_delta_load + filename_json

        delta_load_config_s3 = app_files[0]
        delta_load_config_content = self.get_object_content_from_s3(
            delta_load_config_s3)

        load_table_parameters = json.loads(delta_load_config_content)

        assert load_table_parameters["active_records_table_lake"] == s3_table_active.db_table_lake
        assert load_table_parameters["active_records_dir_lake"] == s3_table_active.dir_lake_final
        assert load_table_parameters["delta_records_file_path"] == s3_table_active.dir_landing_data
        assert load_table_parameters["technical_key"] == ["m3d_timestamp", "datapakid", "partno", "record"]
        assert load_table_parameters["business_key"] == s3_table_active.business_key

        if s3_table_active.partitioned_by in Util.defined_partitions:
            target_partitions = Util.get_target_partitions_list(
                s3_table_active.partitioned_by)
        else:
            target_partitions = s3_table_active.partitioned_by

        assert load_table_parameters["target_partitions"] == target_partitions
        assert load_table_parameters["partition_column"] == s3_table_active.partition_column
        assert load_table_parameters["partition_column_format"] == s3_table_active.partition_column_format

        # Check EMR steps.
        fake_cluster = self.mock_emr.backends[
            self.default_aws_region].clusters[self.emr_cluster_id]

        assert 1 == len(fake_cluster.steps)

        expected_algorithms_jar_path = "s3://" + bucket_application + os.path.join(
            scon_emr_dict["environments"][destination_environment]
            ["s3_deployment_dir_base"], destination_environment,
            scon_emr_dict["subdir"]["m3d"],
            m3d_config_dict["subdir_projects"]["m3d_api"],
            scon_emr_dict["spark"]["jar_name"])

        delta_load_step = fake_cluster.steps[0]

        assert delta_load_step.jar == "command-runner.jar"
        assert delta_load_step.args[0] == "spark-submit"

        assert delta_load_step.args[-5] == "com.adidas.analytics.AlgorithmFactory"
        assert delta_load_step.args[-4] == expected_algorithms_jar_path
        assert delta_load_step.args[-3] == "DeltaLoad"
        assert delta_load_step.args[-2] == delta_load_config_s3
        assert delta_load_step.args[-1] == "s3"

        add_tags_patch_call_args_list = add_tags_patch.call_args_list
        assert len(add_tags_patch_call_args_list) == 1
        assert sorted(add_tags_patch_call_args_list[0][0][0], key=lambda x: x["Key"]) == sorted(
            [{"Key": "ApiMethod", "Value": "load_table"},
             {"Key": "LoadType", "Value": "DeltaLoad"},
             {"Key": "TargetTable", "Value": "bi_test101"}],
            key=lambda x: x["Key"])

        remove_json_patch.assert_called_once()
        assert remove_json_patch.call_args_list[0][0][0] == app_files[0]
Example #19
    def setup_oracle_scon(config_dir,
                          source_system,
                          db_cd,
                          base_scon_path,
                          database_type=None):
        # Making sure that we can accept both strings as well as py.path.local objects.
        config_dir = py.path.local(str(config_dir))

        config_system_dir = config_dir.join("system")
        config_credentials_dir = config_dir.join("credentials")

        if not config_system_dir.check():
            config_system_dir.mkdir()

        if not config_credentials_dir.check():
            config_credentials_dir.mkdir()

        oracle_docker_ip = os.getenv("ORACLE_DOCKER_IP", "")
        credentials_data = {
            "oracle_conn_string": {
                "lake": "LAKE/test_lake_password@%s:1521/XE" % oracle_docker_ip,
                "lake_out": "LAKE_OUT/test_lake_out_password@%s:1521/XE" % oracle_docker_ip,
                "m3d": "M3D/test_m3d_password@%s:1521/XE" % oracle_docker_ip,
                "mart_mod": "MART_MOD/test_mart_mod_password@%s:1521/XE" % oracle_docker_ip,
                "mart_cal": "MART_CAL/test_mart_cal_password@%s:1521/XE" % oracle_docker_ip,
                "mart_out": "MART_OUT/test_mart_out_password@%s:1521/XE" % oracle_docker_ip,
                "test_lake": "TEST_LAKE/test_lake_password@%s:1521/XE" % oracle_docker_ip,
                "test_lake_out": "TEST_LAKE_OUT/test_lake_out_password@%s:1521/XE" % oracle_docker_ip,
                "test_mart_mod": "TEST_MART_MOD/test_mart_mod_password@%s:1521/XE" % oracle_docker_ip,
                "test_mart_cal": "TEST_MART_CAL/test_mart_cal_password@%s:1521/XE" % oracle_docker_ip,
                "test_mart_out": "TEST_MART_OUT/test_mart_out_password@%s:1521/XE" % oracle_docker_ip,
                "dev_mart_mod": "DEV_MART_MOD/dev_mart_mod_password@%s:1521/XE" % oracle_docker_ip,
                "dev_mart_cal": "DEV_MART_CAL/dev_mart_cal_password@%s:1521/XE" % oracle_docker_ip,
                "dve_mart_out": "DEV_MART_OUT/dev_mart_out_password@%s:1521/XE" % oracle_docker_ip
            }
        }

        credentials_filename = "credentials-{}-{}.json".format(
            source_system, db_cd)
        credentials_file = config_credentials_dir.join(credentials_filename)
        credentials_file.write(json.dumps(credentials_data, indent=4))

        scon_dict = Util.load_dict(base_scon_path)
        scon_dict["credentials"] = str(credentials_file)

        if database_type:
            scon_dict["database_type"] = database_type

        scon_filename = "scon-{}-{}.json".format(source_system, db_cd)
        scon_file = config_system_dir.join(scon_filename)
        scon_file.write(json.dumps(scon_dict, indent=4))

        return str(scon_file), scon_dict