def __init__(self, execution_system, algorithm_instance, algorithm_params):
    """
    Set up the state of a generic algorithm.

    Stores the execution system and the algorithm parameters, then derives the
    cluster-side and local paths of the parameter file for this algorithm instance.

    :param execution_system: an instance of execution system
    :param algorithm_instance: name of the algorithm instance
    :param algorithm_params: algorithm configuration; the value stored under
        AlgorithmConfigurationHadoop.Keys.PARAMETERS is kept, or an empty dict if absent
    """
    self._execution_system = execution_system
    self._parameters = algorithm_params.get(AlgorithmConfigurationHadoop.Keys.PARAMETERS, {})

    exec_system = self._execution_system

    # File name layout: <system>-<database>-<environment>.<algorithm>.<time><extension>
    name_template = "{system}-{database}-{environment}.{algorithm}.{time}{extension}"
    params_file_name = name_template.format(
        system=exec_system.source_system,
        database=exec_system.database,
        environment=exec_system.environment,
        algorithm=algorithm_instance,
        time=Util.get_formatted_utc_now(EMRSystem.DATETIME_FORMAT),
        extension=ConfigService.Extensions.JSON
    )

    # The same file name is used below both the per-instance algorithm directory
    # and the config service's execution directory.
    instance_dir = os.path.join(exec_system.dir_apps_algorithm, algorithm_instance)
    self._params_uri_cluster = os.path.join(instance_dir, params_file_name)
    self._params_uri_local = os.path.join(exec_system.config_service.dir_exec, params_file_name)
def execute_hive(self, hql, return_output=False):
    """
    Run an HQL statement as a step on the EMR cluster and wait for its completion.

    The statement is written to a local temporary file and uploaded to S3, and the
    step runs ``hive --silent -f`` on the uploaded file, because the statement can be
    longer than the allowed length of an EMR step parameter.

    :param hql: HQL statement to execute
    :param return_output: when truthy, wait for the step's output file and return its content
    :return: content of the step's output file as read by s3_util.read_gzip_file_content
        when return_output is truthy, otherwise None
    """
    # Timestamp plus random id identify both the temporary file and the EMR step.
    timestamp = Util.get_formatted_utc_now(EMRSystem.DATETIME_FORMAT)
    random_id = EMRSystem._generate_random_id()

    file_name = "{}.{}{}".format(timestamp, random_id, ConfigService.Extensions.HQL)
    local_path = os.path.join(self.dir_tmp_local, file_name)
    s3_path = os.path.join(self.dir_tmp_s3, file_name)

    # Stage the statement locally, then push it to S3 where the step reads it.
    with open(local_path, "w") as local_file:
        local_file.write(hql)
    self.s3_util.upload_object(local_path, s3_path)

    command = "hive --silent -f {}".format(s3_path)
    step_title = "Hive EMR Step: datetime=\"{}\", id=\"{}\"".format(timestamp, random_id)

    cluster_client = self.emr_cluster_client
    step_id = cluster_client.add_step(step_title, command)
    cluster_client.wait_for_step_completion(step_id)

    if not return_output:
        return None

    # Wait for the step's output file to become available before reading it.
    output_path = cluster_client.get_step_output_path(step_id)
    logging.info("Waiting for availability of output file: '{}'.".format(output_path))
    self.s3_util.wait_for_file_availability(
        output_path,
        cluster_client.polling_interval_seconds,
        EMRClusterClient.AWSConstants.S3_FILE_AVAILABILITY_TIMEOUT_SECONDS
    )
    return self.s3_util.read_gzip_file_content(output_path)