class Flume(object):
    """ Wrapper for Flume command line utility """

    LOG = get_logger("Flume")

    @staticmethod
    def agent(agent=None, conf_file=None, executor=execute_shell_command):
        """
        Creates wrapper for 'flume-ng agent' command line commands

        :param agent: name of the agent
        :type agent: str
        :param conf_file: path to config file
        :type conf_file: str
        :rtype: FlumeAgent
        """
        return FlumeAgent(agent=agent, conf_file=conf_file, executor=executor)
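For illustration, a minimal usage sketch of the factory above. The merlin.tools.flume module path, the agent name "a1" and the config path are assumptions, not taken from the source; only Flume.agent() itself is defined here.

from merlin.tools.flume import Flume  # module path is an assumption

# Returns a FlumeAgent wrapper around the 'flume-ng agent' command for agent "a1"
agent = Flume.agent(agent="a1", conf_file="/etc/flume/conf/flume.conf")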
class WebHCatalog():
    """ WebHCatalog Client """

    URL = "http://{host}/templeton/v1/ddl/database/{database}/table/{table}/property/{property}?user.name={username}"
    DATA = "{{ \"value\": \"{value}\" }}"
    HEADERS = {'Content-type': 'application/json'}
    LOG = get_logger("WebHCatalog")

    def __init__(self, username, host="localhost", port=None):
        self.host = "{0}:{1}".format(host, port) if port else host
        self.username = username

    def table_properties(self, table, database="default"):
        """ Returns TableProperties object """
        return TableProperties(database=database, table=table, webhcat=self)
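A minimal usage sketch of the client. The module path, host name and port 50111 (WebHCat's conventional default) are assumptions; only the constructor and table_properties() are defined above.

from merlin.tools.webhcatalog import WebHCatalog  # module path is an assumption

webhcat = WebHCatalog(username="hdfs", host="webhcat.example.com", port=50111)
# Helper bound to a single table; property reads and writes go through the URL template above
props = webhcat.table_properties(table="orders", database="default")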
def __init__(self, name='LoggingListener'):
    self.log = get_logger(name)
class SparkApplication(object): """ Wrapper for spark-submit command line utility. Provides simple DSL to configure and launch Spark Application """ LOG = get_logger("Spark") SHELL_COMMAND = "spark-submit" def __init__(self, config=None, name=None, executor=execute_shell_command): """ :param config: configurations :param name: name of the config section containing specific application configurations :param executor: he interface used by the client to launch Spark Application. """ super(SparkApplication, self).__init__() self.executor = executor self._configs = config if config else Configuration.create() self.name = name if name \ else "SPARK_JOB_{0}".format(uuid.uuid4()) @staticmethod def load_preconfigured_job(name=None, config=None, executor=execute_shell_command): """ Creates wrapper for spark-submit command line utility. Configure it with options :param config: spark job configurations :param name: spark job identifier. Will be used as a name of the section with job-specific configurations. :param executor: :return: """ SparkApplication.LOG.info("Loading Spark Job from configuration") return SparkApplication(name=name, config=config, executor=executor) def run(self, *args): """ Submits spark application. :param args: Arguments passed to the main method of your main class, if any """ return self._fire_job(verbose=False, args=args) def _fire_job(self, verbose=False, args=None): _options = [] _options.extend(self._configure_spark_options()) if verbose: _options.append("--verbose") _options.append( self._configs.require( self.name, TaskOptions.SPARK_APP_CONFIG_APPLICATION_JAR)) if args: _options.extend(str(arg) for arg in args) return SparkJobStatus(self.executor(self.SHELL_COMMAND, *_options)) def debug(self, *args): """ Submits spark application in a verbose mode. :param args: Arguments passed to the main method of your main class, if any """ return self._fire_job(verbose=True, args=args) def master(self, master): """ Sets the cluster manager :param master: master URL of the clusters :return: """ self._configs.set(section=self.name, key=TaskOptions.SPARK_APP_CONFIG_MASTER, value=master) return self def application(self, application_jar, main_class=None, app_name=None): """ Configures Spark application :param application_jar: Path to a bundled jar including your application and all dependencies :param main_class: Java or Scala classname. Application entry point :param app_name: :return: """ self.application_jar(application_jar) self.main_class(main_class) self.application_name(app_name) return self def config_file(self, path): """ Configures Spark app to load default properties from file. :param path: Path to a file from which to load extra properties. If not specified, this will look for conf/spark-defaults.conf. :return: """ self._configs.set(section=self.name, key=TaskOptions.SPARK_APP_CONFIG_PROPERTIES_FILE, value=path) return self def with_config_option(self, key, value): """ Supplies configuration values at runtime. According to specification, configuration values explicitly set on a SparkConf take the highest precedence, then flags passed to spark-submit, then values in the defaults file. :param key: option name. 
see https://spark.apache.org/docs/latest/configuration.html for supported properties :param value: option value :return: """ _key_value_pair = "{key}={value}".format(key=key, value=value) self._configs.update_list(self.name, TaskOptions.SPARK_APP_CONFIG_OPTIONS, _key_value_pair) return self def main_class(self, main_class): """ Sets spark application's main class (for Java / Scala apps). :param main_class: :return: """ if main_class: self._configs.set(section=self.name, key=TaskOptions.SPARK_APP_CONFIG_MAIN_CLASS, value=main_class) return self def application_jar(self, app_jar): """ Sets path to a bundled jar including application with all dependencies. :param app_jar: path to application jar. :return: """ if app_jar: self._configs.set(section=self.name, key=TaskOptions.SPARK_APP_CONFIG_APPLICATION_JAR, value=app_jar) return self def application_name(self, name): """ Sets application's name. This will appear in the UI and in log data. :param name: A name of Spark application. :return: """ if name: self._configs.set(section=self.name, key=TaskOptions.SPARK_APP_CONFIG_APP_NAME, value=name) return self def classpath(self, *jar_files): """ Specifies the list of local jars to include on the driver and executor classpaths. :param jar_files: jar files to be included to application classpath :return: """ self._configs.update_list(self.name, TaskOptions.SPARK_APP_CONFIG_JARS, *jar_files) return self def pythonpath(self, *pyfiles): """ Specifies the list of .zip, .egg, or .py files to place on the PYTHONPATH for Python apps. :param pyfiles: :return: """ self._configs.update_list(self.name, TaskOptions.SPARK_APP_CONFIG_PYFILES, *pyfiles) return self def add_files(self, *files): """ Adds files to be placed in the working directory of each executor :param files: :return: """ self._configs.update_list(self.name, TaskOptions.SPARK_APP_CONFIG_FILES, *files) return self def _configure_spark_options(self): """ Adds next args to command : --master MASTER_URL spark://host:port, mesos://host:port, yarn, or local. --deploy-mode DEPLOY_MODE Where to run the driver program: either "client" to run on the local machine, or "cluster" to run inside cluster. --class CLASS_NAME Your application's main class (for Java / Scala apps). --name NAME A name of your application. --jars JARS Comma-separated list of local jars to include on the driver and executor classpaths. --py-files PY_FILES Comma-separated list of .zip, .egg, or .py files to place on the PYTHONPATH for Python apps. --files FILES Comma-separated list of files to be placed in the working directory of each executor. --properties-file FILE Path to a file from which to load extra properties. If not specified, this will look for conf/spark-defaults.conf. --conf PROP=VALUE Arbitrary Spark configuration property. 
:return: """ _options = [] _section = self.name if self._configs.has(_section, TaskOptions.SPARK_APP_CONFIG_MASTER): _options.extend([ "--master", self._configs.get(_section, TaskOptions.SPARK_APP_CONFIG_MASTER) ]) if self._configs.has(_section, TaskOptions.SPARK_APP_CONFIG_MAIN_CLASS): _options.extend([ "--class", self._configs.get(_section, TaskOptions.SPARK_APP_CONFIG_MAIN_CLASS) ]) if self._configs.has(_section, TaskOptions.SPARK_APP_CONFIG_APP_NAME): _options.extend([ "--name", self._configs.get(_section, TaskOptions.SPARK_APP_CONFIG_APP_NAME) ]) if self._configs.has(_section, TaskOptions.SPARK_APP_CONFIG_JARS): _options.extend([ "--jars", ",".join( self._configs.get_list(_section, TaskOptions.SPARK_APP_CONFIG_JARS)) ]) if self._configs.has(_section, TaskOptions.SPARK_APP_CONFIG_PYFILES): _options.extend([ "--py-files", ",".join( self._configs.get_list( _section, TaskOptions.SPARK_APP_CONFIG_PYFILES)) ]) if self._configs.has(_section, TaskOptions.SPARK_APP_CONFIG_FILES): _options.extend([ "--files", ",".join( self._configs.get_list(_section, TaskOptions.SPARK_APP_CONFIG_FILES)) ]) if self._configs.has(_section, TaskOptions.SPARK_APP_CONFIG_PROPERTIES_FILE): _options.extend([ "--properties-file", self._configs.get(_section, TaskOptions.SPARK_APP_CONFIG_PROPERTIES_FILE) ]) if self._configs.has(_section, TaskOptions.SPARK_APP_CONFIG_OPTIONS): _options.extend([ "--conf", "\"{0}\"".format(" ".join( self._configs.get_list( _section, TaskOptions.SPARK_APP_CONFIG_OPTIONS))) ]) return _options
    self.log = get_logger(name)
    self.metrics = {}

def on_begin(self, action_name):
    if 'error' not in action_name:
        if os.path.isfile('resources/step'):
            os.remove('resources/step')

def on_error(self, action_name, exception):
    file = open('resources/step', 'w')
    file.write(action_name)
    file.close()


if __name__ == '__main__':
    log = get_logger("SCD")

    # Prepare paths
    _pig_script = os.path.join(os.path.dirname(__file__), 'scd_processing.pig')
    _scd_active_snapshot = '/tmp/scd.active/scd.active.csv'
    _scd_updates = os.path.join(os.path.dirname(__file__), 'resources', 'scd.update.csv')
    _hdfs_job_output = '/tmp/scd.updated'

    _local_folder_to_monitor = LocalFS(os.path.join(os.path.dirname(__file__), 'resources'))
    _hdfs_basedir = HDFS('/tmp/scd.active')
    _hdfs_tmpdir = HDFS('/tmp/scd.tmp')
    _hdfs_tmpdir.create_directory()

    if _scd_updates and LocalFS(_scd_updates).exists():
def __init__(self, name='WorkflowFailOverController'):
    self.log = get_logger(name)
    self.metrics = {}
def __init__(self, name):
    super(Workflow, self).__init__()
    self.name = name
    self.__action_registry__ = {}
    self.log = get_logger(self.name)
class MapReduce(object): LOG = get_logger("MapReduce") def __init__(self, name, config, executable, executor, main_class=None, shell_command="hadoop jar"): self.executor = executor self.executable = executable self._config = config if config else Configuration.create( readonly=False, accepts_nulls=True) self.name = name if name else "MR_TASK_{0}".format(uuid.uuid4()) self.main_class = main_class self._shell_command = shell_command self._process = None @staticmethod def prepare_streaming_job(config=None, name=None, jar="hadoop-streaming.jar", executor=execute_shell_command): """ Creates instance of StreamingJob :param name: name of job :param jar: executing jar :param executor: interface used by the client to run command. :return: StreamingJob template :rtype : StreamingJob """ MapReduce.LOG.info("MapReduce streaming job") config = config if config else Configuration.create(readonly=False, accepts_nulls=True) MapReduce.__validate_configs(config, name, "StreamingJob", TaskOptions.KEYS_FOR_MAPREDUCE) return StreamingJob( config=config, name=name if name else "MR_STREAMING_JOB_{0}".format(uuid.uuid4()), jar=jar, executor=executor) @staticmethod def prepare_mapreduce_job(jar, main_class=None, config=None, name=None, executor=execute_shell_command): """ Creates instance of MapReduceJob :param name: name of job :param jar: executing jar :param executor: interface used by the client to run command. :return: MapReduceJob template :rtype : MapReduceJob """ MapReduce.LOG.info("MapReduce job") config = config if config else Configuration.create(readonly=False, accepts_nulls=True) MapReduce.__validate_configs(config, name, "MapReduceJob", TaskOptions.KEYS_FOR_STREAMING_JOB) return MapReduceJob( name=name if name else "MR_JOB_{0}".format(uuid.uuid4()), config=config, jar=jar, main_class=main_class, executor=executor) @staticmethod def __validate_configs(config, name, type_of_job, keys): """ Logs warning for set incorrect keys in .INI file for custom job """ for key in keys: if config.has(name, key): MapReduce.LOG.warning("{0} does not use this key: {1}.".format( type_of_job, key)) def run(self, *args): """ Runs specific MapReduce Job :param args: specific argument to CLI for MapReduceJob :rtype: """ if args: if isinstance(self, StreamingJob): MapReduce.LOG.warning("StreamingJob does not use args.") else: self._update_list_config_(TaskOptions.COMMAND_ARGS, *args) command, arguments = self.__configure_command__() self._process = self.executor(command, *arguments) return self._process def status(self): """ Returns status of finished job :return: """ return None if self._process.is_running() \ else JobStatus(JobStatus.job_id(self._process.stderr)) def with_number_of_reducers(self, reducer_num): """ Streaming MR job has it's own command parameter to set number of reducers. 
Overrides base method to ignore 'mapreduce.job.reduces' configuration option :param reducer_num: :return: """ return self.with_config_option( TaskOptions.CONFIG_KEY_MR_JOB_REDUCER_NUM, reducer_num) def disable_reducers(self): return self.with_number_of_reducers(0) def __configure_command__(self): """Overrides this method to configure MR job""" if not os.path.isfile(self.executable): raise MapReduceConfigurationError("{0} doesn't exist".format( self.executable)) arguments = [self.executable] if self.main_class is not None: arguments.append(self.main_class) arguments.extend(self._generic_options_()) arguments.extend(self._command_options_()) return self._shell_command, arguments def load_configuration_from(self, _file): """ Specifies an application configuration file. :param _file: """ self._config[TaskOptions.CONFIG_KEY_MR_JOB_CONF_FILE] = _file return self def use_jobtracker(self, jobtracker): """ Specifies an application jobtracker. :param jobtracker: """ self._config[TaskOptions.CONFIG_KEY_MR_JOBTRACKER] = jobtracker return self def _generic_options_(self): """ Adds generic option to hadoop command. -conf <configuration file> -D <property>=<value> -jt <local> or <jobtracker:port> Specify a job tracker. -files <comma separated list of files> Specify comma separated files to be copied to the map reduce cluster. -libjars <comma separated list of jars> Specify comma separated jar files to include in the classpath. -archives <comma separated list of archives> Specify comma separated archives to be unarchived on the compute machines. Applications should implement Tool to support GenericOptions. :return: """ options = [] # Specify an application configuration file if self.has_option(TaskOptions.CONFIG_KEY_MR_JOB_CONF_FILE): options.extend( ['--conf', self.get(TaskOptions.CONFIG_KEY_MR_JOB_CONF_FILE)]) # Add or override MR job options options.extend( ['-D', '='.join([TaskOptions.CONFIG_KEY_MR_JOB_NAME, self.name])]) if self.has_option(TaskOptions.CONFIG_KEY_MR_JOB_CONF_OPTION): options.extend("-D {0}".format(att) for att in self.get_list( TaskOptions.CONFIG_KEY_MR_JOB_CONF_OPTION)) # Specify an application jobtracker if self.has_option(TaskOptions.CONFIG_KEY_MR_JOBTRACKER): options.extend( ['--jt', self.get(TaskOptions.CONFIG_KEY_MR_JOBTRACKER)]) # comma separated files to be copied to the map reduce cluster if self.has_option(TaskOptions.CONFIG_KEY_MR_JOB_CACHE_FILE): options.extend([ '-files', ",".join( self.get_list(TaskOptions.CONFIG_KEY_MR_JOB_CACHE_FILE)) ]) # comma separated jar files to include in the classpath if self.has_option(TaskOptions.CONFIG_KEY_MR_JOB_LIBJARS): options.extend([ '-libjars', ",".join(self.get_list(TaskOptions.CONFIG_KEY_MR_JOB_LIBJARS)) ]) # comma separated archives to be unarchived on the compute machines if self.has_option(TaskOptions.CONFIG_KEY_MR_JOB_CACHE_ARCHIVE): options.extend([ '-archives', ",".join( self.get_list(TaskOptions.CONFIG_KEY_MR_JOB_CACHE_ARCHIVE)) ]) return options def _command_options_(self): options = [] return options def _update_list_config_(self, _key, *values): _inputs = self._config.get(self.name, _key) if self._config.has(self.name, _key) \ else [] _inputs.extend(values) return self._update_config_option_(_key, _inputs) def _update_config_option_(self, key, value): self._config.set(self.name, key, value) return self def with_config_option(self, key, value): """ Adds or updates job configuration variable. 
In case java-based MR job options will be passed into configuration object :param key: variable name :param value: variable value :return: """ return self._update_list_config_( TaskOptions.CONFIG_KEY_MR_JOB_CONF_OPTION, "{0}={1}".format(key, value)) def use_jars(self, *libs): """ Adds jar files to be included in the classpath :param libs: jar that should be placed in distributed cache and will be made available to all of the job's task attempts. :return: """ return self._update_list_config_(TaskOptions.CONFIG_KEY_MR_JOB_LIBJARS, *libs) def cache_files(self, *files): """ Adds files which will be copied to the Map/Reduce cluster :param files: The list of files that need to be added to distributed cache :return: """ return self._update_list_config_( TaskOptions.CONFIG_KEY_MR_JOB_CACHE_FILE, *files) def cache_archives(self, *archives): """ Adds archives which will be copied to the Map/Reduce cluster :param files: The list of archives that need to be added to distributed cache :return: """ return self._update_list_config_( TaskOptions.CONFIG_KEY_MR_JOB_CACHE_ARCHIVE, *archives) def has_option(self, key): """ Checks if job configuration contains specified option :param key: option name :return: True option value """ return self._config.has(self.name, key) def get(self, key, required=False): """ Gets the value of the specified configuration option. :param key: option name :param required: Boolean flag, True is option is required :return: option value or None in case option was not found and option is not required. ConfigurationError will be thrown in case required option was not found within current configuration """ return self._config.require(self.name, key) if required \ else self._config.get(self.name, key) def get_list(self, key, required=False): """ Gets the value of the specified configuration option property as a list :param key: option name :param required: True is option is required :return: property value as a list of strings or None in case option was not found and option is not required. ConfigurationError will be thrown in case required option was not found within current configuration """ return self._config.require_list(section=self.name, key=key) if required \ else self._config.get_list(section=self.name, key=key)
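As a usage sketch of the builder above (the jar path, class and file names are illustrative; merlin.tools.mapreduce is the import already used elsewhere in this project): prepare a java-based job, attach extra jars and cached files, add a -D option, run it with CLI arguments, and then inspect the finished job through status().

from merlin.tools.mapreduce import MapReduce

mr_job = MapReduce.prepare_mapreduce_job(jar="/opt/jobs/wordcount.jar",
                                         main_class="com.example.WordCountDriver",
                                         name="wordcount") \
    .use_jars("/opt/libs/custom-serde.jar") \
    .cache_files("/tmp/stopwords.txt") \
    .with_config_option("mapreduce.job.reduces", 4)

mr_job.run("/input/path", "/output/path")
status = mr_job.status()          # JobStatus for the finished job, None while still running
if status and status.is_failed():
    print(status.failure_reason())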
Standard scenario of an ETL process. The flow imports data from MySQL to HDFS,
processes it and uploads the processed data back to MySQL.
"""
import os

from merlin.common.exceptions import MapReduceJobException
from merlin.common.logger import get_logger
from merlin.flow.flow import FlowRegistry, Workflow
from merlin.flow.listeners import WorkflowListener, LoggingListener
from merlin.tools.mapreduce import MapReduce
from merlin.tools.sqoop import Sqoop

BASE_DIR = "/tmp"
LOG = get_logger("SimpleETLFlow")


# Imports data from mysql's table 'test_example.first_table_name' (id, name, count)
# to the HDFS folder '/tmp/data_from_import' in "'id','name','count'" format.
@Workflow.action(flow_name='Flow',
                 action_name='Sqoop import etl step',
                 on_success='MapReduce job etl step',
                 on_error='error')
def load_data_from_rdbms_to_hdfs(context):
    # configure Sqoop import job
    _sqoop_import_job_ = Sqoop.import_data().from_rdbms(
        host="127.0.0.1",
        rdbms="mysql",
        database="test_example",
        username="******",
Compares them and gets only the new files on FTP that don't exist on HDFS.
Downloads the new files to HDFS with partitioning.
"""
from ConfigParser import RawConfigParser
import os

from merlin.common.logger import get_logger
from merlin.flow.flow import Workflow, FlowRegistry
from merlin.flow.listeners import LoggingListener, WorkflowListener
from merlin.fs.ftp import ftp_client
from merlin.fs.hdfs import HDFS
from merlin.fs.localfs import LocalFS
from merlin.fs.utils import FileUtils
from merlin.tools.hive import Hive

BASE_DIR = "/tmp/base_folder"
log = get_logger("MonitoringFTP")

config = RawConfigParser()
config.read(os.path.join(os.path.dirname(__file__), "resources/ftp_config.ini"))
HOST_DOWNLOAD = config.get("ftp", "host.download")
USER_NAME = config.get("ftp", "user.name")
PASSWORD = config.get("ftp", "password")
PATH = config.get("ftp", "path")


def get_name(path):
    if not hasattr(path, 'name'):
        raise TypeError('FileDescriptor is required. '
                        'Cannot extract file name from {0}'.format(path.__class__))
    return path.name.split("/")[-1]
class Kafka(): """ Wrapper for Kafka command line scripts """ RUN_SHELL = "kafka-run-class.sh" LOG = get_logger("Kafka") @staticmethod def run_consumer(name, args, executor=execute_shell_command, kafka_run_class=RUN_SHELL): """ Runs specific consumer. Executing command: kafka-run-class.sh {name} {configs} :param name: :param args: :param executor: :return: """ command = "{0} {1}".format(kafka_run_class, name) if args: command += Kafka.__get_configs(args) return Kafka.__run(executor, command) @staticmethod def __get_configs(configs): command = "" if isinstance(configs, list): for value in configs: command += " {0}".format(value) elif isinstance(configs, str): command += " {0}".format(configs) elif isinstance(configs, dict): for key, value in configs.iteritems(): command += " {0} {1}".format(key, value) return command @staticmethod def __get_configs_topics(configs): command = "" if isinstance(configs, list): for value in configs: command += " --config {0}".format(value) elif isinstance(configs, str): for value in configs.split(","): command += " --config {0}".format(value) elif isinstance(configs, dict): for key, value in configs.iteritems(): command += " --config {0}={1}".format(key, value) return command @staticmethod def run_producer(name, args, executor=execute_shell_command, kafka_run_class=RUN_SHELL): """ Runs specific producer. Executing command: kafka-run-class.sh {name} {configs} :param name: :param args: :param executor: :return: """ command = "{0} {1}".format(kafka_run_class, name) if args: command += Kafka.__get_configs(args) return Kafka.__run(executor, command) @staticmethod def start_broker(path_to_config, executor=execute_shell_command, kafka_run_class="kafka-server-start.sh"): """ Runs broker using configuration file. Executing command: kafka-server-start.sh {path_to_config} :param path_to_config: :param executor: :return: """ command = "{0} {1}".format(kafka_run_class, path_to_config) return Kafka.__run(executor, command) @staticmethod def stop_broker(path_to_config, executor=execute_shell_command, kafka_run_class="kafka-server-stop.sh"): """ Runs broker using configuration file. 
Executing command: kafka-server-stop.sh {path_to_config} :param path_to_config: :param executor: :return: """ command = "{0} {1}".format(kafka_run_class, path_to_config) return Kafka.__run(executor, command) @staticmethod def create_topic(name, replication_factor=None, replica_assignment=None, partitions=1, zookeeper_host=None, args=None, executor=execute_shell_command, kafka_run_class=RUN_SHELL): """ Creates topic :param name: :param replication_factor: :param replica_assignment: :param partitions: :param zookeeper_host: :param args: :param executor: :return: """ command = "{0} kafka.admin.TopicCommand --create --zookeeper {2} --topic {1} --partitions {3}" \ .format(kafka_run_class, name, zookeeper_host, partitions) if replication_factor: command += " --replication-factor {0}".format(replication_factor) if replica_assignment: command += " --replication-assignment {0}".format( replica_assignment) if args: command += Kafka.__get_configs_topics(args) Kafka.__run(executor, command) return Topic(name, zookeeper_host, executor) @staticmethod def get_list_topics(zookeeper_host=None, executor=execute_shell_command, kafka_run_class=RUN_SHELL): """ Returns existing list of topics on zookeeper :param zookeeper_host: :param executor: :return: """ command = "{0} kafka.admin.TopicCommand --zookeeper {1} --list" \ .format(kafka_run_class, zookeeper_host) topics = [] for t in Kafka.__run(executor, command).stdout.split('\n'): topics.append(Topic(t, zookeeper_host)) return topics @staticmethod def __run(executor, command): Kafka.LOG.info("Executing Kafka command: {0}".format(command)) return executor(command)
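A hedged sketch of the static helpers above. The merlin.tools.kafka module path, the config paths, the kafka.tools.ConsoleConsumer class name and its flags are assumptions based on a typical Kafka 0.8-style install, not taken from the source.

from merlin.tools.kafka import Kafka  # module path is an assumption

# Start a broker from its properties file and create a topic behind it
Kafka.start_broker(path_to_config="/etc/kafka/server.properties")
topic = Kafka.create_topic(name="events",
                           replication_factor=1,
                           partitions=3,
                           zookeeper_host="localhost:2181")

# Run a consumer class via kafka-run-class.sh; the args dict is rendered as "--flag value" pairs
Kafka.run_consumer(name="kafka.tools.ConsoleConsumer",
                   args={"--topic": "events", "--zookeeper": "localhost:2181"})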
class Topic(): LOG = get_logger("Topic") def __init__(self, name, zookeeper_host, executor=execute_shell_command): """ Wrapper for Kafka command #./bin/kafka-run-class.sh kafka.admin.TopicCommand :param name: :param zookeeper_host: :param executor: :return: """ self.name = name self.zookeeper_host = zookeeper_host self._executor = executor def get_metadata(self, kafka_run_class=Kafka.RUN_SHELL): """ Returns metadata of topic. Executing command: #./bin/kafka-run-class.sh kafka.admin.TopicCommand --topic {name} --describe --zookeeper {host:port} :return: """ return self.__run("--topic {0} --describe".format(self.name), kafka_run_class).stdout def add_config(self, key, value, kafka_run_class=Kafka.RUN_SHELL): """ Adds config to topic. Executing command: #./bin/kafka-run-class.sh kafka.admin.TopicCommand --topic {name} --alter --zookeeper {host:port} config {key=value} :param key: :param value: :return: """ return self.__run( "--topic {0} --alter config {1}={2}".format(self.name, key, value), kafka_run_class) def delete_config(self, key, value, kafka_run_class=Kafka.RUN_SHELL): """ Deletes config from topic. Executing command: #./bin/kafka-run-class.sh kafka.admin.TopicCommand --topic {name} --alter --zookeeper {host:port} deleteConfig {key=value} :param key: :param value: :return: """ return self.__run( "--topic {0} --alter deleteConfig {1}={2}".format( self.name, key, value), kafka_run_class) def delete(self, kafka_run_class=Kafka.RUN_SHELL): """ Deletes topic. Executing command: #./bin/kafka-run-class.sh kafka.admin.TopicCommand --topic {name} --delete --zookeeper {host:port} :return: """ return self.__run("--topic {0} --delete".format(self.name), kafka_run_class) def is_exists(self, kafka_run_class=Kafka.RUN_SHELL): """ Returns True if topic exist else - False. Executing command: #./bin/kafka-run-class.sh kafka.admin.TopicCommand --topic {name} --list --zookeeper {host:port} :return: """ result = self.__run("--list", kafka_run_class) topics = result.stdout.split('\n') return str(self.name) in topics def __run(self, command, kafka_run_class): Topic.LOG.info("Executing Topic command") result = self._executor( "{0} kafka.admin.TopicCommand".format(kafka_run_class), "--zookeeper", self.zookeeper_host, command) return result
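And a short sketch against an existing topic; the zookeeper address and the retention.ms setting are illustrative, only methods defined on Topic above are used.

topic = Topic(name="events", zookeeper_host="localhost:2181")
if topic.is_exists():
    print(topic.get_metadata())              # partition / replica layout reported by TopicCommand
    topic.add_config("retention.ms", "86400000")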
def __init__(self):
    super(FsSnapshot, self).__init__()
    self.logger = get_logger(self.__class__.__name__)
    self.files = {}
def __init__(self, name='ProfilingListener'):
    self.log = get_logger(name)
    self.metrics = {}
class DistCp(object): """Hadoop's command distcp utilities.""" LOG = get_logger("DistCP") def __init__(self, executor=execute_shell_command): """ Creates a new DistCp instance :param executor: command executor :type executor: :rtype : DistCp """ self.preserve = "-p" self.strategy = None self.mappers = None self.synchronize = False self.path_src = None self.path_dest = None self.__executor = executor def run(self): """ Runs DistCp Job :rtype: Result """ DistCp.LOG.info("Running DistCp Job") _process = self.__executor('hadoop distcp', self.build()) _process.if_failed_raise(DistCpError("DistCp Job failed")) return _process def build(self): """ Builds DistCp command :rtype: str """ list_attributes = [self.preserve] if self.mappers: list_attributes.append(self.mappers) if self.strategy: list_attributes.append(self.strategy) if self.synchronize: list_attributes.append("-delete") if self.path_src: list_attributes.append(self.path_src) else: raise DistCpError("You must specify source that will be copied") if self.path_dest: list_attributes.append(self.path_dest) else: raise DistCpError( "You must specify destination where will saved file") return " ".join(list_attributes) def take(self, path): """ Specifies the directory or file on file system which will be copied. Exception will be raised in case the directory or file isn't exists on file system :param path: path to source which should be copied :type path: str :rtype: DistCp """ self.path_src = path return self def copy_to(self, path): """ Specifies the directory or file on file system into which the data should be copied. :param path: path to destination into which the data should be copied :type path: str :rtype: DistCp """ self.path_dest = path return self def use(self, mappers=None): """ Specifies number of mappers :param mappers: number of map tasks :type mappers: str, int :rtype: DistCp """ self.mappers = "-m {0}".format(str(mappers)) return self def update_destination(self, synchronize=False): """ Changes command strategy to update :param synchronize: synchronizes source with destination if param is True :type synchronize: bool :rtype: DistCp """ self.strategy = "-update" if synchronize: self.synchronize = True return self def overwrite_destination(self, synchronize=False): """ Changes command strategy to overwrite :param synchronize: synchronizes source with destination if param is True :type synchronize: bool :rtype: DistCp """ self.strategy = "-overwrite" if synchronize: self.synchronize = True return self def preserve_replication_number(self): """ Sets replication number of file in destination equals to replication number of file in source :rtype: DistCp """ self.__set_preserve('r') return self def preserve_block_size(self): """ Sets block size of file in destination equals to block size of file in source :rtype: DistCp """ self.__set_preserve('b') return self def preserve_user(self): """ Sets user of file in destination equals to user of file in source :rtype: DistCp """ self.__set_preserve('u') return self def preserve_permission(self): """ Sets permission of file in destination equals to permission of file in source :rtype: DistCp """ self.__set_preserve('p') return self def preserve_group(self): """ Sets group of file in destination equals to group of file in source :rtype: DistCp """ self.__set_preserve('g') return self def preserve_checksum_type(self): """ Sets checksum type of file in destination equals to checksum type of file in source :rtype: DistCp """ self.__set_preserve('c') return self def __set_preserve(self, value): 
""" :return: """ if value not in self.preserve: self.preserve = "{0}{1}".format(self.preserve, value)
class JobStatus(object): """ Describes the current status of a job. """ COUNTER_SECTION = 'COUNTER' LOG = get_logger("MapReduceJobStatus") CLI_COMMAND = 'hadoop job' def __init__(self, job_id, executor=execute_shell_command): super(JobStatus, self).__init__() self.job_id = job_id self._executor = executor self.job_stats = None def state(self): """ Returns the current state of the Job. :return: string value for job state. Possible values : FAILED, KILLED, PREP, RUNNING, SUCCEEDED """ return self.stats()['Job state'] @staticmethod def job_id(stderr): """ Parses MR job stderr to get job id. :return: job id """ _job_id = None for line in stderr.splitlines(): if 'Running job:' in line: _job_id = str(line).rsplit(':', 1)[1].strip() JobStatus.LOG.info("Job id : {0}".format(_job_id)) break if not _job_id: JobStatus.LOG.info("Cannot get job id") return _job_id def counters(self): """ Gets the counters for this job. :return: all job counters in format {counter_group :{counter_name : counter_value}} """ return self.stats()[JobStatus.COUNTER_SECTION] def counter(self, group, counter): """ Gets the value of the specific job counter. :param group: :param counter: :return: the value for the specific counter """ _counters = self.counters() return int( _counters[group][counter] ) if group in _counters and counter in _counters[group] else None def stats(self): """ Gets aggregate job statistics, which includes: - job id - job file - job tracking URL - number of maps/reduces - map()/reduce() completion - job state - reason for failture - job counters - etc :return: job details """ if not self.job_stats: _result = self._executor(self.CLI_COMMAND, '-status', self.job_id) _result.if_failed_raise( CommandException("cannot get map reduce job status")) self._parse_stdout_(_result.stdout) return self.job_stats def is_failed(self): """ Checks if the job failed. :return: """ return 'FAILED' == self.state() def is_killed(self): """ Checks if the job process was killed. :return: """ return 'KILLED' == self.state() def is_succeeded(self): """ Checks if the job completed successfully. :return: """ return self.state() == 'SUCCEEDED' def is_running(self): """ Checks if the job is finished or not. :return: True if the job has running or prep state """ return self.state() in ['PREP', 'RUNNING'] def failure_reason(self): """ Gets any available info on the reason of failure of the job. :return: diagnostic information on why a job might have failed. """ return None if not self.is_failed() else self.stats( )['reason for failure'] def _parse_stdout_(self, stream): """ Parses hadoop jar -status <job_id> output stream to get job stats :param stream: stream containing job stats data :return: dictionary containing job stats """ _counter_group = None _job_metrics = {JobStatus.COUNTER_SECTION: {}} for line in stream.splitlines(): is_counter = re.match('\A\t\t\w', line) is_counter_header = not is_counter and re.match('\A\t\w', line) key_value = [ part.strip() for part in line.split("=" if is_counter else ":", 1) ] if is_counter_header: _counter_group = line.strip() elif is_counter: if not _counter_group in _job_metrics[ JobStatus.COUNTER_SECTION]: _job_metrics[ JobStatus.COUNTER_SECTION][_counter_group] = {} _job_metrics[JobStatus.COUNTER_SECTION][_counter_group][ key_value[0]] = key_value[1] elif len(key_value) > 1: _job_metrics[key_value[0]] = key_value[1] self.job_stats = _job_metrics
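For illustration, the lower-level flow that MapReduce.status() wraps, reusing the mr_job sketch from the MapReduce section above. The counter group and name are Hadoop's built-in task counters and may differ between Hadoop versions.

process = mr_job.run("/input/path", "/output/path")
status = JobStatus(JobStatus.job_id(process.stderr))
if not status.is_running() and status.is_succeeded():
    # number of records read by the mappers, taken from the parsed 'hadoop job -status' output
    rows = status.counter(group="org.apache.hadoop.mapreduce.TaskCounter",
                          counter="MAP_INPUT_RECORDS")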
class Pig(object): """ Wrapper for pig command line utility. Provides logic to configure and launch Pig scripts""" LOG = get_logger("Pig") @staticmethod def load_commands_from_file(path, command_executor=execute_shell_command): """ Creates an instance of Pig client. Configures Pig client to run commands from specified script file. :param path: path to the script to execute :param command_executor: The interface used by the client to run command. :type path: str :rtype: Pig """ Pig.LOG.info("Loading Pig script from file : {0}".format(path)) _config = Configuration.create(readonly=False, accepts_nulls=True) _job_name = "PIG_TASK_{0}".format(uuid.uuid4()) _pig = Pig(config=_config, job_name=_job_name, command_executor=command_executor) _pig.execute_script(path=path) return _pig @staticmethod def load_commands_from_string(commands, command_executor=execute_shell_command): """ Creates an instance of Pig client. Configures Pig client to parse and run commands from string. :param commands: Commands to execute (within quotes) :param command_executor: The interface used by the client to run command. :type commands: str :rtype: Pig """ _config = Configuration.create(readonly=False, accepts_nulls=True) _job_name = "PIG_TASK_{0}".format(uuid.uuid4()) _pig = Pig(config=_config, job_name=_job_name, command_executor=command_executor) _pig.execute_commands(commands=commands) return _pig @staticmethod def load_preconfigured_job(config, job_name, command_executor=execute_shell_command): """ Creates a pre-configured instance of the Pig client. :param config: pig job configurations :param job_name: ig job identifier. Will be used as a name of the section with job-specific configurations. :param command_executor: :return: """ return Pig(config=config, job_name=job_name, command_executor=command_executor) def __init__(self, config, job_name, command_executor): self._config = config self._job_name = job_name self._command_executor = command_executor def __add_config_option__(self, key, value): """Facade method used to add new options to job-specific section of the configuration""" self._config.set(section=self._job_name, key=key, value=value) def _has_config_option_(self, key): """Facade method used to check if option with specified name is exist in configuration""" return self._config.has(section=self._job_name, key=key) def _get_config_option_(self, key): """Facade method used to get option from configuration""" return self._config.get( section=self._job_name, key=key) def _wrap_with_quotes_(self, value): """Wraps string with quotes: single or double""" if not value or value[0] in ['"', "'"]: return value _template = "'{}'" if '"' in value else '"{}"' return _template.format(value) def execute_script(self, path): """ Specifies file containing script to execute. 
:param path: Path to the script to execute Will be passed to command executor as a value of -file option :rtype: Pig """ if path: self.__add_config_option__(TaskOptions.CONFIG_KEY_SCRIPT_FILE, path) return self def execute_commands(self, commands): """ Specifies commands to execute :param commands: Commands to execute (within quotes) :rtype: Pig """ if commands: self.__add_config_option__(TaskOptions.CONFIG_KEY_COMMANDS_STRING, commands) return self def _configure_command_(self): """Adds pig commands to cli call.""" if self._has_config_option_(TaskOptions.CONFIG_KEY_SCRIPT_FILE): return ['-f', self._wrap_with_quotes_( self._get_config_option_(key=TaskOptions.CONFIG_KEY_SCRIPT_FILE) )] elif self._has_config_option_(TaskOptions.CONFIG_KEY_COMMANDS_STRING): return ['-e', self._wrap_with_quotes_( self._get_config_option_(key=TaskOptions.CONFIG_KEY_COMMANDS_STRING) )] else: raise PigCommandError( "Failed to configure command : one of {} or {} is required".format( TaskOptions.CONFIG_KEY_SCRIPT_FILE, TaskOptions.CONFIG_KEY_COMMANDS_STRING) ) def _configure_pig_options_(self, verbose=False): """Parse job specific configurations and builds arguments to be passed to CLI call""" _options = [] if verbose: _options.append('-verbose') _options.extend(self.__configure_logging__()) self.__add_command_arg__("-param_file", TaskOptions.CONFIG_KEY_PARAMETER_FILE, _options) if self._has_config_option_(TaskOptions.CONFIG_KEY_PARAMETER_VALUE): _params = self._config.get_list(self._job_name, TaskOptions.CONFIG_KEY_PARAMETER_VALUE) if _params: _options.extend(["-param {}".format(param) for param in _params]) self.__add_command_arg__("-propertyFile", TaskOptions.CONFIG_KEY_PROPERTIES_FILE, _options) if self._has_config_option_(TaskOptions.CONFIG_KEY_EXECTYPE): _options.append("-x {}".format(self._get_config_option_(TaskOptions.CONFIG_KEY_EXECTYPE))) _options.extend(self._disable_optimizations_()) return _options def __configure_logging__(self): """add logging configurations to cli call""" _logging_options_ = [] self.__add_command_arg__("-log4jconf", TaskOptions.CONFIG_KEY_LOG4J, _logging_options_) self.__add_command_arg__("-logfile", TaskOptions.CONFIG_KEY_LOG_FILE, _logging_options_) self.__add_command_marker_arg("-brief", TaskOptions.CONFIG_KEY_LOG_BRIEF, _logging_options_) self.__add_command_marker_arg("-warning", TaskOptions.CONFIG_KEY_LOG_WARNING, _logging_options_) self.__add_command_marker_arg("-debug", TaskOptions.CONFIG_KEY_LOG_DEBUG, _logging_options_) return _logging_options_ def _disable_optimizations_(self): """add cli call args to disable Pig Job optimizations""" _optimizations = [] _optimizations.extend( self._configure_optimization_rule(TaskOptions.CONFIG_KEY_DISABLE_SPLIT_FILTER) ) _optimizations.extend( self._configure_optimization_rule(TaskOptions.CONFIG_KEY_DISABLE_PUSHUP_FILTER) ) _optimizations.extend( self._configure_optimization_rule(TaskOptions.CONFIG_KEY_DISABLE_MERGE_FILTER) ) _optimizations.extend( self._configure_optimization_rule( TaskOptions.CONFIG_KEY_DISABLE_PUSHDOWN_FOREACH_FLATTEN ) ) _optimizations.extend( self._configure_optimization_rule(TaskOptions.CONFIG_KEY_DISABLE_LIMIT_OPTIMIZER) ) _optimizations.extend( self._configure_optimization_rule( TaskOptions.CONFIG_KEY_DISABLE_COLUMN_MAP_KEY_PRUNE ) ) _optimizations.extend( self._configure_optimization_rule( TaskOptions.CONFIG_KEY_DISABLE_ADD_FOREACH ) ) _optimizations.extend( self._configure_optimization_rule( TaskOptions.CONFIG_KEY_DISABLE_MERGE_FOREACH ) ) _optimizations.extend( self._configure_optimization_rule( 
TaskOptions.CONFIG_KEY_DISABLE_GROUPBY_CONST_PARALLEL_SETTER ) ) _optimizations.extend( self._configure_optimization_rule(TaskOptions.CONFIG_KEY_DISABLE_ALL) ) if self.is_optimization_disabled(TaskOptions.CONFIG_KEY_DISABLE_MULTIQUERY): _optimizations.append('-no_multiquery') return _optimizations def _configure_optimization_rule(self, rule_name): """build cli parameter to disable specific optimization rule""" return ['-optimizer_off', rule_name] if self.is_optimization_disabled(rule_name) else [] def __add_command_arg__(self, name, config_key, args=list()): """adds argument to cli call""" if self._has_config_option_(config_key): args.extend([name, self._get_config_option_(config_key)]) def __add_command_marker_arg(self, name, config_key, args=list()): """adds marker argument (argument without value) to cli call""" if self._has_config_option_(config_key) and self._get_config_option_(config_key): args.append(name) def log_config(self, logfile=None, debug=False, warning=False, brief=False): """ Adds and configures custom logger for Pig's script :param logfile: to file, that will have logs from Pig Job :param debug: Enables debug level. Default it is False :param warning: Enables warning level. Default it is False Also turns warning aggregation off :param brief: Enables Brief logging (no timestamps). Default it is False :type logfile: str :type debug: bool :type warning bool :type brief: bool :rtype: Pig """ self.__add_config_option__(TaskOptions.CONFIG_KEY_LOG_FILE, logfile) if debug: self.__add_config_option__(TaskOptions.CONFIG_KEY_LOG_DEBUG, "enabled") if warning: self.__add_config_option__(TaskOptions.CONFIG_KEY_LOG_WARNING, "enabled") if brief: self.__add_config_option__(TaskOptions.CONFIG_KEY_LOG_BRIEF, "enabled") return self def log4j_config(self, path): """ Specify Log4j configuration file, overrides log conf :param path: path to file with log4j parameters for Pig Job :return: """ if path: self.__add_config_option__(TaskOptions.CONFIG_KEY_LOG4J, path) return self def with_parameter(self, key, value): """ Sets parameter for Pig Job script in the next format: name=value :param key: key to parameter :param value: value of parameter :type key: str :type value: str :rtype: Pig """ self._config.update_list(self._job_name, TaskOptions.CONFIG_KEY_PARAMETER_VALUE, "{}={}".format(key, value)) return self def load_parameters_from_file(self, path): """ Specifies file with parameters :param path: Path to the parameter file :return: """ self.__add_config_option__(TaskOptions.CONFIG_KEY_PARAMETER_FILE, path) return self def with_property_file(self, path): """ Sets file with properties at the given path :param path: to file with properties for Pig Job :type path: str :rtype: Pig """ self.__add_config_option__(TaskOptions.CONFIG_KEY_PROPERTIES_FILE, path) return self def without_split_filter(self): """ Job will run without optimization 'Split filter conditions' Optimization split filter condition to allow push filter more aggressively. e.g.: D = FILTER C BY a1>0 and b1>0; will be splitted into: X = FILTER C BY a1>0; D = FILTER X BY b1>0; """ self.__add_config_option__(TaskOptions.CONFIG_KEY_DISABLE_SPLIT_FILTER, 'disable') return self def without_pushup_filter(self): """ Job will run without optimization 'Early Filters' The objective of this optimization rule is to push the FILTER operators up the data flow graph. As a result, the number of records that flow through the pipeline is reduced. 
""" self.__add_config_option__(TaskOptions.CONFIG_KEY_DISABLE_PUSHUP_FILTER, 'disable') return self def without_merge_filter(self): """ Job will run without optimization 'Merge filter conditions' This rule used to merge filter conditions after PushUpFilter rule to decrease the number of filter statements. """ self.__add_config_option__(TaskOptions.CONFIG_KEY_DISABLE_MERGE_FILTER, 'disable') return self def without_push_down_foreach_flatten(self): """ Job will run without optimization 'Join or explode as late as possible' The objective of this rule is to reduce the number of records that flow through the pipeline by moving FOREACH operators with a FLATTEN down the data flow graph. """ self.__add_config_option__( TaskOptions.CONFIG_KEY_DISABLE_PUSHDOWN_FOREACH_FLATTEN, 'disable' ) return self def without_limit_optimizer(self): """ Job will run without optimization 'Limit as early as possible' The objective of this rule is to push the LIMIT operator up the data flow graph. In addition, for top-k (ORDER BY followed by a LIMIT) the LIMIT is pushed into the ORDER BY. """ self.__add_config_option__(TaskOptions.CONFIG_KEY_DISABLE_LIMIT_OPTIMIZER, 'disable') return self def without_column_map_key_prune(self): """ Job will run without optimization 'Remove unused data' Prune the loader to only load necessary columns. The performance gain is more significant if the corresponding loader support column pruning and only load necessary columns. Otherwise, ColumnMapKeyPrune will insert a ForEach statement right after loader. """ self.__add_config_option__(TaskOptions.CONFIG_KEY_DISABLE_COLUMN_MAP_KEY_PRUNE, 'disable') return self def without_add_foreach(self): """ Job will run without optimization 'Add ForEach to remove unneeded columns' Prune unused column as soon as possible. """ self.__add_config_option__(TaskOptions.CONFIG_KEY_DISABLE_ADD_FOREACH, 'disable') return self def without_merge_foreach(self): """ Job will run without optimization 'Merge adjacent ForEach' The objective of this rule is to merge together two foreach statements, if these preconditions are met: - The foreach statements are consecutive. - The first foreach statement does not contain flatten. - The second foreach is not nested. """ self.__add_config_option__(TaskOptions.CONFIG_KEY_DISABLE_MERGE_FOREACH, 'disable') return self def without_groupby_const_parallel_setter(self): """ Job will run without optimization 'Force parallel 1 for "group all" statement' Force parallel "1" for "group all" statement. That's because even if we set parallel to N, only 1 reducer will be used in this case and all other reducer produce empty result. """ self.__add_config_option__(TaskOptions.CONFIG_KEY_DISABLE_GROUPBY_CONST_PARALLEL_SETTER, 'disable') return self def without_multiquery(self): """ Turns off multi query optimization. 
Default multi query optimization is turned on :rtype: Pig """ self.__add_config_option__(TaskOptions.CONFIG_KEY_DISABLE_MULTIQUERY, 'disable') return self def disable_all_optimizations(self): """Disables all optimizations""" self.__add_config_option__(TaskOptions.CONFIG_KEY_DISABLE_ALL, 'disable') return self def run(self, debug=False): """ Runs Pig Job :rtype: Result """ Pig.LOG.info("Running Pig Job") command_args = self._configure_pig_options_(debug) + self._configure_command_() return self._command_executor('pig', *command_args) def debug(self): """Runs Pig script in debug mode.""" return self.run(debug=True) def is_optimization_disabled(self, optimization_config_key, disable_marker='disable'): """ Checks is specified optimization is disabled. By default optimization, and all optimization rules, are turned on. :param optimization_config_key: :param disable_marker: :return: """ return self._has_config_option_(optimization_config_key) \ and disable_marker == self._get_config_option_(optimization_config_key) def using_mode(self, type="mapreduce"): """ Sets execution mode, default is mapreduce. """ self.__add_config_option__(TaskOptions.CONFIG_KEY_EXECTYPE, type) return self
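A hedged usage sketch of the Pig wrapper (the merlin.tools.pig module path, the script path and the parameter names are illustrative assumptions): load a script, bind parameters, switch to local mode and run it with warnings logged to a file.

from merlin.tools.pig import Pig  # module path is an assumption

pig_job = Pig.load_commands_from_file("/opt/scripts/aggregate_events.pig") \
    .with_parameter("input_dir", "/data/events") \
    .with_parameter("output_dir", "/data/aggregated") \
    .using_mode("local") \
    .log_config(logfile="/var/log/pig/aggregate_events.log", warning=True)

result = pig_job.run()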
class Hive(object): """ Wrapper for Hive command line utility """ LOG = get_logger('Hive') @staticmethod def load_queries_from_file(path, executor=execute_shell_command): """ Creates wrapper for hive command line utility with execute query from file :param path: to file with query for Hive Job :type path: str :rtype: Hive """ Hive.LOG.info("Loading Hive queries from file : {0}".format(path)) hive = Hive(executor=executor) hive.execute_script(path) return hive @staticmethod def load_queries_from_string(query, executor=execute_shell_command): """ Creates wrapper for hive command line utility with execute query from string :param query: HiveQL's query for executing :type query: str :rtype: Hive """ Hive.LOG.info("Loading Hive queries from string : {0}".format(query)) hive = Hive(executor=executor) hive.execute_commands(query) return hive @staticmethod def load_preconfigured_job(name=None, config=None, executor=execute_shell_command): """ Creates wrapper for hive command line utility. Configure it with options :param config: hive job configurations :param name: hive job identifier. Will be used as a name of the section with job-specific configurations. :param executor: :return: """ Hive.LOG.info("Loading Hive queries from configuration") return Hive(name=name, config=config, executor=executor) def __init__(self, name=None, config=None, executor=execute_shell_command): """ Creates wrapper for Hive command line utility :param executor: custom executor :type executor: """ super(Hive, self).__init__() self.name = name if name else "HIVE_TASK_{0}".format(uuid.uuid4()) self.__executor = executor self._config = config if config else Configuration.create( readonly=False, accepts_nulls=True) def _wrap_with_quotes_(self, value): if not value or value[0] in ['"', "'"]: return value return "\"{0}\"".format(value) def execute_script(self, path): """ Specifies file containing script to execute. :param path: Path to the script to execute Will be passed to command executor as a value of -f option :rtype: Hive """ if path: self.__set_config(TaskOptions.CONFIG_KEY_QUERY_FILE, None, path) return self def execute_commands(self, commands): """ Specifies commands to execute :param commands: Commands to execute (within quotes) :rtype: Hive """ if commands: self.__set_config(TaskOptions.CONFIG_KEY_COMMANDS_STRING, None, commands) return self def _configure_command_(self): if self.has_option(TaskOptions.CONFIG_KEY_QUERY_FILE): return [ '-f', self._wrap_with_quotes_( self._config.get(section=self.name, key=TaskOptions.CONFIG_KEY_QUERY_FILE)) ] elif self.has_option(TaskOptions.CONFIG_KEY_COMMANDS_STRING): return [ '-e', self._wrap_with_quotes_( self._config.get( section=self.name, key=TaskOptions.CONFIG_KEY_COMMANDS_STRING)) ] else: raise HiveCommandError( "Failed to configure command : one of {0} or {0} is required". 
format(TaskOptions.CONFIG_KEY_QUERY_FILE, TaskOptions.CONFIG_KEY_COMMANDS_STRING)) def with_hive_conf(self, name, value): """ Adds hive's configuration to Hive Job :param name: name of the given configuration :param value: value of the given configuration :type name: str :type value: str :rtype: """ key = TaskOptions.CONF_KEY_HIVE_CONFIG self.__set_config(key, name, value) return self def __set_config(self, key, name, value): """ Configuration method for add parameter :type key: :type name: :type value: """ if name: if self.has_option(key): value = "{0}\n{1}={2}".format(self._config.get(self.name, key), name, value) else: value = "{0}={1}".format(name, value) self._config.set(self.name, key, value) def add_hivevar(self, name, value): """ Sets hive's variable to job's context :param name: name of the given variable :param value: value of the given variable :type name: str :type value: str :rtype: Hive """ key = TaskOptions.CONF_KEY_HIVE_VAR self.__set_config(key, name, value) return self def define_variable(self, name, value): """ Sets hive's variable to job's context :param name: name of the given variable :param value: value of the given variable :type name: str :type value: str :rtype: Hive """ key = TaskOptions.CONF_KEY_DEFINE self.__set_config(key, name, value) return self def use_database(self, database): """ Sets database to job's context :param database: name of the custom database :type database: str :rtype: Hive """ key = TaskOptions.CONF_KEY_DATABASE self.__set_config(key, None, database) return self def with_auxillary_jars(self, jars): """ Sets the path to jar that contain implementations of user defined functions and serdes :param jars: paths to jar :type jars: list, str :rtype: Hive """ if isinstance(jars, list): jars = ",".join(jars) key = TaskOptions.CONF_KEY_AUXPATH self.__set_config(key, None, jars) return self def run(self): """ Runs Hive Job :rtype: """ Hive.LOG.info("Executing Hive Job") result = self.__executor("hive", self.build()) result.if_failed_raise(HiveCommandError("Hive Job failed")) return result def build(self): """ Builds query params for hive's query :return: list of query params :rtype: list """ params = [] if self.has_option(TaskOptions.CONF_KEY_AUXPATH): params.append("--auxpath {0}".format( self._config.get(self.name, TaskOptions.CONF_KEY_AUXPATH))) params.extend(self._configure_command_()) if self.has_option(TaskOptions.CONF_KEY_DEFINE): list_ = self._config.get_list(self.name, TaskOptions.CONF_KEY_DEFINE) for value in list_: params.append("--define") params.append(value) if self.has_option(TaskOptions.CONF_KEY_HIVE_CONFIG): list_ = self._config.get_list(self.name, TaskOptions.CONF_KEY_HIVE_CONFIG) for value in list_: params.append("--hiveconf") params.append(value) if self.has_option(TaskOptions.CONF_KEY_HIVE_VAR): list_ = self._config.get_list(self.name, TaskOptions.CONF_KEY_HIVE_VAR) for value in list_: params.append("--hivevar") params.append(value) if self.has_option(TaskOptions.CONF_KEY_DATABASE): params.append("--database {0}".format( self._config.get(self.name, TaskOptions.CONF_KEY_DATABASE))) return " ".join(params) def has_option(self, key): """ Checks if attribute at the given key exists in job specific section of the Configuration. :param key: attribute name :return: True in case attribute was found otherwise False """ return self._config.has(section=self.name, key=key)
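Finally, a hedged sketch of the Hive wrapper above (the database, table and configuration values are illustrative; merlin.tools.hive is the import already used elsewhere in this project): run an inline query with a session-level setting and a substituted hivevar.

from merlin.tools.hive import Hive

hive_job = Hive.load_queries_from_string(
        "SELECT country, COUNT(*) FROM ${hivevar:table} GROUP BY country") \
    .use_database("analytics") \
    .with_hive_conf("hive.exec.dynamic.partition", "true") \
    .add_hivevar("table", "visits")

result = hive_job.run()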