Example #1
class Flume(object):
    """
    Wrapper for Flume command line utility
    """

    LOG = get_logger("Flume")

    @staticmethod
    def agent(agent=None, conf_file=None, executor=execute_shell_command):
        """
        Creates wrapper for 'flume-ng agent' command line commands
        :param agent: name of the agent
        :type agent: str
        :param conf_file: path to config file
        :type conf_file: str
        :rtype: FlumeAgent
        """
        return FlumeAgent(agent=agent, conf_file=conf_file, executor=executor)
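
A minimal usage sketch for the wrapper above; the agent name and config path are placeholder values.

# Hypothetical usage; FlumeAgent is defined elsewhere in the module.
flume_agent = Flume.agent(agent="a1", conf_file="/etc/flume-ng/conf/flume.conf")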
Example #2
class WebHCatalog():
    """
    WebHCatalog Client
    """
    URL = "http://{host}/templeton/v1/ddl/database/{database}/table/{table}/property/{property}?user.name={username}"
    DATA = "{{ \"value\": \"{value}\" }}"
    HEADERS = {'Content-type': 'application/json'}

    LOG = get_logger("WebHCatalog")

    def __init__(self, username, host="localhost", port=None):
        self.host = "{0}:{1}".format(host, port) if port else host
        self.username = username

    def table_properties(self, table, database="default"):
        """
        Returns TableProperties object
        """
        return TableProperties(database=database, table=table, webhcat=self)
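
A minimal usage sketch for the client above; the username, host, port, and table names are placeholder values, and TableProperties is defined elsewhere in the module.

# Hypothetical usage for illustration only.
webhcat = WebHCatalog(username="hdfs", host="localhost", port=50111)
properties = webhcat.table_properties(table="orders", database="default")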
Example #3
 def __init__(self, name='LoggingListener'):
     self.log = get_logger(name)
Example #4
class SparkApplication(object):
    """
    Wrapper for spark-submit command line utility.
    Provides simple DSL to configure and launch Spark Application
    """
    LOG = get_logger("Spark")
    SHELL_COMMAND = "spark-submit"

    def __init__(self, config=None, name=None, executor=execute_shell_command):
        """

        :param config: configurations
        :param name: name of the config section containing specific application configurations
        :param executor: the interface used by the client to launch the Spark application.
        """
        super(SparkApplication, self).__init__()
        self.executor = executor
        self._configs = config if config else Configuration.create()
        self.name = name if name \
            else "SPARK_JOB_{0}".format(uuid.uuid4())

    @staticmethod
    def load_preconfigured_job(name=None,
                               config=None,
                               executor=execute_shell_command):
        """
        Creates a wrapper for the spark-submit command line utility, configured with the given options.
        :param config: spark job configurations
        :param name: spark job identifier.
             Will be used as the name of the section with job-specific configurations.
        :param executor: the interface used by the client to launch the Spark application
        :return: configured SparkApplication instance
        """
        SparkApplication.LOG.info("Loading Spark Job from configuration")

        return SparkApplication(name=name, config=config, executor=executor)

    def run(self, *args):
        """
        Submits spark application.
        :param args: Arguments passed to the main method of your main class, if any
        """

        return self._fire_job(verbose=False, args=args)

    def _fire_job(self, verbose=False, args=None):
        _options = []
        _options.extend(self._configure_spark_options())
        if verbose:
            _options.append("--verbose")
        _options.append(
            self._configs.require(
                self.name, TaskOptions.SPARK_APP_CONFIG_APPLICATION_JAR))
        if args:
            _options.extend(str(arg) for arg in args)

        return SparkJobStatus(self.executor(self.SHELL_COMMAND, *_options))

    def debug(self, *args):
        """
        Submits spark application in a verbose mode.
        :param args: Arguments passed to the main method of your main class, if any
        """
        return self._fire_job(verbose=True, args=args)

    def master(self, master):
        """
        Sets the cluster manager
        :param master: master URL of the cluster
        :return:
        """
        self._configs.set(section=self.name,
                          key=TaskOptions.SPARK_APP_CONFIG_MASTER,
                          value=master)
        return self

    def application(self, application_jar, main_class=None, app_name=None):
        """
        Configures Spark application
        :param application_jar: Path to a bundled jar including your application and all dependencies
        :param main_class: Java or Scala classname. Application entry point
        :param app_name: application name; will appear in the UI and in log data
        :return:
        """
        self.application_jar(application_jar)
        self.main_class(main_class)
        self.application_name(app_name)
        return self

    def config_file(self, path):
        """
        Configures Spark app to load default properties from file.
        :param path: Path to a file from which to load extra properties.
        If not specified, this will look for conf/spark-defaults.conf.
        :return:
        """
        self._configs.set(section=self.name,
                          key=TaskOptions.SPARK_APP_CONFIG_PROPERTIES_FILE,
                          value=path)
        return self

    def with_config_option(self, key, value):
        """
        Supplies configuration values at runtime.
        According to specification,
        configuration values explicitly set on a SparkConf take the highest precedence,
        then flags passed to spark-submit, then values in the defaults file.
        :param key: option name.
            see https://spark.apache.org/docs/latest/configuration.html for supported properties
        :param value:  option value
        :return:
        """
        _key_value_pair = "{key}={value}".format(key=key, value=value)

        self._configs.update_list(self.name,
                                  TaskOptions.SPARK_APP_CONFIG_OPTIONS,
                                  _key_value_pair)

        return self

    def main_class(self, main_class):
        """
        Sets spark application's main class (for Java / Scala apps).
        :param main_class:
        :return:
        """
        if main_class:
            self._configs.set(section=self.name,
                              key=TaskOptions.SPARK_APP_CONFIG_MAIN_CLASS,
                              value=main_class)
        return self

    def application_jar(self, app_jar):
        """
        Sets path to a bundled jar including application with all dependencies.
        :param app_jar: path to application jar.
        :return:
        """
        if app_jar:
            self._configs.set(section=self.name,
                              key=TaskOptions.SPARK_APP_CONFIG_APPLICATION_JAR,
                              value=app_jar)
        return self

    def application_name(self, name):
        """
        Sets application's name. This will appear in the UI and in log data.
        :param name:  A name of Spark application.
        :return:
        """
        if name:
            self._configs.set(section=self.name,
                              key=TaskOptions.SPARK_APP_CONFIG_APP_NAME,
                              value=name)
        return self

    def classpath(self, *jar_files):
        """
        Specifies the list of local jars to include on the driver and executor classpaths.
        :param jar_files: jar files to be included to application classpath
        :return:
        """
        self._configs.update_list(self.name, TaskOptions.SPARK_APP_CONFIG_JARS,
                                  *jar_files)
        return self

    def pythonpath(self, *pyfiles):
        """
        Specifies the list of .zip, .egg, or .py files to place on the PYTHONPATH for Python apps.
        :param pyfiles:
        :return:
        """
        self._configs.update_list(self.name,
                                  TaskOptions.SPARK_APP_CONFIG_PYFILES,
                                  *pyfiles)
        return self

    def add_files(self, *files):
        """
        Adds files to be placed in the working directory of each executor
        :param files:
        :return:
        """
        self._configs.update_list(self.name,
                                  TaskOptions.SPARK_APP_CONFIG_FILES, *files)
        return self

    def _configure_spark_options(self):
        """
           Adds the following arguments to the command:
           --master MASTER_URL         spark://host:port, mesos://host:port, yarn, or local.
           --deploy-mode DEPLOY_MODE   Where to run the driver program: either "client" to run
                                      on the local machine, or "cluster" to run inside cluster.
           --class CLASS_NAME          Your application's main class (for Java / Scala apps).
           --name NAME                 A name of your application.
           --jars JARS                 Comma-separated list of local jars to include on the driver
                                      and executor classpaths.
           --py-files PY_FILES         Comma-separated list of .zip, .egg, or .py files to place
                                      on the PYTHONPATH for Python apps.
           --files FILES               Comma-separated list of files to be placed in the working
                                      directory of each executor.
           --properties-file FILE      Path to a file from which to load extra properties. If not
                                      specified, this will look for conf/spark-defaults.conf.
          --conf PROP=VALUE           Arbitrary Spark configuration property.


        :return:
        """
        _options = []
        _section = self.name
        if self._configs.has(_section, TaskOptions.SPARK_APP_CONFIG_MASTER):
            _options.extend([
                "--master",
                self._configs.get(_section,
                                  TaskOptions.SPARK_APP_CONFIG_MASTER)
            ])

        if self._configs.has(_section,
                             TaskOptions.SPARK_APP_CONFIG_MAIN_CLASS):
            _options.extend([
                "--class",
                self._configs.get(_section,
                                  TaskOptions.SPARK_APP_CONFIG_MAIN_CLASS)
            ])

        if self._configs.has(_section, TaskOptions.SPARK_APP_CONFIG_APP_NAME):
            _options.extend([
                "--name",
                self._configs.get(_section,
                                  TaskOptions.SPARK_APP_CONFIG_APP_NAME)
            ])

        if self._configs.has(_section, TaskOptions.SPARK_APP_CONFIG_JARS):
            _options.extend([
                "--jars", ",".join(
                    self._configs.get_list(_section,
                                           TaskOptions.SPARK_APP_CONFIG_JARS))
            ])

        if self._configs.has(_section, TaskOptions.SPARK_APP_CONFIG_PYFILES):
            _options.extend([
                "--py-files", ",".join(
                    self._configs.get_list(
                        _section, TaskOptions.SPARK_APP_CONFIG_PYFILES))
            ])

        if self._configs.has(_section, TaskOptions.SPARK_APP_CONFIG_FILES):
            _options.extend([
                "--files", ",".join(
                    self._configs.get_list(_section,
                                           TaskOptions.SPARK_APP_CONFIG_FILES))
            ])

        if self._configs.has(_section,
                             TaskOptions.SPARK_APP_CONFIG_PROPERTIES_FILE):
            _options.extend([
                "--properties-file",
                self._configs.get(_section,
                                  TaskOptions.SPARK_APP_CONFIG_PROPERTIES_FILE)
            ])

        if self._configs.has(_section, TaskOptions.SPARK_APP_CONFIG_OPTIONS):
            _options.extend([
                "--conf", "\"{0}\"".format(" ".join(
                    self._configs.get_list(
                        _section, TaskOptions.SPARK_APP_CONFIG_OPTIONS)))
            ])

        return _options
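
A minimal usage sketch of the DSL above; the master URL, jar path, class name, and arguments are placeholder values.

# Hypothetical job configuration; only methods defined above are used.
app = SparkApplication() \
    .master("yarn") \
    .application("/tmp/my-app.jar",
                 main_class="com.example.Main",
                 app_name="demo-app") \
    .classpath("/tmp/dependency.jar") \
    .with_config_option("spark.executor.memory", "2g")
status = app.run("input_path", "output_path")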
Example #5
        self.log = get_logger(name)
        self.metrics = {}

    def on_begin(self, action_name):
        if 'error' not in action_name:
            if os.path.isfile('resources/step'):
                os.remove('resources/step')

    def on_error(self, action_name, exception):
        with open('resources/step', 'w') as step_file:
            step_file.write(action_name)


if __name__ == '__main__':
    log = get_logger("SCD")

    # Prepare paths
    _pig_script = os.path.join(os.path.dirname(__file__), 'scd_processing.pig')
    _scd_active_snapshot = '/tmp/scd.active/scd.active.csv'
    _scd_updates = os.path.join(os.path.dirname(__file__), 'resources',
                                'scd.update.csv')
    _hdfs_job_output = '/tmp/scd.updated'

    _local_folder_to_monitor = LocalFS(
        os.path.join(os.path.dirname(__file__), 'resources'))
    _hdfs_basedir = HDFS('/tmp/scd.active')
    _hdfs_tmpdir = HDFS('/tmp/scd.tmp')
    _hdfs_tmpdir.create_directory()

    if _scd_updates and LocalFS(_scd_updates).exists():
Example #6
File: flow.py Project: Mbaroudi/Merlin
 def __init__(self, name='WorkflowFailOverController'):
     self.log = get_logger(name)
     self.metrics = {}
Example #7
File: flow.py Project: epam/Merlin
 def __init__(self, name):
     super(Workflow, self).__init__()
     self.name = name
     self.__action_registry__ = {}
     self.log = get_logger(self.name)
Example #8
File: flow.py Project: epam/Merlin
        self.log = get_logger(name)
        self.metrics = {}

    def on_begin(self, action_name):
        if 'error' not in action_name:
            if os.path.isfile('resources/step'):
                os.remove('resources/step')

    def on_error(self, action_name, exception):
        with open('resources/step', 'w') as step_file:
            step_file.write(action_name)


if __name__ == '__main__':
    log = get_logger("SCD")

    # Prepare paths
    _pig_script = os.path.join(os.path.dirname(__file__), 'scd_processing.pig')
    _scd_active_snapshot = '/tmp/scd.active/scd.active.csv'
    _scd_updates = os.path.join(os.path.dirname(__file__), 'resources', 'scd.update.csv')
    _hdfs_job_output = '/tmp/scd.updated'

    _local_folder_to_monitor = LocalFS(os.path.join(os.path.dirname(__file__), 'resources'))
    _hdfs_basedir = HDFS('/tmp/scd.active')
    _hdfs_tmpdir = HDFS('/tmp/scd.tmp')
    _hdfs_tmpdir.create_directory()

    if _scd_updates and LocalFS(_scd_updates).exists():

        # Checks if a file with the last failed step exists
Example #9
class MapReduce(object):
    LOG = get_logger("MapReduce")

    def __init__(self,
                 name,
                 config,
                 executable,
                 executor,
                 main_class=None,
                 shell_command="hadoop jar"):
        self.executor = executor
        self.executable = executable
        self._config = config if config else Configuration.create(
            readonly=False, accepts_nulls=True)
        self.name = name if name else "MR_TASK_{0}".format(uuid.uuid4())
        self.main_class = main_class
        self._shell_command = shell_command
        self._process = None

    @staticmethod
    def prepare_streaming_job(config=None,
                              name=None,
                              jar="hadoop-streaming.jar",
                              executor=execute_shell_command):
        """
        Creates an instance of StreamingJob
        :param config: job configurations
        :param name: name of the job
        :param jar: jar to execute
        :param executor: interface used by the client to run the command.
        :return: StreamingJob template
        :rtype : StreamingJob
        """
        MapReduce.LOG.info("MapReduce streaming job")
        config = config if config else Configuration.create(readonly=False,
                                                            accepts_nulls=True)
        MapReduce.__validate_configs(config, name, "StreamingJob",
                                     TaskOptions.KEYS_FOR_MAPREDUCE)
        return StreamingJob(
            config=config,
            name=name if name else "MR_STREAMING_JOB_{0}".format(uuid.uuid4()),
            jar=jar,
            executor=executor)

    @staticmethod
    def prepare_mapreduce_job(jar,
                              main_class=None,
                              config=None,
                              name=None,
                              executor=execute_shell_command):
        """
        Creates an instance of MapReduceJob
        :param jar: jar to execute
        :param main_class: name of the job's main class
        :param config: job configurations
        :param name: name of the job
        :param executor: interface used by the client to run the command.
        :return: MapReduceJob template
        :rtype : MapReduceJob
        """
        MapReduce.LOG.info("MapReduce job")
        config = config if config else Configuration.create(readonly=False,
                                                            accepts_nulls=True)
        MapReduce.__validate_configs(config, name, "MapReduceJob",
                                     TaskOptions.KEYS_FOR_STREAMING_JOB)
        return MapReduceJob(
            name=name if name else "MR_JOB_{0}".format(uuid.uuid4()),
            config=config,
            jar=jar,
            main_class=main_class,
            executor=executor)

    @staticmethod
    def __validate_configs(config, name, type_of_job, keys):
        """
        Logs a warning for keys set in the .INI file that are not used by this type of job
        """
        for key in keys:
            if config.has(name, key):
                MapReduce.LOG.warning("{0} does not use this key: {1}.".format(
                    type_of_job, key))

    def run(self, *args):
        """
        Runs specific MapReduce Job
        :param args: job-specific CLI arguments for MapReduceJob
        :rtype:
        """
        if args:
            if isinstance(self, StreamingJob):
                MapReduce.LOG.warning("StreamingJob does not use args.")
            else:
                self._update_list_config_(TaskOptions.COMMAND_ARGS, *args)
        command, arguments = self.__configure_command__()
        self._process = self.executor(command, *arguments)
        return self._process

    def status(self):
        """
        Returns status of finished job
        :return:
        """
        return None if self._process.is_running() \
            else JobStatus(JobStatus.job_id(self._process.stderr))

    def with_number_of_reducers(self, reducer_num):
        """
        Streaming MR job has its own command parameter to set the number of reducers.
        Overrides base method to ignore 'mapreduce.job.reduces' configuration option
        :param reducer_num:
        :return:
        """
        return self.with_config_option(
            TaskOptions.CONFIG_KEY_MR_JOB_REDUCER_NUM, reducer_num)

    def disable_reducers(self):
        return self.with_number_of_reducers(0)

    def __configure_command__(self):
        """Overrides this method to configure MR job"""
        if not os.path.isfile(self.executable):
            raise MapReduceConfigurationError("{0} doesn't exist".format(
                self.executable))
        arguments = [self.executable]
        if self.main_class is not None:
            arguments.append(self.main_class)
        arguments.extend(self._generic_options_())
        arguments.extend(self._command_options_())
        return self._shell_command, arguments

    def load_configuration_from(self, _file):
        """
        Specifies an application configuration file.
        :param _file:
        """
        self._config[TaskOptions.CONFIG_KEY_MR_JOB_CONF_FILE] = _file
        return self

    def use_jobtracker(self, jobtracker):
        """
        Specifies an application jobtracker.
        :param jobtracker:
        """
        self._config[TaskOptions.CONFIG_KEY_MR_JOBTRACKER] = jobtracker
        return self

    def _generic_options_(self):
        """
        Adds generic option to hadoop command.
        -conf <configuration file>
        -D <property>=<value>
        -jt <local> or <jobtracker:port> Specify a job tracker.
        -files <comma separated list of files> Specify comma separated files to be copied
        to the map reduce cluster.
        -libjars <comma separated list of jars> Specify comma separated jar files
        to include in the classpath.
        -archives <comma separated list of archives> Specify comma separated archives
        to be unarchived on the compute machines.

        Applications should implement Tool to support GenericOptions.

        :return:
        """
        options = []
        # Specify an application configuration file
        if self.has_option(TaskOptions.CONFIG_KEY_MR_JOB_CONF_FILE):
            options.extend(
                ['--conf',
                 self.get(TaskOptions.CONFIG_KEY_MR_JOB_CONF_FILE)])

        # Add or override MR job options
        options.extend(
            ['-D', '='.join([TaskOptions.CONFIG_KEY_MR_JOB_NAME, self.name])])
        if self.has_option(TaskOptions.CONFIG_KEY_MR_JOB_CONF_OPTION):
            options.extend("-D {0}".format(att) for att in self.get_list(
                TaskOptions.CONFIG_KEY_MR_JOB_CONF_OPTION))

        # Specify an application jobtracker
        if self.has_option(TaskOptions.CONFIG_KEY_MR_JOBTRACKER):
            options.extend(
                ['--jt',
                 self.get(TaskOptions.CONFIG_KEY_MR_JOBTRACKER)])

        # comma separated files to be copied to the map reduce cluster
        if self.has_option(TaskOptions.CONFIG_KEY_MR_JOB_CACHE_FILE):
            options.extend([
                '-files', ",".join(
                    self.get_list(TaskOptions.CONFIG_KEY_MR_JOB_CACHE_FILE))
            ])

        # comma separated jar files to include in the classpath
        if self.has_option(TaskOptions.CONFIG_KEY_MR_JOB_LIBJARS):
            options.extend([
                '-libjars',
                ",".join(self.get_list(TaskOptions.CONFIG_KEY_MR_JOB_LIBJARS))
            ])

        # comma separated archives to be unarchived on the compute machines
        if self.has_option(TaskOptions.CONFIG_KEY_MR_JOB_CACHE_ARCHIVE):
            options.extend([
                '-archives', ",".join(
                    self.get_list(TaskOptions.CONFIG_KEY_MR_JOB_CACHE_ARCHIVE))
            ])

        return options

    def _command_options_(self):
        options = []
        return options

    def _update_list_config_(self, _key, *values):
        _inputs = self._config.get(self.name, _key) if self._config.has(self.name, _key) \
            else []
        _inputs.extend(values)
        return self._update_config_option_(_key, _inputs)

    def _update_config_option_(self, key, value):
        self._config.set(self.name, key, value)
        return self

    def with_config_option(self, key, value):
        """
        Adds or updates a job configuration variable.
        For a Java-based MR job, the option will be passed into the configuration object.
        :param key: variable name
        :param value: variable value
        :return:
        """
        return self._update_list_config_(
            TaskOptions.CONFIG_KEY_MR_JOB_CONF_OPTION,
            "{0}={1}".format(key, value))

    def use_jars(self, *libs):
        """
        Adds jar files to be included in the classpath
        :param libs: jars that should be placed in the distributed cache
        and made available to all of the job's task attempts.
        :return:
        """
        return self._update_list_config_(TaskOptions.CONFIG_KEY_MR_JOB_LIBJARS,
                                         *libs)

    def cache_files(self, *files):
        """
        Adds files which will be copied to the Map/Reduce cluster
        :param files:  The list of files that need to be added to distributed cache
        :return:
        """
        return self._update_list_config_(
            TaskOptions.CONFIG_KEY_MR_JOB_CACHE_FILE, *files)

    def cache_archives(self, *archives):
        """
        Adds archives which will be copied to the Map/Reduce cluster
        :param archives: the list of archives that need to be added to the distributed cache
        :return:
        """
        return self._update_list_config_(
            TaskOptions.CONFIG_KEY_MR_JOB_CACHE_ARCHIVE, *archives)

    def has_option(self, key):
        """
        Checks if job configuration contains specified option
        :param key: option name
        :return: True if the job configuration contains the option
        """
        return self._config.has(self.name, key)

    def get(self, key, required=False):
        """
        Gets the value of the specified configuration option.
        :param key: option name
        :param required: Boolean flag, True if option is required
        :return: option value or None in case option was not found and option is not required.
        ConfigurationError will be thrown in case required option was not found within current
        configuration
        """
        return self._config.require(self.name, key) if required \
            else self._config.get(self.name, key)

    def get_list(self, key, required=False):
        """
        Gets the value of the specified configuration option property as a list

        :param key: option name
        :param required: True if option is required
        :return: property value as a list of strings or None in case option was not found
         and option is not required.
        ConfigurationError will be thrown in case required option was not found within current
        configuration
        """
        return self._config.require_list(section=self.name,
                                         key=key) if required \
            else self._config.get_list(section=self.name,
                                       key=key)
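
A minimal usage sketch for the factory above; the jar path, class name, and paths are placeholder values, and the returned MapReduceJob is assumed to inherit the fluent methods shown on MapReduce.

# Hypothetical usage for illustration only.
job = MapReduce.prepare_mapreduce_job(jar="/tmp/wordcount.jar",
                                      main_class="com.example.WordCount",
                                      name="wordcount") \
    .with_config_option("mapreduce.job.reduces", 2) \
    .use_jars("/tmp/extra-lib.jar")
result = job.run("/tmp/input", "/tmp/output")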
Example #10
Standard scenario of ETL process.

The flow imports data from mysql to HDFS, processes it, and uploads the processed data back to mysql.
"""

import os
from merlin.common.exceptions import MapReduceJobException
from merlin.common.logger import get_logger
from merlin.flow.flow import FlowRegistry, Workflow
from merlin.flow.listeners import WorkflowListener, LoggingListener
from merlin.tools.mapreduce import MapReduce
from merlin.tools.sqoop import Sqoop


BASE_DIR = "/tmp"
LOG = get_logger("SimpleETLFlow")


# Imports data from mysql's table 'test_example.first_table_name'(id,name,count)
# to HDFS's folder '/tmp/data_from_import' in "'id','name','count'" format.
@Workflow.action(flow_name='Flow',
                 action_name='Sqoop import etl step',
                 on_success='MapReduce job etl step',
                 on_error='error')
def load_data_from_rdbms_to_hdfs(context):
    # configure Sqoop import job
    _sqoop_import_job_ = Sqoop.import_data().from_rdbms(
        host="127.0.0.1",
        rdbms="mysql",
        database="test_example",
        username="******",
Example #11
File: flow.py Project: epam/Merlin
Compares them and gets only the new files on FTP that don't exist on HDFS.
Downloads new files to HDFS with partitioning.
"""
from ConfigParser import RawConfigParser
import os
from merlin.common.logger import get_logger
from merlin.flow.flow import Workflow, FlowRegistry
from merlin.flow.listeners import LoggingListener, WorkflowListener
from merlin.fs.ftp import ftp_client
from merlin.fs.hdfs import HDFS
from merlin.fs.localfs import LocalFS
from merlin.fs.utils import FileUtils
from merlin.tools.hive import Hive

BASE_DIR = "/tmp/base_folder"
log = get_logger("MonitoringFTP")

config = RawConfigParser()
config.read(os.path.join(os.path.dirname(__file__), "resources/ftp_config.ini"))
HOST_DOWNLOAD = config.get("ftp", "host.download")
USER_NAME = config.get("ftp", "user.name")
PASSWORD = config.get("ftp", "password")
PATH = config.get("ftp", "path")


def get_name(path):
    if not hasattr(path, 'name'):
        raise TypeError('FileDescriptor is required. '
                        'Cannot extract file name from {0}'.format(path.__class__))

    return path.name.split("/")[-1]
Example #12
class Kafka():
    """
    Wrapper for Kafka command line scripts
    """
    RUN_SHELL = "kafka-run-class.sh"

    LOG = get_logger("Kafka")

    @staticmethod
    def run_consumer(name,
                     args,
                     executor=execute_shell_command,
                     kafka_run_class=RUN_SHELL):
        """
        Runs specific consumer. Executing command:
        kafka-run-class.sh {name} {configs}
        :param name:
        :param args:
        :param executor:
        :return:
        """
        command = "{0} {1}".format(kafka_run_class, name)
        if args:
            command += Kafka.__get_configs(args)
        return Kafka.__run(executor, command)

    @staticmethod
    def __get_configs(configs):
        command = ""
        if isinstance(configs, list):
            for value in configs:
                command += " {0}".format(value)
        elif isinstance(configs, str):
            command += " {0}".format(configs)
        elif isinstance(configs, dict):
            for key, value in configs.iteritems():
                command += " {0} {1}".format(key, value)
        return command

    @staticmethod
    def __get_configs_topics(configs):
        command = ""
        if isinstance(configs, list):
            for value in configs:
                command += " --config {0}".format(value)
        elif isinstance(configs, str):
            for value in configs.split(","):
                command += " --config {0}".format(value)
        elif isinstance(configs, dict):
            for key, value in configs.iteritems():
                command += " --config {0}={1}".format(key, value)
        return command

    @staticmethod
    def run_producer(name,
                     args,
                     executor=execute_shell_command,
                     kafka_run_class=RUN_SHELL):
        """
        Runs specific producer. Executing command:
        kafka-run-class.sh {name} {configs}
        :param name:
        :param args:
        :param executor:
        :return:
        """
        command = "{0} {1}".format(kafka_run_class, name)
        if args:
            command += Kafka.__get_configs(args)
        return Kafka.__run(executor, command)

    @staticmethod
    def start_broker(path_to_config,
                     executor=execute_shell_command,
                     kafka_run_class="kafka-server-start.sh"):
        """
        Starts broker using configuration file. Executing command:
        kafka-server-start.sh {path_to_config}
        :param path_to_config:
        :param executor:
        :return:
        """
        command = "{0} {1}".format(kafka_run_class, path_to_config)
        return Kafka.__run(executor, command)

    @staticmethod
    def stop_broker(path_to_config,
                    executor=execute_shell_command,
                    kafka_run_class="kafka-server-stop.sh"):
        """
        Stops broker using configuration file. Executing command:
        kafka-server-stop.sh {path_to_config}
        :param path_to_config:
        :param executor:
        :return:
        """
        command = "{0} {1}".format(kafka_run_class, path_to_config)
        return Kafka.__run(executor, command)

    @staticmethod
    def create_topic(name,
                     replication_factor=None,
                     replica_assignment=None,
                     partitions=1,
                     zookeeper_host=None,
                     args=None,
                     executor=execute_shell_command,
                     kafka_run_class=RUN_SHELL):
        """
        Creates topic
        :param name:
        :param replication_factor:
        :param replica_assignment:
        :param partitions:
        :param zookeeper_host:
        :param args:
        :param executor:
        :return:
        """
        command = "{0} kafka.admin.TopicCommand --create --zookeeper {2} --topic {1} --partitions {3}" \
            .format(kafka_run_class, name, zookeeper_host, partitions)
        if replication_factor:
            command += " --replication-factor {0}".format(replication_factor)
        if replica_assignment:
            command += " --replication-assignment {0}".format(
                replica_assignment)
        if args:
            command += Kafka.__get_configs_topics(args)
        Kafka.__run(executor, command)
        return Topic(name, zookeeper_host, executor)

    @staticmethod
    def get_list_topics(zookeeper_host=None,
                        executor=execute_shell_command,
                        kafka_run_class=RUN_SHELL):
        """
        Returns existing list of topics on zookeeper
        :param zookeeper_host:
        :param executor:
        :return:
        """
        command = "{0} kafka.admin.TopicCommand --zookeeper {1} --list" \
            .format(kafka_run_class, zookeeper_host)
        topics = []
        for t in Kafka.__run(executor, command).stdout.split('\n'):
            topics.append(Topic(t, zookeeper_host))
        return topics

    @staticmethod
    def __run(executor, command):
        Kafka.LOG.info("Executing Kafka command: {0}".format(command))
        return executor(command)
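
A minimal usage sketch for the wrapper above; the topic name, zookeeper address, and topic config are placeholder values.

# Hypothetical usage for illustration only.
topic = Kafka.create_topic(name="events",
                           replication_factor=1,
                           partitions=3,
                           zookeeper_host="localhost:2181",
                           args={"retention.ms": "86400000"})
existing_topics = Kafka.get_list_topics(zookeeper_host="localhost:2181")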
Example #13
class Topic():
    LOG = get_logger("Topic")

    def __init__(self, name, zookeeper_host, executor=execute_shell_command):
        """
        Wrapper for Kafka command #./bin/kafka-run-class.sh kafka.admin.TopicCommand
        :param name:
        :param zookeeper_host:
        :param executor:
        :return:
        """
        self.name = name
        self.zookeeper_host = zookeeper_host
        self._executor = executor

    def get_metadata(self, kafka_run_class=Kafka.RUN_SHELL):
        """
        Returns metadata of topic. Executing command:
        #./bin/kafka-run-class.sh kafka.admin.TopicCommand --topic {name} --describe --zookeeper {host:port}
        :return:
        """
        return self.__run("--topic {0} --describe".format(self.name),
                          kafka_run_class).stdout

    def add_config(self, key, value, kafka_run_class=Kafka.RUN_SHELL):
        """
        Adds config to topic. Executing command:
        #./bin/kafka-run-class.sh kafka.admin.TopicCommand --topic {name} --alter
        --zookeeper {host:port} config {key=value}
        :param key:
        :param value:
        :return:
        """

        return self.__run(
            "--topic {0} --alter config {1}={2}".format(self.name, key, value),
            kafka_run_class)

    def delete_config(self, key, value, kafka_run_class=Kafka.RUN_SHELL):
        """
        Deletes config from topic. Executing command:
        #./bin/kafka-run-class.sh kafka.admin.TopicCommand --topic {name} --alter
        --zookeeper {host:port} deleteConfig {key=value}
        :param key:
        :param value:
        :return:
        """

        return self.__run(
            "--topic {0} --alter deleteConfig {1}={2}".format(
                self.name, key, value), kafka_run_class)

    def delete(self, kafka_run_class=Kafka.RUN_SHELL):
        """
        Deletes topic. Executing command:
        #./bin/kafka-run-class.sh kafka.admin.TopicCommand --topic {name} --delete
        --zookeeper {host:port}
        :return:
        """

        return self.__run("--topic {0} --delete".format(self.name),
                          kafka_run_class)

    def is_exists(self, kafka_run_class=Kafka.RUN_SHELL):
        """
        Returns True if the topic exists, otherwise False. Executing command:
        #./bin/kafka-run-class.sh kafka.admin.TopicCommand --topic {name} --list --zookeeper {host:port}
        :return:
        """
        result = self.__run("--list", kafka_run_class)
        topics = result.stdout.split('\n')
        return str(self.name) in topics

    def __run(self, command, kafka_run_class):
        Topic.LOG.info("Executing Topic command")
        result = self._executor(
            "{0} kafka.admin.TopicCommand".format(kafka_run_class),
            "--zookeeper", self.zookeeper_host, command)
        return result
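
A minimal usage sketch for the Topic wrapper above; the topic name and zookeeper address are placeholder values.

# Hypothetical usage for illustration only.
topic = Topic(name="events", zookeeper_host="localhost:2181")
if topic.is_exists():
    print(topic.get_metadata())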
Example #14
 def __init__(self):
     super(FsSnapshot, self).__init__()
     self.logger = get_logger(self.__class__.__name__)
     self.files = {}
Example #15
 def __init__(self, name='ProfilingListener'):
     self.log = get_logger(name)
     self.metrics = {}
Example #16
class DistCp(object):
    """Hadoop's command distcp utilities."""
    LOG = get_logger("DistCP")

    def __init__(self, executor=execute_shell_command):
        """
        Creates a new DistCp instance
        :param executor: command executor
        :type executor:
        :rtype : DistCp
        """
        self.preserve = "-p"
        self.strategy = None
        self.mappers = None
        self.synchronize = False
        self.path_src = None
        self.path_dest = None
        self.__executor = executor

    def run(self):
        """
        Runs DistCp Job
        :rtype: Result
        """
        DistCp.LOG.info("Running DistCp Job")
        _process = self.__executor('hadoop distcp', self.build())
        _process.if_failed_raise(DistCpError("DistCp Job failed"))
        return _process

    def build(self):
        """
        Builds DistCp command
        :rtype: str
        """

        list_attributes = [self.preserve]
        if self.mappers:
            list_attributes.append(self.mappers)
        if self.strategy:
            list_attributes.append(self.strategy)
        if self.synchronize:
            list_attributes.append("-delete")
        if self.path_src:
            list_attributes.append(self.path_src)
        else:
            raise DistCpError("You must specify source that will be copied")
        if self.path_dest:
            list_attributes.append(self.path_dest)
        else:
            raise DistCpError(
                "You must specify the destination where the file will be saved")

        return " ".join(list_attributes)

    def take(self, path):
        """
        Specifies the directory or file on file system which will be copied.
        An exception will be raised if the directory or file doesn't exist on the file system
        :param path: path to source which should be copied
        :type path: str
        :rtype: DistCp
        """
        self.path_src = path

        return self

    def copy_to(self, path):
        """
        Specifies the directory or file on file system into which the data should be copied.
        :param path: path to destination into which the data should be copied
        :type path: str
        :rtype: DistCp
        """
        self.path_dest = path

        return self

    def use(self, mappers=None):
        """
        Specifies number of mappers
        :param mappers: number of map tasks
        :type mappers: str, int
        :rtype: DistCp
        """
        self.mappers = "-m {0}".format(str(mappers))

        return self

    def update_destination(self, synchronize=False):
        """
        Changes command strategy to update
        :param synchronize: synchronizes source with destination if param is True
        :type synchronize: bool
        :rtype: DistCp
        """
        self.strategy = "-update"
        if synchronize:
            self.synchronize = True

        return self

    def overwrite_destination(self, synchronize=False):
        """
        Changes command strategy to overwrite
        :param synchronize: synchronizes source with destination if param is True
        :type synchronize: bool
        :rtype: DistCp
        """
        self.strategy = "-overwrite"
        if synchronize:
            self.synchronize = True

        return self

    def preserve_replication_number(self):
        """
        Sets the replication number of the file in the destination equal
        to the replication number of the file in the source
        :rtype: DistCp
        """

        self.__set_preserve('r')

        return self

    def preserve_block_size(self):
        """
        Sets the block size of the file in the destination equal
        to the block size of the file in the source
        :rtype: DistCp
        """
        self.__set_preserve('b')

        return self

    def preserve_user(self):
        """
        Sets the user of the file in the destination equal
        to the user of the file in the source
        :rtype: DistCp
        """
        self.__set_preserve('u')

        return self

    def preserve_permission(self):
        """
        Sets the permission of the file in the destination equal
        to the permission of the file in the source
        :rtype: DistCp
        """
        self.__set_preserve('p')

        return self

    def preserve_group(self):
        """
        Sets the group of the file in the destination equal
        to the group of the file in the source
        :rtype: DistCp
        """
        self.__set_preserve('g')

        return self

    def preserve_checksum_type(self):
        """
        Sets the checksum type of the file in the destination equal
        to the checksum type of the file in the source
        :rtype: DistCp
        """
        self.__set_preserve('c')

        return self

    def __set_preserve(self, value):
        """
        :return:
        """

        if value not in self.preserve:
            self.preserve = "{0}{1}".format(self.preserve, value)
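
A minimal usage sketch for the DistCp wrapper above; the source and destination paths are placeholder values.

# Hypothetical usage for illustration only.
DistCp() \
    .take("/tmp/source-dir") \
    .copy_to("/tmp/destination-dir") \
    .use(mappers=10) \
    .update_destination(synchronize=True) \
    .preserve_replication_number() \
    .run()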
Example #17
File: flow.py Project: Mbaroudi/Merlin
 def __init__(self, name):
     super(Workflow, self).__init__()
     self.name = name
     self.__action_registry__ = {}
     self.log = get_logger(self.name)
Example #18
class JobStatus(object):
    """
    Describes the current status of a job.
    """

    COUNTER_SECTION = 'COUNTER'
    LOG = get_logger("MapReduceJobStatus")
    CLI_COMMAND = 'hadoop job'

    def __init__(self, job_id, executor=execute_shell_command):
        super(JobStatus, self).__init__()
        self.job_id = job_id
        self._executor = executor
        self.job_stats = None

    def state(self):
        """

        Returns the current state of the Job.
        :return: string value for job state.
        Possible values : FAILED, KILLED, PREP, RUNNING, SUCCEEDED
        """
        return self.stats()['Job state']

    @staticmethod
    def job_id(stderr):
        """
        Parses MR job stderr to get job id.
        :return: job id
        """
        _job_id = None
        for line in stderr.splitlines():
            if 'Running job:' in line:
                _job_id = str(line).rsplit(':', 1)[1].strip()
                JobStatus.LOG.info("Job id : {0}".format(_job_id))
                break
        if not _job_id:
            JobStatus.LOG.info("Cannot get job id")
        return _job_id

    def counters(self):
        """
        Gets the counters for this job.

        :return: all job counters in format {counter_group :{counter_name : counter_value}}
        """
        return self.stats()[JobStatus.COUNTER_SECTION]

    def counter(self, group, counter):
        """
        Gets the value of the specific job counter.
        :param group:
        :param counter:
        :return: the value for the specific counter
        """
        _counters = self.counters()
        return int(
            _counters[group][counter]
        ) if group in _counters and counter in _counters[group] else None

    def stats(self):
        """
        Gets aggregate job statistics, which includes:
        - job id
        - job file
        - job tracking URL
        - number of maps/reduces
        - map()/reduce() completion
        - job state
        - reason for failure
        - job counters
        - etc
        :return: job details
        """
        if not self.job_stats:
            _result = self._executor(self.CLI_COMMAND, '-status', self.job_id)
            _result.if_failed_raise(
                CommandException("cannot get map reduce job status"))
            self._parse_stdout_(_result.stdout)
        return self.job_stats

    def is_failed(self):
        """
        Checks if the job failed.

        :return:
        """
        return 'FAILED' == self.state()

    def is_killed(self):
        """
        Checks if the job process was killed.

        :return:
        """
        return 'KILLED' == self.state()

    def is_succeeded(self):
        """
        Checks if the job completed successfully.

        :return:
        """
        return self.state() == 'SUCCEEDED'

    def is_running(self):
        """
        Checks if the job is finished or not.

        :return: True if the job has running or prep state
        """
        return self.state() in ['PREP', 'RUNNING']

    def failure_reason(self):
        """
        Gets any available info on the reason of failure of the job.

        :return:  diagnostic information on why a job might have failed.
        """
        return None if not self.is_failed() else self.stats(
        )['reason for failure']

    def _parse_stdout_(self, stream):
        """
        Parses hadoop jar -status <job_id> output stream to get job stats
        :param stream: stream containing job stats data
        :return: dictionary containing job stats
        """
        _counter_group = None
        _job_metrics = {JobStatus.COUNTER_SECTION: {}}
        for line in stream.splitlines():
            is_counter = re.match('\A\t\t\w', line)
            is_counter_header = not is_counter and re.match('\A\t\w', line)
            key_value = [
                part.strip()
                for part in line.split("=" if is_counter else ":", 1)
            ]
            if is_counter_header:
                _counter_group = line.strip()
            elif is_counter:
                if not _counter_group in _job_metrics[
                        JobStatus.COUNTER_SECTION]:
                    _job_metrics[
                        JobStatus.COUNTER_SECTION][_counter_group] = {}
                _job_metrics[JobStatus.COUNTER_SECTION][_counter_group][
                    key_value[0]] = key_value[1]
            elif len(key_value) > 1:
                _job_metrics[key_value[0]] = key_value[1]
        self.job_stats = _job_metrics
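
A minimal usage sketch for the status wrapper above; the job id and counter names are placeholder values.

# Hypothetical usage for illustration only.
status = JobStatus("job_201502271530_0001")
if status.is_succeeded():
    records = status.counter(group="org.apache.hadoop.mapreduce.TaskCounter",
                             counter="REDUCE_INPUT_RECORDS")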
Example #19
class Pig(object):
    """ Wrapper for pig command line utility. Provides logic to configure and launch Pig scripts"""
    LOG = get_logger("Pig")

    @staticmethod
    def load_commands_from_file(path,
                                command_executor=execute_shell_command):
        """
        Creates an instance of Pig client.
        Configures Pig client to run commands from specified script file.
        :param path: path to the script to execute
        :param command_executor:  The interface used by the client to run command.

        :type path: str
        :rtype: Pig
        """
        Pig.LOG.info("Loading Pig script from file : {0}".format(path))
        _config = Configuration.create(readonly=False, accepts_nulls=True)
        _job_name = "PIG_TASK_{0}".format(uuid.uuid4())
        _pig = Pig(config=_config,
                   job_name=_job_name,
                   command_executor=command_executor)
        _pig.execute_script(path=path)
        return _pig

    @staticmethod
    def load_commands_from_string(commands,
                                  command_executor=execute_shell_command):
        """
        Creates an instance of Pig client.
        Configures Pig client to parse and run commands from string.
        :param commands: Commands to execute (within quotes)
        :param command_executor: The interface used by the client to run command.

        :type commands: str
        :rtype: Pig
        """
        _config = Configuration.create(readonly=False, accepts_nulls=True)
        _job_name = "PIG_TASK_{0}".format(uuid.uuid4())
        _pig = Pig(config=_config,
                   job_name=_job_name,
                   command_executor=command_executor)
        _pig.execute_commands(commands=commands)
        return _pig

    @staticmethod
    def load_preconfigured_job(config, job_name, command_executor=execute_shell_command):
        """
        Creates a pre-configured instance of the Pig client.
        :param config: pig job configurations
        :param job_name: pig job identifier.
             Will be used as a name of the section with job-specific configurations.
        :param command_executor:
        :return:
        """
        return Pig(config=config,
                   job_name=job_name,
                   command_executor=command_executor)

    def __init__(self, config, job_name, command_executor):
        self._config = config
        self._job_name = job_name
        self._command_executor = command_executor

    def __add_config_option__(self, key, value):
        """Facade method used to add new options to job-specific section of the configuration"""
        self._config.set(section=self._job_name, key=key, value=value)

    def _has_config_option_(self, key):
        """Facade method used to check if option with specified name is exist in configuration"""
        return self._config.has(section=self._job_name, key=key)

    def _get_config_option_(self, key):
        """Facade method used to get option from configuration"""
        return self._config.get(
            section=self._job_name,
            key=key)

    def _wrap_with_quotes_(self, value):
        """Wraps string with quotes: single or double"""
        if not value or value[0] in ['"', "'"]:
            return value
        _template = "'{}'" if '"' in value else '"{}"'
        return _template.format(value)

    def execute_script(self, path):
        """
        Specifies file containing script to execute.
        :param path: Path to the script to execute
            Will be passed to command executor as a value of -file option
        :rtype: Pig
        """
        if path:
            self.__add_config_option__(TaskOptions.CONFIG_KEY_SCRIPT_FILE,
                                       path)
        return self

    def execute_commands(self, commands):
        """
        Specifies commands to execute
        :param commands: Commands to execute (within quotes)
        :rtype: Pig
        """
        if commands:
            self.__add_config_option__(TaskOptions.CONFIG_KEY_COMMANDS_STRING,
                                       commands)
        return self

    def _configure_command_(self):
        """Adds pig commands to cli call."""
        if self._has_config_option_(TaskOptions.CONFIG_KEY_SCRIPT_FILE):
            return ['-f', self._wrap_with_quotes_(
                self._get_config_option_(key=TaskOptions.CONFIG_KEY_SCRIPT_FILE)
            )]
        elif self._has_config_option_(TaskOptions.CONFIG_KEY_COMMANDS_STRING):
            return ['-e', self._wrap_with_quotes_(
                self._get_config_option_(key=TaskOptions.CONFIG_KEY_COMMANDS_STRING)
            )]
        else:
            raise PigCommandError(
                "Failed to configure command : one of {} or {} is required".format(
                    TaskOptions.CONFIG_KEY_SCRIPT_FILE,
                    TaskOptions.CONFIG_KEY_COMMANDS_STRING)
            )

    def _configure_pig_options_(self, verbose=False):
        """Parse job specific configurations and builds arguments to be passed to CLI call"""
        _options = []
        if verbose:
            _options.append('-verbose')
        _options.extend(self.__configure_logging__())
        self.__add_command_arg__("-param_file",
                                 TaskOptions.CONFIG_KEY_PARAMETER_FILE,
                                 _options)

        if self._has_config_option_(TaskOptions.CONFIG_KEY_PARAMETER_VALUE):
            _params = self._config.get_list(self._job_name,
                                            TaskOptions.CONFIG_KEY_PARAMETER_VALUE)
            if _params:
                _options.extend(["-param {}".format(param) for param in _params])

        self.__add_command_arg__("-propertyFile",
                                 TaskOptions.CONFIG_KEY_PROPERTIES_FILE,
                                 _options)
        if self._has_config_option_(TaskOptions.CONFIG_KEY_EXECTYPE):
            _options.append("-x {}".format(self._get_config_option_(TaskOptions.CONFIG_KEY_EXECTYPE)))
        _options.extend(self._disable_optimizations_())

        return _options

    def __configure_logging__(self):
        """add logging configurations to cli call"""
        _logging_options_ = []
        self.__add_command_arg__("-log4jconf",
                                 TaskOptions.CONFIG_KEY_LOG4J,
                                 _logging_options_)
        self.__add_command_arg__("-logfile",
                                 TaskOptions.CONFIG_KEY_LOG_FILE,
                                 _logging_options_)
        self.__add_command_marker_arg("-brief",
                                      TaskOptions.CONFIG_KEY_LOG_BRIEF,
                                      _logging_options_)
        self.__add_command_marker_arg("-warning",
                                      TaskOptions.CONFIG_KEY_LOG_WARNING,
                                      _logging_options_)
        self.__add_command_marker_arg("-debug",
                                      TaskOptions.CONFIG_KEY_LOG_DEBUG,
                                      _logging_options_)

        return _logging_options_

    def _disable_optimizations_(self):
        """add cli call args to disable Pig Job optimizations"""
        _optimizations = []
        _optimizations.extend(
            self._configure_optimization_rule(TaskOptions.CONFIG_KEY_DISABLE_SPLIT_FILTER)
        )
        _optimizations.extend(
            self._configure_optimization_rule(TaskOptions.CONFIG_KEY_DISABLE_PUSHUP_FILTER)
        )
        _optimizations.extend(
            self._configure_optimization_rule(TaskOptions.CONFIG_KEY_DISABLE_MERGE_FILTER)
        )
        _optimizations.extend(
            self._configure_optimization_rule(
                TaskOptions.CONFIG_KEY_DISABLE_PUSHDOWN_FOREACH_FLATTEN
            )
        )
        _optimizations.extend(
            self._configure_optimization_rule(TaskOptions.CONFIG_KEY_DISABLE_LIMIT_OPTIMIZER)
        )
        _optimizations.extend(
            self._configure_optimization_rule(
                TaskOptions.CONFIG_KEY_DISABLE_COLUMN_MAP_KEY_PRUNE
            )
        )
        _optimizations.extend(
            self._configure_optimization_rule(
                TaskOptions.CONFIG_KEY_DISABLE_ADD_FOREACH
            )
        )
        _optimizations.extend(
            self._configure_optimization_rule(
                TaskOptions.CONFIG_KEY_DISABLE_MERGE_FOREACH
            )
        )
        _optimizations.extend(
            self._configure_optimization_rule(
                TaskOptions.CONFIG_KEY_DISABLE_GROUPBY_CONST_PARALLEL_SETTER
            )
        )
        _optimizations.extend(
            self._configure_optimization_rule(TaskOptions.CONFIG_KEY_DISABLE_ALL)
        )
        if self.is_optimization_disabled(TaskOptions.CONFIG_KEY_DISABLE_MULTIQUERY):
            _optimizations.append('-no_multiquery')

        return _optimizations

    def _configure_optimization_rule(self, rule_name):
        """build cli parameter to disable specific optimization rule"""
        return ['-optimizer_off', rule_name] if self.is_optimization_disabled(rule_name) else []

    def __add_command_arg__(self, name, config_key, args=list()):
        """adds argument to cli call"""
        if self._has_config_option_(config_key):
            args.extend([name, self._get_config_option_(config_key)])

    def __add_command_marker_arg(self, name, config_key, args=list()):
        """adds marker argument (argument without value) to cli call"""
        if self._has_config_option_(config_key) and self._get_config_option_(config_key):
            args.append(name)

    def log_config(self,
                   logfile=None,
                   debug=False,
                   warning=False,
                   brief=False):
        """
        Adds and configures custom logger for Pig's script

        :param logfile: path to the file that will receive logs from the Pig Job
        :param debug: Enables debug level. Default is False
        :param warning: Enables warning level. Default is False
            Also turns warning aggregation off
        :param brief: Enables brief logging (no timestamps). Default is False

        :type logfile: str
        :type debug: bool
        :type warning: bool
        :type brief: bool
        :rtype: Pig
        """
        self.__add_config_option__(TaskOptions.CONFIG_KEY_LOG_FILE, logfile)
        if debug:
            self.__add_config_option__(TaskOptions.CONFIG_KEY_LOG_DEBUG, "enabled")
        if warning:
            self.__add_config_option__(TaskOptions.CONFIG_KEY_LOG_WARNING, "enabled")
        if brief:
            self.__add_config_option__(TaskOptions.CONFIG_KEY_LOG_BRIEF, "enabled")
        return self

    def log4j_config(self, path):
        """
        Specifies a Log4j configuration file; overrides the logging configuration
        :param path: path to file with log4j parameters for Pig Job
        :return:
        """
        if path:
            self.__add_config_option__(TaskOptions.CONFIG_KEY_LOG4J, path)
        return self

    def with_parameter(self, key, value):
        """
        Sets a parameter for the Pig Job script in the following format:
        name=value
        :param key: key to parameter
        :param value: value of parameter
        :type key: str
        :type value: str
        :rtype: Pig
        """
        self._config.update_list(self._job_name,
                                 TaskOptions.CONFIG_KEY_PARAMETER_VALUE,
                                 "{}={}".format(key, value))
        return self
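    # Illustrative note: with_parameter("input_dir", "/data/in") appends the string
    # "input_dir=/data/in" to the job's parameter list; presumably this is emitted
    # as a "-param input_dir=/data/in" argument when the command line is assembled
    # (the assembling code is not part of this excerpt).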

    def load_parameters_from_file(self, path):
        """
        Specifies file with parameters
        :param path: Path to the parameter file
        :return:
        """
        self.__add_config_option__(TaskOptions.CONFIG_KEY_PARAMETER_FILE, path)
        return self

    def with_property_file(self, path):
        """
        Sets the file with properties at the given path
        :param path: path to the file with properties for the Pig job
        :type path: str
        :rtype: Pig
        """
        self.__add_config_option__(TaskOptions.CONFIG_KEY_PROPERTIES_FILE, path)
        return self

    def without_split_filter(self):
        """
        Job will run without optimization 'Split filter conditions'

        This optimization splits filter conditions so that filters can be pushed more aggressively.
        e.g.:
            D = FILTER C BY a1>0 and b1>0;
        will be split into:
            X = FILTER C BY a1>0;
            D = FILTER X BY b1>0;
        """
        self.__add_config_option__(TaskOptions.CONFIG_KEY_DISABLE_SPLIT_FILTER, 'disable')
        return self

    def without_pushup_filter(self):
        """
        Job will run without optimization 'Early Filters'
        The objective of this optimization rule is to push
        the FILTER operators up the data flow graph.
        As a result, the number of records that flow through the pipeline is reduced.
        """
        self.__add_config_option__(TaskOptions.CONFIG_KEY_DISABLE_PUSHUP_FILTER, 'disable')
        return self

    def without_merge_filter(self):
        """
        Job will run without optimization 'Merge filter conditions'
        This rule is used to merge filter conditions after the PushUpFilter
        rule to decrease the number of filter statements.
        """
        self.__add_config_option__(TaskOptions.CONFIG_KEY_DISABLE_MERGE_FILTER, 'disable')
        return self

    def without_push_down_foreach_flatten(self):
        """
        Job will run without optimization 'Join or explode as late as possible'
        The objective of this rule is to reduce the number of records that flow
        through the pipeline by moving FOREACH operators with a FLATTEN down the data flow graph.
        """
        self.__add_config_option__(
            TaskOptions.CONFIG_KEY_DISABLE_PUSHDOWN_FOREACH_FLATTEN,
            'disable'
        )
        return self

    def without_limit_optimizer(self):
        """
        Job will run without optimization 'Limit as early as possible'
        The objective of this rule is to push the LIMIT operator up the data flow graph.
        In addition, for top-k (ORDER BY followed by a LIMIT) the LIMIT
        is pushed into the ORDER BY.
        """
        self.__add_config_option__(TaskOptions.CONFIG_KEY_DISABLE_LIMIT_OPTIMIZER, 'disable')
        return self

    def without_column_map_key_prune(self):
        """
        Job will run without optimization 'Remove unused data'
        Prunes the loader so that it only loads the necessary columns.
        The performance gain is more significant if the corresponding loader supports
        column pruning and only loads the necessary columns.
        Otherwise, ColumnMapKeyPrune will insert a ForEach statement right after the loader.
        """
        self.__add_config_option__(TaskOptions.CONFIG_KEY_DISABLE_COLUMN_MAP_KEY_PRUNE,
                                   'disable')
        return self

    def without_add_foreach(self):
        """
        Job will run without optimization 'Add ForEach to remove unneeded columns'
        Prunes unused columns as soon as possible.
        """
        self.__add_config_option__(TaskOptions.CONFIG_KEY_DISABLE_ADD_FOREACH,
                                   'disable')
        return self

    def without_merge_foreach(self):
        """
        Job will run without optimization 'Merge adjacent ForEach'
        The objective of this rule is to merge together two foreach statements,
        if these preconditions are met:

            - The foreach statements are consecutive.
            - The first foreach statement does not contain flatten.
            - The second foreach is not nested.
        """
        self.__add_config_option__(TaskOptions.CONFIG_KEY_DISABLE_MERGE_FOREACH,
                                   'disable')
        return self

    def without_groupby_const_parallel_setter(self):
        """
        Job will run without optimization 'Force parallel 1 for "group all" statement'
        Force parallel "1" for "group all" statement.
        That's because even if we set parallel to N, only 1 reducer
        will be used in this case and all other reducer produce empty result.
        """
        self.__add_config_option__(TaskOptions.CONFIG_KEY_DISABLE_GROUPBY_CONST_PARALLEL_SETTER,
                                   'disable')
        return self

    def without_multiquery(self):
        """
        Turns off multi-query optimization.
        By default, multi-query optimization is turned on
        :rtype: Pig
        """
        self.__add_config_option__(TaskOptions.CONFIG_KEY_DISABLE_MULTIQUERY,
                                   'disable')
        return self

    def disable_all_optimizations(self):
        """Disables all optimizations"""
        self.__add_config_option__(TaskOptions.CONFIG_KEY_DISABLE_ALL,
                                   'disable')
        return self

    def run(self, debug=False):
        """
        Runs Pig Job
        :rtype: Result
        """
        Pig.LOG.info("Running Pig Job")
        command_args = self._configure_pig_options_(debug) + self._configure_command_()
        return self._command_executor('pig', *command_args)

    def debug(self):
        """Runs Pig script in debug mode."""
        return self.run(debug=True)

    def is_optimization_disabled(self, optimization_config_key, disable_marker='disable'):
        """
        Checks whether the specified optimization is disabled.
        By default, optimization and all optimization rules are turned on.
        :param optimization_config_key:
        :param disable_marker:
        :return:
        """
        return self._has_config_option_(optimization_config_key) \
               and disable_marker == self._get_config_option_(optimization_config_key)

    def using_mode(self, type="mapreduce"):
        """
        Sets execution mode, default is mapreduce.
        """
        self.__add_config_option__(TaskOptions.CONFIG_KEY_EXECTYPE,
                                   type)
        return self
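A minimal usage sketch for the Pig wrapper above, assuming the merlin package is importable. The factory name Pig.load_commands_from_string and the module path merlin.tools.pig are assumptions (the Pig constructor and module header are not part of this excerpt), chosen to mirror the Hive wrapper shown further down; the executor is swapped for a stub, so nothing is actually launched.

from merlin.tools.pig import Pig  # assumed module path, mirroring merlin.tools.hive


def fake_executor(cmd, *args):
    # Stand-in for execute_shell_command: print the assembled call instead of running it.
    print("{0} {1}".format(cmd, " ".join(args)))


pig = (Pig.load_commands_from_string("A = LOAD '/data/in'; DUMP A;",  # assumed factory
                                     executor=fake_executor)
       .using_mode("local")
       .with_parameter("input_dir", "/data/in")
       .without_multiquery()                       # adds -no_multiquery at run time
       .log_config(logfile="/tmp/pig-job.log", warning=True))

pig.run()  # assembles the "pig ..." command line and hands it to fake_executor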
Example #20
File: flow.py Project: epam/Merlin
 def __init__(self, name='WorkflowFailOverController'):
     self.log = get_logger(name)
     self.metrics = {}
Example #21
File: listeners.py Project: epam/Merlin
 def __init__(self, name='LoggingListener'):
     self.log = get_logger(name)
Example #22
File: flow.py Project: Mbaroudi/Merlin
Compares them and gets only the new files on FTP that don't exist on HDFS.
Downloads new files to HDFS with partitioning.
"""
from ConfigParser import RawConfigParser
import os
from merlin.common.logger import get_logger
from merlin.flow.flow import Workflow, FlowRegistry
from merlin.flow.listeners import LoggingListener, WorkflowListener
from merlin.fs.ftp import ftp_client
from merlin.fs.hdfs import HDFS
from merlin.fs.localfs import LocalFS
from merlin.fs.utils import FileUtils
from merlin.tools.hive import Hive

BASE_DIR = "/tmp/base_folder"
log = get_logger("MonitoringFTP")

config = RawConfigParser()
config.read(os.path.join(os.path.dirname(__file__),
                         "resources/ftp_config.ini"))
HOST_DOWNLOAD = config.get("ftp", "host.download")
USER_NAME = config.get("ftp", "user.name")
PASSWORD = config.get("ftp", "password")
PATH = config.get("ftp", "path")


def get_name(path):
    if not hasattr(path, 'name'):
        raise TypeError('FileDescriptor is required. '
                        'Cannot extract file name from {0}'.format(
                            path.__class__))
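For context, the configuration lookups above imply a resources/ftp_config.ini shaped roughly like the sketch below; only the section and option names come from the snippet, the values are invented placeholders.

# Hypothetical resources/ftp_config.ini contents (placeholder values):
#
#   [ftp]
#   host.download = ftp.example.com
#   user.name = merlin
#   password = secret
#   path = /incoming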
Example #23
File: listeners.py Project: epam/Merlin
 def __init__(self, name='ProfilingListener'):
     self.log = get_logger(name)
     self.metrics = {}
Example #24
File: hive.py Project: Mbaroudi/Merlin
class Hive(object):
    """
    Wrapper for Hive command line utility
    """

    LOG = get_logger('Hive')

    @staticmethod
    def load_queries_from_file(path, executor=execute_shell_command):
        """
        Creates a wrapper for the hive command line utility that executes queries from a file
        :param path: path to the file with queries for the Hive job
        :type path: str
        :rtype: Hive
        """

        Hive.LOG.info("Loading Hive queries from file : {0}".format(path))
        hive = Hive(executor=executor)
        hive.execute_script(path)
        return hive

    @staticmethod
    def load_queries_from_string(query, executor=execute_shell_command):
        """
        Creates a wrapper for the hive command line utility that executes queries from a string
        :param query: HiveQL query to execute
        :type query: str
        :rtype: Hive
        """

        Hive.LOG.info("Loading Hive queries from string : {0}".format(query))
        hive = Hive(executor=executor)
        hive.execute_commands(query)
        return hive

    @staticmethod
    def load_preconfigured_job(name=None,
                               config=None,
                               executor=execute_shell_command):
        """
        Creates wrapper for hive command line utility. Configure it with options
        :param config: hive job configurations
        :param name: hive job identifier.
             Will be used as a name of the section with job-specific configurations.
        :param executor:
        :return:
        """
        Hive.LOG.info("Loading Hive queries from configuration")

        return Hive(name=name, config=config, executor=executor)

    def __init__(self, name=None, config=None, executor=execute_shell_command):
        """
        Creates wrapper for Hive command line utility
        :param executor: custom executor
        :type executor:
        """

        super(Hive, self).__init__()
        self.name = name if name else "HIVE_TASK_{0}".format(uuid.uuid4())
        self.__executor = executor
        self._config = config if config else Configuration.create(
            readonly=False, accepts_nulls=True)

    def _wrap_with_quotes_(self, value):
        if not value or value[0] in ['"', "'"]:
            return value
        return "\"{0}\"".format(value)

    def execute_script(self, path):
        """
        Specifies file containing script to execute.
        :param path: Path to the script to execute
            Will be passed to command executor as a value of -f option
        :rtype: Hive
        """
        if path:
            self.__set_config(TaskOptions.CONFIG_KEY_QUERY_FILE, None, path)
        return self

    def execute_commands(self, commands):
        """
        Specifies commands to execute
        :param commands: Commands to execute (within quotes)
        :rtype: Hive
        """
        if commands:
            self.__set_config(TaskOptions.CONFIG_KEY_COMMANDS_STRING, None,
                              commands)
        return self

    def _configure_command_(self):
        if self.has_option(TaskOptions.CONFIG_KEY_QUERY_FILE):
            return [
                '-f',
                self._wrap_with_quotes_(
                    self._config.get(section=self.name,
                                     key=TaskOptions.CONFIG_KEY_QUERY_FILE))
            ]
        elif self.has_option(TaskOptions.CONFIG_KEY_COMMANDS_STRING):
            return [
                '-e',
                self._wrap_with_quotes_(
                    self._config.get(
                        section=self.name,
                        key=TaskOptions.CONFIG_KEY_COMMANDS_STRING))
            ]
        else:
            raise HiveCommandError(
                "Failed to configure command : one of {0} or {1} is required".
                format(TaskOptions.CONFIG_KEY_QUERY_FILE,
                       TaskOptions.CONFIG_KEY_COMMANDS_STRING))

    def with_hive_conf(self, name, value):
        """
        Adds a hive configuration option to the Hive job
        :param name: name of the given configuration option
        :param value: value of the given configuration option
        :type name: str
        :type value: str
        :rtype: Hive
        """

        key = TaskOptions.CONF_KEY_HIVE_CONFIG
        self.__set_config(key, name, value)
        return self

    def __set_config(self, key, name, value):
        """
        Helper method that adds a parameter to the job configuration,
        appending "name=value" pairs when the key already exists
        :type key:
        :type name:
        :type value:
        """
        if name:
            if self.has_option(key):
                value = "{0}\n{1}={2}".format(self._config.get(self.name, key),
                                              name, value)
            else:
                value = "{0}={1}".format(name, value)

        self._config.set(self.name, key, value)
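    # Illustrative accumulation (names and values are invented): calling
    # with_hive_conf("hive.exec.parallel", "true") and then
    # with_hive_conf("mapred.reduce.tasks", "4") stores the newline-joined string
    # "hive.exec.parallel=true\nmapred.reduce.tasks=4" under CONF_KEY_HIVE_CONFIG;
    # build() later reads it back with get_list() and emits one --hiveconf per pair
    # (assuming Configuration.get_list splits on newlines, which is not shown here).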

    def add_hivevar(self, name, value):
        """
        Sets a hive variable in the job's context
        :param name: name of the given variable
        :param value: value of the given variable
        :type name: str
        :type value: str
        :rtype: Hive
        """
        key = TaskOptions.CONF_KEY_HIVE_VAR
        self.__set_config(key, name, value)
        return self

    def define_variable(self, name, value):
        """
        Sets a hive variable in the job's context
        :param name: name of the given variable
        :param value: value of the given variable
        :type name: str
        :type value: str
        :rtype: Hive
        """
        key = TaskOptions.CONF_KEY_DEFINE
        self.__set_config(key, name, value)
        return self

    def use_database(self, database):
        """
        Sets the database in the job's context
        :param database: name of the custom database
        :type database: str
        :rtype: Hive
        """
        key = TaskOptions.CONF_KEY_DATABASE
        self.__set_config(key, None, database)
        return self

    def with_auxillary_jars(self, jars):
        """
        Sets the paths to jars that contain implementations of user-defined functions and SerDes
        :param jars: paths to jars
        :type jars: list, str
        :rtype: Hive
        """
        if isinstance(jars, list):
            jars = ",".join(jars)
        key = TaskOptions.CONF_KEY_AUXPATH
        self.__set_config(key, None, jars)
        return self

    def run(self):
        """
        Runs Hive Job
        :rtype:
        """
        Hive.LOG.info("Executing Hive Job")
        result = self.__executor("hive", self.build())
        result.if_failed_raise(HiveCommandError("Hive Job failed"))
        return result

    def build(self):
        """
        Builds query params for hive's query
        :return: list of query params
        :rtype: list
        """
        params = []
        if self.has_option(TaskOptions.CONF_KEY_AUXPATH):
            params.append("--auxpath {0}".format(
                self._config.get(self.name, TaskOptions.CONF_KEY_AUXPATH)))
        params.extend(self._configure_command_())
        if self.has_option(TaskOptions.CONF_KEY_DEFINE):
            list_ = self._config.get_list(self.name,
                                          TaskOptions.CONF_KEY_DEFINE)
            for value in list_:
                params.append("--define")
                params.append(value)
        if self.has_option(TaskOptions.CONF_KEY_HIVE_CONFIG):
            list_ = self._config.get_list(self.name,
                                          TaskOptions.CONF_KEY_HIVE_CONFIG)
            for value in list_:
                params.append("--hiveconf")
                params.append(value)
        if self.has_option(TaskOptions.CONF_KEY_HIVE_VAR):
            list_ = self._config.get_list(self.name,
                                          TaskOptions.CONF_KEY_HIVE_VAR)
            for value in list_:
                params.append("--hivevar")
                params.append(value)
        if self.has_option(TaskOptions.CONF_KEY_DATABASE):
            params.append("--database {0}".format(
                self._config.get(self.name, TaskOptions.CONF_KEY_DATABASE)))
        return " ".join(params)

    def has_option(self, key):
        """
        Checks if an attribute with the given key exists in the job-specific section of the Configuration.
        :param key: attribute name
        :return: True if the attribute was found, otherwise False
        """
        return self._config.has(section=self.name, key=key)
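To tie the Hive wrapper together, here is a minimal usage sketch, assuming the merlin package is importable (the import path matches the one used in the workflow example above). The executor is replaced with a stub so the assembled command line is only printed, never executed.

from merlin.tools.hive import Hive  # import path as used in the workflow example above


def fake_executor(cmd, *args):
    # Stand-in for execute_shell_command: capture the call instead of shelling out.
    print("{0} {1}".format(cmd, " ".join(args)))

    class StubResult(object):
        def if_failed_raise(self, error):
            pass  # nothing to check, the command never ran
    return StubResult()


hive = (Hive.load_queries_from_string("SHOW TABLES;", executor=fake_executor)
        .use_database("default"))

hive.run()  # prints roughly: hive -e "SHOW TABLES;" --database default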
Example #25
File: bootstrap.py Project: epam/Merlin
 def __init__(self):
     super(FsSnapshot, self).__init__()
     self.logger = get_logger(self.__class__.__name__)
     self.files = {}