Exemplo n.º 1
0
def azkaban_mongo_job():
    """Declare the Azkaban projects for the Mongo data-upload flow."""
    # Push job: runs the Mongo upload script after `MongoStart` completes.
    stats_project = Project('LeadDataStats')
    stats_project.add_job(
        'UploadDataStats',
        Job(
            {'type': 'command',
             'command': 'python /home/msingh/Documents/PycharmProjects/AzkabanTest/mongo/MongoDataPush.py'},
            {'dependencies': 'MongoStart'},
        ),
    )

    # Status job: echoes a confirmation once the upload job has finished.
    status_project = Project('MongoDataUpload')
    status_project.add_job(
        'UploadDataStatus',
        Job(
            {'type': 'command', 'command': 'echo "Data successfully  uploaded"'},
            {'dependencies': 'UploadDataStats'},
        ),
    )
Exemplo n.º 2
0
def build_project(project_name, global_props, project_props, jobs, files,
                  version):
    """Build an Azkaban :class:`Project` from declarative inputs.

    Args:
        project_name: Name of the Azkaban project / workflow.
        global_props: Mapping of properties shared across projects.
        project_props: Mapping of project-specific property overrides.
        jobs: Mapping of job name -> job definition dict.
        files: Iterable of ``(local_path, target)`` pairs to bundle.
        version: Version string recorded on the project.

    Returns:
        The fully populated Project.
    """
    logger.info("Building workflow %s, version: %s.", project_name, version)

    project = Project(project_name, root=os.curdir, version=version)
    # Copy before updating: the original code assigned `global_props`
    # directly and then mutated it via update(), silently changing the
    # caller's shared dict. A shallow copy keeps the override local.
    project.properties = dict(global_props)
    project.properties.update(project_props)

    for job_name, job_definition in jobs.items():
        project.add_job(job_name, Job(job_definition))

    # `local_path` avoids shadowing the builtin name `file`.
    for local_path, target in files:
        project.add_file(local_path, target)
    return project
Exemplo n.º 3
0
    },
    'jvm.args.mapred': {
        'max.split.size': 2684354560,
        'min.split.size': 2684354560,
    },
}

# Pig job definitions, listed in execution order. The `dependencies`
# entries wire up the flow DAG; `param` entries become pig parameters.
OPTIONS = [
    {'pig.script': 'first.pig'},
    {'pig.script': 'second.pig', 'dependencies': 'first.pig'},
    {'pig.script': 'third.pig', 'param': {'foo': 48}},
    {'pig.script': 'fourth.pig', 'dependencies': 'second.pig,third.pig'},
]

# Register each pig script as a job, keyed by its script name.
for job_options in OPTIONS:
    PROJECT.add_job(job_options['pig.script'], PigJob(DEFAULTS, job_options))
Exemplo n.º 4
0
#!/usr/bin/env python
# encoding: utf-8

"""Simple Azkaban project configuration script."""

from azkaban import Job, Project

# Minimal project: one command job that echoes a greeting.
project = Project('foo')
greeting_job = Job({'type': 'command', 'command': 'echo "hi!"'})
project.add_job('bar', greeting_job)

if __name__ == '__main__':
  # Hand control to azkaban's CLI entry point (build/upload/...).
  project.main()
Exemplo n.º 5
0
  'basic_flow':        Job({'type': 'noop'   , 'dependencies': 'basic_step_5.cmd,basic_step_6.cmd,basic_step_7.cmd,basic_step_8.cmd'}),
  # `template_flow` example
  #   • Demonstrates using one flow as a "template" that is embedded in another flow and reused multiple times.
  #   • The only work performed by job in this example template is to echo out the variables it receives to the log.
  #     NOTE: We have to `chmod 777` our script to make sure Azkaban can run it.
  '_template_chmod.cmd':  Job({'type': 'command', 'command': 'chmod 777 _echo.sh'}),
  '_template_echo_1.cmd': Job({'type': 'command', 'command': './_echo.sh "echo_1" ${project_1} ${custom_1} ${custom_2}', 'dependencies': '_template_chmod.cmd'}),
  '_template_echo_2.cmd': Job({'type': 'command', 'command': './_echo.sh "echo_2" ${project_1} ${custom_1} ${custom_2}', 'dependencies': '_template_echo_1.cmd'}),
  '_template':            Job({'type': 'noop'   , 'dependencies': '_template_echo_2.cmd'}),
  #   • Each of the following subflows embeds *ALL* of the steps from `_template` using the `flow.name` key.
  #   • Each defines `custom_1` and `custom_2` keys which are passed as variables ${custom_1} and ${custom_2} to `_template` during execution.
  'start.noop':           Job({'type': 'noop'}),
  'subflow_1.flow':       Job({'type': 'flow', 'flow.name': '_template', 'dependencies': 'start.noop', 'custom_1': 'subflow1-val1', 'custom_2': 'subflow1-val2'}),
  'subflow_2.flow':       Job({'type': 'flow', 'flow.name': '_template', 'dependencies': 'start.noop', 'custom_1': 'subflow2-val1', 'custom_2': 'subflow2-val2'}),
  'subflow_3.flow':       Job({'type': 'flow', 'flow.name': '_template', 'dependencies': 'start.noop', 'custom_1': 'subflow3-val1', 'custom_2': 'subflow3-val2'}),
  'subflow_4.flow':       Job({'type': 'flow', 'flow.name': '_template', 'dependencies': 'start.noop', 'custom_1': 'subflow4-val1', 'custom_2': 'subflow4-val2'}),
  'workflow':             Job({'type': 'noop', 'dependencies': 'subflow_1.flow,subflow_2.flow,subflow_3.flow,subflow_4.flow'})
}

# Attach every declared job to the project, keyed by its name.
for job_name, job_spec in JOBS.items():
  PROJECT.add_job(job_name, job_spec)

# Non-job files must be listed explicitly for the CLI to bundle them
# (the project must declare `root` for this to work).
# Maps local path -> name inside the archive.
FILES = {'./_echo.sh': '_echo.sh'}

# Bundle each auxiliary file into the project archive.
for local_path, archive_name in FILES.items():
  PROJECT.add_file(local_path, archive_name)
Exemplo n.º 6
0
PROJECT = Project('azkabancli_sample', root=__file__)
PROJECT.properties = {
    'user.to.proxy': 'production_user',
    'hdfs.root': '/jobs/sample/',
}

# Job definitions keyed by job name.
JOBS = {
    'gather_data': Job({
        'type': 'hadoopJava',
        'job.class': 'sample.GatherData',
        # '${hdfs.root}' is substituted from the project properties above.
        'path.output': '${hdfs.root}data.avro',
    }),
    # ...
}

for job_name, job_spec in JOBS.items():
    PROJECT.add_job(job_name, job_spec)

# Test project
# ------------
#
# Exact mirror of the production project, used to debug / test new
# features independently from the production flows.

TEST_PROJECT = Project('sample_test', root=__file__)
TEST_PROJECT.properties = {'user.to.proxy': getuser(), 'hdfs.root': 'sample/'}
PROJECT.merge_into(TEST_PROJECT)
Exemplo n.º 7
0
from azkaban import Job, Project

# Project that ships this very script and defines a job that prints it.
project = Project('foo')
project.add_file('./jobs.py', 'jobs.py')
cat_job = Job({'type': 'command', 'command': 'cat jobs.py'})
project.add_job('bar', cat_job)
Exemplo n.º 8
0

# Root the project at this file so added files resolve relative to it.
project = Project('foo', root=__file__)

# Options shared by every pig job in this project.
defaults = {
  'user.to.proxy': getuser(),  # proxy as the invoking OS user
  'mapred': {'max.split.size': 2684354560, 'min.split.size': 2684354560},
}

# First script runs with only the shared defaults.
project.add_job('first_pig_script',
                PigJob('path/to/first_script.pig', defaults))  # assume it exists

# Second script additionally targets the 'special' mapred queue.
project.add_job('second_pig_script',
                PigJob('path/to/second_script.pig',  # assume it also exists
                       defaults,
                       {'mapred.job.queue.name': 'special'}))

project.add_job(
  'final_job',
Exemplo n.º 9
0
from azkaban import PigJob, Project
from getpass import getuser


# Root the project at this file so bundled paths resolve relative to it.
PROJECT = Project('azkabancli_sample', root=__file__)

# Options applied to every pig job in this project.
DEFAULTS = {
  'user.to.proxy': getuser(),  # proxy as the invoking OS user
  'param': {'input_root': 'sample_dir/', 'n_reducers': 20},
  'jvm.args.mapred': {
    'max.split.size': 2684354560,
    'min.split.size': 2684354560,
  },
}

# Pig jobs in dependency order; `dependencies` wires the flow DAG.
OPTIONS = [
  {'pig.script': 'first.pig'},
  {'pig.script': 'second.pig',
   'dependencies': 'first.pig'},
  {'pig.script': 'third.pig',
   'param': {'foo': 48}},
  {'pig.script': 'fourth.pig',
   'dependencies': 'second.pig,third.pig'},
]

# Register each pig script as a job, keyed by its script name.
for job_options in OPTIONS:
  PROJECT.add_job(job_options['pig.script'], PigJob(DEFAULTS, job_options))