Example #1
def get_webserver_url():
    return "{}:{}".format(conf.get('webserver', 'WEB_SERVER_HOST'),
                          conf.get('webserver', 'WEB_SERVER_PORT'))
Example #2
def config():
    conf.get('cwl', 'jobs')
    conf.get('cwl', 'limit')
Example #3
def general_paths():
    paths([conf.get('cwl', 'jobs'), DAGS_FOLDER, os.path.join(DAGS_FOLDER, "cwl_dag.py")])
Example #4
default_args = {
    'owner': 'xingya-zhou',
    'depends_on_past': False,
    'start_date': datetime.datetime.now(),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 3,
    'retry_delay': timedelta(minutes=5),
    'catchup': False,
}

dag = DAG('sparkify_dag',
          default_args=default_args,
          start_date=datetime.datetime.now())

f = open(os.path.join(conf.get('core', 'dags_folder'), 'create_tables.sql'))
create_tables_sql = f.read()

create_trips_table = PostgresOperator(task_id="create_trips_table",
                                      dag=dag,
                                      postgres_conn_id="redshift",
                                      sql=create_tables_sql)

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    table="staging_events",
Example #5
def create_folders():
    logging.info("Create folders for jobs and dags\n- {}\n- {}".format(conf.get('cwl', 'jobs'), DAGS_FOLDER))
    get_folder(conf.get('cwl', 'jobs'))
    get_folder(DAGS_FOLDER)
Example #6
def worker(args):
    """Starts Airflow Celery worker"""
    env = os.environ.copy()
    env['AIRFLOW_HOME'] = settings.AIRFLOW_HOME

    if not settings.validate_session():
        log = LoggingMixin().log
        log.error("Worker exiting... database connection precheck failed! ")
        sys.exit(1)

    # Celery worker
    from airflow.executors.celery_executor import app as celery_app
    from celery.bin import worker  # pylint: disable=redefined-outer-name

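    # Prefer the --autoscale value from the CLI; otherwise fall back to the
    # [celery] worker_autoscale setting in airflow.cfg, if present.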
    autoscale = args.autoscale
    if autoscale is None and conf.has_option("celery", "worker_autoscale"):
        autoscale = conf.get("celery", "worker_autoscale")
    worker = worker.worker(app=celery_app)  # pylint: disable=redefined-outer-name
    options = {
        'optimization': 'fair',
        'O': 'fair',
        'queues': args.queues,
        'concurrency': args.concurrency,
        'autoscale': autoscale,
        'hostname': args.celery_hostname,
        'loglevel': conf.get('core', 'LOGGING_LEVEL'),
    }

    if conf.has_option("celery", "pool"):
        options["pool"] = conf.get("celery", "pool")

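    # Daemon mode detaches the worker and redirects stdout/stderr to files;
    # the else branch below runs it in the foreground with signal handlers.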
    if args.daemon:
        pid, stdout, stderr, log_file = setup_locations(
            "worker", args.pid, args.stdout, args.stderr, args.log_file)
        handle = setup_logging(log_file)
        stdout = open(stdout, 'w+')
        stderr = open(stderr, 'w+')

        ctx = daemon.DaemonContext(
            pidfile=TimeoutPIDLockFile(pid, -1),
            files_preserve=[handle],
            stdout=stdout,
            stderr=stderr,
        )
        with ctx:
            sub_proc = subprocess.Popen(['airflow', 'serve_logs'],
                                        env=env,
                                        close_fds=True)
            worker.run(**options)
            sub_proc.kill()

        stdout.close()
        stderr.close()
    else:
        signal.signal(signal.SIGINT, sigint_handler)
        signal.signal(signal.SIGTERM, sigint_handler)

        sub_proc = subprocess.Popen(['airflow', 'serve_logs'],
                                    env=env,
                                    close_fds=True)

        worker.run(**options)
        sub_proc.kill()
Example #7
"""Airflow logging settings"""

import os
from typing import Any, Dict, Union

from airflow import AirflowException, conf
from airflow.utils.file import mkdirs

# TODO: Logging format and level should be configured
# in this file instead of from airflow.cfg. Currently
# there are other log format and level configurations in
# settings.py and cli.py. Please see AIRFLOW-1455.
LOG_LEVEL: str = conf.get('logging', 'LOGGING_LEVEL').upper()

# Flask appbuilder's info level log is very verbose,
# so it's set to 'WARN' by default.
FAB_LOG_LEVEL: str = conf.get('logging', 'FAB_LOGGING_LEVEL').upper()

LOG_FORMAT: str = conf.get('logging', 'LOG_FORMAT')

COLORED_LOG_FORMAT: str = conf.get('logging', 'COLORED_LOG_FORMAT')

COLORED_LOG: bool = conf.getboolean('logging', 'COLORED_CONSOLE_LOG')

COLORED_FORMATTER_CLASS: str = conf.get('logging', 'COLORED_FORMATTER_CLASS')

BASE_LOG_FOLDER: str = conf.get('logging', 'BASE_LOG_FOLDER')
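Further down in this settings module, constants like these are assembled into a logging dictionary; the snippet stops before that point, so the following is only a minimal, hypothetical sketch of the pattern using the standard library's logging.config.dictConfig. ASSUMED_LOGGING_CONFIG and its handler layout are illustrative, not Airflow's actual DEFAULT_LOGGING_CONFIG.

import logging.config

# Hypothetical sketch only -- not Airflow's real logging configuration.
ASSUMED_LOGGING_CONFIG: Dict[str, Any] = {
    "version": 1,
    "disable_existing_loggers": False,
    "formatters": {
        "airflow": {"format": LOG_FORMAT},
    },
    "handlers": {
        "console": {"class": "logging.StreamHandler", "formatter": "airflow"},
    },
    "loggers": {
        # Flask AppBuilder is verbose at INFO, hence the separate, quieter level.
        "flask_appbuilder": {"handlers": ["console"], "level": FAB_LOG_LEVEL},
    },
    "root": {"handlers": ["console"], "level": LOG_LEVEL},
}

logging.config.dictConfig(ASSUMED_LOGGING_CONFIG)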
Example #8
""" Module for Airtunnel's paths, i.e. to the declaration, data and scripts store folders. """
from os import path

from airflow import conf

P_DECLARATIONS = conf.get(section="airtunnel", key="declarations_folder")
P_DATA = conf.get(section="airtunnel", key="data_store_folder")
P_SCRIPTS = conf.get(section="airtunnel", key="scripts_folder")
P_SCRIPTS_SQL = path.join(P_SCRIPTS, "sql")
P_SCRIPTS_PY = path.join(P_SCRIPTS, "py")

# define data paths based on data store root:
P_DATA_ARCHIVE = path.join(P_DATA, "archive")
P_DATA_INGEST = path.join(P_DATA, "ingest")
P_DATA_READY = path.join(P_DATA, "ready")
P_DATA_STAGING = path.join(P_DATA, "staging")
P_DATA_STAGING_PICKEDUP = path.join(P_DATA_STAGING, "pickedup")
P_DATA_STAGING_READY = path.join(P_DATA_STAGING, "ready")
P_DATA_STAGING_INTERMEDIATE = path.join(P_DATA_STAGING, "intermediate")
P_DATA_INGEST_LANDING = path.join(P_DATA_INGEST, "landing")
P_DATA_INGEST_ARCHIVE = path.join(P_DATA_INGEST, "archive")
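As a usage illustration only (not part of the module above), the derived data-store folders could be created up front; _folder is just a hypothetical loop variable.

import os

# Hypothetical sketch: ensure the derived data-store folders exist.
for _folder in (P_DATA_ARCHIVE, P_DATA_READY, P_DATA_STAGING_PICKEDUP,
                P_DATA_STAGING_READY, P_DATA_STAGING_INTERMEDIATE,
                P_DATA_INGEST_LANDING, P_DATA_INGEST_ARCHIVE):
    os.makedirs(_folder, exist_ok=True)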
Example #9
def webserver(args):
    """Starts Airflow Webserver"""
    print(settings.HEADER)

    access_logfile = args.access_logfile or conf.get('webserver',
                                                     'access_logfile')
    error_logfile = args.error_logfile or conf.get('webserver',
                                                   'error_logfile')
    num_workers = args.workers or conf.get('webserver', 'workers')
    worker_timeout = (args.worker_timeout
                      or conf.get('webserver', 'web_server_worker_timeout'))
    ssl_cert = args.ssl_cert or conf.get('webserver', 'web_server_ssl_cert')
    ssl_key = args.ssl_key or conf.get('webserver', 'web_server_ssl_key')
    if not ssl_cert and ssl_key:
        raise AirflowException(
            'An SSL certificate must also be provided for use with ' + ssl_key)
    if ssl_cert and not ssl_key:
        raise AirflowException(
            'An SSL key must also be provided for use with ' + ssl_cert)

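    # Debug mode runs the Flask development server directly; otherwise the
    # gunicorn master process is launched and monitored below.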
    if args.debug:
        print("Starting the web server on port {0} and host {1}.".format(
            args.port, args.hostname))
        app, _ = create_app(None,
                            testing=conf.getboolean('core', 'unit_test_mode'))
        app.run(debug=True,
                use_reloader=not app.config['TESTING'],
                port=args.port,
                host=args.hostname,
                ssl_context=(ssl_cert,
                             ssl_key) if ssl_cert and ssl_key else None)
    else:
        os.environ['SKIP_DAGS_PARSING'] = 'True'
        app = cached_app(None)
        pid, stdout, stderr, log_file = setup_locations(
            "webserver", args.pid, args.stdout, args.stderr, args.log_file)
        os.environ.pop('SKIP_DAGS_PARSING')
        if args.daemon:
            handle = setup_logging(log_file)
            stdout = open(stdout, 'w+')
            stderr = open(stderr, 'w+')

        print(
            textwrap.dedent('''\
                Running the Gunicorn Server with:
                Workers: {num_workers} {workerclass}
                Host: {hostname}:{port}
                Timeout: {worker_timeout}
                Logfiles: {access_logfile} {error_logfile}
                =================================================================\
            '''.format(num_workers=num_workers,
                       workerclass=args.workerclass,
                       hostname=args.hostname,
                       port=args.port,
                       worker_timeout=worker_timeout,
                       access_logfile=access_logfile,
                       error_logfile=error_logfile)))

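        # Assemble the gunicorn command line from CLI arguments and airflow.cfg values.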
        run_args = [
            'gunicorn',
            '-w',
            str(num_workers),
            '-k',
            str(args.workerclass),
            '-t',
            str(worker_timeout),
            '-b',
            args.hostname + ':' + str(args.port),
            '-n',
            'airflow-webserver',
            '-p',
            str(pid),
            '-c',
            'python:airflow.www.gunicorn_config',
        ]

        if args.access_logfile:
            run_args += ['--access-logfile', str(args.access_logfile)]

        if args.error_logfile:
            run_args += ['--error-logfile', str(args.error_logfile)]

        if args.daemon:
            run_args += ['-D']

        if ssl_cert:
            run_args += ['--certfile', ssl_cert, '--keyfile', ssl_key]

        webserver_module = 'www'
        run_args += ["airflow." + webserver_module + ".app:cached_app()"]

        gunicorn_master_proc = None

        def kill_proc(dummy_signum, dummy_frame):  # pylint: disable=unused-argument
            gunicorn_master_proc.terminate()
            gunicorn_master_proc.wait()
            sys.exit(0)

        def monitor_gunicorn(gunicorn_master_proc):
            # These run forever until SIG{INT, TERM, KILL, ...} signal is sent
            if conf.getint('webserver', 'worker_refresh_interval') > 0:
                master_timeout = conf.getint('webserver',
                                             'web_server_master_timeout')
                restart_workers(gunicorn_master_proc, num_workers,
                                master_timeout)
            else:
                while gunicorn_master_proc.poll() is None:
                    time.sleep(1)

                sys.exit(gunicorn_master_proc.returncode)

        if args.daemon:
            base, ext = os.path.splitext(pid)
            ctx = daemon.DaemonContext(
                pidfile=TimeoutPIDLockFile(base + "-monitor" + ext, -1),
                files_preserve=[handle],
                stdout=stdout,
                stderr=stderr,
                signal_map={
                    signal.SIGINT: kill_proc,
                    signal.SIGTERM: kill_proc
                },
            )
            with ctx:
                subprocess.Popen(run_args, close_fds=True)

                # Reading pid file directly, since Popen#pid doesn't
                # seem to return the right value with DaemonContext.
                while True:
                    try:
                        with open(pid) as file:
                            gunicorn_master_proc_pid = int(file.read())
                            break
                    except OSError:
                        LOG.debug(
                            "Waiting for gunicorn's pid file to be created.")
                        time.sleep(0.1)

                gunicorn_master_proc = psutil.Process(gunicorn_master_proc_pid)
                monitor_gunicorn(gunicorn_master_proc)

            stdout.close()
            stderr.close()
        else:
            gunicorn_master_proc = subprocess.Popen(run_args, close_fds=True)

            signal.signal(signal.SIGINT, kill_proc)
            signal.signal(signal.SIGTERM, kill_proc)

            monitor_gunicorn(gunicorn_master_proc)
Example #10
class DeploymentView(BaseView):
    plugins_folder = conf.get("core", "plugins_folder")
    template_folder = os.path.join(plugins_folder, "deploy-plugin")
    repo = git.Repo(conf.get("core", "dags_folder"))
    route_base = "/deployment"

    def render(self, template, **context):
        return render_template(
            template,
            base_template=self.appbuilder.base_template,
            appbuilder=self.appbuilder,
            **context,
        )

    @expose("/status")
    @has_access
    @action_logging
    def list(self):
        title = "Deployment"
        data = dict()
        remotes = list()

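        # Record each remote and fetch it (pruning stale refs) so the branch
        # information rendered on the page is up to date.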
        for rem in self.repo.remotes:
            remotes.append((rem.name, rem.url))
            try:
                rem.fetch(prune=True)
            except GitCommandError as gexc:
                flash(str(gexc), "error")

        data["remotes"] = remotes
        data["active_branch"] = self.repo.active_branch.name
        data["sha"] = self.repo.head.object.hexsha
        data["commit_message"] = self.repo.head.object.message
        data["author"] = self.repo.head.object.author
        data["committed_date"] = datetime.fromtimestamp(
            self.repo.head.object.committed_date).strftime("%Y-%m-%d %H:%M:%S")
        data["local_branches"] = [brn.name for brn in self.repo.branches]
        remote_branches = [
            ref.name for ref in self.repo.remotes.origin.refs
            if "HEAD" not in ref.name
        ]

        form = GitBranchForm()
        form.git_branches.choices = [(brn, brn) for brn in remote_branches]

        return self.render_template("deploy.html",
                                    title=title,
                                    form=form,
                                    data=data)

    @expose("/deploy", methods=["POST"])
    @has_access
    @action_logging
    def deploy(self):

        new_branch = request.form.get("git_branches")
        new_local_branch = new_branch.replace("origin/", "")

        try:
            self.repo.git.checkout(new_local_branch)
            self.repo.git.pull()
            if new_local_branch == self.repo.active_branch.name:
                flash(f"Successfully updated branch: {new_local_branch}")
            else:
                flash(f"Successfully changed to branch: {new_local_branch}")
        except GitCommandError as gexc:
            flash(str(gexc), "error")
        return redirect("/deployment/status")
Example #11
class DAGDependenciesView(BaseView):
    dagbag = None
    plugins_folder = conf.get("core", "plugins_folder")
    template_folder = os.path.join(plugins_folder, "dag-dependencies-plugin")
    route_base = "/"
    refresh_interval = conf.getint(
        "dag_dependencies_plugin", "refresh_interval", fallback=300
    )
    last_refresh = datetime(2000, 1, 1)
    nodes = []
    edges = []

    def render(self, template, **context):
        return render_template(
            template,
            base_template=self.appbuilder.base_template,
            appbuilder=self.appbuilder,
            **context,
        )

    @expose("/dag-dependencies")
    @has_access
    def list(self):
        title = "DAG Dependencies"

        if DAGDependenciesView.dagbag is None:
            DAGDependenciesView.dagbag = models.DagBag(settings.DAGS_FOLDER)

        if datetime.utcnow() > self.last_refresh + timedelta(
            seconds=self.refresh_interval
        ):
            DAGDependenciesView.dagbag.collect_dags()
            self.nodes, self.edges = self._generate_graph()
            self.last_refresh = datetime.utcnow()

        return self.render_template(
            "dag_dependencies.html",
            title=title,
            nodes=self.nodes,
            edges=self.edges,
            last_refresh=self.last_refresh.strftime("%Y-%m-%d %H:%M:%S"),
            arrange=conf.get("webserver", "dag_orientation"),
            width=request.args.get("width", "100%"),
            height=request.args.get("height", "800"),
        )

    @staticmethod
    def _generate_graph():
        nodes = {}
        edges = []

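        # One node per DAG; TriggerDagRunOperator and ExternalTaskSensor tasks
        # become extra nodes with edges that link the dependent DAGs.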
        for dag_id, dag in DAGDependenciesView.dagbag.dags.items():
            dag_node_id = "d--" + dag_id
            nodes[dag_node_id] = DAGDependenciesView._node_dict(
                dag_node_id, dag_id, "fill: rgb(232, 247, 228)"
            )

            for task in dag.tasks:
                task_node_id = "t--" + dag_id + "--" + task.task_id
                if isinstance(task, TriggerDagRunOperator):
                    nodes[task_node_id] = DAGDependenciesView._node_dict(
                        task_node_id, task.task_id, "fill: rgb(255, 239, 235)"
                    )

                    edges.append({"u": dag_node_id, "v": task_node_id})
                    edges.append({"u": task_node_id, "v": "d--" + task.trigger_dag_id})
                elif isinstance(task, ExternalTaskSensor):
                    nodes[task_node_id] = DAGDependenciesView._node_dict(
                        task_node_id, task.task_id, "fill: rgb(230, 241, 242)"
                    )

                    edges.append({"u": task_node_id, "v": dag_node_id})
                    edges.append({"u": "d--" + task.external_dag_id, "v": task_node_id})

            implicit = getattr(dag, "implicit_dependencies", None)
            if isinstance(implicit, list):
                for dep in implicit:
                    dep_node_id = "i--" + dag_id + "--" + dep
                    nodes[dep_node_id] = DAGDependenciesView._node_dict(
                        dep_node_id, "implicit", "fill: gold"
                    )

                    edges.append({"u": dep_node_id, "v": dag_node_id})
                    edges.append({"u": "d--" + dep, "v": dep_node_id})

        return list(nodes.values()), edges

    @staticmethod
    def _node_dict(node_id, label, style):
        return {
            "id": node_id,
            "value": {"label": label, "style": style, "rx": 5, "ry": 5},
        }
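Views like DeploymentView and DAGDependenciesView are typically exposed to the Airflow webserver through an AirflowPlugin. The registration below is only a sketch; the plugin class name and menu categories are assumptions, not taken from the code above.

from airflow.plugins_manager import AirflowPlugin


class DeployAndDependenciesPlugin(AirflowPlugin):  # hypothetical plugin name
    name = "deploy_and_dependencies_plugin"
    appbuilder_views = [
        {"name": "Deployment", "category": "Admin", "view": DeploymentView()},
        {"name": "DAG Dependencies", "category": "Browse", "view": DAGDependenciesView()},
    ]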
Example #12
def serve_logs(filename):  # pylint: disable=unused-variable, redefined-outer-name
    # Serve a single log file from the local BASE_LOG_FOLDER over HTTP.
    log = os.path.expanduser(conf.get('core', 'BASE_LOG_FOLDER'))
    return flask.send_from_directory(log,
                                     filename,
                                     mimetype="application/json",
                                     as_attachment=False)