Example #1
def get_exported_table_df(table_name):
    """Retrieve exported table file on GCS.

    Args:
        table_name (string): Name of the table to load.

    Returns:
        pandas.DataFrame

    """

    bucket = storage\
        .Client(get_config('gcp_project_name'))\
        .get_bucket(get_config('gcs_bucket_name'))
    key = \
        '{experiment_name}/exported_tables/{table_name}/' \
        '{date_descriptor}/out.csv.gzip'.format(
            experiment_name=get_config('experiment_name'),
            table_name=table_name,
            date_descriptor='{{ ds_nodash }}')
    blob = storage.Blob(key, bucket)
    bio = io.BytesIO()
    blob.download_to_file(bio)
    bio.seek(0)

    return pd.read_csv(bio, compression='gzip')
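
A minimal usage sketch (hypothetical; `events` is an illustrative table name, and the `{{ ds_nodash }}` placeholder in the key is only substituted when Airflow renders it):

# Hypothetical usage: read an exported table back into a DataFrame,
# e.g. from inside an operator that consumes the daily export.
events_df = get_exported_table_df('events')
print(events_df.shape)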
Example #2
def get_kubernetes_pod_operator(operator_name=None,
                                operator_image=None,
                                cmds=['python', 'main.py'],
                                env_vars=None,
                                dag=None,
                                image_tag=None):
    """Get templated KubernetesPodOperator.

    Intended to be used with your own implementations of Kubernetes pod operators
    bootstrapped using the `new_pod_operator` command.

    Args:
        operator_name (string): Name of the operator, e.g. `train-operator`.
            Mutually exclusive with the `operator_image` param. Defaults to None.
        operator_image (string): Full name of the operator image,
            e.g. `gcr.io/my-project/my-experiment_train-operator`.
            Mutually exclusive with the `operator_name` param. Defaults to None.
        cmds (list[str]): Command overrides for the pod.
        env_vars (dict): Environment variable overrides for the pod.
        dag (airflow.models.DAG): DAG used by context_manager, e.g. `with get_dag() as dag: get_kubernetes_pod_operator(..., dag=dag)`. Defaults to None.
        image_tag (string): Docker image tag to use. Defaults to `LATEST`.

    Returns:
        airflow.contrib.operators.kubernetes_pod_operator.KubernetesPodOperator
    """

    if dag is None:
        logger.warning(
            'No DAG context was found. The operator may not be associated '
            'with any DAG and may not appear in the Web UI.'
        )

    if (operator_name is None and operator_image is None) or \
            (operator_name is not None and operator_image is not None):
        raise Exception(
            'You need to specify exactly one of the `operator_name` or '
            '`operator_image` params'
        )
    elif operator_name is not None:
        image = 'gcr.io/{gcp_project_name}/{experiment_name}_{operator_name}'\
            .format(
                gcp_project_name=get_config('gcp_project_name'),
                experiment_name=get_config('experiment_name'),
                operator_name=operator_name)
    elif operator_image is not None:
        image = operator_image

    image_tag = image_tag or 'LATEST'

    return KubernetesPodOperator(
        dag=dag or models._CONTEXT_MANAGER_DAG,
        task_id='{experiment_name}_{operator_name}'.format(
            experiment_name=get_config('experiment_name').replace('-', '_'),
            operator_name=operator_name.replace('-', '_')),
        name='{experiment_name}__{operator_name}'.format(
            experiment_name=get_config('experiment_name').replace('_', '-'),
            operator_name=operator_name.replace('_', '-')),
        namespace='default',
        # Parameterize tags
        image='{image}:{image_tag}'.format(image=image, image_tag=image_tag),
        image_pull_policy='Always',
        cmds=cmds,
        env_vars=env_vars,
        startup_timeout_seconds=3600)
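
A usage sketch (hypothetical; `train-operator` is an illustrative name for an operator bootstrapped with `new_pod_operator`, and `get_dag` is the DAG factory shown in Example #10):

# Hypothetical usage: run a bootstrapped pod operator inside the experiment DAG.
# The image resolves to gcr.io/<gcp_project_name>/<experiment_name>_train-operator.
with get_dag() as dag:
    train = get_kubernetes_pod_operator(
        operator_name='train-operator',
        env_vars={'DATE': '{{ ds_nodash }}'},
        dag=dag)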
Example #3
def get_maybe_create_dataset_operator(
        dag=None,
        table_expiration_seconds=2678400,  # 60 * 60 * 24 * 31
        partition_expiration_seconds=2678400):
    """Get templated BigQueryOperator.

    Args:
        dag (airflow.models.DAG): DAG used by context_manager. e.g. `with get_dag() as dag: get_bq_to_bq_operator(..., dag=dag)`. Defaults to None.
        table_expiration_seconds (int): Default expiration time (in seconds) for tables in the created dataset.
        partition_expiration_seconds (int): Default expiration time (in seconds) for partitions in the created dataset.

    Returns:
        BigQueryMaybeCreateEmptyDatasetOperator

    """
    dag = dag or models._CONTEXT_MANAGER_DAG
    dataset_name = '{experiment_name}_database'.format(
        experiment_name=get_config('experiment_name'))

    return BigQueryMaybeCreateEmptyDatasetOperator(
        dag=dag,
        task_id='{experiment_name}.create_dataset'.format(
            experiment_name=get_config('experiment_name')),
        project_id=get_config('gcp_project_name'),
        dataset_id=dataset_name,
        dataset_reference={
            "description":
                "Dataset for experiment {experiment_name}."
                " Auto generated by fuga.".format(
                    experiment_name=get_config('experiment_name')),
            "defaultTableExpirationMs":
                str(table_expiration_seconds * 1000),
            "defaultPartitionExpirationMs":
                str(partition_expiration_seconds * 1000)
        })
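
A usage sketch (hypothetical; the 7-day retention values are illustrative):

# Hypothetical usage: ensure the experiment dataset exists with 7-day
# table/partition retention instead of the 31-day default.
with get_dag() as dag:
    create_dataset = get_maybe_create_dataset_operator(
        dag=dag,
        table_expiration_seconds=60 * 60 * 24 * 7,
        partition_expiration_seconds=60 * 60 * 24 * 7)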
Example #4
File: airflow.py Project: ayemos/fuga
def get_bq_to_bq_operator(
        sql_or_filename,
        dst_table_name,
        dag=None,
        params=None,
        table_expiration_seconds=None,
        partition_expiration_seconds=None):
    """Get templated BigQueryOperator.

    Args:
        sql_or_filename (string): Valid SQL statement or a path to a SQL file.
            It can be templated using Jinja in either case.
        dst_table_name (string): Name of the destination table. A `{{ ds_nodash }}`
            date suffix is appended automatically.
        dag (airflow.models.DAG): DAG used by context_manager, e.g. `with get_dag() as dag: get_bq_to_bq_operator(..., dag=dag)`. Defaults to None.
        params (dict): Extra Jinja parameters passed through to the operator.
        table_expiration_seconds (int): Accepted but currently unused.
        partition_expiration_seconds (int): Accepted but currently unused.
    Returns:
        airflow.contrib.operators.bigquery_operator.BigQueryOperator

    """
    dag = dag or models._CONTEXT_MANAGER_DAG
    if dag is None:
        logger.warning(
            'No DAG context was found. The operator may not be associated '
            'with any DAG and may not appear in the Web UI.')

    dst_table_name_with_date_descriptor = \
        '{table_name}{date_descriptor}'.format(
            table_name=dst_table_name,
            date_descriptor='{{ ds_nodash }}')

    dataset_name = '{experiment_name}_database'.format(
        experiment_name=get_config('experiment_name'))

    return BigQueryOperator(
        dag=dag,
        task_id='{experiment_name}.{table_name}.bq_to_bq'.format(
            experiment_name=get_config('experiment_name'),
            table_name=dst_table_name),
        sql=sql_or_filename,
        use_legacy_sql=False,
        write_disposition="WRITE_TRUNCATE",
        destination_dataset_table="{gcp_project_name}:{dataset_name}.{table_name}".format(
            gcp_project_name=get_config('gcp_project_name'),
            dataset_name=dataset_name,
            table_name=dst_table_name_with_date_descriptor),
        params=params)
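
A usage sketch (hypothetical; the SQL file path and destination table name are illustrative):

# Hypothetical usage: materialize a daily `users` table from a Jinja-templated
# SQL file; the {{ ds_nodash }} suffix is appended to the destination table.
with get_dag() as dag:
    build_users = get_bq_to_bq_operator('sql/users.sql', 'users', dag=dag)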
Example #5
def get_export_table_operator(table_name, dag=None):
    """Get templated BigQueryToCloudStorageOperator.

    Args:
        table_name (string): Name of the table to export.
        dag (airflow.models.DAG): DAG used by context_manager. e.g. `with get_dag() as dag: get_export_table_operator(..., dag=dag)`. Defaults to None.

    Returns:
        airflow.contrib.operators.bigquery_to_gcs.BigQueryToCloudStorageOperator

    """
    if dag is None:
        logger.warning(
            'No DAG context was found. The operator may not be associated '
            'with any DAG and may not appear in the Web UI.'
        )

    date_descriptor = '{{ ds_nodash }}'
    table_name_with_date_descriptor = \
        '{table_name}{date_descriptor}'.format(
            table_name=table_name,
            date_descriptor=date_descriptor)

    return BigQueryToCloudStorageOperator(
        dag=dag or models._CONTEXT_MANAGER_DAG,
        task_id='{experiment_name}.{table_name}.export'.format(
            experiment_name=get_config('experiment_name'),
            table_name=table_name),
        source_project_dataset_table='{gcp_project_name}.{database_name}.{table_name}'.format(
            gcp_project_name=get_config('gcp_project_name'),
            database_name='%s_database' % get_config('experiment_name'),
            table_name=table_name_with_date_descriptor),
        # TODO: Support data larger than 1GB
        # https://cloud.google.com/bigquery/exporting-data-from-bigquery#exportingmultiple
        destination_cloud_storage_uris=[
            'gs://{bucket_name}/{experiment_name}/exported_tables/'
            '{table_name}/{date_descriptor}/'
            'out.csv.gzip'.format(
                bucket_name=get_config('bucket_name'),
                experiment_name=get_config('experiment_name'),
                date_descriptor=date_descriptor,
                table_name=table_name)
        ],
        compression="GZIP")
Example #6
    def run(self):
        experiment_root_dir = find_experiment_root_dir()
        experiment = Experiment.from_path(experiment_root_dir)
        storage_client = storage.Client()
        composer_client = composer.Client()
        environment = composer_client.get_environment(
            get_config('composer_environment_full_path'))

        if environment.state != 'RUNNING':
            raise Exception(
                'Composer environment %s is in invalid state %s.\n'
                'You need to wait until the environment is running '
                'or fix it if it\'s broken.' % (
                    environment.name,
                    environment.state))

        if not environment.config.get('dagGcsPrefix', None):
            raise Exception(
                'Missing dagGcsPrefix config with environment %s.\n'
                'The environment may be in an invalid state or '
                'failed to launch.' % (
                    environment.name))

        bucket_url = urlparse(environment.config['dagGcsPrefix'])
        bucket_name = bucket_url.netloc
        bucket_prefix = bucket_url.path[1:]  # strip the leading slash
        bucket = storage_client.get_bucket(bucket_name)

        pairs = []
        for target in ['py', 'sql', 'pod_operators']:
            for local_path in glob.iglob(
                    os.path.join(experiment_root_dir, target, '**/*'),
                    recursive=True):
                if self._is_ignored(local_path):
                    continue
                remote_path = os.path.join(
                    bucket_prefix,
                    experiment.name,
                    local_path[len(experiment_root_dir) + 1:])
                pairs.append((local_path, remote_path))

        click.echo('''
The following files are going to be uploaded to GCS bucket %s
''' % bucket_name)
        click.echo(
            '\n'.join(
                '\t%s to %s'
                % (l, r) for l, r in pairs))

        click.echo('')
        click.confirm('Do you want to continue?', abort=True)

        for l, r in pairs:
            new_blob = bucket.blob(r)
            new_blob.upload_from_filename(l)
Example #7
File: environment.py Project: ayemos/fuga
    def run(self):
        config_overrides = {}
        if get_config('gcp_project_id'):
            resource_manager_client = resource_manager.Client()
            project = resource_manager_client.fetch_project(
                get_config('gcp_project_id'))
        else:
            project = self._setup_gcp_project()
            config_overrides['gcp_project_id'] = project.project_id

        if get_config('gcs_bucket_name'):
            storage_client = storage.Client(project=project.project_id)
            bucket = storage_client.get_bucket(get_config('gcs_bucket_name'))
        else:
            bucket = self._setup_gcs_bucket(project)
            config_overrides['gcs_bucket_name'] = bucket.name

        if get_config('composer_environment_full_path'):
            composer_client = composer.Client(project=project.project_id)
            environment = composer_client.get_environment(
                get_config('composer_environment_full_path'))
        else:
            environment = self._setup_composer_environment(
                project,
                location=bucket.location.lower())  # XXX: make it configurable
            config_overrides['composer_environment_full_path'] = \
                environment.full_path

        # Overwrite configurations
        for k, v in config_overrides.items():
            write_config(k, v)

        click.echo(
            'fuga environment is initialized. Now you can proceed to create '
            'experiments by running `fuga experiment new`')
Example #8
def find_or_clone_cookiecutter_template(
        template_name=_FUGA_DEFAULT_TEMPLATE_NAME):
    template = find_cookiecutter_template(template_name)
    if template is not None:
        return template

    click.confirm(
        'Could not find fuga experiment template with name %s.\n'
        'Do you want to clone it?' % template_name,
        abort=True)

    local_git_dir = os.path.join(get_config('cookiecutters_dir'),
                                 template_name)
    remote_git_repo_name = f'git://github.com/{_FUGA_GITHUB_ORG_NAME}/{template_name}.git'
    git.Git(local_git_dir).clone(remote_git_repo_name)
    # Look the template up again so the freshly cloned copy is returned
    return find_cookiecutter_template(template_name)
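
A usage sketch (hypothetical):

# Hypothetical usage: ensure the default fuga experiment template is
# available locally, cloning it from GitHub on first use.
template = find_or_clone_cookiecutter_template()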
Example #9
def save_df(df, name):  # XXX: Function name!!
    """Save dataframe to GCS.

    Args:
        df (pandas.DataFrame): Dataframe to save.
        name (string): Name used for the CSV file on GCS.

    Returns:
        key (string): Key of the dataframe blob saved to GCS.

    """
    bucket = storage.Client(get_config('gcp_project_name')) \
        .get_bucket(get_config('bucket_name'))
    key = '{experiment_name}/output/{date_descriptor}/{name}.csv' \
        .format(
            experiment_name=get_config('experiment_name'),
            date_descriptor='{{ ds_nodash }}',
            name=name)

    blob = bucket.blob(key)

    bio = io.BytesIO()
    # DataFrame.to_csv returns a str when no buffer is given; encode it so
    # the blob can be uploaded as bytes.
    bio.write(df.to_csv().encode('utf-8'))
    blob.upload_from_file(bio, rewind=True)
    return key
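
A usage sketch (hypothetical; the DataFrame content is illustrative, and note that the `{{ ds_nodash }}` in the key stays a literal unless rendered by Airflow's templating):

# Hypothetical usage: persist a result DataFrame to GCS and keep its key.
import pandas as pd

scores = pd.DataFrame({'user_id': [1, 2], 'score': [0.9, 0.1]})
key = save_df(scores, 'scores')
print('Saved to %s' % key)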
Example #10
File: airflow.py Project: ayemos/fuga
def get_dag(start_date=None, schedule_interval=None, **xargs):
    default_args = {
        'start_date': start_date or dt.datetime.today(),
        'retries': 1,
        'email_on_failure': True}
    if models.Variable.get("notification_email_address", None) is not None:
        default_args['email'] = models.Variable.get("notification_email_address")
    default_dag_args = dict(itertools.chain(
        default_args.items(),
        xargs.items()))

    return models.DAG(
        '{experiment_name}_dag'.format(
            experiment_name=get_config('experiment_name')),
        schedule_interval=schedule_interval or dt.timedelta(days=1),
        default_args=default_dag_args,
        catchup=False)
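
A usage sketch tying the factories together (hypothetical; the SQL path and table name are illustrative, and the DAG is used as a context manager as the docstrings above suggest):

# Hypothetical end-to-end wiring: create the dataset, build a daily table,
# then export it to GCS.
import datetime as dt

with get_dag(start_date=dt.datetime(2019, 1, 1)) as dag:
    create_dataset = get_maybe_create_dataset_operator(dag=dag)
    build_users = get_bq_to_bq_operator('sql/users.sql', 'users', dag=dag)
    export_users = get_export_table_operator('users', dag=dag)
    create_dataset >> build_users >> export_users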
Example #11
    def run(self,
            operator_name,
            dockerfile='./Dockerfile',
            image_name=None,
            version_tag=None,
            dryrun=False,
            remote_container_repo=None):
        import docker
        client = docker.from_env()
        experiment = Experiment.from_path(find_experiment_root_dir())

        build_path = os.path.join(experiment.root_path, OPERATOR_DIR_PREFIX,
                                  operator_name)
        if not os.path.isdir(build_path):
            raise Exception(f'{build_path} is not a directory')

        try:
            repo = Repo(find_experiment_root_dir())
            if len(repo.head.commit.diff(None)) > 0 \
                    or len(repo.untracked_files) > 0:
                import sys
                click.echo('The current Git working tree has either '
                           'untracked files or a diff to HEAD. '
                           'Please commit your changes/new files before '
                           'any deployment.')
                sys.exit(0)
            version_hash = repo.head.commit.hexsha
        except InvalidGitRepositoryError:
            raise Exception(
                f'Experiment directory ({experiment.root_path}) needs to '
                'be a valid Git repository.\n'
                'Run `git init` in your experiment root')
        except ValueError as e:
            raise Exception(
                f'ValueError ({e}) has occurred.\n'
                'The current Git repository might not have any commits.')

        image_name = image_name or f'{experiment.name}__{operator_name}'
        version_tag = version_tag or version_hash
        click.echo(f'Building docker image {image_name}:{version_tag}')
        image, _logs = client.images.build(path=build_path,
                                           dockerfile=dockerfile,
                                           tag=f'{image_name}:{version_tag}')
        click.echo('Done')

        remote_container_repo = remote_container_repo or \
            os.path.join(
                DEFAULT_REMOTE_CONTAINER_REPO_HOST,
                get_config('gcp_project_id'))
        remote_tag = os.path.join(remote_container_repo,
                                  f'{image_name}:{version_tag}')
        image.tag(remote_tag)
        latest_tag = os.path.join(remote_container_repo,
                                  f'{image_name}:LATEST')
        image.tag(latest_tag)
        click.echo(f'Pushing images to {remote_container_repo}')
        click.echo('\t' + remote_tag)
        click.echo('\t' + latest_tag)
        client.images.push(remote_tag)
        client.images.push(latest_tag)
        click.echo('Done')