import gcsfs
import pandas as pd

# get_bucket, read_gcfs, and save_to_gcfs are helpers provided by the
# surrounding project (their imports are not shown here).


def main(execution_date, **kwargs):
    # TODO: remove hard-coded project string
    fs = gcsfs.GCSFileSystem(project="cal-itp-data-infra")
    bucket = get_bucket()

    f = read_gcfs(f"schedule/{execution_date}/status.csv")
    status = pd.read_csv(f)

    success = status[lambda d: d.status == "success"]

    gtfs_files = []
    for ii, row in success.iterrows():
        agency_folder = f"{row.itp_id}_{row.url_number}"
        gtfs_url = f"{bucket}/schedule/{execution_date}/{agency_folder}/*"

        gtfs_files.append(fs.glob(gtfs_url))

    res = (
        success[["itp_id", "url_number"]]
        .assign(gtfs_file=gtfs_files)
        .explode("gtfs_file")
        .loc[lambda d: d.gtfs_file != "processed"]
    )

    save_to_gcfs(
        res.to_csv(index=False).encode(),
        f"schedule/{execution_date}/processed/files.csv",
        use_pipe=True,
    )
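# --- Example (illustrative only) ---
# A minimal sketch of how a callable like `main` could be wired into a DAG
# with a PythonOperator. The dag_id, task_id, and schedule below are
# assumptions, and this assumes an Airflow version where `op_kwargs` is
# Jinja-templated so "{{ ds }}" renders to the execution date string.
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.utils.dates import days_ago

example_dag = DAG(
    dag_id="example_gtfs_schedule",  # hypothetical dag_id
    start_date=days_ago(1),
    schedule_interval=None,
)

list_processed_files = PythonOperator(
    task_id="list_processed_files",  # hypothetical task_id
    python_callable=main,
    op_kwargs={"execution_date": "{{ ds }}"},  # rendered by Jinja at run time
    dag=example_dag,
)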
def execute(self, context):
    dst_table_name = format_table_name(self.dst_table_name, is_staging=True)

    bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id)

    conn = bq_hook.get_conn()
    cursor = conn.cursor()

    bucket = get_bucket()
    src_uris = f"{bucket}/{self.src_uris}"

    cursor.run_load(
        dst_table_name,
        source_uris=src_uris,
        schema_fields=self.schema_fields,
        autodetect=self.autodetect,
        skip_leading_rows=self.skip_leading_rows,
        write_disposition=self.write_disposition,
    )
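# --- Sketch (assumed context) ---
# The execute() method above reads several attributes off `self`, so it lives
# on a custom operator. A plausible enclosing class is sketched below; the
# class name `StageToBigQueryOperator` and the default argument values are
# assumptions, not taken from the project.
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults


class StageToBigQueryOperator(BaseOperator):
    # allow Jinja templating of the source path (e.g. {{ execution_date }})
    template_fields = ("src_uris",)

    @apply_defaults
    def __init__(
        self,
        src_uris,
        dst_table_name,
        schema_fields=None,
        autodetect=True,
        skip_leading_rows=1,
        write_disposition="WRITE_TRUNCATE",
        bigquery_conn_id="bigquery_default",
        *args,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        self.src_uris = src_uris
        self.dst_table_name = dst_table_name
        self.schema_fields = schema_fields
        self.autodetect = autodetect
        self.skip_leading_rows = skip_leading_rows
        self.write_disposition = write_disposition
        self.bigquery_conn_id = bigquery_conn_id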
def __new__(
    cls,
    parent_id,
    gcs_dirs_xcom,
    dst_dir,
    filename,
    schema_fields,
    table_name,
    task_id,
    dag,
):
    from airflow.utils.dates import days_ago

    args = {
        "start_date": days_ago(2),
    }

    bucket = get_bucket().replace("gs://", "", 1)
    full_table_name = format_table_name(table_name, is_staging=True)

    subdag = DAG(dag_id=f"{parent_id}.{task_id}", default_args=args)

    column_names = [schema["name"] for schema in schema_fields]

    # by convention, preface task names with dag_id
    op_col_select = PythonTaskflowOperator(
        task_id="select_cols",
        python_callable=_keep_columns,
        # note that this input should have the form schedule/{execution_date}/...
        taskflow={"gcs_dirs": {"dag_id": parent_id, "task_ids": gcs_dirs_xcom}},
        op_kwargs={
            "dst_dir": dst_dir,
            "filename": filename,
            "required_cols": [],
            "optional_cols": column_names,
        },
        dag=subdag,
    )

    op_stage_bq = GoogleCloudStorageToBigQueryOperator(
        task_id="stage_bigquery",
        bucket=bucket,
        # note that we can't really pull a list out of xcom without subclassing
        # operators, so we rely on knowing that the task passing in
        # gcs_dirs_xcom data is using schedule/{execution_date}
        source_objects=["schedule/{{execution_date}}/*/%s/%s" % (dst_dir, filename)],
        schema_fields=schema_fields,
        destination_project_dataset_table=full_table_name,
        create_disposition="CREATE_IF_NEEDED",
        write_disposition="WRITE_TRUNCATE",
        # _keep_columns function includes headers in output
        skip_leading_rows=1,
        dag=subdag,
    )

    op_col_select >> op_stage_bq

    return SubDagOperator(subdag=subdag, dag=dag, task_id=task_id)
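# --- Example (illustrative only) ---
# Sketch of how a factory class built around the __new__ above might be called
# from a parent DAG; it returns a SubDagOperator wired into `dag`. The class
# name `CsvToBigQuerySubDag`, the parent dag_id, the xcom task id, and the
# schema fields are all hypothetical.
stage_agency = CsvToBigQuerySubDag(
    parent_id="gtfs_schedule_pipeline",       # must match the parent DAG's dag_id
    gcs_dirs_xcom="download_schedule_feeds",  # task whose xcom holds the GCS dirs
    dst_dir="agency",
    filename="agency.txt",
    schema_fields=[
        {"name": "agency_id", "type": "STRING"},
        {"name": "agency_name", "type": "STRING"},
    ],
    table_name="gtfs_schedule.agency",
    task_id="stage_agency",
    dag=parent_dag,  # the enclosing airflow.DAG object (hypothetical)
)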