import gcsfs
import pandas as pd

# get_bucket, read_gcfs, and save_to_gcfs are helpers provided by the
# surrounding project (their imports are not shown here).


def main(execution_date, **kwargs):
    # TODO: remove hard-coded project string
    fs = gcsfs.GCSFileSystem(project="cal-itp-data-infra")
    bucket = get_bucket()

    f = read_gcfs(f"schedule/{execution_date}/status.csv")
    status = pd.read_csv(f)

    success = status[lambda d: d.status == "success"]

    gtfs_files = []
    for ii, row in success.iterrows():
        agency_folder = f"{row.itp_id}_{row.url_number}"
        gtfs_url = f"{bucket}/schedule/{execution_date}/{agency_folder}/*"

        gtfs_files.append(fs.glob(gtfs_url))

    res = (
        success[["itp_id", "url_number"]]
        .assign(gtfs_file=gtfs_files)
        .explode("gtfs_file")
        .loc[lambda d: d.gtfs_file != "processed"]
    )

    save_to_gcfs(
        res.to_csv(index=False).encode(),
        f"schedule/{execution_date}/processed/files.csv",
        use_pipe=True,
    )
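# --- Example (illustrative only) ---
# A minimal sketch of how a callable like `main` could be wired into a DAG
# with a PythonOperator. The dag_id, task_id, and schedule below are
# assumptions, and this assumes an Airflow version where `op_kwargs` is
# Jinja-templated so "{{ ds }}" renders to the execution date string.
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.utils.dates import days_ago

example_dag = DAG(
    dag_id="example_gtfs_schedule",  # hypothetical dag_id
    start_date=days_ago(1),
    schedule_interval=None,
)

list_processed_files = PythonOperator(
    task_id="list_processed_files",  # hypothetical task_id
    python_callable=main,
    op_kwargs={"execution_date": "{{ ds }}"},  # rendered by Jinja at run time
    dag=example_dag,
)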
def execute(self, context):
    dst_table_name = format_table_name(self.dst_table_name, is_staging=True)

    bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id)

    conn = bq_hook.get_conn()
    cursor = conn.cursor()

    bucket = get_bucket()
    src_uris = f"{bucket}/{self.src_uris}"

    cursor.run_load(
        dst_table_name,
        source_uris=src_uris,
        schema_fields=self.schema_fields,
        autodetect=self.autodetect,
        skip_leading_rows=self.skip_leading_rows,
        write_disposition=self.write_disposition,
    )
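# --- Sketch (assumed context) ---
# The execute() method above reads several attributes off `self`, so it lives
# on a custom operator. A plausible enclosing class is sketched below; the
# class name `StageToBigQueryOperator` and the default argument values are
# assumptions, not taken from the project.
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults


class StageToBigQueryOperator(BaseOperator):
    # allow Jinja templating of the source path (e.g. {{ execution_date }})
    template_fields = ("src_uris",)

    @apply_defaults
    def __init__(
        self,
        src_uris,
        dst_table_name,
        schema_fields=None,
        autodetect=True,
        skip_leading_rows=1,
        write_disposition="WRITE_TRUNCATE",
        bigquery_conn_id="bigquery_default",
        *args,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        self.src_uris = src_uris
        self.dst_table_name = dst_table_name
        self.schema_fields = schema_fields
        self.autodetect = autodetect
        self.skip_leading_rows = skip_leading_rows
        self.write_disposition = write_disposition
        self.bigquery_conn_id = bigquery_conn_id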
def __new__(
    cls,
    parent_id,
    gcs_dirs_xcom,
    dst_dir,
    filename,
    schema_fields,
    table_name,
    task_id,
    dag,
):
    from airflow.utils.dates import days_ago

    args = {
        "start_date": days_ago(2),
    }

    bucket = get_bucket().replace("gs://", "", 1)
    full_table_name = format_table_name(table_name, is_staging=True)

    subdag = DAG(dag_id=f"{parent_id}.{task_id}", default_args=args)

    column_names = [schema["name"] for schema in schema_fields]

    # by convention, preface task names with dag_id
    op_col_select = PythonTaskflowOperator(
        task_id="select_cols",
        python_callable=_keep_columns,
        # note that this input should have the form schedule/{execution_date}/...
        taskflow={"gcs_dirs": {"dag_id": parent_id, "task_ids": gcs_dirs_xcom}},
        op_kwargs={
            "dst_dir": dst_dir,
            "filename": filename,
            "required_cols": [],
            "optional_cols": column_names,
        },
        dag=subdag,
    )

    op_stage_bq = GoogleCloudStorageToBigQueryOperator(
        task_id="stage_bigquery",
        bucket=bucket,
        # note that we can't really pull a list out of xcom without subclassing
        # operators, so we rely on knowing that the task passing in
        # gcs_dirs_xcom data is using schedule/{execution_date}
        source_objects=["schedule/{{execution_date}}/*/%s/%s" % (dst_dir, filename)],
        schema_fields=schema_fields,
        destination_project_dataset_table=full_table_name,
        create_disposition="CREATE_IF_NEEDED",
        write_disposition="WRITE_TRUNCATE",
        # _keep_columns function includes headers in output
        skip_leading_rows=1,
        dag=subdag,
    )

    op_col_select >> op_stage_bq

    return SubDagOperator(subdag=subdag, dag=dag, task_id=task_id)
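# --- Example (illustrative only) ---
# Sketch of how a factory class built around the __new__ above might be called
# from a parent DAG; it returns a SubDagOperator wired into `dag`. The class
# name `CsvToBigQuerySubDag`, the parent dag_id, the xcom task id, and the
# schema fields are all hypothetical.
stage_agency = CsvToBigQuerySubDag(
    parent_id="gtfs_schedule_pipeline",       # must match the parent DAG's dag_id
    gcs_dirs_xcom="download_schedule_feeds",  # task whose xcom holds the GCS dirs
    dst_dir="agency",
    filename="agency.txt",
    schema_fields=[
        {"name": "agency_id", "type": "STRING"},
        {"name": "agency_name", "type": "STRING"},
    ],
    table_name="gtfs_schedule.agency",
    task_id="stage_agency",
    dag=parent_dag,  # the enclosing airflow.DAG object (hypothetical)
)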