def execute(self, context):
    df = airtable_to_df(
        self.air_base_id,
        self.air_table_name,
        self.id_name,
        self.rename_fields,
        self.column_prefix,
        self.api_key,
    )

    if self.table_name:
        print(f"Writing table with shape: {df.shape}")
        write_table(df, self.table_name)

    if self.gcs_path:
        clean_gcs_path = re.sub(r"\/+$", "", self.gcs_path)
        gcs_file = (
            f"{clean_gcs_path}/{context['execution_date']}/{self.table_name}.csv"
        )
        print(f"Uploading to gcs at {gcs_file}")
        save_to_gcfs(df.to_csv(index=False).encode(), gcs_file, use_pipe=True)
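# The execute method above reads attributes set on the operator instance. As
# context, here is a minimal sketch of how such an operator might be declared;
# the class name and constructor defaults are assumptions for illustration only,
# not the actual operator definition in this repo.
from airflow.models import BaseOperator


class AirtableToWarehouseOperatorSketch(BaseOperator):
    def __init__(
        self,
        air_base_id,
        air_table_name,
        id_name=None,
        rename_fields=None,
        column_prefix=None,
        api_key=None,
        table_name=None,
        gcs_path=None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        # store constructor args so execute() can reference them via self
        self.air_base_id = air_base_id
        self.air_table_name = air_table_name
        self.id_name = id_name
        self.rename_fields = rename_fields
        self.column_prefix = column_prefix
        self.api_key = api_key
        self.table_name = table_name
        self.gcs_path = gcs_path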
def validation_notice_fields():
    bucket = get_bucket()

    print(f"Globbing: {bucket}/schedule/processed/*/validation_report.json")

    fs = get_fs()
    reports = fs.glob(f"{bucket}/schedule/processed/*/validation_report.json")

    code_fields = defaultdict(lambda: set())

    print(f"Iterating through {len(reports)} reports")
    for fname in reports:
        report = json.load(fs.open(fname))

        # one entry per code (e.g. the code: invalid phone number)
        for notice in report["notices"]:
            # one entry per specific code violation (e.g. each invalid phone number)
            for entry in notice["notices"]:
                # map each code to the fields in its notice
                # (e.g. duplicate_route_name has a duplicatedField field)
                for field_name, value in entry.items():
                    if isinstance(value, dict):
                        # handle the few cases where there's one level of nesting
                        sub_fields = [field_name + "." + v for v in value]
                        code_fields[notice["code"]].update(sub_fields)
                    else:
                        # handle the common case of no sub-objects
                        code_fields[notice["code"]].update(entry.keys())

    validation_json_fields = pd.DataFrame(
        {
            "code": code_fields.keys(),
            "field": list(map(list, code_fields.values())),
        }
    ).explode("field")

    write_table(
        validation_json_fields, "gtfs_schedule_history.validation_notice_fields"
    )
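# A minimal sketch of the validation_report.json shape that
# validation_notice_fields() assumes. The code name and field name below come
# from the comments in the function; the value is a made-up example.
_example_report = {
    "notices": [
        {
            "code": "duplicate_route_name",
            # one entry per specific violation of this code
            "notices": [
                {"duplicatedField": "route_long_name"},
            ],
        },
    ],
}
# Running the loops above over this example would leave code_fields as
# {"duplicate_route_name": {"duplicatedField"}}.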
# ---
# operator: operators.PythonToWarehouseOperator
# table_name: "gtfs_rt.validation_code_descriptions"
# fields:
#   code: RT Validation error code name
#   description: A description of the validation error
#   is_critical: Whether this error is considered a Cal-ITP critical error
# ---

import pandas as pd

from calitp import write_table, to_snakecase

sheet_url = (
    "https://docs.google.com/spreadsheets"
    "/d/1GDDaDlsBPCYn3dtYPSABnce9ns3ekJ8Jzfgyy56lZz4/export?gid=617612870&format=csv"
)

code_descriptions = pd.read_csv(sheet_url).pipe(to_snakecase)

write_table(code_descriptions, "gtfs_rt.validation_code_descriptions")
import pandas as pd

from calitp import get_engine, write_table
from testing import Tester

COLNAMES = ["x", "y"]

df_has_null = pd.DataFrame([(1, None), (2, "b")], columns=COLNAMES)
df_not_uniq = pd.DataFrame([(1, "a"), (1, "b")], columns=COLNAMES)
df_not_composite_uniq = pd.DataFrame([(1, "a"), (2, "b"), (1, "a")], columns=COLNAMES)

engine = get_engine()

write_table(df_has_null, "sandbox.testing_has_null")
write_table(df_not_uniq, "sandbox.testing_not_uniq")
write_table(df_not_composite_uniq, "sandbox.testing_not_composite_uniq")

# FAIL: nulls
tester = Tester.from_tests(
    engine, "sandbox.testing_has_null", {"check_null": ["x", "y"]}
)
print(tester.get_test_results())
assert not tester.all_passed()

# PASS: no nulls
tester = Tester.from_tests(
    engine, "sandbox.testing_not_uniq", {"check_null": ["x", "y"]}
)
print(tester.get_test_results())
# ---
# operator: operators.PythonToWarehouseOperator
# table_name: "gtfs_schedule_history.validation_code_descriptions"
# fields:
#   severity: Severity of the error code (e.g. validation_codes.severity)
#   code: Code name (e.g. validation_codes.code)
# ---

import pandas as pd

from calitp import write_table, to_snakecase

sheet_url = (
    "https://docs.google.com/spreadsheets"
    "/d/1GDDaDlsBPCYn3dtYPSABnce9ns3ekJ8Jzfgyy56lZz4/export?gid=0&format=csv"
)

code_descriptions = (
    pd.read_csv(sheet_url)
    .pipe(to_snakecase)
    .rename(columns={"type": "severity", "name": "code"})
)

# convert CamelCase code names to snake_case, drop the trailing "Notice", and
# normalize a few acronyms (regex=True is explicit for newer pandas versions)
code_descriptions["code"] = (
    code_descriptions.code.str.replace(r"(?<!^)(?=[A-Z])", "_", regex=True)
    .str.lower()
    .str.replace("_notice$", "", regex=True)
    .replace({"i_o_error": "io_error", "u_r_i_syntax_error": "uri_syntax_error"})
)

write_table(code_descriptions, "gtfs_schedule_history.validation_code_descriptions")
# ---
# operator: operators.PythonToWarehouseOperator
# table_name: "sandbox.python_to_warehouse"
# fields:
#   g: The g field python
#   x: The x field python
# doc_md: |
#   This is an example of the PythonOperator.
#
# dependencies:
#   - create_dataset
# ---

import pandas as pd

from calitp import write_table

df = pd.DataFrame({"g": ["a", "b"], "x": [1, 2]})

write_table(df, "sandbox.python_to_warehouse")