def load(datasource, record_type, **kwargs): """ Load data into a database. """ print(f"Loading {record_type}") columns = kwargs.pop("columns", []) if len(columns) == 0: columns = COLUMNS[record_type] actions = kwargs.pop("update_actions", []) if len(actions) == 1 and actions[0] is True: # flag-only option, use defaults actions = UPDATE_ACTIONS[record_type] elif len(actions) > 1: # convert action tuples to dict, filtering any flag-only options actions = dict(filter(lambda x: x is not True, actions)) conflict_update = len(actions) > 0 version = mds.Version(kwargs.pop("version", common.default_version)) stage_first = int(kwargs.pop("stage_first", True)) db_config = dict(stage_first=stage_first, version=version, **env()) db = kwargs.get("db", mds.Database(**db_config)) load_config = dict(table=record_type, drop_duplicates=columns) if record_type == mds.STATUS_CHANGES: load_config["on_conflict_update"] = status_changes_conflict_update(columns, actions, version) if conflict_update else None db.load_status_changes(datasource, **load_config) elif record_type == mds.TRIPS: load_config["on_conflict_update"] = trips_conflict_update(columns, actions, version) if conflict_update else None db.load_trips(datasource, **load_config)
def load(datasource, record_type, **kwargs): """ Load data into a database. """ print(f"Loading {record_type}") version = mds.Version(kwargs.pop("version", common.DEFAULT_VERSION)) version.raise_if_unsupported() if version < mds.Version._040_() and record_type not in [mds.STATUS_CHANGES, mds.TRIPS]: raise ValueError(f"MDS Version {version} only supports {STATUS_CHANGES} and {TRIPS}.") elif version < mds.Version._041_() and record_type == mds.VEHICLES: raise ValueError(f"MDS Version {version} does not support the {VEHICLES} endpoint.") columns = kwargs.pop("columns", []) if len(columns) == 0: columns = COLUMNS[record_type] actions = kwargs.pop("update_actions", []) if len(actions) == 1 and actions[0] is True: # flag-only option, use defaults actions = default_conflict_update_actions(record_type, version) elif len(actions) > 1: # convert action tuples to dict, filtering any flag-only options actions = dict(filter(lambda x: x is not True, actions)) stage_first = int(kwargs.pop("stage_first", True)) db_config = dict(stage_first=stage_first, version=version, **env()) db = kwargs.get("db", mds.Database(**db_config)) load_config = dict(table=record_type, drop_duplicates=columns) if len(actions) > 0: load_config["on_conflict_update"] = conflict_update_condition(columns), actions if record_type == mds.EVENTS: db.load_events(datasource, **load_config) elif record_type == mds.STATUS_CHANGES: db.load_status_changes(datasource, **load_config) elif record_type == mds.TRIPS: db.load_trips(datasource, **load_config) elif record_type == mds.VEHICLES: db.load_vehicles(datasource, **load_config)
def load_to_s3_pgdb(**kwargs):
    """
    Python operator to load data to S3 for an operator (company) and into the database.
    """
    # load config from s3
    s3 = connect_aws_s3()
    try:
        s3.Bucket("city-of-los-angeles-data-lake").download_file(
            "dockless/config.json", "/tmp/config.json"
        )
    except botocore.exceptions.ClientError as e:
        if e.response["Error"]["Code"] == "404":
            print("The object does not exist.")
        else:
            raise

    company = kwargs["params"]["company"]
    config = mds.ConfigFile("/tmp/config.json", company)
    logging.info("Downloaded and parsed config from S3")

    # read the version parameter, defaulting to 0.3.2
    version = getattr(config, "version", "0.3.2")

    # set company
    logging.info(f"Set company to {company}")
    logging.info(f"Referencing MDS @ {version}")

    # load company
    client = mds.Client(company, config, version=version)
    execution_date = kwargs["execution_date"]

    # if the provider is JUMP, shift the window back because their ETL is slow
    if client.provider.provider_id == "c20e08cf-8488-46a6-a66c-5d8fb827f7e0":
        end_time = execution_date - timedelta(hours=25)
        start_time = end_time - timedelta(hours=12)
    else:
        end_time = execution_date
        start_time = end_time - timedelta(hours=12)

    # query status changes and write to S3
    status_changes = client.get_status_changes(end_time=end_time, start_time=start_time)
    obj = s3.Object(
        "city-of-los-angeles-data-lake",
        f"dockless/data/{company}/status_changes/{kwargs['ts']}.json",
    )
    obj.put(Body=json.dumps(status_changes))
    logging.info(f"Wrote {company} status changes to s3")

    # query trips and write to S3
    trips = client.get_trips(end_time=end_time, start_time=start_time)
    obj = s3.Object(
        "city-of-los-angeles-data-lake",
        f"dockless/data/{company}/trips/{kwargs['ts']}.json",
    )
    obj.put(Body=json.dumps(trips))
    logging.info(f"Wrote {company} trips to s3")

    logging.info("Connecting to DB")
    logging.info("Logging into postgres")
    db = mds.Database(uri=POSTGRES_URI, version=version)

    if len(status_changes) != 0:
        logging.info(f"Loading {company} status changes into DB")
        db.load_status_changes(
            source=status_changes, stage_first=5, before_load=normalize_status_changes
        )
    else:
        logging.info(
            f"Warning: not loading status change data for {company} as no data was received"
        )

    if len(trips) != 0:
        logging.info(f"Loading {company} trips into DB")
        db.load_trips(source=trips, stage_first=5, before_load=normalize_trips)
    else:
        logging.info(f"Warning: not loading trip data for {company} as no data was received")

    return True
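# Sketch of how this callable might be wired into an Airflow DAG. The task id,
# company key, and `dag` object are hypothetical; `provide_context=True`
# reflects the Airflow 1.x PythonOperator API, which passes execution_date,
# ts, and params through to the callable's kwargs.
from airflow.operators.python_operator import PythonOperator

load_jump_task = PythonOperator(
    task_id="load_jump_to_s3_pgdb",   # hypothetical task id
    python_callable=load_to_s3_pgdb,
    provide_context=True,             # supplies execution_date, ts, params in kwargs
    params={"company": "jump"},       # hypothetical company key
    dag=dag,                          # assumes a `dag` defined elsewhere in the module
)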
def load_to_s3(**kwargs):
    """
    Python operator to load data to S3 for an operator (company) and into the database.
    """
    # load config from s3
    s3 = connect_aws_s3()
    try:
        s3.Bucket('city-of-los-angeles-data-lake').download_file(
            'dockless/config.json', '/tmp/config.json')
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == "404":
            print("The object does not exist.")
        else:
            raise

    company = kwargs['params']['company']
    config = mds.ConfigFile('/tmp/config.json', company)
    logging.info("Downloaded and parsed config from S3")

    # read the version parameter, defaulting to 0.3.2
    version = getattr(config, 'version', '0.3.2')

    # set company
    logging.info(f"Set company to {company}")
    logging.info(f"Referencing MDS @ {version}")

    # load company
    client = mds.Client(company, config, version=version)
    end_time = kwargs['execution_date']

    # if the provider is JUMP, use a longer window because their ETL is slow
    if client.provider.provider_id == 'c20e08cf-8488-46a6-a66c-5d8fb827f7e0':
        start_time = end_time - timedelta(hours=25)
    else:
        start_time = end_time - timedelta(hours=12)

    # query status changes and write to S3
    status_changes = client.get_status_changes(end_time=end_time, start_time=start_time)
    obj = s3.Object(
        'city-of-los-angeles-data-lake',
        f"dockless/data/{company}/status_changes/{kwargs['ts']}.json")
    obj.put(Body=json.dumps(status_changes))
    logging.info(f"Wrote {company} status changes to s3")

    # query trips and write to S3
    trips = client.get_trips(end_time=end_time, start_time=start_time)
    obj = s3.Object(
        'city-of-los-angeles-data-lake',
        f"dockless/data/{company}/trips/{kwargs['ts']}.json")
    obj.put(Body=json.dumps(trips))
    logging.info(f"Wrote {company} trips to s3")

    logging.info("Connecting to DB")
    user = pg_conn.login
    password = pg_conn.get_password()
    host = pg_conn.host
    dbname = pg_conn.schema
    logging.info(f"Logging into postgres://-----:----@{host}:5432/{dbname}")
    db = mds.Database(
        uri=f'postgres://{user}:{password}@{host}:5432/{dbname}',
        version=version)

    logging.info(f"Loading {company} status changes into DB")
    db.load_status_changes(
        source=status_changes, stage_first=5, before_load=normalize_status_changes)

    logging.info(f"Loading {company} trips into DB")
    db.load_trips(source=trips, stage_first=5, before_load=normalize_trips)

    return True
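# `connect_aws_s3` is called by both operators above but defined elsewhere in
# the project. A minimal sketch of what it might look like, assuming boto3 and
# ambient AWS credentials; this is an assumption, not the project's actual helper.
import boto3

def connect_aws_s3():
    # returns a boto3 S3 service resource; credentials come from the
    # environment, shared config files, or an attached IAM role
    return boto3.resource("s3")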