Пример #1
0
def load(datasource, record_type, **kwargs):
    """
    Load data into a database.

    Parameters:
        datasource: The data to load; passed through unchanged to the
            Database load method.
        record_type: The kind of record being loaded. Only mds.STATUS_CHANGES
            and mds.TRIPS trigger a load; any other value resolves the
            configuration and then does nothing.

    Keyword arguments:
        columns: Columns passed as `drop_duplicates` to the load method.
            Defaults to COLUMNS[record_type] when missing or empty.
        update_actions: A list of action tuples and/or the flag-only value
            True. A lone True selects UPDATE_ACTIONS[record_type]; otherwise
            the tuples are converted to a dict. When the result is non-empty
            an `on_conflict_update` is configured for the load.
        version: MDS version; defaults to common.default_version.
        stage_first: Coerced with int(); defaults to True.
        db: An existing database object to load into. When omitted (or None)
            a new mds.Database is created from env().
    """
    print(f"Loading {record_type}")

    columns = kwargs.pop("columns", [])
    if len(columns) == 0:
        columns = COLUMNS[record_type]

    actions = kwargs.pop("update_actions", [])

    if len(actions) == 1 and actions[0] is True:
        # flag-only option, use defaults
        actions = UPDATE_ACTIONS[record_type]
    elif len(actions) > 0:
        # convert action tuples to dict, filtering any flag-only options;
        # this also covers the case of exactly one action tuple, which would
        # otherwise be left as a list
        actions = dict(action for action in actions if action is not True)

    conflict_update = len(actions) > 0

    version = mds.Version(kwargs.pop("version", common.default_version))
    stage_first = int(kwargs.pop("stage_first", True))

    # Only build a default Database (and read env()) when the caller did not
    # supply one; a dict.get() default would be constructed unconditionally.
    db = kwargs.get("db")
    if db is None:
        db = mds.Database(stage_first=stage_first, version=version, **env())

    load_config = dict(table=record_type, drop_duplicates=columns)
    if record_type == mds.STATUS_CHANGES:
        load_config["on_conflict_update"] = (
            status_changes_conflict_update(columns, actions, version) if conflict_update else None
        )
        db.load_status_changes(datasource, **load_config)
    elif record_type == mds.TRIPS:
        load_config["on_conflict_update"] = (
            trips_conflict_update(columns, actions, version) if conflict_update else None
        )
        db.load_trips(datasource, **load_config)
Пример #2
0
def load(datasource, record_type, **kwargs):
    """
    Load data into a database.

    Parameters:
        datasource: The data to load; passed through unchanged to the
            Database load method.
        record_type: One of mds.EVENTS, mds.STATUS_CHANGES, mds.TRIPS or
            mds.VEHICLES. Any other value that passes the version checks
            resolves the configuration and then does nothing.

    Keyword arguments:
        version: MDS version; defaults to common.DEFAULT_VERSION.
        columns: Columns passed as `drop_duplicates` to the load method.
            Defaults to COLUMNS[record_type] when missing or empty.
        update_actions: A list of action tuples and/or the flag-only value
            True. A lone True selects the defaults from
            default_conflict_update_actions(); otherwise the tuples are
            converted to a dict. When the result is non-empty an
            `on_conflict_update` is configured for the load.
        stage_first: Coerced with int(); defaults to True.
        db: An existing database object to load into. When omitted (or None)
            a new mds.Database is created from env().

    Raises:
        ValueError: When the record type is not available in the requested
            MDS version. version.raise_if_unsupported() may raise as well.
    """
    print(f"Loading {record_type}")

    version = mds.Version(kwargs.pop("version", common.DEFAULT_VERSION))
    version.raise_if_unsupported()

    # Use the mds-qualified constants in the messages, matching the rest of
    # this function, so building the message cannot fail on an unbound name.
    if version < mds.Version._040_() and record_type not in [mds.STATUS_CHANGES, mds.TRIPS]:
        raise ValueError(f"MDS Version {version} only supports {mds.STATUS_CHANGES} and {mds.TRIPS}.")
    if version < mds.Version._041_() and record_type == mds.VEHICLES:
        raise ValueError(f"MDS Version {version} does not support the {mds.VEHICLES} endpoint.")

    columns = kwargs.pop("columns", [])
    if len(columns) == 0:
        columns = COLUMNS[record_type]

    actions = kwargs.pop("update_actions", [])

    if len(actions) == 1 and actions[0] is True:
        # flag-only option, use defaults
        actions = default_conflict_update_actions(record_type, version)
    elif len(actions) > 0:
        # convert action tuples to dict, filtering any flag-only options;
        # this also covers the case of exactly one action tuple, which would
        # otherwise be left as a list
        actions = dict(action for action in actions if action is not True)

    stage_first = int(kwargs.pop("stage_first", True))

    # Only build a default Database (and read env()) when the caller did not
    # supply one; a dict.get() default would be constructed unconditionally.
    db = kwargs.get("db")
    if db is None:
        db = mds.Database(stage_first=stage_first, version=version, **env())

    load_config = dict(table=record_type, drop_duplicates=columns)
    if len(actions) > 0:
        load_config["on_conflict_update"] = conflict_update_condition(columns), actions

    if record_type == mds.EVENTS:
        db.load_events(datasource, **load_config)
    elif record_type == mds.STATUS_CHANGES:
        db.load_status_changes(datasource, **load_config)
    elif record_type == mds.TRIPS:
        db.load_trips(datasource, **load_config)
    elif record_type == mds.VEHICLES:
        db.load_vehicles(datasource, **load_config)
Пример #3
0
def load_to_s3_pgdb(**kwargs):
    """
    Python operator to load data to s3
    for an operator and the database

    Downloads the shared config from S3, queries the company's status
    changes and trips for a 12 hour window, writes each payload to S3 as
    JSON, and loads any non-empty payload into the Postgres database.

    Keyword arguments read (presumably the task context supplied by the
    scheduler -- confirm against the caller):
        params: Mapping containing "company", the provider to query.
        execution_date: Datetime marking the end of the query window.
        ts: Timestamp string used in the S3 object names.

    Returns:
        True once every step has completed.

    Raises:
        botocore.exceptions.ClientError: If downloading the config fails
            with anything other than a 404.
    """
    bucket = "city-of-los-angeles-data-lake"

    # load config from s3
    s3 = connect_aws_s3()

    try:
        s3.Bucket(bucket).download_file("dockless/config.json", "/tmp/config.json")
    except botocore.exceptions.ClientError as e:
        if e.response["Error"]["Code"] == "404":
            # NOTE(review): execution continues and parses whatever is at
            # /tmp/config.json -- confirm this is intended.
            print("The object does not exist.")
        else:
            raise
    company = kwargs["params"]["company"]
    config = mds.ConfigFile("/tmp/config.json", company)
    logging.info("Downloaded and parsed config from S3")
    # fall back to 0.3.2 when the config does not specify a version
    version = getattr(config, "version", "0.3.2")
    logging.info(f"set company to {company}")
    logging.info(f"Referencing MDS @ {version}")
    # load company
    client = mds.Client(company, config, version=version)
    execution_date = kwargs["execution_date"]
    # For the JUMP provider, shift the window back 25 hours because their
    # ETL is slow.
    if client.provider.provider_id == "c20e08cf-8488-46a6-a66c-5d8fb827f7e0":
        end_time = execution_date - timedelta(hours=25)
    else:
        end_time = execution_date
    start_time = end_time - timedelta(hours=12)

    # query status changes
    status_changes = client.get_status_changes(end_time=end_time, start_time=start_time)
    obj = s3.Object(
        bucket,
        f"dockless/data/{company}/status_changes/{kwargs['ts']}.json",
    )
    obj.put(Body=json.dumps(status_changes))
    logging.info(f"Wrote {company} status changes to s3")

    # query trips
    trips = client.get_trips(end_time=end_time, start_time=start_time)
    obj = s3.Object(
        bucket,
        f"dockless/data/{company}/trips/{kwargs['ts']}.json",
    )
    obj.put(Body=json.dumps(trips))
    logging.info(f"Wrote {company} trips to s3")

    logging.info("Connecting to DB")
    logging.info("Logging into postgres")
    db = mds.Database(uri=POSTGRES_URI, version=version)

    # The messages below are f-strings so the company name is interpolated
    # instead of logging the literal "{company}" placeholder.
    if len(status_changes) != 0:
        logging.info(f"loading {company} status changes into DB")
        db.load_status_changes(
            source=status_changes, stage_first=5, before_load=normalize_status_changes
        )
    else:
        logging.info(
            f"Warning: not loading status change data for {company} as no data was "
            "received"
        )

    if len(trips) != 0:
        logging.info(f"loading {company} trips into DB")
        db.load_trips(source=trips, stage_first=5, before_load=normalize_trips)
    else:
        logging.info(
            f"Warning: not loading trip data for {company} as no data was received"
        )

    return True
Пример #4
0
def load_to_s3(**kwargs):
    """
    Python operator to load data to s3
    for an operator and the database

    Downloads the shared config from S3, queries the company's status
    changes and trips, writes each payload to S3 as JSON, and loads both
    payloads into the Postgres database described by `pg_conn`.

    Keyword arguments read (presumably the task context supplied by the
    scheduler -- confirm against the caller):
        params: Mapping containing "company", the provider to query.
        execution_date: Datetime marking the end of the query window.
        ts: Timestamp string used in the S3 object names.

    Returns:
        True once every step has completed.

    Raises:
        botocore.exceptions.ClientError: If downloading the config fails
            with anything other than a 404.
    """
    bucket = 'city-of-los-angeles-data-lake'

    # load config from s3
    s3 = connect_aws_s3()

    try:
        s3.Bucket(bucket).download_file('dockless/config.json', '/tmp/config.json')
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == "404":
            # NOTE(review): execution continues and parses whatever is at
            # /tmp/config.json -- confirm this is intended.
            print("The object does not exist.")
        else:
            raise
    company = kwargs['params']['company']
    config = mds.ConfigFile('/tmp/config.json', company)
    logging.info("Downloaded and parsed config from S3")
    # fall back to 0.3.2 when the config does not specify a version
    version = getattr(config, 'version', '0.3.2')
    logging.info(f"set company to {company}")
    logging.info(f"Referencing MDS @ {version}")
    # load company
    client = mds.Client(company, config, version=version)
    end_time = kwargs['execution_date']
    # For the JUMP provider, look back 25 hours instead of 12 because their
    # ETL is slow.
    if client.provider.provider_id == 'c20e08cf-8488-46a6-a66c-5d8fb827f7e0':
        window = timedelta(hours=25)
    else:
        window = timedelta(hours=12)
    start_time = end_time - window

    # query status changes
    status_changes = client.get_status_changes(end_time=end_time,
                                               start_time=start_time)
    obj = s3.Object(
        bucket,
        f"dockless/data/{company}/status_changes/{kwargs['ts']}.json")
    obj.put(Body=json.dumps(status_changes))
    logging.info(f"Wrote {company} status changes to s3")

    # query trips
    trips = client.get_trips(end_time=end_time, start_time=start_time)
    obj = s3.Object(bucket,
                    f"dockless/data/{company}/trips/{kwargs['ts']}.json")
    obj.put(Body=json.dumps(trips))
    logging.info(f"Wrote {company} trips to s3")

    logging.info("Connecting to DB")
    user = pg_conn.login
    password = pg_conn.get_password()
    host = pg_conn.host
    dbname = pg_conn.schema
    # credentials are deliberately masked in the log line
    logging.info(f"Logging into postgres://-----:----@{host}:5432/{dbname}")
    db = mds.Database(uri=f'postgres://{user}:{password}@{host}:5432/{dbname}',
                      version=version)

    # The messages below are f-strings so the company name is interpolated
    # instead of logging the literal "{company}" placeholder.
    logging.info(f"loading {company} status changes into DB")
    db.load_status_changes(source=status_changes,
                           stage_first=5,
                           before_load=normalize_status_changes)

    logging.info(f"loading {company} trips into DB")
    db.load_trips(source=trips, stage_first=5, before_load=normalize_trips)
    return True