示例#1
0
def get_config(provider, config_path=None):
    """
    Load the provider's configuration data.

    Resolution order: the explicit ``config_path`` when given, then a
    ``./config.json`` in the current directory when one exists, and
    finally an empty dict when neither source is available.
    """
    if config_path:
        source = config_path
    elif pathlib.Path("./config.json").exists():
        source = "./config.json"
    else:
        # No configuration available anywhere -- report an empty config.
        return {}
    return mds.ConfigFile(source, provider).dump()
示例#2
0
def load_to_s3_pgdb(**kwargs):
    """
    Airflow python operator: pull MDS data for one provider, archive the
    raw payloads to S3, then load them into the postgres database.

    Expects in ``kwargs``:
        params["company"]: provider name used to select the MDS config.
        execution_date: datetime anchoring the 12-hour query window.
        ts: timestamp string used in the S3 object keys.

    Returns:
        True on completion.
    """
    # load config from s3
    s3 = connect_aws_s3()

    try:
        s3.Bucket("city-of-los-angeles-data-lake").download_file(
            "dockless/config.json", "/tmp/config.json"
        )
    except botocore.exceptions.ClientError as e:
        # A missing config object is reported but not fatal (a previously
        # downloaded /tmp/config.json may still be usable); any other S3
        # error is re-raised.
        if e.response["Error"]["Code"] == "404":
            print("The object does not exist.")
        else:
            raise
    company = kwargs["params"]["company"]
    config = mds.ConfigFile("/tmp/config.json", company)
    logging.info("Downloaded and parsed config from S3")
    # Fall back to MDS 0.3.2 when the config does not pin a version.
    version = getattr(config, "version", "0.3.2")
    # set company
    logging.info(f"set company to {company}")
    logging.info(f"Referencing MDS @ {version}")
    # load company
    client = mds.Client(company, config, version=version)
    execution_date = kwargs["execution_date"]
    # If the provider is JUMP, shift the window back because their ETL is slow.
    if client.provider.provider_id == "c20e08cf-8488-46a6-a66c-5d8fb827f7e0":
        end_time = execution_date - timedelta(hours=25)
    else:
        end_time = execution_date
    # Both branches query the same 12-hour window ending at end_time.
    start_time = end_time - timedelta(hours=12)
    status_changes = client.get_status_changes(end_time=end_time, start_time=start_time)

    obj = s3.Object(
        "city-of-los-angeles-data-lake",
        f"dockless/data/{company}/status_changes/{kwargs['ts']}.json",
    )
    obj.put(Body=json.dumps(status_changes))
    logging.info(f"Wrote {company} status changes to s3")
    # query trips
    trips = client.get_trips(end_time=end_time, start_time=start_time)
    obj = s3.Object(
        "city-of-los-angeles-data-lake",
        f"dockless/data/{company}/trips/{kwargs['ts']}.json",
    )
    obj.put(Body=json.dumps(trips))
    logging.info(f"Wrote {company} trips to s3")
    logging.info("Connecting to DB")
    logging.info("Logging into postgres")
    db = mds.Database(uri=POSTGRES_URI, version=version)

    # Skip the DB load entirely when the provider returned no rows.
    if len(status_changes) != 0:
        # BUG FIX: these log messages were missing the f prefix and
        # printed the literal text "{company}".
        logging.info(f"loading {company} status changes into DB")
        db.load_status_changes(
            source=status_changes, stage_first=5, before_load=normalize_status_changes
        )
    else:
        logging.info(
            f"Warning: not loading status change data for {company} as no data was "
            "received"
        )

    if len(trips) != 0:
        logging.info(f"loading {company} trips into DB")
        db.load_trips(source=trips, stage_first=5, before_load=normalize_trips)
    else:
        logging.info(
            f"Warning: not loading trip data for {company} as no data was received"
        )

    return True
示例#3
0
def load_to_s3(**kwargs):
    """
    Airflow python operator: pull MDS data for one provider, archive the
    raw payloads to S3, then load them into the postgres database.

    Expects in ``kwargs``:
        params["company"]: provider name used to select the MDS config.
        execution_date: datetime ending the query window.
        ts: timestamp string used in the S3 object keys.

    Returns:
        True on completion.
    """
    # load config from s3
    s3 = connect_aws_s3()

    try:
        s3.Bucket('city-of-los-angeles-data-lake').download_file(
            'dockless/config.json', '/tmp/config.json')
    except botocore.exceptions.ClientError as e:
        # A missing config object is reported but not fatal; any other
        # S3 error is re-raised.
        if e.response['Error']['Code'] == "404":
            print("The object does not exist.")
        else:
            raise
    company = kwargs['params']['company']
    config = mds.ConfigFile('/tmp/config.json', company)
    logging.info("Downloaded and parsed config from S3")
    # Fall back to MDS 0.3.2 when the config does not pin a version.
    version = getattr(config, 'version', '0.3.2')
    # set company
    logging.info(f"set company to {company}")
    logging.info(f"Referencing MDS @ {version}")
    # load company
    client = mds.Client(company, config, version=version)
    end_time = kwargs['execution_date']
    ## If the provider is JUMP, widen the window because their ETL is slow.
    if client.provider.provider_id == 'c20e08cf-8488-46a6-a66c-5d8fb827f7e0':
        start_time = end_time - timedelta(hours=25)
    else:
        start_time = end_time - timedelta(hours=12)
    status_changes = client.get_status_changes(end_time=end_time,
                                               start_time=start_time)

    obj = s3.Object(
        'city-of-los-angeles-data-lake',
        f"dockless/data/{company}/status_changes/{kwargs['ts']}.json")
    obj.put(Body=json.dumps(status_changes))
    logging.info(f"Wrote {company} status changes to s3")
    # query trips
    trips = client.get_trips(end_time=end_time, start_time=start_time)
    obj = s3.Object('city-of-los-angeles-data-lake',
                    f"dockless/data/{company}/trips/{kwargs['ts']}.json")
    obj.put(Body=json.dumps(trips))
    logging.info(f"Wrote {company} trips to s3")
    logging.info("Connecting to DB")
    user = pg_conn.login
    password = pg_conn.get_password()
    host = pg_conn.host
    dbname = pg_conn.schema
    # Credentials deliberately masked in the log line.
    logging.info(f"Logging into postgres://-----:----@{host}:5432/{dbname}")
    db = mds.Database(uri=f'postgres://{user}:{password}@{host}:5432/{dbname}',
                      version=version)
    # NOTE(review): unlike load_to_s3_pgdb, empty payloads are not guarded
    # here before loading -- confirm whether mds.Database tolerates them.
    # BUG FIX: these log messages were missing the f prefix and printed
    # the literal text "{company}".
    logging.info(f"loading {company} status changes into DB")
    db.load_status_changes(source=status_changes,
                           stage_first=5,
                           before_load=normalize_status_changes)

    logging.info(f"loading {company} trips into DB")
    db.load_trips(source=trips, stage_first=5, before_load=normalize_trips)
    return True