def get_data(record_type, **kwargs):
    """
    Fetch provider payloads, either from file source(s) or from the provider API.

    When a truthy `source` keyword is given, payloads are read through
    `mds.DataFile(record_type, source).load_payloads()` and returned directly;
    no client is needed on that path.

    Otherwise a `client` keyword is required (a `KeyError` is raised when it is
    absent) and the result of `client.get(record_type, ...)` is returned.

    Optional keywords read on the API path: `start_time`, `end_time`,
    `no_paging`, `rate_limit`, `version` (defaults to `DEFAULT_VERSION`),
    and, for pre-0.4.0 trips, `device_id` and `vehicle_id`.
    """
    # file source(s) short-circuit the API request entirely
    source = kwargs.get("source")
    if source:
        print(f"Reading {record_type} from {source}")
        return mds.DataFile(record_type, source).load_payloads()

    # the API path cannot proceed without a client
    client = kwargs.pop("client")

    # which of these get sent depends on version and record_type (see below)
    start_time, end_time = kwargs.get("start_time"), kwargs.get("end_time")
    version = kwargs.get("version", DEFAULT_VERSION)

    # options common to every API request
    api_kwargs = {
        "paging": not kwargs.get("no_paging"),
        "rate_limit": kwargs.get("rate_limit"),
    }

    print(f"Requesting {record_type} from {client.provider.provider_name}")
    if end_time:
        if start_time:
            print(f"For time range: {start_time.isoformat()} to {end_time.isoformat()}")
        else:
            print(f"For time: {end_time.isoformat()}")

    if version < VERSION_040:
        # older versions: range-style query parameters
        if record_type == mds.STATUS_CHANGES:
            api_kwargs.update(start_time=start_time, end_time=end_time)
        elif record_type == mds.TRIPS:
            api_kwargs.update(
                min_end_time=start_time,
                max_end_time=end_time,
                device_id=kwargs.get("device_id"),
                vehicle_id=kwargs.get("vehicle_id"),
            )
    elif record_type == mds.EVENTS:
        api_kwargs.update(start_time=start_time, end_time=end_time)
    elif record_type == mds.STATUS_CHANGES:
        # only end_time is forwarded here, under the name event_time
        api_kwargs["event_time"] = end_time
    elif record_type == mds.TRIPS:
        api_kwargs["end_time"] = end_time
    elif record_type == mds.VEHICLES:
        # currently no special query params for vehicles
        pass

    return client.get(record_type, **api_kwargs)
def ingest(record_type, **kwargs):
    """
    Run the ingestion flow for one record type.

    Steps, in order:

    1. build an `mds.Version` from the `version` keyword (falling back to
       `common.DEFAULT_VERSION`) and call its `raise_if_unsupported()` check
    2. acquire payloads through `common.get_data()`
    3. unless `no_validate` is truthy, run `validation.validate()` and print
       how many records were seen, passed, and failed
    4. when `output` is truthy, dump the kept payloads (and any removed ones)
       through an `mds.DataFile`
    5. unless `no_load` is truthy or there are no kept payloads, call
       `database.load()`

    `common.get_data()` receives every keyword except the raw `version`;
    `database.load()` receives what is left after `no_validate`, `output`
    and `no_load` have also been popped.
    """
    requested = kwargs.pop("version", common.DEFAULT_VERSION)
    version = mds.Version(requested)
    version.raise_if_unsupported()

    payloads = common.get_data(record_type, **kwargs, version=version)
    data_key = mds.Schema(record_type).data_key

    def record_count(pages):
        # total number of records stored under this record type's data key
        return sum(len(page["data"][data_key]) for page in pages)

    # validation and filtering
    skip_validation = kwargs.pop("no_validate", False)
    if skip_validation:
        print("Skipping data validation")
        valid, removed = payloads, None
    else:
        print(f"Validating {record_type} @ {version}")

        # the middle element of the result is not used by this flow
        valid, _errors, removed = validation.validate(record_type, payloads, version=version)

        seen = record_count(payloads)
        passed = record_count(valid)
        failed = record_count(removed)

        print(f"{seen} records, {passed} passed, {failed} failed")

    # output to files if needed
    destination = kwargs.pop("output", None)
    if destination:
        out_file = mds.DataFile(record_type, destination)
        out_file.dump_payloads(valid)
        if removed:
            out_file.dump_payloads(removed)

    # load to database
    skip_load = kwargs.pop("no_load", False)
    if skip_load or len(valid) == 0:
        print("Skipping data load")
    else:
        database.load(valid, record_type, **kwargs, version=version)

    print(f"{record_type} complete")
示例#3
0
def get_data(record_type, **kwargs):
    """
    Fetch provider payloads, either from file source(s) or from the provider API.

    When a truthy `source` keyword is given, payloads are read through
    `mds.DataFile(record_type, source).load_payloads()` and returned directly.

    Otherwise the `client`, `start_time` and `end_time` keywords are all
    required (a `KeyError` is raised for the first one that is absent) and the
    result of `client.get(record_type, ...)` is returned.

    Optional keywords read on the API path: `no_paging`, `rate_limit`,
    `version`, and, for trips, `device_id` and `vehicle_id`.
    """
    # file source(s) short-circuit the API request entirely
    source = kwargs.get("source")
    if source:
        print(f"Reading {record_type} from {source}")
        return mds.DataFile(record_type, source).load_payloads()

    # required for API calls
    client = kwargs.pop("client")
    start_time = kwargs.pop("start_time")
    end_time = kwargs.pop("end_time")

    # NOTE(review): no default is supplied for version; the trips comparison
    # below presumably fails when it is missing -- confirm callers always pass it
    version = kwargs.get("version")

    # options common to every API request
    api_kwargs = {
        "paging": not kwargs.get("no_paging"),
        "rate_limit": kwargs.get("rate_limit"),
    }

    print(f"Requesting {record_type} from {client.provider.provider_name}")
    print(f"Time range: {start_time.isoformat()} to {end_time.isoformat()}")

    if record_type == mds.STATUS_CHANGES:
        api_kwargs.update(start_time=start_time, end_time=end_time)
    elif record_type == mds.TRIPS:
        api_kwargs.update(
            device_id=kwargs.get("device_id"),
            vehicle_id=kwargs.get("vehicle_id"),
        )

        # the names of the time-range parameters depend on the version
        if version < mds.Version("0.3.0"):
            start_key, end_key = "start_time", "end_time"
        else:
            start_key, end_key = "min_end_time", "max_end_time"

        api_kwargs[start_key] = start_time
        api_kwargs[end_key] = end_time

    return client.get(record_type, **api_kwargs)
示例#4
0
        day_status_changes, day_trips = gen.service_day(
            devices, date, hour_open, hour_closed, inactivity)
        status_changes.extend(day_status_changes)
        trips.extend(day_trips)
        date = date + datetime.timedelta(days=1)

        print(f"Finished day: {formatted_date} ({time.time() - t2} s)")

    print(f"Finished generating data ({time.time() - t1} s)")

    if len(status_changes) > 0 or len(trips) > 0:
        print("Generating data files")
        t1 = time.time()

        trips_file = mds.DataFile(mds.TRIPS, outputdir)

        print("Writing trips")

        t2 = time.time()
        payload = gen.make_payload(trips=trips)
        trips_file.dump_payloads(payload)

        print(f"Finished ({time.time() - t2} s)")

        sc_file = mds.DataFile(mds.STATUS_CHANGES, outputdir)

        print("Writing status_changes")

        t2 = time.time()
        payload = gen.make_payload(status_changes=status_changes)
示例#5
0
            if len(errors) > 0:
                print(f"  Errors ({len(errors)} total)")
                for error in errors:
                    print()
                    try:
                        for line in error.describe():
                            print(f"    {line}")
                    except:
                        print(error)

            if args.output:
                print()
                print(f"Writing {record_type} to {args.output}")

                f = mds.DataFile(record_type, args.output)

                f.dump_payloads(
                    original,
                    file_name=f"{source}_{record_type}_original.json")
                f.dump_payloads(valid,
                                file_name=f"{source}_{record_type}_valid.json")

                if len(invalid) > 0:
                    f.dump_payloads(
                        invalid,
                        file_name=f"{source}_{record_type}_invalid.json")

    print()
    print(f"Finished validation ({common.count_seconds(now)}s)")