Example #1
    def _ingest_request(self):
        """Iterate through the metrics and create an IngestRequest."""
        self._update_service_info()
        request = IngestRequest(reporter=self._reporter)
        request.idempotency_key = self._generate_idempotency_key()
        start_time = Timestamp()
        start_time.GetCurrentTime()
        duration = Duration()
        duration.FromSeconds(self._intervals * self._flush_interval)
        for metric in self._runtime_metrics:
            metric_type = MetricKind.GAUGE
            if len(metric) == 3:
                key, value, metric_type = metric
            else:
                key, value = metric
            request.points.add(
                duration=duration,
                start=start_time,
                labels=self._labels,
                metric_name=key,
                double_value=value,
                kind=metric_type,
            )
        _log.debug("Metrics collected: %s", request)
        return request
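The snippet above leans on the protobuf well-known types Timestamp and Duration. Here is a minimal, standalone sketch of just those helpers, assuming only the google.protobuf runtime; the numeric values are illustrative stand-ins for self._intervals and self._flush_interval.

from google.protobuf.duration_pb2 import Duration
from google.protobuf.timestamp_pb2 import Timestamp

# Capture "now" as a protobuf Timestamp.
start_time = Timestamp()
start_time.GetCurrentTime()

# Express the reporting window as a protobuf Duration.
duration = Duration()
duration.FromSeconds(5 * 30)  # illustrative: intervals * flush_interval

print(start_time.ToJsonString())  # RFC 3339 string, e.g. "2021-01-01T00:00:00Z"
print(duration.ToTimedelta())     # datetime.timedelta(seconds=150)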
Example #2
    def to_internal_job(self, data_store):
        # There should never be more than one active lease for a job. If we
        # have more than one for some reason, just take the first one.
        # TODO(SotK): Log some information here if there are multiple active
        # (i.e. not completed or cancelled) leases.
        lease = (self.active_leases[0].to_protobuf()
                 if self.active_leases else None)
        q_timestamp = Timestamp()
        if self.queued_timestamp:
            q_timestamp.FromDatetime(self.queued_timestamp)
        q_time_duration = Duration()
        if self.queued_time_duration:
            q_time_duration.FromSeconds(self.queued_time_duration)
        ws_timestamp = Timestamp()
        if self.worker_start_timestamp:
            ws_timestamp.FromDatetime(self.worker_start_timestamp)
        wc_timestamp = Timestamp()
        if self.worker_completed_timestamp:
            wc_timestamp.FromDatetime(self.worker_completed_timestamp)

        requirements = {}
        for req in self.platform_requirements:
            values = requirements.setdefault(req.key, set())
            values.add(req.value)

        if self.name in data_store.response_cache:
            result = data_store.response_cache[self.name]
        elif self.result is not None:
            result_digest = string_to_digest(self.result)
            result = data_store.storage.get_message(result_digest,
                                                    ExecuteResponse)
        else:
            result = None

        return job.Job(self.do_not_cache,
                       string_to_digest(self.action_digest),
                       platform_requirements=requirements,
                       priority=self.priority,
                       name=self.name,
                       operations=[op.to_protobuf() for op in self.operations],
                       cancelled_operations=set(op.name
                                                for op in self.operations
                                                if op.cancelled),
                       lease=lease,
                       stage=self.stage,
                       cancelled=self.cancelled,
                       queued_timestamp=q_timestamp,
                       queued_time_duration=q_time_duration,
                       worker_start_timestamp=ws_timestamp,
                       worker_completed_timestamp=wc_timestamp,
                       done=(all(op.done for op in self.operations)
                             and len(self.operations) > 0),
                       result=result,
                       worker_name=(self.active_leases[0].worker_name
                                    if self.active_leases else None),
                       n_tries=self.n_tries)
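One detail worth noting in to_internal_job: a protobuf Timestamp that is never filled in stays at its default (the Unix epoch), which is why each FromDatetime call is guarded by a None check. A small sketch of that behaviour, with an illustrative datetime value standing in for the database column:

from datetime import datetime
from google.protobuf.timestamp_pb2 import Timestamp

q_timestamp = Timestamp()
queued_timestamp = datetime(2020, 9, 1, 12, 30)  # illustrative; may also be None

# When the source value is None, the guard leaves the Timestamp at its
# epoch default (seconds == 0) instead of calling FromDatetime with None.
if queued_timestamp:
    q_timestamp.FromDatetime(queued_timestamp)

print(q_timestamp.seconds, q_timestamp.ToDatetime())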
Example #3
def test_historical_features(
    feast_client: Client, batch_source: Union[BigQuerySource, FileSource]
):
    customer_entity = Entity(
        name="user_id", description="Customer", value_type=ValueType.INT64
    )
    feast_client.apply_entity(customer_entity)

    max_age = Duration()
    max_age.FromSeconds(2 * 86400)

    transactions_feature_table = FeatureTable(
        name="transactions",
        entities=["user_id"],
        features=[
            Feature("daily_transactions", ValueType.DOUBLE),
            Feature("total_transactions", ValueType.DOUBLE),
        ],
        batch_source=batch_source,
        max_age=max_age,
    )

    feast_client.apply_feature_table(transactions_feature_table)

    transactions_df, customers_df = generate_data()
    feast_client.ingest(transactions_feature_table, transactions_df)

    feature_refs = ["transactions:daily_transactions"]

    job = feast_client.get_historical_features(feature_refs, customers_df)
    output_dir = job.get_output_file_uri()
    joined_df = read_parquet(output_dir)

    expected_joined_df = pd.DataFrame(
        {
            "event_timestamp": customers_df.event_timestamp.tolist(),
            "user_id": customers_df.user_id.tolist(),
            "transactions__daily_transactions": transactions_df.daily_transactions.tolist()
            + [None] * transactions_df.shape[0],
        }
    )

    assert_frame_equal(
        joined_df.sort_values(by=["user_id", "event_timestamp"]).reset_index(drop=True),
        expected_joined_df.sort_values(by=["user_id", "event_timestamp"]).reset_index(
            drop=True
        ),
    )
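The two-day max_age built with FromSeconds above can also be expressed through FromTimedelta; both produce the same Duration message. A small sketch, independent of the test itself:

from datetime import timedelta
from google.protobuf.duration_pb2 import Duration

max_age = Duration()
max_age.FromSeconds(2 * 86400)

alt_max_age = Duration()
alt_max_age.FromTimedelta(timedelta(days=2))

# Protobuf messages compare field by field, so the two are equal.
assert max_age == alt_max_age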
Example #4
def bookings_feature_table_with_mapping(spark, client):
    schema = StructType([
        StructField("id", IntegerType()),
        StructField("datetime", TimestampType()),
        StructField("created_datetime", TimestampType()),
        StructField("total_completed_bookings", IntegerType()),
    ])
    df_data = [
        (
            8001,
            datetime(year=2020, month=9, day=1, tzinfo=utc),
            datetime(year=2020, month=9, day=1, tzinfo=utc),
            100,
        ),
        (
            8001,
            datetime(year=2020, month=9, day=2, tzinfo=utc),
            datetime(year=2020, month=9, day=2, tzinfo=utc),
            150,
        ),
        (
            8002,
            datetime(year=2020, month=9, day=2, tzinfo=utc),
            datetime(year=2020, month=9, day=2, tzinfo=utc),
            200,
        ),
    ]
    temp_dir, file_uri = create_temp_parquet_file(spark, "bookings", schema,
                                                  df_data)

    file_source = FileSource(
        event_timestamp_column="datetime",
        created_timestamp_column="created_datetime",
        file_format=ParquetFormat(),
        file_url=file_uri,
        field_mapping={"id": "driver_id"},
    )
    features = [Feature("total_completed_bookings", ValueType.INT32)]
    max_age = Duration()
    max_age.FromSeconds(86400)
    feature_table = FeatureTable("bookings", ["driver_id"],
                                 features,
                                 batch_source=file_source,
                                 max_age=max_age)
    yield client.apply(feature_table)
    shutil.rmtree(temp_dir)
Example #5
def bookings_feature_table(spark, client):
    schema = StructType([
        StructField("driver_id", IntegerType()),
        StructField("event_timestamp", TimestampType()),
        StructField("created_timestamp", TimestampType()),
        StructField("total_completed_bookings", IntegerType()),
    ])
    df_data = [
        (
            8001,
            datetime(year=2020, month=9, day=1),
            datetime(year=2020, month=9, day=1),
            100,
        ),
        (
            8001,
            datetime(year=2020, month=9, day=2),
            datetime(year=2020, month=9, day=2),
            150,
        ),
        (
            8002,
            datetime(year=2020, month=9, day=2),
            datetime(year=2020, month=9, day=2),
            200,
        ),
    ]
    temp_dir, file_uri = create_temp_parquet_file(spark, "bookings", schema,
                                                  df_data)

    file_source = FileSource("event_timestamp", "created_timestamp", "parquet",
                             file_uri)
    features = [Feature("total_completed_bookings", ValueType.INT32)]
    max_age = Duration()
    max_age.FromSeconds(86400)
    feature_table = FeatureTable("bookings", ["driver_id"],
                                 features,
                                 batch_source=file_source,
                                 max_age=max_age)
    yield client.apply_feature_table(feature_table)
    shutil.rmtree(temp_dir)
Example #6
    def done(context, message, args):
        assert Fact._provider is not None
        assert Fact.config["io"].connected

        Fact._trace.EndTime.CopyFrom(Fact.now())

        # convert timestamp to millis
        key = int(datetime.now().timestamp() * 1000)

        Fact._trace.Logs[key] = message
        Fact._trace.Args.extend(args)

        # duration of execution calculation and formatting
        duration = Duration()
        exec_time = Fact._trace.EndTime.seconds - Fact._trace.StartTime.seconds
        duration.FromSeconds(exec_time)
        Fact._trace.ExecutionLatency.CopyFrom(duration)

        Fact._provider.collect(Fact._trace, context)

        if "send_on_update" in Fact.config and Fact.config["send_on_update"]:
            Fact.send("done")

        return Fact._trace
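ExecutionLatency above is computed from the .seconds fields alone, so anything below one second is truncated. A minimal sketch of a higher-resolution alternative using the same well-known types (this is not what the original does, just an option it leaves open):

from google.protobuf.duration_pb2 import Duration
from google.protobuf.timestamp_pb2 import Timestamp

start_time, end_time = Timestamp(), Timestamp()
start_time.GetCurrentTime()
# ... traced work happens here ...
end_time.GetCurrentTime()

latency = Duration()
# Subtracting the converted datetimes keeps microseconds, unlike the
# whole-second difference of the .seconds fields.
latency.FromTimedelta(end_time.ToDatetime() - start_time.ToDatetime())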
Example #7
def test_historical_features(
    feast_client: Client,
    tfrecord_feast_client: Client,
    batch_source: Union[BigQuerySource, FileSource],
):
    customer_entity = Entity(name="user_id",
                             description="Customer",
                             value_type=ValueType.INT64)
    feast_client.apply(customer_entity)

    max_age = Duration()
    max_age.FromSeconds(2 * 86400)

    transactions_feature_table = FeatureTable(
        name="transactions",
        entities=["user_id"],
        features=[
            Feature("daily_transactions", ValueType.DOUBLE),
            Feature("total_transactions", ValueType.DOUBLE),
        ],
        batch_source=batch_source,
        max_age=max_age,
    )

    feast_client.apply(transactions_feature_table)

    transactions_df, customers_df = generate_data()
    feast_client.ingest(transactions_feature_table, transactions_df)

    feature_refs = ["transactions:daily_transactions"]

    # remove microseconds because job.get_start_time() does not contain microseconds
    job_submission_time = datetime.utcnow().replace(microsecond=0)
    job = feast_client.get_historical_features(feature_refs, customers_df)
    assert job.get_start_time() >= job_submission_time
    assert job.get_start_time() <= job_submission_time + timedelta(hours=1)

    output_dir = job.get_output_file_uri()

    # will both be None if not using Azure blob storage
    account_name, account_key = _get_azure_creds(feast_client)

    joined_df = read_parquet(output_dir,
                             azure_account_name=account_name,
                             azure_account_key=account_key)

    expected_joined_df = pd.DataFrame({
        "event_timestamp": customers_df.event_timestamp.tolist(),
        "user_id": customers_df.user_id.tolist(),
        "transactions__daily_transactions": (transactions_df.daily_transactions.tolist()
                                             + [None] * transactions_df.shape[0]),
    })

    assert_frame_equal(
        joined_df.sort_values(by=["user_id", "event_timestamp"]).reset_index(
            drop=True),
        expected_joined_df.sort_values(
            by=["user_id", "event_timestamp"]).reset_index(drop=True),
    )

    job = tfrecord_feast_client.get_historical_features(
        feature_refs, customers_df)
    job.get_output_file_uri()
    assert job.get_status() == SparkJobStatus.COMPLETED
Example #8
def test_historical_features(feast_client: Client, local_staging_path: str):
    customer_entity = Entity(name="user_id",
                             description="Customer",
                             value_type=ValueType.INT64)
    feast_client.apply_entity(customer_entity)

    max_age = Duration()
    max_age.FromSeconds(2 * 86400)

    transactions_feature_table = FeatureTable(
        name="transactions",
        entities=["user_id"],
        features=[
            Feature("daily_transactions", ValueType.DOUBLE),
            Feature("total_transactions", ValueType.DOUBLE),
        ],
        batch_source=FileSource(
            "event_timestamp",
            "created_timestamp",
            ParquetFormat(),
            os.path.join(local_staging_path, "transactions"),
        ),
        max_age=max_age,
    )

    feast_client.apply_feature_table(transactions_feature_table)

    retrieval_date = (datetime.utcnow().replace(
        hour=0, minute=0, second=0, microsecond=0).replace(tzinfo=None))
    retrieval_outside_max_age_date = retrieval_date + timedelta(1)
    event_date = retrieval_date - timedelta(2)
    creation_date = retrieval_date - timedelta(1)

    customers = [1001, 1002, 1003, 1004, 1005]
    daily_transactions = [np.random.rand() * 10 for _ in customers]
    total_transactions = [np.random.rand() * 100 for _ in customers]

    transactions_df = pd.DataFrame({
        "event_timestamp": [event_date for _ in customers],
        "created_timestamp": [creation_date for _ in customers],
        "user_id": customers,
        "daily_transactions": daily_transactions,
        "total_transactions": total_transactions,
    })

    feast_client.ingest(transactions_feature_table, transactions_df)

    feature_refs = ["transactions:daily_transactions"]

    customer_df = pd.DataFrame({
        "event_timestamp": ([retrieval_date for _ in customers]
                            + [retrieval_outside_max_age_date for _ in customers]),
        "user_id": customers + customers,
    })

    job = feast_client.get_historical_features(feature_refs, customer_df)
    output_dir = job.get_output_file_uri()
    joined_df = read_parquet(output_dir)

    expected_joined_df = pd.DataFrame({
        "event_timestamp": ([retrieval_date for _ in customers]
                            + [retrieval_outside_max_age_date for _ in customers]),
        "user_id": customers + customers,
        "transactions__daily_transactions": daily_transactions + [None] * len(customers),
    })

    assert_frame_equal(
        joined_df.sort_values(by=["user_id", "event_timestamp"]).reset_index(
            drop=True),
        expected_joined_df.sort_values(
            by=["user_id", "event_timestamp"]).reset_index(drop=True),
    )
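The expectation in this last test follows directly from the max_age window: the event sits two days before retrieval_date, right at the edge of the two-day max_age, so it joins, while retrieval_outside_max_age_date is three days after the event and comes back as None. A small sketch of that arithmetic with illustrative dates:

from datetime import datetime, timedelta
from google.protobuf.duration_pb2 import Duration

max_age = Duration()
max_age.FromSeconds(2 * 86400)

retrieval_date = datetime(2020, 9, 10)  # illustrative
event_date = retrieval_date - timedelta(2)
retrieval_outside_max_age_date = retrieval_date + timedelta(1)

print(retrieval_date - event_date <= max_age.ToTimedelta())                  # True: value joined
print(retrieval_outside_max_age_date - event_date <= max_age.ToTimedelta())  # False: None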