Example No. 1
 def data_fetcher(self):
     return DataFetcher(
         DataSpec(
             time_series={
                 "ts1":
                 TimeSeriesSpec(id=1234,
                                start=3000,
                                end=5000,
                                aggregate="average",
                                granularity="1s"),
                 "ts2":
                 TimeSeriesSpec(id=2345,
                                start=3000,
                                end=5000,
                                aggregate="max",
                                granularity="1s"),
                 "ts3":
                 TimeSeriesSpec(id=3456,
                                start=4000,
                                end=9000,
                                aggregate="min",
                                granularity="1s"),
                 "ts4":
                 TimeSeriesSpec(external_id="abc",
                                start=3000,
                                end=5000,
                                aggregate="average",
                                granularity="1m"),
                 "ts5":
                 TimeSeriesSpec(id=5678, start=6000, end=8000),
             }))
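A minimal usage sketch (assuming, as in the later examples, that this fixture lives on a test class): because the five specs mix ids/external ids, aggregates, granularities, and time windows, they are fetched one frame per alias rather than as a single aligned dataframe.

 # Usage sketch (assumption: run inside the same test class as the fixture):
 fetcher = self.data_fetcher()
 dfs = fetcher.time_series.fetch_datapoints(["ts1", "ts2", "ts3", "ts4", "ts5"])
 print(dfs["ts5"].head())  # one DataFrame per alias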
Example No. 2
    def train(open_artifact, data_spec):
        """
        open_artifact:
            The train method must accept a open_artifact argument. This is a function
            that works the same way as the builtin open(), except it reads from
            and writes to the root of a special storage location in the cloud
            that belongs to the current model version.
        data_spec:
            An argument we pass in ourself when we initiate the training.
        api_key, project:
            Optional arguments that are passed in automatically from Model
            Hosting if you specify them.
        """
        data_fetcher = DataFetcher(data_spec)
        data_fetcher.files.fetch("data")
        data_fetcher.files.fetch("target")

        X = pd.read_csv("data")
        y = pd.read_csv("target")

        # Add a feature of constant value 1
        X.insert(0, "f0", 1)

        # Ordinary least squares via the normal equations:
        # beta_hat = (X^T X)^{-1} X^T y
        coefficients = pd.DataFrame(np.linalg.inv(X.T.dot(X)).dot(X.T).dot(y),
                                    columns=["beta_hat"])

        # Persist our result
        with open_artifact("coefficients.csv", "w") as f:
            coefficients.to_csv(f, index=False)
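A hypothetical counterpart for reading the artifact back. This assumes Model Hosting also passes open_artifact at load time, mirroring the docstring above; the load name and signature are an assumption, not taken from this snippet.

    def load(open_artifact):
        # Assumption: open_artifact also supports reading ("r") from the same
        # per-model-version storage that train wrote to above.
        with open_artifact("coefficients.csv", "r") as f:
            return pd.read_csv(f)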
Example No. 3
 def data_fetcher(self, file_ids) -> DataFetcher:
     return DataFetcher(
         DataSpec(
             files={
                 "a": FileSpec(id=file_ids["a.txt"]),
                 "a_duplicate": FileSpec(id=file_ids["a.txt"]),
                 "b": FileSpec(id=file_ids["b.txt"]),
                 "big": FileSpec(id=file_ids["big.txt"]),
             }))
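A minimal usage sketch (assumed test-class context): fetch one of the files declared above to a local path, the same way Example No. 2 uses files.fetch before reading the file back.

 # Usage sketch (assumes the same file_ids fixture as above):
 fetcher = self.data_fetcher(file_ids)
 fetcher.files.fetch("a")  # writes the file registered under alias "a" locally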
Example No. 4
 def data_fetcher(self, ts_ids, now) -> DataFetcher:
     one_hour_ago = now - 3600 * 1000  # timestamps are epoch milliseconds
     return DataFetcher(
         DataSpec(
             time_series={
                 "constant3":
                 TimeSeriesSpec(
                     id=ts_ids["constant_3"], start=one_hour_ago, end=now),
                 "constant3_duplicate":
                 TimeSeriesSpec(
                     id=ts_ids["constant_3"], start=one_hour_ago, end=now),
                 "constant3_avg_1s":
                 TimeSeriesSpec(id=ts_ids["constant_3"],
                                start=one_hour_ago,
                                end=now,
                                aggregate="average",
                                granularity="1s"),
                 "constant3_avg_1s_duplicate":
                 TimeSeriesSpec(id=ts_ids["constant_3"],
                                start=one_hour_ago,
                                end=now,
                                aggregate="average",
                                granularity="1s"),
                 "constant3_avg_1m":
                 TimeSeriesSpec(id=ts_ids["constant_3"],
                                start=one_hour_ago,
                                end=now,
                                aggregate="average",
                                granularity="1m"),
                 "constant3_max_1m":
                 TimeSeriesSpec(id=ts_ids["constant_3"],
                                start=one_hour_ago,
                                end=now,
                                aggregate="max",
                                granularity="1m"),
                 "constant4":
                 TimeSeriesSpec(
                     id=ts_ids["constant_4"], start=one_hour_ago, end=now),
                 "constant4_avg_1s":
                 TimeSeriesSpec(id=ts_ids["constant_4"],
                                start=one_hour_ago,
                                end=now,
                                aggregate="average",
                                granularity="1s"),
                 "constant5_min_1s":
                 TimeSeriesSpec(id=ts_ids["constant_5"],
                                start=one_hour_ago,
                                end=now,
                                aggregate="min",
                                granularity="1s"),
                 "constant6_max_1s":
                 TimeSeriesSpec(id=ts_ids["constant_6"],
                                start=one_hour_ago,
                                end=now,
                                aggregate="max",
                                granularity="1s"),
             }))
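A hedged usage sketch of the fixture above (assumed test-class context): duplicate aliases point at the same underlying series, so their fetched frames should be identical.

 # Usage sketch (assumes the same ts_ids/now fixtures as above):
 fetcher = self.data_fetcher(ts_ids, now)
 dfs = fetcher.time_series.fetch_datapoints(["constant3", "constant3_duplicate"])
 assert dfs["constant3"].equals(dfs["constant3_duplicate"])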
Example No. 5
 def test_fetch_datapoints_single_many_datapoints(self, ts_ids, now):
     data_fetcher = DataFetcher(
         DataSpec(
             time_series={
                 "constant3":
                 TimeSeriesSpec(id=ts_ids["constant_3"],
                                start=now - 48 * 3600 * 1000,
                                end=now)
             }))
     df = data_fetcher.time_series.fetch_datapoints("constant3")
     self.assert_data_frame(df, ["value"], {"value": 3})
Example No. 6
 def test_fetch_datapoints_many_time_series(self, ts_ids, now):
     data_fetcher = DataFetcher(
         DataSpec(
             time_series={
                 "constant{}".format(i): TimeSeriesSpec(
                     id=ts_ids["constant_{}".format(i)],
                     start=now - 3600 * 1000,
                     end=now)
                 for i in range(100)
             }))
     dfs = data_fetcher.time_series.fetch_datapoints(
         ["constant{}".format(i) for i in range(100)])
     for i in range(100):
         self.assert_data_frame(dfs["constant{}".format(i)], ["value"],
                                {"value": i})
Example No. 7
 def test_fetch_datapoints_multiple_many_datapoints(self, ts_ids, now):
     data_fetcher = DataFetcher(
         DataSpec(
             time_series={
                 "constant3":
                 TimeSeriesSpec(id=ts_ids["constant_3"],
                                start=now - 48 * 3600 * 1000,
                                end=now),
                 "constant4":
                 TimeSeriesSpec(id=ts_ids["constant_4"],
                                start=now - 48 * 3600 * 1000,
                                end=now),
             }))
     dfs = data_fetcher.time_series.fetch_datapoints(
         ["constant3", "constant4"])
     self.assert_data_frame(dfs["constant3"], ["value"], {"value": 3})
     self.assert_data_frame(dfs["constant4"], ["value"], {"value": 4})
Example No. 8
    def predict(self, instance):
        """
        instance:
            Since we're doing scheduled prediction, this will be a data spec
            describing the data we should do prediction on.
        
        Note that it's also possible to take api_key and project in as
        optional arguments here.
        """
        data_fetcher = DataFetcher(instance)
        df = data_fetcher.time_series.fetch_dataframe(["temp", "pressure", "rpm"]).dropna()

        X = df[["temp", "pressure", "rpm"]].values
        df["production_rate"] = self.regressor.predict(X)

        # For scheduled prediction we need to return output in the format:
        # {
        #   "timeSeries":
        #      { "production_rate": [(t0, p0), (t1, p1), (t2, p2), ...] }
        # }
        # We can use a Model Hosting utility method to convert our dataframe
        # to this format.
        return to_output(df[["timestamp", "production_rate"]])
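If to_output were unavailable, the payload described in the comment above could be built by hand; a minimal sketch (the helper name is hypothetical, and it assumes df carries epoch-millisecond "timestamp" values).

    def to_output_by_hand(df):
        # Sketch of the {"timeSeries": {...}} payload described above;
        # this helper is hypothetical, not part of the Model Hosting API.
        pairs = list(zip(df["timestamp"].tolist(), df["production_rate"].tolist()))
        return {"timeSeries": {"production_rate": pairs}}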
Example No. 9
# (beginning of snippet truncated: data_spec = DataSpec(time_series={
#  ... entries for "gas_auto" and "gas_external" elided ...)
                       start=start,
                       end=end,
                       aggregate=aggregate,
                       granularity=granularity),
        "gas_integ_time":
        TimeSeriesSpec(id=4988486819178408,
                       start=start,
                       end=end,
                       aggregate=aggregate,
                       granularity=granularity),
        "gas_gain":
        TimeSeriesSpec(id=3658191334677419, start=start, end=end),
    })

# Now let's fetch the data for our "gas_auto" and "gas_external" time series
data_fetcher = DataFetcher(data_spec,
                           api_key=os.getenv("COGNITE_OID_API_KEY"),
                           project="publicdata")

df = data_fetcher.time_series.fetch_dataframe(["gas_auto", "gas_external"])

print(df.head())

# When using fetch_dataframe, all specified time series must have the same
# start, end, and granularity. To fetch data from time series with different
# specs, we can use fetch_datapoints instead:

dfs = data_fetcher.time_series.fetch_datapoints(["gas_integ_time", "gas_gain"])

print(dfs["gas_gain"].head())
print(dfs["gas_integ_time"].head())
Example No. 10
    def predict(self, instance):
        data_fetcher = DataFetcher(instance)
        df = data_fetcher.time_series.fetch_dataframe(["x1", "x2"])
        df["y"] = df["x2"] / df["x1"]

        return to_output(df.size)
Example No. 11
 def predict(self, instance):
     data_fetcher = DataFetcher(instance)
     df = data_fetcher.time_series.fetch_dataframe(["x1", "x2"])
     df["y"] = (df["x1"] + df["x2"]) / math.pi
     return to_output(df[["y", "timestamp"]])
Example No. 12
 def data_fetcher(self, file_specs, rsps):
     data_spec = DataSpec(files=file_specs)
     data_fetcher = DataFetcher(data_spec)
     return data_fetcher
Example No. 13
def test_get_data_spec():
    # get_data_spec should return a copy: mutating the returned spec must
    # not affect the spec the fetcher was constructed with.
    data_spec = DataSpec(files={"f1": FileSpec(id=123)})
    retrieved_data_spec = DataFetcher(data_spec).get_data_spec()
    retrieved_data_spec.files["f1"].id = 234
    assert data_spec.files["f1"].id == 123
Example No. 14
def test_invalid_spec_type():
    with pytest.raises(SpecValidationError, match="has to be of type"):
        DataFetcher(123)
Example No. 15
def test_empty_data_spec(rsps, data_spec):
    data_fetcher = DataFetcher(data_spec)
    assert data_fetcher.get_data_spec() == DataSpec()
Example No. 16
# (beginning of snippet truncated: data_spec = DataSpec(time_series={
#  ... entries for "gas_auto" and "gas_external" elided ...)
                       end=end,
                       aggregate=aggregate,
                       granularity=granularity),
        "gas_integ_time":
        TimeSeriesSpec(id=4988486819178408,
                       start=start,
                       end=end,
                       aggregate=aggregate,
                       granularity=granularity),
        "gas_gain":
        TimeSeriesSpec(id=3658191334677419, start=start, end=end),
    })

# Now let's fetch the data for our "gas_auto" and "gas_external" time series
data_fetcher = DataFetcher(data_spec,
                           api_key=os.getenv("COGNITE_OID_API_KEY"),
                           project="publicdata",
                           client_name="test-client")

df = data_fetcher.time_series.fetch_dataframe(["gas_auto", "gas_external"])

print(df.head())

# When using fetch_dataframe, all specified time series must have the same
# start, end, and granularity. To fetch data from time series with different
# specs, we can use fetch_datapoints instead:

dfs = data_fetcher.time_series.fetch_datapoints(["gas_integ_time", "gas_gain"])

print(dfs["gas_gain"].head())
print(dfs["gas_integ_time"].head())
Example No. 17
 def predict(self, instance):
     data_fetcher = DataFetcher(instance,
                                client_name="simple-transform-client")
     df = data_fetcher.time_series.fetch_dataframe(["x1", "x2"])
     df["y"] = (df["x1"] + df["x2"]) / math.pi
     return to_output(df[["y"]])
Example No. 18
    def predict(self, instance):
        data_fetcher = DataFetcher(instance, client_name="cprfix-client")
        df = data_fetcher.time_series.fetch_dataframe(["x1", "x2"])
        df["y"] = df["x2"] / df["x1"]

        return to_output(df[["y"]])