예제 #1
0
 def test_serialize_deserialize_is_invariant(self, input_dataframe):
     """Round-tripping a frame through the json serializer is lossless."""
     pd = pytest.importorskip("pandas", reason="Pandas not installed")
     serializer = PandasSerializer("json")
     round_tripped = serializer.deserialize(serializer.serialize(input_dataframe))
     pd.testing.assert_frame_equal(input_dataframe, round_tripped)
예제 #2
0
 def test_serialize_kwargs_work_as_expected(self, input_dataframe):
     """serialize_kwargs are forwarded to pandas on the write side."""
     pd = pytest.importorskip("pandas", reason="Pandas not installed")
     write_opts = {"sep": ":", "index": False}
     serializer = PandasSerializer("csv", serialize_kwargs=write_opts)
     round_tripped = serializer.deserialize(serializer.serialize(input_dataframe))
     # Writing with ':' but reading back with the default ',' separator
     # collapses each row into a single string column named "one:two".
     expected = pd.DataFrame({"one:two": ["1:4", "2:5", "3:6"]})
     pd.testing.assert_frame_equal(expected, round_tripped)
예제 #3
0
 def test_deserialize_kwargs_work_as_expected(self, input_dataframe):
     """deserialize_kwargs are forwarded to pandas on the read side."""
     pd = pytest.importorskip("pandas", reason="Pandas not installed")
     np = pytest.importorskip("numpy", reason="numpy not installed")
     read_opts = {"na_values": [3, 5]}
     serializer = PandasSerializer("csv", deserialize_kwargs=read_opts)
     round_tripped = serializer.deserialize(serializer.serialize(input_dataframe))
     # The csv round trip surfaces the old index as "Unnamed: 0", and the
     # requested na_values come back as NaN.
     expected = pd.DataFrame({
         "Unnamed: 0": [0, 1, 2],
         "one": [1, 2, np.nan],
         "two": [4, np.nan, 6],
     })
     pd.testing.assert_frame_equal(expected, round_tripped)
예제 #4
0
 def test_serialize_returns_bytes(self, file_type, input_dataframe):
     """serialize() must produce raw bytes for every supported file type."""
     # importorskip is called for its skip-if-missing side effect only;
     # the returned module object was previously bound but never used.
     pytest.importorskip("pandas", reason="Pandas not installed")
     serialized = PandasSerializer(file_type).serialize(input_dataframe)
     assert isinstance(serialized, bytes)
예제 #5
0
 def test_complains_when_unavailable_file_type_specified(self):
     """An unknown file type must raise ValueError at construction time."""
     # importorskip is called for its skip-if-missing side effect only;
     # the returned module object was previously bound but never used.
     pytest.importorskip("pandas", reason="Pandas not installed")
     with pytest.raises(ValueError):
         PandasSerializer("blerg")
예제 #6
0
def test_pandas_serializer_equality():
    """Equality depends on serializer class, file type, and both kwargs dicts."""
    # importorskip is called for its skip-if-missing side effect only;
    # the returned module object was previously bound but never used.
    pytest.importorskip("pandas", reason="Pandas not installed")
    # Different serializer classes never compare equal.
    assert PickleSerializer() != PandasSerializer("csv")
    # Same file type (and same/matching kwargs) compare equal.
    assert PandasSerializer("csv") == PandasSerializer("csv")
    assert PandasSerializer("csv",
                            serialize_kwargs={"one": 1}) == PandasSerializer(
                                "csv", serialize_kwargs={"one": 1})
    # Any differing component breaks equality.
    assert PandasSerializer("csv") != PandasSerializer("parquet")
    assert PandasSerializer("csv", deserialize_kwargs={
        "one": 1
    }) != PandasSerializer("csv", deserialize_kwargs={"one": 2})
    assert PandasSerializer("csv", serialize_kwargs={
        "one": 1
    }) != PandasSerializer("csv", serialize_kwargs={"one": 2})
예제 #7
0
def _models_csv_result(filename: str) -> LocalResult:
    """Build a LocalResult that checkpoints a frame as a '|'-separated csv.

    The ``location`` is a Prefect template string: ``{output_dir}`` is
    filled in at runtime and the file lands in the ``models`` subfolder.
    """
    return LocalResult(
        dir=".",
        location=f"{{output_dir}}/models/{filename}",
        serializer=PandasSerializer(file_type="csv",
                                    serialize_kwargs={"sep": "|"}),
    )


def gen_data_pipeline() -> Flow:
    """Split the entire input set into train, tune and holdout pieces.

    Saves ``train.csv``, ``tune.csv`` and ``holdout.csv`` to the
    ``models`` subfolder (under the runtime ``output_dir``) as
    '|'-separated csv checkpoints.

    Returns
    -------
    Flow
        Generated pipeline.
    """
    # Initialize tasks
    load = LoadData(name="Load clean model data")
    format_data = SurvivalData(name="Convert input data to ranged form")
    segdata = SegmentData(name="Split data")
    # The three retrieval tasks share the same checkpointing setup and
    # differ only in name and target file.
    retrieve_train = GetItem(
        name="Get training data",
        checkpoint=True,
        result=_models_csv_result("train.csv"),
    )
    retrieve_tune = GetItem(
        name="Get tuning data",
        checkpoint=True,
        result=_models_csv_result("tune.csv"),
    )
    retrieve_hold = GetItem(
        name="Get holdout data",
        checkpoint=True,
        result=_models_csv_result("holdout.csv"),
    )
    # Generate the flow
    with Flow(name="Split data into build and holdout") as flow:
        # Set up parameters
        data_dir = Parameter("data_dir", "nba-data")
        splits = Parameter("splits", [0.6, 0.2, 0.2])
        seed = Parameter("seed", 42)
        # Load the data
        basedata = load(data_dir=data_dir)
        # Format the data into ranged (survival) form
        alldata = format_data(basedata)
        # Split and checkpoint each piece via the retrieval tasks
        data = segdata(alldata,
                       splits=splits,
                       keys=["train", "tune", "holdout"],
                       seed=seed)
        _ = retrieve_train(task_result=data, key="train")
        _ = retrieve_tune(task_result=data, key="tune")
        _ = retrieve_hold(task_result=data, key="holdout")

    return flow
예제 #8
0
    end_date_string = timestamp_to_date_string(kwargs['end_date'])
    return f"d_{dataset}_w_{window_size}_o_{window_offset}_s_{start_date_string}_f_{end_date_string}.csv"


@task
def create_feature_store(dataset, window_size, window_offset):
    """Prefect task: construct a FeatureStore over the given window."""
    store = FeatureStore(dataset,
                         window_size=window_size,
                         window_offset=window_offset)
    return store


@task(target=generate_task_run_target_name,
      checkpoint=True,
      result=LocalResult(dir="../../data/processed/prefect_results",
                         serializer=PandasSerializer(
                             file_type="csv",
                             serialize_kwargs={"index": False})))
def create_features(feature_store, start_date, end_date):
    """Prefect task: collect samples from the store between two timestamps.

    start_date/end_date are unix timestamps; the result is checkpointed
    as an index-free csv via the decorator's LocalResult.
    """
    begin = datetime.fromtimestamp(start_date)
    finish = datetime.fromtimestamp(end_date)
    feature_store.set_pointer(begin)
    return feature_store.next_samples_until(finish)


@task
def add_label_column(df, bots):
    """Prefect task: add an integer 'isBot' column (1 if row's IP is in bots)."""
    is_bot = df["IP"].isin(bots)
    df["isBot"] = is_bot.astype(int)
    return df


def build_feature_pipeline_flow():
    with Flow("feature-pipeline") as feature_pipeline:
        dataset = Parameter("dataset")
예제 #9
0
from prefect import task, Flow, Parameter
from embedding import college_embeddings, college_facts, create_wiki, create_dict
from prefect.engine.results import LocalResult
from prefect.engine.serializers import PandasSerializer
# Wire the embedding helpers into a Prefect flow.
#
# NOTE(review): the original applied ``@task(...)`` decorators directly to
# assignment statements, which is a SyntaxError (a decorator may only
# precede a ``def`` or ``class``). Each step is now wrapped with
# ``task(fn, ...)`` at call time, preserving the evident intent: run the
# helper as a task and checkpoint its return value as csv via LocalResult.
with Flow("data analysis") as flow:
    # When True, downstream steps may skip expensive recomputation.
    bypass = Parameter("bypass", default=False, required=False)

    def _csv_result(location):
        # Checkpoint a task's return value as csv at the given path.
        return LocalResult(serializer=PandasSerializer(file_type='csv'),
                           dir='/', location=location)

    college_fact = task(college_facts, log_stdout=True, nout=4,
                        result=_csv_result("facts.csv"))(bypass=bypass)
    wiki_list = task(create_wiki, log_stdout=True, nout=4,
                     result=_csv_result("wiki_list.csv"))(
                         college_fact, bypass=bypass)
    # Renamed from ``dict`` — the original name shadowed the builtin.
    embedding_dict = task(create_dict, log_stdout=True, nout=4,
                          result=_csv_result("embedding_dict.csv"))(
                              wiki_list, bypass=bypass)
    college_embedding = task(college_embeddings, log_stdout=True, nout=4,
                             result=_csv_result("college_embeddings.csv"))(
                                 embedding_dict, wiki_list, college_fact,
                                 bypass=bypass)

# Register with the Prefect backend, then run once locally.
flow.register(project_name="college")
# LocalAgent().start()
flow.run(bypass=False)