def test_select_one_from_empty_rel_should_return_empty_if_not_keep_missing():
    """select_one on an empty relationship, discarding empty results."""
    rel = Relationship(seed=1)

    result = rel.select_one(from_ids=["non_existing"], discard_empty=True)

    # no row at all, but the two expected columns are still there
    n_rows, n_cols = result.shape
    assert (n_rows, n_cols) == (0, 2)
    assert result.columns.tolist() == ["from", "to"]
def test_select_one_from_all_ids_should_return_one_line_per_id():
    """select_one without any from_ids must cover every known "from" id."""
    rel = Relationship(seed=1)
    rel.add_relations(from_ids=["a", "b", "b", "c"],
                      to_ids=["b", "c", "a", "b"])

    picked = rel.select_one()

    distinct_froms = set(picked["from"].unique())
    assert distinct_froms == {"a", "b", "c"}
def test_select_all_function_from_empty_relationship_should_return_empty():
    """select_all_horizontal on an empty relationship gives an empty frame."""
    rel = Relationship(seed=1)

    result = rel.select_all_horizontal(from_ids=["non_existing"])

    # zero rows, and still exactly the "from" and "to" columns
    n_rows, n_cols = result.shape
    assert (n_rows, n_cols) == (0, 2)
    assert result.columns.tolist() == ["from", "to"]
def test_select_one_from_empty_rel_should_return_none_if_keep_missing():
    """With discard_empty=False, an unknown id is kept and mapped to None."""
    rel = Relationship(seed=1)

    result = rel.select_one(from_ids=["non_existing"], discard_empty=False)

    assert result.shape == (1, 2)
    assert result.columns.tolist() == ["from", "to"]

    # the single row echoes the requested id, with no "to" counterpart
    only_row = result.iloc[0]
    assert only_row["from"] == "non_existing"
    assert only_row["to"] is None
def test_select_one_nonexistingids_should_return_empty_if_not_keep_missing():
    """Unknown ids yield no row at all when empty results are discarded."""
    rel = Relationship(seed=1)
    rel.add_relations(from_ids=["a", "b", "b", "c"],
                      to_ids=["b", "c", "a", "b"])

    unknown_ids = ["non_existing_id", "neither"]
    selected = rel.select_one(unknown_ids, discard_empty=True)

    n_rows = selected.shape[0]
    assert n_rows == 0
    assert selected.columns.tolist() == ["from", "to"]
def test_weighted_relationship_should_take_weights_into_account():
    """select_one must honour the weights given to add_relations."""
    # a, b and c are each connected to x, y and z, but every weight is 0
    # except the one towards y
    froms = ["a"] * 3 + ["b"] * 3 + ["c"] * 3
    tos = ["x", "y", "z"] * 3
    weights = [0, 1, 0] * 3

    rel = Relationship(seed=1234)
    rel.add_relations(from_ids=froms, to_ids=tos, weights=weights)

    picked = rel.select_one()

    # => with those weights, y is the only value that can be selected
    assert picked["to"].tolist() == ["y", "y", "y"]
    assert sorted(picked["from"].tolist()) == ["a", "b", "c"]
def test_weighted_relationship_should_take_overridden_weights_into_account():
    """Weights passed to select_one replace the ones stored at insertion."""
    # a, b and c are each connected to x, y and z, but every stored weight
    # is 0 except the one towards y
    froms = ["a"] * 3 + ["b"] * 3 + ["c"] * 3
    tos = ["x", "y", "z"] * 3
    stored_weights = [0, 1, 0] * 3

    rel = Relationship(seed=1234)
    rel.add_relations(from_ids=froms, to_ids=tos, weights=stored_weights)

    # an override specifies exactly one weight per "to" value
    to_weights = pd.Series(data=[0, 0, 1], index=["x", "y", "z"])

    picked = rel.select_one(overridden_to_weights=to_weights)

    # the stored weights are ignored in favour of the override, so z is now
    # the only value that can be selected
    assert picked["to"].tolist() == ["z", "z", "z"]
    assert sorted(picked["from"].tolist()) == ["a", "b", "c"]
def test_add_grouped():
    """ops.add_grouped adds one relation per item of each grouped list."""
    fruits_per_box = {
        "b1": ["f11", "f12", "f13", "f14"],
        "b2": ["f21", "f22", "f23", "f24"],
    }
    story_data = pd.DataFrame({
        "boxes": ["b1", "b2"],
        "fruits": [fruits_per_box["b1"], fruits_per_box["b2"]],
    })

    rel = Relationship(seed=1)
    operation = rel.ops.add_grouped(from_field="boxes",
                                    grouped_items_field="fruits")
    operation(story_data)

    # each box should be the origin of 4 relations...
    for box in ("b1", "b2"):
        froms = rel.get_relations(from_ids=[box])["from"].tolist()
        assert froms == [box] * 4

    # ...pointing to each of the fruits listed for that box
    for box in ("b1", "b2"):
        tos = rel.get_relations(from_ids=[box])["to"].tolist()
        assert tos == fruits_per_box[box]
def create_relationship(self, name, seed=None):
    """
    Creates an empty relationship from the members of this population and
    registers it under the given name.

    :param name: key under which the relationship is stored in
        self.relationships
    :param seed: seed handed over to the Relationship. When None, the next
        value of self.circus.seeder is used instead.
    :return: the newly created Relationship
    :raises ValueError: if a relationship is already registered under name
    """
    # membership test: an identity test against the dict itself could never
    # be true, which let a second creation silently overwrite the first one
    if name in self.relationships:
        raise ValueError("cannot create a second relationship with "
                         "existing name {}".format(name))

    # explicit None check so that a caller-provided seed of 0 is honoured
    if seed is None:
        seed = next(self.circus.seeder)

    relationship = Relationship(seed=seed)
    self.relationships[name] = relationship
    return relationship
def test_select_one_nonexistingids_should_insert_none_if_keep_missing():
    """With discard_empty=False, unknown ids are kept with a None "to"."""
    rel = Relationship(seed=1)
    rel.add_relations(from_ids=["a", "b", "b", "c"],
                      to_ids=["a1", "b1", "b2", "c1"])

    requested = ["c", "b_non_existing_id", "a", "neither", "a"]
    selected = rel.select_one(requested, discard_empty=False)

    # one output row per requested id, duplicates included
    assert selected.shape[0] == 5
    assert selected.columns.tolist() == ["from", "to"]

    # expected (from, to) pairs once sorted by "from"
    expected_pairs = [
        ("a", "a1"),
        ("a", "a1"),
        ("b_non_existing_id", None),
        ("c", "c1"),
        ("neither", None),
    ]
    ordered = selected.sort_values("from")
    assert ordered["from"].tolist() == [pair[0] for pair in expected_pairs]
    assert ordered["to"].tolist() == [pair[1] for pair in expected_pairs]
def test_select_many_with_drop_should_remove_elements():
    """select_many(remove_selected=True) must drop every selected "to" id
    from the relationship it was selected from."""
    story_data_index = build_ids(5, prefix="cl_", max_length=1)

    # works on a private relationship since we're going to drop some
    # elements: the shared module-level fixtures must stay untouched
    four_to_plenty_copy = Relationship(seed=1)
    for i in range(100):
        four_to_plenty_copy.add_relations(
            from_ids=["a", "b", "c", "d"],
            to_ids=["a_%d" % i, "b_%d" % i, "c_%d" % i, "d_%d" % i])

    # kept in a variable: the selection is indexed like this Series, so it
    # is needed afterwards to map each story id back to its "from" id
    from_ids = pd.Series(["a", "b", "c", "b", "a"], index=story_data_index)

    # the selection has to be made on the copy, otherwise nothing is ever
    # removed from the relationship inspected below
    selection = four_to_plenty_copy.select_many(
        from_ids=from_ids,
        named_as="selected_sets",
        quantities=[4, 5, 6, 7, 8],
        remove_selected=True,
        discard_empty=False)

    # makes sure all selected values have been removed
    for story_id in selection.index:
        from_id = from_ids[story_id]
        rels = four_to_plenty_copy.get_relations(from_ids=[from_id])

        # `in` on a Series tests its index, hence the explicit set of values
        remaining_tos = set(rels["to"])

        for to_id in selection.loc[story_id]["selected_sets"]:
            assert to_id not in remaining_tos
def test_io_round_trip():
    """A relationship saved to CSV and loaded back must be equivalent."""

    def ordered_relations(relationship):
        # relations in a deterministic order, so that both sides align
        relations = relationship.get_relations()
        return relations.sort_values(["from", "to"]).reset_index()

    with path.tempdir() as tmp_dir:
        csv_path = os.path.join(tmp_dir, "relationship.csv")
        four_to_plenty.save_to(csv_path)
        reloaded = Relationship.load_from(csv_path)

        assert four_to_plenty.seed == reloaded.seed
        assert four_to_plenty.unique_tos() == reloaded.unique_tos()
        assert four_to_plenty.grouped.keys() == reloaded.grouped.keys()

        expected = ordered_relations(four_to_plenty)
        actual = ordered_relations(reloaded)

        for column in ("from", "to", "weight"):
            assert expected[column].equals(actual[column])
def load_from(folder, circus):
    """
    Reads all persistent data of this population and loads it

    :param folder: folder containing all CSV files of this population
    :param circus: parent circus containing this population
    :return: a Population holding the ids, attributes and relationships
        that were read from the folder
    """

    def load_all(sub_folder, loader):
        # One entry per file found in the sub-folder, keyed by the file name
        # minus its last 4 characters (presumably the ".csv" extension).
        # A missing sub-folder simply means there is nothing to load.
        directory = os.path.join(folder, sub_folder)
        if not os.path.exists(directory):
            return {}

        loaded = {}
        for filename in os.listdir(directory):
            loaded[filename[:-4]] = loader(os.path.join(directory, filename))
        return loaded

    # only the index of ids.csv is kept: it holds the ids of the members
    ids_file = os.path.join(folder, "ids.csv")
    member_ids = pd.read_csv(ids_file, index_col=0, names=[]).index

    loaded_attributes = load_all("attributes", Attribute.load_from)
    loaded_relationships = load_all("relationships", Relationship.load_from)

    # starts from an empty population, then plugs the loaded state into it
    population = Population(circus=circus, size=0)
    population.attributes = loaded_attributes
    population.relationships = loaded_relationships
    population.ids = member_ids
    population.size = len(member_ids)

    return population
def test_pop_one_relationship_should_remove_element():
    """select_one(remove_selected=True) pops the selected relations."""
    popped_froms = ["a", "d"]
    untouched_froms = {"b", "c", "e"}

    # relations are removed below => working on a private relationship in
    # order not to influence other tests
    oneto1_copy = Relationship(seed=1)
    oneto1_copy.add_relations(from_ids=["a", "b", "c", "d", "e"],
                              to_ids=["ta", "tb", "tc", "td", "te"])

    # first pop: the unique "to" value of each requested id is returned...
    first = oneto1_copy.select_one(from_ids=list(popped_froms),
                                   remove_selected=True)
    assert first.sort_values("from")["to"].tolist() == ["ta", "td"]
    assert first.columns.tolist() == ["from", "to"]

    # ...and removed from the relationship
    assert set(oneto1_copy.grouped.keys()) == untouched_froms

    # second pop of the same ids: nothing is left to return...
    second = oneto1_copy.select_one(from_ids=list(popped_froms),
                                    remove_selected=True)
    assert second.shape[0] == 0
    assert second.columns.tolist() == ["from", "to"]

    # ...and the relationship is not impacted
    assert set(oneto1_copy.grouped.keys()) == untouched_froms

    # third pop, this time keeping the empty results: one row per requested
    # id, each with None as "to"
    third = oneto1_copy.select_one(from_ids=list(popped_froms),
                                   remove_selected=True,
                                   discard_empty=False)
    assert third.shape[0] == 2
    assert sorted(third.columns.tolist()) == ["from", "to"]
    assert third["to"].tolist() == [None, None]
    assert sorted(third["from"].tolist()) == ["a", "d"]
def test_seeded_relationship_should_always_return_same_selection():
    """Two identically seeded and filled relationships select the same."""
    from_ids = ["a", "a", "a", "b", "b", "b", "c", "c", "c"]
    to_ids = ["af1", "af2", "af3", "bf1", "bf2", "bf3", "cf1", "cf2", "cf3"]

    # two relationships with the same seed and the same content
    first = Relationship(seed=1345)
    second = Relationship(seed=1345)
    first.add_relations(from_ids=from_ids, to_ids=to_ids)
    second.add_relations(from_ids=from_ids, to_ids=to_ids)

    # the same sequence of queries must give the same sequence of results
    for queried in (["a"], ["b"], ["a", "b", "d"]):
        from_first = first.select_one(from_ids=list(queried))
        from_second = second.select_one(from_ids=list(queried))
        assert from_first.equals(from_second)
import path import pandas as pd import logging import os import numpy as np import functools from trumania.core.util_functions import setup_logging from trumania.core.util_functions import build_ids from trumania.core.relationship import Relationship setup_logging() oneto1 = Relationship(seed=1) oneto1.add_relations(from_ids=pd.Series(["a", "b", "c", "d", "e"]), to_ids=pd.Series(["ta", "tb", "tc", "td", "te"])) four_to_one = Relationship(seed=1) four_to_one.add_relations(from_ids=pd.Series(["a", "b", "c", "d"]), to_ids=pd.Series(["z", "z", "z", "z"])) four_to_two = Relationship(seed=1) four_to_two.add_relations(from_ids=pd.Series(["a", "b", "c", "d"]), to_ids=pd.Series(["y", "y", "y", "y"])) four_to_two.add_relations(from_ids=pd.Series(["a", "b", "c", "d"]), to_ids=pd.Series(["z", "z", "z", "z"])) two_per_from = Relationship(seed=1) two_per_from.add_relations(from_ids=pd.Series(["a", "b", "c", "d"]), to_ids=pd.Series(["ya", "yb", "yc", "yd"])) two_per_from.add_relations(from_ids=pd.Series(["a", "b", "c", "d"]),
def test_select_many_several_times_with_pop_should_empty_all_data():
    """Repeated select_many(remove_selected=True) drains the relationship."""
    rel = Relationship(seed=1234)

    froms = ["id1"] * 2500 + ["id2"] * 1500 + ["id3"] * 500
    tos = np.random.choice(a=range(10), size=len(froms))
    rel.add_relations(from_ids=froms, to_ids=tos)

    assert rel.get_relations().shape[0] == 2500 + 1500 + 500

    # 1000 values are requested from each of the 3 ids, 4 times in a row.
    # Each round lists the sizes expected for f1, f2 and f3, then the number
    # of relations expected to remain once the selected ones are removed.
    rounds = [
        # id3 only holds 500 relations: it is exhausted right away
        ([1000, 1000, 500], 1500 + 500 + 0),
        # only 500 are left for id2, and nothing for id3, which just gets an
        # empty selection (since discard_empty is False)
        ([1000, 500, 0], 500 + 0 + 0),
        # id1 gives its last 500: the relationship is now empty
        ([500, 0, 0], 0 + 0 + 0),
        # selecting from a fully empty relationship returns nothing, and the
        # relationship stays empty
        ([0, 0, 0], 0),
    ]

    for expected_sizes, expected_remaining in rounds:
        selection = rel.select_many(
            from_ids=pd.Series(["id1", "id2", "id3"],
                               index=["f1", "f2", "f3"]),
            named_as="the_selection",
            quantities=[1000, 1000, 1000],
            remove_selected=True,
            discard_empty=False)

        assert selection.columns.tolist() == ["the_selection"]
        assert sorted(selection.index.tolist()) == ["f1", "f2", "f3"]

        selection_sizes = selection["the_selection"].map(len)
        assert selection_sizes[["f1", "f2", "f3"]].tolist() == expected_sizes

        # remove_selected => the relationship shrinks by what was selected
        assert rel.get_relations().shape[0] == expected_remaining
def test_select_one_from_empty_relationship_should_return_void(): tested = Relationship(seed=1) result = tested.select_one(pd.Series([])) assert result.shape[0] == 0 assert result.columns.tolist() == ["from", "to"]