示例#1
0
def test_select_one_from_many_times_same_id_should_yield_different_results():
    """
    A one-to-one select_one operation applied to rows that repeat the same
    "DEALER" value must not hand the same "SIM" to every row of that dealer.
    """
    select_sim = four_to_plenty.ops.select_one(
        from_field="DEALER", named_as="SIM", one_to_one=True)

    # Ten customers spread over only two dealers ("a" and "b"): each customer
    # wants a SIM from the dealer they picked.
    customer_ids = build_ids(size=10, prefix="c", max_length=2)
    picked_dealers = ["a", "a", "b", "a", "b", "a", "b", "a", "a", "a"]
    story_data = pd.DataFrame({"DEALER": picked_dealers}, index=customer_ids)

    result, logs = select_sim(story_data)
    logging.info("selected")

    # the operation emits no logs and only adds the "SIM" column
    assert logs == {}
    assert result.columns.tolist() == ["DEALER", "SIM"]

    # Collisions may shrink the resulting index, but if *everything* collided
    # we would be left with just one row per dealer.
    n_rows = result.shape[0]
    assert n_rows > 3

    sims_by_dealer = result.groupby("DEALER")["SIM"]

    # each dealer must have sold more than one distinct SIM
    for dealer in ("a", "b"):
        sold = sims_by_dealer.get_group(dealer).values
        assert len(np.unique(sold)) > 1
def test_make_random_assign_shoud_assign_each_element_only_once():
    """
    make_random_assign(set1, set2) must produce exactly one row per element of
    set1, each paired with a value taken from set2.
    """
    dealers = build_ids(size=10, prefix="DEALER_", max_length=2)
    sims = build_ids(size=1000, prefix="SIM_", max_length=4)

    assignment = make_random_assign(set1=sims, set2=dealers, seed=10)

    # one row per SIM, two columns
    n_rows, n_cols = assignment.shape
    assert (n_rows, n_cols) == (1000, 2)

    # every SIM appears in the result
    assigned_sims = set(assignment["set1"].unique().tolist())
    assert assigned_sims == set(sims)

    # owners can only be known dealers (not all dealers need to be used)
    owners = set(assignment["chosen_from_set2"].unique().tolist())
    assert owners <= set(dealers)
示例#3
0
    def create_dealers_and_sims_stock(self):
        """
        Build the "dealers" population and attach an initial SIM stock to it.

        The stock is stored in a "SIM" relationship created on the dealers
        population: SIM ids are randomly assigned to dealer ids, then one
        extra "broke_dealer" entry owning only 3 SIMs is appended.

        :return: the dealers population
        """
        logging.info("Creating dealer and their SIM stock  ")

        dealer_ids_gen = SequencialGenerator(prefix="DEALER_", max_length=3)
        dealers = self.create_population(name="dealers",
                                         size=params["n_dealers"],
                                         ids_gen=dealer_ids_gen)

        # relationship dealer -> SIM, used to keep track of the stock
        stock = dealers.create_relationship(name="SIM")

        # random distribution of the initial SIMs among the dealers
        initial_sim_ids = build_ids(size=params["n_init_sims_dealer"],
                                    prefix="SIM_")
        assignment = make_random_assign(set1=initial_sim_ids,
                                        set2=dealers.ids,
                                        seed=next(self.seeder))
        stock.add_relations(from_ids=assignment["chosen_from_set2"],
                            to_ids=assignment["set1"])

        # One additional dealer with a stock of only 3 SIMs: it runs out
        # quickly and is there to provoke a lot of failed sales.
        broke_dealer_sims = [
            "SIM_OF_BROKE_DEALER_%d" % sim_no for sim_no in range(3)
        ]
        broken_dealer = pd.DataFrame({
            "DEALER": "broke_dealer",
            "SIM": broke_dealer_sims,
        })
        stock.add_relations(from_ids=broken_dealer["DEALER"],
                            to_ids=broken_dealer["SIM"])

        return dealers
示例#4
0
    def generate(self, size):
        """
        Produce the next batch of ids and advance the internal counter.

        :param size: number of ids requested; coerced to int because it may
            be a float when it results from some scaling, and floating point
            values must never leak into the ids
        :return: the result of build_ids for that batch, built from the
            current counter, the prefix and the max_length of this generator
        """
        batch_size = int(size)
        batch = build_ids(batch_size, self.counter, self.prefix,
                          self.max_length)

        # move the counter forward only once the batch has been built
        self.counter = self.counter + batch_size
        return batch
示例#5
0
def test_select_many_operation_should_join_subsets_of_relationships():
    """
    Exercises select_many through its operation wrapper: the selected sets
    must be joined to the story data as a new "found" column, with the
    requested sizes, no duplicates, and members coming from the right
    "from" side of the relationship.
    """
    quantities = [4, 5, 6, 7, 8]
    request_ids = build_ids(5, prefix="wh_", max_length=2)

    story_data = pd.DataFrame(
        {"let": ["a", "b", "c", "b", "a"], "how_many": quantities},
        index=request_ids)

    select_op = four_to_plenty.ops.select_many(
        from_field="let",
        named_as="found",
        pop=False,
        quantity_field="how_many",
        discard_missing=False,
    )

    selection, logs = select_op(story_data)

    # The result keeps the index of the request, which is what makes a direct
    # merge into the initial request possible.
    result_index = selection.sort_index().index
    request_index = story_data.sort_index().index
    assert result_index.equals(request_index)

    assert selection.columns.tolist() == ["how_many", "let", "found"]

    # four_to_plenty holds more than enough relations: nothing gets capped
    found_sizes = selection["found"].apply(len).tolist()
    assert found_sizes == [4, 5, 6, 7, 8]

    # no element is selected twice, so the union is as large as the total
    all_found = set().union(*selection["found"])
    assert len(all_found) == np.sum([4, 5, 6, 7, 8])

    # each row must only contain "to" ids related to the "let" it asked for
    expected_origin = [
        ("a", ["wh_00", "wh_04"]),
        ("b", ["wh_01", "wh_03"]),
        ("c", ["wh_02"]),
    ]
    for from_id, row_ids in expected_origin:
        allowed_tos = four_to_plenty.get_relations([from_id])["to"]
        for row_id in row_ids:
            for found in selection.loc[row_id, "found"]:
                assert found in allowed_tos.values
示例#6
0
def test_drop_should_remove_the_rows_where_condition_is_true_():
    """
    DropRow(condition_field="cond") must discard the rows whose "cond" value
    is True and leave the remaining rows and all columns untouched.
    """
    value_columns = ["A", "B", "duration"]

    cdrs = pd.DataFrame(np.random.rand(12, 3), columns=value_columns)
    cdrs.index = build_ids(12, prefix="ix_", max_length=2)

    # rows 0-2 and 6-8 are flagged for removal, rows 3-5 and 9-11 are kept
    cdrs["cond"] = [True, True, True, False, False, False] * 2

    drop_op = operations.DropRow(condition_field="cond")
    story_data, all_logs = drop_op(cdrs)

    kept_index = ["ix_%02d" % row_no for row_no in (3, 4, 5, 9, 10, 11)]

    # half of the 12 rows are gone, the 4 columns are all still there
    assert story_data.shape == (6, 4)
    assert story_data.columns.tolist() == value_columns + ["cond"]

    # the surviving values are exactly the ones of the non-flagged rows
    for column in value_columns:
        assert story_data[column].equals(cdrs.loc[kept_index][column])
示例#7
0
def test_select_many_should_return_subsets_of_relationships():
    """
    select_many must return, for each requested row, a set of the requested
    size taken from the relationship, without duplicates, and must be
    reproducible when the relationship's random state is reset to the same
    seed and nothing was removed.
    """

    story_data_index = build_ids(5, prefix="cl_", max_length=1)

    # cheating with the seed for the second part of the test
    four_to_plenty.state = np.random.RandomState(18)
    selection = four_to_plenty.select_many(
        from_ids=pd.Series(["a", "b", "c", "b", "a"], index=story_data_index),
        named_as="selected_sets",

        # On purpose requesting non-integer quantities => these should be
        # rounded to int. It's very common to have them in practice, typically
        # when generating "bulk size" out of a non-integer distribution
        quantities=[4, 5, 6.5, 7.5, 8],
        remove_selected=False,
        discard_empty=False)

    # this index is expected among other things since it allows a direct
    # merge into the initial request
    assert sorted(selection.index.tolist()) == story_data_index
    assert selection.columns.tolist() == ["selected_sets"]

    # no capping should have occurred: four_to_plenty has largely enough
    assert sorted(
        selection["selected_sets"].apply(len).tolist()) == [4, 5, 6, 7, 8]

    # every chosen element should be present at most once
    s = functools.reduce(lambda s1, s2: set(s1) | set(s2),
                         selection["selected_sets"])
    assert len(s) == np.sum([4, 5, 6, 7, 8])

    # selecting the same thing => should return the same result since
    # remove_selected is False and the relationship is seeded
    four_to_plenty.state = np.random.RandomState(18)
    selection_again = four_to_plenty.select_many(
        from_ids=pd.Series(["a", "b", "c", "b", "a"], index=story_data_index),
        named_as="selected_sets",
        quantities=[4, 5, 6, 7, 8],
        remove_selected=False,
        discard_empty=False)

    assert selection.sort_index().index.equals(
        selection_again.sort_index().index)

    # Label-based access through .loc: the former .ix indexer was deprecated
    # in pandas 0.20 and no longer exists in pandas >= 1.0.
    for idx in selection.index:
        first = selection.loc[idx]["selected_sets"].tolist()
        second = selection_again.loc[idx]["selected_sets"].tolist()
        assert first == second
示例#8
0
def test_select_many_with_drop_should_remove_elements():
    """
    select_many with remove_selected=True must remove every selected "to" id
    from the relationship it was selected from.

    The selection is done on a private Relationship so that the shared
    four_to_plenty fixture is not depleted for the other tests.
    """

    story_data_index = build_ids(5, prefix="cl_", max_length=1)

    # makes a copy since we're going to drop some elements
    four_to_plenty_copy = Relationship(seed=1)
    for i in range(100):
        four_to_plenty_copy.add_relations(
            from_ids=["a", "b", "c", "d"],
            to_ids=["a_%d" % i, "b_%d" % i,
                    "c_%d" % i, "d_%d" % i])

    # maps each story row to the "from" side it selects from
    from_ids = pd.Series(["a", "b", "c", "b", "a"], index=story_data_index)

    # select (and drop) from the copy, not from the shared fixture
    selection = four_to_plenty_copy.select_many(from_ids=from_ids,
                                                named_as="selected_sets",
                                                quantities=[4, 5, 6, 7, 8],
                                                remove_selected=True,
                                                discard_empty=False)

    # makes sure all selected values have been removed
    for story_id in selection.index:
        # The relations must be looked up with the "from" id of the row
        # ("a", "b", ...), not with the story row id ("cl_0", ...).
        from_id = from_ids.loc[story_id]
        rels = four_to_plenty_copy.get_relations(from_ids=[from_id])

        # `x in series` tests the index labels, so compare against .values
        remaining_tos = rels["to"].values

        for to_id in selection.loc[story_id]["selected_sets"].tolist():
            assert to_id not in remaining_tos
示例#9
0
    def create_subs_and_sims(self):
        """
        Build the "subs" and "sims" populations and link them together.

        - every subs gets between 1 and 4 distinct operators and one SIM per
          operator, hence sims.size >= subs.size
        - the sims population carries the "OPERATOR", "MAIN_ACCT" and "MSISDN"
          attributes
        - the subs population gets a "SIMS" relationship pointing to the sims
          owned by each subs
        - the sims population gets a weighted "POSSIBLE_AGENTS" relationship
          to the agents where the SIM can be topped up

        :return: tuple (subs population, sims population, generator used to
            initialise "MAIN_ACCT")
        """

        rng = RandomState(seed=next(self.seeder))

        # subs carry no attribute yet: "CELLS" and "EXCITABILITY" are expected
        # to be added later on
        subs = self.create_population(
            name="subs",
            size=self.params["n_subscribers"],
            ids_gen=SequencialGenerator(prefix="SUBS_"))

        # how many operators each subs has (1 to 4), then which ones
        operator_counts = rng.choice(a=range(1, 5), size=subs.size)
        operator_ids = build_ids(size=4, prefix="OPERATOR_", max_length=1)
        operator_probas = [.8, .05, .1, .05]

        # for each subs, a random set of unique operators of the drawn size
        operators_per_subs = [
            rng.choice(a=operator_ids,
                       p=operator_probas,
                       size=count,
                       replace=False).tolist()
            for count in operator_counts
        ]

        # One row per subs, one column for the 1st, 2nd, ... operator. Rows
        # have different lengths, so shorter ones are padded with None; the
        # stack() below discards those entries.
        operators_wide = pd.DataFrame(data=operators_per_subs,
                                      index=subs.ids)

        # Same information in long format: the index holds the subs id (with
        # duplicates) and the value is one of the operators of that subs.
        subs_to_operator = operators_wide.stack()
        subs_to_operator.index = subs_to_operator.index.droplevel(level=1)

        # one SIM per (subs, operator) pair, with its OPERATOR and MAIN_ACCT
        sims = self.create_population(
            name="sims",
            size=subs_to_operator.size,
            ids_gen=SequencialGenerator(prefix="SIMS_"))
        sims.create_attribute("OPERATOR", init_values=subs_to_operator.values)

        recharge_gen = ConstantGenerator(value=1000.)
        sims.create_attribute(name="MAIN_ACCT", init_gen=recharge_gen)

        # relationship subs -> sims, recording which subs owns which SIM
        owned_sims = subs.create_relationship("SIMS")
        owned_sims.add_relations(from_ids=subs_to_operator.index,
                                 to_ids=sims.ids)

        belgian_mobile_prefixes = [
            "472", "473", "475", "476", "477", "478", "479"
        ]
        msisdn_gen = MSISDNGenerator(countrycode="0032",
                                     prefix_list=belgian_mobile_prefixes,
                                     length=6,
                                     seed=next(self.seeder))
        sims.create_attribute(name="MSISDN", init_gen=msisdn_gen)

        # Last relationship: the set of shops where each SIM can be topped up.
        # TODO: to make this a bit more realistic, we should probably generate
        # such relationship first from the subs to their favourite shops, and
        # then copy that info to each SIM, maybe with some fluctuations to
        # account for the fact that not all shops provide topups of all
        # operators.
        agents = build_ids(self.params["n_agents"],
                           prefix="AGENT_",
                           max_length=3)

        sim_agent_pairs = make_random_bipartite_data(
            sims.ids, agents, 0.3, seed=next(self.seeder))
        agent_df = pd.DataFrame.from_records(sim_agent_pairs,
                                             columns=["SIM_ID", "AGENT"])

        logging.info(" creating random sim/agent relationship ")
        possible_agents = sims.create_relationship("POSSIBLE_AGENTS")

        # one exponentially distributed weight per (SIM, agent) pair
        agent_weight_gen = NumpyRandomGenerator(method="exponential",
                                                scale=1.,
                                                seed=next(self.seeder))
        n_pairs = agent_df.shape[0]
        pair_weights = agent_weight_gen.generate(n_pairs)

        possible_agents.add_relations(from_ids=agent_df["SIM_ID"],
                                      to_ids=agent_df["AGENT"],
                                      weights=pair_weights)

        return subs, sims, recharge_gen