def add_periodic_restock_story_with_combined_generator(the_circus):
    """
    This is a variation of add_periodic_restock_story that shows how to
    obtain the same result by plugging generators into each other instead
    of explicitly generating intermediary fields in the story_data.

    :param the_circus: the circus containing the "point_of_sale" population
        and the "items_gen" generator; a "restock" story is added to it.
    """
    pos = the_circus.populations["point_of_sale"]

    # using this timer means POS are more likely to trigger a re-stock during
    # day hours rather than at night.
    timer_gen = profilers.DefaultDailyTimerGenerator(
        clock=the_circus.clock, seed=next(the_circus.seeder))

    restock_story = the_circus.create_story(
        name="restock",
        initiating_population=pos,
        member_id_field="POS_ID",
        timer_gen=timer_gen,

        # Using a ConstantGenerator here means each POS will have the same
        # activity level of exactly one story per day on average. Since
        # the time itself is random, the period between 2 restocks will in
        # general not be exactly 7 days.
        activity_gen=gen.ConstantGenerator(value=timer_gen.activity(
            n=1, per=pd.Timedelta("7 days")
        )),
    )

    # restock volume: how many new items are created per restock event
    stock_size_gen = gen.NumpyRandomGenerator(method="choice",
                                              a=[5, 15, 20, 25],
                                              p=[0.1, 0.2, 0.5, 0.2],
                                              seed=next(the_circus.seeder))

    # the stock size generator is plugged directly into the bulk item
    # generator, so no intermediary "RESTOCK_VOLUME" field is needed in the
    # story_data (contrast with add_periodic_restock_story)
    item_bulk_gen = stock_size_gen.flatmap(
        gen.DependentBulkGenerator(
            element_generator=the_circus.generators["items_gen"])
    )

    restock_story.set_operations(
        the_circus.clock.ops.timestamp(named_as="TIME", log_format="%Y-%m-%d"),

        # include the POS NAME attribute as a field name "POS_NAME"
        pos.ops.lookup(id_field="POS_ID", select={"NAME": "POS_NAME"}),

        # this explicit volume generation step (used by the non-combined
        # variant) is replaced by the flatmap-ed item_bulk_gen below:
        # stock_size_gen.ops.generate(named_as="RESTOCK_VOLUME"),

        item_bulk_gen.ops.generate(named_as="NEW_ITEM_IDS"),

        # record the new items into the POS "items" relationship
        pos.get_relationship("items").ops.add_grouped(
            from_field="POS_ID",
            grouped_items_field="NEW_ITEM_IDS"),

        ops.FieldLogger(log_id="restock", cols=["TIME", "POS_ID", "POS_NAME"])
    )
def add_user_preference_story(the_circus):
    """
    Creates the "prefer_events" story: each user records a random
    preference score in [-1, 1) for one randomly selected product.
    """
    user_pop = the_circus.populations["users"]
    product_pop = the_circus.populations["products"]

    # timer value = number of clock steps in 24h minus 1 — presumably so
    # each member fires once per day; confirm against
    # ConstantDependentGenerator semantics
    daily_timer = gen.ConstantDependentGenerator(
        value=the_circus.clock.n_iterations(duration=pd.Timedelta("24h")) - 1)

    preference_story = the_circus.create_story(
        name="prefer_events",
        initiating_population=user_pop,
        member_id_field="UID",
        timer_gen=daily_timer,
    )

    # uniform preference score in [-1, 1)
    preference_gen = gen.NumpyRandomGenerator(
        method="uniform", low=-1, high=1, seed=next(the_circus.seeder))

    preference_story.set_operations(
        # enrich each event with the user's name attributes
        user_pop.ops.lookup(id_field="UID",
                            select={
                                "FIRST_NAME": "USER_FIRST_NAME",
                                "LAST_NAME": "USER_LAST_NAME",
                            }),

        # Add user preference value
        preference_gen.ops.generate(named_as="PREFERENCE"),

        # Picks a product at random
        product_pop.ops.select_one(named_as="PRODUCT_ID"),

        # Add timestamp column
        the_circus.clock.ops.timestamp(named_as="DATETIME"),

        ops.FieldLogger("prefer_events"))
def add_inactive_restock_story(the_circus):
    """
    Same structure as add_periodic_restock_story(), except the story is
    created without any timer generator nor activity level => as-is, this
    story never triggers on its own.
    """
    pos_pop = the_circus.populations["point_of_sale"]
    items_rel = pos_pop.get_relationship("items")

    story = the_circus.create_story(
        name="restock",
        initiating_population=pos_pop,
        member_id_field="POS_ID")

    # number of items added per restock event
    volume_gen = gen.NumpyRandomGenerator(method="choice",
                                          a=[5, 15, 20, 25],
                                          p=[0.1, 0.2, 0.5, 0.2],
                                          seed=next(the_circus.seeder))

    # creates as many new item ids as the observed volume field requires
    new_items_gen = gen.DependentBulkGenerator(
        element_generator=the_circus.generators["items_gen"])

    story.set_operations(
        the_circus.clock.ops.timestamp(named_as="TIME"),

        # report the POS NAME attribute under the field "POS_NAME"
        pos_pop.ops.lookup(id_field="POS_ID", select={"NAME": "POS_NAME"}),

        # stock level before the restock
        items_rel.ops.get_neighbourhood_size(
            from_field="POS_ID",
            named_as="PREV_STOCK_LEVEL"),

        volume_gen.ops.generate(named_as="RESTOCK_VOLUME"),

        new_items_gen.ops.generate(named_as="NEW_ITEM_IDS",
                                   observed_field="RESTOCK_VOLUME"),

        items_rel.ops.add_grouped(from_field="POS_ID",
                                  grouped_items_field="NEW_ITEM_IDS"),

        # stock level after the restock
        items_rel.ops.get_neighbourhood_size(
            from_field="POS_ID",
            named_as="NEW_STOCK_LEVEL"),

        ops.FieldLogger(log_id="restock",
                        cols=["TIME", "POS_ID", "POS_NAME",
                              "RESTOCK_VOLUME", "PREV_STOCK_LEVEL",
                              "NEW_STOCK_LEVEL"])
    )
def select_one(self, named_as):
    """
    Appends a field column to the story_data containing member ids taken
    at random among the ids of this population.

    This is similar to relationship_select_one(), except that no
    particular relation is required, we just sample one id randomly

    :param named_as: the name of the field added to the story_data
    """
    # named "id_gen" (not "gen") to avoid shadowing the generator module
    # alias commonly used in this codebase
    id_gen = random_generators.NumpyRandomGenerator(
        method="choice",
        a=self.population.ids,
        seed=next(self.population.circus.seeder))

    return id_gen.ops.generate(named_as=named_as)
def add_listen_story(the_circus):
    """
    Creates the "listen_events" story: during work hours, users pick a
    random genre, then a random song of that genre.
    """
    user_pop = the_circus.populations["user"]

    # restricts listen events to work hours
    work_hours_timer = profilers.WorkHoursTimerGenerator(
        clock=the_circus.clock,
        seed=next(the_circus.seeder))

    # activity level drawn from a "truncated normal" distribution centered
    # on 20 events per day, clamped to [10, 30] so that very high and very
    # low activities are prevented
    activity_gen = gen.NumpyRandomGenerator(
        method="normal",
        seed=next(the_circus.seeder),
        loc=work_hours_timer.activity(n=20, per=pd.Timedelta("1 day")),
        scale=5).map(ops.bound_value(lb=10, ub=30))

    listen_story = the_circus.create_story(
        name="listen_events",
        initiating_population=user_pop,
        member_id_field="UID",
        timer_gen=work_hours_timer,
        activity_gen=activity_gen)

    music_repo = the_circus.populations["music_repository"]

    listen_story.set_operations(
        user_pop.ops.lookup(id_field="UID",
                            select={
                                "FIRST_NAME": "USER_FIRST_NAME",
                                "LAST_NAME": "USER_LAST_NAME",
                            }),

        # random genre, then a random song within that genre
        music_repo.ops.select_one(named_as="GENRE"),
        music_repo.get_relationship("songs").ops.select_one(
            from_field="GENRE",
            named_as="SONG_ID"),

        ops.FieldLogger("events"))
def select_one(self, named_as, weight_attribute_name=None):
    """
    Appends a field column to the story_data containing member ids taken
    at random among the ids of this population.

    This is similar to relationship_select_one(), except that no
    particular relation is required.

    It will select one randomly by default, but a weight attribute name
    can be provided as well to give a weight to your selection.

    :param named_as: the name of the field added to the story_data
    :param weight_attribute_name: the attribute name which contains the
        weights you want to use for the selection
    :raises ValueError: if the weight attribute contains negative values
        or sums up to zero
    """
    p = None
    if weight_attribute_name:
        weights = self.population.get_attribute(
            weight_attribute_name).get_values()

        if np.any(weights < 0):
            raise ValueError(
                "weight_attribute_name contains negative values: cannot use that as weight")

        normalization_factor = weights.sum()
        if normalization_factor == 0:
            raise ValueError(
                "weight_attribute_name in population select.one sum up to zero: "
                "cannot use that as weight")

        # reuse the already-computed sum instead of summing a second time
        p = weights / normalization_factor

    # named "id_gen" (not "gen") to avoid shadowing the generator module
    # alias commonly used in this codebase
    id_gen = random_generators.NumpyRandomGenerator(
        method="choice",
        a=self.population.ids,
        p=p,
        seed=next(self.population.circus.seeder))

    return id_gen.ops.generate(named_as=named_as)
def add_listen_and_share_stories_with_details(the_circus):
    """
    This is again a copy-paste of add_listen_and_share_stories, (hopefully
    this helps to illustrate the progression), here showing the
    supplementary look-up on the attributes of the songs
    """
    users = the_circus.populations["user"]

    # using this timer means users only listen to songs during work hours
    timer_gen = profilers.WorkHoursTimerGenerator(clock=the_circus.clock,
                                                  seed=next(the_circus.seeder))

    # this generate activity level distributed as a "truncated normal
    # distribution", i.e. very high and low activities are prevented.
    bounded_gaussian_activity_gen = gen.NumpyRandomGenerator(
        method="normal",
        seed=next(the_circus.seeder),
        loc=timer_gen.activity(n=20, per=pd.Timedelta("1 day")),
        scale=5).map(ops.bound_value(lb=10, ub=30))

    listen = the_circus.create_story(
        name="listen_events",
        initiating_population=users,
        member_id_field="UID",
        timer_gen=timer_gen,
        activity_gen=bounded_gaussian_activity_gen)

    share = the_circus.create_story(name="share_events",
                                    initiating_population=users,
                                    member_id_field="UID",
                                    timer_gen=timer_gen,
                                    activity_gen=bounded_gaussian_activity_gen)

    repo = the_circus.populations["music_repository"]
    songs = the_circus.populations["song"]

    # the same Chain is re-used by both the listen and the share stories
    select_genre_and_song = ops.Chain(
        users.ops.lookup(id_field="UID",
                         select={
                             "FIRST_NAME": "USER_FIRST_NAME",
                             "LAST_NAME": "USER_LAST_NAME",
                         }),

        # picks a genre at random
        repo.ops.select_one(named_as="GENRE"),

        # picks a song at random for that genre
        repo.get_relationship("songs").ops.select_one(from_field="GENRE",
                                                      named_as="SONG_ID"),

        # now also reporting details of listened or shared songs
        songs.ops.lookup(id_field="SONG_ID",
                         select={
                             "artist_name": "SONG_ARTIST",
                             "title": "SONG_TITLE",
                             "recording_year": "SONG_YEAR",
                             "duration_seconds": "SONG_DURATION",
                         }),
    )

    listen.set_operations(select_genre_and_song,
                          ops.FieldLogger("listen_events"))

    share.set_operations(
        select_genre_and_song,

        # picks a user this song is shared to
        users.ops.select_one(named_as="SHARED_TO_UID"),

        # note we could post-check when user shared a song to their own uid
        # here, in which case we can use DropRow to discard that share event
        ops.FieldLogger("share_events"))
def add_song_populations(the_circus):
    """
    Creates the "song" population and fills it with randomly generated
    songs: 1000 songs per genre found in the music_repository population.
    Each batch of songs is also linked to its genre through the
    repository's "songs" relationship.
    """
    songs = the_circus.create_population(
        name="song", size=0,
        ids_gen=gen.SequencialGenerator(prefix="SONG_"))

    # since the size of the population is 0, we can create attribute without
    # providing any initialization
    songs.create_attribute(name="artist_name")
    songs.create_attribute(name="song_genre")
    songs.create_attribute(name="title")
    songs.create_attribute(name="duration_seconds")
    songs.create_attribute(name="recording_year")

    song_id_gen = gen.SequencialGenerator(prefix="S_")

    # generate artist names from a list of randomly generated ones, so we have
    # some redundancy in the generated dataset
    artist_name_gen = gen.NumpyRandomGenerator(
        method="choice",
        a=gen.FakerGenerator(method="name",
                             seed=next(the_circus.seeder)).generate(size=200),
        seed=next(the_circus.seeder))

    title_gen = gen.FakerGenerator(method="sentence",
                                   seed=next(the_circus.seeder),
                                   nb_words=4,
                                   variable_nb_words=True)

    # generates recording years within a desired date range
    year_gen = gen.FakerGenerator(
        method="date_time_between_dates",
        seed=next(the_circus.seeder),
        datetime_start=pd.Timestamp("1910-10-20"),
        datetime_end=pd.Timestamp("2016-12-02")) \
        .map(f=lambda d: d.year)

    duration_gen = gen.ParetoGenerator(xmin=60,
                                       seed=next(the_circus.seeder),
                                       force_int=True,
                                       a=1.2)

    repo = the_circus.populations["music_repository"]
    repo_genre_rel = repo.get_attribute("genre_name")
    for genre_id, genre_name in repo_genre_rel.get_values().items():

        # an operation capable of creating songs of that genre
        init_attribute = ops.Chain(
            artist_name_gen.ops.generate(named_as="artist_name"),
            title_gen.ops.generate(named_as="title"),
            year_gen.ops.generate(named_as="recording_year"),
            duration_gen.ops.generate(named_as="duration_seconds"),
            gen.ConstantGenerator(value=genre_name).ops.generate(
                named_as="song_genre"))

        # dataframe of empty songs: just with one SONG_ID column for now
        # (fixed misspelled local name "emtpy_songs")
        song_ids = song_id_gen.generate(size=1000)
        empty_songs = story.Story.init_story_data(
            member_id_field_name="SONG_ID",
            active_ids=song_ids)

        # we can already add the generated songs to the music repo relationship
        repo.get_relationship("songs").add_grouped_relations(
            from_ids=[genre_id],
            grouped_ids=[song_ids])

        # here we generate all desired columns in the dataframe
        initialized_songs, _ = init_attribute(empty_songs)
        initialized_songs.drop(["SONG_ID"], axis=1, inplace=True)

        # this works because the columns of init_attribute match exactly the
        # ones of the attributes of the populations
        songs.update(initialized_songs)

    # makes sure year and duration are handled as integer
    songs.get_attribute("recording_year").transform_inplace(int)
    songs.get_attribute("duration_seconds").transform_inplace(int)
def add_listen_and_share_stories(the_circus):
    """
    Creates the "listen_events" and "share_events" stories. Both stories
    re-use the same operation Chain that selects a genre and a song, which
    the share story then extends with the recipient of the share.
    """
    user_pop = the_circus.populations["user"]

    # listen/share events are only triggered during work hours
    work_hours_timer = profilers.WorkHoursTimerGenerator(
        clock=the_circus.clock,
        seed=next(the_circus.seeder))

    # "truncated normal" activity level: centered on 20 events per day,
    # clamped to [10, 30] to prevent very high and very low activities
    activity_gen = gen.NumpyRandomGenerator(
        method="normal",
        seed=next(the_circus.seeder),
        loc=work_hours_timer.activity(n=20, per=pd.Timedelta("1 day")),
        scale=5).map(ops.bound_value(lb=10, ub=30))

    listen_story = the_circus.create_story(
        name="listen_events",
        initiating_population=user_pop,
        member_id_field="UID",
        timer_gen=work_hours_timer,
        activity_gen=activity_gen)

    share_story = the_circus.create_story(
        name="share_events",
        initiating_population=user_pop,
        member_id_field="UID",
        timer_gen=work_hours_timer,
        activity_gen=activity_gen)

    music_repo = the_circus.populations["music_repository"]

    # re-usable sub-chain: user name lookup + random genre + random song
    select_genre_and_song = ops.Chain(
        user_pop.ops.lookup(id_field="UID",
                            select={
                                "FIRST_NAME": "USER_FIRST_NAME",
                                "LAST_NAME": "USER_LAST_NAME",
                            }),

        music_repo.ops.select_one(named_as="GENRE"),

        music_repo.get_relationship("songs").ops.select_one(
            from_field="GENRE",
            named_as="SONG_ID"),
    )

    listen_story.set_operations(select_genre_and_song,
                                ops.FieldLogger("listen_events"))

    share_story.set_operations(
        select_genre_and_song,

        # picks a user this song is shared to
        user_pop.ops.select_one(named_as="SHARED_TO_UID"),

        # note: a user may share a song to their own uid; DropRow could be
        # used here to discard such share events
        ops.FieldLogger("share_events"))
def add_bulk_restock_actions(circus, params, buyer_actor_name, seller_actor_name):
    """
    For each product in params, creates a "<buyer>_<product>_bulk_purchase"
    action: the buyer actor purchases a bulk of items from the seller actor
    (items move from the seller's stock relationship to the buyer's), and a
    trigger may fire the seller's own upper-level restock action when the
    seller's remaining stock gets low.

    :param circus: circus holding the buyer/seller actors and the clock
    :param params: configuration dict; params["products"] maps product name
        to its description (prices, restock thresholds, sigmoid shape, ...)
    :param buyer_actor_name: name of the purchasing actor in circus.actors
    :param seller_actor_name: name of the selling actor in circus.actors
    """
    buyer = circus.actors[buyer_actor_name]
    seller = circus.actors[seller_actor_name]

    # average number of POS per buyer, used to scale stocks and thresholds
    pos_per_buyer = circus.actors["pos"].size / buyer.size

    for product, description in params["products"].items():

        action_name = "{}_{}_bulk_purchase".format(buyer_actor_name, product)
        upper_level_restock_action_name = "{}_{}_bulk_purchase".format(
            seller_actor_name, product)

        logging.info("creating {} action".format(action_name))

        # generator of item prices and type
        item_price_gen = random_generators.NumpyRandomGenerator(
            method="choice", a=description["item_prices"],
            seed=next(circus.seeder))

        item_prices_gen = random_generators.DependentBulkGenerator(
            element_generator=item_price_gen)

        item_type_gen = random_generators.NumpyRandomGenerator(
            method="choice", a=circus.actors[product].ids,
            seed=next(circus.seeder))

        item_types_gen = random_generators.DependentBulkGenerator(
            element_generator=item_type_gen)

        # transaction ids, one per purchased item
        tx_gen = random_generators.SequencialGenerator(
            prefix="_".join(["TX", buyer_actor_name, product]))

        tx_seq_gen = random_generators.DependentBulkGenerator(
            element_generator=tx_gen)

        # trigger for another bulk purchase done by the seller if their own
        # stock get low: probability decreases (incrementing=False) as the
        # seller's stock grows between x_min and x_max
        seller_low_stock_bulk_purchase_trigger = random_generators.DependentTriggerGenerator(
            value_to_proba_mapper=operations.bounded_sigmoid(
                x_min=pos_per_buyer,
                x_max=description["max_pos_stock_triggering_pos_restock"] * pos_per_buyer,
                shape=description["restock_sigmoid_shape"],
                incrementing=False))

        # bulk size distribution is a scaled version of POS bulk size distribution
        bulk_size_gen = scale_quantity_gen(
            stock_size_gen=circus.generators[
                "pos_{}_bulk_size_gen".format(product)],
            scale_factor=pos_per_buyer)

        build_purchase_action = circus.create_story(
            name=action_name,
            initiating_actor=buyer,
            actorid_field="BUYER_ID",

            # no timer or activity: dealers bulk purchases are triggered externally
        )

        build_purchase_action.set_operations(
            circus.clock.ops.timestamp(named_as="TIME"),

            # who this buyer purchases from
            buyer.get_relationship("{}__provider".format(product))
                 .ops.select_one(from_field="BUYER_ID",
                                 named_as="SELLER_ID"),

            bulk_size_gen.ops.generate(named_as="REQUESTED_BULK_SIZE"),

            buyer.get_relationship(product).ops
                 .get_neighbourhood_size(
                     from_field="BUYER_ID",
                     named_as="OLD_BUYER_STOCK"),

            # TODO: the perfect case would prevent to go over max_stock at this point

            # selecting and removing Sims from dealers
            seller.get_relationship(product).ops \
                  .select_many(
                      from_field="SELLER_ID",
                      named_as="ITEM_IDS",
                      quantity_field="REQUESTED_BULK_SIZE",

                      # if an item is selected, it is removed from the dealer's stock
                      pop=True,

                      # TODO: put this back to False and log the failed purchases
                      discard_missing=True),

            # and adding them to the buyer
            buyer.get_relationship(product).ops.add_grouped(
                from_field="BUYER_ID",
                grouped_items_field="ITEM_IDS"),

            # We do not track the old and new stock of the dealer since the result
            # is misleading: since all purchases are performed in parallel,
            # if a dealer is selected several times, its stock level after the
            # select_many() is the level _after_ all purchases are done, which is
            # typically not what we want to include in the log.

            buyer.get_relationship(product).ops \
                 .get_neighbourhood_size(
                     from_field="BUYER_ID",
                     named_as="NEW_BUYER_STOCK"),

            # actual number of bought items might be different due to out of stock
            operations.Apply(source_fields="ITEM_IDS",
                             named_as="BULK_SIZE",
                             f=lambda s: s.map(len), f_args="series"),

            # Generate some item prices. Note that the same items will have a
            # different price through the whole distribution chain
            item_prices_gen.ops.generate(
                named_as="ITEM_PRICES",
                observed_field="BULK_SIZE"
            ),

            item_types_gen.ops.generate(
                named_as="ITEM_TYPES",
                observed_field="BULK_SIZE"
            ),

            tx_seq_gen.ops.generate(
                named_as="TX_IDS",
                observed_field="BULK_SIZE"
            ),

            # one log line per purchase event (stock levels)
            operations.FieldLogger(log_id="{}_stock".format(action_name),
                                   cols=["TIME", "BUYER_ID", "SELLER_ID",
                                         "OLD_BUYER_STOCK", "NEW_BUYER_STOCK",
                                         "BULK_SIZE"]),

            # one log line per purchased item (exploded columns)
            operations.FieldLogger(log_id=action_name,
                                   cols=["TIME", "BUYER_ID", "SELLER_ID"],
                                   exploded_cols=["TX_IDS", "ITEM_IDS",
                                                  "ITEM_PRICES", "ITEM_TYPES"]),

            # possibly make the seller restock from their own provider
            trigger_action_if_low_stock(
                circus,
                stock_relationship=seller.get_relationship(product),
                actor_id_field="SELLER_ID",
                restock_trigger=seller_low_stock_bulk_purchase_trigger,
                triggered_action_name=upper_level_restock_action_name
            )
        )
def add_initial_stock_as_purchases(circus, buyer_actor_name, params):
    """
    For each product, creates an "initial_<buyer>_<product>_bulk_purchase"
    story that executes exactly once and logs the buyer's pre-existing
    stock as if it had been purchased, so the initial stock appears in the
    same purchase logs as later bulk purchases.

    :param circus: circus holding the buyer actor and the clock
    :param buyer_actor_name: name of the purchasing actor in circus.actors
    :param params: configuration dict; params["products"] maps product name
        to its description (item prices, ...)
    """
    for product, description in params["products"].items():
        action_name = "{}_{}_bulk_purchase".format(buyer_actor_name, product)
        logging.info("adding initial {} stock of {} as purchases".format(
            product, buyer_actor_name))

        buyer = circus.actors[buyer_actor_name]

        # generator of item prices and type
        item_price_gen = random_generators.NumpyRandomGenerator(
            method="choice", a=description["item_prices"],
            seed=next(circus.seeder))

        item_prices_gen = random_generators.DependentBulkGenerator(
            element_generator=item_price_gen)

        item_type_gen = random_generators.NumpyRandomGenerator(
            method="choice", a=circus.actors[product].ids,
            seed=next(circus.seeder))

        item_types_gen = random_generators.DependentBulkGenerator(
            element_generator=item_type_gen)

        # transaction ids, one per logged item
        tx_gen = random_generators.SequencialGenerator(
            prefix="_".join(["TX_initial", buyer_actor_name, product]))

        tx_seq_gen = random_generators.DependentBulkGenerator(
            element_generator=tx_gen)

        log_stock = circus.create_story(
            name="initial_{}".format(action_name),
            initiating_actor=buyer,
            actorid_field="BUYER_ID",

            # everybody executes this action once, at the beginning
            timer_gen=random_generators.ConstantDependentGenerator(0),
            auto_reset_timer=False)

        # reset timer once so that it executes once
        log_stock.reset_timers()

        log_stock.set_operations(
            circus.clock.ops.timestamp(named_as="TIME", random=False),

            # provider this initial stock is attributed to
            buyer.get_relationship(
                "{}__provider".format(product)).ops.select_one(
                from_field="BUYER_ID", named_as="SELLER_ID"),

            # the full current stock of the buyer
            buyer.get_relationship(product).ops.select_all(
                from_field="BUYER_ID", named_as="ITEM_IDS"),

            operations.Apply(source_fields="ITEM_IDS",
                             named_as="BULK_SIZE",
                             f=lambda s: s.map(len), f_args="series"),

            item_prices_gen.ops.generate(named_as="ITEM_PRICES",
                                         observed_field="BULK_SIZE"),

            item_types_gen.ops.generate(named_as="ITEM_TYPES",
                                        observed_field="BULK_SIZE"),

            tx_seq_gen.ops.generate(named_as="TX_IDS",
                                    observed_field="BULK_SIZE"),

            # by convention the stock "before" this pseudo-purchase is 0 and
            # the stock "after" equals the bulk size
            random_generators.ConstantGenerator(value=0).ops.generate(
                named_as="OLD_BUYER_STOCK"),

            operations.Apply(source_fields="BULK_SIZE",
                             named_as="NEW_BUYER_STOCK",
                             f=lambda s: s, f_args="series"),

            # one log line per pseudo-purchase event (stock levels)
            operations.FieldLogger(log_id="{}_stock".format(action_name),
                                   cols=[
                                       "TIME", "BUYER_ID", "SELLER_ID",
                                       "OLD_BUYER_STOCK", "NEW_BUYER_STOCK",
                                       "BULK_SIZE"
                                   ]),

            # one log line per item (exploded columns)
            operations.FieldLogger(log_id=action_name,
                                   cols=["TIME", "BUYER_ID", "SELLER_ID"],
                                   exploded_cols=[
                                       "TX_IDS", "ITEM_IDS", "ITEM_PRICES",
                                       "ITEM_TYPES"
                                   ]))
def gen(source, seed):
    """
    Builds a "choice" random generator drawing values from consts[source].

    :param source: key into the module-level consts mapping
    :param seed: seed for the underlying numpy generator
    """
    candidate_values = consts[source]
    return random_generators.NumpyRandomGenerator(
        method="choice", seed=seed, a=candidate_values)
def create_purchase_story(the_circus):
    """
    Creates the "purchase" story: customers buy one item from a randomly
    selected point of sale during work hours. The bought item is popped
    from the POS stock and added to the customer's "my_items" relationship.
    """
    timer_gen = profilers.WorkHoursTimerGenerator(clock=the_circus.clock,
                                                  seed=next(the_circus.seeder))

    customers = the_circus.populations["customer"]

    purchase_story = the_circus.create_story(
        name="purchase",
        initiating_population=customers,
        member_id_field="CUST_ID",
        timer_gen=timer_gen,

        # this time not all customers have the activity level: on average
        # they will collectively perform 1 story per day, but some will do
        # on average more stories per day and some will do on average less
        # stories per day
        activity_gen=gen.NumpyRandomGenerator(
            method="exponential",
            scale=timer_gen.activity(
                n=1, per=pd.Timedelta("24h")
            ),
            seed=next(the_circus.seeder))
    )

    customers_items = customers.get_relationship("my_items")
    pos = the_circus.populations["point_of_sale"]
    pos_items = pos.get_relationship("items")

    purchase_story.set_operations(
        customers.ops.lookup(id_field="CUST_ID",
                             select={
                                 "FIRST_NAME": "BUYER_FIRST_NAME",
                                 "LAST_NAME": "BUYER_LAST_NAME"}),

        # pick the POS where the purchase happens, and report its name
        pos.ops.select_one(named_as="POS_ID"),
        pos.ops.lookup(id_field="POS_ID", select={"COMPANY": "POS_NAME"}),

        # pick an item from the vendor's stock
        pos_items.ops.select_one(
            # join the POS table on the POS_ID field of the story_data
            from_field="POS_ID",

            # the result of that join is to be populated into that field
            named_as="BOUGHT_ITEM_ID",

            # each joined item should be unique (2 customers cannot buy the
            # same item)
            one_to_one=True,

            # remove the joined items from the POS relationship
            pop=True,

            # in case some POS is out of stock, just drop the row in the
            # story_data. (As an alternative, we could keep it and trigger
            # some retries for the empty value later on..)
            discard_empty=True),

        # adds the item to the "my_items" relations of each customer
        customers_items.ops.add(
            # story_data field containing the added item
            item_field="BOUGHT_ITEM_ID",

            # story_data field containing the "from" side of the relations
            # (i.e. the id of the customer buying the item in this case)
            from_field="CUST_ID"
        ),

        ops.FieldLogger(log_id="purchases")
    )