def populate(self):
    """Rebuild the graph from every document in ``self.recipes``.

    Calls ``reset_graph()`` first, then for each recipe record creates a
    ``Recipe`` keyed by the record's ``id`` and links it to one
    ``Ingredient`` per entry of the record's ``ingredients`` list.
    Progress is reported every 100 records and once more at the end.

    Raises:
        KeyError: if a record lacks the ``id`` or ``ingredients`` key.
    """
    # TODO set uniqueness constraints if not exists
    reset_graph()
    records_count = self.recipes.count()
    output.push('Populating graph...')
    progress = ProgressBar(records_count)
    processed = 0
    progress.start()
    cursor = self.recipes.find()
    try:
        for record in cursor:
            recipe = Recipe(id=record['id'])
            ingredients = [
                Ingredient(name=ingredient_name)
                for ingredient_name in record['ingredients']
            ]
            recipe.add()
            recipe.require_ingredients(ingredients)
            processed += 1
            # Throttle progress output to once per 100 records.
            if processed % 100 == 0:
                progress.update(processed)
        progress.update(processed)
    finally:
        # Release the cursor even when a record fails part-way through.
        cursor.close()
    progress.end()
def precalc(**kwargs):
    """Run the full precalculation pipeline and report the total wall time.

    The stages run in a fixed order: index, count ands, count ors, link,
    sort pairings. Each stage gets its own service instance, constructed
    from the same ``kwargs`` immediately before it runs.

    Args:
        **kwargs: forwarded unchanged to every service constructor.
    """
    began = time.time()

    indexer = IndexService(**kwargs)
    indexer.index()

    and_counter = AndCountService(**kwargs)
    and_counter.count_and()

    or_counter = OrCountService(**kwargs)
    or_counter.count_or()

    linker = LinkService(**kwargs)
    linker.link()

    sorter = SortService(**kwargs)
    sorter.sort_pairings()

    duration = time.time() - began
    output.push(
        "Total elapsed: {elapsed}".format(
            elapsed=progress_bar._format_time(duration)
        )
    )
def sort_pairings(self):
    """Sort each combination's ``pairings`` array by descending ``score``.

    Only combinations whose ``r`` lies in ``[self.r_min, self.r_max]`` and
    whose ``pairings`` field compares greater than an empty array are
    touched (presumably: documents with at least one pairing). Updates
    are sent to MongoDB in unordered bulks of ``BULK_LIMIT`` operations.
    """
    combo_filter = {
        "pairings": {"$gt": []},
        "r": {"$gte": self.r_min, "$lte": self.r_max},
    }
    records_count = self.combinations.count(combo_filter)
    progress = ProgressBar(records_count)
    processed = 0
    pending = 0  # operations queued in the current bulk, not yet executed
    BULK_LIMIT = 1000
    bulk = self.combinations.initialize_unordered_bulk_op()
    output.push("Sorting pairings...")
    progress.start()
    cursor = self.combinations.find(combo_filter)
    try:
        for combo in cursor:
            # Pushing an empty ``$each`` with ``$sort`` re-orders the
            # existing array without adding any element.
            bulk.find({"_id": combo['_id']}).update({
                "$push": {
                    "pairings": {
                        "$each": [],
                        "$sort": {"score": -1},
                    }
                }
            })
            pending += 1
            processed += 1
            if processed % BULK_LIMIT == 0:
                progress.update(processed)
                # TODO handle bulk execute errors
                bulk.execute()
                bulk = self.combinations.initialize_unordered_bulk_op()
                pending = 0
        progress.update(processed)
        # Executing a bulk with no queued operations raises
        # InvalidOperation, so flush only when something is left over.
        if pending:
            bulk.execute()
    finally:
        cursor.close()
    progress.end()
def link(self):
    """Attach each combination as a pairing to its one-smaller sub-combinations.

    For every combination with ``r`` in ``[self.r_min, self.r_max]`` and
    every ingredient in it, the remaining ingredients (sorted and joined
    with ``"::"``) identify a "givens" combination. A pairing entry
    ``{"name", "score", "ref_id"}`` for the removed ingredient is added to
    that document's ``pairings`` set.

    Side effects:
        If ``self.r_min`` is 1 it is bumped to 2 on the instance; removing
        the only ingredient of a single-ingredient combination would leave
        an empty givens id.
    """
    # TODO take r from options if present
    # TODO get max/min r in combinations
    # r_max = int(self.combinations.find({}).sort([("r", -1)]).limit(1).next()['r'])
    # r_min = int(self.combinations.find({}).sort([("r", 1)]).limit(1).next()['r'])
    if self.r_min == 1:
        self.r_min += 1
    if self.r_max < self.r_min:
        output.push("No combinations linkable...")
        return

    record_filter = {"r": {"$gte": self.r_min, "$lte": self.r_max}}
    records_count = self.combinations.count(record_filter)
    progress = ProgressBar(records_count)
    processed = 0
    pending = 0  # operations queued in the current bulk, not yet executed
    BULK_LIMIT = 100
    bulk = self.combinations.initialize_unordered_bulk_op()
    output.push("Linking combinations...")
    progress.start()
    cursor = self.combinations.find(record_filter, no_cursor_timeout=True)
    try:
        for combo in cursor:
            ingredients = list(combo['ingredients'])
            combo_id = combo['_id']
            score = combo['score']
            for i, candidate in enumerate(ingredients):
                # Everything except the candidate, in canonical (sorted) order.
                givens = sorted(ingredients[:i] + ingredients[i + 1:])
                givens_combo_id = "::".join(givens)
                bulk.find({"_id": givens_combo_id}).update({
                    "$addToSet": {
                        "pairings": {
                            "name": candidate,
                            "score": score,
                            "ref_id": combo_id,
                        }
                    }
                })
                pending += 1
            processed += 1
            if processed % BULK_LIMIT == 0:
                progress.update(processed)
                # TODO handle bulk execute errors
                if pending:
                    bulk.execute()
                    bulk = self.combinations.initialize_unordered_bulk_op()
                    pending = 0
        progress.update(processed)
        # Executing a bulk with no queued operations raises
        # InvalidOperation, so flush only when something is left over.
        if pending:
            bulk.execute()
    finally:
        # The cursor was opened with no_cursor_timeout, so it must be
        # closed explicitly even when the loop fails.
        cursor.close()
    progress.end()
def end(self):
    """Finish by pushing an empty string to ``output``."""
    closing_text = ""
    output.push(closing_text)
def count_and(self):
    """Count, per ingredient combination, the recipes containing all of it.

    For every recipe in ``self.recipes`` and every combination size ``r``
    from ``self.r_min`` up to ``self.r_max`` (capped at the recipe's
    ingredient count; a falsy ``r_max`` means "no cap"), each combination
    is upserted into ``self.combinations`` under the id formed by joining
    its sorted ingredients with ``"::"``, and its ``and_count`` is
    incremented by one.

    When ``self.skip`` is truthy, that many recipes are skipped and the
    progress counter starts from that value.

    Raises:
        ValueError, TypeError: if ``self.r_min`` or ``self.r_max`` cannot
            be converted with ``int()``.

    Original notes on unhandled error cases: not a recipe data store;
    source does not exist; destination already exists.
    """
    # TODO register exit handler to print recipes processed on unexpected exit
    # TODO https://docs.python.org/3/library/atexit.html
    recipe_count = self.recipes.count()
    cursor = self.recipes.find(no_cursor_timeout=True)  # TODO timeout=False is bad practice
    if self.skip:
        cursor.skip(self.skip)
        processed = self.skip
    else:
        processed = 0
    output.push("Counting ands...")
    progress = ProgressBar(recipe_count)
    progress.start()
    try:
        # The bounds do not depend on the recipe, so convert them once.
        r_min = int(self.r_min)
        r_limit = int(self.r_max)
        for recipe in cursor:
            # TODO try collecting counts into a dictionary and then updating less frequently
            # TODO Also play with batch size
            bulk = self.combinations.initialize_unordered_bulk_op()
            queued = False
            # Sorting first keeps every generated id alphabetically ordered.
            ingredients = sorted(recipe['ingredients'])
            if r_limit:
                r_max = min(r_limit, len(ingredients))
            else:
                r_max = len(ingredients)
            # An empty range (r_min > r_max) simply queues nothing.
            for r in range(r_min, r_max + 1):
                for combo in itertools.combinations(ingredients, r):
                    members = sorted(combo)
                    combo_id = '::'.join(members)
                    bulk.find({"_id": combo_id}).upsert().update({
                        "$set": {
                            "_id": combo_id,
                            "r": r,
                            "ingredients": members,
                        },
                        "$inc": {"and_count": 1},
                    })
                    queued = True
            # TODO handle writeErrors
            # Executing a bulk with no queued operations raises
            # InvalidOperation (e.g. recipe shorter than r_min).
            if queued:
                bulk.execute()
            processed += 1
            progress.update(processed)
    finally:
        # The cursor was opened with no_cursor_timeout, so it must be
        # closed explicitly even when the loop fails.
        cursor.close()
    progress.end()
def count_or(self):
    """Compute ``or_count`` and ``score`` for combinations that lack them.

    For each combination with ``r`` in ``[self.r_min, self.r_max]`` and no
    ``or_count`` field yet, the number of recipes containing at least one
    of its ingredients is derived from the stored ``and_count`` values by
    inclusion-exclusion. ``score`` is then set to ``and_count / or_count``.
    Updates are sent in unordered bulks of ``BULK_LIMIT`` operations.

    When ``self.skip`` is truthy, that many combinations are skipped and
    the progress counter starts from that value.

    Original notes on unhandled error cases: not a collections data store;
    collection does not exist; r_max/r_min is not an integer; r_min/r_max
    are currently required (should be optional).
    """
    # TODO register exit handler to print recipes processed on unexpected exit
    # TODO https://docs.python.org/3/library/atexit.html
    combo_filter = {
        "r": {"$gte": self.r_min, "$lte": self.r_max},
        "or_count": {"$exists": False},
    }
    combination_count = self.combinations.count(combo_filter)
    cursor = self.combinations.find(combo_filter, no_cursor_timeout=True)  # TODO timeout=False is bad practice
    if self.skip:
        cursor.skip(self.skip)
        processed = self.skip
    else:
        processed = 0
    progress = ProgressBar(combination_count)
    output.push("Counting ors...")
    progress.start()
    BULK_LIMIT = 1000
    bulk = self.combinations.initialize_unordered_bulk_op()
    pending = 0  # operations queued in the current bulk, not yet executed
    try:
        for combination in cursor:
            combo_id = combination['_id']
            ingredients = combination['ingredients']
            # see https://en.wikipedia.org/wiki/Inclusion%E2%80%93exclusion_principle
            # Subsets of odd size are added, subsets of even size subtracted.
            or_count = 0
            sign = 1
            for r in range(1, len(ingredients) + 1):
                for subset in itertools.combinations(ingredients, r):
                    subset_id = '::'.join(sorted(subset))
                    or_count += self.get_and_count_by_id(subset_id) * sign
                sign = -sign
            # NOTE(review): raises ZeroDivisionError if or_count is 0;
            # presumably impossible with consistent and_counts - confirm.
            bulk.find({"_id": combo_id}).update({
                "$set": {
                    "or_count": or_count,
                    "score": float(combination['and_count']) / or_count,
                }
            })
            pending += 1
            processed += 1
            if processed % BULK_LIMIT == 0:
                # TODO handle bulk execute errors
                progress.update(processed)
                bulk.execute()
                bulk = self.combinations.initialize_unordered_bulk_op()
                pending = 0
        progress.update(processed)
        # Executing a bulk with no queued operations raises
        # InvalidOperation, so flush only when something is left over.
        if pending:
            bulk.execute()
    finally:
        # The cursor was opened with no_cursor_timeout, so it must be
        # closed explicitly even when the loop fails.
        cursor.close()
    progress.end()