def sample_collection(**kwargs):
    """Copy a seeded random sample of filtered documents to another collection.

    Keyword arguments read: ``host``, ``database``, ``source``,
    ``destination``, ``seed`` and ``size``.

    Failure modes listed by the original author (none are handled here):
        - fewer records than sample size (``random.sample`` raises
          ``ValueError`` in that case)
        - sample size not an integer
        - source does not exist
        - destination already exists
        - database connection exceptions
    """
    db = Repository(kwargs['host'], kwargs['database'])
    source = db.get_collection(kwargs['source'])
    destination = db.get_collection(kwargs['destination'])

    # TODO pass filter in from json file
    doc_filter = {
        'attributes': {
            '$gt': {}
        },
        'attributes.course': {
            '$nin': ['Desserts', 'Cocktails', 'Beverages']
        },
        'rating': {
            '$gte': 4,
            '$lte': 5
        }
    }

    if doc_filter:
        record_count = source.count(doc_filter)
    else:
        record_count = source.count()

    seed = kwargs['seed']
    sample_size = parse_number(kwargs['size'])

    # Seeding the module-level generator keeps the sample reproducible.
    random.seed(seed)
    # Sorted positions let a single forward pass over the cursor reach
    # every sampled document.
    to_sample = sorted(random.sample(range(record_count), sample_size))

    progress = ProgressBar(sample_size)
    progress.start()

    cursor = source.find(doc_filter) if doc_filter else source.find()

    sample_count = 0
    position = 0
    try:
        for index in to_sample:
            # Advance until ``record`` is the document at ``index``.
            while position <= index:
                record = cursor.next()
                position += 1
            # TODO batch insert?
            destination.insert_one(record)
            sample_count += 1
            progress.update(sample_count)
    finally:
        # Release the cursor even when sampling or inserting fails, as the
        # other services in this file do once they are finished.
        cursor.close()

    progress.end()
class IndexService:
    """Creates the indexes on the recipes and combinations collections."""

    def __init__(self, **kwargs):
        """Open the repository and resolve both collections.

        Reads the ``host``, ``database``, ``recipes`` and ``combinations``
        keyword arguments; any that are missing are passed on as ``None``.
        """
        option = kwargs.get
        self.host = option('host')
        self.database = option('database')
        self.db = Repository(self.host, self.database)
        self.recipes = self.db.get_collection(option('recipes'))
        self.combinations = self.db.get_collection(option('combinations'))

    def index(self):
        """Index recipes on ``ingredients`` and combinations on ``r``."""
        targets = (
            (self.recipes, 'ingredients'),
            (self.combinations, 'r'),
        )
        for collection, field in targets:
            collection.create_index(field)
class YummlyIngredientsService:
    """Downloads Yummly's ingredient metadata and stores it in a collection."""

    URL = "http://api.yummly.com/v1/api/metadata/ingredient"
    # NOTE(review): these credentials are committed in source. Consider
    # loading them from configuration/the environment and rotating the key.
    APP_ID = "2a6406ac"
    APP_KEY = "c0aac363d1a1c8b1925e5f8898c69a48"
    # Seconds to wait on the HTTP request; without a timeout a stalled
    # connection would block forever.
    REQUEST_TIMEOUT = 60

    def __init__(self, **kwargs):
        """Open the repository and resolve the destination collection.

        Reads the ``host``, ``database``, ``collection`` and ``skip``
        keyword arguments. ``skip`` is stored but not used by this class.
        """
        self.host = kwargs.get('host')
        self.database = kwargs.get('database')
        self.db = Repository(self.host, self.database)
        self.collection = self.db.get_collection(kwargs.get('collection'))
        self.skip = kwargs.get("skip")

    @staticmethod
    def _strip_jsonp(body):
        """Return the JSON payload of a ``callback(name, <json>);`` body.

        Everything up to and including the first comma is dropped, then the
        closing ``)`` / ``;`` are removed. Surrounding whitespace (such as a
        trailing newline) is tolerated. A body without a comma yields ``''``.
        """
        _, _, payload = body.partition(',')
        payload = payload.strip()
        if payload.endswith(';'):
            payload = payload[:-1].rstrip()
        if payload.endswith(')'):
            payload = payload[:-1]
        return payload.strip()

    def get_ingredients(self):
        """Fetch the ingredient metadata and insert it into the collection.

        Raises whatever ``requests`` raises for a failed or timed-out
        request (including ``HTTPError`` for a non-2xx status), and
        ``ValueError`` from ``json.loads`` when the body is not valid JSON
        once the JSONP wrapper has been removed.
        """
        headers = {
            'X-Yummly-App-ID': self.APP_ID,
            'X-Yummly-App-Key': self.APP_KEY
        }
        response = requests.get(
            self.URL, headers=headers, timeout=self.REQUEST_TIMEOUT)
        # Fail on HTTP errors instead of trying to parse an error page.
        response.raise_for_status()

        # trim jsonp callback
        payload = self._strip_jsonp(response.text)
        self.collection.insert_many(json.loads(payload))
class PopulateGraphService:
    """Rebuilds the recipe/ingredient graph from the recipes collection."""

    def __init__(self, **kwargs):
        """Open the repository and resolve the recipes collection.

        Reads the ``host``, ``neoHost``, ``database``, ``resume`` and
        ``recipes`` keyword arguments. ``neoHost`` and ``resume`` are stored
        but not used by this class.
        """
        self.host = kwargs.get('host')
        self.neoHost = kwargs.get('neoHost')
        self.database = kwargs.get('database')
        self.resume = kwargs.get('resume')
        self.db = Repository(self.host, self.database)
        self.recipes = self.db.get_collection(kwargs.get('recipes'))

    def populate(self):
        """Reset the graph, then add every recipe with its ingredients.

        Each record must provide ``id`` and ``ingredients``. Progress is
        reported every 100 records and once more at the end.
        """
        # TODO set uniqueness constraints if not exists
        reset_graph()
        records_count = self.recipes.count()
        output.push('Populating graph...')
        progress = ProgressBar(records_count)
        processed = 0
        progress.start()
        cursor = self.recipes.find()
        try:
            for record in cursor:
                recipe = Recipe(id=record['id'])
                ingredients = [
                    Ingredient(name=name) for name in record['ingredients']
                ]
                recipe.add()
                recipe.require_ingredients(ingredients)
                processed += 1
                if processed % 100 == 0:
                    progress.update(processed)
        finally:
            # Release the cursor even if building the graph fails part-way.
            cursor.close()
        progress.update(processed)
        progress.end()
class LinkService:
    """Links each combination to the smaller combinations it extends."""

    def __init__(self, **kwargs):
        """Open the repository and resolve the combinations collection.

        Reads ``host``, ``database``, ``resume`` and ``combinations``
        (optional, default ``None``) plus the required ``r_min`` and
        ``r_max``, which are converted with ``int``.
        """
        self.host = kwargs.get('host')
        self.database = kwargs.get('database')
        self.resume = kwargs.get('resume')
        self.r_min = int(kwargs["r_min"])
        self.r_max = int(kwargs["r_max"])
        self.db = Repository(self.host, self.database)
        self.combinations = self.db.get_collection(kwargs.get('combinations'))

    @staticmethod
    def _pairings(combo):
        """Yield ``(givens_id, pairing)`` for each ingredient left out of combo.

        ``givens_id`` is the ``::``-joined, sorted list of the remaining
        ingredients; ``pairing`` names the left-out ingredient together with
        the combination's score and ``_id``.
        """
        ingredients = list(combo['ingredients'])
        combo_id = combo['_id']
        score = combo['score']
        for i, candidate in enumerate(ingredients):
            givens = sorted(ingredients[:i] + ingredients[i + 1:])
            yield "::".join(givens), {
                "name": candidate,
                "score": score,
                "ref_id": combo_id
            }

    def link(self):
        """Add every combination as a pairing on its sub-combinations.

        For each combination with ``r_min <= r <= r_max`` and each of its
        ingredients, the combination made of the remaining ingredients gets
        an ``$addToSet`` on ``pairings``. Updates are sent in unordered
        bulks, flushed every ``BULK_LIMIT`` combinations.
        """
        # TODO take r from options if present
        # TODO get max/min r in combinations
        # r_max = int(self.combinations.find({}).sort([("r", -1)]).limit(1).next()['r'])
        # r_min = int(self.combinations.find({}).sort([("r", 1)]).limit(1).next()['r'])
        if self.r_min == 1:
            # Single-ingredient combinations have no smaller combination
            # to link to.
            self.r_min += 1
        if self.r_max < self.r_min:
            output.push("No combinations linkable...")
            return

        record_filter = {"r": {"$gte": self.r_min, "$lte": self.r_max}}
        records_count = self.combinations.count(record_filter)
        progress = ProgressBar(records_count)
        processed = 0
        pending = 0  # operations queued on the current bulk
        BULK_LIMIT = 100
        bulk = self.combinations.initialize_unordered_bulk_op()
        output.push("Linking combinations...")
        progress.start()
        cursor = self.combinations.find(record_filter, no_cursor_timeout=True)
        try:
            for combo in cursor:
                for givens_combo_id, pairing in self._pairings(combo):
                    bulk.find({
                        "_id": givens_combo_id
                    }).update({
                        "$addToSet": {
                            "pairings": pairing
                        }
                    })
                    pending += 1
                processed += 1
                if processed % BULK_LIMIT == 0:
                    progress.update(processed)
                    # TODO handle bulk execute errors
                    if pending:
                        bulk.execute()
                        bulk = self.combinations.initialize_unordered_bulk_op()
                        pending = 0
        finally:
            # The cursor was opened with no_cursor_timeout, so it has to be
            # closed explicitly even when an error interrupts the loop.
            cursor.close()
        progress.update(processed)
        # Executing a bulk with no queued operations is an error, so only
        # flush when something is left over.
        if pending:
            bulk.execute()
        progress.end()
class SortService:
    """Sorts the ``pairings`` array of each combination by score."""

    def __init__(self, **kwargs):
        """Open the repository and resolve the combinations collection.

        Reads ``host``, ``database`` and ``combinations`` (optional, default
        ``None``) plus the required ``r_min`` and ``r_max``, which are
        converted with ``int``.
        """
        self.host = kwargs.get('host')
        self.database = kwargs.get('database')
        self.r_min = int(kwargs["r_min"])
        self.r_max = int(kwargs["r_max"])
        self.db = Repository(self.host, self.database)
        self.combinations = self.db.get_collection(kwargs.get('combinations'))

    def sort_pairings(self):
        """Re-sort ``pairings`` by descending ``score`` on matching documents.

        Targets combinations with ``r_min <= r <= r_max`` whose ``pairings``
        compares greater than an empty array. Each one receives a ``$push``
        with an empty ``$each`` and a ``$sort`` on ``score``, so nothing is
        appended and only the ordering is applied. Updates are sent in
        unordered bulks of ``BULK_LIMIT`` operations.
        """
        combo_filter = {
            "pairings": {
                "$gt": []
            },
            "r": {
                "$gte": self.r_min,
                "$lte": self.r_max
            }
        }
        records_count = self.combinations.count(combo_filter)
        progress = ProgressBar(records_count)
        processed = 0
        pending = 0  # operations queued on the current bulk
        BULK_LIMIT = 1000
        bulk = self.combinations.initialize_unordered_bulk_op()
        output.push("Sorting pairings...")
        progress.start()
        cursor = self.combinations.find(combo_filter)
        try:
            for combo in cursor:
                bulk.find({
                    "_id": combo['_id']
                }).update({
                    "$push": {
                        "pairings": {
                            "$each": [],
                            "$sort": {
                                "score": -1
                            }
                        }
                    }
                })
                pending += 1
                processed += 1
                if processed % BULK_LIMIT == 0:
                    progress.update(processed)
                    # TODO handle bulk execute errors
                    bulk.execute()
                    bulk = self.combinations.initialize_unordered_bulk_op()
                    pending = 0
        finally:
            # Release the cursor even when a bulk execution fails.
            cursor.close()
        progress.update(processed)
        # Executing a bulk with no queued operations is an error; that would
        # happen with zero matches or an exact multiple of BULK_LIMIT.
        if pending:
            bulk.execute()
        progress.end()
class AndCountService:
    """Counts, per ingredient combination, the recipes that contain it."""

    def __init__(self, **kwargs):
        """Open the repository and resolve the recipes/combinations collections.

        Reads the ``host``, ``database``, ``recipes``, ``combinations``,
        ``skip``, ``r_min`` and ``r_max`` keyword arguments. ``r_min`` and
        ``r_max`` are stored as given and converted when counting starts.
        """
        self.db = Repository(kwargs.get('host'), kwargs.get('database'))
        self.recipes = self.db.get_collection(kwargs.get('recipes'))
        self.combinations = self.db.get_collection(kwargs.get('combinations'))
        self.skip = kwargs.get('skip')
        self.r_min = kwargs.get('r_min')
        self.r_max = kwargs.get('r_max')

    @staticmethod
    def _iter_combinations(ingredients, r_min, r_max):
        """Yield ``(combo_id, r, members)`` for each combination to count.

        Ingredients are sorted first so that ids are alphabetically ordered.
        Sizes run from ``r_min`` up to ``r_max``, capped at the number of
        ingredients; a falsy ``r_max`` means no cap. Nothing is yielded when
        ``r_min`` exceeds the effective upper bound.
        """
        ordered = sorted(ingredients)
        if r_max and len(ordered) > r_max:
            upper = r_max
        else:
            upper = len(ordered)
        for r in range(r_min, upper + 1):
            # Combinations of a sorted sequence come out sorted already, so
            # no per-combination sort is needed.
            for members in itertools.combinations(ordered, r):
                members = list(members)
                yield '::'.join(members), r, members

    def count_and(self):
        """Upsert one ``and_count`` increment per combination per recipe.

        Exceptions noted by the original author (not handled here):
            - not a recipe data store
            - source does not exist
            - destination already exists
            - r_max/r_min is not an integer (``int`` raises before any
              recipe is read)
        """
        # TODO register exit handler to print recipes processed on unexpected exit
        # TODO https://docs.python.org/3/library/atexit.html
        # Convert once up front instead of once per recipe.
        r_min = int(self.r_min)
        r_max = int(self.r_max)

        recipe_count = self.recipes.count()
        cursor = self.recipes.find(no_cursor_timeout=True)  # TODO timeout=False is bad practice
        if self.skip:
            cursor.skip(self.skip)
            processed = self.skip
        else:
            processed = 0
        output.push("Counting ands...")
        progress = ProgressBar(recipe_count)
        progress.start()
        try:
            for recipe in cursor:
                # TODO try collecting counts into a dictionary and then updating less frequently
                # TODO Also play with batch size
                bulk = self.combinations.initialize_unordered_bulk_op()
                queued = 0
                combos = self._iter_combinations(
                    recipe['ingredients'], r_min, r_max)
                for combo_id, r, members in combos:
                    bulk.find({"_id": combo_id}).upsert().update({
                        "$set": {
                            "_id": combo_id,
                            "r": r,
                            "ingredients": members
                        },
                        "$inc": {
                            "and_count": 1
                        }
                    })
                    queued += 1
                # TODO handle writeErrors
                # A recipe with fewer ingredients than r_min queues nothing,
                # and executing an empty bulk is an error.
                if queued:
                    bulk.execute()
                processed += 1
                progress.update(processed)
        finally:
            # The cursor was opened with no_cursor_timeout, so it has to be
            # closed explicitly even when an error interrupts the loop.
            cursor.close()
        progress.end()
class OrCountService:
    """Computes ``or_count`` and ``score`` for stored combinations."""

    def __init__(self, **kwargs):
        """Open the repository and resolve the combinations collection.

        Reads ``host``, ``database``, ``combinations`` and ``skip``
        (optional, default ``None``) plus the required ``r_min`` and
        ``r_max``, which are converted with ``int``.
        """
        self.host = kwargs.get('host')
        self.database = kwargs.get('database')
        self.db = Repository(self.host, self.database)
        self.combinations = self.db.get_collection(kwargs.get('combinations'))
        self.r_min = int(kwargs["r_min"])
        self.r_max = int(kwargs["r_max"])
        self.skip = kwargs.get("skip")

    def count_or(self):
        """Set ``or_count`` and ``score`` on combinations that lack ``or_count``.

        ``score`` is the combination's ``and_count`` divided by its
        ``or_count``. Updates are sent in unordered bulks of ``BULK_LIMIT``
        operations.

        Exceptions noted by the original author (not handled here):
            - not a collections data store
            - collection does not exist
            - r_max/r_min is not an integer
            - r_min/r_max are currently required (should be optional)
        """
        # TODO register exit handler to print recipes processed on unexpected exit
        # TODO https://docs.python.org/3/library/atexit.html
        combo_filter = {
            "r": {
                "$gte": self.r_min,
                "$lte": self.r_max
            },
            "or_count": {
                "$exists": False
            }
        }
        combination_count = self.combinations.count(combo_filter)
        cursor = self.combinations.find(combo_filter, no_cursor_timeout=True)  # TODO timeout=False is bad practice
        if self.skip:
            cursor.skip(self.skip)
            processed = self.skip
        else:
            processed = 0
        progress = ProgressBar(combination_count)
        output.push("Counting ors...")
        progress.start()
        BULK_LIMIT = 1000
        bulk = self.combinations.initialize_unordered_bulk_op()
        pending = 0  # operations queued on the current bulk
        try:
            for combination in cursor:
                combo_id = combination['_id']
                or_count = self._or_count(combination['ingredients'])
                bulk.find({
                    "_id": combo_id
                }).update({
                    "$set": {
                        "or_count": or_count,
                        "score": float(combination['and_count']) / or_count
                    }
                })
                pending += 1
                processed += 1
                if processed % BULK_LIMIT == 0:
                    # TODO handle bulk execute errors
                    progress.update(processed)
                    bulk.execute()
                    bulk = self.combinations.initialize_unordered_bulk_op()
                    pending = 0
        finally:
            # The cursor was opened with no_cursor_timeout, so it has to be
            # closed explicitly even when an error interrupts the loop.
            cursor.close()
        progress.update(processed)
        # Executing a bulk with no queued operations is an error; that would
        # happen with zero matches or when the last flush emptied the bulk.
        if pending:
            bulk.execute()
        progress.end()

    def _or_count(self, ingredients):
        """Return the inclusion-exclusion sum of ``and_count`` over all subsets.

        Subsets of odd size are added and subsets of even size subtracted.
        See https://en.wikipedia.org/wiki/Inclusion%E2%80%93exclusion_principle
        """
        total = 0
        sign = 1
        for r in range(1, len(ingredients) + 1):
            for members in itertools.combinations(ingredients, r):
                # Ids are stored with their ingredients in sorted order.
                member_id = '::'.join(sorted(members))
                total += sign * self.get_and_count_by_id(member_id)
            sign = -sign
        return total

    # 1 billion is too big for current settings
    # lru cache with max size of 5 million entries
    # TODO look into changing memory allowance for user running this process. Also limit mongo's max memory
    # NOTE(review): lru_cache on a method keys on ``self`` and keeps the
    # instance alive for as long as the cache exists.
    @functools.lru_cache(maxsize=5 * 10**6)
    def get_and_count_by_id(self, combo_id):
        """Return the stored ``and_count`` of the combination ``combo_id``.

        Raises ``TypeError`` when no such combination exists, because
        ``find_one`` then returns ``None``.
        """
        return self.combinations.find_one({"_id": combo_id})['and_count']