def _analyze_orders_similarity(args):
    """Score one order against a slice of stored orders and persist the matches.

    ``args`` is a single 5-tuple ``(order, data_set, min_similarity, offset,
    limit)``. It is packed into one argument presumably so the function can
    be handed to a pool-style ``map`` -- confirm against the caller.

    Orders are read with ``get_orders(offset=offset, limit=limit - 1)``.
    Each one whose ``_id`` differs from ``order['_id']`` is compared with
    ``calculate_products_similarity``; every pair scoring at least
    ``min_similarity`` is stored through ``add_orders_similarity``.
    """
    reference, data_set, min_similarity, offset, limit = args
    logger.info("{} {} {}".format(reference['_id'], offset, limit))

    repository = Repository(data_set=data_set)
    candidate_count = limit - 1
    progress = Progress(candidate_count)
    candidates = repository.get_orders(offset=offset, limit=candidate_count)

    for chunk in batch(candidates, 1000):
        for candidate in chunk:
            # Never compare the reference order with itself.
            if candidate['_id'] == reference['_id']:
                continue
            progress.advance()

            score, shared, only_reference, only_candidate = (
                calculate_products_similarity(reference['products'],
                                              candidate['products']))
            if score >= min_similarity:
                repository.add_orders_similarity({
                    'order1_id': reference['order_id'],
                    'user1_id': reference['user_id'],
                    'order2_id': candidate['order_id'],
                    'user2_id': candidate['user_id'],
                    'similarity': score,
                    'common_products': shared,
                    'add_products1': only_reference,
                    'add_products2': only_candidate,
                })

        # NOTE(review): the original formatting was lost; this progress line
        # is assumed to run once per fetched batch -- confirm the placement.
        logger.info("{:.1f}% ETA {}".format(progress.get_progress(),
                                            progress.get_estimated_time()))
def analyze_orders_similarity(data_set, samples, similarity_threshold=0.2):
    """Store, for each sampled order, its single most similar later order.

    The first ``samples - 1`` orders are each compared with the orders that
    follow them (``offset`` grows by one per outer order, so every unordered
    pair among ``samples`` orders is scored once). For each outer order only
    the best match whose similarity is strictly greater than
    ``similarity_threshold`` is persisted via ``add_orders_similarity``.

    Args:
        data_set: Passed through to ``Repository(data_set=...)``.
        samples: Number of orders to take part in the pairwise comparison.
        similarity_threshold: Exclusive lower bound a pair must beat to be
            stored. Defaults to 0.2, the value that used to be hard-coded.

    Returns:
        None. Results are written through the repository.
    """
    # Fewer than two orders leave no pair to compare. Returning early also
    # avoids asking the repository for a zero-sized page and building a
    # Progress over zero steps, whose behaviour is not visible from here.
    if samples < 2:
        return

    repository = Repository(data_set=data_set)
    # Number of unordered pairs. The product of two consecutive integers is
    # always even, so floor division is exact and needs no float rounding.
    progress = Progress(samples * (samples - 1) // 2)

    offset = 1
    for orders1 in batch(repository.get_orders(limit=samples - 1), 100):
        for o1 in orders1:
            max_similarity = similarity_threshold
            similar = None

            later_orders = repository.get_orders(offset=offset,
                                                 limit=samples - offset)
            for orders2 in batch(later_orders, 100):
                for o2 in orders2:
                    progress.advance()
                    similarity, common, additional1, additional2 = (
                        calculate_products_similarity(o1['products'],
                                                      o2['products']))
                    if similarity > max_similarity:
                        # New best match for o1; remember it and raise the bar.
                        max_similarity = similarity
                        similar = dict(order1_id=o1['order_id'],
                                       user1_id=o1['user_id'],
                                       order2_id=o2['order_id'],
                                       user2_id=o2['user_id'],
                                       similarity=similarity,
                                       common_products=common,
                                       add_products1=additional1,
                                       add_products2=additional2)
                        logger.info("Similarity {} {} {}".format(
                            similar['user1_id'], similar['user2_id'],
                            similarity))

            if similar is not None:
                repository.add_orders_similarity(similar)
            offset += 1
            # NOTE(review): the original formatting was lost; this progress
            # line is assumed to run once per outer order -- confirm placement.
            logger.info("{:.1f}% ETA {}".format(progress.get_progress(),
                                                progress.get_estimated_time()))