def construct_schema_pymongo(
    collection: pymongo.collection.Collection,
    delimiter: str,
    sample_size: Optional[int] = None,
) -> Dict[Tuple[str, ...], SchemaDescription]:
    """
    Run construct_schema over the documents of a PyMongo collection.

    The resulting schema is keyed by tuples of nested field names; every
    value carries the 'types', 'count', 'nullable', 'delimited_name', and
    'type' attributes.

    Parameters
    ----------
        collection:
            the PyMongo collection to read documents from
        delimiter:
            string used to concatenate field names
        sample_size:
            how many documents to draw with a ``$sample`` stage; when it is
            falsy (not provided, or 0) the entire collection is read
    """
    if not sample_size:
        # No sample size given: fall back to reading every document.
        cursor = collection.find({})
    else:
        # Let the server pick a random sample of the requested size.
        sample_stage = {"$sample": {"size": sample_size}}
        cursor = collection.aggregate([sample_stage], allowDiskUse=True)

    # construct_schema receives a fully materialised list, not a cursor.
    return construct_schema(list(cursor), delimiter)
def get_user_contracts(coll: pymongo.collection.Collection, user_id: ObjectId, sort_by="valid_from", contract_type="dpp"):
    """
    Return one user's contracts of a given type, sorted in descending order.

    The user document is matched by ``_id``, its ``contracts`` array is
    unwound, optionally filtered by ``contracts.type``, sorted by
    ``contracts.<sort_by>`` descending and regrouped into a single list.

    Parameters
    ----------
        coll:
            collection holding the user documents
        user_id:
            ``_id`` of the user whose contracts are requested
        sort_by:
            name of the contract field to sort by (descending)
        contract_type:
            value that ``contracts.type`` must equal; pass ``None`` to
            return contracts of every type. Defaults to ``"dpp"``, the only
            type this function handled before it was parameterised.

    Returns
    -------
        The list of matching contract sub-documents, or an empty list when
        the aggregation yields no document (unknown user or no matching
        contracts).
    """
    # NOTE: kept as an assert so callers see the same exception as before;
    # be aware that it is stripped when Python runs with -O.
    assert isinstance(user_id, ObjectId), "user_id must be an ObjectId"

    pipeline = [
        {"$match": {"_id": user_id}},
        {"$unwind": "$contracts"},
    ]
    if contract_type is not None:
        pipeline.append({"$match": {"contracts.type": contract_type}})
    pipeline.extend([
        {"$sort": {f"contracts.{sort_by}": -1}},
        # Regroup the unwound (and now sorted) contracts into one array.
        {"$group": {"_id": "$_id", "contracts": {"$push": "$contracts"}}},
    ])

    cursor = coll.aggregate(pipeline)
    # At most one group is produced because the first stage matches one _id.
    return next(cursor, {}).get("contracts", [])
def find_flights_with_low_prices(threshold: int,
                                 search_date: str,
                                 collection: pymongo.collection.Collection,
                                 logger: logging.Logger) -> None:
    """
    Find the flights whose summed price is below a threshold and log the count.

    For every itinerary of the documents whose ``Query.OutboundDate`` equals
    ``search_date``, the prices of its pricing options are summed on the
    server and keyed by the itinerary's ``OutboundLegId``. Leg ids whose sum
    is strictly lower than ``threshold`` are collected and their number is
    logged at INFO level.

    Fetching the full flight data (and order links) for those legs is not
    implemented yet, so the function currently returns nothing.

    Parameters
    ----------
        threshold:
            exclusive upper bound for the summed price
        search_date:
            value compared against ``Query.OutboundDate``
        collection:
            collection the aggregation is run on
        logger:
            logger that receives the summary message
    """
    stage_name = "GET_MIN_PRICE"

    # calculate price for each itinerary (can have several Legs)
    price_per_flight_pipeline = [
        {"$match": {"Query.OutboundDate": search_date}},
        {"$unwind": "$Itineraries"},
        {
            "$project": {
                "Itineraries.OutboundLegId": {
                    "$arrayToObject": [[{
                        "k": "$Itineraries.OutboundLegId",
                        "v": {
                            "$reduce": {
                                "input": "$Itineraries.PricingOptions",
                                "initialValue": 0,
                                "in": {"$add": ["$$value", "$$this.Price"]},
                            }
                        },
                    }]]
                }
            }
        },
    ]
    price_per_flight_results = collection.aggregate(price_per_flight_pipeline)

    # find flights with prices < threshold
    flights_with_low_prices = []
    for price_per_flight in price_per_flight_results:
        leg_prices = price_per_flight['Itineraries']['OutboundLegId']
        for leg_id, total_price in leg_prices.items():
            # The server-side sum can come back as null (e.g. $reduce yields
            # null when PricingOptions is missing); comparing None with an
            # int would raise TypeError, so such legs are skipped.
            if total_price is None:
                continue
            if total_price < threshold:
                flights_with_low_prices.append(leg_id)

    logger.info(
        f"{stage_name} - Found {len(flights_with_low_prices)} flights with prices lower than {threshold}"
    )

    # return all flight data for resulted flights
    # TODO - request flights data from MongoDB
def aggregate(
    collection: pymongo.collection.Collection,
    aggregation: Iterable[Dict[Text, Any]],
) -> pymongo.command_cursor.CommandCursor:
    """
    Run an aggregation pipeline on a collection with disk use allowed.

    Parameters
    ----------
        collection:
            collection whose ``aggregate`` method is invoked
        aggregation:
            any iterable of pipeline stages; it is copied into a list
            before being passed on

    Returns
    -------
        Whatever ``collection.aggregate`` returns for the pipeline.
    """
    # Materialise the stages first so that generators and other one-shot
    # iterables are handed over as a plain list.
    pipeline = list(aggregation)
    cursor = collection.aggregate(pipeline, allowDiskUse=True)
    return cursor
def construct_schema_pymongo(
    collection: pymongo.collection.Collection,
    delimiter: str,
    use_random_sampling: bool,
    max_document_size: int,
    is_version_gte_4_4: bool,
    sample_size: Optional[int] = None,
) -> Dict[Tuple[str, ...], SchemaDescription]:
    """
    Calls construct_schema on a PyMongo collection.

    Returned schema is keyed by tuples of nested field names, with each
    value containing 'types', 'count', 'nullable', 'delimited_name', and
    'type' attributes.

    Parameters
    ----------
        collection:
            the PyMongo collection
        delimiter:
            string to concatenate field names by
        use_random_sampling:
            when True the documents are picked with a ``$sample`` stage,
            otherwise the first ``sample_size`` documents are taken with
            ``$limit``
        max_document_size:
            maximum size of the document that will be considered for
            generating the schema. Only applied when ``is_version_gte_4_4``
            is True.
        is_version_gte_4_4:
            whether the size filter (which relies on ``$bsonSize``) should
            be added to the pipeline
        sample_size:
            number of items in the collection to sample
            (reads entire collection if not provided)
    """
    doc_size_field = "temporary_doc_size_field"
    aggregations: List[Dict] = []

    if is_version_gte_4_4:
        # create a temporary field to store the size of the document.
        # filter on it and then remove it.
        aggregations = [
            {"$addFields": {doc_size_field: {"$bsonSize": "$$ROOT"}}},
            {"$match": {doc_size_field: {"$lt": max_document_size}}},
            {"$project": {doc_size_field: 0}},
        ]

    # Only restrict the number of documents when a sample size was given.
    # Without this guard the default ``sample_size=None`` ended up inside
    # the $sample/$limit stage, which is not a valid stage argument, instead
    # of reading the whole collection as documented.
    if sample_size:
        if use_random_sampling:
            # get sample documents in collection
            aggregations.append({"$sample": {"size": sample_size}})
        else:
            aggregations.append({"$limit": sample_size})

    documents = collection.aggregate(aggregations, allowDiskUse=True)

    return construct_schema(list(documents), delimiter)
def getEarliestUpdateTime(c: pymongo.collection.Collection) -> datetime.datetime:
    """
    Return the minimum ``updated`` value over all documents of a collection.

    The single aggregation result is also printed to stdout before its
    ``firstInsert`` field is returned.

    NOTE(review): the first result is fetched with ``cursor.next()`` without
    a fallback, so an aggregation that yields nothing (presumably an empty
    collection) propagates whatever ``next()`` raises; confirm callers
    expect that.
    """
    # A constant _id puts every document into one group, so $min runs over
    # the whole collection.
    group_stage = {'$group': {'_id': 'all', 'firstInsert': {'$min': '$updated'}}}
    cursor = c.aggregate([group_stage])
    summary = cursor.next()
    print(summary)
    return summary['firstInsert']