def insert_doc(doc: dict, col: pymongo.collection.Collection):
    """Insert *doc* into *col* unless a duplicate already exists.

    Deduplication key: "danmaku" when truthy, otherwise "uid" when truthy.
    If neither key holds a truthy value the document is inserted as-is.

    :param doc: document to insert
    :param col: target MongoDB collection
    """
    # find_one avoids iterating a cursor just to detect existence;
    # .get() tolerates documents missing the dedup keys entirely.
    if doc.get("danmaku"):
        duplicate = col.find_one({"danmaku": doc["danmaku"]}) is not None
    elif doc.get("uid"):
        duplicate = col.find_one({"uid": doc["uid"]}) is not None
    else:
        duplicate = False
    if not duplicate:
        col.insert_one(doc)
        print("insert: " + str(doc))
def construct_schema_pymongo(
    collection: pymongo.collection.Collection,
    delimiter: str,
    sample_size: Optional[int] = None,
) -> Dict[Tuple[str, ...], SchemaDescription]:
    """
    Build a schema description for a PyMongo collection via construct_schema.

    The returned mapping is keyed by tuples of nested field names; each value
    carries 'types', 'count', 'nullable', 'delimited_name', and 'type'.

    Parameters
    ----------
    collection:
        the PyMongo collection
    delimiter:
        string to concatenate field names by
    sample_size:
        number of items in the collection to sample
        (reads entire collection if not provided)
    """
    if sample_size:
        # Server-side random sample; allowDiskUse copes with big collections.
        cursor = collection.aggregate(
            [{"$sample": {"size": sample_size}}], allowDiskUse=True
        )
    else:
        # No sample size given: scan every document.
        cursor = collection.find({})
    return construct_schema(list(cursor), delimiter)
def download_neighbors(
    collection: pymongo.collection.Collection,
    source: str,
    limit: int = 0,
) -> List[Tuple[str, float]]:
    """
    Return the neighbors of *source* as (target, weight) tuples queried from
    the graph collection. If limit is 0, all neighbors are returned; otherwise
    at most *limit* edges are read (MongoDB's cursor limit, not random).
    """
    cursor = collection.find(
        {"source": source},
        projection={
            "target": 1,
            "weight": 1,
            "_id": 0,
        },
        limit=limit,
    )
    neighbors: List[Tuple[str, float]] = []
    for edge in cursor:
        try:
            neighbors.append((str(edge["target"]), float(edge["weight"])))
        # ValueError alone misses float(None) -> TypeError and documents
        # lacking "target"/"weight" -> KeyError; all three mean "invalid edge".
        except (KeyError, TypeError, ValueError):
            print()
            print(f"Node {source} has an invalid edge: {edge}")
    return neighbors
def get_all_items(collection: pymongo.collection.Collection, model: Type[BaseModel], *,
                  page: int = 1, num_per_page: int = 20, query: dict = None,
                  projection: dict = None):
    """
    Retrieve all items from a collection
    :param collection: Collection to query
    :param model: Class which the JSON in the collection represents
    :param page: Page number to retrieve. #ToDo: implement correct server-side pagination
    :param num_per_page: Number of items per page to retrieve. Defaults to 20.
    :param query: Return only objects that contain the query
    :param projection: Filter to exclude keys from each result
    :return: List of objects in the collection
    """
    # Copy before updating so the caller's projection dict is never mutated
    # (the original code wrote ignore_mongo_id into the caller's object).
    effective_projection = dict(projection) if projection else {}
    effective_projection.update(ignore_mongo_id)
    collection_json = list(collection.find(filter=query, projection=effective_projection)
                           .skip((page - 1) * num_per_page)
                           .limit(num_per_page))
    return [model(**item_json) for item_json in collection_json]
def get_all(collection: pymongo.collection.Collection) -> List[str]:
    """Return the 'symbole' field of every document in *collection*."""
    return [document['symbole'] for document in collection.find({})]
def select_documents_after(
        refobj: ObjectId,
        collection: pymongo.collection.Collection) -> pymongo.cursor.Cursor:
    """Return a cursor over documents whose _id is greater than *refobj*
    (i.e. created after it), sorted by 'date' and capped at
    MAX_DOCUMENT_LOADED results."""
    return (collection
            .find({"_id": {"$gt": refobj}})
            .sort('date')
            .limit(MAX_DOCUMENT_LOADED))
def _get_last_datetime(collection: pymongo.collection.Collection):
    """Return the most recent 'datetime' value in *collection* as a
    one-element list of {'datetime': ...} (empty list if the collection
    is empty)."""
    cursor = collection.find(
        filter={},
        # 1 is the canonical MongoDB "include this field" projection value;
        # the original -1 happened to work only because any non-zero value
        # is treated as inclusion. Descending order comes from the sort.
        projection={
            'datetime': 1,
            '_id': 0
        },
        sort=[('datetime', -1)],
        limit=1)
    return list(cursor)
def __extract_fields_from_collection(mongo_collection: pymongo.collection.Collection,
                                     field_names: list) -> pymongo.cursor.Cursor:
    """
    Extract only given field names from collection items
    :param mongo_collection: Collection from which data will be extracted
    :param field_names: Name of fields which will be extracted from collection item
    :return: Cursor which refer to given collection
    """
    # Inclusion projection: every requested field maps to True.
    projection = dict.fromkeys(field_names, True)
    return mongo_collection.find({}, projection)
def find_many(col: pymongo.collection.Collection, filter_dict: dict, sort_by: str = None,
              reverse: bool = False, limit: int = Constants.FIND_LIMIT, offset: int = 0,
              include_id: bool = False) -> list:
    """Query *col* with paging and optional sorting.

    :param col: collection to query
    :param filter_dict: MongoDB filter document
    :param sort_by: field to sort on (None = no sort)
    :param reverse: descending sort when True
    :param limit: maximum number of documents returned
    :param offset: number of documents to skip
    :param include_id: keep the ``_id`` field in the results
    :return: list of matching documents
    """
    # BUG FIX: a {"_id": 1} projection returns ONLY _id, silently dropping
    # every other field. To include _id alongside the full document, omit
    # the projection entirely; to drop it, exclude it explicitly.
    projection = None if include_id else {"_id": 0}
    result = col.find(filter_dict, projection)
    if sort_by is not None:
        result = result.sort(sort_by,
                             pymongo.DESCENDING if reverse else pymongo.ASCENDING)
    return list(result.skip(offset).limit(limit))
def get_memes_as_entries(
        memes_collection: pymongo.collection.Collection) -> list:
    """Get all the memes from the database and return it as a printable
    list of entries, ordered by meme name."""
    return [
        f"- {meme['name']} | {meme['description']} | times used: {meme['times_used']}"
        for meme in memes_collection.find().sort("name")
    ]
def fix_all_descriptions(self, collection: pymongo.collection.Collection):
    """One time function to fix all descriptions

    Args:
        collection (pymongo.collection.Collection): MongoDB collection to be updated
    """
    for offer in tqdm.tqdm(collection.find(), desc="Fixing the descriptions"):
        try:
            # Re-parse the stored offer and write the fixed fields back.
            collection.update_one(
                {"id": offer['id']},
                {"$set": self.parse_single_description(offer)})
        except IndexError:
            print("ERROR")
def mongodb_get_array(
    coll: pymongo.collection.Collection,
    meta_id: ObjectId,
    name: str,
    chunk: Optional[Tuple[int, ...]],
) -> np.ndarray:
    """Load all MongoDB documents making up a dask chunk and assemble them
    into an array
    """
    find_key = {"meta_id": meta_id, "name": name, "chunk": chunk}
    # Only the fields docs_to_array needs; ordered by segment index "n".
    projection = {"dtype": 1, "shape": 1, "data": 1}
    cursor = coll.find(find_key, projection).sort("n")
    return docs_to_array(list(cursor), find_key)
def build_users(database: pymongo.collection.Collection):
    """Build Person objects keyed by user id from every document in
    *database*, stopping after ~2000 documents. Prints progress every
    1000 documents and returns the constructed Person values."""
    users = {}
    # enumerate(..., 1) reproduces the original post-increment counter.
    for count, document in enumerate(database.find(), 1):
        user_id = get_id_from_document(document)
        if user_id not in users:
            users[user_id] = PersonFactory.create(document)
        users[user_id].update(document)
        if count > 2000:
            break
        if count % 1000 == 0:
            print(count)
    return users.values()
def get_difference(collection: pymongo.collection.Collection, field: str,
                   values: pd.Series) -> pd.Series:
    """Return the subset of *values* that has no document in *collection*
    with a matching *field* value.

    Parameters
    ----------
    collection : collection to query
    field : document field to match against
    values : candidate values to check

    Returns
    -------
    pd.Series of values absent from the collection (order unspecified,
    since the difference is computed through sets).
    """
    # BSON cannot encode a pandas Series directly; $in requires a plain list.
    cursor = collection.find({field: {"$in": list(values)}})
    intersection = [document.get(field) for document in cursor]
    difference = set(values) - set(intersection)
    return pd.Series(list(difference))
def update_users_client_metrics(user_collection: pymongo.collection.Collection,
                                from_date: str, to_date: str,
                                dry_run: bool = True) -> None:
    """Update user data with client-side metrics from Amplitude."""
    # Users registered in [from_date, to_date] with a usable Amplitude ID
    # whose mobile-session flag has not been computed yet.
    selector = {
        'registeredAt': {
            '$gt': from_date,
            '$lt': to_date
        },
        'clientMetrics.amplitudeId': {
            '$nin': [_AMPLITUDE_ID_NOT_FOUND, 'REDACTED']
        },
        'clientMetrics.isFirstSessionMobile': {
            '$exists': False
        },
    }
    updated_count = 0
    for user in user_collection.find(
            selector, projection={'_id': 1, 'clientMetrics': 1}):
        try:
            _update_user_client_metric(user_collection, user, dry_run)
        except TooManyRequestsException:
            # The API is limited to 360 requests, so if we manage to get 200
            # users it's expected to get an error here: no need to warn Sentry.
            if updated_count > 200:
                logging.info('Too many requests after updating %d users',
                             updated_count)
                return
            raise
        updated_count += 1
def exploreChunks(self, ilon_chunk: int, ilat_chunk: int, delta: int,
                  mask_query: Union[dict, None], retrn: str,
                  col_grid: pymongo.collection.Collection)\
        -> Union[dict, pymongo.cursor.Cursor]:
    '''
    Explore an xarray chunk and returns either the number of grid cells
    or the grid ids.

    Parameters
    ----------
    ilon_chunk : int
        Longitude of the upper-left bounding box corner.
    ilat_chunk : int
        Latitude of the upper-left bounding box corner.
    delta : int
        Width and height of the bounding box (in degrees).
    mask_query : Union[dict, None]
        If all grid cells needs to be considered, set mask_query=None.
        If only certain grid cells needs to be considered, filter with
        this query. Example: for only land grid cells (i.e., excluding
        oceans) mask_query = {'lsm': {'$gt': 0.6}}. Land-sea mask (lsm)
        has fractional values in the range 0 (sea) to 1 (land).
    retrn : str
        What to return:
            * either 'ndocs' for the number or grid cells inside the chunk
            * or 'docs' for the ids of the grid cells inside the chunk.
    col_grid : pymongo.collection.Collection
        Mongo connection to the grid collection.

    Returns
    -------
    Union[dict, pymongo.cursor.Cursor]
        For 'ndocs': a dict with the original chunk corner and the count.
        For 'docs': a cursor over {'id_grid', 'loc'} documents.

    NOTE(review): any other *retrn* value leaves `res` unbound and raises
    UnboundLocalError at the final return — confirm callers only pass
    'ndocs'/'docs'.
    '''
    # Keep the caller's original longitude for the returned dict.
    ilon_orig = int(ilon_chunk)
    # _shiftlon presumably normalizes longitudes into a canonical range
    # (e.g. [-180, 180)) — TODO confirm against its definition.
    ilon_chunk = int(self._shiftlon(x=ilon_chunk))
    # NOTE(review): the right edge is computed from the already-shifted
    # ilon_chunk, not the original — verify this is intentional.
    ilon_plus = int(self._shiftlon(x=ilon_chunk + delta))
    ilat_chunk = int(ilat_chunk)
    # Closed GeoJSON polygon ring covering the delta x delta chunk
    # (first and last coordinate identical, as required by $geoWithin).
    geoqry = {
        'loc': {
            '$geoWithin': {
                '$geometry': {
                    'type': 'Polygon',
                    'coordinates': [[[ilon_chunk, ilat_chunk],
                                     [ilon_plus, ilat_chunk],
                                     [ilon_plus, ilat_chunk + delta],
                                     [ilon_chunk, ilat_chunk + delta],
                                     [ilon_chunk, ilat_chunk]]]
                }
            }
        }
    }
    if mask_query is not None:
        # Additional filter terms are AND-ed with the geo query.
        geoqry.update(mask_query)
    if retrn == 'ndocs':
        # How many grid cells in this chunk ?
        res = {
            'ilon_chunk': ilon_orig,
            'ilat_chunk': ilat_chunk,
            'n': col_grid.count_documents(filter=geoqry)
        }
    elif retrn == 'docs':
        # Cursor over grid ids and locations (Mongo _id suppressed).
        res = col_grid.find(geoqry, {'id_grid': 1, 'loc': 1, '_id': 0})
    return (res)
def find(
    query: Dict[Text, Any],
    collection: pymongo.collection.Collection,
) -> pymongo.cursor.Cursor:
    """Run *query* against *collection* and return the resulting cursor."""
    cursor = collection.find(query)
    return cursor
def convert_currencies(self, rates: RatesConverter, collection: pymongo.collection.Collection):
    """Maps the wages to PLN currencies for all offers in DB - we run it on
    all offers to adjust for currency fluctuation

    Args:
        rates (RatesConverter): RatesConverter instance to convert
        collection (pymongo.collection.Collection): MongoDB collection to be updated
    """

    def map_single_salary_field(salary: Dict, salary_field: str, currency_rate: float):
        """Helper method to convert a single field - adds a new field with _pln ending

        Mutates *salary* in place: coerces the source field to int and writes
        the truncated PLN value under '<field>_pln' (or 'undisclosed').

        Args:
            salary (Dict): nested document with salary info
            salary_field (str): name of the salary field
            currency_rate (float): Currency rate from target currency to PLN

        Returns:
            Dict: Updated salary chunk
        """
        if salary[salary_field] != 'undisclosed':
            # int() truncates toward zero; the double int() below is redundant
            # but harmless since the field was just converted.
            salary[salary_field] = int(salary[salary_field])
            salary[f'{salary_field}_pln'] = int(
                int(salary[salary_field]) * currency_rate)
        else:
            salary[f'{salary_field}_pln'] = 'undisclosed'
        return salary

    def map_single_salary(salary: Dict, currency_rate: float):
        """Maps all fields in a single salary field

        Converts upper/lower range, then derives 'average' from the already
        int-coerced ranges and converts it too.

        Args:
            salary (Dict): nested document with salary info
            currency_rate (float): Currency rate from target currency to PLN

        Returns:
            Dict: Updated salary chunk
        """
        salary = map_single_salary_field(salary, 'upper_range', currency_rate)
        salary = map_single_salary_field(salary, 'lower_range', currency_rate)
        # NOTE(review): only upper_range is checked — assumes lower_range is
        # disclosed whenever upper_range is; confirm against the data model.
        if salary['upper_range'] != 'undisclosed':
            salary['average'] = (salary['upper_range'] + salary['lower_range']) / 2
        else:
            salary['average'] = 'undisclosed'
        salary = map_single_salary_field(salary, 'average', currency_rate)
        return salary

    offers = collection.find()
    for offer in tqdm.tqdm(offers, desc="Mapping the currencies to PLN"):
        try:
            new_salaries = []
            for salary in offer["salary"]:
                # rates[...] presumably maps a currency code to its PLN rate
                # — TODO confirm RatesConverter's __getitem__ contract.
                new_salaries.append(
                    map_single_salary(salary, rates[salary['currency']]))
            collection.update_one({"id": offer['id']}, {"$set": {
                "salary": new_salaries
            }})
        except IndexError:
            print("ERROR converting currencies")
def get_all_memes(memes_collection: pymongo.collection.Collection) -> list:
    """Fetch every meme document from the database so the caller can
    display them to the user."""
    # NOTE(review): find() actually yields a Cursor, not a list; the
    # annotation is kept unchanged for interface compatibility.
    return memes_collection.find()
def get_users(coll: pymongo.collection.Collection, **by):
    """Return all documents of type "user" matching the extra filters in *by*.

    Any "type" key passed by the caller is overridden with "user".
    """
    query = {**by, "type": "user"}
    return list(coll.find(query))