def bulk_insert_collections(path, indexed, drop_on_start, drop_on_exit=False,
                            write_concern=1):
    """Bulk-inserts tweets and their users into separate collections.

    :param path: path to the line-delimited JSON document to insert
    :param indexed: create indexes before the insert, otherwise remove them
    :param drop_on_start: drop the database before the insert
    :param drop_on_exit: drop the database after the insert
    :param write_concern: write concern level for both collections
    :return: execution time in seconds and the size of the inserted file
    """
    if drop_on_start:
        drop_database_collections(DATABASE_COLLECTION)
    db = connect(HOST, PORT).get_database(DATABASE_COLLECTION)
    user_collection = db.get_collection(
        'users', write_concern=pymongo.WriteConcern(w=write_concern))
    tweet_collection = db.get_collection(
        'tweets', write_concern=pymongo.WriteConcern(w=write_concern))
    if indexed:
        create_indexes()
    else:
        remove_indexes()
    users = []
    tweets = []
    with open(path, 'r') as document:
        for doc in document:
            d = json.loads(doc)
            users.append(d['user'])
            # replace the embedded user with a reference to the users collection
            d['user_id'] = d['user']['id']
            del d['user']
            tweets.append(d)
    start_time = time.time()
    user_collection.insert_many(users)
    tweet_collection.insert_many(tweets)
    execution_time = time.time() - start_time
    # measure the file that was actually inserted, not the DOCUMENT default
    size = "{}MB".format(round(os.path.getsize(path) / 1024 / 1024, 2))
    logger.info("{} seconds to bulk_insert_collections {}".format(
        execution_time, size))
    if drop_on_exit:
        drop_database(DATABASE_COLLECTION)
    return execution_time, size
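# A minimal usage sketch (assumes a running mongod plus this module's HOST,
# PORT, and DATABASE_COLLECTION; the file name is illustrative). Lower write
# concerns acknowledge earlier: w=0 is fire-and-forget, w=1 waits for the
# primary, so the two timings below should differ.
if __name__ == '__main__':
    for w in (0, 1):
        secs, size = bulk_insert_collections(
            'tweets.json', indexed=False, drop_on_start=True,
            drop_on_exit=True, write_concern=w)
        print("w={}: {} seconds for {}".format(w, secs, size))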
def find_collections(indexed):
    """Runs the benchmark read query five times against the split collections.

    :param indexed: create indexes before querying, otherwise remove them
    :return: total execution time in seconds and the matching document count
    """
    if indexed:
        create_indexes()
    else:
        remove_indexes()
    db = connect(HOST, PORT).get_database(DATABASE_COLLECTION)
    user_collection = db.get_collection('users',
                                        write_concern=pymongo.WriteConcern())
    execution_time = 0
    count = 0
    for _ in range(5):
        count = 0
        start_time = time.time()
        # cursor.count() was deprecated and removed in PyMongo 4;
        # count_documents() is the supported equivalent. The three
        # conditions are implicitly ANDed in a single filter document.
        count += user_collection.count_documents({
            'location': 'London',
            'friends_count': {'$gt': 1000},
            'followers_count': {'$gt': 1000}
        })
        execution_time += time.time() - start_time
    logger.info("{} seconds to find_collections {} with indexed={}".format(
        execution_time, count, indexed))
    return execution_time, count
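# Hypothetical helper (not part of the original suite): a compound index with
# the equality field first and the two range fields after it can serve the
# London query above directly from the index.
def _create_users_query_index():
    db = connect(HOST, PORT).get_database(DATABASE_COLLECTION)
    db.users.create_index([('location', pymongo.ASCENDING),
                           ('friends_count', pymongo.ASCENDING),
                           ('followers_count', pymongo.ASCENDING)])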
def _setup_mongodb(self, logger):
    mongodb = self.config.logging.mongodb
    if mongodb:
        conn = self.config.sys.log
        if conn:
            conn = conn.connect()
            existing = conn.connection[
                conn.database].list_collection_names()
            if conn.collection not in existing:
                try:
                    # sys.log is capped so it never grows beyond the
                    # configured size
                    conn.connection[conn.database].create_collection(
                        name=conn.name, capped=True,
                        size=self.config.logging.size)
                    for idx in ("hostname", "identifier", "username",
                                "qual_name"):
                        conn.create_index(
                            [("created", pymongo.DESCENDING),
                             (idx, pymongo.DESCENDING)], name=idx)
                except Exception:
                    self.logger.warning("failed to create [sys.log]")
            # config.logging.mongodb carries a level name, e.g. "INFO"
            level = getattr(logging, mongodb)
            write_concern = self.config.logging.write_concern
            handler = core4.logger.handler.MongoLoggingHandler(
                conn.with_options(write_concern=pymongo.WriteConcern(
                    w=write_concern)))
            handler.setLevel(level)
            logger.addHandler(handler)
            self._setup_tornado(handler, level)
            self.logger.debug(
                "mongodb logging setup complete, "
                "level [%s], write concern [%d]", mongodb, write_concern)
        else:
            raise core4.error.Core4SetupError(
                "config.logging.mongodb set, but config.sys.log is None")
def insert_fn(remaining_secs):
    # wtimeout is expressed in milliseconds; bound the write by the time
    # budget the caller has left
    remaining_millis = int(round(remaining_secs * 1000))
    write_concern = pymongo.WriteConcern(w=2, wtimeout=remaining_millis)
    coll = client.resmoke.get_collection("await_ready",
                                         write_concern=write_concern)
    # w=2 blocks until the write replicates to a second replica-set member
    coll.insert_one({"awaiting": "ready"})
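# Hypothetical wrapper (not part of the original fixture): a sketch that
# translates the write-concern timeout into a boolean "ready" signal. PyMongo
# raises WTimeoutError when the w=2 acknowledgment does not arrive in time.
def await_with_budget(remaining_secs):
    try:
        insert_fn(remaining_secs)
        return True
    except pymongo.errors.WTimeoutError:
        # the write did not replicate to two members within the budget
        return False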
def await_ready(self):
    """Block until the fixture can be used for testing."""
    # Wait for the config server
    if self.configsvr is not None:
        self.configsvr.await_ready()

    # Wait for each of the shards
    for shard in self.shards:
        shard.await_ready()

    # We call self._new_mongos() and mongos.setup() in await_ready() rather
    # than in self.setup() because mongos routers have to connect to a
    # running cluster.
    if not self.mongos:
        for i in range(self.num_mongos):
            mongos = self._new_mongos(i, self.num_mongos)
            self.mongos.append(mongos)

    for mongos in self.mongos:
        # Start up the mongos.
        mongos.setup()
        # Wait for the mongos.
        mongos.await_ready()

    client = self.mongo_client()
    self._auth_to_db(client)

    # Turn off the balancer if it is not meant to be enabled.
    if not self.enable_balancer:
        self.stop_balancer()

    # Turn off autosplit if it is not meant to be enabled.
    if not self.enable_autosplit:
        wc = pymongo.WriteConcern(w="majority", wtimeout=30000)
        coll = client.config.get_collection("settings", write_concern=wc)
        coll.update_one({"_id": "autosplit"}, {"$set": {"enabled": False}},
                        upsert=True)

    # Inform mongos about each of the shards
    for shard in self.shards:
        self._add_shard(client, shard)

    # Ensure that all CSRS nodes are up to date. This is strictly needed for
    # tests that use multiple mongoses. In those cases, the first mongos
    # initializes the contents of the config database, but without waiting
    # for those writes to replicate to all the config servers, the secondary
    # mongoses risk reading from a stale config server and seeing an empty
    # config database.
    self.configsvr.await_last_op_committed()

    # Enable sharding on each of the specified databases
    for db_name in self.enable_sharding:
        self.logger.info("Enabling sharding for '%s' database...", db_name)
        client.admin.command({"enablesharding": db_name})

    # Ensure that the sessions collection gets auto-sharded by the config
    # server
    if self.configsvr is not None:
        primary = self.configsvr.get_primary().mongo_client()
        primary.admin.command({"refreshLogicalSessionCacheNow": 1})
def bulk_insert_one(path, drop_on_start, drop_on_exit=False, write_concern=1):
    """Inserts the documents in *path* one at a time with insert_one().

    :param path: path to the line-delimited JSON document to insert
    :param drop_on_start: drop the database before the insert
    :param drop_on_exit: drop the database after the insert
    :param write_concern: write concern level for the collection
    :return: execution time in seconds and the size of the inserted file
    """
    if drop_on_start:
        drop_database(DATABASE)
    db = connect(HOST, PORT).get_database(DATABASE)
    coll = db.get_collection(
        COLLECTION, write_concern=pymongo.WriteConcern(w=write_concern))
    with open(path, 'r') as document:
        start = time.time()
        for doc in document:
            coll.insert_one(json.loads(doc))
        run = time.time() - start
    # measure the file that was actually inserted, not the DOCUMENT default
    size = "{}MB".format(round(os.path.getsize(path) / 1024 / 1024, 2))
    logger.info("{} seconds to bulk insert one {}".format(run, size))
    if drop_on_exit:
        drop_database(DATABASE)
    return run, size
def bulk_insert(path, indexed, drop_on_start, drop_on_exit=False,
                write_concern=1):
    """Bulk-inserts a line-delimited JSON file with a single insert_many().

    :param path: path to the line-delimited JSON document to insert
    :param indexed: create indexes before the insert, otherwise remove them
    :param drop_on_start: drop the database before the insert
    :param drop_on_exit: drop the database after the insert
    :param write_concern: write concern level for the collection
    :return: execution time in seconds and the size of the inserted file
    """
    if drop_on_start:
        drop_database(DATABASE)
    # build (or drop) the indexes up front so insert_many pays the
    # index-maintenance cost during the timed run
    if indexed:
        create_indexes()
    else:
        remove_indexes()
    db = connect(HOST, PORT).get_database(DATABASE)
    coll = db.get_collection(
        COLLECTION, write_concern=pymongo.WriteConcern(w=write_concern))
    docs = []
    with open(path, 'r') as document:
        for doc in document:
            docs.append(json.loads(doc))
    start = time.time()
    coll.insert_many(docs)
    run = time.time() - start
    size = "{}MB".format(round(os.path.getsize(path) / 1024 / 1024, 2))
    logger.info("{} seconds to bulk_insert {}, indexed={}".format(
        run, size, indexed))
    if drop_on_exit:
        drop_database(DATABASE)
    return run, size
def __init__(self, logger):
    # type: (Logging.Logger) -> None
    self._logger = logger
    try:
        self._mongo_connection = MongoConnection()
        self._client = self._mongo_connection.client()
        # w=0: deduplication bookkeeping is fire-and-forget, so a lost
        # write never blocks the source scan
        self._db = self._client.get_database(
            name=os.getenv('GREASE_MONGO_DB', 'grease'),
            write_concern=pymongo.WriteConcern(w=0))
        self._collection = self._db.get_collection(name='source_dedup')
        self._dedup = True
    except ServerSelectionTimeoutError:
        # no MongoDB reachable: disable deduplication instead of failing
        self._mongo_connection = None
        self._client = None
        self._db = None
        self._collection = None
        self._dedup = False
def init_app(self, app: Flask, uri=None, db_name=None, **kwargs):
    self.app = app
    if uri:
        self.uri = uri
    if db_name:
        self.db_name = db_name
    if kwargs:
        self.kwargs.update(kwargs)
    if not self.uri:
        self.uri = app.config['MONGO_URL']
    self.client = pymongo.MongoClient(self.uri, **self.kwargs)
    self.database = self.client.get_database(
        self.db_name, write_concern=pymongo.WriteConcern(w='majority'))
    # GridFS file storage (collections fs.files / fs.chunks)
    self.gridfs = gridfs.GridFS(self.database)
    self.app.extensions['mongo'] = self
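# Hypothetical usage of the extension above (the class name `Mongo` and the
# connection URL are illustrative, not from the source):
#
#     app = Flask(__name__)
#     app.config['MONGO_URL'] = 'mongodb://localhost:27017'
#     mongo = Mongo()
#     mongo.init_app(app, db_name='myapp')
#     mongo.database.users.insert_one({'name': 'alice'})  # w='majority'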
def trigger(self, name, channel=None, data=None, author=None):
    """
    Triggers an event in collection ``sys.event``. This method uses a
    special mongo connection with write concern ``0``. If the collection
    ``sys.event`` does not exist, it is created as a capped collection
    with the size configured by key ``config.event.size``.

    :param name: of the event
    :param channel: of the event, defaults to channel name ``system``
    :param data: to be attached to the event
    :param author: of the event, defaults to the current username
    :return: event id (MongoDB ``_id``)
    """
    if self._event is None:
        conn = self.config.sys.event.connect(concurr=False)
        if conn:
            wc = self.config.event.write_concern
            # with_options() returns a new object; assign it back or the
            # write concern never takes effect
            conn = conn.with_options(
                write_concern=pymongo.WriteConcern(w=wc))
            self.logger.debug(
                "mongodb event setup complete, write concern [%d]", wc)
        else:
            raise core4.error.Core4SetupError("config.event not set")
        existing = conn.connection[
            conn.database].list_collection_names()
        if conn.collection not in existing:
            conn.connection[conn.database].create_collection(
                name=conn.name, capped=True,
                size=self.config.event.size
            )
        self._event = conn
    doc = {
        "created": core4.util.node.mongo_now(),
        "name": name,
        "author": author or core4.util.node.get_username(),
        "channel": channel or core4.const.DEFAULT_CHANNEL
    }
    if data:
        doc["data"] = data
    inserted = self._event.insert_one(doc)
    return inserted.inserted_id
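# A minimal sketch of firing an event (the `queue` object, channel, and
# payload are illustrative): with write concern 0 the insert returns without
# server acknowledgment, so the returned _id is generated client-side.
#
#     event_id = queue.trigger("job.start", channel="jobs",
#                              data={"job": "example.MyJob"})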
def message_databases(self):
    """List of message databases, ordered by partition number."""
    kwargs = {}
    if self.server_version >= (2, 6):
        # NOTE(flaper87): Skip mongodb versions below 2.6 when
        # setting the write concern on the database. pymongo 3.0
        # fails with norepl when creating indexes.
        doc = self.connection.write_concern.document.copy()
        doc.setdefault('w', 'majority')
        doc.setdefault('j', False)
        kwargs['write_concern'] = pymongo.WriteConcern(**doc)
    name = self.mongodb_conf.database
    partitions = self.mongodb_conf.partitions
    databases = []
    for p in range(partitions):
        db_name = name + self._COL_SUFIX + str(p)
        databases.append(self.connection.get_database(db_name, **kwargs))
    return databases
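# Side note on the pattern above (a sketch using only public PyMongo API):
# WriteConcern.document exposes the options as a plain dict, so existing
# client settings can be copied and only the missing keys filled in.
#
#     wc = pymongo.WriteConcern(w=1)
#     doc = wc.document.copy()          # {'w': 1}
#     doc.setdefault('w', 'majority')   # no-op: the caller already set w
#     doc.setdefault('j', False)        # adds the journaling default
#     merged = pymongo.WriteConcern(**doc)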
def deleteFile(self, file):
    """
    Delete all of the chunks in the collection that correspond to the
    given file.
    """
    q = {
        'chunkUuid': file['chunkUuid'],
        'assetstoreId': self.assetstore['_id']
    }
    matching = File().find(q, limit=2, projection=[])
    if matching.count(True) == 1:
        # If we can't reach the database, we return anyway. A system check
        # will be necessary to remove the abandoned file. Since we already
        # can handle that case, tell Mongo to use a 0 write concern -- we
        # don't need to know that the chunks have been deleted, and this
        # can be faster.
        try:
            self.chunkColl.with_options(
                write_concern=pymongo.WriteConcern(w=0)).delete_many(
                    {'uuid': file['chunkUuid']})
        except pymongo.errors.AutoReconnect:
            pass
def mongo_save(database, collection_key, id_key, data):
    """Save results to MongoDB database.

    Args:
      database (:class:`pymongo.database.Database`): MongoDB database to
        save results to.
      collection_key (str): name of collection.
      id_key (str): id key with which to store :attr:`data`.
      data (:class:`bson.binary.Binary` or dict): data to store in
        :attr:`db`.
    """
    collection = database[collection_key].with_options(
        write_concern=pymongo.WriteConcern(w=1))
    tries_left = _MONGO_MAX_TRIES
    while tries_left > 0:
        tries_left -= 1
        try:
            collection.replace_one({'_id': id_key}, data, upsert=True)
            return
        except (pymongo.errors.WriteConcernError, pymongo.errors.WriteError):
            if tries_left == 0:
                print(f"Warning: could not write entry to mongodb after"
                      f" {_MONGO_MAX_TRIES} attempts.")
                raise
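# Hypothetical usage (client URI, database, and key names are illustrative):
# replace_one with upsert=True makes repeated saves under the same id_key
# idempotent, while transient write errors are retried up to
# _MONGO_MAX_TRIES times.
#
#     client = pymongo.MongoClient('mongodb://localhost:27017')
#     mongo_save(client.results, 'runs', 'experiment-42', {'loss': 0.17})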
def upsert_data(self, zipref, collection_name_prefix, filename, skiprows,
                parse_dates, indice=[], na_values=["NIL", "/0"],
                drop_columns=["eNodeB Function Name"],
                rename_columns={"Local cell name": "Cell Name"}):
    # skip the file if it has been read already
    myclient = pymongo.MongoClient(self.MONGO_CLIENT_URL)
    mydb = myclient[self.DB_NAME]
    mycol = mydb["read_files"]
    if mycol.find_one({"_id": filename}) is not None:
        print("{} has been loaded already".format(filename))
        myclient.close()
        return
    myclient.close()

    # read the file into a DataFrame
    st = time.time()
    file_extension = os.path.splitext(filename)[1]
    if file_extension == ".csv":
        # skip the "Total XXX Records" trailer lines
        temp = io.StringIO()
        with zipref.open(filename) as f:
            for line in f.readlines():
                line = line.decode("utf-8")
                if not line.startswith("Total"):
                    temp.write(line)
        temp.seek(0)
        df = pd.read_csv(temp, parse_dates=parse_dates, skiprows=skiprows,
                         na_values=na_values)
    elif file_extension == ".xlsx":
        zipref.extract(filename, "extract")
        df = pd.read_excel(os.path.join("extract", filename),
                           parse_dates=parse_dates, na_values=na_values)

    # if Date and Time are split into two columns, combine them
    if "Date" in df.columns and "Time" in df.columns:
        df["Date"] = df.apply(
            lambda row: row["Date"].replace(" DST", ""), axis=1)
        df["Date"] = pd.to_datetime(df["Date"])
        df["Time"] = pd.to_timedelta(df["Time"])
        df["Time"] = df["Time"] - pd.to_timedelta(df["Time"].dt.days,
                                                  unit='d')
        df["Time"] = df["Date"] + df["Time"]

    # rename columns
    df = df.rename(columns=rename_columns)

    # convert cell/site name columns to string
    for col in ["Cell Name", "Site Name", "eNodeB Name", "gNodeB Nam"]:
        if col in df.columns:
            df = df.astype({col: str})

    # normalise KPI names; default to the original columns so the
    # assignment below is safe for other technologies
    columns = list(df.columns)
    # remove "(...)" in KPI names for 4G/5G
    if self.tech == "4G" or self.tech == "5G":
        p = re.compile(r"\(.+\)")
        columns = [p.sub("", x) for x in df.columns]
    # remove the suffix after ":" in KPI names for 2G
    if self.tech == "2G":
        columns = [x.split(":")[0] for x in df.columns]
    df.columns = columns

    # drop columns
    df.drop(columns=[col for col in drop_columns if col in df.columns],
            inplace=True)
    print("load", time.time() - st)

    # # run agg
    # st = time.time()
    # if agg_function:
    #     agg = {}
    #     for field in agg_function:
    #         if field in df.columns:
    #             agg[field] = agg_function[field]
    #     df = df.groupby(parse_dates + unique_index, as_index=False).agg(agg)
    # print("agg", time.time() - st)

    # deal with auto_complete
    st = time.time()
    for auto_complete_field, auto_complete_collection in \
            self.auto_complete_fields:
        if auto_complete_field in df.columns:
            s = set(df[auto_complete_field].unique()) - \
                self.auto_complete_existed_sets[auto_complete_collection]
            self.auto_complete_existed_sets[
                auto_complete_collection].update(s)
            self.to_add_auto_complete_sets[
                auto_complete_collection].update(s)
    print("deal with auto_complete", time.time() - st)

    # divide the DataFrame by timestamp and insert the slices one by one
    st = time.time()
    df.columns = [x.replace(".", "_") for x in df.columns]
    time_col = parse_dates[0]
    for dt in df[time_col].unique():
        t_df = df[df[time_col] == dt]
        dt = dt.astype('M8[ms]').astype('O')
        if time_col == "Time":
            collection_name = collection_name_prefix + dt.strftime("%Y%m%d%H")
        if time_col == "Date":
            collection_name = collection_name_prefix + dt.strftime("%Y%m%d")
        data = t_df.to_dict(orient='records')
        print("trans df with {} rows in {}:".format(len(data),
                                                    collection_name),
              time.time() - st)
        st = time.time()
        myclient = pymongo.MongoClient(self.MONGO_CLIENT_URL)
        mydb = myclient[self.DB_NAME]
        if collection_name not in mydb.list_collection_names():
            self.create_collection(collection_name, indice)
        # here's the added parameter: w=0 makes the bulk load
        # fire-and-forget, and ordered=False keeps inserting past
        # individual document errors
        mycol = mydb.get_collection(
            collection_name, write_concern=pymongo.WriteConcern(w=0))
        mycol.insert_many(data, ordered=False)
        myclient.close()
        print("Insert {} rows in {}".format(len(data), collection_name),
              time.time() - st)

    # record the filename in read_files
    myclient = pymongo.MongoClient(self.MONGO_CLIENT_URL)
    mydb = myclient[self.DB_NAME]
    mycol = mydb["read_files"]
    mycol.insert_one({"_id": filename})
    myclient.close()
    print("{} inserted".format(filename))
    self.insert_to_add_auto_complete_set()
def process_obj(collection_name, logger, source_name, source_max,
                source_pointer, field_set, source_obj, final, strength=None):
    # first try to find an object-level (type 1) hash match
    mongo_connection = MongoConnection()
    client = mongo_connection.client()
    db = client.get_database(name=os.getenv('GREASE_MONGO_DB', 'grease'),
                             write_concern=pymongo.WriteConcern(w=0))
    collection = db.get_collection(name=collection_name)
    hash_obj = collection.find_one(
        {'hash': SourceDeDuplify.generate_hash(source_obj)})
    if not hash_obj:
        logger.debug(
            "Failed To Locate Type1 Match, Performing Type2 Search Match",
            True)
        # globally unique hash for the request: create a completely new
        # document hash and all the field-set hashes
        collection.insert_one({
            'expiry': SourceDeDuplify.generate_expiry_time(),
            'max_expiry': SourceDeDuplify.generate_max_expiry_time(),
            'source': str(source_name),
            'score': 1,
            'hash': SourceDeDuplify.generate_hash(source_obj),
            'type': 1
        })
        # next, start field-level processing;
        # first check whether the fields are limited
        if len(field_set) < 1:
            # all fields need to be considered for de-dup
            fields = source_obj.keys()
        else:
            # only registered fields
            fields = field_set
        # now compute the composite score
        composite_score = SourceDeDuplify.get_field_score(
            collection, logger, source_name, source_obj, fields)
        if source_pointer == 0:
            compo_spot = 1
        else:
            compo_spot = source_pointer
        logger.debug("DEDUPLICATION COMPOSITE SCORE [" +
                     str(compo_spot) + "/" + str(source_max) + "]: " +
                     str(composite_score))
        # now observe whether the source is 'unique'
        if strength is None:
            composite_score_limit = float(
                os.getenv('GREASE_DEDUP_SCORE', 85))
        else:
            if isinstance(strength, (int, float)):
                logger.debug("Global DeDuplication strength override",
                             verbose=True)
                composite_score_limit = float(strength)
            else:
                composite_score_limit = float(
                    os.getenv('GREASE_DEDUP_SCORE', 85))
        if composite_score < composite_score_limit:
            # unique enough: add it to the final list
            logger.debug("Type2 ruled Unique adding to final result", True)
            final.append(source_obj)
    else:
        # we have a duplicate source document:
        # increase the counter and the expiry and move on (DROP)
        logger.debug("Type1 match found, dropping", True)
        if 'max_expiry' in hash_obj:
            update_statement = {
                "$set": {
                    'score': int(hash_obj['score']) + 1,
                    'expiry': SourceDeDuplify.generate_expiry_time()
                }
            }
        else:
            update_statement = {
                "$set": {
                    'score': int(hash_obj['score']) + 1,
                    'expiry': SourceDeDuplify.generate_expiry_time(),
                    'max_expiry': SourceDeDuplify.generate_max_expiry_time()
                }
            }
        collection.update_one({'_id': hash_obj['_id']}, update_statement)
    mongo_connection.client().close()
def create_fake(__document_class__, __db__=None, __parent__=None,
                __name__=None, __faker__=None, __depth__=DEFAULT_DEPTH,
                __write_concern__=pymongo.WriteConcern(w='majority'),
                **values):
    """ Create document with fake data.

    :param yadm.documents.BaseDocument __document_class__: document class
        for new instance
    :param yadm.database.Database __db__: database instance; if specified,
        the document and all references will be saved to the database
    :param yadm.documents.BaseDocument __parent__: parent document
    :param str __name__: name of parent field
    :param Faker __faker__: faker instance, created if not specified
    :param int __depth__: maximum recursion depth; values greater than 450
        are not recommended (default 4)
    :return yadm.documents.BaseDocument: __document_class__ instance with
        fake data
    """
    if not issubclass(__document_class__, BaseDocument):
        raise TypeError("only BaseDocument subclasses are allowed")

    if __depth__ < 0:
        return AttributeNotSet

    if __faker__ is None:
        __faker__ = Faker()

    document = __document_class__()

    if isinstance(document, Document):
        document.__db__ = __db__
    elif isinstance(document, EmbeddedDocument):
        document.__parent__ = __parent__
        document.__name__ = __name__

    doc_fake_proc = document.__fake__(values, __faker__, __depth__ - 1)

    # extend values from the __fake__ method
    if isinstance(doc_fake_proc, GeneratorType):
        values = next(doc_fake_proc)
    elif isinstance(doc_fake_proc, dict):
        values = doc_fake_proc

    # first: set explicitly passed values
    for name, fake in values.items():
        if fake is not AttributeNotSet:
            setattr(document, name, fake)

    # second: per-field fakers
    for name, field in __document_class__.__fields__.items():
        if name not in values and not hasattr(
                document, '__fake__{}__'.format(name)):
            fake = field.get_fake(document, __faker__, __depth__ - 1)
            if fake is not AttributeNotSet:
                setattr(document, name, fake)

    # third: __fake__{name}__ methods
    for name, field in __document_class__.__fields__.items():
        if name not in values and hasattr(
                document, '__fake__{}__'.format(name)):
            attr = getattr(document, '__fake__{}__'.format(name))
            fake = attr(__faker__, __depth__ - 1)
            if fake is not AttributeNotSet:
                setattr(document, name, fake)

    if isinstance(doc_fake_proc, GeneratorType):
        # pre-save processor
        try:
            next(doc_fake_proc)
        except StopIteration:
            doc_fake_proc = None

    if __db__ is not None:
        __db__.insert(document, write_concern=__write_concern__)

    # post-save processor
    if isinstance(doc_fake_proc, GeneratorType):
        try:
            next(doc_fake_proc)
        except StopIteration:
            pass

    return document
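# Hypothetical usage (the User document class is illustrative, not from the
# source): create_fake fills every field with Faker data; passing __db__
# also persists the document, by default with the majority write concern.
#
#     class User(Document):
#         __collection__ = 'users'
#
#     saved = create_fake(User, __db__=db)  # inserted with w='majority'
#     local = create_fake(User)             # __db__ is None: not saved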
def insert_one_collections(path, indexed, drop_on_start, drop_on_exit=False,
                           write_concern=1):
    """Times a single-document insert into the split benchmark collections.

    The file at *path* is bulk-inserted first so that the single insert is
    measured against a populated database.

    :param path: path to the line-delimited JSON document to bulk-insert
    :param indexed: create indexes before the insert, otherwise remove them
    :param drop_on_start: drop the database before the insert
    :param drop_on_exit: drop the database after the insert
    :param write_concern: write concern level for both collections
    :return: insert_one_time - execution time for one insert,
             doc_size - size of the single inserted document,
             db_size - size of the bulk-inserted file,
             bulk_insert_time - execution time for the bulk insert
    """
    if indexed:
        create_indexes()
    else:
        remove_indexes()
    if drop_on_start:
        drop_database_collections(DATABASE_COLLECTION)
    db = connect(HOST, PORT).get_database(DATABASE_COLLECTION)
    user_collection = db.get_collection(
        'users', write_concern=pymongo.WriteConcern(w=write_concern))
    tweet_collection = db.get_collection(
        'tweets', write_concern=pymongo.WriteConcern(w=write_concern))
    users = []
    tweets = []
    with open(path, 'r') as d1:
        for doc in d1:
            d = json.loads(doc)
            users.append(d['user'])
            # replace the embedded user with a reference to the users collection
            d['user_id'] = d['user']['id']
            del d['user']
            tweets.append(d)
    start = time.time()
    user_collection.insert_many(users)
    tweet_collection.insert_many(tweets)
    bulk_insert_time = time.time() - start
    with open(DOCUMENT_SINGLE, 'r') as d2:
        for doc in d2:
            d = json.loads(doc)
            users.append(d['user'])
            d['user_id'] = d['user']['id']
            del d['user']
            tweets.append(d)
    # time only the single-document insert
    start = time.time()
    user_collection.insert_one(users.pop())
    tweet_collection.insert_one(tweets.pop())
    insert_one_time = time.time() - start
    doc_size = "{}MB".format(
        round(os.path.getsize(DOCUMENT_SINGLE) / 1024 / 1024, 2))
    # measure the file that was actually bulk-inserted
    db_size = "{}MB".format(round(os.path.getsize(path) / 1024 / 1024, 2))
    logger.info(
        "{} seconds to insert one collections indexed={} db_size={} "
        "doc_size={}".format(insert_one_time, indexed, db_size, doc_size))
    if drop_on_exit:
        drop_database(DATABASE_COLLECTION)
    return insert_one_time, doc_size, db_size, bulk_insert_time
def insert_one(path, indexed, drop_on_start, drop_on_exit=False,
               write_concern=1):
    """Times a single-document insert into the benchmark_db database.

    The file at *path* is bulk-inserted first so that the single insert is
    measured against a populated database.

    :param path: path to the line-delimited JSON document to bulk-insert
    :param indexed: create indexes before the insert, otherwise remove them
    :param drop_on_start: drop the database before the insert
    :param drop_on_exit: drop the database after the insert
    :param write_concern: write concern level for the collection
    :return: insert_one_time - execution time for one insert,
             doc_size - size of the single inserted document,
             db_size - size of the bulk-inserted file,
             bulk_insert_time - execution time for the bulk insert
    """
    if indexed:
        create_indexes()
    else:
        remove_indexes()
    if drop_on_start:
        drop_database(DATABASE)
    db = connect(HOST, PORT).get_database(DATABASE)
    coll = db.get_collection(
        COLLECTION, write_concern=pymongo.WriteConcern(w=write_concern))
    docs = []
    with open(path, 'r') as d1:
        for doc in d1:
            docs.append(json.loads(doc))
    with open(DOCUMENT_SINGLE, 'r') as d2:
        single_doc = json.load(d2)
    start = time.time()
    coll.insert_many(docs)
    bulk_insert_time = time.time() - start
    # time only the single-document insert
    start = time.time()
    coll.insert_one(single_doc)
    insert_one_time = time.time() - start
    doc_size = "{}MB".format(
        round(os.path.getsize(DOCUMENT_SINGLE) / 1024 / 1024, 2))
    # measure the file that was actually bulk-inserted
    db_size = "{}MB".format(round(os.path.getsize(path) / 1024 / 1024, 2))
    logger.info(
        "{} seconds to insert one indexed={} db_size={} doc_size={}".format(
            insert_one_time, indexed, db_size, doc_size))
    if drop_on_exit:
        drop_database(DATABASE)
    return insert_one_time, doc_size, db_size, bulk_insert_time