def set_database_client(self, database_host, database_port=None):
    """
    Set a database client by database_host (and database_port).

    :param database_host: the host address of database client, optionally
        already containing a port as ``host:port``
    :type database_host: :class:`str`
    :param database_port: the port of database client; ignored when
        ``database_host`` already carries a port
    :type database_port: :class:`str` or :class:`int`
    :raises MsPASSError: ("Fatal") if a client cannot be created and
        verified for the resulting address.  The previous client is
        restored before raising.
    """
    database_address = database_host
    # append the port only when the host does not already carry one
    if ":" not in database_host and database_port:
        # str() allows an int port as well as a string one
        database_address += ":" + str(database_port)

    # sanity check: remember the old client so we can roll back on failure
    temp_db_client = self._db_client
    try:
        self._db_client = DBClient(database_address)
        # server_info() forces a round trip to verify connectivity
        self._db_client.server_info()
    except Exception as err:
        # restore the _db_client
        self._db_client = temp_db_client
        raise MsPASSError(
            "Runtime error: cannot create a database client with: "
            + database_address,
            "Fatal",
        ) from err
def setup_class(self):
    """Create a fresh test_manager database and a GlobalHistoryManager on it."""
    self.client = DBClient("localhost")
    # start from a clean slate
    self.client.drop_database("test_manager")
    history_db = Database(self.client, "test_manager")
    history_db["history_global"].drop_indexes()
    # remove any leftover documents from every collection
    for name in history_db.list_collection_names():
        history_db[name].delete_many({})
    self.manager = GlobalHistoryManager(
        history_db, "test_job", collection="history_global"
    )
def setup_class(self):
    """Populate a test_dbclean database with one site, channel, and source
    document, and link a live TimeSeries to them."""
    connection = DBClient("localhost")
    self.db = Database(connection, "test_dbclean")
    self.test_ts = get_live_timeseries()

    site_oid = ObjectId()
    channel_oid = ObjectId()
    source_oid = ObjectId()

    site_doc = {
        "_id": site_oid,
        "net": "net",
        "sta": "sta",
        "loc": "loc",
        "lat": 1.0,
        "lon": 1.0,
        "elev": 2.0,
        "starttime": datetime.utcnow().timestamp(),
        "endtime": datetime.utcnow().timestamp(),
    }
    channel_doc = {
        "_id": channel_oid,
        "net": "net1",
        "sta": "sta1",
        "loc": "loc1",
        "chan": "chan",
        "lat": 1.1,
        "lon": 1.1,
        "elev": 2.1,
        "starttime": datetime.utcnow().timestamp(),
        "endtime": datetime.utcnow().timestamp(),
        "edepth": 3.0,
        "vang": 1.0,
        "hang": 1.0,
    }
    source_doc = {
        "_id": source_oid,
        "lat": 1.2,
        "lon": 1.2,
        "time": datetime.utcnow().timestamp(),
        "depth": 3.1,
        "magnitude": 1.0,
    }
    self.db["site"].insert_one(site_doc)
    self.db["channel"].insert_one(channel_doc)
    self.db["source"].insert_one(source_doc)

    # link the waveform to the normalization documents just inserted
    self.test_ts["site_id"] = site_oid
    self.test_ts["source_id"] = source_oid
    self.test_ts["channel_id"] = channel_oid
class TestDBClient:
    """Unit tests for the DBClient wrapper."""

    def setup_class(self):
        # c1 carries a default database in its URI; c2 does not
        self.c1 = DBClient("mongodb://localhost/my_database")
        self.c2 = DBClient("localhost")

    def test_init(self):
        # the URI path component becomes the private default database name
        assert "my_database" == self.c1._DBClient__default_database_name

    def test_getitem(self):
        # subscripting yields a database handle with the requested name
        for client, dbname in ((self.c1, "my_database"), (self.c2, "my_db")):
            assert client[dbname].name == dbname

    def test_get_default_database(self):
        assert self.c1.get_default_database().name == "my_database"
        # without a default database in the URI the call must fail
        with pytest.raises(
            pymongo.errors.ConfigurationError, match="No default database"
        ):
            self.c2.get_default_database()

    def test_get_database(self):
        assert self.c1.get_database().name == "my_database"
        assert self.c2.get_database("my_db").name == "my_db"
        with pytest.raises(
            pymongo.errors.ConfigurationError, match="No default database"
        ):
            self.c2.get_database()
class TestManager:
    """End-to-end tests for GlobalHistoryManager with spark and dask pipelines.

    NOTE(review): the test methods are order-dependent -- the document
    counts asserted in later methods depend on inserts made by earlier
    ones (e.g. test_get_alg_list expects exactly 14 accumulated records).
    """

    def setup_class(self):
        # Fresh test_manager database plus a manager logging to history_global.
        self.client = DBClient("localhost")
        self.client.drop_database("test_manager")
        db = Database(self.client, "test_manager")
        db["history_global"].drop_indexes()
        # clean up the database locally
        for col_name in db.list_collection_names():
            db[col_name].delete_many({})
        self.manager = GlobalHistoryManager(db, "test_job", collection="history_global")

    def test_init(self):
        # Constructor arguments should be stored verbatim on the manager.
        assert self.manager.job_name == "test_job"
        assert self.manager.collection == "history_global"
        assert self.manager.history_db.name == "test_manager"

    def test_logging(self):
        # logging() should insert exactly one document carrying all fields.
        alg_id = ObjectId()
        manager_db = Database(self.client, "test_manager")
        manager_db["history_global"].delete_many({})
        self.manager.logging(alg_id, "test_alg_name", "test_parameter")
        res = manager_db["history_global"].find_one(
            {"job_name": self.manager.job_name}
        )
        assert res["job_id"] == self.manager.job_id
        assert res["job_name"] == self.manager.job_name
        assert res["alg_name"] == "test_alg_name"
        assert res["alg_id"] == alg_id
        assert res["parameters"] == "test_parameter"
        assert (
            manager_db["history_global"].count_documents(
                {"job_name": self.manager.job_name}
            )
            == 1
        )
        # clean up
        manager_db["history_global"].delete_many({})

    def test_mspass_map(self, spark_context):
        # One history_global record is written per map invocation; identical
        # alg + parameters combinations must reuse the same alg_id.
        l = [get_live_timeseries() for i in range(5)]
        # add net, sta, chan, loc to avoid metadata serialization problem
        for i in range(5):
            l[i]["chan"] = "HHZ"
            l[i]["loc"] = "test_loc"
            l[i]["net"] = "test_net"
            l[i]["sta"] = "test_sta"
            l[i].set_as_origin("test", "0", str(i), AtomicType.TIMESERIES)
        # test mspass_map for spark
        spark_res = spark_map(l, self.manager, spark_context)
        manager_db = Database(self.client, "test_manager")
        assert (
            manager_db["history_global"].count_documents(
                {"job_name": self.manager.job_name}
            )
            == 1
        )
        res = manager_db["history_global"].find_one(
            {"job_name": self.manager.job_name}
        )
        assert res["job_id"] == self.manager.job_id
        assert res["job_name"] == self.manager.job_name
        assert res["alg_name"] == "filter"
        assert (
            res["parameters"]
            == '{"arg_0": "bandpass", "freqmin": "1", "freqmax": "5", "object_history": "True"}'
        )
        spark_alg_id = res["alg_id"]
        # test mspass_map for dask
        dask_res = dask_map(l, self.manager)
        assert (
            manager_db["history_global"].count_documents(
                {"job_name": self.manager.job_name}
            )
            == 2
        )
        assert (
            manager_db["history_global"].count_documents({"alg_id": spark_alg_id})
            == 2
        )
        docs = manager_db["history_global"].find({"alg_id": spark_alg_id})
        assert docs[0]["job_id"] == docs[1]["job_id"] == self.manager.job_id
        assert docs[0]["job_name"] == docs[1]["job_name"] == self.manager.job_name
        assert docs[0]["alg_name"] == docs[1]["alg_name"] == "filter"
        assert (
            docs[0]["parameters"]
            == docs[1]["parameters"]
            == '{"arg_0": "bandpass", "freqmin": "1", "freqmax": "5", "object_history": "True"}'
        )
        assert not docs[0]["time"] == docs[1]["time"]
        # same alg + parameters combination -> same alg_id
        dask_res = dask_map(l, self.manager)
        assert (
            manager_db["history_global"].count_documents(
                {"job_name": self.manager.job_name}
            )
            == 3
        )
        assert (
            manager_db["history_global"].count_documents({"alg_id": spark_alg_id})
            == 3
        )
        # SPARK test user provided alg_name and parameter(exist)
        spark_alg_name = "filter"
        spark_alg_parameters = "bandpass,freqmin=1,freqmax=5,object_history=True"
        spark_res = spark_map(
            l,
            self.manager,
            spark_context,
            alg_name=spark_alg_name,
            parameters=spark_alg_parameters,
        )
        assert (
            manager_db["history_global"].count_documents(
                {"job_name": self.manager.job_name}
            )
            == 4
        )
        assert (
            manager_db["history_global"].count_documents({"alg_id": spark_alg_id})
            == 4
        )
        # SPARK test user provided alg_name and parameter(new)
        spark_alg_name = "new_filter"
        spark_alg_parameters = "bandpass,freqmin=1,freqmax=5,object_history=True"
        spark_res = spark_map(
            l,
            self.manager,
            spark_context,
            alg_name=spark_alg_name,
            parameters=spark_alg_parameters,
        )
        assert (
            manager_db["history_global"].count_documents(
                {"job_name": self.manager.job_name}
            )
            == 5
        )
        assert (
            manager_db["history_global"].count_documents({"alg_name": "new_filter"})
            == 1
        )
        res = manager_db["history_global"].find_one({"alg_name": "new_filter"})
        assert res["job_id"] == self.manager.job_id
        assert res["job_name"] == self.manager.job_name
        assert res["alg_name"] == "new_filter"
        assert (
            res["parameters"]
            == '{"arg_0": "bandpass", "freqmin": "1", "freqmax": "5", "object_history": "True"}'
        )
        new_spark_alg_id = res["alg_id"]
        assert (
            manager_db["history_global"].count_documents(
                {"alg_id": new_spark_alg_id}
            )
            == 1
        )
        # DASK test user provided alg_name and parameter(exist)
        dask_alg_name = "filter"
        dask_alg_parameters = "bandpass,freqmin=1,freqmax=5,object_history=True"
        dask_res = dask_map(
            l, self.manager, alg_name=dask_alg_name, parameters=dask_alg_parameters
        )
        assert (
            manager_db["history_global"].count_documents(
                {"job_name": self.manager.job_name}
            )
            == 6
        )
        assert (
            manager_db["history_global"].count_documents({"alg_id": spark_alg_id})
            == 5
        )
        # DASK test user provided alg_name and parameter(new)
        dask_alg_name = "new_filter_2"
        dask_alg_parameters = "bandpass,freqmin=1,freqmax=5,object_history=True"
        dask_res = dask_map(
            l, self.manager, alg_name=dask_alg_name, parameters=dask_alg_parameters
        )
        assert (
            manager_db["history_global"].count_documents(
                {"job_name": self.manager.job_name}
            )
            == 7
        )
        assert (
            manager_db["history_global"].count_documents(
                {"alg_name": "new_filter_2"}
            )
            == 1
        )
        res = manager_db["history_global"].find_one({"alg_name": "new_filter_2"})
        assert res["job_id"] == self.manager.job_id
        assert res["job_name"] == self.manager.job_name
        assert res["alg_name"] == "new_filter_2"
        assert (
            res["parameters"]
            == '{"arg_0": "bandpass", "freqmin": "1", "freqmax": "5", "object_history": "True"}'
        )
        new_dask_alg_id = res["alg_id"]
        assert (
            manager_db["history_global"].count_documents(
                {"alg_id": new_dask_alg_id}
            )
            == 1
        )
        manager_db["history_object"].delete_many({})
        # test spark mspass_map for save_data
        data = spark_context.parallelize(l)
        data_map = data.mspass_map(manager_db.save_data, global_history=self.manager)
        save_list = data_map.collect()
        assert (
            manager_db["history_global"].count_documents(
                {"job_name": self.manager.job_name}
            )
            == 8
        )
        assert (
            manager_db["history_global"].count_documents({"alg_name": "save_data"})
            == 1
        )
        # check object history after save_data
        # NOTE(review): the next two lines are bare comparisons, not asserts,
        # so they verify nothing -- presumably "assert" was intended.
        manager_db["history_object"].count_documents({}) == 5
        manager_db["wf_TimeSeries"].count_documents({}) == 5
        history_object_docs = manager_db["history_object"].find({})
        idx = 0
        doc_alg_id = None
        doc_ids = []
        # every object-history record must share one alg_id and be a save_data
        for doc in history_object_docs:
            if not doc_alg_id:
                doc_alg_id = doc["alg_id"]
            else:
                assert doc_alg_id == doc["alg_id"]
            doc_ids.append(doc["_id"])
            assert doc["alg_name"] == "save_data"
            idx += 1
        assert sorted(doc_ids) == ["0", "1", "2", "3", "4"]
        # test spark mspass_map for read_data
        save_l = [res[1] for res in save_list]
        data = spark_context.parallelize(save_l)
        data_map = data.mspass_map(manager_db.read_data, global_history=self.manager)
        read_list = data_map.collect()
        assert (
            manager_db["history_global"].count_documents(
                {"job_name": self.manager.job_name}
            )
            == 9
        )
        assert (
            manager_db["history_global"].count_documents({"alg_name": "read_data"})
            == 1
        )
        manager_db["history_object"].delete_many({})
        manager_db["wf_TimeSeries"].delete_many({})
        # test dask mspass_map for save_data
        data = daskbag.from_sequence(l)
        data_map = data.mspass_map(manager_db.save_data, global_history=self.manager)
        save_list = data_map.compute()
        assert (
            manager_db["history_global"].count_documents(
                {"job_name": self.manager.job_name}
            )
            == 10
        )
        assert (
            manager_db["history_global"].count_documents({"alg_name": "save_data"})
            == 2
        )
        res = manager_db["history_global"].find({"alg_name": "save_data"})
        assert res[0]["job_id"] == res[1]["job_id"] == self.manager.job_id
        assert res[0]["job_name"] == res[1]["job_name"] == self.manager.job_name
        assert res[0]["alg_name"] == res[1]["alg_name"] == "save_data"
        assert (
            res[0]["parameters"]
            == res[1]["parameters"]
            == '{"object_history": "False"}'
        )
        assert res[0]["alg_id"] == res[1]["alg_id"]
        # check object history after save_data
        # NOTE(review): bare comparisons again -- not asserted.
        manager_db["history_object"].count_documents({}) == 5
        manager_db["wf_TimeSeries"].count_documents({}) == 5
        history_object_docs = manager_db["history_object"].find({})
        idx = 0
        doc_alg_id = None
        doc_ids = []
        for doc in history_object_docs:
            if not doc_alg_id:
                doc_alg_id = doc["alg_id"]
            else:
                assert doc_alg_id == doc["alg_id"]
            doc_ids.append(doc["_id"])
            assert doc["alg_name"] == "save_data"
            idx += 1
        assert sorted(doc_ids) == ["0", "1", "2", "3", "4"]
        # test dask mspass_map for read_data
        save_l = [res[1] for res in save_list]
        data = daskbag.from_sequence(save_l)
        data_map = data.mspass_map(manager_db.read_data, global_history=self.manager)
        read_list = data_map.compute()
        assert (
            manager_db["history_global"].count_documents(
                {"job_name": self.manager.job_name}
            )
            == 11
        )
        assert (
            manager_db["history_global"].count_documents({"alg_name": "read_data"})
            == 2
        )
        res = manager_db["history_global"].find({"alg_name": "read_data"})
        assert res[0]["job_id"] == res[1]["job_id"] == self.manager.job_id
        assert res[0]["job_name"] == res[1]["job_name"] == self.manager.job_name
        assert res[0]["alg_name"] == res[1]["alg_name"] == "read_data"
        assert (
            res[0]["parameters"]
            == res[1]["parameters"]
            == '{"object_history": "False"}'
        )
        assert res[0]["alg_id"] == res[1]["alg_id"]

    def test_mspass_reduce(self, spark_context):
        # Reduce operations also log one history_global record per call.
        manager_db = Database(self.client, "test_manager")
        manager_db["history_global"].delete_many({})
        l = [get_live_timeseries() for i in range(5)]
        # test mspass_reduce for spark
        spark_res = spark_reduce(l, self.manager, spark_context)
        assert (
            manager_db["history_global"].count_documents(
                {"job_name": self.manager.job_name}
            )
            == 1
        )
        assert manager_db["history_global"].count_documents(
            {"alg_name": "stack"}) == 1
        res = manager_db["history_global"].find_one({"alg_name": "stack"})
        assert res["job_id"] == self.manager.job_id
        assert res["job_name"] == self.manager.job_name
        assert res["alg_name"] == "stack"
        assert res["parameters"] == '{"object_history": "True", "alg_id": "2"}'
        spark_alg_id = res["alg_id"]
        # test mspass_reduce for dask
        dask_res = dask_reduce(l, self.manager)
        assert (
            manager_db["history_global"].count_documents(
                {"job_name": self.manager.job_name}
            )
            == 2
        )
        assert manager_db["history_global"].count_documents(
            {"alg_name": "stack"}) == 2
        assert (
            manager_db["history_global"].count_documents({"alg_id": spark_alg_id})
            == 1
        )
        docs = manager_db["history_global"].find({"alg_name": "stack"})
        # pick out the record that the dask run just wrote
        for doc in docs:
            if doc["alg_id"] == spark_alg_id:
                continue
            res = doc
        assert res["job_id"] == self.manager.job_id
        assert res["job_name"] == self.manager.job_name
        assert res["alg_name"] == "stack"
        assert res["parameters"] == '{"object_history": "True", "alg_id": "3"}'
        # different alg -> different alg_id
        assert not res["alg_id"] == spark_alg_id
        dask_alg_id = res["alg_id"]
        # same alg + parameters combination -> same alg_id
        dask_res = dask_reduce(l, self.manager)
        assert (
            manager_db["history_global"].count_documents(
                {"job_name": self.manager.job_name}
            )
            == 3
        )
        assert manager_db["history_global"].count_documents(
            {"alg_name": "stack"}) == 3
        assert (
            manager_db["history_global"].count_documents({"alg_id": dask_alg_id})
            == 2
        )
        docs = manager_db["history_global"].find({"alg_id": dask_alg_id})
        doc1 = docs[0]
        doc2 = docs[1]
        assert not doc1["time"] == doc2["time"]
        assert doc1["job_id"] == doc2["job_id"]
        assert doc1["job_name"] == doc2["job_name"]
        assert doc1["alg_name"] == doc2["alg_name"]
        assert doc1["parameters"] == doc2["parameters"]
        # SPARK test user provided alg_name and parameter(exist)
        spark_alg_name = "stack"
        spark_alg_parameters = "object_history=True,alg_id=2"
        spark_res = spark_reduce(
            l,
            self.manager,
            spark_context,
            alg_name=spark_alg_name,
            parameters=spark_alg_parameters,
        )
        assert (
            manager_db["history_global"].count_documents(
                {"job_name": self.manager.job_name}
            )
            == 4
        )
        assert manager_db["history_global"].count_documents(
            {"alg_name": "stack"}) == 4
        assert (
            manager_db["history_global"].count_documents(
                {
                    "alg_name": "stack",
                    "parameters": '{"object_history": "True", "alg_id": "3"}',
                }
            )
            == 2
        )
        # SPARK test user provided alg_name and parameter(new)
        spark_alg_name = "new_stack"
        spark_alg_parameters = "object_history=True,alg_id=2"
        spark_res = spark_reduce(
            l,
            self.manager,
            spark_context,
            alg_name=spark_alg_name,
            parameters=spark_alg_parameters,
        )
        assert (
            manager_db["history_global"].count_documents(
                {"job_name": self.manager.job_name}
            )
            == 5
        )
        assert (
            manager_db["history_global"].count_documents({"alg_name": "new_stack"})
            == 1
        )
        res = manager_db["history_global"].find_one({"alg_name": "new_stack"})
        assert res["job_id"] == self.manager.job_id
        assert res["job_name"] == self.manager.job_name
        assert res["alg_name"] == "new_stack"
        assert res["parameters"] == '{"object_history": "True", "alg_id": "2"}'
        # DASK test user provided alg_name and parameter(exist)
        # NOTE(review): this calls dask_map, not dask_reduce -- looks like a
        # copy/paste slip worth confirming against the intent of this test.
        dask_alg_name = "stack"
        dask_alg_parameters = "object_history=True,alg_id=3"
        dask_res = dask_map(
            l, self.manager, alg_name=dask_alg_name, parameters=dask_alg_parameters
        )
        assert (
            manager_db["history_global"].count_documents(
                {"job_name": self.manager.job_name}
            )
            == 6
        )
        assert manager_db["history_global"].count_documents(
            {"alg_name": "stack"}) == 5
        assert (
            manager_db["history_global"].count_documents(
                {
                    "alg_name": "stack",
                    "parameters": '{"object_history": "True", "alg_id": "3"}',
                }
            )
            == 3
        )
        # DASK test user provided alg_name and parameter(new)
        dask_alg_name = "new_stack"
        dask_alg_parameters = "object_history=True,alg_id=3"
        dask_res = dask_map(
            l, self.manager, alg_name=dask_alg_name, parameters=dask_alg_parameters
        )
        assert (
            manager_db["history_global"].count_documents(
                {"job_name": self.manager.job_name}
            )
            == 7
        )
        assert (
            manager_db["history_global"].count_documents(
                {
                    "alg_name": "new_stack",
                    "parameters": '{"object_history": "True", "alg_id": "3"}',
                }
            )
            == 1
        )
        res = manager_db["history_global"].find_one(
            {
                "alg_name": "new_stack",
                "parameters": '{"object_history": "True", "alg_id": "3"}',
            }
        )
        assert res["job_id"] == self.manager.job_id
        assert res["job_name"] == self.manager.job_name
        assert res["alg_name"] == "new_stack"
        assert res["parameters"] == '{"object_history": "True", "alg_id": "3"}'

    def test_mspass_map_with_filePath(self, spark_context):
        # Same map-history checks as test_mspass_map, but the algorithm
        # (RFdecon) is configured from a parameter file path.
        # test mapass_map for spark (file input)
        # data input of RFdecon, needed for parallelization
        d = [get_live_seismogram(71, 2.0) for i in range(5)]
        for i in range(5):
            d[i].t0 = -5
        # parameters string
        pfPath = "python/mspasspy/data/pf/RFdeconProcessor.pf"
        pf = AntelopePf(pfPath)
        pf_dict = AntelopePf2dict(pf)
        parameter_dict = collections.OrderedDict()
        parameter_dict["alg"] = "LeastSquares"
        parameter_dict["pf"] = pf_dict
        parameter_dict["object_history"] = "True"
        gTree = ParameterGTree(parameter_dict)
        # expected serialized parameter record for the history document
        json_params = json.dumps(gTree.asdict())
        data = spark_context.parallelize(d)
        spark_res = data.mspass_map(
            RFdecon,
            alg="LeastSquares",
            pf=pfPath,
            object_history=True,
            global_history=self.manager,
            alg_name=None,
            parameters=None,
        ).collect()
        manager_db = Database(self.client, "test_manager")
        # counts continue from test_mspass_reduce (order-dependent suite)
        assert (
            manager_db["history_global"].count_documents(
                {"job_name": self.manager.job_name}
            )
            == 8
        )
        res = manager_db["history_global"].find_one({"alg_name": "RFdecon"})
        assert res["job_id"] == self.manager.job_id
        assert res["job_name"] == self.manager.job_name
        assert res["alg_name"] == "RFdecon"
        assert res["parameters"] == json_params
        spark_alg_id = res["alg_id"]
        # test mspass_map for dask
        ddb = daskbag.from_sequence(d)
        dask_res = ddb.mspass_map(
            RFdecon,
            alg="LeastSquares",
            pf=pfPath,
            object_history=True,
            global_history=self.manager,
            alg_name=None,
            parameters=None,
        ).compute()
        assert (
            manager_db["history_global"].count_documents(
                {"job_name": self.manager.job_name}
            )
            == 9
        )
        assert (
            manager_db["history_global"].count_documents({"alg_id": spark_alg_id})
            == 2
        )
        docs = manager_db["history_global"].find({"alg_id": spark_alg_id})
        assert docs[0]["job_id"] == docs[1]["job_id"] == self.manager.job_id
        assert docs[0]["job_name"] == docs[1]["job_name"] == self.manager.job_name
        assert docs[0]["alg_name"] == docs[1]["alg_name"] == "RFdecon"
        assert docs[0]["parameters"] == docs[1]["parameters"] == json_params
        assert not docs[0]["time"] == docs[1]["time"]
        # same alg + parameters combination -> same alg_id
        ddb = daskbag.from_sequence(d)
        dask_res = ddb.mspass_map(
            RFdecon,
            alg="LeastSquares",
            pf=pfPath,
            object_history=True,
            global_history=self.manager,
            alg_name=None,
            parameters=None,
        ).compute()
        assert (
            manager_db["history_global"].count_documents(
                {"job_name": self.manager.job_name}
            )
            == 10
        )
        assert (
            manager_db["history_global"].count_documents({"alg_id": spark_alg_id})
            == 3
        )
        # SPARK test user provided alg_name and parameter(exist)
        spark_alg_name = "RFdecon"
        spark_alg_parameters = (
            "alg=LeastSquares, pf={pfPath}, object_history=True".format(pfPath=pfPath)
        )
        data = spark_context.parallelize(d)
        spark_res = data.mspass_map(
            RFdecon,
            alg="LeastSquares",
            pf=pfPath,
            object_history=True,
            global_history=self.manager,
            alg_name=spark_alg_name,
            parameters=spark_alg_parameters,
        ).collect()
        assert (
            manager_db["history_global"].count_documents(
                {"job_name": self.manager.job_name}
            )
            == 11
        )
        assert (
            manager_db["history_global"].count_documents({"alg_id": spark_alg_id})
            == 4
        )
        # SPARK test user provided alg_name and parameter(new)
        spark_alg_name = "RFdecon_2"
        spark_alg_parameters = (
            "alg=LeastSquares, pf={pfPath}, object_history=True".format(pfPath=pfPath)
        )
        data = spark_context.parallelize(d)
        spark_res = data.mspass_map(
            RFdecon,
            alg="LeastSquares",
            pf=pfPath,
            object_history=True,
            global_history=self.manager,
            alg_name=spark_alg_name,
            parameters=spark_alg_parameters,
        ).collect()
        assert (
            manager_db["history_global"].count_documents(
                {"job_name": self.manager.job_name}
            )
            == 12
        )
        assert (
            manager_db["history_global"].count_documents({"alg_name": "RFdecon_2"})
            == 1
        )
        res = manager_db["history_global"].find_one({"alg_name": "RFdecon_2"})
        assert res["job_id"] == self.manager.job_id
        assert res["job_name"] == self.manager.job_name
        assert res["alg_name"] == "RFdecon_2"
        assert res["parameters"] == json_params
        new_spark_alg_id = res["alg_id"]
        assert (
            manager_db["history_global"].count_documents(
                {"alg_id": new_spark_alg_id}
            )
            == 1
        )
        # DASK test user provided alg_name and parameter(exist)
        dask_alg_name = "RFdecon"
        dask_alg_parameters = (
            "alg=LeastSquares, pf={pfPath}, object_history=True".format(pfPath=pfPath)
        )
        ddb = daskbag.from_sequence(d)
        dask_res = ddb.mspass_map(
            RFdecon,
            alg="LeastSquares",
            pf=pfPath,
            object_history=True,
            global_history=self.manager,
            alg_name=dask_alg_name,
            parameters=dask_alg_parameters,
        ).compute()
        assert (
            manager_db["history_global"].count_documents(
                {"job_name": self.manager.job_name}
            )
            == 13
        )
        assert (
            manager_db["history_global"].count_documents({"alg_id": spark_alg_id})
            == 5
        )
        # DASK test user provided alg_name and parameter(new)
        dask_alg_name = "RFdecon_3"
        dask_alg_parameters = (
            "alg=LeastSquares, pf={pfPath}, object_history=True".format(pfPath=pfPath)
        )
        ddb = daskbag.from_sequence(d)
        dask_res = ddb.mspass_map(
            RFdecon,
            alg="LeastSquares",
            pf=pfPath,
            object_history=True,
            global_history=self.manager,
            alg_name=dask_alg_name,
            parameters=dask_alg_parameters,
        ).compute()
        assert (
            manager_db["history_global"].count_documents(
                {"job_name": self.manager.job_name}
            )
            == 14
        )
        assert (
            manager_db["history_global"].count_documents({"alg_name": "RFdecon_3"})
            == 1
        )
        res = manager_db["history_global"].find_one({"alg_name": "RFdecon_3"})
        assert res["job_id"] == self.manager.job_id
        assert res["job_name"] == self.manager.job_name
        assert res["alg_name"] == "RFdecon_3"
        assert res["parameters"] == json_params
        new_dask_alg_id = res["alg_id"]
        assert (
            manager_db["history_global"].count_documents(
                {"alg_id": new_dask_alg_id}
            )
            == 1
        )

    def test_get_alg_id(self):
        # Unknown alg/parameter combos return a falsy value; known combos
        # return the alg_id stored in history_global.
        manager_db = Database(self.client, "test_manager")
        assert not self.manager.get_alg_id("aaa", "bbb")
        res = manager_db["history_global"].find_one(
            {
                "alg_name": "new_stack",
                "parameters": '{"object_history": "True", "alg_id": "3"}',
            }
        )
        assert (
            self.manager.get_alg_id(
                "new_stack", '{"object_history": "True", "alg_id": "3"}'
            )
            == res["alg_id"]
        )

    def test_get_alg_list(self):
        # 14 records were accumulated for this job by the earlier tests.
        assert (
            len(
                self.manager.get_alg_list(
                    self.manager.job_name, job_id=self.manager.job_id
                )
            )
            == 14
        )

    def test_set_alg_name_and_parameters(self):
        # Renaming alg/parameters must update every record sharing the alg_id.
        manager_db = Database(self.client, "test_manager")
        assert (
            manager_db["history_global"].count_documents(
                {
                    "alg_name": "stack",
                    "parameters": '{"object_history": "True", "alg_id": "3"}',
                }
            )
            == 3
        )
        res = manager_db["history_global"].find_one(
            {
                "alg_name": "stack",
                "parameters": '{"object_history": "True", "alg_id": "3"}',
            }
        )
        alg_id = res["alg_id"]
        self.manager.set_alg_name_and_parameters(
            alg_id, "test_alg_name", "test_parameters"
        )
        assert (
            manager_db["history_global"].count_documents(
                {
                    "alg_name": "stack",
                    "parameters": '{"object_history": "True", "alg_id": "3"}',
                }
            )
            == 0
        )
        assert (
            manager_db["history_global"].count_documents(
                {"alg_name": "test_alg_name", "parameters": "test_parameters"}
            )
            == 3
        )
        res = manager_db["history_global"].find_one(
            {"alg_name": "test_alg_name", "parameters": "test_parameters"}
        )
        assert res["alg_id"] == alg_id

    def test_object_history(self, spark_context):
        # Verify the per-object history records written by save_data agree
        # with the global history alg_id recorded by the map operation.
        manager_db = Database(self.client, "test_manager")
        manager_db["history_global"].delete_many({})
        manager_db["history_object"].delete_many({})
        l = [get_live_timeseries() for i in range(2)]
        # add net, sta, chan, loc to avoid metadata serialization problem
        for i in range(2):
            l[i]["chan"] = "HHZ"
            l[i]["loc"] = "test_loc"
            l[i]["net"] = "test_net"
            l[i]["sta"] = "test_sta"
        spark_res = spark_map(l, self.manager, spark_context)
        assert manager_db["history_global"].count_documents(
            {"alg_name": "filter"}) == 1
        res = manager_db["history_global"].find_one({"alg_name": "filter"})
        alg_id = res["alg_id"]
        # check status of the mspass objects
        for ts in spark_res:
            assert ts.number_of_stages() == 1
            assert ts.current_nodedata().algorithm == "filter"
            assert ts.current_nodedata().algid == str(alg_id)
            assert ts.is_volatile()
        save_res = manager_db.save_data(
            spark_res[0], alg_name="filter", alg_id=str(alg_id)
        )
        # hardcode net, sta, net, loc to avoid serialization problem here,
        # they are readonly metadata keys -> non fatal keys = 4
        assert save_res.live
        assert manager_db["history_object"].count_documents(
            {"alg_name": "filter"}) == 1
        doc = manager_db["history_object"].find_one({"alg_name": "filter"})
        assert doc
        assert doc["_id"] == spark_res[0].current_nodedata().uuid
        assert doc["wf_TimeSeries_id"] == spark_res[0]["_id"]
        assert doc["alg_id"] == str(alg_id)
        assert doc["alg_name"] == "filter"
def main(args=None):
    """
    Command line entry point for dbclean.

    Parses the command line, verifies that at least one clean operation
    (type fix, key deletion, or key rename) was requested, then runs the
    requested operations against the named MongoDB collection, printing a
    summary of the edits made by each stage.

    :param args: argument list to parse; defaults to ``sys.argv[1:]``
    """
    if args is None:
        args = sys.argv[1:]
    parser = argparse.ArgumentParser(
        prog="dbclean",
        usage="%(prog)s dbname collection [-ft] [-d k1 ...] [-r kold:knew ... ] [-v] [-h]",
        description="MsPASS program to fix most errors detected by dbverify",
    )
    parser.add_argument(
        "dbname", metavar="dbname", type=str, help="MongoDB database name to be fixed"
    )
    parser.add_argument(
        "collection",
        metavar="collection",
        type=str,
        help="MongoDB collection name to be fixed",
    )
    parser.add_argument(
        "-ft",
        "--fixtypes",
        action="store_true",
        help="Enable automatic type mismatch repair",
    )
    parser.add_argument(
        "-d",
        "--delete",
        nargs="*",
        default=[],
        help="List of keys of key-value pairs to be deleted from all documents",
    )
    parser.add_argument(
        "-r",
        "--rename",
        nargs="*",
        default=[],
        help="Change the keys of documents using pattern defined in args of form oldkey:newkey",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="When used be echo each fix - default works silently",
    )
    args = parser.parse_args(args)
    dbname = args.dbname
    collection = args.collection
    fixtypes = args.fixtypes
    delete = args.delete
    rename = args.rename
    verbose = args.verbose
    # argparse guarantees a list for nargs="*" with default=[], so a plain
    # truth test is a robust emptiness check
    enable_deletion = len(delete) > 0
    enable_rename = len(rename) > 0
    if not (fixtypes or enable_deletion or enable_rename):
        print("Usage error: you must define at least one clean operation")
        print("Type: dbclean --help to get usage help")
        # sys.exit instead of the interactive-only exit() builtin
        sys.exit(-1)
    if enable_rename:
        rename_map = rename_list_to_dict(rename)
    dbclient = DBClient()
    db = Database(dbclient, dbname)
    print(
        "Starting processing of ",
        collection,
        " collection of database named=",
        dbname,
    )
    # Intentionally do the delete and rename operations before
    # a type check to allow cleaning any keys. The set of dicts below
    # accumulate counts of edits for each key
    if enable_deletion:
        delcounts = db._delete_attributes(collection, delete, verbose=verbose)
        print("delete processing completed on collection=", collection)
        print("Number of documents changed for each key requested follow:")
        print(json_util.dumps(delcounts, indent=4))
    if enable_rename:
        repcounts = db._rename_attributes(collection, rename_map, verbose=verbose)
        print("rename processing completed on collection=", collection)
        print("Here is the set of changes requested:")
        print(json_util.dumps(rename_map))
        print("Number of documents changed for each key requested follow:")
        print(json_util.dumps(repcounts, indent=4))
    if fixtypes:
        fixcounts = db._fix_attribute_types(collection, verbose=verbose)
        print("fixtype processing completed on collection=", collection)
        print("Keys of documents changed and number changed follow:")
        print(json_util.dumps(fixcounts, indent=4))
def setup_class(self):
    """Create one client whose URI names a default database and one without."""
    uri_with_default = "mongodb://localhost/my_database"
    self.c1 = DBClient(uri_with_default)
    self.c2 = DBClient("localhost")
def __init__(
    self,
    database_host=None,
    scheduler=None,
    scheduler_host=None,
    job_name="mspass",
    database_name="mspass",
    schema=None,
    collection=None,
):
    """
    Build the MsPASS client: a database client, a global history manager,
    and a scheduler (dask or spark).

    Address resolution priority for every component is:
    explicit parameter -> environment variable -> default.

    :param database_host: MongoDB host, optionally ``host:port``
    :param scheduler: ``"dask"`` or ``"spark"``
    :param scheduler_host: scheduler address, optionally with a port
    :param job_name: label stored with global history records
    :param database_name: default database name
    :param schema: optional schema passed to the Database constructor
    :param collection: global history collection name
    :raises MsPASSError: ("Fatal") for invalid argument types or when any
        component cannot be created/contacted
    """
    # argument type validation
    if database_host is not None and not type(database_host) is str:
        raise MsPASSError(
            "database_host should be a string but "
            + str(type(database_host))
            + " is found.",
            "Fatal",
        )
    if scheduler is not None and scheduler != "dask" and scheduler != "spark":
        raise MsPASSError(
            "scheduler should be either dask or spark but "
            + str(scheduler)
            + " is found.",
            "Fatal",
        )
    if scheduler_host is not None and not type(scheduler_host) is str:
        raise MsPASSError(
            "scheduler_host should be a string but "
            + str(type(scheduler_host))
            + " is found.",
            "Fatal",
        )
    if job_name is not None and not type(job_name) is str:
        raise MsPASSError(
            "job_name should be a string but " + str(type(job_name)) + " is found.",
            "Fatal",
        )
    if database_name is not None and not type(database_name) is str:
        raise MsPASSError(
            "database_name should be a string but "
            + str(type(database_name))
            + " is found.",
            "Fatal",
        )
    # collection should be a string
    if collection is not None and type(collection) is not str:
        raise MsPASSError(
            "collection should be a string but "
            + str(type(collection))
            + " is found.",
            "Fatal",
        )

    # check env variables
    MSPASS_DB_ADDRESS = os.environ.get("MSPASS_DB_ADDRESS")
    MONGODB_PORT = os.environ.get("MONGODB_PORT")
    MSPASS_SCHEDULER = os.environ.get("MSPASS_SCHEDULER")
    MSPASS_SCHEDULER_ADDRESS = os.environ.get("MSPASS_SCHEDULER_ADDRESS")
    DASK_SCHEDULER_PORT = os.environ.get("DASK_SCHEDULER_PORT")
    SPARK_MASTER_PORT = os.environ.get("SPARK_MASTER_PORT")

    # create a database client
    # priority: parameter -> env -> default
    database_host_has_port = False
    if database_host:
        database_address = database_host
        # check if database_host contains port number already
        if ":" in database_address:
            database_host_has_port = True
    elif MSPASS_DB_ADDRESS:
        database_address = MSPASS_DB_ADDRESS
        # FIX: an env-supplied address may also carry a port already;
        # previously MONGODB_PORT was appended regardless, giving host:p1:p2
        if ":" in database_address:
            database_host_has_port = True
    else:
        database_address = "localhost"
    # add port
    if not database_host_has_port and MONGODB_PORT:
        database_address += ":" + MONGODB_PORT
    try:
        self._db_client = DBClient(database_address)
        # server_info() forces a round trip to verify connectivity
        self._db_client.server_info()
    except Exception as err:
        raise MsPASSError(
            "Runtime error: cannot create a database client with: "
            + database_address,
            "Fatal",
        ) from err

    # set default database name
    self._default_database_name = database_name
    self._default_schema = schema
    self._default_collection = collection

    # create a Global History Manager
    if schema:
        global_history_manager_db = Database(
            self._db_client, database_name, db_schema=schema
        )
    else:
        global_history_manager_db = Database(self._db_client, database_name)
    self._global_history_manager = GlobalHistoryManager(
        global_history_manager_db, job_name, collection=collection
    )

    # set scheduler
    if scheduler:
        self._scheduler = scheduler
    elif MSPASS_SCHEDULER:
        self._scheduler = MSPASS_SCHEDULER
    else:
        self._scheduler = "dask"

    # scheduler configuration
    if self._scheduler == "spark":
        scheduler_host_has_port = False
        if scheduler_host:
            self._spark_master_url = scheduler_host
            # add spark:// prefix if not exist
            if "spark://" not in scheduler_host:
                self._spark_master_url = "spark://" + self._spark_master_url
            # with the spark:// prefix a host:port url has two colons
            if self._spark_master_url.count(":") == 2:
                scheduler_host_has_port = True
        elif MSPASS_SCHEDULER_ADDRESS:
            self._spark_master_url = MSPASS_SCHEDULER_ADDRESS
            # add spark:// prefix if not exist
            if "spark://" not in MSPASS_SCHEDULER_ADDRESS:
                self._spark_master_url = "spark://" + self._spark_master_url
            # FIX: the env address may already include a port; previously
            # SPARK_MASTER_PORT was appended anyway
            if self._spark_master_url.count(":") == 2:
                scheduler_host_has_port = True
        else:
            self._spark_master_url = "local"
        # append a port only when:
        # 1. not the default 'local'
        # 2. the address does not contain a port number already
        # 3. SPARK_MASTER_PORT exists
        if (
            (scheduler_host or MSPASS_SCHEDULER_ADDRESS)
            and not scheduler_host_has_port
            and SPARK_MASTER_PORT
        ):
            self._spark_master_url += ":" + SPARK_MASTER_PORT
        # sanity check
        try:
            spark = (
                SparkSession.builder.appName("mspass")
                .master(self._spark_master_url)
                .getOrCreate()
            )
            self._spark_context = spark.sparkContext
        except Exception as err:
            raise MsPASSError(
                "Runtime error: cannot create a spark configuration with: "
                + self._spark_master_url,
                "Fatal",
            ) from err
    elif self._scheduler == "dask":
        # with no scheduler_host and no MSPASS_SCHEDULER_ADDRESS, use a
        # local cluster to create a client
        if not scheduler_host and not MSPASS_SCHEDULER_ADDRESS:
            self._dask_client = DaskClient()
        else:
            scheduler_host_has_port = False
            # set host
            if scheduler_host:
                self._dask_client_address = scheduler_host
                # check if scheduler_host contains port number already
                if ":" in scheduler_host:
                    scheduler_host_has_port = True
            else:
                self._dask_client_address = MSPASS_SCHEDULER_ADDRESS
                # FIX: the env address may already include a port
                if ":" in MSPASS_SCHEDULER_ADDRESS:
                    scheduler_host_has_port = True
            # FIX: append a port only when none is present.  The original
            # code appended ":8786" in the else of "no port AND env port
            # set", so an address that already had a port still received
            # ":8786", producing host:port:8786.
            if not scheduler_host_has_port:
                if DASK_SCHEDULER_PORT:
                    self._dask_client_address += ":" + DASK_SCHEDULER_PORT
                else:
                    # dask scheduler's default port
                    self._dask_client_address += ":8786"
            # sanity check
            try:
                self._dask_client = DaskClient(self._dask_client_address)
            except Exception as err:
                raise MsPASSError(
                    "Runtime error: cannot create a dask client with: "
                    + self._dask_client_address,
                    "Fatal",
                ) from err
class Client:
    """
    A client-side representation of MsPASS.

    This is the only client users should use in MsPASS. The client manages
    all the other clients or instances:

    1. It creates and manages a Database client.
    2. It creates and manages a Global History Manager.
    3. It creates and manages a scheduler (spark/dask).

    For the address and port of each client/instance, we first check the
    user-specified parameters; if absent we search the environment variable
    values; if those are absent as well we fall back to default settings.
    """

    def __init__(
        self,
        database_host=None,
        scheduler=None,
        scheduler_host=None,
        job_name="mspass",
        database_name="mspass",
        schema=None,
        collection=None,
    ):
        """
        Construct the MsPASS client.

        :param database_host: the host address of the database client
        :type database_host: :class:`str`
        :param scheduler: the scheduler type, should be either dask or spark
        :type scheduler: :class:`str`
        :param scheduler_host: the host address of the scheduler
        :type scheduler_host: :class:`str`
        :param job_name: the job name used by the global history manager
        :type job_name: :class:`str`
        :param database_name: the default database name
        :type database_name: :class:`str`
        :param schema: the metadata schema used to create the global history
            manager database (optional)
        :param collection: the collection name used by the global history
            manager
        :type collection: :class:`str`
        :raises mspasspy.ccore.utility.MsPASSError: if any argument has the
            wrong type, or a database client / scheduler cannot be created
        """
        # database_host should be a string
        if database_host is not None and not type(database_host) is str:
            raise MsPASSError(
                "database_host should be a string but "
                + str(type(database_host))
                + " is found.",
                "Fatal",
            )
        # scheduler should be one of the two supported backends
        if scheduler is not None and scheduler != "dask" and scheduler != "spark":
            raise MsPASSError(
                "scheduler should be either dask or spark but "
                + str(scheduler)
                + " is found.",
                "Fatal",
            )
        # scheduler_host should be a string
        if scheduler_host is not None and not type(scheduler_host) is str:
            raise MsPASSError(
                "scheduler_host should be a string but "
                + str(type(scheduler_host))
                + " is found.",
                "Fatal",
            )
        # job_name should be a string
        if job_name is not None and not type(job_name) is str:
            raise MsPASSError(
                "job_name should be a string but "
                + str(type(job_name))
                + " is found.",
                "Fatal",
            )
        # database_name should be a string
        if database_name is not None and not type(database_name) is str:
            raise MsPASSError(
                "database_name should be a string but "
                + str(type(database_name))
                + " is found.",
                "Fatal",
            )
        # collection should be a string
        if collection is not None and type(collection) is not str:
            raise MsPASSError(
                "collection should be a string but "
                + str(type(collection))
                + " is found.",
                "Fatal",
            )

        # check env variables
        MSPASS_DB_ADDRESS = os.environ.get("MSPASS_DB_ADDRESS")
        MONGODB_PORT = os.environ.get("MONGODB_PORT")
        MSPASS_SCHEDULER = os.environ.get("MSPASS_SCHEDULER")
        MSPASS_SCHEDULER_ADDRESS = os.environ.get("MSPASS_SCHEDULER_ADDRESS")
        DASK_SCHEDULER_PORT = os.environ.get("DASK_SCHEDULER_PORT")
        SPARK_MASTER_PORT = os.environ.get("SPARK_MASTER_PORT")

        # create a database client
        # priority: parameter -> env -> default
        database_host_has_port = False
        if database_host:
            database_address = database_host
            # check if database_host contains port number already
            if ":" in database_address:
                database_host_has_port = True
        elif MSPASS_DB_ADDRESS:
            database_address = MSPASS_DB_ADDRESS
            # the env address may already carry a port; detect it so we do
            # not append a second one below
            if ":" in database_address:
                database_host_has_port = True
        else:
            database_address = "localhost"
        # add port
        if not database_host_has_port and MONGODB_PORT:
            database_address += ":" + MONGODB_PORT

        # sanity check: server_info() forces a round trip to the server
        try:
            self._db_client = DBClient(database_address)
            self._db_client.server_info()
        except Exception as err:
            raise MsPASSError(
                "Runtime error: cannot create a database client with: "
                + database_address,
                "Fatal",
            ) from err

        # set default database name
        self._default_database_name = database_name
        self._default_schema = schema
        self._default_collection = collection

        # create a Global History Manager
        if schema:
            global_history_manager_db = Database(
                self._db_client, database_name, db_schema=schema
            )
        else:
            global_history_manager_db = Database(self._db_client, database_name)
        self._global_history_manager = GlobalHistoryManager(
            global_history_manager_db, job_name, collection=collection
        )

        # set scheduler
        # priority: parameter -> env -> default
        if scheduler:
            self._scheduler = scheduler
        elif MSPASS_SCHEDULER:
            self._scheduler = MSPASS_SCHEDULER
        else:
            self._scheduler = "dask"

        # scheduler configuration
        if self._scheduler == "spark":
            scheduler_host_has_port = False
            if scheduler_host:
                self._spark_master_url = scheduler_host
                # add spark:// prefix if not exist
                if "spark://" not in scheduler_host:
                    self._spark_master_url = "spark://" + self._spark_master_url
                # check if spark host address contains port number already
                # ("spark://host:port" has exactly two colons)
                if self._spark_master_url.count(":") == 2:
                    scheduler_host_has_port = True
            elif MSPASS_SCHEDULER_ADDRESS:
                self._spark_master_url = MSPASS_SCHEDULER_ADDRESS
                # add spark:// prefix if not exist
                if "spark://" not in MSPASS_SCHEDULER_ADDRESS:
                    self._spark_master_url = "spark://" + self._spark_master_url
                # detect an embedded port here as well so SPARK_MASTER_PORT
                # is not appended twice
                if self._spark_master_url.count(":") == 2:
                    scheduler_host_has_port = True
            else:
                self._spark_master_url = "local"
            # add port number only when:
            # 1. not the default 'local'
            # 2. the address does not contain a port number already
            # 3. SPARK_MASTER_PORT exists
            if (
                (scheduler_host or MSPASS_SCHEDULER_ADDRESS)
                and not scheduler_host_has_port
                and SPARK_MASTER_PORT
            ):
                self._spark_master_url += ":" + SPARK_MASTER_PORT
            # sanity check
            try:
                spark = (
                    SparkSession.builder.appName("mspass")
                    .master(self._spark_master_url)
                    .getOrCreate()
                )
                self._spark_context = spark.sparkContext
            except Exception as err:
                raise MsPASSError(
                    "Runtime error: cannot create a spark configuration with: "
                    + self._spark_master_url,
                    "Fatal",
                ) from err
        elif self._scheduler == "dask":
            # if no defined scheduler_host and no MSPASS_SCHEDULER_ADDRESS,
            # use a local cluster to create a client
            if not scheduler_host and not MSPASS_SCHEDULER_ADDRESS:
                self._dask_client = DaskClient()
            else:
                scheduler_host_has_port = False
                # set host
                if scheduler_host:
                    self._dask_client_address = scheduler_host
                else:
                    self._dask_client_address = MSPASS_SCHEDULER_ADDRESS
                # check if the address contains a port number already
                if ":" in self._dask_client_address:
                    scheduler_host_has_port = True
                # add port only when the address does not already carry one;
                # the previous code appended ":8786" even when a port was
                # present, yielding an invalid "host:port:8786" address
                if not scheduler_host_has_port:
                    if DASK_SCHEDULER_PORT:
                        self._dask_client_address += ":" + DASK_SCHEDULER_PORT
                    else:
                        # use port 8786 by default if not specified
                        self._dask_client_address += ":8786"
                # sanity check
                try:
                    self._dask_client = DaskClient(self._dask_client_address)
                except Exception as err:
                    raise MsPASSError(
                        "Runtime error: cannot create a dask client with: "
                        + self._dask_client_address,
                        "Fatal",
                    ) from err

    def get_database_client(self):
        """
        Get the database client used by this MsPASS client.

        :return: :class:`mspasspy.db.client.DBClient`
        """
        return self._db_client

    def get_database(self, database_name=None):
        """
        Get a database by database_name; if database_name is not specified,
        use the default one set at construction time.

        :param database_name: the name of database
        :type database_name: :class:`str`
        :return: :class:`mspasspy.db.database.Database`
        """
        if not database_name:
            return Database(self._db_client, self._default_database_name)
        return Database(self._db_client, database_name)

    def get_global_history_manager(self):
        """
        Get the global history manager of this client.

        :return: :class:`mspasspy.global_history.manager.GlobalHistoryManager`
        """
        return self._global_history_manager

    def get_scheduler(self):
        """
        Get the scheduler (spark/dask) of this client.

        :return: :class:`pyspark.SparkContext`/:class:`dask.distributed.Client`
        """
        if self._scheduler == "spark":
            return self._spark_context
        else:
            return self._dask_client

    def set_database_client(self, database_host, database_port=None):
        """
        Set a database client by database_host (and database_port).

        On failure the previous database client is restored before raising.

        :param database_host: the host address of database client
        :type database_host: :class:`str`
        :param database_port: the port of database client
        :type database_port: :class:`str`
        :raises mspasspy.ccore.utility.MsPASSError: if a client cannot be
            created for the resulting address
        """
        database_host_has_port = False
        database_address = database_host
        # check if port is already in the database_host address
        if ":" in database_host:
            database_host_has_port = True
        # add port
        if not database_host_has_port and database_port:
            database_address += ":" + database_port

        # sanity check; keep the old client so we can roll back on failure
        temp_db_client = self._db_client
        try:
            self._db_client = DBClient(database_address)
            self._db_client.server_info()
        except Exception as err:
            # restore the _db_client
            self._db_client = temp_db_client
            raise MsPASSError(
                "Runtime error: cannot create a database client with: "
                + database_address,
                "Fatal",
            ) from err

    def set_global_history_manager(self, history_db, job_name, collection=None):
        """
        Set a global history manager by history_db, job_name (and collection).

        :param history_db: the database that will be set in the global
            history manager
        :type history_db: :class:`mspasspy.db.database.Database`
        :param job_name: the job name that will be set in the global history
            manager
        :type job_name: :class:`str`
        :param collection: the collection name that will be set in the
            history_db
        :type collection: :class:`str`
        :raises TypeError: if any argument has the wrong type
        """
        if not isinstance(history_db, Database):
            raise TypeError(
                "history_db should be a mspasspy.db.Database but "
                + str(type(history_db))
                + " is found."
            )
        if not type(job_name) is str:
            raise TypeError(
                "job_name should be a string but " + str(type(job_name)) + " is found."
            )
        if collection is not None and type(collection) is not str:
            raise TypeError(
                "collection should be a string but "
                + str(type(collection))
                + " is found."
            )
        self._global_history_manager = GlobalHistoryManager(
            history_db, job_name, collection=collection
        )

    def set_scheduler(self, scheduler, scheduler_host, scheduler_port=None):
        """
        Set a scheduler by scheduler type, scheduler_host (and scheduler_port).

        On failure the previous scheduler (and its type) is restored before
        raising.

        :param scheduler: the scheduler type, should be either dask or spark
        :type scheduler: :class:`str`
        :param scheduler_host: the host address of scheduler
        :type scheduler_host: :class:`str`
        :param scheduler_port: the port of scheduler
        :type scheduler_port: :class:`str`
        :raises mspasspy.ccore.utility.MsPASSError: if the scheduler type is
            invalid or the new scheduler cannot be created
        """
        if scheduler != "dask" and scheduler != "spark":
            raise MsPASSError(
                "scheduler should be either dask or spark but "
                + str(scheduler)
                + " is found.",
                "Fatal",
            )
        prev_scheduler = self._scheduler
        self._scheduler = scheduler

        if scheduler == "spark":
            scheduler_host_has_port = False
            self._spark_master_url = scheduler_host
            # add spark:// prefix if not exist
            if "spark://" not in scheduler_host:
                self._spark_master_url = "spark://" + self._spark_master_url
            # check if spark host address contains port number already
            if self._spark_master_url.count(":") == 2:
                scheduler_host_has_port = True
            # add port
            if not scheduler_host_has_port and scheduler_port:
                self._spark_master_url += ":" + scheduler_port

            # sanity check; remember the old configuration so we can roll
            # back if creating the new context fails
            prev_spark_conf = None
            if hasattr(self, "_spark_context"):
                prev_spark_conf = self._spark_context.getConf()
            try:
                if hasattr(self, "_spark_context") and isinstance(
                    self._spark_context, SparkContext
                ):
                    # update the configuration
                    spark_conf = self._spark_context._conf.setMaster(
                        self._spark_master_url
                    )
                else:
                    spark_conf = (
                        SparkConf().setAppName("mspass").setMaster(
                            self._spark_master_url
                        )
                    )
                # stop the previous spark context
                # FIXME if the new context does not start, we shouldn't stop the previous here.
                # if prev_spark_context:
                #     prev_spark_context.stop()
                # create a new spark context -> might cause error so that execute exception code
                spark = SparkSession.builder.config(conf=spark_conf).getOrCreate()
                self._spark_context = spark.sparkContext
            except Exception as err:
                # restore the spark context by the previous spark configuration
                if prev_spark_conf:
                    self._spark_context = SparkContext.getOrCreate(
                        conf=prev_spark_conf
                    )
                # restore the scheduler type
                if self._scheduler == "spark" and prev_scheduler == "dask":
                    self._scheduler = prev_scheduler
                raise MsPASSError(
                    "Runtime error: cannot create a spark configuration with: "
                    + self._spark_master_url,
                    "Fatal",
                ) from err
            # close previous dask client if success
            if hasattr(self, "_dask_client"):
                del self._dask_client

        elif scheduler == "dask":
            scheduler_host_has_port = False
            self._dask_client_address = scheduler_host
            # check if scheduler_host contains port number already
            if ":" in scheduler_host:
                scheduler_host_has_port = True
            # add port
            if not scheduler_host_has_port:
                if scheduler_port:
                    self._dask_client_address += ":" + scheduler_port
                else:
                    # use port 8786 by default if not specified
                    self._dask_client_address += ":8786"

            # sanity check; keep the old client so we can roll back on failure
            prev_dask_client = None
            if hasattr(self, "_dask_client"):
                prev_dask_client = self._dask_client
            try:
                # create a new dask client
                self._dask_client = DaskClient(self._dask_client_address)
            except Exception as err:
                # restore the dask client if exists
                if prev_dask_client:
                    self._dask_client = prev_dask_client
                # restore the scheduler type
                if self._scheduler == "dask" and prev_scheduler == "spark":
                    self._scheduler = prev_scheduler
                raise MsPASSError(
                    "Runtime error: cannot create a dask client with: "
                    + self._dask_client_address,
                    "Fatal",
                ) from err
            # remove previous spark context if success setting new dask client
            if hasattr(self, "_spark_context"):
                del self._spark_context
def main(args=None):
    """
    Command-line entry point for the dbverify program.

    Parses the command line, connects to the named MongoDB database, and
    runs exactly one of three verification tests (normalization, required,
    or schema_check) selected with the -t option.

    :param args: argument list to parse; defaults to ``sys.argv[1:]``
    :type args: :class:`list` of :class:`str`
    """
    # As a script that would be run from the shell we let
    # any functions below that throw exception do so and assume they
    # will write a message that can help debug what went wrong
    if args is None:
        args = sys.argv[1:]
    parser = argparse.ArgumentParser(
        prog="dbverify",
        usage="%(prog)s dbname [-t TEST -c [collection ...] -n [normalize ... ] -error_limit n -v]",
        description="MsPASS database verify program",
    )
    parser.add_argument(
        "dbname",
        metavar="dbname",
        type=str,
        help="MongoDB database name on which to run tests",
    )
    parser.add_argument(
        "-t",
        "--test",
        action="store",
        type=str,
        default="normalization",
        help="Select which test to run. "
        + "Current options: normalization, required, schema_check",
    )
    parser.add_argument(
        "-c",
        "--collection",
        action="store",
        nargs="*",
        default=["wf_TimeSeries"],
        help="Collection(s) on which the test is to be run. "
        + "Only schema_check supports multiple collections in one run",
    )
    parser.add_argument(
        "-n",
        "--normalize",
        nargs="*",
        default=["site_id", "channel_id", "source_id"],
        help="List of normalization keys to test\n"
        + "(Used only for -test normalization option",
    )
    parser.add_argument(
        "-r",
        "--require",
        nargs="*",
        default=[],
        help="List of keys of required attributes for required test",
    )
    parser.add_argument(
        "-e",
        "--error_limit",
        action="store",
        type=int,
        default=1000,
        help="Set error limit - stop checking when this many errors are found\n"
        + "Default is 1000",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="When used print offending values. Otherwise just return a summary",
    )
    args = parser.parse_args(args)
    test_to_run = args.test
    dbname = args.dbname
    dbclient = DBClient()
    db = Database(dbclient, dbname)
    col_to_test = args.collection
    normalize = args.normalize
    reqlist = args.require
    verbose = args.verbose
    elimit = args.error_limit
    # If python had a switch case it would be used here. This
    # is the list of known tests. The program can only run one
    # test per execution. Intentional to make output more readable.
    if test_to_run == "normalization":
        if len(col_to_test) > 1:
            print(
                "WARNING: normalization test can only be run on one collection at a time"
            )
            print("Parsed a list with the following contents: ", col_to_test)
            print("Running test on the first item in that list")
        col = col_to_test[0]
        if not isinstance(col, str):
            print("Invalid value parsed for -c option=", col)
            # use sys.exit so the nonzero status is reliable even when the
            # site module's interactive exit() helper is unavailable
            sys.exit(-1)
        run_check_links(db, col, normalize, elimit, verbose)
    elif test_to_run == "required":
        if len(col_to_test) > 1:
            print(
                "WARNING: required test can only be run on one collection at a time"
            )
            print("Parsed a list with the following contents: ", col_to_test)
            print("Running test on the first item in that list")
        col = col_to_test[0]
        if not isinstance(col, str):
            print("Invalid value parsed for -c option=", col_to_test)
            sys.exit(-1)
        if len(reqlist) == 0:
            # Depends on default being an empty list. For default
            # case run this small function.
            # This is currently a function above with const list values
            # returned for each known collection. It may eventually
            # be replaced by a function using the schema
            required_list = get_required(col)
        else:
            required_list = reqlist
        run_check_required(db, col, required_list, elimit, verbose)
    elif test_to_run == "schema_check":
        # schema_check is the only test allowed to run on several
        # collections in a single invocation
        for col in col_to_test:
            run_check_attribute_types(db, col, elimit, verbose)
    else:
        print("Unrecognized value for --test value parsed=", test_to_run)
        print("Must be one of: normalization, required, or schema_check")