def setup_class(self):
    """Seed the 'test_dbclean' database with one site, one channel and one
    source document and attach their ObjectIds to a live TimeSeries fixture."""
    db_client = Client('localhost')
    self.db = Database(db_client, 'test_dbclean')
    self.test_ts = get_live_timeseries()
    site_oid = ObjectId()
    channel_oid = ObjectId()
    source_oid = ObjectId()
    site_doc = {
        '_id': site_oid,
        'net': 'net',
        'sta': 'sta',
        'loc': 'loc',
        'lat': 1.0,
        'lon': 1.0,
        'elev': 2.0,
        'starttime': datetime.utcnow().timestamp(),
        'endtime': datetime.utcnow().timestamp(),
    }
    self.db['site'].insert_one(site_doc)
    channel_doc = {
        '_id': channel_oid,
        'net': 'net1',
        'sta': 'sta1',
        'loc': 'loc1',
        'chan': 'chan',
        'lat': 1.1,
        'lon': 1.1,
        'elev': 2.1,
        'starttime': datetime.utcnow().timestamp(),
        'endtime': datetime.utcnow().timestamp(),
        'edepth': 3.0,
        'vang': 1.0,
        'hang': 1.0,
    }
    self.db['channel'].insert_one(channel_doc)
    source_doc = {
        '_id': source_oid,
        'lat': 1.2,
        'lon': 1.2,
        'time': datetime.utcnow().timestamp(),
        'depth': 3.1,
        'magnitude': 1.0,
    }
    self.db['source'].insert_one(source_doc)
    # cross-reference the fixture waveform to the normalization docs
    self.test_ts['site_id'] = site_oid
    self.test_ts['source_id'] = source_oid
    self.test_ts['channel_id'] = channel_oid
def setup_class(self):
    """Populate 'test_dbclean' with site/channel/source records and wire
    their ids into a live TimeSeries test object."""
    conn = DBClient("localhost")
    self.db = Database(conn, "test_dbclean")
    self.test_ts = get_live_timeseries()
    sid = ObjectId()
    cid = ObjectId()
    srcid = ObjectId()
    self.db["site"].insert_one(
        {
            "_id": sid,
            "net": "net",
            "sta": "sta",
            "loc": "loc",
            "lat": 1.0,
            "lon": 1.0,
            "elev": 2.0,
            "starttime": datetime.utcnow().timestamp(),
            "endtime": datetime.utcnow().timestamp(),
        }
    )
    self.db["channel"].insert_one(
        {
            "_id": cid,
            "net": "net1",
            "sta": "sta1",
            "loc": "loc1",
            "chan": "chan",
            "lat": 1.1,
            "lon": 1.1,
            "elev": 2.1,
            "starttime": datetime.utcnow().timestamp(),
            "endtime": datetime.utcnow().timestamp(),
            "edepth": 3.0,
            "vang": 1.0,
            "hang": 1.0,
        }
    )
    self.db["source"].insert_one(
        {
            "_id": srcid,
            "lat": 1.2,
            "lon": 1.2,
            "time": datetime.utcnow().timestamp(),
            "depth": 3.1,
            "magnitude": 1.0,
        }
    )
    # link the fixture to the freshly inserted normalization documents
    self.test_ts["site_id"] = sid
    self.test_ts["source_id"] = srcid
    self.test_ts["channel_id"] = cid
def get_database(self, database_name=None):
    """
    Return a :class:`mspasspy.db.database.Database` handle by name.

    :param database_name: name of the database; when omitted (or falsy)
        the client's default database name is used
    :type database_name: :class:`str`
    :return: :class:`mspasspy.db.database.Database`
    """
    # falsy (None/empty) selects the configured default, matching the
    # documented "not specified" behavior
    name = database_name or self._default_database_name
    return Database(self._db_client, name)
def setup_class(self):
    """Drop and re-create the 'test_manager' database, purge every
    collection, and build the GlobalHistoryManager under test."""
    self.client = DBClient("localhost")
    self.client.drop_database("test_manager")
    history_db = Database(self.client, "test_manager")
    history_db["history_global"].drop_indexes()
    # clear any leftover documents so counts in the tests start from zero
    for collection in history_db.list_collection_names():
        history_db[collection].delete_many({})
    self.manager = GlobalHistoryManager(history_db, "test_job", collection="history_global")
def __getitem__(self, name):
    """
    Get a database by name.

    Raises :class:`~pymongo.errors.InvalidName` if an invalid
    database name is used.

    :Parameters:
      - `name`: the name of the database to get
    """
    # Delegate to Database so `client[name]` behaves like get_database(name);
    # name validation happens inside the Database constructor.
    return Database(self, name)
def test_get_alg_id(self):
    """get_alg_id is falsy for an unknown alg/parameter pair and returns
    the stored alg_id for a known one."""
    history_db = Database(self.client, "test_manager")
    params = '{"object_history": "True", "alg_id": "3"}'
    # no record exists for this combination
    assert not self.manager.get_alg_id("aaa", "bbb")
    stored = history_db["history_global"].find_one(
        {"alg_name": "new_stack", "parameters": params}
    )
    assert self.manager.get_alg_id("new_stack", params) == stored["alg_id"]
def get_default_database(self, default=None, codec_options=None,
                         read_preference=None, write_concern=None,
                         read_concern=None):
    """Return the client's default Database, falling back to `default`
    when no default database name was configured.

    Raises :class:`pymongo.errors.ConfigurationError` when neither a
    configured default nor a `default` argument is available.
    """
    if self.__default_database_name is None and default is None:
        raise pymongo.errors.ConfigurationError(
            'No default database name defined or provided.')
    # configured name wins; a falsy configured name falls through to `default`
    name = self.__default_database_name or default
    return Database(self, name, codec_options, read_preference,
                    write_concern, read_concern)
def get_database(self, name=None, codec_options=None, read_preference=None,
                 write_concern=None, read_concern=None):
    """Return a Database by `name`, or the configured default when `name`
    is None.

    Raises :class:`pymongo.errors.ConfigurationError` when `name` is None
    and no default database is configured.
    """
    if name is None and self.__default_database_name is None:
        raise pymongo.errors.ConfigurationError(
            'No default database defined')
    if name is None:
        name = self.__default_database_name
    return Database(self, name, codec_options, read_preference,
                    write_concern, read_concern)
def test_object_history(self, spark_context):
    """Run spark_map over two TimeSeries and verify both the global history
    record and the per-object history document written by save_data."""
    manager_db = Database(self.client, "test_manager")
    manager_db["history_global"].delete_many({})
    manager_db["history_object"].delete_many({})
    l = [get_live_timeseries() for i in range(2)]
    # add net, sta, chan, loc to avoid metadata serialization problem
    for i in range(2):
        l[i]["chan"] = "HHZ"
        l[i]["loc"] = "test_loc"
        l[i]["net"] = "test_net"
        l[i]["sta"] = "test_sta"
    spark_res = spark_map(l, self.manager, spark_context)
    # one global history record for the whole mapped "filter" run
    assert manager_db["history_global"].count_documents(
        {"alg_name": "filter"}) == 1
    res = manager_db["history_global"].find_one({"alg_name": "filter"})
    alg_id = res["alg_id"]
    # check status of the mspass objects
    for ts in spark_res:
        assert ts.number_of_stages() == 1
        assert ts.current_nodedata().algorithm == "filter"
        assert ts.current_nodedata().algid == str(alg_id)
        assert ts.is_volatile()
    save_res = manager_db.save_data(spark_res[0], alg_name="filter",
                                    alg_id=str(alg_id))
    # hardcode net, sta, net, loc to avoid serialization problem here, they are readonly metadata keys -> non fatal keys = 4
    assert save_res.live
    assert manager_db["history_object"].count_documents(
        {"alg_name": "filter"}) == 1
    doc = manager_db["history_object"].find_one({"alg_name": "filter"})
    assert doc
    # object history doc must point back at the saved waveform and run
    assert doc["_id"] == spark_res[0].current_nodedata().uuid
    assert doc["wf_TimeSeries_id"] == spark_res[0]["_id"]
    assert doc["alg_id"] == str(alg_id)
    assert doc["alg_name"] == "filter"
def test_logging(self):
    """logging() writes exactly one history_global record carrying the
    manager's job id/name plus the supplied alg_id/name/parameters."""
    alg_id = ObjectId()
    manager_db = Database(self.client, "test_manager")
    manager_db["history_global"].delete_many({})
    self.manager.logging(alg_id, "test_alg_name", "test_parameter")
    res = manager_db["history_global"].find_one(
        {"job_name": self.manager.job_name})
    assert res["job_id"] == self.manager.job_id
    assert res["job_name"] == self.manager.job_name
    assert res["alg_name"] == "test_alg_name"
    assert res["alg_id"] == alg_id
    assert res["parameters"] == "test_parameter"
    # exactly one record was written for this job
    assert (manager_db["history_global"].count_documents(
        {"job_name": self.manager.job_name}) == 1)
    # clean up
    manager_db["history_global"].delete_many({})
def test_set_alg_name_and_parameters(self):
    """set_alg_name_and_parameters rewrites every history_global record
    sharing an alg_id to the new name/parameters, keeping the alg_id."""
    manager_db = Database(self.client, "test_manager")
    # precondition: three records exist for the old name/parameter pair
    assert (manager_db["history_global"].count_documents({
        "alg_name": "stack",
        "parameters": '{"object_history": "True", "alg_id": "3"}',
    }) == 3)
    res = manager_db["history_global"].find_one({
        "alg_name": "stack",
        "parameters": '{"object_history": "True", "alg_id": "3"}',
    })
    alg_id = res["alg_id"]
    self.manager.set_alg_name_and_parameters(alg_id, "test_alg_name",
                                             "test_parameters")
    # old records gone, all three renamed in place
    assert (manager_db["history_global"].count_documents({
        "alg_name": "stack",
        "parameters": '{"object_history": "True", "alg_id": "3"}',
    }) == 0)
    assert (manager_db["history_global"].count_documents({
        "alg_name": "test_alg_name",
        "parameters": "test_parameters"
    }) == 3)
    res = manager_db["history_global"].find_one({
        "alg_name": "test_alg_name",
        "parameters": "test_parameters"
    })
    # alg_id itself is preserved across the rename
    assert res["alg_id"] == alg_id
def main(args=None):
    """Command-line entry point for dbverify.

    Parses the CLI arguments, connects to the named MongoDB database and
    dispatches to exactly one of the three verification tests
    (normalization, required, schema_check).

    :param args: argument list; defaults to ``sys.argv[1:]`` when None
    """
    # As a script that would be run from the shell we let
    # any functions below that throw exception do so and assume they
    # will write a message that can help debug what went wrong
    if args is None:
        args = sys.argv[1:]
    parser = argparse.ArgumentParser(
        prog="dbverify",
        usage=
        "%(prog)s dbname [-t TEST -c [collection ...] -n [normalize ... ] -error_limit n -v]",
        description="MsPASS database verify program",
    )
    parser.add_argument(
        "dbname",
        metavar="dbname",
        type=str,
        help="MongoDB database name on which to run tests",
    )
    parser.add_argument(
        "-t",
        "--test",
        action="store",
        type=str,
        default="normalization",
        help="Select which test to run. " +
        "Current options: normalization, required, schema_check",
    )
    parser.add_argument(
        "-c",
        "--collection",
        action="store",
        nargs="*",
        default=["wf_TimeSeries"],
        help="Collection(s) on which the test is to be run. " +
        "Only schema_check supports multiple collections in one run",
    )
    parser.add_argument(
        "-n",
        "--normalize",
        nargs="*",
        default=["site_id", "channel_id", "source_id"],
        help="List of normalization keys to test\n" +
        "(Used only for -test normalization option",
    )
    parser.add_argument(
        "-r",
        "--require",
        nargs="*",
        default=[],
        help="List of keys of required attributes for required test",
    )
    parser.add_argument(
        "-e",
        "--error_limit",
        action="store",
        type=int,
        default=1000,
        help="Set error limit - stop checking when this many errors are found\n"
        + "Default is 1000",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help=
        "When used print offending values. Otherwise just return a summary",
    )
    args = parser.parse_args(args)
    test_to_run = args.test
    dbname = args.dbname
    dbclient = DBClient()
    db = Database(dbclient, dbname)
    col_to_test = args.collection
    normalize = args.normalize
    reqlist = args.require
    verbose = args.verbose
    elimit = args.error_limit
    # If python had a switch case it would be used here. this
    # is the list of known tests. the program can only run one
    # test per execution.
    # Intentional to make output more readable
    if test_to_run == "normalization":
        # normalization and required run on a single collection only
        if len(col_to_test) > 1:
            print(
                "WARNING: normalization test can only be run on one collection at a time"
            )
            print("Parsed a list with the following contents: ", col_to_test)
            print("Running test on the first item in that list")
        col = col_to_test[0]
        if not isinstance(col, str):
            print("Invalid value parsed for -c option=", col)
            exit(-1)
        run_check_links(db, col, normalize, elimit, verbose)
    elif test_to_run == "required":
        if len(col_to_test) > 1:
            print(
                "WARNING: required test can only be run on one collection at a time"
            )
            print("Parsed a list with the following contents: ", col_to_test)
            print("Running test on the first item in that list")
        col = col_to_test[0]
        if not isinstance(col, str):
            print("Invalid value parsed for -c option=", col_to_test)
            exit(-1)
        if len(reqlist) == 0:
            # Depends on default being an empty list. For default
            # case run this small function.
            # This is currently a funtion above with const list values
            # returned for each known collection. It may eventually
            # be replaced a function using the schema
            required_list = get_required(col)
        else:
            required_list = reqlist
        run_check_required(db, col, required_list, elimit, verbose)
    elif test_to_run == "schema_check":
        # schema_check is the only test that accepts multiple collections
        for col in col_to_test:
            run_check_attribute_types(db, col, elimit, verbose)
    else:
        print("Unrecognized value for --test value parsed=", test_to_run)
        print("Must be one of: normalization, required, or schema_check")
class TestDBVerify():
    """End-to-end tests of the dbverify CLI against a seeded test database.

    NOTE(review): the asserts below compare captured stdout byte-for-byte
    against dbverify's report format; any change to that format breaks them.
    """

    def setup_class(self):
        # Seed 'test_dbverify' with one site/channel/source document and
        # attach their ids to a live TimeSeries fixture.
        client = Client('localhost')
        self.db = Database(client, 'test_dbverify')
        self.test_ts = get_live_timeseries()
        site_id = ObjectId()
        channel_id = ObjectId()
        source_id = ObjectId()
        self.db['site'].insert_one({
            '_id': site_id,
            'net': 'net',
            'sta': 'sta',
            'loc': 'loc',
            'lat': 1.0,
            'lon': 1.0,
            'elev': 2.0,
            'starttime': datetime.utcnow().timestamp(),
            'endtime': datetime.utcnow().timestamp()
        })
        self.db['channel'].insert_one({
            '_id': channel_id,
            'net': 'net1',
            'sta': 'sta1',
            'loc': 'loc1',
            'chan': 'chan',
            'lat': 1.1,
            'lon': 1.1,
            'elev': 2.1,
            'starttime': datetime.utcnow().timestamp(),
            'endtime': datetime.utcnow().timestamp(),
            'edepth': 3.0,
            'vang': 1.0,
            'hang': 1.0
        })
        self.db['source'].insert_one({
            '_id': source_id,
            'lat': 1.2,
            'lon': 1.2,
            'time': datetime.utcnow().timestamp(),
            'depth': 3.1,
            'magnitude': 1.0
        })
        self.test_ts['site_id'] = site_id
        self.test_ts['source_id'] = source_id
        self.test_ts['channel_id'] = channel_id

    def test_main(self, capfd):
        # Save three deliberately broken documents, then run each dbverify
        # test mode and compare the captured output exactly.
        self.db['wf_TimeSeries'].delete_many({})
        ts1 = copy.deepcopy(self.test_ts)
        ts2 = copy.deepcopy(self.test_ts)
        ts3 = copy.deepcopy(self.test_ts)
        logging_helper.info(ts1, 'deepcopy', '1')
        logging_helper.info(ts2, 'deepcopy', '1')
        logging_helper.info(ts3, 'deepcopy', '1')
        # fix types
        ts1['npts'] = '123'
        ts1['extra1'] = 'extra1'
        ts2['delta'] = '3'
        ts2['extra2'] = 'extra2'
        ts3['npts'] = 'xyz'
        ts3['extra2'] = 'extra2'
        # wrong normalized key
        ts1['site_id'] = ObjectId()
        ts2.erase('source_id')
        save_res_code = self.db.save_data(ts1,
                                          mode='promiscuous',
                                          storage_mode='gridfs')
        save_res_code = self.db.save_data(ts2,
                                          mode='promiscuous',
                                          storage_mode='gridfs')
        # erase required attributes
        save_res_code = self.db.save_data(ts3,
                                          mode='promiscuous',
                                          storage_mode='gridfs',
                                          exclude_keys=['starttime'])
        doc1 = self.db['wf_TimeSeries'].find_one({'_id': ts1['_id']})
        doc2 = self.db['wf_TimeSeries'].find_one({'_id': ts2['_id']})
        doc3 = self.db['wf_TimeSeries'].find_one({'_id': ts3['_id']})
        doc1_str = json_util.dumps(doc1, indent=2)
        doc2_str = json_util.dumps(doc2, indent=2)
        doc3_str = json_util.dumps(doc3, indent=2)
        # default normalization test
        dbverify.main(['test_dbverify', '-t', 'normalization'])
        out, err = capfd.readouterr()
        assert out == "normalization test on normalized key= site_id found problems\nFound broken links in 1 documents checked\nNote error count limit= 1000\nIf the count is the same it means all data probably contain missing cross referencing ids\nRun in verbose mode to find out more information you will need to fix the problem\ncheck_links found no broken links with normalized key= channel_id\ncheck_links found no broken links with normalized key= source_id\n"
        # more than 1 collection to test
        dbverify.main([
            'test_dbverify', '-t', 'normalization', '-c', 'wf_TimeSeries',
            'site'
        ])
        out, err = capfd.readouterr()
        assert out == "WARNING: normalization test can only be run on one collection at a time\nParsed a list with the following contents: ['wf_TimeSeries', 'site']\nRunning test on the first item in that list\nnormalization test on normalized key= site_id found problems\nFound broken links in 1 documents checked\nNote error count limit= 1000\nIf the count is the same it means all data probably contain missing cross referencing ids\nRun in verbose mode to find out more information you will need to fix the problem\ncheck_links found no broken links with normalized key= channel_id\ncheck_links found no broken links with normalized key= source_id\n"
        # verbose mode
        dbverify.main(['test_dbverify', '-t', 'normalization', '-v'])
        out, err = capfd.readouterr()
        assert out == "check_link found the following docs in wf_TimeSeries with broken links to site_id\n////////////////Doc number 1 with error///////////////\n" + doc1_str + "\n////////////////////////////////////////////////////////\ncheck_links found no undefined linking key to normalized key= site_id\ncheck_links found no broken links with normalized key= channel_id\ncheck_links found no undefined linking key to normalized key= channel_id\ncheck_links found no broken links with normalized key= source_id\ncheck_link found the following docs in wf_TimeSeries with undefined link keys to source_id\n////////////////Doc number 1 with error///////////////\n" + doc2_str + "\n////////////////////////////////////////////////////////\n"
        # default required test
        dbverify.main(['test_dbverify', '-t', 'required'])
        out, err = capfd.readouterr()
        mmkeys = {'npts': 2, 'delta': 1}
        mm_keys_str = json_util.dumps(mmkeys, indent=2)
        undef_keys = {'starttime': 1}
        undef_keys_str = json_util.dumps(undef_keys, indent=2)
        assert out == "////Results from run_check_required on collection= wf_TimeSeries\nCollection found 3 documents with type inconsistencies\nOffending keys and number found follow:\n" + mm_keys_str + "\nCollection found 1 documents with required keys that were not defined\nOffending keys and number found follow:\n" + undef_keys_str + "\n"
        # default schema_check test
        dbverify.main(['test_dbverify', '-t', 'schema_check'])
        out, err = capfd.readouterr()
        mmkeys = {'npts': 2, 'delta': 1}
        mm_keys_str = json_util.dumps(mmkeys, indent=2)
        undef_keys = {'extra1': 1, 'extra2': 2}
        undef_keys_str = json_util.dumps(undef_keys, indent=2)
        assert out == "check_attribute_types result for collection= wf_TimeSeries\nCollection found 3 documents with type inconsistencies\nOffending keys and number found follow:\n" + mm_keys_str + "\nCollection found 3 documents with keys not defined in the schema\nOffending keys and number found follow:\n" + undef_keys_str + "\n"
def test_mspass_map_with_filePath(self, spark_context):
    """Exercise mspass_map with a pf file path argument on both spark and
    dask, checking that identical alg+parameters reuse the same alg_id and
    that user-supplied alg_name/parameters create or reuse records.

    NOTE(review): the history_global counts (8, 9, 10, ...) are cumulative
    across earlier tests in this class — test ordering matters here.
    """
    # test mapass_map for spark (file input)
    # data input of RFdecon, needed for parallelization
    d = [get_live_seismogram(71, 2.0) for i in range(5)]
    for i in range(5):
        d[i].t0 = -5
    # parameters string
    pfPath = "python/mspasspy/data/pf/RFdeconProcessor.pf"
    pf = AntelopePf(pfPath)
    pf_dict = AntelopePf2dict(pf)
    parameter_dict = collections.OrderedDict()
    parameter_dict["alg"] = "LeastSquares"
    parameter_dict["pf"] = pf_dict
    parameter_dict["object_history"] = "True"
    gTree = ParameterGTree(parameter_dict)
    json_params = json.dumps(gTree.asdict())
    data = spark_context.parallelize(d)
    spark_res = data.mspass_map(
        RFdecon,
        alg="LeastSquares",
        pf=pfPath,
        object_history=True,
        global_history=self.manager,
        alg_name=None,
        parameters=None,
    ).collect()
    manager_db = Database(self.client, "test_manager")
    assert (manager_db["history_global"].count_documents(
        {"job_name": self.manager.job_name}) == 8)
    res = manager_db["history_global"].find_one({"alg_name": "RFdecon"})
    assert res["job_id"] == self.manager.job_id
    assert res["job_name"] == self.manager.job_name
    assert res["alg_name"] == "RFdecon"
    assert res["parameters"] == json_params
    spark_alg_id = res["alg_id"]
    # test mspass_map for dask
    ddb = daskbag.from_sequence(d)
    dask_res = ddb.mspass_map(
        RFdecon,
        alg="LeastSquares",
        pf=pfPath,
        object_history=True,
        global_history=self.manager,
        alg_name=None,
        parameters=None,
    ).compute()
    assert (manager_db["history_global"].count_documents(
        {"job_name": self.manager.job_name}) == 9)
    assert (manager_db["history_global"].count_documents(
        {"alg_id": spark_alg_id}) == 2)
    docs = manager_db["history_global"].find({"alg_id": spark_alg_id})
    assert docs[0]["job_id"] == docs[1]["job_id"] == self.manager.job_id
    assert docs[0]["job_name"] == docs[1][
        "job_name"] == self.manager.job_name
    assert docs[0]["alg_name"] == docs[1]["alg_name"] == "RFdecon"
    assert docs[0]["parameters"] == docs[1]["parameters"] == json_params
    # two separate runs -> distinct timestamps
    assert not docs[0]["time"] == docs[1]["time"]
    # same alg + parameters combination -> same alg_id
    ddb = daskbag.from_sequence(d)
    dask_res = ddb.mspass_map(
        RFdecon,
        alg="LeastSquares",
        pf=pfPath,
        object_history=True,
        global_history=self.manager,
        alg_name=None,
        parameters=None,
    ).compute()
    assert (manager_db["history_global"].count_documents(
        {"job_name": self.manager.job_name}) == 10)
    assert (manager_db["history_global"].count_documents(
        {"alg_id": spark_alg_id}) == 3)
    # SPARK test user provided alg_name and parameter(exist)
    spark_alg_name = "RFdecon"
    spark_alg_parameters = (
        "alg=LeastSquares, pf={pfPath}, object_history=True".format(
            pfPath=pfPath))
    data = spark_context.parallelize(d)
    spark_res = data.mspass_map(
        RFdecon,
        alg="LeastSquares",
        pf=pfPath,
        object_history=True,
        global_history=self.manager,
        alg_name=spark_alg_name,
        parameters=spark_alg_parameters,
    ).collect()
    assert (manager_db["history_global"].count_documents(
        {"job_name": self.manager.job_name}) == 11)
    assert (manager_db["history_global"].count_documents(
        {"alg_id": spark_alg_id}) == 4)
    # SPARK test user provided alg_name and parameter(new)
    spark_alg_name = "RFdecon_2"
    spark_alg_parameters = (
        "alg=LeastSquares, pf={pfPath}, object_history=True".format(
            pfPath=pfPath))
    data = spark_context.parallelize(d)
    spark_res = data.mspass_map(
        RFdecon,
        alg="LeastSquares",
        pf=pfPath,
        object_history=True,
        global_history=self.manager,
        alg_name=spark_alg_name,
        parameters=spark_alg_parameters,
    ).collect()
    assert (manager_db["history_global"].count_documents(
        {"job_name": self.manager.job_name}) == 12)
    assert (manager_db["history_global"].count_documents(
        {"alg_name": "RFdecon_2"}) == 1)
    res = manager_db["history_global"].find_one({"alg_name": "RFdecon_2"})
    assert res["job_id"] == self.manager.job_id
    assert res["job_name"] == self.manager.job_name
    assert res["alg_name"] == "RFdecon_2"
    assert res["parameters"] == json_params
    new_spark_alg_id = res["alg_id"]
    assert (manager_db["history_global"].count_documents(
        {"alg_id": new_spark_alg_id}) == 1)
    # DASK test user provided alg_name and parameter(exist)
    dask_alg_name = "RFdecon"
    dask_alg_parameters = (
        "alg=LeastSquares, pf={pfPath}, object_history=True".format(
            pfPath=pfPath))
    ddb = daskbag.from_sequence(d)
    dask_res = ddb.mspass_map(
        RFdecon,
        alg="LeastSquares",
        pf=pfPath,
        object_history=True,
        global_history=self.manager,
        alg_name=dask_alg_name,
        parameters=dask_alg_parameters,
    ).compute()
    assert (manager_db["history_global"].count_documents(
        {"job_name": self.manager.job_name}) == 13)
    assert (manager_db["history_global"].count_documents(
        {"alg_id": spark_alg_id}) == 5)
    # DASK test user provided alg_name and parameter(new)
    dask_alg_name = "RFdecon_3"
    dask_alg_parameters = (
        "alg=LeastSquares, pf={pfPath}, object_history=True".format(
            pfPath=pfPath))
    ddb = daskbag.from_sequence(d)
    dask_res = ddb.mspass_map(
        RFdecon,
        alg="LeastSquares",
        pf=pfPath,
        object_history=True,
        global_history=self.manager,
        alg_name=dask_alg_name,
        parameters=dask_alg_parameters,
    ).compute()
    assert (manager_db["history_global"].count_documents(
        {"job_name": self.manager.job_name}) == 14)
    assert (manager_db["history_global"].count_documents(
        {"alg_name": "RFdecon_3"}) == 1)
    res = manager_db["history_global"].find_one({"alg_name": "RFdecon_3"})
    assert res["job_id"] == self.manager.job_id
    assert res["job_name"] == self.manager.job_name
    assert res["alg_name"] == "RFdecon_3"
    assert res["parameters"] == json_params
    new_dask_alg_id = res["alg_id"]
    assert (manager_db["history_global"].count_documents(
        {"alg_id": new_dask_alg_id}) == 1)
def test_mspass_reduce(self, spark_context):
    """Exercise mspass_reduce on spark and dask: each distinct
    alg+parameters pair gets its own alg_id, repeats reuse it, and
    user-supplied alg_name/parameters are honored.

    NOTE(review): the history_global counts are cumulative within this
    test; the database is cleared only at the top.
    """
    manager_db = Database(self.client, "test_manager")
    manager_db["history_global"].delete_many({})
    l = [get_live_timeseries() for i in range(5)]
    # test mspass_reduce for spark
    spark_res = spark_reduce(l, self.manager, spark_context)
    assert (manager_db["history_global"].count_documents(
        {"job_name": self.manager.job_name}) == 1)
    assert manager_db["history_global"].count_documents(
        {"alg_name": "stack"}) == 1
    res = manager_db["history_global"].find_one({"alg_name": "stack"})
    assert res["job_id"] == self.manager.job_id
    assert res["job_name"] == self.manager.job_name
    assert res["alg_name"] == "stack"
    assert res["parameters"] == '{"object_history": "True", "alg_id": "2"}'
    spark_alg_id = res["alg_id"]
    # test mspass_reduce for dask
    dask_res = dask_reduce(l, self.manager)
    assert (manager_db["history_global"].count_documents(
        {"job_name": self.manager.job_name}) == 2)
    assert manager_db["history_global"].count_documents(
        {"alg_name": "stack"}) == 2
    assert (manager_db["history_global"].count_documents(
        {"alg_id": spark_alg_id}) == 1)
    docs = manager_db["history_global"].find({"alg_name": "stack"})
    # pick out the dask-created record (the one not matching spark's id)
    for doc in docs:
        if doc["alg_id"] == spark_alg_id:
            continue
        res = doc
    assert res["job_id"] == self.manager.job_id
    assert res["job_name"] == self.manager.job_name
    assert res["alg_name"] == "stack"
    assert res["parameters"] == '{"object_history": "True", "alg_id": "3"}'
    # different alg -> different alg_id
    assert not res["alg_id"] == spark_alg_id
    dask_alg_id = res["alg_id"]
    # same alg + parameters combination -> same alg_id
    dask_res = dask_reduce(l, self.manager)
    assert (manager_db["history_global"].count_documents(
        {"job_name": self.manager.job_name}) == 3)
    assert manager_db["history_global"].count_documents(
        {"alg_name": "stack"}) == 3
    assert (manager_db["history_global"].count_documents(
        {"alg_id": dask_alg_id}) == 2)
    docs = manager_db["history_global"].find({"alg_id": dask_alg_id})
    doc1 = docs[0]
    doc2 = docs[1]
    # same alg_id but separate runs -> distinct timestamps
    assert not doc1["time"] == doc2["time"]
    assert doc1["job_id"] == doc2["job_id"]
    assert doc1["job_name"] == doc2["job_name"]
    assert doc1["alg_name"] == doc2["alg_name"]
    assert doc1["parameters"] == doc2["parameters"]
    # SPARK test user provided alg_name and parameter(exist)
    spark_alg_name = "stack"
    spark_alg_parameters = "object_history=True,alg_id=2"
    spark_res = spark_reduce(
        l,
        self.manager,
        spark_context,
        alg_name=spark_alg_name,
        parameters=spark_alg_parameters,
    )
    assert (manager_db["history_global"].count_documents(
        {"job_name": self.manager.job_name}) == 4)
    assert manager_db["history_global"].count_documents(
        {"alg_name": "stack"}) == 4
    assert (manager_db["history_global"].count_documents({
        "alg_name": "stack",
        "parameters": '{"object_history": "True", "alg_id": "3"}',
    }) == 2)
    # SPARK test user provided alg_name and parameter(new)
    spark_alg_name = "new_stack"
    spark_alg_parameters = "object_history=True,alg_id=2"
    spark_res = spark_reduce(
        l,
        self.manager,
        spark_context,
        alg_name=spark_alg_name,
        parameters=spark_alg_parameters,
    )
    assert (manager_db["history_global"].count_documents(
        {"job_name": self.manager.job_name}) == 5)
    assert (manager_db["history_global"].count_documents(
        {"alg_name": "new_stack"}) == 1)
    res = manager_db["history_global"].find_one({"alg_name": "new_stack"})
    assert res["job_id"] == self.manager.job_id
    assert res["job_name"] == self.manager.job_name
    assert res["alg_name"] == "new_stack"
    assert res["parameters"] == '{"object_history": "True", "alg_id": "2"}'
    # DASK test user provided alg_name and parameter(exist)
    dask_alg_name = "stack"
    dask_alg_parameters = "object_history=True,alg_id=3"
    # NOTE(review): this calls dask_map (not dask_reduce) inside the reduce
    # test; the counts below are consistent with it, but confirm intent.
    dask_res = dask_map(l,
                        self.manager,
                        alg_name=dask_alg_name,
                        parameters=dask_alg_parameters)
    assert (manager_db["history_global"].count_documents(
        {"job_name": self.manager.job_name}) == 6)
    assert manager_db["history_global"].count_documents(
        {"alg_name": "stack"}) == 5
    assert (manager_db["history_global"].count_documents({
        "alg_name": "stack",
        "parameters": '{"object_history": "True", "alg_id": "3"}',
    }) == 3)
    # DASK test user provided alg_name and parameter(new)
    dask_alg_name = "new_stack"
    dask_alg_parameters = "object_history=True,alg_id=3"
    dask_res = dask_map(l,
                        self.manager,
                        alg_name=dask_alg_name,
                        parameters=dask_alg_parameters)
    assert (manager_db["history_global"].count_documents(
        {"job_name": self.manager.job_name}) == 7)
    assert (manager_db["history_global"].count_documents({
        "alg_name": "new_stack",
        "parameters": '{"object_history": "True", "alg_id": "3"}',
    }) == 1)
    res = manager_db["history_global"].find_one({
        "alg_name": "new_stack",
        "parameters": '{"object_history": "True", "alg_id": "3"}',
    })
    assert res["job_id"] == self.manager.job_id
    assert res["job_name"] == self.manager.job_name
    assert res["alg_name"] == "new_stack"
    assert res["parameters"] == '{"object_history": "True", "alg_id": "3"}'
def test_mspass_map(self, spark_context):
    """Exercise mspass_map on spark and dask (filter, save_data, read_data),
    verifying global history records, alg_id reuse, and per-object history.

    Fix: the four object/waveform count checks after save_data were bare
    comparisons (`... == 5`) whose results were discarded; they are now
    real assertions.

    NOTE(review): the history_global counts are cumulative within this
    test and depend on test ordering in the class.
    """
    l = [get_live_timeseries() for i in range(5)]
    # add net, sta, chan, loc to avoid metadata serialization problem
    for i in range(5):
        l[i]["chan"] = "HHZ"
        l[i]["loc"] = "test_loc"
        l[i]["net"] = "test_net"
        l[i]["sta"] = "test_sta"
        l[i].set_as_origin("test", "0", str(i), AtomicType.TIMESERIES)
    # test mspass_map for spark
    spark_res = spark_map(l, self.manager, spark_context)
    manager_db = Database(self.client, "test_manager")
    assert (manager_db["history_global"].count_documents(
        {"job_name": self.manager.job_name}) == 1)
    res = manager_db["history_global"].find_one(
        {"job_name": self.manager.job_name})
    assert res["job_id"] == self.manager.job_id
    assert res["job_name"] == self.manager.job_name
    assert res["alg_name"] == "filter"
    assert (
        res["parameters"] ==
        '{"arg_0": "bandpass", "freqmin": "1", "freqmax": "5", "object_history": "True"}'
    )
    spark_alg_id = res["alg_id"]
    # test mspass_map for dask
    dask_res = dask_map(l, self.manager)
    assert (manager_db["history_global"].count_documents(
        {"job_name": self.manager.job_name}) == 2)
    assert (manager_db["history_global"].count_documents(
        {"alg_id": spark_alg_id}) == 2)
    docs = manager_db["history_global"].find({"alg_id": spark_alg_id})
    assert docs[0]["job_id"] == docs[1]["job_id"] == self.manager.job_id
    assert docs[0]["job_name"] == docs[1][
        "job_name"] == self.manager.job_name
    assert docs[0]["alg_name"] == docs[1]["alg_name"] == "filter"
    assert (
        docs[0]["parameters"] == docs[1]["parameters"] ==
        '{"arg_0": "bandpass", "freqmin": "1", "freqmax": "5", "object_history": "True"}'
    )
    # two separate runs -> distinct timestamps
    assert not docs[0]["time"] == docs[1]["time"]
    # same alg + parameters combination -> same alg_id
    dask_res = dask_map(l, self.manager)
    assert (manager_db["history_global"].count_documents(
        {"job_name": self.manager.job_name}) == 3)
    assert (manager_db["history_global"].count_documents(
        {"alg_id": spark_alg_id}) == 3)
    # SPARK test user provided alg_name and parameter(exist)
    spark_alg_name = "filter"
    spark_alg_parameters = "bandpass,freqmin=1,freqmax=5,object_history=True"
    spark_res = spark_map(
        l,
        self.manager,
        spark_context,
        alg_name=spark_alg_name,
        parameters=spark_alg_parameters,
    )
    assert (manager_db["history_global"].count_documents(
        {"job_name": self.manager.job_name}) == 4)
    assert (manager_db["history_global"].count_documents(
        {"alg_id": spark_alg_id}) == 4)
    # SPARK test user provided alg_name and parameter(new)
    spark_alg_name = "new_filter"
    spark_alg_parameters = "bandpass,freqmin=1,freqmax=5,object_history=True"
    spark_res = spark_map(
        l,
        self.manager,
        spark_context,
        alg_name=spark_alg_name,
        parameters=spark_alg_parameters,
    )
    assert (manager_db["history_global"].count_documents(
        {"job_name": self.manager.job_name}) == 5)
    assert (manager_db["history_global"].count_documents(
        {"alg_name": "new_filter"}) == 1)
    res = manager_db["history_global"].find_one({"alg_name": "new_filter"})
    assert res["job_id"] == self.manager.job_id
    assert res["job_name"] == self.manager.job_name
    assert res["alg_name"] == "new_filter"
    assert (
        res["parameters"] ==
        '{"arg_0": "bandpass", "freqmin": "1", "freqmax": "5", "object_history": "True"}'
    )
    new_spark_alg_id = res["alg_id"]
    assert (manager_db["history_global"].count_documents(
        {"alg_id": new_spark_alg_id}) == 1)
    # DASK test user provided alg_name and parameter(exist)
    dask_alg_name = "filter"
    dask_alg_parameters = "bandpass,freqmin=1,freqmax=5,object_history=True"
    dask_res = dask_map(l,
                        self.manager,
                        alg_name=dask_alg_name,
                        parameters=dask_alg_parameters)
    assert (manager_db["history_global"].count_documents(
        {"job_name": self.manager.job_name}) == 6)
    assert (manager_db["history_global"].count_documents(
        {"alg_id": spark_alg_id}) == 5)
    # DASK test user provided alg_name and parameter(new)
    dask_alg_name = "new_filter_2"
    dask_alg_parameters = "bandpass,freqmin=1,freqmax=5,object_history=True"
    dask_res = dask_map(l,
                        self.manager,
                        alg_name=dask_alg_name,
                        parameters=dask_alg_parameters)
    assert (manager_db["history_global"].count_documents(
        {"job_name": self.manager.job_name}) == 7)
    assert (manager_db["history_global"].count_documents(
        {"alg_name": "new_filter_2"}) == 1)
    res = manager_db["history_global"].find_one(
        {"alg_name": "new_filter_2"})
    assert res["job_id"] == self.manager.job_id
    assert res["job_name"] == self.manager.job_name
    assert res["alg_name"] == "new_filter_2"
    assert (
        res["parameters"] ==
        '{"arg_0": "bandpass", "freqmin": "1", "freqmax": "5", "object_history": "True"}'
    )
    new_dask_alg_id = res["alg_id"]
    assert (manager_db["history_global"].count_documents(
        {"alg_id": new_dask_alg_id}) == 1)
    manager_db["history_object"].delete_many({})
    # test spark mspass_map for save_data
    data = spark_context.parallelize(l)
    data_map = data.mspass_map(manager_db.save_data,
                               global_history=self.manager)
    save_list = data_map.collect()
    assert (manager_db["history_global"].count_documents(
        {"job_name": self.manager.job_name}) == 8)
    assert (manager_db["history_global"].count_documents(
        {"alg_name": "save_data"}) == 1)
    # check object history after save_data
    # BUGFIX: these two checks were bare comparisons whose results were
    # silently discarded; they are now asserted.
    assert manager_db["history_object"].count_documents({}) == 5
    assert manager_db["wf_TimeSeries"].count_documents({}) == 5
    history_object_docs = manager_db["history_object"].find({})
    idx = 0
    doc_alg_id = None
    doc_ids = []
    for doc in history_object_docs:
        # every object history doc from one save run shares one alg_id
        if not doc_alg_id:
            doc_alg_id = doc["alg_id"]
        else:
            assert doc_alg_id == doc["alg_id"]
        doc_ids.append(doc["_id"])
        assert doc["alg_name"] == "save_data"
        idx += 1
    assert sorted(doc_ids) == ["0", "1", "2", "3", "4"]
    # test spark mspass_map for read_data
    save_l = [res[1] for res in save_list]
    data = spark_context.parallelize(save_l)
    data_map = data.mspass_map(manager_db.read_data,
                               global_history=self.manager)
    read_list = data_map.collect()
    assert (manager_db["history_global"].count_documents(
        {"job_name": self.manager.job_name}) == 9)
    assert (manager_db["history_global"].count_documents(
        {"alg_name": "read_data"}) == 1)
    manager_db["history_object"].delete_many({})
    manager_db["wf_TimeSeries"].delete_many({})
    # test dask mspass_map for save_data
    data = daskbag.from_sequence(l)
    data_map = data.mspass_map(manager_db.save_data,
                               global_history=self.manager)
    save_list = data_map.compute()
    assert (manager_db["history_global"].count_documents(
        {"job_name": self.manager.job_name}) == 10)
    assert (manager_db["history_global"].count_documents(
        {"alg_name": "save_data"}) == 2)
    res = manager_db["history_global"].find({"alg_name": "save_data"})
    assert res[0]["job_id"] == res[1]["job_id"] == self.manager.job_id
    assert res[0]["job_name"] == res[1]["job_name"] == self.manager.job_name
    assert res[0]["alg_name"] == res[1]["alg_name"] == "save_data"
    assert (res[0]["parameters"] == res[1]["parameters"] ==
            '{"object_history": "False"}')
    assert res[0]["alg_id"] == res[1]["alg_id"]
    # check object history after save_data
    # BUGFIX: same missing-assert defect as the spark section above.
    assert manager_db["history_object"].count_documents({}) == 5
    assert manager_db["wf_TimeSeries"].count_documents({}) == 5
    history_object_docs = manager_db["history_object"].find({})
    idx = 0
    doc_alg_id = None
    doc_ids = []
    for doc in history_object_docs:
        if not doc_alg_id:
            doc_alg_id = doc["alg_id"]
        else:
            assert doc_alg_id == doc["alg_id"]
        doc_ids.append(doc["_id"])
        assert doc["alg_name"] == "save_data"
        idx += 1
    assert sorted(doc_ids) == ["0", "1", "2", "3", "4"]
    # test dask mspass_map for read_data
    save_l = [res[1] for res in save_list]
    data = daskbag.from_sequence(save_l)
    data_map = data.mspass_map(manager_db.read_data,
                               global_history=self.manager)
    read_list = data_map.compute()
    assert (manager_db["history_global"].count_documents(
        {"job_name": self.manager.job_name}) == 11)
    assert (manager_db["history_global"].count_documents(
        {"alg_name": "read_data"}) == 2)
    res = manager_db["history_global"].find({"alg_name": "read_data"})
    assert res[0]["job_id"] == res[1]["job_id"] == self.manager.job_id
    assert res[0]["job_name"] == res[1]["job_name"] == self.manager.job_name
    assert res[0]["alg_name"] == res[1]["alg_name"] == "read_data"
    assert (res[0]["parameters"] == res[1]["parameters"] ==
            '{"object_history": "False"}')
    assert res[0]["alg_id"] == res[1]["alg_id"]
class TestDBClean():
    """Tests for the dbclean command-line tool (single-quote legacy variant).

    setup_class seeds a 'test_dbclean' database with one site, channel and
    source document and attaches their ObjectIds to a live TimeSeries used
    as the template for every test.  Requires a MongoDB server on localhost.
    """

    def setup_class(self):
        client = Client('localhost')
        self.db = Database(client, 'test_dbclean')
        self.test_ts = get_live_timeseries()
        site_id = ObjectId()
        channel_id = ObjectId()
        source_id = ObjectId()
        self.db['site'].insert_one({
            '_id': site_id,
            'net': 'net',
            'sta': 'sta',
            'loc': 'loc',
            'lat': 1.0,
            'lon': 1.0,
            'elev': 2.0,
            'starttime': datetime.utcnow().timestamp(),
            'endtime': datetime.utcnow().timestamp()
        })
        self.db['channel'].insert_one({
            '_id': channel_id,
            'net': 'net1',
            'sta': 'sta1',
            'loc': 'loc1',
            'chan': 'chan',
            'lat': 1.1,
            'lon': 1.1,
            'elev': 2.1,
            'starttime': datetime.utcnow().timestamp(),
            'endtime': datetime.utcnow().timestamp(),
            'edepth': 3.0,
            'vang': 1.0,
            'hang': 1.0
        })
        self.db['source'].insert_one({
            '_id': source_id,
            'lat': 1.2,
            'lon': 1.2,
            'time': datetime.utcnow().timestamp(),
            'depth': 3.1,
            'magnitude': 1.0
        })
        # cross-reference the normalized documents from the template waveform
        self.test_ts['site_id'] = site_id
        self.test_ts['source_id'] = source_id
        self.test_ts['channel_id'] = channel_id

    def test_rename_list_to_dict(self):
        """rename_list_to_dict parses 'old:new' pairs; a malformed entry exits -1."""
        rlist = ['a:1', 'b:2', 'c:3']
        result = dbclean.rename_list_to_dict(rlist)
        assert len(result) == 3
        assert result == {'a': '1', 'b': '2', 'c': '3'}
        # an entry with more than one ':' is a usage error
        rlist = ['a:1:2']
        with pytest.raises(SystemExit) as e:
            dbclean.rename_list_to_dict(rlist)
        assert e.type == SystemExit
        assert e.value.code == -1

    def test_main(self):
        """End-to-end test of dbclean.main: type fixes, deletion and renaming."""
        self.db['wf_TimeSeries'].delete_many({})
        ts1 = copy.deepcopy(self.test_ts)
        ts2 = copy.deepcopy(self.test_ts)
        ts3 = copy.deepcopy(self.test_ts)
        logging_helper.info(ts1, 'deepcopy', '1')
        logging_helper.info(ts2, 'deepcopy', '1')
        logging_helper.info(ts3, 'deepcopy', '1')
        # plant type mismatches: npts/delta stored as strings; ts3 unfixable
        ts1['npts'] = '123'
        ts2['delta'] = '3'
        ts3['npts'] = 'xyz'
        save_res_code = self.db.save_data(ts1,
                                          mode='promiscuous',
                                          storage_mode='gridfs',
                                          exclude_keys=['extra2'])
        save_res_code = self.db.save_data(ts2,
                                          mode='promiscuous',
                                          storage_mode='gridfs',
                                          exclude_keys=['extra2'])
        save_res_code = self.db.save_data(ts3,
                                          mode='promiscuous',
                                          storage_mode='gridfs',
                                          exclude_keys=['extra2'])
        # no clean operation requested -> usage error, exit(-1)
        with pytest.raises(SystemExit) as e:
            dbclean.main(['test_dbclean', 'wf_TimeSeries'])
        assert e.type == SystemExit
        assert e.value.code == -1
        # fix types, delete starttime, rename calib to rename_calib
        dbclean.main([
            'test_dbclean', 'wf_TimeSeries', '-ft', '-d', 'starttime', '-r',
            'calib:rename_calib'
        ])
        res1 = self.db['wf_TimeSeries'].find_one({'_id': ts1['_id']})
        res2 = self.db['wf_TimeSeries'].find_one({'_id': ts2['_id']})
        res3 = self.db['wf_TimeSeries'].find_one({'_id': ts3['_id']})
        assert res1['npts'] == 123
        assert 'starttime' not in res1
        assert 'calib' not in res1
        assert 'rename_calib' in res1
        assert res2['delta'] == 3.0
        assert 'starttime' not in res2
        assert 'calib' not in res2
        assert 'rename_calib' in res2
        # 'xyz' cannot be converted to int so the value is left untouched
        assert res3['npts'] == 'xyz'
        assert 'starttime' not in res3
        assert 'calib' not in res3
        assert 'rename_calib' in res3
        # second pass: fresh documents, run dbclean with type fixing only
        self.db['wf_TimeSeries'].delete_many({})
        ts1 = copy.deepcopy(self.test_ts)
        ts2 = copy.deepcopy(self.test_ts)
        ts3 = copy.deepcopy(self.test_ts)
        logging_helper.info(ts1, 'deepcopy', '1')
        logging_helper.info(ts2, 'deepcopy', '1')
        logging_helper.info(ts3, 'deepcopy', '1')
        ts1['npts'] = '123'
        ts2['delta'] = '3'
        ts3['npts'] = 'xyz'
        save_res_code = self.db.save_data(ts1,
                                          mode='promiscuous',
                                          storage_mode='gridfs',
                                          exclude_keys=['extra2'])
        save_res_code = self.db.save_data(ts2,
                                          mode='promiscuous',
                                          storage_mode='gridfs',
                                          exclude_keys=['extra2'])
        save_res_code = self.db.save_data(ts3,
                                          mode='promiscuous',
                                          storage_mode='gridfs',
                                          exclude_keys=['extra2'])
        # only fix types
        dbclean.main(['test_dbclean', 'wf_TimeSeries', '-ft'])
        # BUG FIX: re-query the freshly saved documents.  The original code
        # asserted on the stale res1/res2/res3 fetched before delete_many,
        # which made these assertions pass vacuously.
        res1 = self.db['wf_TimeSeries'].find_one({'_id': ts1['_id']})
        res2 = self.db['wf_TimeSeries'].find_one({'_id': ts2['_id']})
        res3 = self.db['wf_TimeSeries'].find_one({'_id': ts3['_id']})
        assert res1['npts'] == 123
        assert res2['delta'] == 3.0
        # 'xyz' still cannot be fixed
        assert res3['npts'] == 'xyz'
def __init__(
    self,
    database_host=None,
    scheduler=None,
    scheduler_host=None,
    job_name="mspass",
    database_name="mspass",
    schema=None,
    collection=None,
):
    """
    Construct the client: validate arguments, connect to MongoDB, create a
    GlobalHistoryManager, and attach a dask or spark scheduler.

    Configuration resolution order for every setting is
    parameter -> environment variable -> built-in default.

    :param database_host: MongoDB host, optionally "host:port"
    :param scheduler: "dask" or "spark"
    :param scheduler_host: scheduler address, optionally with port
    :param job_name: label stored with global history records
    :param database_name: default database name
    :param schema: optional schema handed to Database (db_schema)
    :param collection: optional collection name for the history manager
    :raises MsPASSError: ("Fatal") on bad argument types/values or when the
        database/scheduler connection cannot be established
    """
    # argument validation: each optional argument must be a string
    # (scheduler additionally restricted to "dask" or "spark")
    if database_host is not None and not type(database_host) is str:
        raise MsPASSError(
            "database_host should be a string but "
            + str(type(database_host))
            + " is found.",
            "Fatal",
        )
    if scheduler is not None and scheduler != "dask" and scheduler != "spark":
        raise MsPASSError(
            "scheduler should be either dask or spark but "
            + str(scheduler)
            + " is found.",
            "Fatal",
        )
    if scheduler_host is not None and not type(scheduler_host) is str:
        raise MsPASSError(
            "scheduler_host should be a string but "
            + str(type(scheduler_host))
            + " is found.",
            "Fatal",
        )
    if job_name is not None and not type(job_name) is str:
        raise MsPASSError(
            "job_name should be a string but " + str(type(job_name)) + " is found.",
            "Fatal",
        )
    if database_name is not None and not type(database_name) is str:
        raise MsPASSError(
            "database_name should be a string but "
            + str(type(database_name))
            + " is found.",
            "Fatal",
        )
    # collection should be a string
    if collection is not None and type(collection) is not str:
        raise MsPASSError(
            "collection should be a string but " + str(type(collection)) + " is found.",
            "Fatal",
        )
    # environment variable overrides (all may be None)
    MSPASS_DB_ADDRESS = os.environ.get("MSPASS_DB_ADDRESS")
    MONGODB_PORT = os.environ.get("MONGODB_PORT")
    MSPASS_SCHEDULER = os.environ.get("MSPASS_SCHEDULER")
    MSPASS_SCHEDULER_ADDRESS = os.environ.get("MSPASS_SCHEDULER_ADDRESS")
    DASK_SCHEDULER_PORT = os.environ.get("DASK_SCHEDULER_PORT")
    SPARK_MASTER_PORT = os.environ.get("SPARK_MASTER_PORT")
    # create a database client
    # priority: parameter -> env -> default
    database_host_has_port = False
    if database_host:
        database_address = database_host
        # check if database_host contains port number already
        if ":" in database_address:
            database_host_has_port = True
    elif MSPASS_DB_ADDRESS:
        database_address = MSPASS_DB_ADDRESS
    else:
        database_address = "localhost"
    # append MONGODB_PORT only when the address does not already carry a port
    if not database_host_has_port and MONGODB_PORT:
        database_address += ":" + MONGODB_PORT
    # connect and ping (server_info) to fail fast on a bad address
    # NOTE(review): "Runntime" below is a typo in the runtime error message;
    # left unchanged here because it is program output, not a comment.
    try:
        self._db_client = DBClient(database_address)
        self._db_client.server_info()
    except Exception as err:
        raise MsPASSError(
            "Runntime error: cannot create a database client with: "
            + database_address,
            "Fatal",
        )
    # set default database name
    self._default_database_name = database_name
    self._default_schema = schema
    self._default_collection = collection
    # create a Global History Manager (schema is only passed when given)
    if schema:
        global_history_manager_db = Database(self._db_client,
                                             database_name,
                                             db_schema=schema)
    else:
        global_history_manager_db = Database(self._db_client, database_name)
    self._global_history_manager = GlobalHistoryManager(
        global_history_manager_db, job_name, collection=collection)
    # set scheduler: parameter -> env -> default "dask"
    if scheduler:
        self._scheduler = scheduler
    elif MSPASS_SCHEDULER:
        self._scheduler = MSPASS_SCHEDULER
    else:
        self._scheduler = "dask"
    # scheduler configuration
    if self._scheduler == "spark":
        scheduler_host_has_port = False
        if scheduler_host:
            self._spark_master_url = scheduler_host
            # add spark:// prefix if not exist
            if "spark://" not in scheduler_host:
                self._spark_master_url = "spark://" + self._spark_master_url
            # two colons means "spark://host:port", i.e. a port is present
            if self._spark_master_url.count(":") == 2:
                scheduler_host_has_port = True
        elif MSPASS_SCHEDULER_ADDRESS:
            self._spark_master_url = MSPASS_SCHEDULER_ADDRESS
            # add spark:// prefix if not exist
            if "spark://" not in MSPASS_SCHEDULER_ADDRESS:
                self._spark_master_url = "spark://" + self._spark_master_url
        else:
            self._spark_master_url = "local"
        # add port number only when:
        # 1. not the default 'local'
        # 2. the host was given and does not contain a port number
        # 3. SPARK_MASTER_PORT exists
        if ((scheduler_host or MSPASS_SCHEDULER_ADDRESS)
                and not scheduler_host_has_port and SPARK_MASTER_PORT):
            self._spark_master_url += ":" + SPARK_MASTER_PORT
        # sanity check: actually build a SparkSession against the URL
        try:
            spark = (SparkSession.builder.appName("mspass").master(
                self._spark_master_url).getOrCreate())
            self._spark_context = spark.sparkContext
        except Exception as err:
            raise MsPASSError(
                "Runntime error: cannot create a spark configuration with: "
                + self._spark_master_url,
                "Fatal",
            )
    elif self._scheduler == "dask":
        # if no defined scheduler_host and no MSPASS_SCHEDULER_ADDRESS,
        # use a local cluster to create a client
        if not scheduler_host and not MSPASS_SCHEDULER_ADDRESS:
            self._dask_client = DaskClient()
        else:
            scheduler_host_has_port = False
            # set host
            if scheduler_host:
                self._dask_client_address = scheduler_host
                # check if scheduler_host contains port number already
                if ":" in scheduler_host:
                    scheduler_host_has_port = True
            else:
                self._dask_client_address = MSPASS_SCHEDULER_ADDRESS
            # add port
            if not scheduler_host_has_port and DASK_SCHEDULER_PORT:
                self._dask_client_address += ":" + DASK_SCHEDULER_PORT
            else:
                # use port 8786 by default if not specified
                self._dask_client_address += ":8786"
            # sanity check: connect a dask client to the address
            try:
                self._dask_client = DaskClient(self._dask_client_address)
            except Exception as err:
                raise MsPASSError(
                    "Runntime error: cannot create a dask client with: "
                    + self._dask_client_address,
                    "Fatal",
                )
class TestDBClean:
    """Tests for the dbclean command-line tool.

    setup_class seeds a "test_dbclean" database with one site, channel and
    source document and attaches their ObjectIds to a live TimeSeries used
    as the template for every test.  Requires a MongoDB server on localhost.
    """

    def setup_class(self):
        client = DBClient("localhost")
        self.db = Database(client, "test_dbclean")
        self.test_ts = get_live_timeseries()
        site_id = ObjectId()
        channel_id = ObjectId()
        source_id = ObjectId()
        self.db["site"].insert_one({
            "_id": site_id,
            "net": "net",
            "sta": "sta",
            "loc": "loc",
            "lat": 1.0,
            "lon": 1.0,
            "elev": 2.0,
            "starttime": datetime.utcnow().timestamp(),
            "endtime": datetime.utcnow().timestamp(),
        })
        self.db["channel"].insert_one({
            "_id": channel_id,
            "net": "net1",
            "sta": "sta1",
            "loc": "loc1",
            "chan": "chan",
            "lat": 1.1,
            "lon": 1.1,
            "elev": 2.1,
            "starttime": datetime.utcnow().timestamp(),
            "endtime": datetime.utcnow().timestamp(),
            "edepth": 3.0,
            "vang": 1.0,
            "hang": 1.0,
        })
        self.db["source"].insert_one({
            "_id": source_id,
            "lat": 1.2,
            "lon": 1.2,
            "time": datetime.utcnow().timestamp(),
            "depth": 3.1,
            "magnitude": 1.0,
        })
        # cross-reference the normalized documents from the template waveform
        self.test_ts["site_id"] = site_id
        self.test_ts["source_id"] = source_id
        self.test_ts["channel_id"] = channel_id

    def test_rename_list_to_dict(self):
        """rename_list_to_dict parses "old:new" pairs; a malformed entry exits -1."""
        rlist = ["a:1", "b:2", "c:3"]
        result = dbclean.rename_list_to_dict(rlist)
        assert len(result) == 3
        assert result == {"a": "1", "b": "2", "c": "3"}
        # an entry with more than one ':' is a usage error
        rlist = ["a:1:2"]
        with pytest.raises(SystemExit) as e:
            dbclean.rename_list_to_dict(rlist)
        assert e.type == SystemExit
        assert e.value.code == -1

    def test_main(self):
        """End-to-end test of dbclean.main: type fixes, deletion and renaming."""
        self.db["wf_TimeSeries"].delete_many({})
        ts1 = copy.deepcopy(self.test_ts)
        ts2 = copy.deepcopy(self.test_ts)
        ts3 = copy.deepcopy(self.test_ts)
        logging_helper.info(ts1, "1", "deepcopy")
        logging_helper.info(ts2, "1", "deepcopy")
        logging_helper.info(ts3, "1", "deepcopy")
        # plant type mismatches: npts/delta stored as strings; ts3 unfixable
        ts1["npts"] = "123"
        ts2["delta"] = "3"
        ts3["npts"] = "xyz"
        save_res_code = self.db.save_data(ts1,
                                          mode="promiscuous",
                                          storage_mode="gridfs",
                                          exclude_keys=["extra2"])
        save_res_code = self.db.save_data(ts2,
                                          mode="promiscuous",
                                          storage_mode="gridfs",
                                          exclude_keys=["extra2"])
        save_res_code = self.db.save_data(ts3,
                                          mode="promiscuous",
                                          storage_mode="gridfs",
                                          exclude_keys=["extra2"])
        # no clean operation requested -> usage error, exit(-1)
        with pytest.raises(SystemExit) as e:
            dbclean.main(["test_dbclean", "wf_TimeSeries"])
        assert e.type == SystemExit
        assert e.value.code == -1
        # fix types, delete starttime, rename calib to rename_calib
        dbclean.main([
            "test_dbclean",
            "wf_TimeSeries",
            "-ft",
            "-d",
            "starttime",
            "-r",
            "calib:rename_calib",
        ])
        res1 = self.db["wf_TimeSeries"].find_one({"_id": ts1["_id"]})
        res2 = self.db["wf_TimeSeries"].find_one({"_id": ts2["_id"]})
        res3 = self.db["wf_TimeSeries"].find_one({"_id": ts3["_id"]})
        assert res1["npts"] == 123
        assert "starttime" not in res1
        assert "calib" not in res1
        assert "rename_calib" in res1
        assert res2["delta"] == 3.0
        assert "starttime" not in res2
        assert "calib" not in res2
        assert "rename_calib" in res2
        # "xyz" cannot be converted to int so the value is left untouched
        assert res3["npts"] == "xyz"
        assert "starttime" not in res3
        assert "calib" not in res3
        assert "rename_calib" in res3
        # second pass: fresh documents, run dbclean with type fixing only
        self.db["wf_TimeSeries"].delete_many({})
        ts1 = copy.deepcopy(self.test_ts)
        ts2 = copy.deepcopy(self.test_ts)
        ts3 = copy.deepcopy(self.test_ts)
        logging_helper.info(ts1, "1", "deepcopy")
        logging_helper.info(ts2, "1", "deepcopy")
        logging_helper.info(ts3, "1", "deepcopy")
        ts1["npts"] = "123"
        ts2["delta"] = "3"
        ts3["npts"] = "xyz"
        save_res_code = self.db.save_data(ts1,
                                          mode="promiscuous",
                                          storage_mode="gridfs",
                                          exclude_keys=["extra2"])
        save_res_code = self.db.save_data(ts2,
                                          mode="promiscuous",
                                          storage_mode="gridfs",
                                          exclude_keys=["extra2"])
        save_res_code = self.db.save_data(ts3,
                                          mode="promiscuous",
                                          storage_mode="gridfs",
                                          exclude_keys=["extra2"])
        # only fix types
        dbclean.main(["test_dbclean", "wf_TimeSeries", "-ft"])
        # BUG FIX: re-query the freshly saved documents.  The original code
        # asserted on the stale res1/res2/res3 fetched before delete_many,
        # which made these assertions pass vacuously.
        res1 = self.db["wf_TimeSeries"].find_one({"_id": ts1["_id"]})
        res2 = self.db["wf_TimeSeries"].find_one({"_id": ts2["_id"]})
        res3 = self.db["wf_TimeSeries"].find_one({"_id": ts3["_id"]})
        assert res1["npts"] == 123
        assert res2["delta"] == 3.0
        # "xyz" still cannot be fixed
        assert res3["npts"] == "xyz"
def main(args=None):
    """Command-line entry point for dbclean.

    Parses arguments of the form::

        dbclean dbname collection [-ft] [-d k1 ...] [-r kold:knew ...] [-v]

    and applies the requested repairs to every document of *collection* in
    *dbname*: attribute deletion (-d), attribute renaming (-r) and automatic
    type-mismatch fixing (-ft), in that order so that renamed/deleted keys
    never reach the type check.

    :param args: argument list; defaults to sys.argv[1:] when None
    :raises SystemExit: code -1 when no clean operation was requested
        (argparse itself exits with code 2 on malformed arguments)
    """
    if args is None:
        args = sys.argv[1:]
    parser = argparse.ArgumentParser(
        prog="dbclean",
        usage=
        "%(prog)s dbname collection [-ft] [-d k1 ...] [-r kold:knew ... ] [-v] [-h]",
        description="MsPASS program to fix most errors detected by dbverify",
    )
    parser.add_argument("dbname",
                        metavar="dbname",
                        type=str,
                        help="MongoDB database name to be fixed")
    parser.add_argument(
        "collection",
        metavar="collection",
        type=str,
        help="MongoDB collection name to be fixed",
    )
    parser.add_argument(
        "-ft",
        "--fixtypes",
        action="store_true",
        help="Enable automatic type mismatch repair",
    )
    parser.add_argument(
        "-d",
        "--delete",
        nargs="*",
        default=[],
        help="List of keys of key-value pairs to be deleted from all documents",
    )
    parser.add_argument(
        "-r",
        "--rename",
        nargs="*",
        default=[],
        help=
        "Change the keys of documents using pattern defined in args of form oldkey:newkey",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="When used be echo each fix - default works silently",
    )
    args = parser.parse_args(args)
    dbname = args.dbname
    collection = args.collection
    fixtypes = args.fixtypes
    delete = args.delete
    rename = args.rename
    verbose = args.verbose
    # argparse guarantees -d/-r default to [] (nargs="*"), so truthiness of
    # the lists is exactly "the user asked for this operation"
    enable_deletion = bool(delete)
    enable_rename = bool(rename)
    if not (fixtypes or enable_deletion or enable_rename):
        print("Usage error: you must define at least one clean operation")
        print("Type: dbclean --help to get usage help")
        # sys.exit instead of the site-provided exit(); same SystemExit(-1)
        sys.exit(-1)
    if enable_rename:
        rename_map = rename_list_to_dict(rename)
    dbclient = DBClient()
    db = Database(dbclient, dbname)
    print("Starting processing of ", collection,
          " collection of database named=", dbname)
    # Intentionally do the delete and rename operations before
    # a type check to allow cleaning any keys.  The dicts returned below
    # accumulate counts of edits for each key.
    if enable_deletion:
        delcounts = db._delete_attributes(collection, delete, verbose=verbose)
        print("delete processing completed on collection=", collection)
        print("Number of documents changed for each key requested follow:")
        print(json_util.dumps(delcounts, indent=4))
    if enable_rename:
        repcounts = db._rename_attributes(collection,
                                          rename_map,
                                          verbose=verbose)
        print("rename processing completed on collection=", collection)
        print("Here is the set of changes requested:")
        print(json_util.dumps(rename_map))
        print("Number of documents changed for each key requested follow:")
        print(json_util.dumps(repcounts, indent=4))
    if fixtypes:
        fixcounts = db._fix_attribute_types(collection, verbose=verbose)
        print("fixtype processing completed on collection=", collection)
        print("Keys of documents changed and number changed follow:")
        print(json_util.dumps(fixcounts, indent=4))
class TestDBVerify:
    """Tests for the dbverify command-line tool.

    setup_class seeds a "test_dbverify" database with one site, channel and
    source document and attaches their ObjectIds to a live TimeSeries used
    as the template for the test.  Requires a MongoDB server on localhost.
    """

    def setup_class(self):
        client = DBClient("localhost")
        self.db = Database(client, "test_dbverify")
        self.test_ts = get_live_timeseries()
        site_id = ObjectId()
        channel_id = ObjectId()
        source_id = ObjectId()
        self.db["site"].insert_one(
            {
                "_id": site_id,
                "net": "net",
                "sta": "sta",
                "loc": "loc",
                "lat": 1.0,
                "lon": 1.0,
                "elev": 2.0,
                "starttime": datetime.utcnow().timestamp(),
                "endtime": datetime.utcnow().timestamp(),
            }
        )
        self.db["channel"].insert_one(
            {
                "_id": channel_id,
                "net": "net1",
                "sta": "sta1",
                "loc": "loc1",
                "chan": "chan",
                "lat": 1.1,
                "lon": 1.1,
                "elev": 2.1,
                "starttime": datetime.utcnow().timestamp(),
                "endtime": datetime.utcnow().timestamp(),
                "edepth": 3.0,
                "vang": 1.0,
                "hang": 1.0,
            }
        )
        self.db["source"].insert_one(
            {
                "_id": source_id,
                "lat": 1.2,
                "lon": 1.2,
                "time": datetime.utcnow().timestamp(),
                "depth": 3.1,
                "magnitude": 1.0,
            }
        )
        # cross-reference the normalized documents from the template waveform
        self.test_ts["site_id"] = site_id
        self.test_ts["source_id"] = source_id
        self.test_ts["channel_id"] = channel_id

    def test_main(self, capfd):
        """Run each dbverify mode and compare its stdout byte-for-byte.

        Uses pytest's capfd fixture to capture file-descriptor level output.
        Three documents are crafted so every test mode finds known problems:
        type mismatches, undefined/extra keys, a broken site_id link, a
        missing source_id link and a missing required starttime.
        """
        self.db["wf_TimeSeries"].delete_many({})
        ts1 = copy.deepcopy(self.test_ts)
        ts2 = copy.deepcopy(self.test_ts)
        ts3 = copy.deepcopy(self.test_ts)
        logging_helper.info(ts1, "1", "deepcopy")
        logging_helper.info(ts2, "1", "deepcopy")
        logging_helper.info(ts3, "1", "deepcopy")
        # plant type mismatches and keys not defined in the schema
        ts1["npts"] = "123"
        ts1["extra1"] = "extra1"
        ts2["delta"] = "3"
        ts2["extra2"] = "extra2"
        ts3["npts"] = "xyz"
        ts3["extra2"] = "extra2"
        # wrong normalized key: ts1 points at a nonexistent site document,
        # ts2 has no source_id at all
        ts1["site_id"] = ObjectId()
        ts2.erase("source_id")
        save_res_code = self.db.save_data(
            ts1, mode="promiscuous", storage_mode="gridfs"
        )
        save_res_code = self.db.save_data(
            ts2, mode="promiscuous", storage_mode="gridfs"
        )
        # erase required attributes
        save_res_code = self.db.save_data(
            ts3, mode="promiscuous", storage_mode="gridfs", exclude_keys=["starttime"]
        )
        # fetch the saved documents; their json dumps appear verbatim in the
        # verbose-mode output asserted below
        doc1 = self.db["wf_TimeSeries"].find_one({"_id": ts1["_id"]})
        doc2 = self.db["wf_TimeSeries"].find_one({"_id": ts2["_id"]})
        doc3 = self.db["wf_TimeSeries"].find_one({"_id": ts3["_id"]})
        doc1_str = json_util.dumps(doc1, indent=2)
        doc2_str = json_util.dumps(doc2, indent=2)
        doc3_str = json_util.dumps(doc3, indent=2)
        # default normalization test
        dbverify.main(["test_dbverify", "-t", "normalization"])
        out, err = capfd.readouterr()
        assert (
            out
            == "normalization test on normalized key= site_id found problems\nFound broken links in 1 documents checked\nNote error count limit= 1000\nIf the count is the same it means all data probably contain missing cross referencing ids\nRun in verbose mode to find out more information you will need to fix the problem\ncheck_links found no broken links with normalized key= channel_id\ncheck_links found no broken links with normalized key= source_id\n"
        )
        # more than 1 collection to test: only the first should be used
        dbverify.main(
            ["test_dbverify", "-t", "normalization", "-c", "wf_TimeSeries", "site"]
        )
        out, err = capfd.readouterr()
        assert (
            out
            == "WARNING: normalization test can only be run on one collection at a time\nParsed a list with the following contents: ['wf_TimeSeries', 'site']\nRunning test on the first item in that list\nnormalization test on normalized key= site_id found problems\nFound broken links in 1 documents checked\nNote error count limit= 1000\nIf the count is the same it means all data probably contain missing cross referencing ids\nRun in verbose mode to find out more information you will need to fix the problem\ncheck_links found no broken links with normalized key= channel_id\ncheck_links found no broken links with normalized key= source_id\n"
        )
        # verbose mode: the offending documents are echoed in full
        dbverify.main(["test_dbverify", "-t", "normalization", "-v"])
        out, err = capfd.readouterr()
        assert (
            out
            == "check_link found the following docs in wf_TimeSeries with broken links to site_id\n////////////////Doc number 1 with error///////////////\n"
            + doc1_str
            + "\n////////////////////////////////////////////////////////\ncheck_links found no undefined linking key to normalized key= site_id\ncheck_links found no broken links with normalized key= channel_id\ncheck_links found no undefined linking key to normalized key= channel_id\ncheck_links found no broken links with normalized key= source_id\ncheck_link found the following docs in wf_TimeSeries with undefined link keys to source_id\n////////////////Doc number 1 with error///////////////\n"
            + doc2_str
            + "\n////////////////////////////////////////////////////////\n"
        )
        # default required test: npts broken twice, delta once; one document
        # is missing the required starttime
        dbverify.main(["test_dbverify", "-t", "required"])
        out, err = capfd.readouterr()
        mmkeys = {"npts": 2, "delta": 1}
        mm_keys_str = json_util.dumps(mmkeys, indent=2)
        undef_keys = {"starttime": 1}
        undef_keys_str = json_util.dumps(undef_keys, indent=2)
        assert (
            out
            == "////Results from run_check_required on collection= wf_TimeSeries\nCollection found 3 documents with type inconsistencies\nOffending keys and number found follow:\n"
            + mm_keys_str
            + "\nCollection found 1 documents with required keys that were not defined\nOffending keys and number found follow:\n"
            + undef_keys_str
            + "\n"
        )
        # default schema_check test: extra1 appears once, extra2 twice
        dbverify.main(["test_dbverify", "-t", "schema_check"])
        out, err = capfd.readouterr()
        mmkeys = {"npts": 2, "delta": 1}
        mm_keys_str = json_util.dumps(mmkeys, indent=2)
        undef_keys = {"extra1": 1, "extra2": 2}
        undef_keys_str = json_util.dumps(undef_keys, indent=2)
        assert (
            out
            == "check_attribute_types result for collection= wf_TimeSeries\nCollection found 3 documents with type inconsistencies\nOffending keys and number found follow:\n"
            + mm_keys_str
            + "\nCollection found 3 documents with keys not defined in the schema\nOffending keys and number found follow:\n"
            + undef_keys_str
            + "\n"
        )