Example #1
    def setup_class(self):
        client = Client('localhost')
        self.db = Database(client, 'test_dbclean')

        self.test_ts = get_live_timeseries()
        site_id = ObjectId()
        channel_id = ObjectId()
        source_id = ObjectId()
        self.db['site'].insert_one({
            '_id': site_id,
            'net': 'net',
            'sta': 'sta',
            'loc': 'loc',
            'lat': 1.0,
            'lon': 1.0,
            'elev': 2.0,
            'starttime': datetime.utcnow().timestamp(),
            'endtime': datetime.utcnow().timestamp()
        })
        self.db['channel'].insert_one({
            '_id': channel_id,
            'net': 'net1',
            'sta': 'sta1',
            'loc': 'loc1',
            'chan': 'chan',
            'lat': 1.1,
            'lon': 1.1,
            'elev': 2.1,
            'starttime': datetime.utcnow().timestamp(),
            'endtime': datetime.utcnow().timestamp(),
            'edepth': 3.0,
            'vang': 1.0,
            'hang': 1.0
        })
        self.db['source'].insert_one({
            '_id': source_id,
            'lat': 1.2,
            'lon': 1.2,
            'time': datetime.utcnow().timestamp(),
            'depth': 3.1,
            'magnitude': 1.0
        })
        self.test_ts['site_id'] = site_id
        self.test_ts['source_id'] = source_id
        self.test_ts['channel_id'] = channel_id
Example #2
    def setup_class(self):
        client = DBClient("localhost")
        self.db = Database(client, "test_dbclean")

        self.test_ts = get_live_timeseries()
        site_id = ObjectId()
        channel_id = ObjectId()
        source_id = ObjectId()
        self.db["site"].insert_one({
            "_id": site_id,
            "net": "net",
            "sta": "sta",
            "loc": "loc",
            "lat": 1.0,
            "lon": 1.0,
            "elev": 2.0,
            "starttime": datetime.utcnow().timestamp(),
            "endtime": datetime.utcnow().timestamp(),
        })
        self.db["channel"].insert_one({
            "_id":
            channel_id,
            "net":
            "net1",
            "sta":
            "sta1",
            "loc":
            "loc1",
            "chan":
            "chan",
            "lat":
            1.1,
            "lon":
            1.1,
            "elev":
            2.1,
            "starttime":
            datetime.utcnow().timestamp(),
            "endtime":
            datetime.utcnow().timestamp(),
            "edepth":
            3.0,
            "vang":
            1.0,
            "hang":
            1.0,
        })
        self.db["source"].insert_one({
            "_id": source_id,
            "lat": 1.2,
            "lon": 1.2,
            "time": datetime.utcnow().timestamp(),
            "depth": 3.1,
            "magnitude": 1.0,
        })
        self.test_ts["site_id"] = site_id
        self.test_ts["source_id"] = source_id
        self.test_ts["channel_id"] = channel_id
Example #3
    def get_database(self, database_name=None):
        """
        Get a database by name. If database_name is not specified, the default database is used.

        :param database_name: the name of database
        :type database_name: :class:`str`
        :return: :class:`mspasspy.db.database.Database`
        """
        if not database_name:
            return Database(self._db_client, self._default_database_name)
        return Database(self._db_client, database_name)
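
A minimal usage sketch of the helper above (the client variable and the explicit database name are placeholders; both calls simply delegate to the Database constructor used throughout these examples):

    db = client.get_database()                       # falls back to the client's default database name
    other_db = client.get_database("my_project_db")  # "my_project_db" is a placeholder name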
Example #4
    def setup_class(self):
        self.client = DBClient("localhost")
        self.client.drop_database("test_manager")
        db = Database(self.client, "test_manager")
        db["history_global"].drop_indexes()
        # clean up the database locally
        for col_name in db.list_collection_names():
            db[col_name].delete_many({})

        self.manager = GlobalHistoryManager(db,
                                            "test_job",
                                            collection="history_global")
Example #5
File: client.py  Project: seisman/mspass
 def __getitem__(self, name):
     """
     Get a database by name.
     Raises :class:`~pymongo.errors.InvalidName` if an invalid
     database name is used.
     :Parameters:
       - `name`: the name of the database to get
     """
     return Database(self, name)
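
Since __getitem__ simply wraps the Database constructor, dictionary-style access is available; a brief sketch (dbclient is assumed to be a connected DBClient):

    db = dbclient["test_dbclean"]
    # equivalent to the explicit form used in the other examples:
    db = Database(dbclient, "test_dbclean")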
Example #6
 def test_get_alg_id(self):
     manager_db = Database(self.client, "test_manager")
     assert not self.manager.get_alg_id("aaa", "bbb")
     res = manager_db["history_global"].find_one({
         "alg_name":
         "new_stack",
         "parameters":
         '{"object_history": "True", "alg_id": "3"}',
     })
     assert (self.manager.get_alg_id(
         "new_stack",
         '{"object_history": "True", "alg_id": "3"}') == res["alg_id"])
Example #7
File: client.py  Project: seisman/mspass
    def get_default_database(self,
                             default=None,
                             codec_options=None,
                             read_preference=None,
                             write_concern=None,
                             read_concern=None):
        if self.__default_database_name is None and default is None:
            raise pymongo.errors.ConfigurationError(
                'No default database name defined or provided.')

        return Database(self, self.__default_database_name or default,
                        codec_options, read_preference, write_concern,
                        read_concern)
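
A hedged sketch of how this behaves (dbclient is assumed to be a DBClient whose connection URI, or its construction, supplies a default database name):

    db = dbclient.get_default_database()             # uses the client's stored default name
    db = dbclient.get_default_database("fallback")   # "fallback" is used only when no default is stored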
Example #8
File: client.py  Project: seisman/mspass
    def get_database(self,
                     name=None,
                     codec_options=None,
                     read_preference=None,
                     write_concern=None,
                     read_concern=None):
        if name is None:
            if self.__default_database_name is None:
                raise pymongo.errors.ConfigurationError(
                    'No default database defined')
            name = self.__default_database_name

        return Database(self, name, codec_options, read_preference,
                        write_concern, read_concern)
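
The optional arguments are forwarded to the Database constructor, mirroring pymongo's options; a sketch of passing one of them through (the read preference value is an assumption about what the underlying pymongo Database accepts):

    from pymongo import ReadPreference

    db = dbclient.get_database("test_manager",
                               read_preference=ReadPreference.SECONDARY_PREFERRED)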
Example #9
    def test_object_history(self, spark_context):
        manager_db = Database(self.client, "test_manager")
        manager_db["history_global"].delete_many({})
        manager_db["history_object"].delete_many({})
        l = [get_live_timeseries() for i in range(2)]
        # add net, sta, chan, loc to avoid metadata serialization problem
        for i in range(2):
            l[i]["chan"] = "HHZ"
            l[i]["loc"] = "test_loc"
            l[i]["net"] = "test_net"
            l[i]["sta"] = "test_sta"
        spark_res = spark_map(l, self.manager, spark_context)
        assert manager_db["history_global"].count_documents(
            {"alg_name": "filter"}) == 1
        res = manager_db["history_global"].find_one({"alg_name": "filter"})
        alg_id = res["alg_id"]
        # check status of the mspass objects
        for ts in spark_res:
            assert ts.number_of_stages() == 1
            assert ts.current_nodedata().algorithm == "filter"
            assert ts.current_nodedata().algid == str(alg_id)
            assert ts.is_volatile()

        save_res = manager_db.save_data(spark_res[0],
                                        alg_name="filter",
                                        alg_id=str(alg_id))
        # hardcode net, sta, chan, loc to avoid a serialization problem here; they are read-only metadata keys -> non-fatal keys = 4
        assert save_res.live
        assert manager_db["history_object"].count_documents(
            {"alg_name": "filter"}) == 1
        doc = manager_db["history_object"].find_one({"alg_name": "filter"})
        assert doc
        assert doc["_id"] == spark_res[0].current_nodedata().uuid
        assert doc["wf_TimeSeries_id"] == spark_res[0]["_id"]
        assert doc["alg_id"] == str(alg_id)
        assert doc["alg_name"] == "filter"
Example #10
 def test_logging(self):
     alg_id = ObjectId()
     manager_db = Database(self.client, "test_manager")
     manager_db["history_global"].delete_many({})
     self.manager.logging(alg_id, "test_alg_name", "test_parameter")
     res = manager_db["history_global"].find_one(
         {"job_name": self.manager.job_name})
     assert res["job_id"] == self.manager.job_id
     assert res["job_name"] == self.manager.job_name
     assert res["alg_name"] == "test_alg_name"
     assert res["alg_id"] == alg_id
     assert res["parameters"] == "test_parameter"
     assert (manager_db["history_global"].count_documents(
         {"job_name": self.manager.job_name}) == 1)
     # clean up
     manager_db["history_global"].delete_many({})
Example #11
 def test_set_alg_name_and_parameters(self):
     manager_db = Database(self.client, "test_manager")
     assert (manager_db["history_global"].count_documents({
         "alg_name":
         "stack",
         "parameters":
         '{"object_history": "True", "alg_id": "3"}',
     }) == 3)
     res = manager_db["history_global"].find_one({
         "alg_name":
         "stack",
         "parameters":
         '{"object_history": "True", "alg_id": "3"}',
     })
     alg_id = res["alg_id"]
     self.manager.set_alg_name_and_parameters(alg_id, "test_alg_name",
                                              "test_parameters")
     assert (manager_db["history_global"].count_documents({
         "alg_name":
         "stack",
         "parameters":
         '{"object_history": "True", "alg_id": "3"}',
     }) == 0)
     assert (manager_db["history_global"].count_documents({
         "alg_name":
         "test_alg_name",
         "parameters":
         "test_parameters"
     }) == 3)
     res = manager_db["history_global"].find_one({
         "alg_name":
         "test_alg_name",
         "parameters":
         "test_parameters"
     })
     assert res["alg_id"] == alg_id
Example #12
def main(args=None):
    # As a script that would be run from the shell, we let
    # any functions below that throw exceptions do so and assume they
    # will write a message that can help debug what went wrong
    if args is None:
        args = sys.argv[1:]
    parser = argparse.ArgumentParser(
        prog="dbverify",
        usage=
        "%(prog)s dbname [-t TEST -c [collection ...] -n [normalize ... ] -error_limit n -v]",
        description="MsPASS database verify program",
    )
    parser.add_argument(
        "dbname",
        metavar="dbname",
        type=str,
        help="MongoDB database name on which to run tests",
    )
    parser.add_argument(
        "-t",
        "--test",
        action="store",
        type=str,
        default="normalization",
        help="Select which test to run.  " +
        "Current options:  normalization, required, schema_check",
    )
    parser.add_argument(
        "-c",
        "--collection",
        action="store",
        nargs="*",
        default=["wf_TimeSeries"],
        help="Collection(s) on which the test is to be run.  " +
        "Only schema_check supports multiple collections in one run",
    )
    parser.add_argument(
        "-n",
        "--normalize",
        nargs="*",
        default=["site_id", "channel_id", "source_id"],
        help="List of normalization keys to test\n" +
        "(Used only for -test normalization option",
    )
    parser.add_argument(
        "-r",
        "--require",
        nargs="*",
        default=[],
        help="List of keys of required attributes for required test",
    )
    parser.add_argument(
        "-e",
        "--error_limit",
        action="store",
        type=int,
        default=1000,
        help="Set error limit - stop checking when this many errors are found\n"
        + "Default is 1000",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help=
        "When used print offending values.  Otherwise just return a summary",
    )

    args = parser.parse_args(args)
    test_to_run = args.test
    dbname = args.dbname
    dbclient = DBClient()
    db = Database(dbclient, dbname)
    col_to_test = args.collection
    normalize = args.normalize
    reqlist = args.require
    verbose = args.verbose
    elimit = args.error_limit

    # If Python had a switch/case it would be used here.  This
    # is the list of known tests.  The program can only run one
    # test per execution.  That is intentional to keep the output readable.
    if test_to_run == "normalization":
        if len(col_to_test) > 1:
            print(
                "WARNING:  normalization test can only be run on one collection at a time"
            )
            print("Parsed a list with the following contents:  ", col_to_test)
            print("Running test on the first item in that list")
        col = col_to_test[0]
        if not isinstance(col, str):
            print("Invalid value parsed for -c option=", col)
            exit(-1)
        run_check_links(db, col, normalize, elimit, verbose)
    elif test_to_run == "required":
        if len(col_to_test) > 1:
            print(
                "WARNING:  required test can only be run on one collection at a time"
            )
            print("Parsed a list with the following contents:  ", col_to_test)
            print("Running test on the first item in that list")
        col = col_to_test[0]
        if not isinstance(col, str):
            print("Invalid value parsed for -c option=", col_to_test)
            exit(-1)
        if len(reqlist) == 0:
            # Depends on the default being an empty list. For the default
            # case run this small function.
            # This is currently a function above with const list values
            # returned for each known collection.  It may eventually
            # be replaced by a function using the schema.
            required_list = get_required(col)
        else:
            required_list = reqlist
        run_check_required(db, col, required_list, elimit, verbose)
    elif test_to_run == "schema_check":
        for col in col_to_test:
            run_check_attribute_types(db, col, elimit, verbose)
    else:
        print("Unrecognized value for --test value parsed=", test_to_run)
        print("Must be one of:  normalization, required, or schema_check")
Example #13
class TestDBVerify():
    def setup_class(self):
        client = Client('localhost')
        self.db = Database(client, 'test_dbverify')

        self.test_ts = get_live_timeseries()
        site_id = ObjectId()
        channel_id = ObjectId()
        source_id = ObjectId()
        self.db['site'].insert_one({
            '_id': site_id,
            'net': 'net',
            'sta': 'sta',
            'loc': 'loc',
            'lat': 1.0,
            'lon': 1.0,
            'elev': 2.0,
            'starttime': datetime.utcnow().timestamp(),
            'endtime': datetime.utcnow().timestamp()
        })
        self.db['channel'].insert_one({
            '_id': channel_id,
            'net': 'net1',
            'sta': 'sta1',
            'loc': 'loc1',
            'chan': 'chan',
            'lat': 1.1,
            'lon': 1.1,
            'elev': 2.1,
            'starttime': datetime.utcnow().timestamp(),
            'endtime': datetime.utcnow().timestamp(),
            'edepth': 3.0,
            'vang': 1.0,
            'hang': 1.0
        })
        self.db['source'].insert_one({
            '_id': source_id,
            'lat': 1.2,
            'lon': 1.2,
            'time': datetime.utcnow().timestamp(),
            'depth': 3.1,
            'magnitude': 1.0
        })
        self.test_ts['site_id'] = site_id
        self.test_ts['source_id'] = source_id
        self.test_ts['channel_id'] = channel_id

    def test_main(self, capfd):
        self.db['wf_TimeSeries'].delete_many({})
        ts1 = copy.deepcopy(self.test_ts)
        ts2 = copy.deepcopy(self.test_ts)
        ts3 = copy.deepcopy(self.test_ts)
        logging_helper.info(ts1, 'deepcopy', '1')
        logging_helper.info(ts2, 'deepcopy', '1')
        logging_helper.info(ts3, 'deepcopy', '1')

        # fix types
        ts1['npts'] = '123'
        ts1['extra1'] = 'extra1'
        ts2['delta'] = '3'
        ts2['extra2'] = 'extra2'
        ts3['npts'] = 'xyz'
        ts3['extra2'] = 'extra2'
        # wrong normalized key
        ts1['site_id'] = ObjectId()
        ts2.erase('source_id')

        save_res_code = self.db.save_data(ts1,
                                          mode='promiscuous',
                                          storage_mode='gridfs')
        save_res_code = self.db.save_data(ts2,
                                          mode='promiscuous',
                                          storage_mode='gridfs')
        # erase required attributes
        save_res_code = self.db.save_data(ts3,
                                          mode='promiscuous',
                                          storage_mode='gridfs',
                                          exclude_keys=['starttime'])
        doc1 = self.db['wf_TimeSeries'].find_one({'_id': ts1['_id']})
        doc2 = self.db['wf_TimeSeries'].find_one({'_id': ts2['_id']})
        doc3 = self.db['wf_TimeSeries'].find_one({'_id': ts3['_id']})
        doc1_str = json_util.dumps(doc1, indent=2)
        doc2_str = json_util.dumps(doc2, indent=2)
        doc3_str = json_util.dumps(doc3, indent=2)

        # default normalization test
        dbverify.main(['test_dbverify', '-t', 'normalization'])
        out, err = capfd.readouterr()
        assert out == "normalization test on normalized key= site_id  found problems\nFound broken links in  1 documents checked\nNote error count limit= 1000\nIf the count is the same it means all data probably contain missing cross referencing ids\nRun in verbose mode to find out more information you will need to fix the problem\ncheck_links found no broken links with normalized key= channel_id\ncheck_links found no broken links with normalized key= source_id\n"

        # more than 1 collection to test
        dbverify.main([
            'test_dbverify', '-t', 'normalization', '-c', 'wf_TimeSeries',
            'site'
        ])
        out, err = capfd.readouterr()
        assert out == "WARNING:  normalization test can only be run on one collection at a time\nParsed a list with the following contents:   ['wf_TimeSeries', 'site']\nRunning test on the first item in that list\nnormalization test on normalized key= site_id  found problems\nFound broken links in  1 documents checked\nNote error count limit= 1000\nIf the count is the same it means all data probably contain missing cross referencing ids\nRun in verbose mode to find out more information you will need to fix the problem\ncheck_links found no broken links with normalized key= channel_id\ncheck_links found no broken links with normalized key= source_id\n"

        # verbose mode
        dbverify.main(['test_dbverify', '-t', 'normalization', '-v'])
        out, err = capfd.readouterr()
        assert out == "check_link found the following docs in  wf_TimeSeries  with broken links to  site_id\n////////////////Doc number  1  with error///////////////\n" + doc1_str + "\n////////////////////////////////////////////////////////\ncheck_links found no undefined linking key to normalized key= site_id\ncheck_links found no broken links with normalized key= channel_id\ncheck_links found no undefined linking key to normalized key= channel_id\ncheck_links found no broken links with normalized key= source_id\ncheck_link found the following docs in  wf_TimeSeries  with undefined link keys to  source_id\n////////////////Doc number  1  with error///////////////\n" + doc2_str + "\n////////////////////////////////////////////////////////\n"

        # default required test
        dbverify.main(['test_dbverify', '-t', 'required'])
        out, err = capfd.readouterr()
        mmkeys = {'npts': 2, 'delta': 1}
        mm_keys_str = json_util.dumps(mmkeys, indent=2)
        undef_keys = {'starttime': 1}
        undef_keys_str = json_util.dumps(undef_keys, indent=2)
        assert out == "////Results from run_check_required on collection= wf_TimeSeries\nCollection found  3  documents with type inconsistencies\nOffending keys and number found follow:\n" + mm_keys_str + "\nCollection found  1  documents with required keys that were not defined\nOffending keys and number found follow:\n" + undef_keys_str + "\n"

        # default schema_check test
        dbverify.main(['test_dbverify', '-t', 'schema_check'])
        out, err = capfd.readouterr()
        mmkeys = {'npts': 2, 'delta': 1}
        mm_keys_str = json_util.dumps(mmkeys, indent=2)
        undef_keys = {'extra1': 1, 'extra2': 2}
        undef_keys_str = json_util.dumps(undef_keys, indent=2)
        assert out == "check_attribute_types result for collection= wf_TimeSeries\nCollection found  3  documents with type inconsistencies\nOffending keys and number found follow:\n" + mm_keys_str + "\nCollection found  3  documents with keys not defined in the schema\nOffending keys and number found follow:\n" + undef_keys_str + "\n"
Example #14
    def test_mspass_map_with_filePath(self, spark_context):
        # test mspass_map for spark (file input)
        # data input of RFdecon, needed for parallelization
        d = [get_live_seismogram(71, 2.0) for i in range(5)]
        for i in range(5):
            d[i].t0 = -5

        # parameters string
        pfPath = "python/mspasspy/data/pf/RFdeconProcessor.pf"
        pf = AntelopePf(pfPath)
        pf_dict = AntelopePf2dict(pf)
        parameter_dict = collections.OrderedDict()
        parameter_dict["alg"] = "LeastSquares"
        parameter_dict["pf"] = pf_dict
        parameter_dict["object_history"] = "True"
        gTree = ParameterGTree(parameter_dict)
        json_params = json.dumps(gTree.asdict())

        data = spark_context.parallelize(d)
        spark_res = data.mspass_map(
            RFdecon,
            alg="LeastSquares",
            pf=pfPath,
            object_history=True,
            global_history=self.manager,
            alg_name=None,
            parameters=None,
        ).collect()
        manager_db = Database(self.client, "test_manager")
        assert (manager_db["history_global"].count_documents(
            {"job_name": self.manager.job_name}) == 8)
        res = manager_db["history_global"].find_one({"alg_name": "RFdecon"})
        assert res["job_id"] == self.manager.job_id
        assert res["job_name"] == self.manager.job_name
        assert res["alg_name"] == "RFdecon"
        assert res["parameters"] == json_params
        spark_alg_id = res["alg_id"]

        # test mspass_map for dask
        ddb = daskbag.from_sequence(d)
        dask_res = ddb.mspass_map(
            RFdecon,
            alg="LeastSquares",
            pf=pfPath,
            object_history=True,
            global_history=self.manager,
            alg_name=None,
            parameters=None,
        ).compute()

        assert (manager_db["history_global"].count_documents(
            {"job_name": self.manager.job_name}) == 9)
        assert (manager_db["history_global"].count_documents(
            {"alg_id": spark_alg_id}) == 2)
        docs = manager_db["history_global"].find({"alg_id": spark_alg_id})
        assert docs[0]["job_id"] == docs[1]["job_id"] == self.manager.job_id
        assert docs[0]["job_name"] == docs[1][
            "job_name"] == self.manager.job_name
        assert docs[0]["alg_name"] == docs[1]["alg_name"] == "RFdecon"
        assert docs[0]["parameters"] == docs[1]["parameters"] == json_params
        assert not docs[0]["time"] == docs[1]["time"]

        # same alg + parameters combination -> same alg_id
        ddb = daskbag.from_sequence(d)
        dask_res = ddb.mspass_map(
            RFdecon,
            alg="LeastSquares",
            pf=pfPath,
            object_history=True,
            global_history=self.manager,
            alg_name=None,
            parameters=None,
        ).compute()
        assert (manager_db["history_global"].count_documents(
            {"job_name": self.manager.job_name}) == 10)
        assert (manager_db["history_global"].count_documents(
            {"alg_id": spark_alg_id}) == 3)

        # SPARK test user provided alg_name and parameter(exist)
        spark_alg_name = "RFdecon"
        spark_alg_parameters = (
            "alg=LeastSquares, pf={pfPath}, object_history=True".format(
                pfPath=pfPath))
        data = spark_context.parallelize(d)
        spark_res = data.mspass_map(
            RFdecon,
            alg="LeastSquares",
            pf=pfPath,
            object_history=True,
            global_history=self.manager,
            alg_name=spark_alg_name,
            parameters=spark_alg_parameters,
        ).collect()
        assert (manager_db["history_global"].count_documents(
            {"job_name": self.manager.job_name}) == 11)
        assert (manager_db["history_global"].count_documents(
            {"alg_id": spark_alg_id}) == 4)

        # SPARK test user provided alg_name and parameter(new)
        spark_alg_name = "RFdecon_2"
        spark_alg_parameters = (
            "alg=LeastSquares, pf={pfPath}, object_history=True".format(
                pfPath=pfPath))
        data = spark_context.parallelize(d)
        spark_res = data.mspass_map(
            RFdecon,
            alg="LeastSquares",
            pf=pfPath,
            object_history=True,
            global_history=self.manager,
            alg_name=spark_alg_name,
            parameters=spark_alg_parameters,
        ).collect()
        assert (manager_db["history_global"].count_documents(
            {"job_name": self.manager.job_name}) == 12)
        assert (manager_db["history_global"].count_documents(
            {"alg_name": "RFdecon_2"}) == 1)
        res = manager_db["history_global"].find_one({"alg_name": "RFdecon_2"})
        assert res["job_id"] == self.manager.job_id
        assert res["job_name"] == self.manager.job_name
        assert res["alg_name"] == "RFdecon_2"
        assert res["parameters"] == json_params
        new_spark_alg_id = res["alg_id"]
        assert (manager_db["history_global"].count_documents(
            {"alg_id": new_spark_alg_id}) == 1)

        # DASK test user provided alg_name and parameter(exist)
        dask_alg_name = "RFdecon"
        dask_alg_parameters = (
            "alg=LeastSquares, pf={pfPath}, object_history=True".format(
                pfPath=pfPath))
        ddb = daskbag.from_sequence(d)
        dask_res = ddb.mspass_map(
            RFdecon,
            alg="LeastSquares",
            pf=pfPath,
            object_history=True,
            global_history=self.manager,
            alg_name=dask_alg_name,
            parameters=dask_alg_parameters,
        ).compute()
        assert (manager_db["history_global"].count_documents(
            {"job_name": self.manager.job_name}) == 13)
        assert (manager_db["history_global"].count_documents(
            {"alg_id": spark_alg_id}) == 5)

        # DASK test user provided alg_name and parameter(new)
        dask_alg_name = "RFdecon_3"
        dask_alg_parameters = (
            "alg=LeastSquares, pf={pfPath}, object_history=True".format(
                pfPath=pfPath))
        ddb = daskbag.from_sequence(d)
        dask_res = ddb.mspass_map(
            RFdecon,
            alg="LeastSquares",
            pf=pfPath,
            object_history=True,
            global_history=self.manager,
            alg_name=dask_alg_name,
            parameters=dask_alg_parameters,
        ).compute()
        assert (manager_db["history_global"].count_documents(
            {"job_name": self.manager.job_name}) == 14)
        assert (manager_db["history_global"].count_documents(
            {"alg_name": "RFdecon_3"}) == 1)
        res = manager_db["history_global"].find_one({"alg_name": "RFdecon_3"})
        assert res["job_id"] == self.manager.job_id
        assert res["job_name"] == self.manager.job_name
        assert res["alg_name"] == "RFdecon_3"
        assert res["parameters"] == json_params
        new_dask_alg_id = res["alg_id"]
        assert (manager_db["history_global"].count_documents(
            {"alg_id": new_dask_alg_id}) == 1)
Example #15
    def test_mspass_reduce(self, spark_context):
        manager_db = Database(self.client, "test_manager")
        manager_db["history_global"].delete_many({})

        l = [get_live_timeseries() for i in range(5)]
        # test mspass_reduce for spark
        spark_res = spark_reduce(l, self.manager, spark_context)
        assert (manager_db["history_global"].count_documents(
            {"job_name": self.manager.job_name}) == 1)
        assert manager_db["history_global"].count_documents(
            {"alg_name": "stack"}) == 1
        res = manager_db["history_global"].find_one({"alg_name": "stack"})
        assert res["job_id"] == self.manager.job_id
        assert res["job_name"] == self.manager.job_name
        assert res["alg_name"] == "stack"
        assert res["parameters"] == '{"object_history": "True", "alg_id": "2"}'
        spark_alg_id = res["alg_id"]

        # test mspass_reduce for dask
        dask_res = dask_reduce(l, self.manager)
        assert (manager_db["history_global"].count_documents(
            {"job_name": self.manager.job_name}) == 2)
        assert manager_db["history_global"].count_documents(
            {"alg_name": "stack"}) == 2
        assert (manager_db["history_global"].count_documents(
            {"alg_id": spark_alg_id}) == 1)

        docs = manager_db["history_global"].find({"alg_name": "stack"})
        for doc in docs:
            if doc["alg_id"] == spark_alg_id:
                continue
            res = doc
        assert res["job_id"] == self.manager.job_id
        assert res["job_name"] == self.manager.job_name
        assert res["alg_name"] == "stack"
        assert res["parameters"] == '{"object_history": "True", "alg_id": "3"}'
        # different alg -> different alg_id
        assert not res["alg_id"] == spark_alg_id
        dask_alg_id = res["alg_id"]

        # same alg + parameters combination -> same alg_id
        dask_res = dask_reduce(l, self.manager)
        assert (manager_db["history_global"].count_documents(
            {"job_name": self.manager.job_name}) == 3)
        assert manager_db["history_global"].count_documents(
            {"alg_name": "stack"}) == 3
        assert (manager_db["history_global"].count_documents(
            {"alg_id": dask_alg_id}) == 2)
        docs = manager_db["history_global"].find({"alg_id": dask_alg_id})
        doc1 = docs[0]
        doc2 = docs[1]
        assert not doc1["time"] == doc2["time"]
        assert doc1["job_id"] == doc2["job_id"]
        assert doc1["job_name"] == doc2["job_name"]
        assert doc1["alg_name"] == doc2["alg_name"]
        assert doc1["parameters"] == doc2["parameters"]

        # SPARK test user provided alg_name and parameter(exist)
        spark_alg_name = "stack"
        spark_alg_parameters = "object_history=True,alg_id=2"
        spark_res = spark_reduce(
            l,
            self.manager,
            spark_context,
            alg_name=spark_alg_name,
            parameters=spark_alg_parameters,
        )
        assert (manager_db["history_global"].count_documents(
            {"job_name": self.manager.job_name}) == 4)
        assert manager_db["history_global"].count_documents(
            {"alg_name": "stack"}) == 4
        assert (manager_db["history_global"].count_documents({
            "alg_name":
            "stack",
            "parameters":
            '{"object_history": "True", "alg_id": "3"}',
        }) == 2)

        # SPARK test user provided alg_name and parameter(new)
        spark_alg_name = "new_stack"
        spark_alg_parameters = "object_history=True,alg_id=2"
        spark_res = spark_reduce(
            l,
            self.manager,
            spark_context,
            alg_name=spark_alg_name,
            parameters=spark_alg_parameters,
        )
        assert (manager_db["history_global"].count_documents(
            {"job_name": self.manager.job_name}) == 5)
        assert (manager_db["history_global"].count_documents(
            {"alg_name": "new_stack"}) == 1)
        res = manager_db["history_global"].find_one({"alg_name": "new_stack"})
        assert res["job_id"] == self.manager.job_id
        assert res["job_name"] == self.manager.job_name
        assert res["alg_name"] == "new_stack"
        assert res["parameters"] == '{"object_history": "True", "alg_id": "2"}'

        # DASK test user provided alg_name and parameter(exist)
        dask_alg_name = "stack"
        dask_alg_parameters = "object_history=True,alg_id=3"
        dask_res = dask_map(l,
                            self.manager,
                            alg_name=dask_alg_name,
                            parameters=dask_alg_parameters)
        assert (manager_db["history_global"].count_documents(
            {"job_name": self.manager.job_name}) == 6)
        assert manager_db["history_global"].count_documents(
            {"alg_name": "stack"}) == 5
        assert (manager_db["history_global"].count_documents({
            "alg_name":
            "stack",
            "parameters":
            '{"object_history": "True", "alg_id": "3"}',
        }) == 3)

        # DASK test user provided alg_name and parameter(new)
        dask_alg_name = "new_stack"
        dask_alg_parameters = "object_history=True,alg_id=3"
        dask_res = dask_map(l,
                            self.manager,
                            alg_name=dask_alg_name,
                            parameters=dask_alg_parameters)
        assert (manager_db["history_global"].count_documents(
            {"job_name": self.manager.job_name}) == 7)
        assert (manager_db["history_global"].count_documents({
            "alg_name":
            "new_stack",
            "parameters":
            '{"object_history": "True", "alg_id": "3"}',
        }) == 1)
        res = manager_db["history_global"].find_one({
            "alg_name":
            "new_stack",
            "parameters":
            '{"object_history": "True", "alg_id": "3"}',
        })
        assert res["job_id"] == self.manager.job_id
        assert res["job_name"] == self.manager.job_name
        assert res["alg_name"] == "new_stack"
        assert res["parameters"] == '{"object_history": "True", "alg_id": "3"}'
Example #16
    def test_mspass_map(self, spark_context):
        l = [get_live_timeseries() for i in range(5)]
        # add net, sta, chan, loc to avoid metadata serialization problem
        for i in range(5):
            l[i]["chan"] = "HHZ"
            l[i]["loc"] = "test_loc"
            l[i]["net"] = "test_net"
            l[i]["sta"] = "test_sta"
            l[i].set_as_origin("test", "0", str(i), AtomicType.TIMESERIES)
        # test mspass_map for spark
        spark_res = spark_map(l, self.manager, spark_context)

        manager_db = Database(self.client, "test_manager")
        assert (manager_db["history_global"].count_documents(
            {"job_name": self.manager.job_name}) == 1)
        res = manager_db["history_global"].find_one(
            {"job_name": self.manager.job_name})
        assert res["job_id"] == self.manager.job_id
        assert res["job_name"] == self.manager.job_name
        assert res["alg_name"] == "filter"
        assert (
            res["parameters"] ==
            '{"arg_0": "bandpass", "freqmin": "1", "freqmax": "5", "object_history": "True"}'
        )
        spark_alg_id = res["alg_id"]

        # test mspass_map for dask
        dask_res = dask_map(l, self.manager)

        assert (manager_db["history_global"].count_documents(
            {"job_name": self.manager.job_name}) == 2)
        assert (manager_db["history_global"].count_documents(
            {"alg_id": spark_alg_id}) == 2)
        docs = manager_db["history_global"].find({"alg_id": spark_alg_id})
        assert docs[0]["job_id"] == docs[1]["job_id"] == self.manager.job_id
        assert docs[0]["job_name"] == docs[1][
            "job_name"] == self.manager.job_name
        assert docs[0]["alg_name"] == docs[1]["alg_name"] == "filter"
        assert (
            docs[0]["parameters"] == docs[1]["parameters"] ==
            '{"arg_0": "bandpass", "freqmin": "1", "freqmax": "5", "object_history": "True"}'
        )
        assert not docs[0]["time"] == docs[1]["time"]

        # same alg + parameters combination -> same alg_id
        dask_res = dask_map(l, self.manager)
        assert (manager_db["history_global"].count_documents(
            {"job_name": self.manager.job_name}) == 3)
        assert (manager_db["history_global"].count_documents(
            {"alg_id": spark_alg_id}) == 3)

        # SPARK test user provided alg_name and parameter(exist)
        spark_alg_name = "filter"
        spark_alg_parameters = "bandpass,freqmin=1,freqmax=5,object_history=True"
        spark_res = spark_map(
            l,
            self.manager,
            spark_context,
            alg_name=spark_alg_name,
            parameters=spark_alg_parameters,
        )
        assert (manager_db["history_global"].count_documents(
            {"job_name": self.manager.job_name}) == 4)
        assert (manager_db["history_global"].count_documents(
            {"alg_id": spark_alg_id}) == 4)

        # SPARK test user provided alg_name and parameter(new)
        spark_alg_name = "new_filter"
        spark_alg_parameters = "bandpass,freqmin=1,freqmax=5,object_history=True"
        spark_res = spark_map(
            l,
            self.manager,
            spark_context,
            alg_name=spark_alg_name,
            parameters=spark_alg_parameters,
        )
        assert (manager_db["history_global"].count_documents(
            {"job_name": self.manager.job_name}) == 5)
        assert (manager_db["history_global"].count_documents(
            {"alg_name": "new_filter"}) == 1)
        res = manager_db["history_global"].find_one({"alg_name": "new_filter"})
        assert res["job_id"] == self.manager.job_id
        assert res["job_name"] == self.manager.job_name
        assert res["alg_name"] == "new_filter"
        assert (
            res["parameters"] ==
            '{"arg_0": "bandpass", "freqmin": "1", "freqmax": "5", "object_history": "True"}'
        )
        new_spark_alg_id = res["alg_id"]
        assert (manager_db["history_global"].count_documents(
            {"alg_id": new_spark_alg_id}) == 1)

        # DASK test user provided alg_name and parameter(exist)
        dask_alg_name = "filter"
        dask_alg_parameters = "bandpass,freqmin=1,freqmax=5,object_history=True"
        dask_res = dask_map(l,
                            self.manager,
                            alg_name=dask_alg_name,
                            parameters=dask_alg_parameters)
        assert (manager_db["history_global"].count_documents(
            {"job_name": self.manager.job_name}) == 6)
        assert (manager_db["history_global"].count_documents(
            {"alg_id": spark_alg_id}) == 5)

        # DASK test user provided alg_name and parameter(new)
        dask_alg_name = "new_filter_2"
        dask_alg_parameters = "bandpass,freqmin=1,freqmax=5,object_history=True"
        dask_res = dask_map(l,
                            self.manager,
                            alg_name=dask_alg_name,
                            parameters=dask_alg_parameters)
        assert (manager_db["history_global"].count_documents(
            {"job_name": self.manager.job_name}) == 7)
        assert (manager_db["history_global"].count_documents(
            {"alg_name": "new_filter_2"}) == 1)
        res = manager_db["history_global"].find_one(
            {"alg_name": "new_filter_2"})
        assert res["job_id"] == self.manager.job_id
        assert res["job_name"] == self.manager.job_name
        assert res["alg_name"] == "new_filter_2"
        assert (
            res["parameters"] ==
            '{"arg_0": "bandpass", "freqmin": "1", "freqmax": "5", "object_history": "True"}'
        )
        new_dask_alg_id = res["alg_id"]
        assert (manager_db["history_global"].count_documents(
            {"alg_id": new_dask_alg_id}) == 1)

        manager_db["history_object"].delete_many({})
        # test spark mspass_map for save_data
        data = spark_context.parallelize(l)
        data_map = data.mspass_map(manager_db.save_data,
                                   global_history=self.manager)
        save_list = data_map.collect()
        assert (manager_db["history_global"].count_documents(
            {"job_name": self.manager.job_name}) == 8)
        assert (manager_db["history_global"].count_documents(
            {"alg_name": "save_data"}) == 1)
        # check object history after save_data
        manager_db["history_object"].count_documents({}) == 5
        manager_db["wf_TimeSeries"].count_documents({}) == 5
        history_object_docs = manager_db["history_object"].find({})
        idx = 0
        doc_alg_id = None
        doc_ids = []
        for doc in history_object_docs:
            if not doc_alg_id:
                doc_alg_id = doc["alg_id"]
            else:
                assert doc_alg_id == doc["alg_id"]
            doc_ids.append(doc["_id"])
            assert doc["alg_name"] == "save_data"
            idx += 1
        assert sorted(doc_ids) == ["0", "1", "2", "3", "4"]

        # test spark mspass_map for read_data
        save_l = [res[1] for res in save_list]
        data = spark_context.parallelize(save_l)
        data_map = data.mspass_map(manager_db.read_data,
                                   global_history=self.manager)
        read_list = data_map.collect()
        assert (manager_db["history_global"].count_documents(
            {"job_name": self.manager.job_name}) == 9)
        assert (manager_db["history_global"].count_documents(
            {"alg_name": "read_data"}) == 1)

        manager_db["history_object"].delete_many({})
        manager_db["wf_TimeSeries"].delete_many({})
        # test dask mspass_map for save_data
        data = daskbag.from_sequence(l)
        data_map = data.mspass_map(manager_db.save_data,
                                   global_history=self.manager)
        save_list = data_map.compute()
        assert (manager_db["history_global"].count_documents(
            {"job_name": self.manager.job_name}) == 10)
        assert (manager_db["history_global"].count_documents(
            {"alg_name": "save_data"}) == 2)
        res = manager_db["history_global"].find({"alg_name": "save_data"})
        assert res[0]["job_id"] == res[1]["job_id"] == self.manager.job_id
        assert res[0]["job_name"] == res[1]["job_name"] == self.manager.job_name
        assert res[0]["alg_name"] == res[1]["alg_name"] == "save_data"
        assert (res[0]["parameters"] == res[1]["parameters"] ==
                '{"object_history": "False"}')
        assert res[0]["alg_id"] == res[1]["alg_id"]
        # check object history after save_data
        manager_db["history_object"].count_documents({}) == 5
        manager_db["wf_TimeSeries"].count_documents({}) == 5
        history_object_docs = manager_db["history_object"].find({})
        idx = 0
        doc_alg_id = None
        doc_ids = []
        for doc in history_object_docs:
            if not doc_alg_id:
                doc_alg_id = doc["alg_id"]
            else:
                assert doc_alg_id == doc["alg_id"]
            doc_ids.append(doc["_id"])
            assert doc["alg_name"] == "save_data"
            idx += 1
        assert sorted(doc_ids) == ["0", "1", "2", "3", "4"]

        # test dask mspass_map for read_data
        save_l = [res[1] for res in save_list]
        data = daskbag.from_sequence(save_l)
        data_map = data.mspass_map(manager_db.read_data,
                                   global_history=self.manager)
        read_list = data_map.compute()
        assert (manager_db["history_global"].count_documents(
            {"job_name": self.manager.job_name}) == 11)
        assert (manager_db["history_global"].count_documents(
            {"alg_name": "read_data"}) == 2)
        res = manager_db["history_global"].find({"alg_name": "read_data"})
        assert res[0]["job_id"] == res[1]["job_id"] == self.manager.job_id
        assert res[0]["job_name"] == res[1]["job_name"] == self.manager.job_name
        assert res[0]["alg_name"] == res[1]["alg_name"] == "read_data"
        assert (res[0]["parameters"] == res[1]["parameters"] ==
                '{"object_history": "False"}')
        assert res[0]["alg_id"] == res[1]["alg_id"]
Example #17
class TestDBClean():
    def setup_class(self):
        client = Client('localhost')
        self.db = Database(client, 'test_dbclean')

        self.test_ts = get_live_timeseries()
        site_id = ObjectId()
        channel_id = ObjectId()
        source_id = ObjectId()
        self.db['site'].insert_one({
            '_id': site_id,
            'net': 'net',
            'sta': 'sta',
            'loc': 'loc',
            'lat': 1.0,
            'lon': 1.0,
            'elev': 2.0,
            'starttime': datetime.utcnow().timestamp(),
            'endtime': datetime.utcnow().timestamp()
        })
        self.db['channel'].insert_one({
            '_id': channel_id,
            'net': 'net1',
            'sta': 'sta1',
            'loc': 'loc1',
            'chan': 'chan',
            'lat': 1.1,
            'lon': 1.1,
            'elev': 2.1,
            'starttime': datetime.utcnow().timestamp(),
            'endtime': datetime.utcnow().timestamp(),
            'edepth': 3.0,
            'vang': 1.0,
            'hang': 1.0
        })
        self.db['source'].insert_one({
            '_id': source_id,
            'lat': 1.2,
            'lon': 1.2,
            'time': datetime.utcnow().timestamp(),
            'depth': 3.1,
            'magnitude': 1.0
        })
        self.test_ts['site_id'] = site_id
        self.test_ts['source_id'] = source_id
        self.test_ts['channel_id'] = channel_id

    def test_rename_list_to_dict(self):
        rlist = ['a:1', 'b:2', 'c:3']
        result = dbclean.rename_list_to_dict(rlist)
        assert len(result) == 3
        assert result == {'a': '1', 'b': '2', 'c': '3'}

        rlist = ['a:1:2']
        with pytest.raises(SystemExit) as e:
            dbclean.rename_list_to_dict(rlist)
        assert e.type == SystemExit
        assert e.value.code == -1

    def test_main(self):
        self.db['wf_TimeSeries'].delete_many({})
        ts1 = copy.deepcopy(self.test_ts)
        ts2 = copy.deepcopy(self.test_ts)
        ts3 = copy.deepcopy(self.test_ts)
        logging_helper.info(ts1, 'deepcopy', '1')
        logging_helper.info(ts2, 'deepcopy', '1')
        logging_helper.info(ts3, 'deepcopy', '1')

        # fix types
        ts1['npts'] = '123'
        ts2['delta'] = '3'
        ts3['npts'] = 'xyz'

        save_res_code = self.db.save_data(ts1,
                                          mode='promiscuous',
                                          storage_mode='gridfs',
                                          exclude_keys=['extra2'])
        save_res_code = self.db.save_data(ts2,
                                          mode='promiscuous',
                                          storage_mode='gridfs',
                                          exclude_keys=['extra2'])
        save_res_code = self.db.save_data(ts3,
                                          mode='promiscuous',
                                          storage_mode='gridfs',
                                          exclude_keys=['extra2'])

        # exit
        with pytest.raises(SystemExit) as e:
            dbclean.main(['test_dbclean', 'wf_TimeSeries'])
        assert e.type == SystemExit
        assert e.value.code == -1

        # delete starttime attribute
        # rename calib to rename_calib
        dbclean.main([
            'test_dbclean', 'wf_TimeSeries', '-ft', '-d', 'starttime', '-r',
            'calib:rename_calib'
        ])

        res1 = self.db['wf_TimeSeries'].find_one({'_id': ts1['_id']})
        res2 = self.db['wf_TimeSeries'].find_one({'_id': ts2['_id']})
        res3 = self.db['wf_TimeSeries'].find_one({'_id': ts3['_id']})

        assert res1['npts'] == 123
        assert 'starttime' not in res1
        assert 'calib' not in res1
        assert 'rename_calib' in res1

        assert res2['delta'] == 3.0
        assert 'starttime' not in res2
        assert 'calib' not in res2
        assert 'rename_calib' in res2

        # can't be fixed
        assert res3['npts'] == 'xyz'
        assert 'starttime' not in res3
        assert 'calib' not in res3
        assert 'rename_calib' in res3

        self.db['wf_TimeSeries'].delete_many({})
        ts1 = copy.deepcopy(self.test_ts)
        ts2 = copy.deepcopy(self.test_ts)
        ts3 = copy.deepcopy(self.test_ts)
        logging_helper.info(ts1, 'deepcopy', '1')
        logging_helper.info(ts2, 'deepcopy', '1')
        logging_helper.info(ts3, 'deepcopy', '1')

        # fix types
        ts1['npts'] = '123'
        ts2['delta'] = '3'
        ts3['npts'] = 'xyz'

        save_res_code = self.db.save_data(ts1,
                                          mode='promiscuous',
                                          storage_mode='gridfs',
                                          exclude_keys=['extra2'])
        save_res_code = self.db.save_data(ts2,
                                          mode='promiscuous',
                                          storage_mode='gridfs',
                                          exclude_keys=['extra2'])
        save_res_code = self.db.save_data(ts3,
                                          mode='promiscuous',
                                          storage_mode='gridfs',
                                          exclude_keys=['extra2'])

        # only fix types
        dbclean.main(['test_dbclean', 'wf_TimeSeries', '-ft'])
        assert res1['npts'] == 123
        assert res2['delta'] == 3.0
        # can't be fixed
        assert res3['npts'] == 'xyz'
Example #18
    def __init__(
        self,
        database_host=None,
        scheduler=None,
        scheduler_host=None,
        job_name="mspass",
        database_name="mspass",
        schema=None,
        collection=None,
    ):
        # job_name should be a string
        if database_host is not None and not type(database_host) is str:
            raise MsPASSError(
                "database_host should be a string but " +
                str(type(database_host)) + " is found.",
                "Fatal",
            )
        if scheduler is not None and scheduler != "dask" and scheduler != "spark":
            raise MsPASSError(
                "scheduler should be either dask or spark but " +
                str(scheduler) + " is found.",
                "Fatal",
            )
        if scheduler_host is not None and not type(scheduler_host) is str:
            raise MsPASSError(
                "scheduler_host should be a string but " +
                str(type(scheduler_host)) + " is found.",
                "Fatal",
            )
        if job_name is not None and not type(job_name) is str:
            raise MsPASSError(
                "job_name should be a string but " + str(type(job_name)) +
                " is found.",
                "Fatal",
            )
        if database_name is not None and not type(database_name) is str:
            raise MsPASSError(
                "database_name should be a string but " +
                str(type(database_name)) + " is found.",
                "Fatal",
            )
        # collection should be a string
        if collection is not None and type(collection) is not str:
            raise MsPASSError(
                "collection should be a string but " + str(type(collection)) +
                " is found.",
                "Fatal",
            )

        # check env variables
        MSPASS_DB_ADDRESS = os.environ.get("MSPASS_DB_ADDRESS")
        MONGODB_PORT = os.environ.get("MONGODB_PORT")
        MSPASS_SCHEDULER = os.environ.get("MSPASS_SCHEDULER")
        MSPASS_SCHEDULER_ADDRESS = os.environ.get("MSPASS_SCHEDULER_ADDRESS")
        DASK_SCHEDULER_PORT = os.environ.get("DASK_SCHEDULER_PORT")
        SPARK_MASTER_PORT = os.environ.get("SPARK_MASTER_PORT")

        # create a database client
        # priority: parameter -> env -> default
        database_host_has_port = False
        if database_host:
            database_address = database_host
            # check if database_host contains port number already
            if ":" in database_address:
                database_host_has_port = True

        elif MSPASS_DB_ADDRESS:
            database_address = MSPASS_DB_ADDRESS
        else:
            database_address = "localhost"
        # add port
        if not database_host_has_port and MONGODB_PORT:
            database_address += ":" + MONGODB_PORT

        try:
            self._db_client = DBClient(database_address)
            self._db_client.server_info()
        except Exception as err:
            raise MsPASSError(
                "Runntime error: cannot create a database client with: " +
                database_address,
                "Fatal",
            )

        # set default database name
        self._default_database_name = database_name
        self._default_schema = schema
        self._default_collection = collection

        # create a Global History Manager
        if schema:
            global_history_manager_db = Database(self._db_client,
                                                 database_name,
                                                 db_schema=schema)
        else:
            global_history_manager_db = Database(self._db_client,
                                                 database_name)
        self._global_history_manager = GlobalHistoryManager(
            global_history_manager_db, job_name, collection=collection)

        # set scheduler
        if scheduler:
            self._scheduler = scheduler
        elif MSPASS_SCHEDULER:
            self._scheduler = MSPASS_SCHEDULER
        else:
            self._scheduler = "dask"

        # scheduler configuration
        if self._scheduler == "spark":
            scheduler_host_has_port = False
            if scheduler_host:
                self._spark_master_url = scheduler_host
                # add the spark:// prefix if not already present
                if "spark://" not in scheduler_host:
                    self._spark_master_url = "spark://" + self._spark_master_url
                # check if spark host address contains port number already
                if self._spark_master_url.count(":") == 2:
                    scheduler_host_has_port = True

            elif MSPASS_SCHEDULER_ADDRESS:
                self._spark_master_url = MSPASS_SCHEDULER_ADDRESS
                # add the spark:// prefix if not already present
                if "spark://" not in MSPASS_SCHEDULER_ADDRESS:
                    self._spark_master_url = "spark://" + self._spark_master_url
            else:
                self._spark_master_url = "local"

            # add the port number when:
            # 1. the master is not the default 'local'
            # 2. the host (scheduler_host or MSPASS_SCHEDULER_ADDRESS) does not already contain a port number
            # 3. SPARK_MASTER_PORT exists
            if ((scheduler_host or MSPASS_SCHEDULER_ADDRESS)
                    and not scheduler_host_has_port and SPARK_MASTER_PORT):
                self._spark_master_url += ":" + SPARK_MASTER_PORT

            # sanity check
            try:
                spark = (SparkSession.builder.appName("mspass").master(
                    self._spark_master_url).getOrCreate())
                self._spark_context = spark.sparkContext
            except Exception as err:
                raise MsPASSError(
                    "Runntime error: cannot create a spark configuration with: "
                    + self._spark_master_url,
                    "Fatal",
                )

        elif self._scheduler == "dask":
            # if no scheduler_host is defined and MSPASS_SCHEDULER_ADDRESS is not set, use a local cluster to create a client
            if not scheduler_host and not MSPASS_SCHEDULER_ADDRESS:
                self._dask_client = DaskClient()
            else:
                scheduler_host_has_port = False
                # set host
                if scheduler_host:
                    self._dask_client_address = scheduler_host
                    # check if scheduler_host contains port number already
                    if ":" in scheduler_host:
                        scheduler_host_has_port = True
                else:
                    self._dask_client_address = MSPASS_SCHEDULER_ADDRESS

                # add the port only when the host does not already contain one
                if not scheduler_host_has_port:
                    if DASK_SCHEDULER_PORT:
                        self._dask_client_address += ":" + DASK_SCHEDULER_PORT
                    else:
                        # use port 8786 by default if not specified
                        self._dask_client_address += ":8786"
                # sanity check
                try:
                    self._dask_client = DaskClient(self._dask_client_address)
                except Exception as err:
                    raise MsPASSError(
                        "Runntime error: cannot create a dask client with: " +
                        self._dask_client_address,
                        "Fatal",
                    )
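A minimal usage sketch for the constructor above. It assumes this is the __init__ of mspasspy.client.Client (the import path is an assumption) and that the keyword names match the parameters referenced in the body (database_name, scheduler, scheduler_host); it is an illustration, not part of the source.

# Hypothetical usage sketch (not from the source above).
from mspasspy.client import Client  # assumed import path

client = Client(
    database_name="test_db",         # stored as self._default_database_name
    scheduler="dask",                # selects the dask branch of the setup code
    scheduler_host="localhost:8786", # already contains a port, so none is appended
)
db = client.get_database()  # default Database built from the internal DBClient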
Example #19
class TestDBClean:
    def setup_class(self):
        client = DBClient("localhost")
        self.db = Database(client, "test_dbclean")

        self.test_ts = get_live_timeseries()
        site_id = ObjectId()
        channel_id = ObjectId()
        source_id = ObjectId()
        self.db["site"].insert_one({
            "_id": site_id,
            "net": "net",
            "sta": "sta",
            "loc": "loc",
            "lat": 1.0,
            "lon": 1.0,
            "elev": 2.0,
            "starttime": datetime.utcnow().timestamp(),
            "endtime": datetime.utcnow().timestamp(),
        })
        self.db["channel"].insert_one({
            "_id":
            channel_id,
            "net":
            "net1",
            "sta":
            "sta1",
            "loc":
            "loc1",
            "chan":
            "chan",
            "lat":
            1.1,
            "lon":
            1.1,
            "elev":
            2.1,
            "starttime":
            datetime.utcnow().timestamp(),
            "endtime":
            datetime.utcnow().timestamp(),
            "edepth":
            3.0,
            "vang":
            1.0,
            "hang":
            1.0,
        })
        self.db["source"].insert_one({
            "_id": source_id,
            "lat": 1.2,
            "lon": 1.2,
            "time": datetime.utcnow().timestamp(),
            "depth": 3.1,
            "magnitude": 1.0,
        })
        self.test_ts["site_id"] = site_id
        self.test_ts["source_id"] = source_id
        self.test_ts["channel_id"] = channel_id

    def test_rename_list_to_dict(self):
        rlist = ["a:1", "b:2", "c:3"]
        result = dbclean.rename_list_to_dict(rlist)
        assert len(result) == 3
        assert result == {"a": "1", "b": "2", "c": "3"}

        rlist = ["a:1:2"]
        with pytest.raises(SystemExit) as e:
            dbclean.rename_list_to_dict(rlist)
        assert e.type == SystemExit
        assert e.value.code == -1

    def test_main(self):
        self.db["wf_TimeSeries"].delete_many({})
        ts1 = copy.deepcopy(self.test_ts)
        ts2 = copy.deepcopy(self.test_ts)
        ts3 = copy.deepcopy(self.test_ts)
        logging_helper.info(ts1, "1", "deepcopy")
        logging_helper.info(ts2, "1", "deepcopy")
        logging_helper.info(ts3, "1", "deepcopy")

        # fix types
        ts1["npts"] = "123"
        ts2["delta"] = "3"
        ts3["npts"] = "xyz"

        save_res_code = self.db.save_data(ts1,
                                          mode="promiscuous",
                                          storage_mode="gridfs",
                                          exclude_keys=["extra2"])
        save_res_code = self.db.save_data(ts2,
                                          mode="promiscuous",
                                          storage_mode="gridfs",
                                          exclude_keys=["extra2"])
        save_res_code = self.db.save_data(ts3,
                                          mode="promiscuous",
                                          storage_mode="gridfs",
                                          exclude_keys=["extra2"])

        # no clean operation specified: main should exit with code -1
        with pytest.raises(SystemExit) as e:
            dbclean.main(["test_dbclean", "wf_TimeSeries"])
        assert e.type == SystemExit
        assert e.value.code == -1

        # delete starttime attribute
        # rename calib to rename_calib
        dbclean.main([
            "test_dbclean",
            "wf_TimeSeries",
            "-ft",
            "-d",
            "starttime",
            "-r",
            "calib:rename_calib",
        ])

        res1 = self.db["wf_TimeSeries"].find_one({"_id": ts1["_id"]})
        res2 = self.db["wf_TimeSeries"].find_one({"_id": ts2["_id"]})
        res3 = self.db["wf_TimeSeries"].find_one({"_id": ts3["_id"]})

        assert res1["npts"] == 123
        assert "starttime" not in res1
        assert "calib" not in res1
        assert "rename_calib" in res1

        assert res2["delta"] == 3.0
        assert "starttime" not in res2
        assert "calib" not in res2
        assert "rename_calib" in res2

        # can't be fixed
        assert res3["npts"] == "xyz"
        assert "starttime" not in res3
        assert "calib" not in res3
        assert "rename_calib" in res3

        self.db["wf_TimeSeries"].delete_many({})
        ts1 = copy.deepcopy(self.test_ts)
        ts2 = copy.deepcopy(self.test_ts)
        ts3 = copy.deepcopy(self.test_ts)
        logging_helper.info(ts1, "1", "deepcopy")
        logging_helper.info(ts2, "1", "deepcopy")
        logging_helper.info(ts3, "1", "deepcopy")

        # fix types
        ts1["npts"] = "123"
        ts2["delta"] = "3"
        ts3["npts"] = "xyz"

        save_res_code = self.db.save_data(ts1,
                                          mode="promiscuous",
                                          storage_mode="gridfs",
                                          exclude_keys=["extra2"])
        save_res_code = self.db.save_data(ts2,
                                          mode="promiscuous",
                                          storage_mode="gridfs",
                                          exclude_keys=["extra2"])
        save_res_code = self.db.save_data(ts3,
                                          mode="promiscuous",
                                          storage_mode="gridfs",
                                          exclude_keys=["extra2"])

        # only fix types; re-fetch the documents because the earlier
        # res1/res2/res3 refer to the documents deleted at the start of this block
        dbclean.main(["test_dbclean", "wf_TimeSeries", "-ft"])
        res1 = self.db["wf_TimeSeries"].find_one({"_id": ts1["_id"]})
        res2 = self.db["wf_TimeSeries"].find_one({"_id": ts2["_id"]})
        res3 = self.db["wf_TimeSeries"].find_one({"_id": ts3["_id"]})
        assert res1["npts"] == 123
        assert res2["delta"] == 3.0
        # "xyz" cannot be converted to an int, so it is left unchanged
        assert res3["npts"] == "xyz"
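The tests above pin down the contract of dbclean.rename_list_to_dict: each "old:new" token becomes one dictionary entry, and a token that does not split into exactly two fields aborts with exit code -1. A minimal sketch consistent with that behavior (not the library's actual implementation) might look like this:

import sys

def rename_list_to_dict_sketch(rlist):
    # Parse ["old:new", ...] into {"old": "new", ...}; exit(-1) on a malformed
    # token, matching what test_rename_list_to_dict asserts for the real function.
    result = {}
    for token in rlist:
        parts = token.split(":")
        if len(parts) != 2:
            print("rename argument is not of the form old:new:", token)
            sys.exit(-1)
        result[parts[0]] = parts[1]
    return result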
Example #20
def main(args=None):
    """ """
    if args is None:
        args = sys.argv[1:]
    parser = argparse.ArgumentParser(
        prog="dbclean",
        usage=
        "%(prog)s dbname collection [-ft] [-d k1 ...] [-r kold:knew ... ] [-v] [-h]",
        description="MsPASS program to fix most errors detected by dbverify",
    )
    parser.add_argument("dbname",
                        metavar="dbname",
                        type=str,
                        help="MongoDB database name to be fixed")
    parser.add_argument(
        "collection",
        metavar="collection",
        type=str,
        help="MongoDB collection name to be fixed",
    )
    parser.add_argument(
        "-ft",
        "--fixtypes",
        action="store_true",
        help="Enable automatic type mismatch repair",
    )
    parser.add_argument(
        "-d",
        "--delete",
        nargs="*",
        default=[],
        help="List of keys of key-value pairs to be deleted from all documents",
    )
    parser.add_argument(
        "-r",
        "--rename",
        nargs="*",
        default=[],
        help=
        "Change the keys of documents using pattern defined in args of form oldkey:newkey",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="When used be echo each fix - default works silently",
    )

    args = parser.parse_args(args)
    dbname = args.dbname
    collection = args.collection
    fixtypes = args.fixtypes
    delete = args.delete
    rename = args.rename
    verbose = args.verbose

    # Not a very robust way to detect these conditions, but it should work.
    # It assumes argparse returns an empty list when a list-valued option is
    # not given on the command line.
    if len(delete) > 0:
        enable_deletion = True
    else:
        enable_deletion = False
    if len(rename) > 0:
        enable_rename = True
    else:
        enable_rename = False
    if not (fixtypes or enable_deletion or enable_rename):
        print("Usage error:  you must define at least one clean operation")
        print("Type:  dbclean --help to get usage help")
        exit(-1)

    if enable_rename:
        rename_map = rename_list_to_dict(rename)

    dbclient = DBClient()
    db = Database(dbclient, dbname)
    print("Starting processing of ", collection,
          " collection of database named=", dbname)

    # Intentionally do the delete and rename operations before the type check
    # so that any key can be cleaned. The dicts returned below accumulate
    # counts of edits for each key.

    if enable_deletion:
        delcounts = db._delete_attributes(collection, delete, verbose=verbose)
        print("delete processing compeleted on collection=", collection)
        print("Number of documents changed for each key requested follow:")
        print(json_util.dumps(delcounts, indent=4))
    if enable_rename:
        repcounts = db._rename_attributes(collection,
                                          rename_map,
                                          verbose=verbose)
        print("rename processing compeleted on collection=", collection)
        print("Here is the set of changes requested:")
        print(json_util.dumps(rename_map))
        print("Number of documents changed for each key requested follow:")
        print(json_util.dumps(repcounts, indent=4))
    if fixtypes:
        fixcounts = db._fix_attribute_types(collection, verbose=verbose)
        print("fixtype processing compeleted on collection=", collection)
        print("Keys of documents changed and number changed follow:")
        print(json_util.dumps(fixcounts, indent=4))
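For reference, calling main programmatically passes the same argv list that argparse would receive on the command line; the sketch below repeats a combination already exercised in TestDBClean.test_main and is illustrative only.

# Roughly equivalent to the command-line usage in the parser's usage string:
#   dbclean test_dbclean wf_TimeSeries -ft -d starttime -r calib:rename_calib
main([
    "test_dbclean",              # dbname
    "wf_TimeSeries",             # collection
    "-ft",                       # repair type mismatches
    "-d", "starttime",           # delete the starttime attribute
    "-r", "calib:rename_calib",  # rename calib -> rename_calib
])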
Example #21
class TestDBVerify:
    def setup_class(self):
        client = DBClient("localhost")
        self.db = Database(client, "test_dbverify")

        self.test_ts = get_live_timeseries()
        site_id = ObjectId()
        channel_id = ObjectId()
        source_id = ObjectId()
        self.db["site"].insert_one(
            {
                "_id": site_id,
                "net": "net",
                "sta": "sta",
                "loc": "loc",
                "lat": 1.0,
                "lon": 1.0,
                "elev": 2.0,
                "starttime": datetime.utcnow().timestamp(),
                "endtime": datetime.utcnow().timestamp(),
            }
        )
        self.db["channel"].insert_one(
            {
                "_id": channel_id,
                "net": "net1",
                "sta": "sta1",
                "loc": "loc1",
                "chan": "chan",
                "lat": 1.1,
                "lon": 1.1,
                "elev": 2.1,
                "starttime": datetime.utcnow().timestamp(),
                "endtime": datetime.utcnow().timestamp(),
                "edepth": 3.0,
                "vang": 1.0,
                "hang": 1.0,
            }
        )
        self.db["source"].insert_one(
            {
                "_id": source_id,
                "lat": 1.2,
                "lon": 1.2,
                "time": datetime.utcnow().timestamp(),
                "depth": 3.1,
                "magnitude": 1.0,
            }
        )
        self.test_ts["site_id"] = site_id
        self.test_ts["source_id"] = source_id
        self.test_ts["channel_id"] = channel_id

    def test_main(self, capfd):
        self.db["wf_TimeSeries"].delete_many({})
        ts1 = copy.deepcopy(self.test_ts)
        ts2 = copy.deepcopy(self.test_ts)
        ts3 = copy.deepcopy(self.test_ts)
        logging_helper.info(ts1, "1", "deepcopy")
        logging_helper.info(ts2, "1", "deepcopy")
        logging_helper.info(ts3, "1", "deepcopy")

        # fix types
        ts1["npts"] = "123"
        ts1["extra1"] = "extra1"
        ts2["delta"] = "3"
        ts2["extra2"] = "extra2"
        ts3["npts"] = "xyz"
        ts3["extra2"] = "extra2"
        # wrong normalized key
        ts1["site_id"] = ObjectId()
        ts2.erase("source_id")

        save_res_code = self.db.save_data(
            ts1, mode="promiscuous", storage_mode="gridfs"
        )
        save_res_code = self.db.save_data(
            ts2, mode="promiscuous", storage_mode="gridfs"
        )
        # erase required attributes
        save_res_code = self.db.save_data(
            ts3, mode="promiscuous", storage_mode="gridfs", exclude_keys=["starttime"]
        )
        doc1 = self.db["wf_TimeSeries"].find_one({"_id": ts1["_id"]})
        doc2 = self.db["wf_TimeSeries"].find_one({"_id": ts2["_id"]})
        doc3 = self.db["wf_TimeSeries"].find_one({"_id": ts3["_id"]})
        doc1_str = json_util.dumps(doc1, indent=2)
        doc2_str = json_util.dumps(doc2, indent=2)
        doc3_str = json_util.dumps(doc3, indent=2)

        # default normalization test
        dbverify.main(["test_dbverify", "-t", "normalization"])
        out, err = capfd.readouterr()
        assert (
            out
            == "normalization test on normalized key= site_id  found problems\nFound broken links in  1 documents checked\nNote error count limit= 1000\nIf the count is the same it means all data probably contain missing cross referencing ids\nRun in verbose mode to find out more information you will need to fix the problem\ncheck_links found no broken links with normalized key= channel_id\ncheck_links found no broken links with normalized key= source_id\n"
        )

        # more than 1 collection to test
        dbverify.main(
            ["test_dbverify", "-t", "normalization", "-c", "wf_TimeSeries", "site"]
        )
        out, err = capfd.readouterr()
        assert (
            out
            == "WARNING:  normalization test can only be run on one collection at a time\nParsed a list with the following contents:   ['wf_TimeSeries', 'site']\nRunning test on the first item in that list\nnormalization test on normalized key= site_id  found problems\nFound broken links in  1 documents checked\nNote error count limit= 1000\nIf the count is the same it means all data probably contain missing cross referencing ids\nRun in verbose mode to find out more information you will need to fix the problem\ncheck_links found no broken links with normalized key= channel_id\ncheck_links found no broken links with normalized key= source_id\n"
        )

        # verbose mode
        dbverify.main(["test_dbverify", "-t", "normalization", "-v"])
        out, err = capfd.readouterr()
        assert (
            out
            == "check_link found the following docs in  wf_TimeSeries  with broken links to  site_id\n////////////////Doc number  1  with error///////////////\n"
            + doc1_str
            + "\n////////////////////////////////////////////////////////\ncheck_links found no undefined linking key to normalized key= site_id\ncheck_links found no broken links with normalized key= channel_id\ncheck_links found no undefined linking key to normalized key= channel_id\ncheck_links found no broken links with normalized key= source_id\ncheck_link found the following docs in  wf_TimeSeries  with undefined link keys to  source_id\n////////////////Doc number  1  with error///////////////\n"
            + doc2_str
            + "\n////////////////////////////////////////////////////////\n"
        )

        # default required test
        dbverify.main(["test_dbverify", "-t", "required"])
        out, err = capfd.readouterr()
        mmkeys = {"npts": 2, "delta": 1}
        mm_keys_str = json_util.dumps(mmkeys, indent=2)
        undef_keys = {"starttime": 1}
        undef_keys_str = json_util.dumps(undef_keys, indent=2)
        assert (
            out
            == "////Results from run_check_required on collection= wf_TimeSeries\nCollection found  3  documents with type inconsistencies\nOffending keys and number found follow:\n"
            + mm_keys_str
            + "\nCollection found  1  documents with required keys that were not defined\nOffending keys and number found follow:\n"
            + undef_keys_str
            + "\n"
        )

        # default schema_check test
        dbverify.main(["test_dbverify", "-t", "schema_check"])
        out, err = capfd.readouterr()
        mmkeys = {"npts": 2, "delta": 1}
        mm_keys_str = json_util.dumps(mmkeys, indent=2)
        undef_keys = {"extra1": 1, "extra2": 2}
        undef_keys_str = json_util.dumps(undef_keys, indent=2)
        assert (
            out
            == "check_attribute_types result for collection= wf_TimeSeries\nCollection found  3  documents with type inconsistencies\nOffending keys and number found follow:\n"
            + mm_keys_str
            + "\nCollection found  3  documents with keys not defined in the schema\nOffending keys and number found follow:\n"
            + undef_keys_str
            + "\n"
        )