Exemplo n.º 1
0
    def set_database_client(self, database_host, database_port=None):
        """
        Replace the current database client with one connected to
        database_host (and optionally database_port).

        The new client is sanity checked with ``server_info``; on failure
        the previous client is restored before the error is raised, so the
        object is left in a usable state.

        :param database_host: the host address of database client; may
            already contain a port in the form ``host:port``
        :type database_host: :class:`str`
        :param database_port: the port of database client; ignored when
            ``database_host`` already carries a port
        :type database_port: :class:`str`
        :raises MsPASSError: if a working client cannot be created for the
            resolved address
        """
        database_address = database_host
        # add the port only when the host string does not already contain one
        if ":" not in database_host and database_port:
            database_address += ":" + database_port
        # sanity check; remember the old client so we can roll back on failure
        temp_db_client = self._db_client
        try:
            self._db_client = DBClient(database_address)
            self._db_client.server_info()
        except Exception as err:
            # restore the _db_client before reporting the failure
            self._db_client = temp_db_client
            raise MsPASSError(
                "Runtime error: cannot create a database client with: " +
                database_address,
                "Fatal",
            ) from err
Exemplo n.º 2
0
    def setup_class(self):
        """
        Build a test_dbclean database containing one site, one channel and
        one source document, and attach their ObjectIds to a live
        TimeSeries test fixture.
        """
        client = DBClient("localhost")
        self.db = Database(client, "test_dbclean")

        self.test_ts = get_live_timeseries()
        site_oid = ObjectId()
        chan_oid = ObjectId()
        src_oid = ObjectId()
        site_doc = {
            "_id": site_oid,
            "net": "net",
            "sta": "sta",
            "loc": "loc",
            "lat": 1.0,
            "lon": 1.0,
            "elev": 2.0,
            "starttime": datetime.utcnow().timestamp(),
            "endtime": datetime.utcnow().timestamp(),
        }
        self.db["site"].insert_one(site_doc)
        chan_doc = {
            "_id": chan_oid,
            "net": "net1",
            "sta": "sta1",
            "loc": "loc1",
            "chan": "chan",
            "lat": 1.1,
            "lon": 1.1,
            "elev": 2.1,
            "starttime": datetime.utcnow().timestamp(),
            "endtime": datetime.utcnow().timestamp(),
            "edepth": 3.0,
            "vang": 1.0,
            "hang": 1.0,
        }
        self.db["channel"].insert_one(chan_doc)
        src_doc = {
            "_id": src_oid,
            "lat": 1.2,
            "lon": 1.2,
            "time": datetime.utcnow().timestamp(),
            "depth": 3.1,
            "magnitude": 1.0,
        }
        self.db["source"].insert_one(src_doc)
        # link the fixture waveform to the three normalization documents
        self.test_ts["site_id"] = site_oid
        self.test_ts["source_id"] = src_oid
        self.test_ts["channel_id"] = chan_oid
Exemplo n.º 3
0
    def setup_class(self):
        """
        Start from a clean test_manager database and build a
        GlobalHistoryManager bound to its history_global collection.
        """
        self.client = DBClient("localhost")
        self.client.drop_database("test_manager")
        db = Database(self.client, "test_manager")
        db["history_global"].drop_indexes()
        # wipe every remaining collection so each run starts from scratch
        for name in db.list_collection_names():
            db[name].delete_many({})

        self.manager = GlobalHistoryManager(
            db, "test_job", collection="history_global"
        )
Exemplo n.º 4
0
def main(args=None):
    """
    Command line entry point for the dbclean utility.

    Parses the command line, then applies the requested cleaning
    operations (attribute deletion, attribute renaming, and/or type
    mismatch repair) to one collection of a MsPASS database.  At least one
    of -ft, -d, or -r must be given or the program exits with an error.

    :param args: argument list to parse (defaults to ``sys.argv[1:]``)
    """
    if args is None:
        args = sys.argv[1:]
    parser = argparse.ArgumentParser(
        prog="dbclean",
        usage=
        "%(prog)s dbname collection [-ft] [-d k1 ...] [-r kold:knew ... ] [-v] [-h]",
        description="MsPASS program to fix most errors detected by dbverify",
    )
    parser.add_argument(
        "dbname",
        metavar="dbname",
        type=str,
        help="MongoDB database name to be fixed",
    )
    parser.add_argument(
        "collection",
        metavar="collection",
        type=str,
        help="MongoDB collection name to be fixed",
    )
    parser.add_argument(
        "-ft",
        "--fixtypes",
        action="store_true",
        help="Enable automatic type mismatch repair",
    )
    parser.add_argument(
        "-d",
        "--delete",
        nargs="*",
        default=[],
        help="List of keys of key-value pairs to be deleted from all documents",
    )
    parser.add_argument(
        "-r",
        "--rename",
        nargs="*",
        default=[],
        help=
        "Change the keys of documents using pattern defined in args of form oldkey:newkey",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="When used be echo each fix - default works silently",
    )

    args = parser.parse_args(args)
    dbname = args.dbname
    collection = args.collection
    fixtypes = args.fixtypes
    delete = args.delete
    rename = args.rename
    verbose = args.verbose

    # argparse is configured to return empty lists for -d/-r when the
    # options are not given, so truthiness reliably detects whether each
    # operation was requested
    enable_deletion = bool(delete)
    enable_rename = bool(rename)
    if not (fixtypes or enable_deletion or enable_rename):
        print("Usage error:  you must define at least one clean operation")
        print("Type:  dbclean --help to get usage help")
        sys.exit(-1)

    if enable_rename:
        rename_map = rename_list_to_dict(rename)

    dbclient = DBClient()
    db = Database(dbclient, dbname)
    print("Starting processing of ", collection,
          " collection of database named=", dbname)

    # Intentionally do the delete and rename operations before
    # a type check to allow cleaning any keys. The set of dicts below
    # accumulate counts of edits for each key

    if enable_deletion:
        delcounts = db._delete_attributes(collection, delete, verbose=verbose)
        print("delete processing completed on collection=", collection)
        print("Number of documents changed for each key requested follow:")
        print(json_util.dumps(delcounts, indent=4))
    if enable_rename:
        repcounts = db._rename_attributes(collection,
                                          rename_map,
                                          verbose=verbose)
        print("rename processing completed on collection=", collection)
        print("Here is the set of changes requested:")
        print(json_util.dumps(rename_map))
        print("Number of documents changed for each key requested follow:")
        print(json_util.dumps(repcounts, indent=4))
    if fixtypes:
        fixcounts = db._fix_attribute_types(collection, verbose=verbose)
        print("fixtype processing completed on collection=", collection)
        print("Keys of documents changed and number changed follow:")
        print(json_util.dumps(fixcounts, indent=4))
Exemplo n.º 5
0
 def setup_class(self):
     """Create two clients: one from a full URI, one from a bare hostname."""
     self.c1, self.c2 = (
         DBClient("mongodb://localhost/my_database"),
         DBClient("localhost"),
     )
Exemplo n.º 6
0
    def __init__(
        self,
        database_host=None,
        scheduler=None,
        scheduler_host=None,
        job_name="mspass",
        database_name="mspass",
        schema=None,
        collection=None,
    ):
        """
        Validate arguments, connect a database client, build a global
        history manager, and configure a dask or spark scheduler.

        Each setting is resolved with the priority: explicit parameter ->
        environment variable -> built-in default.

        :param database_host: database host address, optionally ``host:port``
        :param scheduler: either ``"dask"`` or ``"spark"``
        :param scheduler_host: scheduler host address
        :param job_name: job name recorded by the global history manager
        :param database_name: default database name
        :param schema: optional database schema passed to ``Database``
        :param collection: collection name used by the global history manager
        :raises MsPASSError: for invalid argument types/values or when the
            database client, spark session, or dask client cannot be created
        """
        # database_host should be a string
        if database_host is not None and not type(database_host) is str:
            raise MsPASSError(
                "database_host should be a string but " +
                str(type(database_host)) + " is found.",
                "Fatal",
            )
        # scheduler is restricted to the two supported frameworks
        if scheduler is not None and scheduler != "dask" and scheduler != "spark":
            raise MsPASSError(
                "scheduler should be either dask or spark but " +
                str(scheduler) + " is found.",
                "Fatal",
            )
        # scheduler_host should be a string
        if scheduler_host is not None and not type(scheduler_host) is str:
            raise MsPASSError(
                "scheduler_host should be a string but " +
                str(type(scheduler_host)) + " is found.",
                "Fatal",
            )
        # job_name should be a string
        if job_name is not None and not type(job_name) is str:
            raise MsPASSError(
                "job_name should be a string but " + str(type(job_name)) +
                " is found.",
                "Fatal",
            )
        # database_name should be a string
        if database_name is not None and not type(database_name) is str:
            raise MsPASSError(
                "database_name should be a string but " +
                str(type(database_name)) + " is found.",
                "Fatal",
            )
        # collection should be a string
        if collection is not None and type(collection) is not str:
            raise MsPASSError(
                "collection should be a string but " + str(type(collection)) +
                " is found.",
                "Fatal",
            )

        # check env variables (each is None when not set)
        MSPASS_DB_ADDRESS = os.environ.get("MSPASS_DB_ADDRESS")
        MONGODB_PORT = os.environ.get("MONGODB_PORT")
        MSPASS_SCHEDULER = os.environ.get("MSPASS_SCHEDULER")
        MSPASS_SCHEDULER_ADDRESS = os.environ.get("MSPASS_SCHEDULER_ADDRESS")
        DASK_SCHEDULER_PORT = os.environ.get("DASK_SCHEDULER_PORT")
        SPARK_MASTER_PORT = os.environ.get("SPARK_MASTER_PORT")

        # create a database client
        # priority: parameter -> env -> default
        database_host_has_port = False
        if database_host:
            database_address = database_host
            # check if database_host contains port number already
            if ":" in database_address:
                database_host_has_port = True

        elif MSPASS_DB_ADDRESS:
            database_address = MSPASS_DB_ADDRESS
        else:
            database_address = "localhost"
        # add port
        # NOTE(review): when the address came from MSPASS_DB_ADDRESS and
        # already contains a port, this appends a second port — presumably
        # env addresses are expected to be portless; confirm with callers
        if not database_host_has_port and MONGODB_PORT:
            database_address += ":" + MONGODB_PORT

        # sanity check that the client can actually reach a server
        try:
            self._db_client = DBClient(database_address)
            self._db_client.server_info()
        except Exception as err:
            raise MsPASSError(
                "Runntime error: cannot create a database client with: " +
                database_address,
                "Fatal",
            )

        # set default database name
        self._default_database_name = database_name
        self._default_schema = schema
        self._default_collection = collection

        # create a Global History Manager; only pass db_schema when a
        # schema was given so Database's own default applies otherwise
        if schema:
            global_history_manager_db = Database(self._db_client,
                                                 database_name,
                                                 db_schema=schema)
        else:
            global_history_manager_db = Database(self._db_client,
                                                 database_name)
        self._global_history_manager = GlobalHistoryManager(
            global_history_manager_db, job_name, collection=collection)

        # set scheduler (parameter -> env -> default "dask")
        if scheduler:
            self._scheduler = scheduler
        elif MSPASS_SCHEDULER:
            self._scheduler = MSPASS_SCHEDULER
        else:
            self._scheduler = "dask"

        # scheduler configuration
        if self._scheduler == "spark":
            scheduler_host_has_port = False
            if scheduler_host:
                self._spark_master_url = scheduler_host
                # add spark:// prefix if not exist
                if "spark://" not in scheduler_host:
                    self._spark_master_url = "spark://" + self._spark_master_url
                # check if spark host address contains port number already
                # (two colons means spark://host:port)
                if self._spark_master_url.count(":") == 2:
                    scheduler_host_has_port = True

            elif MSPASS_SCHEDULER_ADDRESS:
                self._spark_master_url = MSPASS_SCHEDULER_ADDRESS
                # add spark:// prefix if not exist
                if "spark://" not in MSPASS_SCHEDULER_ADDRESS:
                    self._spark_master_url = "spark://" + self._spark_master_url
            else:
                self._spark_master_url = "local"

            # add port number when all of the following hold:
            # 1. not the default 'local'
            # 2. scheduler_host and does not contain port number
            # 3. SPARK_MASTER_PORT exists
            if ((scheduler_host or MSPASS_SCHEDULER_ADDRESS)
                    and not scheduler_host_has_port and SPARK_MASTER_PORT):
                self._spark_master_url += ":" + SPARK_MASTER_PORT

            # sanity check: try to build a session against the master url
            try:
                spark = (SparkSession.builder.appName("mspass").master(
                    self._spark_master_url).getOrCreate())
                self._spark_context = spark.sparkContext
            except Exception as err:
                raise MsPASSError(
                    "Runntime error: cannot create a spark configuration with: "
                    + self._spark_master_url,
                    "Fatal",
                )

        elif self._scheduler == "dask":
            # if no defined scheduler_host and no MSPASS_SCHEDULER_ADDRESS, use local cluster to create a client
            if not scheduler_host and not MSPASS_SCHEDULER_ADDRESS:
                self._dask_client = DaskClient()
            else:
                scheduler_host_has_port = False
                # set host
                if scheduler_host:
                    self._dask_client_address = scheduler_host
                    # check if scheduler_host contains port number already
                    if ":" in scheduler_host:
                        scheduler_host_has_port = True
                else:
                    self._dask_client_address = MSPASS_SCHEDULER_ADDRESS

                # add port
                if not scheduler_host_has_port and DASK_SCHEDULER_PORT:
                    self._dask_client_address += ":" + DASK_SCHEDULER_PORT
                else:
                    # use to port 8786 by default if not specified
                    self._dask_client_address += ":8786"
                # sanity check
                try:
                    self._dask_client = DaskClient(self._dask_client_address)
                except Exception as err:
                    raise MsPASSError(
                        "Runntime error: cannot create a dask client with: " +
                        self._dask_client_address,
                        "Fatal",
                    )
Exemplo n.º 7
0
def main(args=None):
    """
    Command line entry point for the dbverify program.

    Parses the command line and runs exactly one verification test
    (normalization, required, or schema_check) on the selected
    collection(s) of a MsPASS database.

    :param args: argument list to parse (defaults to ``sys.argv[1:]``)
    """
    # As a script that would be run from the shell we let
    # any functions below that throw exception do so and assume they
    # will write a message that can help debug what went wrong
    if args is None:
        args = sys.argv[1:]
    parser = argparse.ArgumentParser(
        prog="dbverify",
        usage=
        "%(prog)s dbname [-t TEST -c [collection ...] -n [normalize ... ] -error_limit n -v]",
        description="MsPASS database verify program",
    )
    parser.add_argument(
        "dbname",
        metavar="dbname",
        type=str,
        help="MongoDB database name on which to run tests",
    )
    parser.add_argument(
        "-t",
        "--test",
        action="store",
        type=str,
        default="normalization",
        help="Select which test to run.  " +
        "Current options:  normalization, required, schema_check",
    )
    parser.add_argument(
        "-c",
        "--collection",
        action="store",
        nargs="*",
        default=["wf_TimeSeries"],
        help="Collection(s) on which the test is to be run.  " +
        "Only schema_check supports multiple collections in one run",
    )
    parser.add_argument(
        "-n",
        "--normalize",
        nargs="*",
        default=["site_id", "channel_id", "source_id"],
        help="List of normalization keys to test\n" +
        "(Used only for -test normalization option)",
    )
    parser.add_argument(
        "-r",
        "--require",
        nargs="*",
        default=[],
        help="List of keys of required attributes for required test",
    )
    parser.add_argument(
        "-e",
        "--error_limit",
        action="store",
        type=int,
        default=1000,
        help="Set error limit - stop checking when this many errors are found\n"
        + "Default is 1000",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help=
        "When used print offending values.  Otherwise just return a summary",
    )

    args = parser.parse_args(args)
    test_to_run = args.test
    dbname = args.dbname
    dbclient = DBClient()
    db = Database(dbclient, dbname)
    col_to_test = args.collection
    normalize = args.normalize
    reqlist = args.require
    verbose = args.verbose
    elimit = args.error_limit

    # If python had a switch case it would be used here.  this
    # is the list of known tests.  the program can only run one
    # test per execution.  Intentional to make output more readable
    if test_to_run == "normalization":
        if len(col_to_test) > 1:
            print(
                "WARNING:  normalization test can only be run on one collection at a time"
            )
            print("Parsed a list with the following contents:  ", col_to_test)
            print("Running test on the first item in that list")
        col = col_to_test[0]
        if not isinstance(col, str):
            print("Invalid value parsed for -c option=", col)
            sys.exit(-1)
        run_check_links(db, col, normalize, elimit, verbose)
    elif test_to_run == "required":
        if len(col_to_test) > 1:
            print(
                "WARNING:  required test can only be run on one collection at a time"
            )
            print("Parsed a list with the following contents:  ", col_to_test)
            print("Running test on the first item in that list")
        col = col_to_test[0]
        if not isinstance(col, str):
            print("Invalid value parsed for -c option=", col)
            sys.exit(-1)
        if len(reqlist) == 0:
            # Depends on default being an empty list. For default
            # case run this small function.
            # This is currently a function above with const list values
            # returned for each known collection.  It may eventually
            # be replaced a function using the schema
            required_list = get_required(col)
        else:
            required_list = reqlist
        run_check_required(db, col, required_list, elimit, verbose)
    elif test_to_run == "schema_check":
        for col in col_to_test:
            run_check_attribute_types(db, col, elimit, verbose)
    else:
        print("Unrecognized value for --test value parsed=", test_to_run)
        print("Must be one of:  normalization, required, or schema_check")