def initFromCatalog(dbURL, utility=False, useAllSegmentFileSpaces=False):
    """
    Factory method: initialize a HAWQArray from the database at dbURL.

    Note on useAllSegmentFileSpaces: when True this method adds *all*
    filespaces to the segments of the hawqarray; when False only
    Master/Standby carry all filespaces.  This is *hacky* and we know it is
    not the right way to design methods/interfaces; it is kept this way so
    that we do not affect the behavior of existing tools such as upgrade and
    gprecoverseg.

    @param dbURL: dbconn URL of the database to interrogate
    @param utility: connect in utility mode when True
    @return: a populated HAWQArray
    @raise Exception: when the server is not HAWQ 2.x
    """
    conn = dbconn.connect(dbURL, utility)
    try:
        # Read the server version string (execSQL yields rows; one expected).
        version_str = None
        for row in dbconn.execSQL(conn, "SELECT version()"):
            version_str = row[0]
        version = GpVersion(version_str)

        # Only HAWQ 2.x is supported -- guard clause instead of if/else.
        if version.getVersionRelease().split('.')[0] != '2':
            raise Exception("HAWQ version is invalid: %s" % version)

        # Data directories come from hawq-site.xml, not from the catalog.
        hawq_site = HawqXMLParser(GPHOME)
        master_data_directory = hawq_site.get_value_from_name(
            'hawq_master_directory')
        segment_data_directory = hawq_site.get_value_from_name(
            'hawq_segment_directory')

        # registration_order <= 0 marks master/standby entries, which live in
        # the master data directory; every other row is a segment.
        config_rows = dbconn.execSQL(
            conn, '''
            SELECT sc.registration_order, sc.role, sc.status, sc.hostname,
                   sc.address, sc.port,
                   CASE
                       WHEN sc.registration_order <= 0 THEN '%s'
                       ELSE '%s'
                   END AS datadir
            FROM pg_catalog.gp_segment_configuration sc
            ORDER BY sc.registration_order;''' %
            (master_data_directory, segment_data_directory))

        # All of filesystem is shared storage.
        filesystemRows = dbconn.execSQL(
            conn, '''
            SELECT oid, fsysname, true AS fsysshared
            FROM pg_filesystem
            ORDER BY fsysname
            ''')
        filesystemArr = [
            HAWQFilesystemObj(fsysRow[0], fsysRow[1], fsysRow[2])
            for fsysRow in filesystemRows
        ]

        # Skip the built-in system filespace; tie each filespace to its
        # filesystem object.
        filespaceRows = dbconn.execSQL(
            conn, '''
            SELECT oid, fsname, fsfsys AS fsoid
            FROM pg_filespace
            WHERE oid != %d
            ORDER BY fsname;
            ''' % (SYSTEM_FILESPACE))
        filespaceArr = [
            HAWQFilespaceObj(
                fsRow[0], fsRow[1],
                HAWQFilesystemObj.getFilesystemObj(filesystemArr, fsRow[2]))
            for fsRow in filespaceRows
        ]

        # One HAWQDB per gp_segment_configuration row.
        hawqdbs = []
        for (registration_order, role, status, hostname,
             address, port, datadir) in config_rows:
            hawqdbs.append(HAWQDB(registration_order, role, status,
                                  hostname, address, port, datadir))
    finally:
        # Close the connection even when the version check or a query raises;
        # the original leaked the connection on any error path.
        conn.close()

    array = HAWQArray(hawqdbs)
    array.version = version
    array.setFilespaces(filespaceArr)
    array.setFilesystem(filesystemArr)
    return array
def initFromCatalog(dbURL, utility=False, useAllSegmentFileSpaces=False):
    """
    Factory method: initialize a HAWQArray from the database at dbURL.

    Note on useAllSegmentFileSpaces: when True this method adds *all*
    filespaces to the segments of the hawqarray; when False only
    Master/Standby carry all filespaces.  This is *hacky* and we know it is
    not the right way to design methods/interfaces; it is kept this way so
    that we do not affect the behavior of existing tools such as upgrade and
    gprecoverseg.

    @param dbURL: dbconn URL of the database to interrogate
    @param utility: connect in utility mode when True
    @return: a populated HAWQArray
    @raise Exception: when the server is not HAWQ 2.0
    """
    conn = dbconn.connect(dbURL, utility)
    try:
        # Read the server version string (execSQL yields rows; one expected).
        version_str = None
        for row in dbconn.execSQL(conn, "SELECT version()"):
            version_str = row[0]
        version = GpVersion(version_str)

        # Only for HAWQ 2.0.
        # BUG FIX: the original tested `in ("2.0")`, but ("2.0") is just the
        # string "2.0" (a one-element tuple needs a trailing comma), so `in`
        # performed a *substring* test -- releases "2", ".", "0", "2.",
        # and ".0" would all have passed.  Use a real tuple.
        if version.getVersionRelease() not in ("2.0",):
            raise Exception("HAWQ version is invalid: %s" % version)

        # Data directories come from hawq-site.xml, not from the catalog.
        hawq_site = HawqXMLParser(GPHOME)
        master_data_directory = hawq_site.get_value_from_name(
            'hawq_master_directory')
        segment_data_directory = hawq_site.get_value_from_name(
            'hawq_segment_directory')

        # registration_order <= 0 marks master/standby entries, which live in
        # the master data directory; every other row is a segment.
        config_rows = dbconn.execSQL(
            conn, '''
            SELECT sc.registration_order, sc.role, sc.status, sc.hostname,
                   sc.address, sc.port,
                   CASE
                       WHEN sc.registration_order <= 0 THEN '%s'
                       ELSE '%s'
                   END AS datadir
            FROM pg_catalog.gp_segment_configuration sc
            ORDER BY sc.registration_order;''' %
            (master_data_directory, segment_data_directory))

        # All of filesystem is shared storage.
        filesystemRows = dbconn.execSQL(
            conn, '''
            SELECT oid, fsysname, true AS fsysshared
            FROM pg_filesystem
            ORDER BY fsysname
            ''')
        filesystemArr = [
            HAWQFilesystemObj(fsysRow[0], fsysRow[1], fsysRow[2])
            for fsysRow in filesystemRows
        ]

        # Skip the built-in system filespace; tie each filespace to its
        # filesystem object.
        filespaceRows = dbconn.execSQL(
            conn, '''
            SELECT oid, fsname, fsfsys AS fsoid
            FROM pg_filespace
            WHERE oid != %d
            ORDER BY fsname;
            ''' % (SYSTEM_FILESPACE))
        filespaceArr = [
            HAWQFilespaceObj(
                fsRow[0], fsRow[1],
                HAWQFilesystemObj.getFilesystemObj(filesystemArr, fsRow[2]))
            for fsRow in filespaceRows
        ]

        # One HAWQDB per gp_segment_configuration row.
        hawqdbs = []
        for (registration_order, role, status, hostname,
             address, port, datadir) in config_rows:
            hawqdbs.append(HAWQDB(registration_order, role, status,
                                  hostname, address, port, datadir))
    finally:
        # Close the connection even when the version check or a query raises;
        # the original leaked the connection on any error path.
        conn.close()

    array = HAWQArray(hawqdbs)
    array.version = version
    array.setFilespaces(filespaceArr)
    array.setFilesystem(filesystemArr)
    return array
class GPCatalog():
    """
    Catalog is a container class that contains a dictionary of CatalogTable
    objects.

    It provides the CatalogTables with a context that they can use to
    refer to other CatalogTables (e.g. describe foreign keys) and it
    provides calling code with a simple wrapper for what a known catalog
    layout looks like.

    It supports multiple source versions of the database.  It issues a
    warning if there are catalog tables defined in the database that it
    is unaware of, usually indicating that it is operating against an
    unknown version.
    """

    # --------------------------------------------------------------------
    # Public API functions:
    #   - Catalog()            - Create a Catalog object
    #   - getCatalogTable()    - Returns a single CatalogTable
    #   - getCatalogTables()   - Returns a list of CatalogTable
    #   - getCatalogVersion()  - Returns a GpVersion
    # --------------------------------------------------------------------
    def getCatalogTable(self, tablename):
        """
        getCatalogTable(tablename) => Returns the specified CatalogTable

        Raises: GPCatalogException when the table does not exist
        """
        if tablename not in self._tables:
            raise GPCatalogException("No such catalog table: %s" %
                                     str(tablename))
        return self._tables[tablename]

    def getCatalogTables(self):
        """
        getCatalogTables() => Returns a list of CatalogTable
        """
        return list(self._tables.values())

    def getCatalogVersion(self):
        """
        getCatalogVersion() => Returns the GpVersion object
        """
        return self._version

    # --------------------------------------------------------------------
    # Private implementation functions:
    # --------------------------------------------------------------------
    def __init__(self, dbConnection):
        """
        Catalog() constructor

        1) Uses the supplied database connection to get a list of catalog
           tables
        2) Iterates through the list building up CatalogTable objects
        3) Marks "coordinator only" tables manually
        4) Marks a couple primary keys manually
        5) Marks foreign keys manually
        6) Marks known catalog differences manually
        7) Validates and returns the Catalog object
        """
        self._dbConnection = dbConnection
        self._tables = {}
        self._version = None
        self._tidycat = {}  # tidycat definitions from JSON file

        version_query = """
            SELECT version()
        """
        # relnamespace=11 restricts to pg_catalog; relkind 'r' = plain tables.
        catalog_query = """
            SELECT oid, relname, relisshared FROM pg_class
            WHERE relnamespace=11 and relkind = 'r'
        """

        # Read the catalog version from the database
        try:
            curs = self._query(version_query)
        except Exception as e:
            raise GPCatalogException("Error reading database version: " +
                                     str(e))
        self._version = GpVersion(curs.getresult()[0][0])

        # Read the list of catalog tables from the database
        try:
            curs = self._query(catalog_query)
        except Exception as e:
            raise GPCatalogException("Error reading catalog: " + str(e))

        # Construct our internal representation of the catalog
        for [oid, relname, relisshared] in curs.getresult():
            self._tables[relname] = GPCatalogTable(self, relname)
            # Note: stupid API returns t/f for boolean value
            self._tables[relname]._setShared(relisshared == 't')
            self._tables[relname]._setOid(oid)

        # The tidycat.pl utility has been used to generate a json file
        # describing aspects of the catalog that we can not currently
        # interrogate from the catalog itself.  This includes things like
        # which tables are coordinator only vs segment local and what the
        # foreign key relationships are.
        self._getJson()

        # Which tables are "coordinator only" is not derivable from the
        # catalog so we have to set this manually.
        self._markCoordinatorOnlyTables()

        # We derived primary keys for most of the catalogs based on
        # unique indexes, but we have to manually set a few stragglers.
        self._setPrimaryKeys()

        # Foreign key relationships of the catalog tables are not actually
        # defined in the catalog, so must be obtained from tidycat.
        self._setForeignKeys()

        # Most catalog tables are now ready to go, but some columns can
        # not be compared directly between segments; we need to indicate
        # these exceptions manually.
        self._setKnownDifferences()

        # Finally validate that everything looks right.  This will issue
        # warnings if there are any regular catalog tables that do not
        # have primary keys set.
        self._validate()

    def _query(self, qry):
        """
        Simple wrapper around querying the database connection
        """
        return self._dbConnection.query(qry)

    def _markCoordinatorOnlyTables(self):
        """
        We mark two types of catalog tables as "coordinator only":
          - True "coordinator only" tables
          - Tables we know to have different contents on coordinator/segment

        While the latter are not technically "coordinator only" they have
        the property that we cannot validate cross segment consistency,
        which makes them the same for our current purposes.

        We may want to eventually move these other types of tables into a
        different classification.
        """
        for name in COORDINATOR_ONLY_TABLES:
            if name in self._tables:
                self._tables[name]._setCoordinatorOnly()

        for name in SEGMENT_LOCAL_TABLES:
            if name in self._tables:
                self._tables[name]._setCoordinatorOnly()

    def _setPrimaryKeys(self):
        """
        Most of the catalog primary keys are set automatically in
        CatalogTable by looking at unique indexes over the catalogs.

        However there are a couple of catalog tables that do not have
        unique indexes that we still want to perform cross segment
        consistency on; for them we have to manually set a primary key.
        """
        self._tables['gp_version_at_initdb']._setPrimaryKey(
            "schemaversion productversion")
        self._tables['pg_constraint']._setPrimaryKey(
            "conname connamespace conrelid contypid")
        self._tables['pg_depend']._setPrimaryKey(
            "classid objid objsubid refclassid refobjid refobjsubid deptype")

        # pg_resqueuecapability only exists from 4.0 onwards.
        if self._version >= "4.0":
            self._tables['pg_resqueuecapability']._setPrimaryKey(
                "resqueueid restypid")

    def _getJson(self):
        """
        Read the json file generated by tidycat which contains, among
        other things, the primary key/foreign key relationships for the
        catalog tables.  Build the fkeys for each table and validate them
        against the catalog.
        """
        indir = os.path.dirname(__file__)
        jname = str(self._version.getVersionRelease()) + ".json"
        try:
            # json doc in data subdirectory of pylib module
            infil = open(os.path.join(indir, "data", jname), "r")
            d = json.load(infil)
            # remove the tidycat comment/info entries, which are metadata
            # rather than table definitions
            if "__comment" in d:
                del d["__comment"]
            if "__info" in d:
                del d["__info"]
            infil.close()
            self._tidycat = d
        except Exception as e:
            # older versions of product will not have tidycat defs --
            # need to handle this case
            logger.warn("GPCatalogTable: " + str(e))

    def _setForeignKeys(self):
        """
        Setup the foreign key relationships amongst the catalogs.  We
        drive this based on the tidycat generated json file since this
        information is not derivable from the catalog.
        """
        try:
            for tname, tdef in self._tidycat.items():
                if "foreign_keys" not in tdef:
                    continue
                for fkdef in tdef["foreign_keys"]:
                    fk2 = GPCatalogTableForeignKey(tname,
                                                   fkdef[0],
                                                   fkdef[1],
                                                   fkdef[2])
                    self._tables[tname]._addForeignKey(fk2)
        except Exception as e:
            # older versions of product will not have tidycat defs --
            # need to handle this case
            logger.warn("GPCatalogTable: " + str(e))

    def _setKnownDifferences(self):
        """
        Some catalogs have columns that, for one reason or another, we
        need to mark as being different between the segments and the
        coordinator.

        These fall into two categories:
           - Bugs (marked with the appropriate jiras)
           - A small number of "special" columns
        """
        # -------------
        # Special cases
        # -------------

        # pg_class:
        #   - relfilenode should generally be consistent, but may not be (jira?)
        #   - relpages/reltuples/relfrozenxid/relminmxid are all
        #     vacuum/analyze related
        #   - relhasindex/relhaspkey are only cleared when vacuum completes
        #   - relowner has its own checks:
        #       => may want to separate out "owner" columns like acl and oid
        self._tables['pg_class']._setKnownDifferences(
            "relfilenode relpages reltuples relhasindex relhaspkey relowner relfrozenxid relminmxid relallvisible"
        )

        # pg_type: typowner has its own checks:
        #       => may want to separate out "owner" columns like acl and oid
        self._tables['pg_type']._setKnownDifferences("typowner")

        # pg_database: datfrozenxid and datminmxid are vacuum related
        self._tables['pg_database']._setKnownDifferences(
            "datfrozenxid datminmxid")

        # -------------
        # Issues still present in the product
        # -------------

        # MPP-11289 : inconsistent OIDS for table "default values"
        self._tables['pg_attrdef']._setKnownDifferences("oid")

        # MPP-11284 : inconsistent OIDS for constraints
        self._tables['pg_constraint']._setKnownDifferences("oid")

        # MPP-11282: Inconsistent oids for language callback functions
        # MPP-12015: Inconsistent oids for operator commutator/negator functions
        self._tables['pg_proc']._setKnownDifferences("oid prolang")

        # MPP-11282: pg_language oids and callback functions
        self._tables['pg_language']._setKnownDifferences(
            "oid lanplcallfoid lanvalidator")

        # MPP-12015: Inconsistent oids for operator commutator/negator functions
        # MPP-12015: Inconsistent oids for operator sort/cmp operators
        self._tables['pg_operator']._setKnownDifferences(
            "oid oprcom oprnegate oprlsortop oprrsortop oprltcmpop oprgtcmpop")
        self._tables['pg_aggregate']._setKnownDifferences("aggsortop")

        # MPP-11281 : Inconsistent oids for views
        self._tables['pg_rewrite']._setKnownDifferences("oid ev_action")

        # MPP-11285 : Inconsistent oids for triggers
        self._tables['pg_trigger']._setKnownDifferences("oid")

        # MPP-11575 : Inconsistent handling of indpred for partial indexes
        # indcheckxmin column related to HOT feature in pg_index is
        # calculated independently for coordinator and segment based on
        # individual node's transaction state, hence it can be different
        # so skip it from checks.
        self._tables['pg_index']._setKnownDifferences("indpred indcheckxmin")

    def _validate(self):
        """
        Check that all tables defined in the catalog have either been
        marked as "coordinator only" or have a primary key.
        """
        for relname in sorted(self._tables):
            if self._tables[relname].isCoordinatorOnly():
                continue
            if self._tables[relname].getPrimaryKey() == []:
                logger.warn(
                    "GPCatalogTable: unable to derive primary key for %s" %
                    str(relname))