def open(self):
    """Open the SQLite repository file and keep the connection on self.

    When a connection is already held, only an informational message is
    logged and nothing else happens.

    Raises:
        InternalError: the repository file does not exist.
        DbProfilerException: the file could not be checked or opened.
    """
    if self._conn:
        log.info(_("The repository file `%s' has already been opened.") %
                 self.filename)
        return

    # Check for the file first so that a missing repository is reported
    # differently from one that sqlite3 fails to open.
    try:
        exists = os.path.exists(self.filename)
    except Exception:
        raise DbProfilerException(
            _("Could not access to the repository file `%s'.") %
            self.filename)
    if exists is False:
        raise InternalError(_("The repository file `%s' not found.") %
                            self.filename)

    try:
        self._conn = sqlite3.connect(self.filename)
    except Exception:
        raise DbProfilerException(
            _("Could not read the repository file `%s'.") % self.filename)

    assert self._conn
    log.info(_("The repository file `%s' has been opened.") % self.filename)
def validate_sql(self, dbdriver):
    """Run every registered SQL validator against the database.

    Each failing validator (one that returns False or raises
    ValidationError) is counted on the column counter.

    Args:
        dbdriver: driver object handed to each validator's ``validate()``.

    Returns:
        tuple: (number of validators run, number of validators failed).

    Raises:
        DriverError: when ``dbdriver`` is None.
    """
    if dbdriver is None:
        raise DriverError(u'Database driver not found.')

    total = 0
    failures = 0
    for label in self.sql_validators:
        validator = self.sql_validators[label]
        rule_text = '; '.join(validator.rule)
        log.info(_("Validating with SQL: %s") % rule_text)
        total += 1

        try:
            passed = validator.validate(dbdriver) is not False
        except ValidationError as e:
            log.error(_("SQL validation error: %s") % rule_text,
                      detail=e.source.value if e.source else None)
            passed = False

        if not passed:
            self._column_counter.incr(validator.rule[0], validator.label)
            failures += 1

    return (total, failures)
def destroy(self):
    """Delete the repository file if it is present.

    Returns:
        bool: True when no error occurred (including when the file did
        not exist), False when checking or removing the file failed.
    """
    try:
        present = os.path.exists(self.filename)
        if present:
            os.unlink(self.filename)
    except Exception as err:
        log.error(_("Could not destroy the repository."), detail=unicode(err))
        return False
    else:
        log.info(_("The repository has been destroyed."))
        return True
def export_file(filename, body):
    """Write ``body`` to ``filename`` as UTF-8.

    Args:
        filename(str): path of the file to create or overwrite.
        body(unicode): text to be written.

    Returns:
        bool: True on success, False when an IOError occurred.
    """
    try:
        # The context manager guarantees the handle is closed even when
        # encode() or write() fails part-way through.
        with open(filename, "w") as f:
            f.write(body.encode('utf-8'))
        log.info(_("Generated %s.") % filename)
    except IOError as e:
        log.error(_("Could not generate %s: %s") % (filename, unicode(e)))
        return False
    return True
def init(self):
    """Create the repository file unless it already exists.

    Returns:
        bool: True when the repository already exists or was created,
        False when creation failed.
    """
    try:
        if not os.path.exists(self.filename):
            self.__init_sqlite3(self.filename)
        else:
            log.info(_("The repository already exists."))
            return True
    except Exception as err:
        log.error(_("Could not create the repository."), detail=unicode(err))
        return False

    log.info(_("The repository has been initialized."))
    return True
def create_validation_rule(self, database_name, schema_name, table_name,
                           column_name, description, rule,
                           param='', param2=''):
    """Register a validation rule unless an identical one already exists.

    Args:
        database_name(str):
        schema_name(str):
        table_name(str):
        column_name(str):
        description(str):
        rule(str):
        param(str):
        param2(str):

    Returns:
        integer when the rule successfully gets created.
        None when already exists.

    Raises:
        InternalError: when the rule could not be stored.
    """
    r = self.get_validation_rules(database_name, schema_name, table_name,
                                  column_name, description, rule,
                                  param, param2)
    assert len(r) <= 1
    if r:
        log.warning((_("The same validation rule already exists: ") +
                     u"{0},{1},{2},{3},{4},{5},{6},{7}"
                     .format(database_name, schema_name, table_name,
                             column_name, description, rule,
                             param, param2)))
        return None

    # All values are bound as parameters so that quotes in any field
    # (not only param/param2) cannot break or alter the statement.
    query = u"""
INSERT INTO validation_rule (id,database_name,schema_name,table_name,
                             column_name,description,rule,param,param2)
  VALUES ((SELECT coalesce(max(id),0)+1 FROM validation_rule),
          ?,?,?,?,?,?,?,?);
"""
    params = (database_name, schema_name, table_name, column_name,
              description, rule,
              '' if param is None else param,
              '' if param2 is None else param2)
    log.trace("create_validation_rule: %s %r" %
              (query.replace('\n', ''), params))

    new_id = None
    try:
        cursor = self._conn.cursor()
        cursor.execute(query, params)
        # Read back the id that the sub-select above just assigned.
        cursor.execute("SELECT max(id) FROM validation_rule")
        new_id = cursor.fetchone()[0]
        self._conn.commit()
    except Exception as e:
        raise InternalError(_("Could not register validation rule: "),
                            query=query, source=e)
    return new_id
def connect(self):
    """Make sure a database connection is available on ``self.dbconn``.

    Connects through ``self.dbdriver`` when no connection is held yet.
    The process exits with status 1 when the driver raises
    DbProfilerException.

    Returns:
        bool: True (whenever the call returns).
    """
    if self.dbconn is not None:
        return True

    log.info(_("Connecting the database."))
    try:
        self.dbdriver.connect()
    except DbProfilerException as e:
        log.error(_("Could not connect to the database."), detail=e.source)
        log.error(_("Abort."))
        sys.exit(1)

    self.dbconn = self.dbdriver.conn
    log.info(_("Connected to the database."))
    return True
def add_rule(self, id_, database_name, schema_name, table_name, column_name,
             description, rule, param, param2=None):
    """Register one validation rule if it targets this validator's table.

    Args:
        id_(int): rule identifier, used as the rule label.
        database_name(str): not used by this method.
        schema_name(str): schema the rule applies to.
        table_name(str): table the rule applies to.
        column_name(str): column the rule applies to.
        description(str): text stored in ``self.descriptions``.
        rule(str): one of 'regexp', 'eval', 'columnstat' or 'sql'.
        param(str): first rule parameter (must be truthy).
        param2(str): second rule parameter (required for 'sql').

    Returns:
        bool: False when the rule is for another schema/table,
        True when the rule has been added.

    Raises:
        InternalError: when ``rule`` is not a supported rule type.
    """
    assert isinstance(id_, int)

    if (self.schema_name, self.table_name) != (schema_name, table_name):
        return False

    label = id_
    log.debug("add_rule: label = %s" % label)
    assert param

    if rule == 'sql':
        assert param2
        self.add_rule_sql(label, column_name, param, param2)
    elif rule in ('regexp', 'eval', 'columnstat'):
        # The three single-parameter rules share one call shape; the
        # handler is named add_rule_<rule>.
        getattr(self, 'add_rule_' + rule)(label, column_name, param)
    else:
        raise InternalError(_("Unsupported validation rule: %s") % rule)

    self.descriptions[label] = description
    return True
def get_table_list(self, database_name=None, schema_name=None,
                   table_name=None):
    """List the distinct tables stored in the repository.

    Args:
        database_name(str): optional filter on the database name.
        schema_name(str): optional filter on the schema name.
        table_name(str): optional filter on the table name.

    Returns:
        list: [[database_name, schema_name, table_name], ...] ordered by
        those three columns, or None when the query failed.
    """
    # Filter values are bound as parameters instead of being spliced into
    # the SQL text, so names containing quotes are handled correctly.
    cond = []
    params = []
    for column, value in (("database_name", database_name),
                          ("schema_name", schema_name),
                          ("table_name", table_name)):
        if value:
            cond.append("%s = ?" % column)
            params.append(value)
    where = ("WHERE (%s)" % " AND ".join(cond)) if cond else ''

    query = """
SELECT DISTINCT database_name, schema_name, table_name
  FROM repo
{0}
 ORDER BY database_name, schema_name, table_name
""".format(where)
    log.trace("get_table_list: query = %s, params = %r" % (query, params))

    try:
        cursor = self._conn.cursor()
        return [[r[0], r[1], r[2]] for r in cursor.execute(query, params)]
    except Exception as e:
        log.error(_("Could not get data."), detail=unicode(e))
        return None
def get_datamap_items(self, database_name, schema_name, table_name,
                      column_name=None):
    """Get one or more datamap entries from the repository

    Args:
        database_name (str):
        schema_name (str):
        table_name (str):
        column_name (str): optional; restricts the result to one column.

    Returns:
        list: a list which consists of one or more datamap entries
        (JSON-decoded), ordered by ``lineno``.

    Raises:
        InternalError: when the query or the JSON decoding failed.
    """
    assert database_name and schema_name and table_name

    # Values are bound as parameters so that names containing quotes do
    # not break the statement.
    query = u"""
SELECT data FROM datamapping
 WHERE database_name = ?
   AND schema_name = ?
   AND table_name = ?
"""
    params = [database_name, schema_name, table_name]
    if column_name:
        query += u"   AND column_name = ?\n"
        params.append(column_name)
    query += u" ORDER BY lineno"

    try:
        cursor = self._conn.cursor()
        return [json.loads(r[0]) for r in cursor.execute(query, params)]
    except Exception as e:
        raise InternalError(_("Could not get data."), query=query, source=e)
def export_json(repo, tables=[], output_path='./json'):
    """Dump the given tables from the repository into EXPORT.JSON.

    Args:
        repo: repository object providing ``get_table()``.
        tables(list): entries indexable as
            [database_name, schema_name, table_name].
        output_path(str): directory that receives ``EXPORT.JSON``.

    The process exits with status 1 when an IOError occurs.
    """
    json_data = []
    try:
        # NOTE(review): the file is opened in append mode, so repeated
        # exports into the same directory concatenate JSON documents --
        # confirm whether "w" is what is actually wanted.
        # The context manager closes the handle even when a write fails.
        with open(output_path + "/EXPORT.JSON", "a") as f:
            for tab in tables:
                database_name, schema_name, table_name = (tab[0], tab[1],
                                                          tab[2])
                json_data.append(repo.get_table(database_name, schema_name,
                                                table_name))
            f.write(json.dumps(json_data, indent=2).encode('utf-8'))
        log.info(_("Generated JSON file."))
    except IOError:
        log.error(_("Could not generate JSON file."))
        sys.exit(1)
def get_validation_rule(self, id):
    """Fetch a single validation rule by its id.

    Args:
        id(integer): identifier of the rule.

    Returns:
        tuple: (id,database_name,schema_name,table_name,column_name,
                description,rule,param,param2) or None

    Raises:
        InternalError: when the query failed.
    """
    select_part = (u"SELECT id,database_name,schema_name,table_name,"
                   u"column_name,"
                   u"description,rule,param,param2 FROM validation_rule ")
    query = select_part + u"WHERE id = %d" % id
    log.trace("get_validation_rule: %s" % query.replace('\n', ''))

    try:
        cursor = self._conn.cursor()
        cursor.execute(query)
        row = cursor.fetchone()
        return tuple(row) if row else None
    except Exception as e:
        raise InternalError(_("Could not get validation rule: "),
                            query=query, source=e)
def validate_table(self, table_data):
    """Run the column-statistics validators against ``table_data``.

    A validator fails when it returns False or raises ValidationError;
    each failure is counted on the column counter.

    Args:
        table_data: table record handed to each validator's ``validate()``.

    Returns:
        tuple: (number of validators run, number of validators failed).
    """
    total = 0
    failures = 0

    # Run statistics validators.
    for label in self.statistics_validators:
        validator = self.statistics_validators[label]
        log.info(_("Validating column statistics: %s") %
                 '; '.join(validator.rule))
        total += 1

        try:
            failed = validator.validate(table_data) is False
        except ValidationError as e:
            log.error(u'%s' % e.value)
            failed = True

        if not failed:
            log.trace("VALIDATION OK: %s %s %s" %
                      (validator.label, unicode(validator.rule),
                       validator.column_names))
            continue

        log.trace("VALIDATION FAILED: %s %s %s" %
                  (validator.label, unicode(validator.rule),
                   validator.column_names))
        self._column_counter.incr(validator.rule[0], validator.label)
        failures += 1

    return (total, failures)
def create_popover_content(term):
    """Build the HTML snippet shown in a glossary term popover.

    Args:
        term(dict): must provide 'term' and 'description_short'; the
            lists 'synonyms', 'related_terms' and 'assigned_assets' are
            optional and only rendered when non-empty.

    Returns:
        unicode: short description, the optional sections, and a
        "Details..." link pointing to the term in glossary.html.
    """
    assert isinstance(term, dict)

    synonyms = ', '.join(term.get('synonyms', []))
    synonyms = ((u'<br/>%s: ' % _("Synonym") + synonyms)
                if synonyms else synonyms)

    related = ', '.join(term.get('related_terms', []))
    related = ((u'<br/>%s: ' % _("Related Terms") + related)
               if related else related)

    assets = ', '.join(term.get('assigned_assets', []))
    assets = ((u'<br/>%s: ' % _("Assigned Assets") + assets)
              if assets else assets)

    template = (u"{1}<br/>{2}{3}{4}<div align=right>"
                u"<a href='glossary.html#{0}'"
                u" target='_glossary'>{5}</a></div>")
    return template.format(term['term'], term['description_short'],
                           synonyms, related, assets, _("Details..."))
def append_table(self, tab):
    """Store a table record, replacing one with the same timestamp.

    Update a table record if the same record (with same timestamp)
    already exist. Otherwise, append the table record to the repository.
    Afterwards the tag id/label pairs of the table are replaced with the
    ones found in ``tab['tags']``.

    Args:
        tab: a dictionary of table record.

    Returns:
        True on success.

    Raises:
        InternalError: when the record could not be written.
    """
    assert (tab['database_name'] and tab['schema_name'] and
            tab['table_name'] and tab['timestamp'])

    # NOTE: identifiers and timestamp are interpolated into the SQL text
    # as-is; only the JSON payload has its single quotes doubled.
    update_sql = """
UPDATE repo
   SET data = '%s'
 WHERE database_name = '{database_name}'
   AND schema_name = '{schema_name}'
   AND table_name = '{table_name}'
   AND created_at = datetime('{timestamp}')
"""
    insert_sql = """
INSERT INTO repo VALUES ('{database_name}','{schema_name}','{table_name}',
                         datetime('{timestamp}'), '%s')
"""

    query = None
    log.trace("append_table: start %s.%s.%s" %
              (tab['database_name'], tab['schema_name'], tab['table_name']))
    try:
        updating = self.has_table_record(tab)
        template = update_sql if updating else insert_sql
        query = (template.format(**tab) %
                 DbProfilerFormatter.jsonize(tab).replace("'", "''"))
        if not updating:
            log.trace("append_table: INSERT")
        log.debug("append_table: query = %s" % query)

        assert self._conn
        cursor = self._conn.cursor()
        assert cursor
        cursor.execute(query)
        self._conn.commit()
    except Exception as e:
        raise InternalError(_("Could not register table data: "),
                            query=query, source=e)

    # Remove all tag id/label pairs to replace with new ones.
    tagid = "%s.%s.%s" % (tab['database_name'], tab['schema_name'],
                          tab['table_name'])
    self.delete_tag_id(tagid)
    for label in tab.get('tags') or []:
        self.put_tag(tagid, label)

    log.trace("append_table: end")
    return True
def get_tags(self):
    """Get a list of tag names and number of tags associated with tables.

    Rows with an empty tag label are ignored; the most frequent labels
    come first.

    Returns:
        list: a list of lists: [[tag,num of tables], ...]

    Raises:
        InternalError: when the query failed.
    """
    log.trace("get_tags: start")

    query = """
SELECT tag_label,
       COUNT(*)
  FROM tags
 WHERE tag_label <> ''
 GROUP BY
       tag_label
 ORDER BY
       COUNT(*) DESC
"""

    try:
        cursor = self._conn.cursor()
        log.debug("get_tags: query = %s" % query)
        result = [[row[0], row[1]] for row in cursor.execute(query)]
    except Exception as e:
        log.trace("get_tags: " + unicode(e))
        raise InternalError(_("Could not get tag info: "),
                            query=query, source=e)

    log.trace("get_tags: end")
    return result
def get(self):
    """Load every record of the ``repo`` table.

    The fifth column of each row is decoded as JSON.

    Returns:
        list: the decoded records, or None when reading failed.
    """
    try:
        cursor = self._conn.cursor()
        records = [json.loads(unicode(row[4]))
                   for row in cursor.execute("SELECT * FROM repo")]
        log.info(_("Retrieved all data from the repository `%s'.") %
                 self.filename)
    except Exception as e:
        log.error(_("Could not retreive from the repository `%s'") %
                  self.filename, detail=unicode(e))
        return None
    return records
def run_postscan_validation(self, schema_name, table_name, tablemeta,
                            columnmeta, table_data, validation_rules):
    """Run column-statistics and SQL validation on scanned table data.

    The validator is built from ``table_data['schema_name']`` and
    ``table_data['table_name']``; the ``schema_name``, ``table_name``,
    ``tablemeta`` and ``columnmeta`` arguments are not used here.

    Args:
        table_data(dict): table record to validate and update.
        validation_rules: rules to apply; nothing is done when empty.

    Returns:
        dict: ``table_data`` (updated by the validator when rules exist).
    """
    if validation_rules:
        validator = DbProfilerValidator.DbProfilerValidator(
            table_data['schema_name'], table_data['table_name'],
            self, validation_rules)

        log.info(_("Column statistics validation: start"))
        stat_validated, stat_failed = validator.validate_table(table_data)
        log.info(_("Column statistics validation: end (%d)") %
                 stat_validated)

        log.info(_("SQL validation: start"))
        sql_validated, sql_failed = validator.validate_sql(self.dbdriver)
        log.info(_("SQL validation: end (%d)") % sql_validated)

        validator.update_table_data(table_data)

    return table_data
def _run_record_validation(self, schema_name, table_name, tablemeta,
                           columnmeta, validation_rules,
                           skip_record_validation):
    """Run record validation and attach the results to column metadata.

    Nothing is validated when ``skip_record_validation`` is set or when
    there are no validation rules.

    Args:
        schema_name(str):
        table_name(str):
        tablemeta: provides ``column_names``.
        columnmeta: mapping of column name to a metadata object whose
            ``validation`` attribute receives the result.
        validation_rules: rules passed to ``run_record_validation()``.
        skip_record_validation(bool): True to skip the validation.
    """
    log.info(_("Record validation: start"))

    if skip_record_validation:
        log.info(_("Record validation: skipping"))
        return
    if not validation_rules:
        log.info(_("Record validation: no validation rule"))
        return

    results = self.run_record_validation(schema_name, table_name,
                                         validation_rules)
    assert isinstance(results, dict)

    for name in tablemeta.column_names:
        if not (results and name in results):
            continue
        columnmeta[name].validation = results[name]

    log.info(_("Record validation: end"))
def get_tag_labels(self, tag_id):
    """Return the labels stored for a tag id.

    Args:
        tag_id(str): tag identifier to look up.

    Returns:
        list: the ``tag_label`` values of all matching rows.

    Raises:
        InternalError: when the query failed.
    """
    # Defined before the try block so the error handler can always report
    # it, and parameterized so quotes in tag_id cannot break the SQL.
    query = u"SELECT tag_label FROM tags WHERE tag_id = ?"
    try:
        cursor = self._conn.cursor()
        return [r[0] for r in cursor.execute(query, (tag_id,))]
    except Exception as e:
        raise InternalError(_("Could not get tag labels: "),
                            query=query, source=e)
def delete_textelement(self, id_):
    """Delete every text element stored under ``id_``.

    Args:
        id_(str): text element identifier.

    Returns:
        bool: True when the statement was executed and committed.

    Raises:
        InternalError: when the deletion failed.
    """
    log.trace('delete_textelement: start')

    # Defined before the try block so the error handler can always report
    # it, and parameterized so quotes in id_ cannot break the SQL.
    query = u"DELETE FROM textelement WHERE id_= ?"
    try:
        cursor = self._conn.cursor()
        cursor.execute(query, (id_,))
        self._conn.commit()
    except Exception as e:
        raise InternalError(_("Could not delete text element: "),
                            query=query, source=e)

    log.trace('delete_textelement: end')
    return True
def delete_tag_id(self, tag_id):
    """Delete every tag row stored under ``tag_id``.

    Args:
        tag_id(str): tag identifier.

    Returns:
        bool: True when the statement was executed and committed.

    Raises:
        InternalError: when the deletion failed.
    """
    log.trace('delete_tag_id: start %s' % tag_id)

    # Defined before the try block so the error handler can always report
    # it, and parameterized so quotes in tag_id cannot break the SQL.
    query = u"DELETE FROM tags WHERE tag_id = ?"
    try:
        cursor = self._conn.cursor()
        cursor.execute(query, (tag_id,))
        self._conn.commit()
    except Exception as e:
        raise InternalError(_("Could not delete tag id: "),
                            query=query, source=e)

    log.trace('delete_tag_id: end')
    return True
def put_textelement(self, id_, text):
    """Store a text element under ``id_``.

    Args:
        id_(str): text element identifier.
        text(str): text to store; a falsy value is stored as ''.

    Returns:
        bool: True when the row was inserted and committed.

    Raises:
        InternalError: when the insertion failed.
    """
    log.trace('put_textelement: start')

    # Defined before the try block so the error handler can always report
    # it, and parameterized so text containing quotes can be stored.
    query = u"INSERT INTO textelement VALUES (?, ?)"
    try:
        cursor = self._conn.cursor()
        cursor.execute(query, (id_, text if text else ''))
        self._conn.commit()
    except Exception as e:
        raise InternalError(_("Could not register text element: "),
                            query=query, source=e)

    log.trace('put_textelement: end')
    return True
def get_textelements(self, id_):
    """Return the text elements stored under ``id_``.

    Args:
        id_(str): text element identifier.

    Returns:
        list: the ``text_`` values of all matching rows.

    Raises:
        InternalError: when the query failed.
    """
    log.trace('get_textelements: start')

    # Defined before the try block so the error handler can always report
    # it, and parameterized so quotes in id_ cannot break the SQL.
    query = u"SELECT text_ FROM textelement WHERE id_= ?"
    try:
        cursor = self._conn.cursor()
        texts = [r[0] for r in cursor.execute(query, (id_,))]
    except Exception as e:
        raise InternalError(_("Could not get text element: "),
                            query=query, source=e)

    log.trace('get_textelements: end')
    return texts
def set(self, data):
    """Replace the whole content of the ``repo`` table with ``data``.

    Args:
        data: iterable of table records, each passed to
            ``append_table()``.

    Returns:
        bool: False when emptying the table failed, otherwise True.
    """
    try:
        self._conn.cursor().execute("DELETE FROM repo")
        self._conn.commit()
    except Exception as err:
        log.error(_("Could not initialize the repository."),
                  detail=unicode(err))
        return False

    for record in data:
        self.append_table(record)
    return True
def verify(self):
    """Verify the validation results of every table in the repository.

    Returns:
        bool: True when at least one invalid result was found,
        otherwise False.
    """
    repo = DbProfilerRepository.DbProfilerRepository(self.repofile)
    repo.open()
    valid = 0
    invalid = 0
    # Close the repository even when verification raises part-way.
    try:
        log.info(_("Verifying the validation results."))
        # NOTE(review): if get_table_list() can return None on failure,
        # the loop below raises TypeError -- confirm the desired handling.
        table_list = repo.get_table_list()
        for t in table_list:
            table = repo.get_table(t[0], t[1], t[2])
            v, i = verify_table(table)
            valid += v
            invalid += i

        if invalid == 0:
            log.info(_("No invalid results: %d/%d") %
                     (invalid, valid + invalid))
        else:
            log.info(_("Invalid results: %d/%d") %
                     (invalid, valid + invalid))
    finally:
        repo.close()

    return invalid > 0
def get_files(self, objtype, objid):
    """Get file names assigned to the object.

    The names are looked up as text elements under the key
    ``<objtype>:<objid>``.

    Args:
        objtype(str): object type ['tag','schema','table']
        objid(str): object identifier

    Returns:
        list: a list of file names.

    Raises:
        InternalError: when ``objtype`` is not a supported type.
    """
    supported = ('tag', 'schema', 'table')
    if objtype in supported:
        return self.get_textelements(u'%s:%s' % (objtype, objid))
    raise InternalError(_('invalid object type: %s') % objtype)
def delete_files(self, objtype, objid):
    """Remove file names associated to the object.

    The text elements stored under the key ``<objtype>:<objid>`` are
    deleted.

    Args:
        objtype(str): object type ['tag','schema','table']
        objid(str): object identifier

    Returns:
        bool: True if succeeded.

    Raises:
        InternalError: when ``objtype`` is not a supported type.
    """
    supported = ('tag', 'schema', 'table')
    if objtype in supported:
        return self.delete_textelement(u'%s:%s' % (objtype, objid))
    raise InternalError(_('invalid object type: %s') % objtype)
def update_validation_rule(self, id, database_name, schema_name,
                           table_name, column_name, description,
                           rule, param=None, param2=None):
    """Overwrite all fields of the validation rule with the given id.

    Args:
        id(integer):
        database_name(str):
        schema_name(str):
        table_name(str):
        column_name(str):
        description(str):
        rule(str):
        param(str): stored as '' when None.
        param2(str): stored as '' when None.

    Returns:
        True when the rule successfully gets updated, otherwise False.

    Raises:
        InternalError: when the statement failed.
    """
    # Values are bound as parameters: rule parameters (regular
    # expressions, SQL fragments) routinely contain quotes, which broke
    # the statement when they were spliced into the SQL text.
    query = u"""
UPDATE validation_rule
   SET database_name = ?,
       schema_name = ?,
       table_name = ?,
       column_name = ?,
       description = ?,
       rule = ?,
       param = ?,
       param2 = ?
 WHERE id = ?
"""
    params = (database_name, schema_name, table_name, column_name,
              description, rule,
              '' if param is None else param,
              '' if param2 is None else param2,
              id)
    log.trace("update_validation_rule: %s %r" %
              (query.replace('\n', ''), params))

    rowcount = 0
    try:
        cursor = self._conn.cursor()
        cursor.execute(query, params)
        rowcount = cursor.rowcount
        self._conn.commit()
    except Exception as e:
        raise InternalError(_("Could not update validation rule: "),
                            query=query, source=e)

    # No affected row means no rule with that id exists.
    return rowcount != 0
def get_bg_terms_all(self):
    """List the latest terms of the business glossary.

    Longer terms come first; terms of equal length are ordered by name.

    Returns:
        list: a list of terms in the business glossary.

    Raises:
        InternalError: when the query failed.
    """
    query = (u"SELECT term FROM business_glossary WHERE is_latest = 1 "
             u"ORDER BY length(term) desc,term")
    try:
        cursor = self._conn.cursor()
        cursor.execute(query)
        terms = [row[0] for row in cursor.fetchall()]
    except Exception as e:
        raise InternalError(_("Could not get a list of business terms: "),
                            query=query, source=e)
    return terms