def read_from_file(self, filename: str) -> None:
    """
    Read the data dictionary from a tab-separated value (TSV) file.

    Args:
        filename: path to the TSV file. The first row must be a header
            containing (at least) every heading in
            ``DataDictionaryRow.ROWNAMES``.

    Raises:
        ValueError: if the header row lacks a required heading, or if any
            data row fails ``DataDictionaryRow.check_valid()``.
    """
    self.rows = []  # type: List[DataDictionaryRow]
    log.debug("Opening data dictionary: {}".format(filename))
    # BUG FIX: the csv module requires files to be opened with
    # newline='' so that it can perform its own newline handling
    # (otherwise embedded newlines in quoted fields are corrupted).
    with open(filename, 'r', newline='') as tsvfile:
        tsv = csv.reader(tsvfile, delimiter='\t')
        headers = next(tsv)
        if not all(x in headers for x in DataDictionaryRow.ROWNAMES):
            raise ValueError(
                "Bad data dictionary file. Must be a tab-separated value "
                "(TSV) file with the following row headings:\n"
                "{}\n\n"
                "but yours are:\n\n"
                "{}".format("\n".join(DataDictionaryRow.ROWNAMES),
                            "\n".join(headers)))
        log.debug("Data dictionary has correct header. Loading content...")
        for values in tsv:
            valuedict = dict(zip(headers, values))
            ddr = DataDictionaryRow(self.config)
            try:
                ddr.set_from_dict(valuedict)
                ddr.check_valid()
            except ValueError:
                # Log the offending row before propagating the error, so
                # the user can see which input line was bad.
                log.critical("Offending input: {}".format(valuedict))
                raise
            self.rows.append(ddr)
    log.debug("... content loaded.")
    self.clear_caches()
def ensure_no_type_mismatch(ddr: DataDictionaryRow,
                            config_sqlatype: Union[TypeEngine, String],
                            human_type: str) -> None:
    """
    Ensure that the source column type of ``ddr`` is compatible with the
    type the config expects for this kind of field (e.g. PID/MPID).

    Args:
        ddr: the data dictionary row to check
        config_sqlatype: the SQLAlchemy column type the config expects
        human_type: human-readable description of the field kind, used in
            the error message

    Raises:
        ValueError: if the source column type is incompatible
    """
    rowtype = ddr.get_src_sqla_coltype()
    if (is_sqlatype_integer(rowtype) and
            is_sqlatype_integer(config_sqlatype)):
        # Good enough. The only integer type we use for PID/MPID is
        # BigInteger, so any integer type should fit.
        return
    if (is_sqlatype_string(rowtype) and
            is_sqlatype_string(config_sqlatype) and
            rowtype.length <= config_sqlatype.length):
        return
    # BUG FIX: the message previously used an undefined name "r"
    # (NameError at runtime); it should refer to the "ddr" parameter.
    raise ValueError(
        "Source column {} is marked as a {} field but its type is {}, "
        "while the config thinks it should be {}".format(
            ddr.get_signature(), human_type, ddr.get_src_sqla_coltype(),
            config_sqlatype))
def ensure_no_type_mismatch(ddr: DataDictionaryRow,
                            config_sqlatype: Union[TypeEngine, String],
                            human_type: str) -> None:
    """
    Ensure that the source column type of ``ddr`` is compatible with the
    type the config expects for this kind of field (e.g. PID/MPID).

    Args:
        ddr: the data dictionary row to check
        config_sqlatype: the SQLAlchemy column type the config expects
        human_type: human-readable description of the field kind, used in
            the error message

    Raises:
        ValueError: if the source column type is incompatible
    """
    rowtype = ddr.get_src_sqla_coltype()
    if (is_sqlatype_integer(rowtype) and
            is_sqlatype_integer(config_sqlatype)):
        # Good enough. The only integer type we use for PID/MPID is
        # BigInteger, so any integer type should fit.
        return
    if (is_sqlatype_string(rowtype) and
            is_sqlatype_string(config_sqlatype)):
        # noinspection PyUnresolvedReferences
        if rowtype.length <= config_sqlatype.length:
            return
    # BUG FIX: the message previously used an undefined name "r"
    # (NameError at runtime); it should refer to the "ddr" parameter.
    raise ValueError(
        f"Source column {ddr.get_signature()} is marked as a "
        f"{human_type} field but its type is "
        f"{ddr.get_src_sqla_coltype()}, while the config thinks it "
        f"should be {config_sqlatype}")
def read_from_source_databases(self,
                               report_every: int = 100) -> None:
    """
    Create a draft DD from a source database.

    Args:
        report_every: log progress every *report_every* source columns.

    Walks every table/column of every configured source database,
    building a DataDictionaryRow per column (skipping blacklisted or
    already-known columns), then revises scrubbing flags for rows in
    non-patient tables. Extends ``self.rows`` in place and re-sorts.
    """
    log.info("Reading information for draft data dictionary")
    # Signatures of rows we already have (e.g. loaded from file), so we
    # don't create duplicates below.
    existing_signatures = set(ddr.get_signature() for ddr in self.rows)
    for pretty_dbname, db in self.config.sources.items():
        log.info("... database nice name = {}".format(pretty_dbname))
        cfg = db.srccfg
        meta = db.metadata
        i = 0  # running count of source columns seen, for progress logging
        for t in meta.sorted_tables:
            tablename = t.name
            log.info("... ... table: {}".format(tablename))
            new_rows = []  # type: List[DataDictionaryRow]
            is_patient_table = False
            # Skip table?
            if cfg.is_table_blacklisted(tablename):
                log.debug(
                    "Skipping blacklisted table: {}".format(tablename))
                continue
            all_col_names = [c.name for c in t.columns]
            # Skip tables that lack the configured minimum set of fields.
            if cfg.does_table_fail_minimum_fields(all_col_names):
                log.debug("Skipping table {} because it fails minimum "
                          "field requirements".format(t))
                continue
            for c in t.columns:
                i += 1
                if report_every and i % report_every == 0:
                    log.debug("... reading source field number "
                              "{}".format(i))
                columnname = c.name
                # import pdb; pdb.set_trace()
                # log.critical("str(coltype) == {}".format(str(c.type)))
                # log.critical("repr(coltype) == {}".format(repr(c.type)))
                try:
                    # Some reflected types cannot be rendered as SQL text;
                    # log which column failed before re-raising.
                    datatype_sqltext = str(c.type)
                except sqlalchemy.exc.CompileError:
                    log.critical("Column that failed was: {}".format(
                        repr(c)))
                    raise
                sqla_coltype = c.type
                # Do not manipulate the case of SOURCE tables/columns.
                # If you do, they can fail to match the SQLAlchemy
                # introspection and cause a crash.
                # Changed to be a destination manipulation (2016-06-04).
                if cfg.is_field_blacklisted(columnname):
                    log.debug("Skipping blacklisted column: {}.{}".format(
                        tablename, columnname))
                    continue
                comment = ''  # currently unsupported by SQLAlchemy
                if self.config.append_source_info_to_comment:
                    comment = "[from {t}.{f}]".format(
                        t=tablename,
                        f=columnname,
                    )
                ddr = DataDictionaryRow(self.config)
                ddr.set_from_src_db_info(
                    pretty_dbname, tablename, columnname,
                    datatype_sqltext,
                    sqla_coltype,
                    dbconf=cfg,
                    comment=comment)
                # If we have this one already, skip ASAP
                sig = ddr.get_signature()
                if sig in existing_signatures:
                    log.debug("Skipping duplicated column: {}.{}".format(
                        tablename, columnname))
                    continue
                existing_signatures.add(sig)
                if ddr.contains_patient_info():
                    is_patient_table = True
                # Checking validity slows us down, and we are after all
                # creating these programmatically!
                # ddr.check_valid(self.config)
                new_rows.append(ddr)
            # Now, table-wide checks across all columns:
            if not is_patient_table:
                for ddr in new_rows:
                    ddr.remove_scrub_from_alter_methods()
                    # Pointless to scrub in a non-patient table
            self.rows.extend(new_rows)
    log.info("... done")
    self.clear_caches()
    log.info("Revising draft data dictionary")
    for ddr in self.rows:
        if ddr.from_file:
            # Rows loaded from a file are left exactly as specified.
            continue
        # Don't scrub_in non-patient tables
        if (ddr.src_table
                not in self.get_src_tables_with_patient_info(ddr.src_db)):
            ddr._scrub = False
    log.info("... done")
    self.sort()