예제 #1
0
 def read_from_file(self, filename: str) -> None:
     """
     Read the data dictionary from a tab-separated value (TSV) file,
     replacing any rows currently held.

     Args:
         filename: path to the TSV data dictionary file

     Raises:
         ValueError: if the header row does not contain all the required
             headings, or if any row fails validation (the offending row
             is logged before the exception propagates)
     """
     self.rows = []  # type: List[DataDictionaryRow]
     log.debug("Opening data dictionary: {}".format(filename))
     # Per the csv module documentation, csv source files should be opened
     # with newline='' so that newlines embedded inside quoted fields are
     # handled correctly by the reader.
     with open(filename, 'r', newline='') as tsvfile:
         tsv = csv.reader(tsvfile, delimiter='\t')
         headers = next(tsv)
         # All required headings must be present; extra columns are
         # tolerated (the zip below simply carries them through).
         if not all(x in headers for x in DataDictionaryRow.ROWNAMES):
             raise ValueError(
                 "Bad data dictionary file. Must be a tab-separated value "
                 "(TSV) file with the following row headings:\n"
                 "{}\n\n"
                 "but yours are:\n\n"
                 "{}".format("\n".join(DataDictionaryRow.ROWNAMES),
                             "\n".join(headers)))
         log.debug("Data dictionary has correct header. Loading content...")
         for values in tsv:
             # Map header name -> cell value for this row.
             valuedict = dict(zip(headers, values))
             ddr = DataDictionaryRow(self.config)
             try:
                 ddr.set_from_dict(valuedict)
                 ddr.check_valid()
             except ValueError:
                 # Log the raw row so the user can find the bad input,
                 # then let the original exception propagate.
                 log.critical("Offending input: {}".format(valuedict))
                 raise
             self.rows.append(ddr)
         log.debug("... content loaded.")
     # Loaded rows invalidate any previously cached derived data.
     self.clear_caches()
예제 #2
0
 def ensure_no_type_mismatch(ddr: DataDictionaryRow,
                             config_sqlatype: Union[TypeEngine, String],
                             human_type: str) -> None:
     """
     Check that the source column type of ``ddr`` is compatible with the
     type the config expects for this kind of field.

     Args:
         ddr: data dictionary row whose source column type is checked
         config_sqlatype: SQLAlchemy type that the config requires
         human_type: human-readable field description (e.g. "patient ID")
             used in the error message

     Raises:
         ValueError: if the source type is incompatible with the
             config's type
     """
     rowtype = ddr.get_src_sqla_coltype()
     if (is_sqlatype_integer(rowtype)
             and is_sqlatype_integer(config_sqlatype)):
         # Good enough. The only integer type we use for PID/MPID is
         # BigInteger, so any integer type should fit.
         return
     if (is_sqlatype_string(rowtype)
             and is_sqlatype_string(config_sqlatype)
             and rowtype.length <= config_sqlatype.length):
         # Both are strings and the source value cannot exceed the
         # configured length, so it fits.
         return
     # BUG FIX: the message previously referenced an undefined name ``r``
     # (would raise NameError instead of the intended ValueError); use the
     # actual parameter ``ddr`` / the already-fetched ``rowtype``.
     raise ValueError(
         "Source column {} is marked as a {} field but its type is {}, "
         "while the config thinks it should be {}".format(
             ddr.get_signature(), human_type, rowtype,
             config_sqlatype))
예제 #3
0
 def ensure_no_type_mismatch(ddr: DataDictionaryRow,
                             config_sqlatype: Union[TypeEngine, String],
                             human_type: str) -> None:
     """
     Check that the source column type of ``ddr`` is compatible with the
     type the config expects for this kind of field.

     Args:
         ddr: data dictionary row whose source column type is checked
         config_sqlatype: SQLAlchemy type that the config requires
         human_type: human-readable field description (e.g. "patient ID")
             used in the error message

     Raises:
         ValueError: if the source type is incompatible with the
             config's type
     """
     rowtype = ddr.get_src_sqla_coltype()
     if (is_sqlatype_integer(rowtype)
             and is_sqlatype_integer(config_sqlatype)):
         # Good enough. The only integer type we use for PID/MPID is
         # BigInteger, so any integer type should fit.
         return
     if (is_sqlatype_string(rowtype)
             and is_sqlatype_string(config_sqlatype)):
         # noinspection PyUnresolvedReferences
         if rowtype.length <= config_sqlatype.length:
             # Both strings and the source value cannot exceed the
             # configured length, so it fits.
             return
     # BUG FIX: the message previously referenced an undefined name ``r``
     # (would raise NameError instead of the intended ValueError); use the
     # actual parameter ``ddr`` / the already-fetched ``rowtype``.
     raise ValueError(
         f"Source column {ddr.get_signature()} is marked as a "
         f"{human_type} field but its type is "
         f"{rowtype}, while the config thinks it "
         f"should be {config_sqlatype}")
예제 #4
0
    def read_from_source_databases(self, report_every: int = 100) -> None:
        """
        Create a draft DD from a source database.

        Iterates over every configured source database and every table/column
        within it, appending a new :class:`DataDictionaryRow` for each column
        that is not blacklisted and not already present (by signature). After
        all rows are gathered, scrubbing is removed from rows belonging to
        tables without patient information, and the dictionary is re-sorted.

        Args:
            report_every: log a progress message every *report_every* source
                fields (0/falsy disables progress logging)
        """
        log.info("Reading information for draft data dictionary")
        # Signatures of rows we already hold, so re-running this method
        # (e.g. after loading from file) does not create duplicates.
        existing_signatures = set(ddr.get_signature() for ddr in self.rows)
        for pretty_dbname, db in self.config.sources.items():
            log.info("... database nice name = {}".format(pretty_dbname))
            cfg = db.srccfg
            meta = db.metadata
            i = 0  # running count of source fields seen, for progress logging
            for t in meta.sorted_tables:
                tablename = t.name
                log.info("... ... table: {}".format(tablename))
                new_rows = []  # type: List[DataDictionaryRow]
                # Becomes True if any column in this table contains patient
                # info; used for the table-wide scrub check below.
                is_patient_table = False

                # Skip table?
                if cfg.is_table_blacklisted(tablename):
                    log.debug(
                        "Skipping blacklisted table: {}".format(tablename))
                    continue
                all_col_names = [c.name for c in t.columns]
                if cfg.does_table_fail_minimum_fields(all_col_names):
                    log.debug("Skipping table {} because it fails minimum "
                              "field requirements".format(t))
                    continue

                for c in t.columns:
                    i += 1
                    if report_every and i % report_every == 0:
                        log.debug("... reading source field number "
                                  "{}".format(i))
                    columnname = c.name
                    # import pdb; pdb.set_trace()
                    # log.critical("str(coltype) == {}".format(str(c.type)))
                    # log.critical("repr(coltype) == {}".format(repr(c.type)))
                    try:
                        # str() of an SQLAlchemy type compiles it to SQL
                        # text; this can fail for exotic dialect types.
                        datatype_sqltext = str(c.type)
                    except sqlalchemy.exc.CompileError:
                        # Identify the failing column before re-raising.
                        log.critical("Column that failed was: {}".format(
                            repr(c)))
                        raise
                    sqla_coltype = c.type
                    # Do not manipulate the case of SOURCE tables/columns.
                    # If you do, they can fail to match the SQLAlchemy
                    # introspection and cause a crash.
                    # Changed to be a destination manipulation (2016-06-04).
                    if cfg.is_field_blacklisted(columnname):
                        log.debug("Skipping blacklisted column: {}.{}".format(
                            tablename, columnname))
                        continue
                    comment = ''  # currently unsupported by SQLAlchemy
                    if self.config.append_source_info_to_comment:
                        # Record the source table/field in the comment so the
                        # provenance survives into the destination.
                        comment = "[from {t}.{f}]".format(
                            t=tablename,
                            f=columnname,
                        )
                    ddr = DataDictionaryRow(self.config)
                    ddr.set_from_src_db_info(pretty_dbname,
                                             tablename,
                                             columnname,
                                             datatype_sqltext,
                                             sqla_coltype,
                                             dbconf=cfg,
                                             comment=comment)

                    # If we have this one already, skip ASAP
                    sig = ddr.get_signature()
                    if sig in existing_signatures:
                        log.debug("Skipping duplicated column: {}.{}".format(
                            tablename, columnname))
                        continue
                    existing_signatures.add(sig)

                    if ddr.contains_patient_info():
                        is_patient_table = True

                    # Checking validity slows us down, and we are after all
                    # creating these programmatically!
                    # ddr.check_valid(self.config)

                    new_rows.append(ddr)

                # Now, table-wide checks across all columns:
                if not is_patient_table:
                    for ddr in new_rows:
                        ddr.remove_scrub_from_alter_methods()
                        # Pointless to scrub in a non-patient table

                self.rows.extend(new_rows)

        log.info("... done")
        # New rows invalidate cached derived data (e.g. the per-database
        # patient-table lookup used in the revision pass below).
        self.clear_caches()
        log.info("Revising draft data dictionary")
        for ddr in self.rows:
            # Rows loaded from a user-supplied file are left as the user
            # wrote them; only programmatically drafted rows are revised.
            if ddr.from_file:
                continue
            # Don't scrub_in non-patient tables
            if (ddr.src_table
                    not in self.get_src_tables_with_patient_info(ddr.src_db)):
                # NOTE(review): writes the private attribute _scrub directly
                # rather than going through a DataDictionaryRow method —
                # presumably intentional for draft revision; confirm.
                ddr._scrub = False
        log.info("... done")
        self.sort()