예제 #1
0
파일: pgtd.py 프로젝트: nandi6uc/ticdat
    def find_duplicates(self, engine, schema, active_fld=""):
        """
        Report how many times each duplicated primary key occurs, per table.

        :param engine: A sqlalchemy Engine object able to connect to the postgres instance

        :param schema: Name of the schema to read from within the engine's database

        :param active_fld: optional string naming a boolean filter field. It has to follow
                           PG naming conventions (which differ from ticdat field naming
                           conventions). Intended for expert support; most developers can
                           leave it at its default.

        :return: A dictionary keyed by the names of the tables that have primary keys.
                 Each value is itself a dictionary, keyed by the primary key values found
                 in the table and mapping each one to the number of postgres records
                 carrying that primary key. Keys occurring fewer than 2 times are dropped,
                 since they are not duplicates.
        """
        verify(sa, "sqlalchemy needs to be installed to use this subroutine")
        self._check_good_pgtd_compatible_table_field_names()

        dup_tdf = self._duplicate_focused_tdf
        if dup_tdf:
            # Load the schema through the duplicate-focused factory, then count the repeats.
            loader = PostgresTicFactory(dup_tdf)
            loaded = loader.create_tic_dat(engine, schema, active_fld=active_fld)
            return find_duplicates(loaded, dup_tdf)

        # A falsy duplicate-focused factory means there is nothing to report.
        return {}
예제 #2
0
 def find_duplicates(self, mdb_file_path):
     """
     Report how many times each duplicated primary key occurs, per table.
     :param mdb_file_path: An Access db with a consistent schema.
     :return: A dictionary keyed by the names of the tables that have primary keys.
              Each value is itself a dictionary, keyed by the primary key values found
              in the table and mapping each one to the number of mdb records carrying
              that primary key. Keys occurring fewer than 2 times are dropped, since
              they are not duplicates.
     """
     _standard_verify(self.tic_dat_factory.generic_tables)

     dup_tdf = self._duplicate_focused_tdf
     if dup_tdf:
         # Load the file through the duplicate-focused factory, then count the repeats.
         loaded = dup_tdf.mdb.create_tic_dat(mdb_file_path)
         return find_duplicates(loaded, dup_tdf)

     # A falsy duplicate-focused factory means there is nothing to report.
     return {}
예제 #3
0
 def find_duplicates(self, db_file_path):
     """
     Report how many times each duplicated primary key occurs, per table.
     :param db_file_path: A SQLite db with a consistent schema.
     :return: A dictionary keyed by the names of the tables that have primary keys.
              Each value is itself a dictionary, keyed by the primary key values found
              in the table and mapping each one to the number of SQLite records carrying
              that primary key. Keys occurring fewer than 2 times are dropped, since
              they are not duplicates.
     """
     verify(sql, "sqlite3 needs to be installed to use this subroutine")

     dup_tdf = self._duplicate_focused_tdf
     if dup_tdf:
         # Load the file through the duplicate-focused factory, then count the repeats.
         loaded = dup_tdf.sql.create_tic_dat(db_file_path)
         return find_duplicates(loaded, dup_tdf)

     # A falsy duplicate-focused factory means there is nothing to report.
     return {}
예제 #4
0
 def find_duplicates(self, inputset, raw_data=False):
     """
     Report how many times each duplicated primary key occurs, per table.
     :param inputset: An opalytics inputset consistent with this TicDatFactory
     :param raw_data: boolean. When true, data cleaning is skipped. See create_tic_dat.
     :return: A dictionary keyed by the names of the tables that have primary keys.
              Each value is itself a dictionary, keyed by the primary key values found
              in the table and mapping each one to the number of records carrying that
              primary key. Keys occurring fewer than 2 times are dropped, since they
              are not duplicates.
     """
     problems = []
     # Validate first: the check records its complaints through problems.append,
     # so the failure message must be assembled only after it has run.
     inputset_ok = self._good_inputset(inputset, problems.append)
     first_problem = problems[0] if problems else None
     verify(
         inputset_ok,
         "inputset is inconsistent with this TicDatFactory : %s" % first_problem)

     dup_tdf = self._duplicate_focused_tdf
     if dup_tdf:
         loaded = dup_tdf.opalytics.create_tic_dat(inputset, raw_data=raw_data)
         return find_duplicates(loaded, dup_tdf)

     # A falsy duplicate-focused factory means there is nothing to report.
     return {}
예제 #5
0
    def find_duplicates(self, engine, schema):
        """
        Report how many times each duplicated primary key occurs, per table.

        :param engine: A sqlalchemy Engine object able to connect to the postgres instance

        :param schema: Name of the schema to read from within the engine's database

        :return: A dictionary keyed by the names of the tables that have primary keys.
                 Each value is itself a dictionary, keyed by the primary key values found
                 in the table and mapping each one to the number of postgres records
                 carrying that primary key. Keys occurring fewer than 2 times are dropped,
                 since they are not duplicates.
        """
        verify(sa, "sqlalchemy needs to be installed to use this subroutine")
        self._check_good_pgtd_compatible_table_field_names()

        dup_tdf = self._duplicate_focused_tdf
        if dup_tdf:
            # Load the schema through the duplicate-focused factory, then count the repeats.
            loader = PostgresTicFactory(dup_tdf)
            loaded = loader.create_tic_dat(engine, schema)
            return find_duplicates(loaded, dup_tdf)

        # A falsy duplicate-focused factory means there is nothing to report.
        return {}