def get_number_of_rows(db_engine: Engine, t: Table, sampling: int = 0): sampling = int(sampling) if 100 > sampling > 0: query_total = select([func.count().label('num')]).select_from( t.tablesample(sampling, name='alias', seed=text('{}'.format(SEED)))) else: query_total = select([func.count().label('num')]).select_from(alias(t)) res_t: ResultProxy = db_engine.execute(query_total) total_rows = res_t.first()['num'] res_t.close() return total_rows
def check_uniqueness(db_engine: Engine, table: Table, comb, total_rows: int = None, sampling: int = 0): if comb.__len__() == 0: return False fields = [c for c in comb] if not total_rows: total_rows = get_number_of_rows(db_engine, table, sampling) sampling = int(sampling) if 100 > sampling > 0: sample_t = table.tablesample(sampling, name='alias', seed=text('{}'.format(SEED))) sample_fields = [sample_t.columns[fn.name] for fn in comb] query_unique = select([func.count().label('num')]).select_from( alias(select(sample_fields).distinct())) res_u: ResultProxy = db_engine.execute(query_unique) else: if db_supports_checksum(db_engine): checksum_method = get_checksum_function(db_engine) query_unique = select( [func.count(checksum_method(*fields).distinct()).label('num')]) else: query_unique = select([func.count().label('num')]).select_from( alias(select(fields).distinct())) res_u: ResultProxy = db_engine.execute(query_unique) unique_len = res_u.first()['num'] res_u.close() return total_rows == unique_len, total_rows, unique_len