def validate_pandas(conn_cnx, sql, cases, col_count, method='one', data_type='float', epsilon=None, scale=0,
                    timezone=None):
    """Run ``sql`` with Arrow enabled, fetch it as pandas data and check it against ``cases``.

    The row count is always checked. Cell values are only checked when
    ``method == 'one'``; in that case every column of row ``i`` is compared
    with ``cases[i]``.

    Args:
        conn_cnx: Callable returning a context manager that yields a connection.
        sql: SQL command for execution.
        cases: Expected values, one entry per row. The literal string "NULL"
            marks a row whose cells are expected to be null.
        col_count: Number of columns in dataframe.
        method: If method is 'one', a single dataframe containing all data is
            fetched; any other value fetches dataframes in batches
            (Default value = 'one').
        data_type: Defines how to compare values: 'float', 'decimal', 'date',
            'time', or anything starting with 'timestamp'; any other value
            compares ``cases[i]`` as-is (Default value = 'float').
        epsilon: If not None, values must differ by less than this amount
            instead of being equal. Not used for 'time' / 'timestamp*'
            (Default value = None).
        scale: Fractional-second digits; decides how many characters of a
            time / timestamp string take part in the comparison
            (Default value = 0).
        timezone: Time zone used to build the expected timestamp
            (Default value = None).

    Raises:
        AssertionError: If the row count, the dataframe shape or any cell
            differs from what ``cases`` / ``col_count`` describe.
    """
    row_count = len(cases)
    assert col_count != 0, '# of columns should be larger than 0'
    with conn_cnx() as cnx_table:
        # fetch dataframe with new arrow support
        cursor_table = cnx_table.cursor()
        try:
            cursor_table.execute(SQL_ENABLE_ARROW)
            cursor_table.execute(sql)

            # build dataframe
            total_rows, total_batches = 0, 0
            start_time = time.time()
            if method == 'one':
                df_new = cursor_table.fetch_pandas_all()
                total_rows = df_new.shape[0]
            else:
                for df_new in cursor_table.fetch_pandas_batches():
                    total_rows += df_new.shape[0]
                    total_batches += 1
            end_time = time.time()
        finally:
            # Release the cursor even when execute / fetch raises.
            cursor_table.close()

        print('new way (fetching {}) took {}s'.format(method, end_time - start_time))
        if method == 'batch':
            print('new way has # of batches : {}'.format(total_batches))
        assert total_rows == row_count, 'there should be {} rows, but {} rows'.format(row_count, total_rows)

        # verify the correctness
        # only do it when fetch one dataframe
        if method != 'one':
            return

        assert (row_count, col_count) == df_new.shape, (
            'the shape of old dataframe is {}, '
            'the shape of new dataframe is {}, '
            'shapes are not equal'.format((row_count, col_count), df_new.shape)
        )

        for i, case in enumerate(cases):
            for j in range(col_count):
                c_new = df_new.iat[i, j]
                if case == "NULL":
                    assert c_new is None or pd.isnull(c_new), (
                        '{} row, {} column: original value is NULL, '
                        'new value is {}, values are not equal'.format(i, j, c_new)
                    )
                    continue

                if data_type == 'float':
                    c_case = float(case)
                elif data_type == 'decimal':
                    c_case = Decimal(case)
                elif data_type == 'date':
                    c_case = datetime.strptime(case, '%Y-%m-%d').date()
                elif data_type == 'time':
                    # Compare the textual form, truncated to the requested scale.
                    time_str_len = 8 if scale == 0 else 9 + scale
                    c_case = case.strip()[:time_str_len]
                    c_new = str(c_new).strip()[:time_str_len]
                    assert c_case == c_new, (
                        '{} row, {} column: original value is {}, '
                        'new value is {}, '
                        'values are not equal'.format(i, j, case, c_new)
                    )
                    # ``continue`` (not ``break``): skip the generic comparison
                    # below but still validate the remaining columns of the row.
                    continue
                elif data_type.startswith('timestamp'):
                    time_str_len = 19 if scale == 0 else 20 + scale
                    if timezone:
                        c_case = pd.Timestamp(case[:time_str_len], tz=timezone)
                        if data_type == 'timestamptz':
                            c_case = c_case.tz_convert('UTC')
                        # NOTE(review): tz info is dropped before comparing,
                        # presumably because the fetched value is tz-naive -- confirm.
                        c_case = c_case.tz_localize(None)
                    else:
                        c_case = pd.Timestamp(case[:time_str_len])
                    assert c_case == c_new, (
                        '{} row, {} column: original value is {}, new value is {}, '
                        'values are not equal'.format(i, j, case, c_new)
                    )
                    # See the 'time' branch: keep checking the other columns.
                    continue
                else:
                    c_case = case

                if epsilon is None:
                    assert c_case == c_new, (
                        '{} row, {} column: original value is {}, new value is {}, '
                        'values are not equal'.format(i, j, case, c_new)
                    )
                else:
                    # Plain literal concatenation: a backslash continuation
                    # inside the literal would leak source indentation into
                    # the message.
                    assert abs(c_case - c_new) < epsilon, (
                        '{} row, {} column: original value is {}, '
                        'new value is {}, epsilon is {}, '
                        'values are not equal'.format(i, j, case, c_new, epsilon)
                    )
def validate_pandas(
    cnx_table,
    sql,
    cases,
    col_count,
    method="one",
    data_type="float",
    epsilon=None,
    scale=0,
    timezone=None,
):
    """Run ``sql`` with Arrow enabled, fetch it as pandas data and check it against ``cases``.

    The row count is always checked. Cell values are only checked when
    ``method == "one"``; in that case every column of row ``i`` is compared
    with ``cases[i]``.

    Args:
        cnx_table: Connection object.
        sql: SQL command for execution.
        cases: Expected values, one entry per row. The literal string "NULL"
            marks a row whose cells are expected to be null.
        col_count: Number of columns in dataframe.
        method: If method is 'one', a single dataframe containing all data is
            fetched; any other value fetches dataframes in batches
            (Default value = 'one').
        data_type: Defines how to compare values: 'float', 'decimal', 'date',
            'time', or anything starting with 'timestamp'; any other value
            compares ``cases[i]`` as-is (Default value = 'float').
        epsilon: If not None, values must differ by less than this amount
            instead of being equal. Not used for 'time' / 'timestamp*'
            (Default value = None).
        scale: Fractional-second digits; decides how many characters of a
            time / timestamp string take part in the comparison
            (Default value = 0).
        timezone: Time zone used to build the expected timestamp
            (Default value = None).

    Raises:
        AssertionError: If the row count, the dataframe shape or any cell
            differs from what ``cases`` / ``col_count`` describe.
    """
    row_count = len(cases)
    assert col_count != 0, "# of columns should be larger than 0"

    cursor_table = cnx_table.cursor()
    try:
        cursor_table.execute(SQL_ENABLE_ARROW)
        cursor_table.execute(sql)

        # build dataframe
        total_rows, total_batches = 0, 0
        start_time = time.time()
        if method == "one":
            df_new = cursor_table.fetch_pandas_all()
            total_rows = df_new.shape[0]
        else:
            for df_new in cursor_table.fetch_pandas_batches():
                total_rows += df_new.shape[0]
                total_batches += 1
        end_time = time.time()
    finally:
        # Release the cursor even when execute / fetch raises.
        cursor_table.close()

    print(f"new way (fetching {method}) took {end_time - start_time}s")
    if method == "batch":
        print(f"new way has # of batches : {total_batches}")
    assert (
        total_rows == row_count
    ), f"there should be {row_count} rows, but {total_rows} rows"

    # verify the correctness
    # only do it when fetch one dataframe
    if method != "one":
        return

    assert (row_count, col_count) == df_new.shape, (
        "the shape of old dataframe is {}, "
        "the shape of new dataframe is {}, "
        "shapes are not equal".format((row_count, col_count), df_new.shape)
    )

    for i, case in enumerate(cases):
        for j in range(col_count):
            c_new = df_new.iat[i, j]
            if case == "NULL":
                assert c_new is None or pandas.isnull(c_new), (
                    "{} row, {} column: original value is NULL, "
                    "new value is {}, values are not equal".format(i, j, c_new)
                )
                continue

            if data_type == "float":
                c_case = float(case)
            elif data_type == "decimal":
                c_case = Decimal(case)
            elif data_type == "date":
                c_case = datetime.strptime(case, "%Y-%m-%d").date()
            elif data_type == "time":
                # Compare the textual form, truncated to the requested scale.
                time_str_len = 8 if scale == 0 else 9 + scale
                c_case = case.strip()[:time_str_len]
                c_new = str(c_new).strip()[:time_str_len]
                assert c_new == c_case, (
                    "{} row, {} column: original value is {}, "
                    "new value is {}, "
                    "values are not equal".format(i, j, case, c_new)
                )
                # ``continue`` (not ``break``): skip the generic comparison
                # below but still validate the remaining columns of the row.
                continue
            elif data_type.startswith("timestamp"):
                time_str_len = 19 if scale == 0 else 20 + scale
                if timezone:
                    c_case = pandas.Timestamp(case[:time_str_len], tz=timezone)
                    if data_type == "timestamptz":
                        c_case = c_case.tz_convert("UTC")
                else:
                    c_case = pandas.Timestamp(case[:time_str_len])
                assert c_case == c_new, (
                    "{} row, {} column: original value is {}, new value is {}, "
                    "values are not equal".format(i, j, case, c_new)
                )
                # See the "time" branch: keep checking the other columns.
                continue
            else:
                c_case = case

            if epsilon is None:
                assert c_case == c_new, (
                    "{} row, {} column: original value is {}, new value is {}, "
                    "values are not equal".format(i, j, case, c_new)
                )
            else:
                # Plain literal concatenation: a backslash continuation inside
                # the literal would leak source indentation into the message.
                assert abs(c_case - c_new) < epsilon, (
                    "{} row, {} column: original value is {}, "
                    "new value is {}, epsilon is {}, "
                    "values are not equal".format(i, j, case, c_new, epsilon)
                )