def assert_tab_content_equal(fp_x, fp_y):
    """
    Test for equality of tab files, only down to level of content - should
    not be taken as canonical equality, but rather that all the expected
    content matches to both input files, but not the order in which they
    appear.

    For more precise equality, you will need to apply a configuration
        - use assert_tab_equal_by_config(fp_x, fp_y, config)

    If the base name of ``fp_x.name`` starts with ``i_`` both files are
    parsed with ``read_investigation_file`` and compared section by section.
    Otherwise both files are read as tab-separated tables, all-empty columns
    are dropped, and every column of x is compared, after sorting, with the
    column of the same (contextualised) name in y.

    Differences are reported with ``print`` and turned into a ``False``
    return value; the AssertionErrors raised internally are not propagated.

    :param fp_x: File descriptor of a ISAtab file (must expose ``.name``)
    :param fp_y: File descriptor of another ISAtab file
    :return: True if the content matches, False otherwise
    """
    from os.path import basename

    def _assert_df_equal(x, y):
        # need to sort values to loosen up how equality is calculated
        try:
            assert_frame_equal(
                x.sort_values(by=x.columns[0]),
                y.sort_values(by=y.columns[0]))
            return True
        except AssertionError as e:
            print(e)
            return False

    if basename(fp_x.name).startswith("i_"):
        df_dict_x = read_investigation_file(fp_x)
        df_dict_y = read_investigation_file(fp_y)
        eq = True
        for k in df_dict_x.keys():
            dfx = df_dict_x[k]
            dfy = df_dict_y[k]
            if not isinstance(dfx, list):
                if not _assert_df_equal(dfx, dfy):
                    eq = False
                    break
            else:
                # NOTE(review): a ValueError raised while sorting/comparing
                # the lists is only printed, so that section is skipped
                # rather than counted as a mismatch - confirm this
                # best-effort behaviour is intended.
                try:
                    for x, y in zip(sorted(dfx), sorted(dfy)):
                        if not _assert_df_equal(x, y):
                            eq = False
                            break
                except ValueError as e:
                    print(e)
        return eq

    import re

    import numpy as np

    # Raw strings: "\[" is an invalid escape sequence in a normal literal.
    context_regexes = (
        re.compile(r"Characteristics\[(.*?)\]"),
        re.compile(r"Parameter Value\[(.*?)\]"),
        re.compile(r"Factor Value\[(.*?)\]"),
    )

    def diff(a, b):
        b = set(b)
        return [aa for aa in a if aa not in b]

    def _drop_empty_columns(df):
        # Treat "" like a missing value, drop columns that are entirely
        # missing, then turn the remaining gaps back into "".
        df = df.replace("", np.nan)
        df = df.dropna(axis=1, how="all")
        return df.replace(np.nan, "")

    def _contextualised_columns(columns):
        # Give the qualifier columns that follow a Characteristics /
        # Parameter Value / Factor Value column (Unit, Term Source REF,
        # Term Accession Number) a name prefixed with that column, so that
        # repeated qualifier headers can be told apart.
        original = list(columns)
        relabelled = list(original)
        for i, col in enumerate(original):
            if not any(rx.match(col) for rx in context_regexes):
                continue
            try:
                if "Unit" in original[i + 1]:
                    relabelled[i + 1] = col + "/Unit"
                    if "Term Source REF" in original[i + 2]:
                        relabelled[i + 2] = col + "/Unit/Term Source REF"
                    if "Term Accession Number" in original[i + 3]:
                        relabelled[i + 3] = \
                            col + "/Unit/Term Accession Number"
                elif "Term Source REF" in original[i + 1]:
                    relabelled[i + 1] = col + "/Term Source REF"
                    if "Term Accession Number" in original[i + 2]:
                        relabelled[i + 2] = col + "/Term Accession Number"
            except IndexError:
                # The context column sits at the end of the table; there
                # are no (more) qualifier columns to relabel.
                pass
        return relabelled

    df_x = pd.read_csv(fp_x, sep="\t", encoding="utf-8")
    df_y = pd.read_csv(fp_y, sep="\t", encoding="utf-8")
    try:
        # drop empty columns
        df_x = _drop_empty_columns(df_x)
        df_y = _drop_empty_columns(df_y)

        # Compare header names with the ".N" suffix of repeated headers
        # stripped off.
        names_x = {c.split(".", 1)[0] for c in df_x.columns}
        names_y = {c.split(".", 1)[0] for c in df_y.columns}
        if names_x != names_y:
            print("x: " + str(df_x.columns))
            print("y: " + str(df_y.columns))
            print(diff(df_x.columns, df_y.columns))
            raise AssertionError("Columns in x do not match those in y")

        # reindex to add contexts for duplicate named columns
        # (i.e. Term Accession Number, Unit, etc.)
        df_x.columns = _contextualised_columns(df_x.columns)
        df_y.columns = _contextualised_columns(df_y.columns)

        # zip() below stops at the shorter column, so a differing number of
        # rows has to be rejected explicitly.
        if len(df_x) != len(df_y):
            raise AssertionError(
                "Number of rows in x (" + str(len(df_x)) +
                ") does not match number of rows in y (" +
                str(len(df_y)) + ")")

        for colx in df_x.columns:
            # A column of x with no counterpart in y is a mismatch, not a
            # KeyError.
            if colx not in df_y.columns:
                raise AssertionError(
                    "Column: " + str(colx) + ", is missing from y")
            for eachx, eachy in zip(df_x.sort_values(by=colx)[colx],
                                    df_y.sort_values(by=colx)[colx]):
                if eachx != eachy:
                    print(df_x[colx])
                    print(df_y[colx])
                    raise AssertionError("Value: " + str(eachx) +
                                         ", does not match: " + str(eachy))
        return True
    except AssertionError as e:
        print(str(e))
        return False
def assert_tab_content_equal(fp_x, fp_y):
    """
    Test for equality of tab files, only down to level of content - should
    not be taken as canonical equality, but rather that all the expected
    content matches to both input files, but not the order in which they
    appear.

    For more precise equality, you will need to apply a configuration
        - use assert_tab_equal_by_config(fp_x, fp_y, config)

    If the base name of ``fp_x.name`` starts with ``i_`` both files are
    parsed with ``read_investigation_file`` and compared section by section.
    Otherwise both files are read as tab-separated tables, all-empty columns
    are dropped, and every column of x is compared, after sorting, with the
    column of the same (contextualised) name in y.

    Differences are reported with ``print`` and turned into a ``False``
    return value; the AssertionErrors raised internally are not propagated.

    :param fp_x: File descriptor of a ISAtab file (must expose ``.name``)
    :param fp_y: File descriptor of another ISAtab file
    :return: True if the content matches, False otherwise
    """
    from os.path import basename

    def _assert_df_equal(x, y):
        # need to sort values to loosen up how equality is calculated
        try:
            assert_frame_equal(
                x.sort_values(by=x.columns[0]),
                y.sort_values(by=y.columns[0]))
            return True
        except AssertionError as e:
            print(e)
            return False

    if basename(fp_x.name).startswith('i_'):
        df_dict_x = read_investigation_file(fp_x)
        df_dict_y = read_investigation_file(fp_y)
        eq = True
        for k in df_dict_x.keys():
            dfx = df_dict_x[k]
            dfy = df_dict_y[k]
            if not isinstance(dfx, list):
                if not _assert_df_equal(dfx, dfy):
                    eq = False
                    break
            else:
                # NOTE(review): a ValueError raised while sorting/comparing
                # the lists is only printed, so that section is skipped
                # rather than counted as a mismatch - confirm this
                # best-effort behaviour is intended.
                try:
                    for x, y in zip(sorted(dfx), sorted(dfy)):
                        if not _assert_df_equal(x, y):
                            eq = False
                            break
                except ValueError as e:
                    print(e)
        return eq

    import re

    import numpy as np

    # Raw strings: '\[' is an invalid escape sequence in a normal literal.
    context_regexes = (
        re.compile(r'Characteristics\[(.*?)\]'),
        re.compile(r'Parameter Value\[(.*?)\]'),
        re.compile(r'Factor Value\[(.*?)\]'),
    )

    def diff(a, b):
        b = set(b)
        return [aa for aa in a if aa not in b]

    def _drop_empty_columns(df):
        # Treat '' like a missing value, drop columns that are entirely
        # missing, then turn the remaining gaps back into ''.
        df = df.replace('', np.nan)
        df = df.dropna(axis=1, how='all')
        return df.replace(np.nan, '')

    def _contextualised_columns(columns):
        # Give the qualifier columns that follow a Characteristics /
        # Parameter Value / Factor Value column (Unit, Term Source REF,
        # Term Accession Number) a name prefixed with that column, so that
        # repeated qualifier headers can be told apart.
        original = list(columns)
        relabelled = list(original)
        for i, col in enumerate(original):
            if not any(rx.match(col) for rx in context_regexes):
                continue
            try:
                if 'Unit' in original[i + 1]:
                    relabelled[i + 1] = col + '/Unit'
                    if 'Term Source REF' in original[i + 2]:
                        relabelled[i + 2] = col + '/Unit/Term Source REF'
                    if 'Term Accession Number' in original[i + 3]:
                        relabelled[i + 3] = \
                            col + '/Unit/Term Accession Number'
                elif 'Term Source REF' in original[i + 1]:
                    relabelled[i + 1] = col + '/Term Source REF'
                    if 'Term Accession Number' in original[i + 2]:
                        relabelled[i + 2] = col + '/Term Accession Number'
            except IndexError:
                # The context column sits at the end of the table; there
                # are no (more) qualifier columns to relabel.
                pass
        return relabelled

    df_x = pd.read_csv(fp_x, sep='\t', encoding='utf-8')
    df_y = pd.read_csv(fp_y, sep='\t', encoding='utf-8')
    try:
        # drop empty columns
        df_x = _drop_empty_columns(df_x)
        df_y = _drop_empty_columns(df_y)

        # Compare header names with the '.N' suffix of repeated headers
        # stripped off.
        names_x = {c.split('.', 1)[0] for c in df_x.columns}
        names_y = {c.split('.', 1)[0] for c in df_y.columns}
        if names_x != names_y:
            print('x: ' + str(df_x.columns))
            print('y: ' + str(df_y.columns))
            print(diff(df_x.columns, df_y.columns))
            raise AssertionError("Columns in x do not match those in y")

        # reindex to add contexts for duplicate named columns
        # (i.e. Term Accession Number, Unit, etc.)
        df_x.columns = _contextualised_columns(df_x.columns)
        df_y.columns = _contextualised_columns(df_y.columns)

        # zip() below stops at the shorter column, so a differing number of
        # rows has to be rejected explicitly.
        if len(df_x) != len(df_y):
            raise AssertionError(
                'Number of rows in x (' + str(len(df_x)) +
                ') does not match number of rows in y (' +
                str(len(df_y)) + ')')

        for colx in df_x.columns:
            # A column of x with no counterpart in y is a mismatch, not a
            # KeyError.
            if colx not in df_y.columns:
                raise AssertionError(
                    'Column: ' + str(colx) + ', is missing from y')
            for eachx, eachy in zip(df_x.sort_values(by=colx)[colx],
                                    df_y.sort_values(by=colx)[colx]):
                if eachx != eachy:
                    print(df_x[colx])
                    print(df_y[colx])
                    raise AssertionError("Value: " + str(eachx) +
                                         ", does not match: " + str(eachy))
        return True
    except AssertionError as e:
        print(str(e))
        return False
def assert_tab_content_equal(fp_x, fp_y):
    """
    Test for equality of tab files, only down to level of content - should
    not be taken as canonical equality, but rather that all the expected
    content matches to both input files, but not the order in which they
    appear.

    For more precise equality, you will need to apply a configuration
        - use assert_tab_equal_by_config(fp_x, fp_y, config)

    If the base name of ``fp_x.name`` starts with ``i_`` both files are
    parsed with ``read_investigation_file`` and compared section by section.
    Otherwise both files are read as tab-separated tables, all-empty columns
    are dropped, and every column of x is compared, after sorting, with the
    column of the same (contextualised) name in y.

    Differences are reported through ``log`` and turned into a ``False``
    return value; the AssertionErrors raised internally are not propagated.

    :param fp_x: File descriptor of a ISAtab file (must expose ``.name``)
    :param fp_y: File descriptor of another ISAtab file
    :return: True if the content matches, False otherwise
    """

    def _assert_df_equal(x, y):
        # need to sort values to loosen up how equality is calculated
        try:
            assert_frame_equal(
                x.sort_values(by=x.columns[0]),
                y.sort_values(by=y.columns[0]))
            return True
        except AssertionError as e:
            # The frames are no longer dumped to a hard-coded
            # ~/Downloads/... path here: a failing write made a plain
            # mismatch raise instead of returning False. The log output
            # below carries the same information.
            log.error('Error thrown comparing two dataframes: {}'.format(e))
            log.error('x columns are: {}'.format(x.columns))
            log.error('y columns are: {}'.format(y.columns))
            log.error('x data are: {}'.format(x.to_numpy()))
            log.error('y data are: {}'.format(y.to_numpy()))
            return False

    if basename(fp_x.name).startswith('i_'):
        df_dict_x = read_investigation_file(fp_x)
        df_dict_y = read_investigation_file(fp_y)
        eq = True
        for k in df_dict_x.keys():
            dfx = df_dict_x[k]
            dfy = df_dict_y[k]
            if not isinstance(dfx, list):
                if not _assert_df_equal(dfx, dfy):
                    eq = False
                    break
            else:
                # NOTE(review): a ValueError raised while sorting/comparing
                # the lists is only logged, so that section is skipped
                # rather than counted as a mismatch - confirm this
                # best-effort behaviour is intended.
                try:
                    for x, y in zip(sorted(dfx), sorted(dfy)):
                        if not _assert_df_equal(x, y):
                            eq = False
                            break
                except ValueError as e:
                    log.error(e)
        return eq

    import numpy as np

    context_regexes = (
        _RX_CHARACTERISTICS, _RX_PARAM_VALUE, _RX_FACTOR_VALUE)

    def diff(a, b):
        b = set(b)
        return [aa for aa in a if aa not in b]

    def _drop_empty_columns(df):
        # Treat '' like a missing value, drop columns that are entirely
        # missing, then turn the remaining gaps back into ''.
        df = df.replace('', np.nan)
        df = df.dropna(axis=1, how='all')
        return df.replace(np.nan, '')

    def _contextualised_columns(columns):
        # Give the qualifier columns that follow a column matched by one of
        # the context regexes (Unit, Term Source REF, Term Accession Number)
        # a name prefixed with that column, so that repeated qualifier
        # headers can be told apart.
        original = list(columns)
        relabelled = list(original)
        for i, col in enumerate(original):
            if not any(RX.match(col) for RX in context_regexes):
                continue
            try:
                if 'Unit' in original[i + 1]:
                    relabelled[i + 1] = col + '/Unit'
                    if 'Term Source REF' in original[i + 2]:
                        relabelled[i + 2] = col + '/Unit/Term Source REF'
                    if 'Term Accession Number' in original[i + 3]:
                        relabelled[i + 3] = col + \
                            '/Unit/Term Accession Number'
                elif 'Term Source REF' in original[i + 1]:
                    relabelled[i + 1] = col + '/Term Source REF'
                    if 'Term Accession Number' in original[i + 2]:
                        relabelled[i + 2] = col + \
                            '/Term Accession Number'
            except IndexError:
                # The context column sits at the end of the table; there
                # are no (more) qualifier columns to relabel.
                pass
        return relabelled

    df_x = pd.read_csv(fp_x, sep='\t', encoding='utf-8')
    df_y = pd.read_csv(fp_y, sep='\t', encoding='utf-8')
    try:
        # drop empty columns
        df_x = _drop_empty_columns(df_x)
        df_y = _drop_empty_columns(df_y)

        # Compare header names with the '.N' suffix of repeated headers
        # stripped off.
        names_x = {c.split('.', 1)[0] for c in df_x.columns}
        names_y = {c.split('.', 1)[0] for c in df_y.columns}
        if names_x != names_y:
            log.debug('x: ' + str(df_x.columns))
            log.debug('y: ' + str(df_y.columns))
            log.debug(diff(df_x.columns, df_y.columns))
            raise AssertionError('Columns in x do not match those in y')

        # reindex to add contexts for duplicate named columns
        # (i.e. Term Accession Number, Unit, etc.)
        df_x.columns = _contextualised_columns(df_x.columns)
        df_y.columns = _contextualised_columns(df_y.columns)

        # zip() below stops at the shorter column, so a differing number of
        # rows has to be rejected explicitly.
        if len(df_x) != len(df_y):
            raise AssertionError(
                'Number of rows in x (' + str(len(df_x)) +
                ') does not match number of rows in y (' +
                str(len(df_y)) + ')')

        for colx in df_x.columns:
            # A column of x with no counterpart in y is a mismatch, not a
            # KeyError.
            if colx not in df_y.columns:
                raise AssertionError(
                    'Column: ' + str(colx) + ', is missing from y')
            for eachx, eachy in zip(df_x.sort_values(by=colx)[colx],
                                    df_y.sort_values(by=colx)[colx]):
                if eachx != eachy:
                    log.debug(df_x[colx])
                    log.debug(df_y[colx])
                    raise AssertionError('Value: ' + str(eachx) +
                                         ', does not match: ' + str(eachy))
        return True
    except AssertionError as e:
        log.error(str(e))
        return False