def _tables_to_pkl(self):
    """Read the CEA/CTA/CPA ground-truth CSVs and pickle one Table per tab_id."""
    # CEA (cell-entity annotations) is mandatory: one row per annotated cell.
    cea = pd.read_csv(self._gt_path('CEA'),
                      names=['tab_id', 'col_id', 'row_id', 'entities'],
                      dtype={
                          'tab_id': str,
                          'col_id': int,
                          'row_id': int,
                          'entities': str
                      })
    cea['entities'] = cea['entities'].apply(str.split)  # space-separated URIs -> list

    # CTA (column-type annotations) is optional.
    cta_groups = None
    if os.path.exists(self._gt_path('CTA')):
        cta = pd.read_csv(
            self._gt_path('CTA'),
            names=['tab_id', 'col_id', 'perfect', 'okay'],
            dtype={
                'tab_id': str,
                'col_id': int,
                'perfect': str,
                'okay': str
            },
            keep_default_na=False)  # the "okay" value might be empty
        cta['perfect'] = cta['perfect'].apply(str.split)
        cta['okay'] = cta['okay'].apply(str.split)
        cta_groups = cta.groupby('tab_id')

    # CPA (column-property annotations) is optional.
    cpa_groups = None
    if os.path.exists(self._gt_path('CPA')):
        cpa = pd.read_csv(
            self._gt_path('CPA'),
            names=['tab_id', 'source_id', 'target_id', 'properties'],
            dtype={
                'tab_id': str,
                'source_id': int,
                'target_id': int,
                'properties': str
            })
        cpa['properties'] = cpa['properties'].apply(str.split)
        cpa_groups = cpa.groupby('tab_id')

    # Build one Table per tab_id, attach all available annotations, and pickle it.
    for tab_id, cea_group in cea.groupby('tab_id'):
        table = Table(tab_id, self.value, self._table_path(tab_id))
        table.set_gt_cell_annotations(
            zip(cea_group['row_id'], cea_group['col_id'],
                cea_group['entities']))
        if cta_groups is not None and tab_id in cta_groups.groups:
            cta_group = cta_groups.get_group(tab_id)
            table.set_gt_column_annotations(
                zip(cta_group['col_id'], cta_group['perfect'],
                    cta_group['okay']))
        if cpa_groups is not None and tab_id in cpa_groups.groups:
            cpa_group = cpa_groups.get_group(tab_id)
            table.set_gt_property_annotations(
                zip(cpa_group['source_id'], cpa_group['target_id'],
                    cpa_group['properties']))
        # Use a context manager so the file handle is closed deterministically.
        with open(f"{self._pickle_table_folder_path()}/{table.tab_id}.pkl",
                  'wb') as pkl_file:
            pickle.dump(table, pkl_file)
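# A minimal sketch of reading one of the pickled tables back. Hypothetical
# usage, not part of this class: `GTEnum` as the enclosing enum's name and
# '<tab_id>' as a concrete table id are assumptions; the .pkl naming scheme
# matches what _tables_to_pkl writes above.
#
#     import pickle
#     dataset = GTEnum.ST19_Round1
#     with open(f"{dataset._pickle_table_folder_path()}/<tab_id>.pkl", 'rb') as f:
#         table = pickle.load(f)
#     print(table.tab_id)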
@classmethod
def get_test_dataset(cls, size, from_dataset=None, rand=False):
    """
    Helper method to generate a test dataset on-the-fly.

    :param size: dimension of the test dataset to create (# of annotated cells)
    :param from_dataset: dataset to sample rows from. Default: ``ST19_Round1``
    :param rand: True if the rows should be sampled randomly; otherwise,
        the top ``size`` rows are returned.
    :return: a member of a dynamically created enum, wrapping the list of
        sampled ``Table`` objects (call ``.get_tables()`` to retrieve them)
    """
    if from_dataset is None:
        from_dataset = cls.ST19_Round1

    cea = pd.read_csv(from_dataset._gt_path('CEA'),
                      names=['tab_id', 'col_id', 'row_id', 'entities'],
                      dtype={
                          'tab_id': str,
                          'col_id': int,
                          'row_id': int,
                          'entities': str
                      })
    cea['entities'] = cea['entities'].apply(str.split)  # as in _tables_to_pkl
    if rand:
        cea = cea.sample(size).reset_index(drop=True)  # random sample of cells
    else:
        cea = cea[:size]  # top ``size`` cells

    cta_groups = None
    if os.path.exists(from_dataset._gt_path('CTA')):
        cta = pd.read_csv(
            from_dataset._gt_path('CTA'),
            names=['tab_id', 'col_id', 'perfect', 'okay'],
            dtype={
                'tab_id': str,
                'col_id': int,
                'perfect': str,
                'okay': str
            },
            keep_default_na=False)  # the "okay" value might be empty
        cta['perfect'] = cta['perfect'].apply(str.split)
        cta['okay'] = cta['okay'].apply(str.split)
        cta_groups = cta.groupby('tab_id')

    cpa_groups = None
    if os.path.exists(from_dataset._gt_path('CPA')):
        cpa = pd.read_csv(
            from_dataset._gt_path('CPA'),
            names=['tab_id', 'source_id', 'target_id', 'properties'],
            dtype={
                'tab_id': str,
                'source_id': int,
                'target_id': int,
                'properties': str
            })
        cpa['properties'] = cpa['properties'].apply(str.split)
        cpa_groups = cpa.groupby('tab_id')

    tables = []
    for tab_id, cea_group in cea.groupby('tab_id'):
        table = Table(tab_id, f'{from_dataset.value}_test',
                      from_dataset._table_path(tab_id))
        table.set_gt_cell_annotations(
            zip(cea_group['row_id'], cea_group['col_id'],
                cea_group['entities']))
        if cta_groups is not None and tab_id in cta_groups.groups:
            # keep only CTA entries for columns that survived the sampling
            cta_group = cta_groups.get_group(tab_id)
            cta_group = cta_group[cta_group['col_id'].isin(
                cea_group['col_id'].unique())]
            table.set_gt_column_annotations(
                zip(cta_group['col_id'], cta_group['perfect'],
                    cta_group['okay']))
        if cpa_groups is not None and tab_id in cpa_groups.groups:
            # keep only CPA entries whose source AND target columns survived
            cpa_group = cpa_groups.get_group(tab_id)
            cpa_group = cpa_group[
                (cpa_group['source_id'].isin(cea_group['col_id'].unique()))
                & (cpa_group['target_id'].isin(
                    cea_group['col_id'].unique()))]
            table.set_gt_property_annotations(
                zip(cpa_group['source_id'], cpa_group['target_id'],
                    cpa_group['properties']))
        tables.append(table)

    # Wrap the sampled tables in a throwaway enum that mimics the GT interface.
    tmp = Enum('GTTestEnum', {f'{from_dataset.name}_TEST_{size}': tables})
    setattr(tmp, 'get_tables', lambda x: x.value)  # returns the sampled tables
    setattr(tmp, 'get_table_categories',
            lambda x: from_dataset.get_table_categories())
    setattr(tmp, 'total_tables', lambda x: len(tables))
    return list(tmp)[0]  # the enum has exactly one member: return it
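# A minimal usage sketch. Assumption: the enclosing enum is importable as
# `GTEnum`; `get_tables` and `total_tables` are the accessors attached by
# get_test_dataset above.
#
#     test_gt = GTEnum.get_test_dataset(100, rand=True)  # 100 sampled cells
#     for table in test_gt.get_tables():
#         print(table.tab_id)
#     print(test_gt.total_tables())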