def test_insert_column(self):
    """Check `insert_column` on the easy table.

    Inserting a two-cell column at index 1 must equal the grid with the
    new cells placed between the first and second original columns, and
    passing a three-cell column must raise.
    """
    top = Cell(tokens=[Token(text='x')], rowspan=1, colspan=1)
    bottom = Cell(tokens=[Token(text='y')], rowspan=1, colspan=1)

    actual = self.easy_table.insert_column(index=1, column=[top, bottom])
    expected = Table.create_from_grid(grid=[
        [self.a, top, self.b, self.c],
        [self.d, bottom, self.e, self.f],
    ])
    self.assertEqual(actual, expected)

    # A column with one cell too many is expected to be rejected.
    too_long = [top, bottom, bottom]
    with self.assertRaises(Exception):
        self.easy_table.insert_column(index=1, column=too_long)
def _create_table_from_omnipage_xml(self, table_tag: Tag, caption: str,
                                    paper_id: str) -> Table:
    """Convert one OmniPage XML table tag into a `Table`.

    Args:
        table_tag: tag containing a `gridtable` child (with `gridcol` and
            `gridrow` entries) and `cellzone` descendants.
        caption: caption passed through to the resulting table.
        paper_id: paper identifier passed through to the resulting table.

    Returns:
        The table built by `Table.create_from_cells`; `page_num` is
        always set to 0 here.
    """
    # Grid dimensions are the number of declared columns and rows.
    grid_tag = table_tag.find('gridtable')
    ncol = len(grid_tag.find_all('gridcol'))
    nrow = len(grid_tag.find_all('gridrow'))

    cells = []
    for zone_tag in table_tag.find_all('cellzone'):
        # One token per <wd> tag, using its stripped text.
        words = [Token(text=wd_tag.get_text(strip=True))
                 for wd_tag in zone_tag.find_all('wd')]

        # Spans are inclusive grid ranges, hence the +1.
        row_till = int(zone_tag.get('gridrowtill'))
        row_from = int(zone_tag.get('gridrowfrom'))
        col_till = int(zone_tag.get('gridcoltill'))
        col_from = int(zone_tag.get('gridcolfrom'))
        cells.append(Cell(tokens=words,
                          rowspan=row_till - row_from + 1,
                          colspan=col_till - col_from + 1))

    return Table.create_from_cells(cells=cells,
                                   nrow=nrow,
                                   ncol=ncol,
                                   paper_id=paper_id,
                                   page_num=0,
                                   caption=caption)
def setUp(self):
    """Create a 1x1 cell holding two tokens, each with a bounding box."""
    hi_token = Token(text='hi',
                     bounding_box=Box(llx=-1.0, lly=-0.5, urx=1.0, ury=1.0))
    bye_token = Token(text='bye',
                      bounding_box=Box(llx=1.5, lly=-0.5, urx=2.5, ury=1.5))
    self.cell = Cell(tokens=[hi_token, bye_token], rowspan=1, colspan=1)
def test_compute_bounding_box(self):
    """A 1x2 table's bounding box must span both of its cells' boxes."""
    left_cell = Cell(tokens=[Token(text='e')], rowspan=1, colspan=1,
                     bounding_box=Box(llx=-1.0, lly=-0.5, urx=1.0, ury=1.0))
    right_cell = Cell(tokens=[Token(text='e')], rowspan=1, colspan=1,
                      bounding_box=Box(llx=1.5, lly=-0.5, urx=2.5, ury=1.5))
    table = Table.create_from_cells(cells=[left_cell, right_cell],
                                    nrow=1, ncol=2,
                                    paper_id='abc', page_num=0,
                                    caption='hi this is caption')

    bbox = table.bounding_box
    # Lower-left corner is the minimum of the cells' lower-left corners.
    self.assertEqual(bbox.ll.x, -1.0)
    self.assertEqual(bbox.ll.y, -0.5)
    # Upper-right corner is the maximum of the cells' upper-right corners.
    self.assertEqual(bbox.ur.x, 2.5)
    self.assertEqual(bbox.ur.y, 1.5)
def _create_table_from_tetml(self, table_id: int, table_tag: Tag,
                             paper_id: str, caption: str) -> Table:
    """Convert one TETML table tag into a `Table`.

    Every `cell` inside every `row` becomes a `Cell` with `rowspan=1` and
    a `colspan` taken from the tag's `colspan` attribute (1 when the
    attribute is absent or empty). Each `word` in a cell becomes a
    `Token` whose text, font and bounding box come from the word's `box`
    tag.

    Args:
        table_id: identifier used only in error messages.
        table_tag: tag whose `row` / `cell` / `word` descendants are read.
        paper_id: paper identifier passed through to the resulting table.
        caption: caption passed through to the resulting table.

    Returns:
        The table built by `Table.create_from_cells`; `page_num` is
        always set to 0 here.

    Raises:
        TetmlXMLToTablesParserException: if the table has no rows, or if
            the rows do not all add up to the same total column span.
    """
    cells = []
    ncol_per_row = []
    for row_tag in table_tag.find_all('row'):
        row_width = 0
        for cell_tag in row_tag.find_all('cell'):
            # BUILD LIST OF TOKENS
            tokens = []
            for word_tag in cell_tag.find_all('word'):
                word_box_tag = word_tag.find('box')
                token = Token(
                    text=word_box_tag.get_text(strip=True),
                    # `find_all` gets font per character,
                    # but use `find` because assume font
                    # is constant within same word
                    font=word_box_tag.find('glyph').get('font'),
                    bounding_box=Box(llx=float(word_box_tag.get('llx')),
                                     lly=float(word_box_tag.get('lly')),
                                     urx=float(word_box_tag.get('urx')),
                                     ury=float(word_box_tag.get('ury'))))
                tokens.append(token)

            # BUILD CELL FROM LIST OF TOKENS
            colspan_attr = cell_tag.get('colspan')
            cell = Cell(tokens=tokens,
                        rowspan=1,
                        colspan=int(colspan_attr) if colspan_attr else 1)
            cells.append(cell)
            row_width += cell.colspan
        ncol_per_row.append(row_width)

    # Without this guard an empty table passes the equality check below
    # (`all` of nothing is True) and then fails with a bare IndexError on
    # `ncol_per_row[0]` instead of the parser's own exception type.
    if not ncol_per_row:
        raise TetmlXMLToTablesParserException(
            'Table {} has no rows. Skipping...'.format(table_id))

    # TODO: add more filters here if necessary
    if any(ncol != ncol_per_row[0] for ncol in ncol_per_row):
        raise TetmlXMLToTablesParserException(
            'Table {} has unequal columns per row.  Skipping...'.format(
                table_id))

    # TODO: `page_num` and `paper_id` fields
    # BUILD TABLE FROM LIST OF CELLS
    table = Table.create_from_cells(cells=cells,
                                    nrow=len(ncol_per_row),
                                    ncol=ncol_per_row[0],
                                    paper_id=paper_id,
                                    page_num=0,
                                    caption=caption)
    return table
def test_compute_metrics(self):
    """Check metric behaviour on header-less, header-only and permuted-header predictions.

    `compute_metrics` must raise for a prediction lacking the header row
    and for one whose header columns are permuted; `cell_level_recall`
    against the header-only prediction must be 0.0.
    """
    def unit_cells(texts):
        # One single-token 1x1 cell per text, in order.
        return [Cell(tokens=[Token(text=text)], rowspan=1, colspan=1)
                for text in texts]

    pred_table_missing_header = Table.create_from_cells(
        cells=unit_cells([
            'x', '1', '2',
            'y', '3', '4',
            'z', '5', '6',
        ]),
        nrow=3, ncol=3)
    with self.assertRaises(Exception):
        compute_metrics(gold_table=self.gold_table,
                        pred_table=pred_table_missing_header)

    recall_on_empty = cell_level_recall(gold_table=self.gold_table,
                                        pred_table=self.pred_table_empty)
    self.assertEqual(recall_on_empty, 0.0)

    pred_table_permuted_header = Table.create_from_cells(
        cells=unit_cells([
            'subject', 'header2', 'header1',
            'x', '2', '1',
            'y', '4', '3',
            'z', '6', '5',
        ]),
        nrow=4, ncol=3)
    with self.assertRaises(Exception):
        compute_metrics(gold_table=self.gold_table,
                        pred_table=pred_table_permuted_header)
def setUp(self):
    """Build the gold table and the predicted-table variants used by the tests.

    gold:
        subject, header1, header2
        x,       1,       2
        y,       3,       4
        z,       5,       6
    """
    def build(texts, nrow, ncol):
        # Table of single-token 1x1 cells, filled in the order given.
        return Table.create_from_cells(
            cells=[Cell(tokens=[Token(text=text)], rowspan=1, colspan=1)
                   for text in texts],
            nrow=nrow, ncol=ncol)

    header = ['subject', 'header1', 'header2']

    self.gold_table = build(header + ['x', '1', '2',
                                      'y', '3', '4',
                                      'z', '5', '6'],
                            nrow=4, ncol=3)
    self.gold_table_empty = build(header, nrow=1, ncol=3)

    # The perfect prediction is the gold table object itself.
    self.pred_table_perfect = self.gold_table
    self.pred_table_empty = build(header, nrow=1, ncol=3)

    self.pred_table_permute_rows = build(header + ['z', '5', '6',
                                                   'y', '3', '4',
                                                   'x', '1', '2'],
                                         nrow=4, ncol=3)
    self.pred_table_extra_rows = build(header + ['x', '1', '2',
                                                 'y', '3', '4',
                                                 'z', '5', '6',
                                                 'w', '7', '8'],
                                       nrow=5, ncol=3)
    self.pred_table_missing_rows = build(header + ['x', '1', '2',
                                                   'y', '3', '4'],
                                         nrow=3, ncol=3)
    self.pred_table_partial_credit = build(header + ['x', '1', '1',
                                                     'y', '4', '4',
                                                     'z', '3', '3'],
                                           nrow=4, ncol=3)
def test_count_matching_cells(self):
    """Identical rows match in all three cells; a reversed row matches only in the middle."""
    def row_of(*texts):
        # Row of single-token 1x1 cells, one per text.
        return [Cell(tokens=[Token(text=text)], rowspan=1, colspan=1)
                for text in texts]

    same_order = count_matching_cells(row1=row_of('1', '2', '3'),
                                      row2=row_of('1', '2', '3'))
    self.assertEqual(same_order, 3.0)

    reversed_order = count_matching_cells(row1=row_of('1', '2', '3'),
                                          row2=row_of('3', '2', '1'))
    self.assertEqual(reversed_order, 1.0)
def test_improper_table(self):
    """`Table.create_from_cells` must raise when cells and dimensions disagree."""
    def unit_cells(*texts):
        # Single-token 1x1 cells, one per text.
        return [Cell(tokens=[Token(text=text)], rowspan=1, colspan=1)
                for text in texts]

    # misspecified nrow or ncol
    with self.assertRaises(Exception):
        Table.create_from_cells(cells=unit_cells('a', 'b', 'c', 'd'),
                                nrow=2, ncol=1,
                                paper_id='', page_num=0, caption='')
    with self.assertRaises(Exception):
        Table.create_from_cells(cells=unit_cells('a', 'b', 'c', 'd'),
                                nrow=1, ncol=2,
                                paper_id='', page_num=0, caption='')

    # not enough cells to fill out table
    with self.assertRaises(Exception):
        Table.create_from_cells(cells=unit_cells('a', 'b', 'c'),
                                nrow=2, ncol=2,
                                paper_id='', page_num=0, caption='')
    with self.assertRaises(Exception):
        Table.create_from_cells(cells=unit_cells('a', 'b'),
                                nrow=2, ncol=2,
                                paper_id='', page_num=0, caption='')

    # cell juts out of table boundaries
    with self.assertRaises(Exception):
        wide_cell = Cell(tokens=[Token(text='a')], rowspan=1, colspan=2)
        Table.create_from_cells(cells=[wide_cell],
                                nrow=1, ncol=1,
                                paper_id='', page_num=0, caption='')
def setUp(self):
    """Build a plain 2x3 table and a 5x4 table with multi-row/column cells."""
    def unit_cell(text):
        # Single-token cell occupying exactly one grid slot.
        return Cell(tokens=[Token(text=text)], rowspan=1, colspan=1)

    self.a = unit_cell('a')
    self.b = unit_cell('b')
    self.c = unit_cell('c')
    self.d = unit_cell('d')
    self.e = unit_cell('e')
    self.f = unit_cell('f')

    # The easy table gets its grid assigned directly.
    self.easy_table = Table(caption='hi this is caption')
    self.easy_table.grid = np.array([
        [self.a, self.b, self.c],
        [self.d, self.e, self.f],
    ])

    # (text, rowspan, colspan) for each cell of the hard table, in the
    # order passed to `create_from_cells`.
    hard_cell_specs = [
        ('', 2, 2), ('C', 1, 2),
        ('C:1', 1, 1), ('C:2', 1, 1),
        ('R', 3, 1), ('R:1', 1, 1), ('a', 1, 1), ('b', 1, 1),
        ('R:2', 1, 1), ('c', 1, 1), ('d', 1, 1),
        ('R:3', 1, 1), ('e', 1, 1), ('f', 1, 1),
    ]
    self.hard_table = Table.create_from_cells(
        cells=[Cell(tokens=[Token(text=text)], rowspan=rows, colspan=cols)
               for text, rows, cols in hard_cell_specs],
        nrow=5, ncol=4,
        paper_id='abc', page_num=0,
        caption='hi this is caption')
def setUp(self):
    """Build the 4x3 source table: a header row followed by rows x, y and z."""
    cell_texts = [
        'subject', 'header1', 'header2',
        'x', '1', '2',
        'y', '3', '4',
        'z', '5', '6',
    ]
    source_cells = [Cell(tokens=[Token(text=text)], rowspan=1, colspan=1)
                    for text in cell_texts]
    self.table_source = Table.create_from_cells(source_cells, nrow=4, ncol=3)
def setUp(self):
    """Build the source table and variants with fewer, more, permuted, no, or only header columns."""
    def unit_cells(texts):
        # One single-token 1x1 cell per text, in order.
        return [Cell(tokens=[Token(text=text)], rowspan=1, colspan=1)
                for text in texts]

    self.table_source = Table.create_from_cells(
        unit_cells([
            'subject', 'header1', 'header2',
            'x', '1', '2',
            'y', '3', '4',
            'z', '5', '6',
        ]),
        nrow=4, ncol=3)

    self.table_less_header = Table.create_from_cells(
        unit_cells([
            'subject', 'header2',
            'x', '1',
            'z', '5',
            'y', '4',
        ]),
        nrow=4, ncol=2)

    self.table_more_header = Table.create_from_cells(
        unit_cells([
            'subject', 'header2', 'header1', 'header3',
            'x', '1', '1', '1',
            'z', '5', '5', '5',
            'y', '4', '4', '4',
        ]),
        nrow=4, ncol=4)

    self.table_permute_header = Table.create_from_cells(
        unit_cells([
            'subject', 'header2', 'header1',
            'x', '1', '2',
            'z', '5', '6',
            'y', '3', '4',
        ]),
        nrow=4, ncol=3)

    self.table_no_header = Table.create_from_cells(
        unit_cells([
            'x', '1', '2',
            'z', '5', '6',
            'y', '3', '4',
        ]),
        nrow=3, ncol=3)

    self.table_only_header = Table.create_from_cells(
        cells=unit_cells(['subject', 'header1', 'header2']),
        nrow=1, ncol=3)
def test_aggregate_tables(self):
    """Aggregating the source table through a column-swapping mapping.

    The target schema has a header row plus one row of `not_copied`
    placeholders. With column mappings (1, 2) and (2, 1), the expected
    aggregate keeps the target header and holds the source rows with
    their two value columns swapped.
    """
    def unit_cells(texts):
        # One single-token 1x1 cell per text, in order.
        return [Cell(tokens=[Token(text=text)], rowspan=1, colspan=1)
                for text in texts]

    schema_matcher = SchemaMatcher()

    target_schema = Table.create_from_cells(
        cells=unit_cells([
            'subject', 'header1', 'header2',
            'not_copied', 'not_copied', 'not_copied',
        ]),
        nrow=2, ncol=3)

    pred_aggregate_table = schema_matcher.aggregate_tables(
        pairwise_mappings=[
            PairwiseMapping(self.table_source, target_schema,
                            score=-999,
                            column_mappings=[(1, 2), (2, 1)])
        ],
        target_schema=target_schema)

    gold_aggregate_table = Table.create_from_cells(
        unit_cells([
            'subject', 'header1', 'header2',
            'x', '2', '1',
            'y', '4', '3',
            'z', '6', '5',
        ]),
        nrow=4, ncol=3)

    # `assertEquals` is a deprecated alias that newer Python versions no
    # longer provide, so use `assertEqual`. The failure message carries
    # both tables in place of the former unconditional debug prints.
    self.assertEqual(
        pred_aggregate_table, gold_aggregate_table,
        msg='\npredicted:\n{}\nexpected:\n{}'.format(pred_aggregate_table,
                                                     gold_aggregate_table))
def setUp(self):
    """Build tables whose rows are permuted, extended, or truncated."""
    def build(texts, nrow, ncol):
        # Table of single-token 1x1 cells, filled in the order given.
        return Table.create_from_cells(
            cells=[Cell(tokens=[Token(text=text)], rowspan=1, colspan=1)
                   for text in texts],
            nrow=nrow, ncol=ncol)

    header = ['subject', 'header1', 'header2']

    self.table_permute_rows = build(header + ['z', '5', '6',
                                              'y', '3', '4',
                                              'x', '1', '2'],
                                    nrow=4, ncol=3)
    self.table_extra_rows = build(header + ['x', '1', '2',
                                            'y', '3', '4',
                                            'z', '5', '6',
                                            'w', '7', '8'],
                                  nrow=5, ncol=3)
    self.table_missing_rows = build(header + ['x', '1', '2',
                                              'y', '3', '4'],
                                    nrow=3, ncol=3)
def test_map_tables(self):
    """Check `ColNameSchemaMatcher.map_tables` against several target schemas.

    Covers a target identical to the source header, one with fewer
    columns, one with an extra column, one with permuted columns, and
    finally several source tables mapped to the permuted target at once.
    """
    def header_only(*names):
        # Single-row schema table with one 1x1 cell per column name.
        return Table.create_from_cells(
            cells=[Cell(tokens=[Token(text=name)], rowspan=1, colspan=1)
                   for name in names],
            nrow=1, ncol=len(names))

    target_schema_easy = header_only('subject', 'header1', 'header2')
    target_schema_less = header_only('subject', 'header2')
    target_schema_more = header_only('subject', 'header0', 'header1',
                                     'header2')
    target_schema_permuted = header_only('subject', 'header2', 'header1')

    schema_matcher = ColNameSchemaMatcher()

    # Same header as the source: columns map onto themselves.
    actual = schema_matcher.map_tables(tables=[self.table_source],
                                       target_schema=target_schema_easy)
    expected = [
        PairwiseMapping(self.table_source, target_schema_easy,
                        score=2.0,
                        column_mappings=[(1, 1), (2, 2)])
    ]
    self.assertListEqual(actual, expected)

    # Permuted header: the two value columns swap.
    actual = schema_matcher.map_tables(tables=[self.table_source],
                                       target_schema=target_schema_permuted)
    expected = [
        PairwiseMapping(self.table_source, target_schema_permuted,
                        score=2.0,
                        column_mappings=[(1, 2), (2, 1)])
    ]
    self.assertListEqual(actual, expected)

    # Extra target column: source columns shift right by one.
    actual = schema_matcher.map_tables(tables=[self.table_source],
                                       target_schema=target_schema_more)
    expected = [
        PairwiseMapping(self.table_source, target_schema_more,
                        score=2.0,
                        column_mappings=[(1, 2), (2, 3)])
    ]
    self.assertListEqual(actual, expected)

    # Fewer target columns: only `header2` can be mapped.
    actual = schema_matcher.map_tables(tables=[self.table_source],
                                       target_schema=target_schema_less)
    expected = [
        PairwiseMapping(self.table_source, target_schema_less,
                        score=1.0,
                        column_mappings=[(2, 1)])
    ]
    self.assertListEqual(actual, expected)

    # Several source tables against the permuted target.
    actual = schema_matcher.map_tables(
        tables=[
            self.table_source,
            self.table_less_header,
            self.table_more_header,
        ],
        target_schema=target_schema_permuted)
    expected = [
        PairwiseMapping(self.table_source, target_schema_permuted,
                        score=2.0,
                        column_mappings=[(1, 2), (2, 1)]),
        PairwiseMapping(self.table_less_header, target_schema_permuted,
                        score=1.0,
                        column_mappings=[(1, 1)]),
        PairwiseMapping(self.table_more_header, target_schema_permuted,
                        score=2.0,
                        column_mappings=[(1, 1), (2, 2)]),
    ]
    self.assertListEqual(actual, expected)