Exemplo n.º 1
0
 def normalize_table(self, table: Table) -> Table:
     """Build a new Table in which every cell occupies one grid position.

     Each cell of `table` is expanded into one 1x1 cell per (row, col) pair
     listed in its `indices`. The expanded cells reuse the original cell's
     `tokens` object, and `nrow` / `ncol` are carried over unchanged.
     """
     unit_cells = [
         Cell(tokens=spanning_cell.tokens,
              index_topleft_row=row,
              index_topleft_col=col,
              rowspan=1,
              colspan=1)
         for spanning_cell in table.cells
         for row, col in spanning_cell.indices
     ]
     return Table(cells=unit_cells, nrow=table.nrow, ncol=table.ncol)
Exemplo n.º 2
0
 def _standardize_cell_sizes(self, table: Table) -> Table:
     """Split multispan cells into 1x1 cells.

     For every cell in `table`, one new cell with rowspan=1 and colspan=1 is
     created at each (row, col) position in the cell's `indices`, sharing
     the original cell's `tokens`. The result is a new Table with the same
     `nrow` and `ncol`.
     """
     split_cells = []
     for multispan in table.cells:
         split_cells.extend(
             Cell(tokens=multispan.tokens,
                  index_topleft_row=r,
                  index_topleft_col=c,
                  rowspan=1,
                  colspan=1) for r, c in multispan.indices)
     return Table(cells=split_cells, nrow=table.nrow, ncol=table.ncol)
Exemplo n.º 3
0
 def _add_empty_subject(self, table: Table) -> Table:
     """Prepend a column of empty cells to `table` and return a new Table.

     The existing cells are shifted one column to the right in place (their
     `index_topleft_col` is incremented), so the cell objects of the input
     table are modified by this call.
     """
     # shift every existing cell right to make room for the new column 0
     for existing in table.cells:
         existing.index_topleft_col += 1

     grid = table.grid
     # one empty 1x1 cell per row, all anchored at column 0
     blank_column = [
         Cell(tokens=[],
              index_topleft_row=row,
              index_topleft_col=0,
              rowspan=1,
              colspan=1) for row in range(table.nrow)
     ]
     widened = np.insert(grid, 0, values=blank_column, axis=1)
     return Table(grid=widened)
Exemplo n.º 4
0
 def _add_empty_header(self, table: Table) -> Table:
     """Prepend a row of empty cells to `table` and return a new Table.

     The existing cells are shifted one row down in place (their
     `index_topleft_row` is incremented), so the cell objects of the input
     table are modified by this call.
     """
     # push every existing cell down to make room for the new row 0
     for existing in table.cells:
         existing.index_topleft_row += 1

     grid = table.grid
     # one empty 1x1 cell per column, all anchored at row 0
     blank_row = [
         Cell(tokens=[],
              index_topleft_row=0,
              index_topleft_col=col,
              rowspan=1,
              colspan=1) for col in range(table.ncol)
     ]
     taller = np.insert(grid, 0, values=blank_row, axis=0)
     return Table(grid=taller)
Exemplo n.º 5
0
    def predict(self, tables: List[Table], target_schema: List[str]) -> Table:
        """Merge `tables` one by one into a table headed by `target_schema`.

        A one-row table is created with one cell per schema name. Each input
        table is then aligned against the current merged table by column
        names and merged into it; the final merged table is returned.
        """
        header_cells = []
        for col, name in enumerate(target_schema):
            header_cells.append(
                Cell(tokens=[name],
                     index_topleft_row=0,
                     index_topleft_col=col,
                     rowspan=1,
                     colspan=1))
        merged = Table(cells=header_cells, nrow=1, ncol=len(target_schema))

        # match each table to the schema (order doesn't matter);
        # the alignment score is not needed here, only the alignments
        for candidate in tables:
            _, alignments = self.compute_column_alignments_by_column_names(
                merged, candidate)
            merged = self.merge_two_tables(target=merged,
                                           source=candidate,
                                           column_alignments=alignments)
        return merged
Exemplo n.º 6
0
    def merge_two_tables(self,
                         target: Table,
                         source: Table,
                         column_alignments: List[Tuple[int, int]],
                         pad: str = 'NONE') -> Table:
        """Merge a `source` table into a `target` table based on their
        `column_alignments`, which is a List of Tuple[int, int] that index
        the `target` column and the `source` column, respectively.

        All rows of `target` are kept. The rows of `source` below its first
        (header) row are appended, with their columns rearranged to follow
        the target's column order. Unaligned target columns are padded with
        `pad`. A source that has only a header row contributes no rows.

        If a target column appears in several alignments, the first one is
        used. Every cell of the returned Table holds the string form of the
        corresponding input cell as its single token.
        """
        t = np.array([[str(cell) for cell in row] for row in target.grid],
                     dtype=object)

        # number of source rows below the header; never negative
        n_new_rows = max(source.nrow - 1, 0)

        # target column -> source column; keep the first alignment seen for
        # a target column
        source_col_of = {}
        for index_t_col, index_s_col in column_alignments:
            source_col_of.setdefault(index_t_col, index_s_col)

        # start fully padded, then overwrite the aligned columns
        new_rows = np.full((n_new_rows, target.ncol), pad, dtype=object)

        # only read source data when there are data rows: with a header-only
        # source the stringified array would be 1-D and column indexing
        # would fail
        if n_new_rows > 0:
            s = np.array([[str(cell) for cell in row]
                          for row in source.grid[1:]],
                         dtype=object)
            for j in range(target.ncol):
                if j in source_col_of:
                    new_rows[:, j] = s[:, source_col_of[j]]

        # append rows of permuted source (excluding header) into target
        t = np.append(t, new_rows, axis=0)

        # convert to a table
        new_table = Table(
            grid=[[Cell([cell], i, j) for j, cell in enumerate(row)]
                  for i, row in enumerate(t)])
        return new_table
Exemplo n.º 7
0
def predict_oracle(source_tables: List[Table], gold_table: Table) -> Table:
    """Greedily assemble a predicted table from source rows that match gold.

    The gold table and every source table are converted to arrays of cell
    strings, with the first row (header) removed and the first column
    (subject) split off. While unmatched gold rows and unused sources
    remain, the function:

      1. scores every remaining source by aligning its columns to the gold
         columns and picks the highest-scoring one (removing it from the
         list of sources),
      2. aligns the rows of that column-permuted source to the gold rows,
      3. appends the matched source rows to the prediction, each prefixed
         with a '<source index>__<subject>' label,
      4. drops the gold rows that were matched.

    The loop also stops as soon as a row alignment scores 0.

    Returns a Table whose first row is the stringified first row of
    `gold_table` and whose remaining rows are the matched source rows.
    """
    # convert tables into numpy arrays for easier management
    # - strip header row & subject col
    # - pad sources w/ Nones s.t. they have at least as many columns as gold
    gold = np.array([[str(cell) for cell in row]
                     for row in gold_table.grid[1:, 1:]],
                    dtype=object)
    sources = []
    for source_table in source_tables:
        s = {
            'subject':
            np.array([str(cell) for cell in source_table.grid[1:, 0]],
                     dtype=object),
            'source':
            np.array([[str(cell) for cell in row]
                      for row in source_table.grid[1:, 1:]],
                     dtype=object)
        }
        n_pad_cols = gold.shape[1] - s['source'].shape[1]
        if n_pad_cols > 0:
            # an object-dtype np.empty array is filled with None
            padding = np.empty(shape=[s['source'].shape[0], n_pad_cols],
                               dtype=object)
            s['source'] = np.append(s['source'], padding, axis=1)
        sources.append(s)

    # initialize predicted output with the (stringified) first row of gold
    pred = np.array([[str(cell) for cell in gold_table.grid[0, :]]],
                    dtype=object)

    # continue until every gold row is matched and/or run out of sources
    while gold.shape[0] > 0 and len(sources) > 0:
        #
        # (1) which source table has most similar columns to gold?
        #
        scores = []
        all_column_mappings = []
        for s in sources:
            # represent each column j as a list [ cell_1j, cell_2j, ... ]
            # gold & source can have differing-length columns
            gold_cols = [list(col) for col in zip(*gold)]
            source_cols = [list(col) for col in zip(*s['source'])]

            # align columns between gold & source
            # similarity of two columns = size of what compute_intersection
            # returns for them (helper defined elsewhere)
            score, column_mappings = compute_best_alignments(
                x=gold_cols,
                y=source_cols,
                sim=lambda gold_col, source_col: len(
                    compute_intersection(x=gold_col, y=source_col)))
            scores.append(score)
            all_column_mappings.append(column_mappings)

        # pick best match among sources & permute its cols to match gold
        # also, pop this source from the list of sources
        index_best_score = np.argmax(scores)
        best_column_mappings = all_column_mappings[index_best_score]
        s = sources.pop(index_best_score)
        # presumably the mappings are ordered by gold column, so that this
        # list puts source columns in gold order -- TODO confirm against
        # compute_best_alignments
        permute_source_cols = [
            source_col for gold_col, source_col in best_column_mappings
        ]
        source = s['source'][:, permute_source_cols]
        subject = s['subject']

        #
        # (2) which rows of (col-permuted) source table match best to gold rows?
        #
        # represent each row i as a tuple = ( cell_i1, cell_i2, ..., cell_ik )
        #  where k = ncol(gold)
        gold_rows = [tuple(cell for cell in row) for row in gold]
        source_rows = [tuple(cell for cell in row) for row in source]

        # align rows between gold & source
        # similarity of two rows = number of positions with equal cells
        # if score is 0, then break because no more matching is possible
        score, row_mappings = compute_best_alignments_with_threshold(
            x=gold_rows,
            y=source_rows,
            sim=lambda gold_row, source_row: sum(
                [g_i == s_i for g_i, s_i in zip(gold_row, source_row)]),
            threshold=0)
        if score == 0:
            break
        index_gold_rows = []
        index_source_rows = []
        for index_gold_row, index_source_row in row_mappings:
            index_gold_rows.append(index_gold_row)
            index_source_rows.append(index_source_row)

        #
        # (3) append matched source rows to pred
        #
        # NOTE(review): index_best_score is the position within the current
        # (shrinking) `sources` list, not within `source_tables`, so the same
        # label prefix can be produced for different source tables across
        # iterations -- confirm this is intended.
        new_rows = [[str(index_best_score) + '__' + subject[i]] +
                    list(source_rows[i]) for i in index_source_rows]
        pred = np.append(pred, new_rows, axis=0)

        #
        # (4) remove gold rows that matched
        #
        gold = np.delete(gold, index_gold_rows, axis=0)

    # NOTE(review): the trailing `0, 0` are presumably rowspan/colspan, which
    # are 1 wherever they are passed by keyword elsewhere -- verify that zero
    # spans are acceptable to Cell/Table.
    return Table(
        grid=[[Cell([cell], i, j, 0, 0) for j, cell in enumerate(row)]
              for i, row in enumerate(pred)])
Exemplo n.º 8
0
                    list(source_rows[i]) for i in index_source_rows]
        pred = np.append(pred, new_rows, axis=0)

        #
        # (4) remove gold rows that matched
        #
        gold = np.delete(gold, index_gold_rows, axis=0)

    return Table(
        grid=[[Cell([cell], i, j, 0, 0) for j, cell in enumerate(row)]
              for i, row in enumerate(pred)])


if __name__ == '__main__':
    # Example source table with 3 rows and 4 columns: a first row holding
    # '', 'x', 'y', 'z', followed by two rows whose first cells are 's:m1'
    # and 's:m2'.
    source_table1 = Table(cells=[
        Cell([''], 0, 0),
        Cell(['x'], 0, 1),
        Cell(['y'], 0, 2),
        Cell(['z'], 0, 3),
        Cell(['s:m1'], 1, 0),
        Cell(['a'], 1, 1),
        Cell(['?'], 1, 2),
        Cell(['2'], 1, 3),
        Cell(['s:m2'], 2, 0),
        Cell(['b'], 2, 1),
        Cell(['?'], 2, 2),
        Cell(['1'], 2, 3),
    ],
                          nrow=3,
                          ncol=4)
Exemplo n.º 9
0
    def setUp(self):
        """Build the cells and tables shared by the tests.

        Layout of the full table (5 rows x 4 columns):

        >     |       |        C       |
        >     |       |   C:1  |  C:2  |
        >  R  |  R:1  |    a   |   b   |
        >  R  |  R:2  |    c   |   d   |
        >  R  |  R:3  |    e   |   f   |

        The empty top-left cell spans 2x2, 'C' spans two columns and 'R'
        spans three rows; every other cell is 1x1.
        """

        def make_cell(token, row, col, rowspan=1, colspan=1):
            # single-token cell anchored at (row, col)
            return Cell(tokens=[token],
                        index_topleft_row=row,
                        index_topleft_col=col,
                        rowspan=rowspan,
                        colspan=colspan)

        # header region
        self.a = make_cell('', 0, 0, rowspan=2, colspan=2)
        self.b = make_cell('C', 0, 2, colspan=2)
        self.c = make_cell('C:1', 1, 2)
        self.d = make_cell('C:2', 1, 3)

        # subject columns
        self.e = make_cell('R', 2, 0, rowspan=3)
        self.f = make_cell('R:1', 2, 1)
        self.g = make_cell('R:2', 3, 1)
        self.h = make_cell('R:3', 4, 1)

        # value cells
        self.i = make_cell('a', 2, 2)
        self.j = make_cell('b', 2, 3)
        self.k = make_cell('c', 3, 2)
        self.l = make_cell('d', 3, 3)
        self.m = make_cell('e', 4, 2)
        self.n = make_cell('f', 4, 3)

        top_left_block = [[self.a, self.a], [self.a, self.a]]
        self.single_cell_table = Table(grid=top_left_block)

        full_grid = [
            [self.a, self.a, self.b, self.b],
            [self.a, self.a, self.c, self.d],
            [self.e, self.f, self.i, self.j],
            [self.e, self.g, self.k, self.l],
            [self.e, self.h, self.m, self.n],
        ]
        self.full_table = Table(grid=full_grid)
Exemplo n.º 10
0
    def test_improper_table(self):
        """Table construction rejects inconsistent cells and dimensions."""
        # (token, row, col) of the four 1x1 cells of a 2x2 table, row-major
        specs = (('a', 0, 0), ('b', 0, 1), ('c', 1, 0), ('d', 1, 1))

        def first_cells(count):
            # build fresh 1x1 cells for the first `count` specs
            return [
                Cell(tokens=[token],
                     index_topleft_row=row,
                     index_topleft_col=col,
                     rowspan=1,
                     colspan=1) for token, row, col in specs[:count]
            ]

        # misspecified nrow or ncol raises IndexError
        with self.assertRaises(IndexError):
            Table(cells=first_cells(4), nrow=2, ncol=1)

        with self.assertRaises(IndexError):
            Table(cells=first_cells(4), nrow=1, ncol=2)

        # not enough cells to fill out table
        with self.assertRaises(ValueError):
            Table(cells=first_cells(3), nrow=2, ncol=2)

        with self.assertRaises(ValueError):
            Table(cells=first_cells(2), nrow=2, ncol=2)

        # cell protrudes out of table boundaries
        with self.assertRaises(IndexError):
            wide_cell = Cell(tokens=['a'],
                             index_topleft_row=0,
                             index_topleft_col=0,
                             rowspan=1,
                             colspan=2)
            Table(cells=[wide_cell], nrow=1, ncol=1)
Exemplo n.º 11
0
 def setUp(self):
     """Create a two-token cell anchored at row 1, column 2, spanning 2x2."""
     cell_spec = {
         'tokens': ['hi', 'bye'],
         'index_topleft_row': 1,
         'index_topleft_col': 2,
         'rowspan': 2,
         'colspan': 2,
     }
     self.cell = Cell(**cell_spec)