def testBlockExtractorV2ForSimpleTableWithTwoColumns(self):
    """Check the blocks BlockExtractorV2 produces for a 4x2 date/value table.

    The first three extracted blocks are compared, position by position,
    against two HEADER blocks and one VALUE block.
    """
    table = np.array([['date', 'value'],
                      ['2001', '10.0'],
                      ['2002', '11.0'],
                      ['2003', '12.0']])
    sheet = Sheet(table, None)

    # One META row on top of three DATE/DATA rows.
    meta_row = [CellTypePMF({cell_type.META: 1}),
                CellTypePMF({cell_type.META: 1})]
    body_rows = [[CellTypePMF({cell_type.DATE: 1}),
                  CellTypePMF({cell_type.DATA: 1})]
                 for _ in range(3)]
    tags = np.array([meta_row] + body_rows)

    extractor = BlockExtractorV2()
    blocks = extractor.extract_blocks(sheet, tags)

    HEADER = BlockTypePMF({BasicBlockType.HEADER: 1.0})
    VALUE = BlockTypePMF({BasicBlockType.VALUE: 1.0})

    for extracted in blocks:
        print(extracted)

    # Order of blocks in the list shouldn't actually matter.
    # Write a better test to compare without any known order.
    # NOTE(review): the four integers presumably mean
    # (left_col, right_col, top_row, bottom_row) -- confirm against SimpleBlock.
    expected = [
        SimpleBlock(HEADER, 0, 1, 0, 0),
        SimpleBlock(HEADER, 0, 0, 1, 3),  # Todo: This is not correct
        SimpleBlock(VALUE, 1, 1, 1, 3),
    ]
    for position, wanted in enumerate(expected):
        assert blocks[position] == wanted
def testCRFClassificationForSimpleTableWithTwoColumns(self):
    """Check CRFCellClassifier tags for a 4x2 date/value table.

    The classifier output must equal a META header row followed by three
    DATE/DATA rows.
    """
    classifier = CRFCellClassifier()
    table = np.array([['date', 'value'],
                      ['2001', '10.0'],
                      ['2002', '11.0'],
                      ['2003', '12.0']])
    tags = classifier.classify_cells(Sheet(table, None))

    header_row = [CellTypePMF({BasicCellType.META: 1}),
                  CellTypePMF({BasicCellType.META: 1})]
    data_rows = [[CellTypePMF({BasicCellType.DATE: 1}),
                  CellTypePMF({BasicCellType.DATA: 1})]
                 for _ in range(3)]
    expected_tags = np.array([header_row] + data_rows)

    assert np.array_equal(tags, expected_tags)
def testBlockExtractorForSimpleTableWithTwoColumns(self):
    """Check the first block ExampleBlockExtractor returns for a 4x2 table.

    Only ``blocks[0]`` is compared; it must equal a single SimpleBlock built
    with a 0.9 ATTRIBUTE / 0.1 HEADER block-type distribution.
    """
    table = np.array([['date', 'value'],
                      ['2001', '10.0'],
                      ['2002', '11.0'],
                      ['2003', '12.0']])
    sheet = Sheet(table, None)

    # One META row on top of three DATE/DATA rows.
    meta_row = [CellTypePMF({cell_type.META: 1}),
                CellTypePMF({cell_type.META: 1})]
    body_rows = [[CellTypePMF({cell_type.DATE: 1}),
                  CellTypePMF({cell_type.DATA: 1})]
                 for _ in range(3)]
    tags = np.array([meta_row] + body_rows)

    extractor = ExampleBlockExtractor()
    blocks = extractor.extract_blocks(sheet, tags)

    # Order of blocks in the list shouldn't actually matter.
    # Write a better test to compare without any known order.
    mixed_type = BlockTypePMF({
        BasicBlockType.ATTRIBUTE: 0.9,
        BasicBlockType.HEADER: 0.1,
        # block_type.EMPTY: 0
    })
    whole_table = SimpleBlock(mixed_type, 0, 1, 0, 3)

    assert blocks[0] == whole_table
def testExampleClassificationForSimpleTableWithTwoColumns(self):
    """Check that ExampleCellClassifier tags every cell of a 4x2 table EMPTY."""
    classifier = ExampleCellClassifier()
    table = np.array([['date', 'value'],
                      ['2001', '10.0'],
                      ['2002', '11.0'],
                      ['2003', '12.0']])
    tags = classifier.classify_cells(Sheet(table, None))
    print(tags)

    # 4 rows x 2 columns, each cell an independent EMPTY distribution.
    expected_tags = np.array([
        [CellTypePMF({cell_type.EMPTY: 1}) for _ in range(2)]
        for _ in range(4)
    ])

    assert np.array_equal(tags, expected_tags)
def testFeaturizerForMultiplesTables(self):
    """Check Featurize input features and label maps for two 4x2 tables.

    Table 1 has the date column on the left and the value column on the
    right; table 2 has the columns swapped. For each table the featurizer is
    expected to return a tuple ``(node_features, edges, edge_features)``
    where every edge-feature row is the concatenation of the node-feature
    rows of the two endpoints of the corresponding edge.
    """
    # Table 1
    sheet1 = Sheet(np.array([['date', 'value'],
                             ['2001', '10.0'],
                             ['2002', '11.0'],
                             ['2003', '12.0']]), None)
    # Was bound to ``tags`` while ``tags1`` was passed to Featurize below,
    # which raised NameError before any assertion could run.
    tags1 = np.array([
        [CellTypePMF({cell_type.META: 1}), CellTypePMF({cell_type.META: 1})],
        [CellTypePMF({cell_type.DATE: 1}), CellTypePMF({cell_type.DATA: 1})],
        [CellTypePMF({cell_type.DATE: 1}), CellTypePMF({cell_type.DATA: 1})],
        [CellTypePMF({cell_type.DATE: 1}), CellTypePMF({cell_type.DATA: 1})],
    ])
    b1_1 = SimpleBlock("META", 0, 1, 0, 0)
    b1_2 = SimpleBlock("DATE", 0, 0, 1, 3)
    b1_3 = SimpleBlock("_DATA_", 1, 1, 1, 3)
    blocks1 = [b1_1, b1_2, b1_3]

    # Table 2 (wrapped in Sheet so both inputs have the same type).
    sheet2 = Sheet(np.array([['date', 'value'],
                             ['10.0', '2001'],
                             ['11.0', '2002'],
                             ['12.0', '2003']]), None)
    # NOTE(review): these tags are built from plain strings while table 1
    # uses {cell_type: probability} dicts -- confirm CellTypePMF accepts both.
    tags2 = np.array([
        [CellTypePMF('META'), CellTypePMF('META')],
        [CellTypePMF('_DATA_'), CellTypePMF('DATE')],
        [CellTypePMF('_DATA_'), CellTypePMF('DATE')],
        [CellTypePMF('_DATA_'), CellTypePMF('DATE')],
    ])
    b2_1 = SimpleBlock("META", 0, 1, 0, 0)
    b2_2 = SimpleBlock("_DATA_", 0, 0, 1, 3)
    b2_3 = SimpleBlock("DATE", 1, 1, 1, 3)
    blocks2 = [b2_1, b2_2, b2_3]

    featurizer = Featurize([sheet1, sheet2], [tags1, tags2], [blocks1, blocks2])
    input_features, _ = featurizer.get_input_features()
    print(input_features)

    # Directed edges between the six nodes; identical for both tables.
    edges = [[0, 1], [0, 3], [0, 4], [0, 5],
             [1, 0], [1, 2], [1, 3], [1, 5],
             [2, 1], [2, 3], [2, 4], [2, 5],
             [3, 0], [3, 1], [3, 2], [3, 4],
             [4, 0], [4, 2], [4, 3], [4, 5],
             [5, 0], [5, 1], [5, 2], [5, 4]]

    def expected_for(nodes):
        # Edge features are the source node's row followed by the target's.
        return (nodes, edges, [nodes[i] + nodes[j] for i, j in edges])

    nodes1 = [
        [0, 0, 0, 1, 0, 0, 1, 0, True, True, True, False, False],
        [0, 0, 0, 1, 0, 1, 0, 0, True, True, True, False, False],
        [0, 0, 1, 0, 0, 0, 0, 1, True, True, True, False, False],
        [0, 0, 1, 0, 0, 1, 0, 0, True, True, True, True, False],
        [0, 1, 0, 0, 0, 0, 0, 1, True, True, True, False, False],
        [0, 1, 0, 0, 0, 0, 1, 0, True, True, True, True, False],
    ]
    nodes2 = [
        [0, 0, 0, 1, 0, 1, 0, 0, True, True, True, False, False],
        [0, 0, 0, 1, 0, 0, 1, 0, True, True, True, False, False],
        [0, 1, 0, 0, 0, 0, 0, 1, True, True, True, False, False],
        [0, 1, 0, 0, 0, 0, 1, 0, True, True, True, True, False],
        [0, 0, 1, 0, 0, 0, 0, 1, True, True, True, False, False],
        [0, 0, 1, 0, 0, 1, 0, 0, True, True, True, True, False],
    ]
    assert input_features == [expected_for(nodes1), expected_for(nodes2)]

    layoutGraph1 = LayoutGraph(blocks1)
    layoutGraph1.add_edge("header", 0, 1)
    layoutGraph1.add_edge("header", 0, 2)
    layoutGraph1.add_edge("meta", 1, 2)

    # Built from blocks2: this graph describes table 2 (was blocks1).
    layoutGraph2 = LayoutGraph(blocks2)
    layoutGraph2.add_edge("header", 0, 1)
    layoutGraph2.add_edge("header", 0, 2)
    layoutGraph2.add_edge("meta", 2, 1)

    labels = featurizer.get_label_map([layoutGraph1, layoutGraph2])
    assert np.array_equal(labels, [[1, 1, 0, 2, 0, 0], [1, 1, 0, 0, 0, 2]])
def testBlockExtractorForSimpleTableWithTwoColumns(self):
    """Check the blocks SimpleBlockExtractor produces for a 4x2 table.

    The first three extracted blocks are compared, position by position,
    against "META", "DATE" and "_DATA_" blocks.
    """
    table = np.array([['date', 'value'],
                      ['2001', '10.0'],
                      ['2002', '11.0'],
                      ['2003', '12.0']])
    sheet = Sheet(table, None)

    # One META row on top of three DATE/DATA rows.
    meta_row = [CellTypePMF({BasicCellType.META: 1}),
                CellTypePMF({BasicCellType.META: 1})]
    body_rows = [[CellTypePMF({BasicCellType.DATE: 1}),
                  CellTypePMF({BasicCellType.DATA: 1})]
                 for _ in range(3)]
    tags = np.array([meta_row] + body_rows)

    extractor = SimpleBlockExtractor()
    blocks = extractor.extract_blocks(sheet, tags)

    # Order of blocks in the list shouldn't actually matter.
    # Write a better test to compare without any known order.
    # NOTE(review): ``meta`` is constructed but never used by the assertions
    # below, which still use string block types.
    meta = BlockTypePMF({
        BasicBlockType.ATTRIBUTE: 1.0,
    })

    expected = [
        SimpleBlock("META", 0, 1, 0, 0),
        SimpleBlock("DATE", 0, 0, 1, 3),
        SimpleBlock("_DATA_", 1, 1, 1, 3),
    ]
    for position, wanted in enumerate(expected):
        assert blocks[position] == wanted
def testFeaturizerForSimpleTableWithTwoColumns(self):
    """Check the Featurize label map for a single 4x2 date/value table.

    The input features are computed and printed, but only the label map
    derived from a hand-built layout graph is asserted.
    """
    table = np.array([['date', 'value'],
                      ['2001', '10.0'],
                      ['2002', '11.0'],
                      ['2003', '12.0']])
    sheet = Sheet(table, None)

    # One META row on top of three DATE/DATA rows.
    meta_row = [CellTypePMF({cell_type.META: 1}),
                CellTypePMF({cell_type.META: 1})]
    body_rows = [[CellTypePMF({cell_type.DATE: 1}),
                  CellTypePMF({cell_type.DATA: 1})]
                 for _ in range(3)]
    tags = np.array([meta_row] + body_rows)

    ATTRIBUTE = BlockTypePMF({block_type.ATTRIBUTE: 1.0})
    VALUE = BlockTypePMF({block_type.VALUE: 1.0})

    blocks = [
        SimpleBlock(ATTRIBUTE, 0, 1, 0, 0),
        SimpleBlock(ATTRIBUTE, 0, 0, 1, 3),  # Todo: This is not correct
        SimpleBlock(VALUE, 1, 1, 1, 3),
    ]

    featurizer = Featurize([sheet], [tags], [blocks])
    input_features, _ = featurizer.get_input_features()
    print(input_features)

    # TODO: FIX THIS? The exact-value assertions on ``input_features`` that
    # used to follow here are disabled; only the label map is checked below.

    graph = LayoutGraph(blocks)
    for kind, source, target in ((edge_type.HEADER, 0, 1),
                                 (edge_type.HEADER, 0, 2),
                                 (edge_type.ATTRIBUTE, 1, 2)):
        graph.add_edge(kind, source, target)

    labels = featurizer.get_label_map([graph])
    assert np.array_equal(labels, [[1, 1, 0, 2, 0, 0]])
def testLayoutDetectionForSimpleTableWithTwoColumns(self):
    """Check the edges ExampleLayoutDetector assigns for a 4x2 table.

    Three hand-built blocks are passed in; the detected layout must connect
    blocks 0 and 1 to block 2 with ATTRIBUTE edges.
    """
    table = np.array([['date', 'value'],
                      ['2001', '10.0'],
                      ['2002', '11.0'],
                      ['2003', '12.0']])
    sheet = Sheet(table, None)

    # One META row on top of three DATE/DATA rows.
    meta_row = [CellTypePMF({BasicCellType.META: 1}),
                CellTypePMF({BasicCellType.META: 1})]
    body_rows = [[CellTypePMF({BasicCellType.DATE: 1}),
                  CellTypePMF({BasicCellType.DATA: 1})]
                 for _ in range(3)]
    tags = np.array([meta_row] + body_rows)

    ATTRIBUTE = BlockTypePMF({BasicBlockType.ATTRIBUTE: 1.0})
    VALUE = BlockTypePMF({BasicBlockType.VALUE: 1.0})

    blocks = [
        SimpleBlock(ATTRIBUTE, 0, 1, 0, 0),
        SimpleBlock(ATTRIBUTE, 0, 0, 1, 3),  # Todo: This is not correct
        SimpleBlock(VALUE, 1, 1, 1, 3),
    ]

    detector = ExampleLayoutDetector()
    layout = detector.detect_layout(sheet, tags, blocks)

    # TODO: The labels assigned to the edges here are actually wrong.
    # Labels from block b1 should be headers.
    expected_in = [
        [],
        [],
        [(BasicEdgeType.ATTRIBUTE, 0), (BasicEdgeType.ATTRIBUTE, 1)],
    ]
    expected_out = [
        [(BasicEdgeType.ATTRIBUTE, 2)],
        [(BasicEdgeType.ATTRIBUTE, 2)],
        [],
    ]
    assert layout.inEdges == expected_in
    assert layout.outEdges == expected_out
def testCRFEstimator(self):
    """Smoke-test CRFLayoutEstimator on two small, hand-labelled tables.

    Table 1 has the date column on the left; table 2 has the columns
    swapped. Each table is supplied twice. The test makes no assertion on
    the fitted model: it passes if ``fit_crf`` completes without raising.
    """
    ATTRIBUTE = BlockTypePMF({block_type.ATTRIBUTE: 1.0})
    VALUE = BlockTypePMF({block_type.VALUE: 1.0})

    # Table 1
    sheet1 = Sheet(np.array([['date', 'value'],
                             ['2001', '10.0'],
                             ['2002', '11.0'],
                             ['2003', '12.0']]), None)
    tags1 = np.array([
        [CellTypePMF({cell_type.META: 1}), CellTypePMF({cell_type.META: 1})],
        [CellTypePMF({cell_type.DATE: 1}), CellTypePMF({cell_type.DATA: 1})],
        [CellTypePMF({cell_type.DATE: 1}), CellTypePMF({cell_type.DATA: 1})],
        [CellTypePMF({cell_type.DATE: 1}), CellTypePMF({cell_type.DATA: 1})],
    ])
    b1_1 = SimpleBlock(ATTRIBUTE, 0, 1, 0, 0)
    b1_2 = SimpleBlock(ATTRIBUTE, 0, 0, 1, 3)  # Todo: This is not correct
    b1_3 = SimpleBlock(VALUE, 1, 1, 1, 3)
    blocks1 = [b1_1, b1_2, b1_3]

    # Table 2 (wrapped in Sheet so every input sheet has the same type;
    # previously this one was passed as a raw ndarray).
    sheet2 = Sheet(np.array([['date', 'value'],
                             ['10.0', '2001'],
                             ['11.0', '2002'],
                             ['12.0', '2003']]), None)
    tags2 = np.array([
        [CellTypePMF({cell_type.META: 1}), CellTypePMF({cell_type.META: 1})],
        [CellTypePMF({cell_type.DATA: 1}), CellTypePMF({cell_type.DATE: 1})],
        [CellTypePMF({cell_type.DATA: 1}), CellTypePMF({cell_type.DATE: 1})],
        [CellTypePMF({cell_type.DATA: 1}), CellTypePMF({cell_type.DATE: 1})],
    ])
    b2_1 = SimpleBlock(ATTRIBUTE, 0, 1, 0, 0)
    b2_2 = SimpleBlock(VALUE, 0, 0, 1, 3)
    b2_3 = SimpleBlock(ATTRIBUTE, 1, 1, 1, 3)
    blocks2 = [b2_1, b2_2, b2_3]

    layoutGraph1 = LayoutGraph(blocks1)
    layoutGraph1.add_edge(edge_type.HEADER, 0, 1)
    layoutGraph1.add_edge(edge_type.HEADER, 0, 2)
    layoutGraph1.add_edge(edge_type.ATTRIBUTE, 1, 2)

    # Built from blocks2: this graph is paired with table 2 below
    # (it was previously constructed from blocks1).
    layoutGraph2 = LayoutGraph(blocks2)
    layoutGraph2.add_edge(edge_type.HEADER, 0, 1)
    layoutGraph2.add_edge(edge_type.HEADER, 0, 2)
    layoutGraph2.add_edge(edge_type.ATTRIBUTE, 2, 1)

    estimator = CRFLayoutEstimator()
    estimator.set_input(
        [sheet1, sheet2, sheet1, sheet2],
        [tags1, tags2, tags1, tags2],
        [blocks1, blocks2, blocks1, blocks2],
        [layoutGraph1, layoutGraph2, layoutGraph1, layoutGraph2])

    # The fitted detector is not inspected; fitting must simply succeed.
    estimator.fit_crf()
def get_sheet_by_index(self, idx) -> Sheet:
    """Return the workbook sheet at position ``idx`` as a ``Sheet``.

    The sheet's cells are read via ``to_array()`` and converted to a numpy
    array; ``None`` is passed as the second ``Sheet`` argument.
    """
    raw_rows = self.wb.sheet_by_index(idx).to_array()
    return Sheet(np.array(raw_rows), None)
def get_sheets(self) -> List[Sheet]:
    """Yield one ``Sheet`` per entry of ``self.wb.to_dict()``.

    Although annotated as a list, this is a generator: sheets are produced
    lazily, one per sheet name, each with ``None`` as the second ``Sheet``
    argument.
    """
    for sheet_name in self.wb.to_dict():
        raw_rows = self.wb[sheet_name].to_array()
        yield Sheet(np.array(raw_rows), None)
def get_sheet_by_index(self, idx) -> Sheet:
    """Return the workbook sheet at position ``idx`` as a ``Sheet``.

    The cells are read via ``to_array()`` and converted to a numpy array,
    then passed to ``fill_merged_cells`` together with the merged-cell
    ranges of the same sheet in ``self.wb_xlrd``. ``None`` is passed as the
    second ``Sheet`` argument.
    """
    cells = np.array(self.wb.sheet_by_index(idx).to_array())
    merged_ranges = self.wb_xlrd.sheet_by_index(idx).merged_cells
    # The return value is ignored; presumably ``cells`` is updated in place.
    self.fill_merged_cells(cells, merged_ranges)
    return Sheet(cells, None)
def get_sheets(self) -> List[Sheet]:
    """Yield one ``Sheet`` per entry of ``self.wb.to_dict()``.

    Although annotated as a list, this is a generator. For every sheet name
    the cells are converted to a numpy array and passed to
    ``fill_merged_cells`` with the merged-cell ranges of the same-named
    sheet in ``self.wb_xlrd``; the resulting ``Sheet`` receives
    ``{'name': name}`` as its second argument.
    """
    for sheet_name in self.wb.to_dict():
        cells = np.array(self.wb[sheet_name].to_array())
        merged_ranges = self.wb_xlrd.sheet_by_name(sheet_name).merged_cells
        # The return value is ignored; presumably ``cells`` is updated in place.
        self.fill_merged_cells(cells, merged_ranges)
        yield Sheet(cells, {'name': sheet_name})