def test_split_heading(self):
    """Split sheet number 6 of the schools workbook with no matches.

    The sheet is split using an empty match mapping and the index
    locations computed for that single sheet; two blocks are expected.
    """
    heading_sheet = sheet_from_file('data/schools.xlsx', 6, 6)
    locations = get_index_locations([heading_sheet])
    sheet_blocks = split_blocks(heading_sheet, {}, locations)
    print(sheet_blocks)
    block_count = len(sheet_blocks)
    self.assertEqual(block_count, 2)
def test_create_blocks(self):
    """Run split -> generalise -> create_blocks on sheet number 0.

    The same (empty) match mapping is handed to both ``split_blocks`` and
    ``create_blocks``; nine output blocks are expected.
    """
    no_matches = {}
    first_sheet = sheet_from_file('data/schools.xlsx', 0, 0)
    locations = get_index_locations([first_sheet])
    generalised = generalise(split_blocks(first_sheet, no_matches, locations))
    output_blocks = create_blocks(generalised, no_matches)
    self.assertEqual(len(output_blocks), 9)
def test_split_heading_match(self):
    """Split sheet number 6 using matches computed against sheet number 7.

    Index locations and matches are both derived from the two-sheet list;
    only the first sheet is split, and three blocks are expected.
    """
    sheets = [
        sheet_from_file('data/schools.xlsx', sheet_nr, sheet_nr)
        for sheet_nr in (6, 7)
    ]
    # Keep this call order: index locations first, then matching.
    locations = get_index_locations(sheets)
    sheet_matches = match(sheets)
    sheet_blocks = split_blocks(sheets[0], sheet_matches, locations)
    print(sheet_blocks)
    block_count = len(sheet_blocks)
    self.assertEqual(block_count, 3)
def test_split_empty(self):
    """Split the sheet loaded with sheet number 5 and counter 0.

    Index locations are built here by inverting ``sheet.index_map``
    (values become keys). One block holding 12 cells is expected.
    """
    sheet = sheet_from_file('data/schools.xlsx', 5, 0)

    # Invert the index map entry by entry, in its iteration order.
    inverted = {}
    for position, index in sheet.index_map.items():
        inverted[index] = position
    print(inverted)

    blocks = split_blocks(sheet, {}, inverted)
    print(blocks)
    self.assertEqual(len(blocks), 1)
    only_block = blocks[0]
    self.assertEqual(len(only_block.cells), 12)
def test_generalise(self):
    """Generalise the blocks split from sheet number 0.

    Expects eight generalised blocks, the last being a
    ``FormulaBlockHorizontal`` with two specific dependant types and the
    second-to-last a ``FormulaBlockVertical``.
    """
    expected_types = {
        'color_3_0_3999755851924192',
        'color_3_0_5999938962981048',
    }
    sheet = sheet_from_file('data/schools.xlsx', 0, 0)
    locations = get_index_locations([sheet])
    gblocks = generalise(split_blocks(sheet, {}, locations))
    print(gblocks)

    self.assertEqual(len(gblocks), 8)
    last_block = gblocks[-1]
    self.assertIsInstance(last_block, FormulaBlockHorizontal)
    self.assertEqual(last_block.dependant_types, expected_types)
    self.assertIsInstance(gblocks[-2], FormulaBlockVertical)
def test_split_actual_match(self):
    """Split sheet number 0 using matches against sheet number 1.

    Expects eleven blocks; the first has two cells and the second has one
    cell carrying the italics/bold/theme/colour type set checked below.
    """
    first, second = (
        sheet_from_file('data/schools.xlsx', sheet_nr, sheet_nr)
        for sheet_nr in (0, 1)
    )
    # Matching runs before index-location lookup, each on its own list.
    sheet_matches = match([first, second])
    locations = get_index_locations([first, second])
    blocks = split_blocks(first, sheet_matches, locations)
    print(sheet_matches)
    print(blocks)

    self.assertEqual(len(blocks), 11)
    self.assertEqual(len(blocks[0].cells), 2)
    expected_types = {
        'italics', 'color_3_0_7999816888943144', 'theme_3', 'bold'
    }
    self.assertEqual(blocks[1].types, expected_types)
    self.assertEqual(len(blocks[1].cells), 1)
def extract(filesin, fileout=None):
    """Run the full extraction pipeline over a list of spreadsheet sheets.

    Args:
        filesin: iterable of ``(filename, sheet_number)`` pairs. Each pair is
            loaded with ``sheet_from_file`` and numbered by its position in
            the iterable.
        fileout: optional output directory given as a string path prefix.
            When truthy, one ``<i>.svg`` drawing per input sheet is written
            there (even if the pipeline fails), and on success also
            ``output.xlsx`` plus the prolog file.

    Returns:
        The dataframe returned by ``fill_blocks``.

    Raises:
        Exception: any error raised while generalising, creating, solving
            or filling blocks is printed and then re-raised unchanged.
    """
    sheets = [
        sheet_from_file(filein, sheetnr, sheet_counter)
        for sheet_counter, (filein, sheetnr) in enumerate(filesin)
    ]
    index_locations = get_index_locations(sheets)
    match_tuples = match(sheets)
    match_sets = _match_sets(match_tuples)
    lines = [
        line
        for sheet in sheets
        for line in split_lines(sheet, match_sets, index_locations)
    ]
    lines.extend(_match_lines(lines, match_sets, index_locations))
    # NOTE(review): split_blocks is called with two arguments here; confirm
    # this matches the signature of the split_blocks in scope.
    sheet_blocks = [split_blocks(sheet, lines) for sheet in sheets]

    # Bound up front so the ``finally`` clause can tell whether a workbook
    # was produced; otherwise a failure before ``fill_blocks`` would raise
    # UnboundLocalError there and hide the real error.
    wb = None
    try:
        generalised_sheet_blocks = [
            generalise(blocks) for blocks in sheet_blocks
        ]
        blocks = [
            block
            for generalised in generalised_sheet_blocks
            for block in generalised
        ]
        output_blocks = create_blocks(blocks, match_tuples)
        assignment = csp(output_blocks, sheets, match_tuples)
        wb, df = fill_blocks(blocks, output_blocks, assignment)
        print('done')
        # Module-wide pandas display option; it stays set after this call.
        pandas.options.display.width = 0
        print(df)
    except Exception as error:
        print('Error')
        print(error)
        raise
    finally:
        if fileout:
            # The drawings are written on failure too, from the
            # pre-generalisation blocks of every sheet.
            for i, drawn_blocks in enumerate(sheet_blocks):
                draw_blocks(drawn_blocks, fileout + f'/{i}.svg')
            # Workbook and prolog file are only written when the pipeline
            # actually produced a workbook.
            if wb is not None:
                filen = fileout + '/output.xlsx'
                wb.save(filen)
                copy_prolog_file(fileout)
    return df
def test_split_blocks(self):
    """Split a hand-built 2x2 sheet; expect a single block of four cells.

    The sheet is built from an index map keyed by coordinate pairs and
    four ``Cell`` objects; index locations are the inverse of that map.
    """
    index_map = {
        (1, 1): ('c1', 'r1'),
        (2, 1): ('c2', 'r1'),
        (1, 2): ('c1', 'r2'),
        (2, 2): ('c2', 'r2'),
    }
    cell_specs = (
        ('a', 'c1', 'r1', 1, 1),
        ('b', 'c2', 'r1', 2, 1),
        ('c', 'c1', 'r2', 1, 2),
        ('d', 'c2', 'r2', 2, 2),
    )
    cells = [Cell(*spec) for spec in cell_specs]

    # Invert the index map entry by entry, in its iteration order.
    locations = {}
    for position, index in index_map.items():
        locations[index] = position

    sheet = Sheet(index_map, cells)
    blocks = split_blocks(sheet, [], locations)
    self.assertEqual(len(blocks), 1)
    only_block = blocks[0]
    self.assertEqual(len(only_block.cells), 4)
def test_csp(self):
    """Solve the CSP for the blocks of sheet number 0 and check the result.

    The split blocks go straight to ``create_blocks`` (no generalise
    step), and one empty match mapping is shared by all three calls. The
    resulting assignment must equal the fixed mapping below.
    """
    expected = {
        'i_0_00_001': 8,
        'i_0_00_003': 10,
        'i_0_00_004': 11,
        'i_0_00_006': 13,
        'i_0_01_007': 14,
        'j_0_00_001': 12,
        'j_0_00_002': 13,
        'j_0_00_003': 14,
        'j_0_00_005': 16,
        'j_0_00_006': 17,
        'j_0_01_001': 12,
    }
    no_matches = {}
    sheet = sheet_from_file('data/schools.xlsx', 0, 0)
    locations = get_index_locations([sheet])
    output_blocks = create_blocks(
        split_blocks(sheet, no_matches, locations), no_matches)
    assignment = csp(output_blocks, [sheet], no_matches)
    self.assertEqual(assignment, expected)