def parse(self):
    """Dispatch worksheet XML elements to their dedicated parser methods.

    Elements produced by ``iterparse`` over ``self.source`` are looked up by
    tag in a dispatch table and passed to the matching ``self.parse_*``
    method, then cleared. Once the stream is exhausted, any rules collected
    in ``self.ws.conditional_formatting.parse_rules`` are applied with a
    single ``update`` call.
    """
    dispatcher = {
        '{%s}mergeCells' % SHEET_MAIN_NS: self.parse_merge,
        '{%s}col' % SHEET_MAIN_NS: self.parse_column_dimensions,
        '{%s}row' % SHEET_MAIN_NS: self.parse_row_dimensions,
        '{%s}printOptions' % SHEET_MAIN_NS: self.parse_print_options,
        '{%s}pageMargins' % SHEET_MAIN_NS: self.parse_margins,
        '{%s}pageSetup' % SHEET_MAIN_NS: self.parse_page_setup,
        '{%s}headerFooter' % SHEET_MAIN_NS: self.parse_header_footer,
        '{%s}conditionalFormatting' % SHEET_MAIN_NS: self.parser_conditional_formatting,
        '{%s}autoFilter' % SHEET_MAIN_NS: self.parse_auto_filter,
    }
    tags = dispatcher.keys()

    # Open the source exactly once, restricted to the tags we dispatch on.
    stream = _get_xml_iter(self.source)
    it = iterparse(stream, tag=tags)

    for _, element in it:
        tag_name = element.tag
        if tag_name in dispatcher:
            dispatcher[tag_name](element)
            # Release the element's content once its handler has run.
            element.clear()

    # Handle parsed conditional formatting rules together.
    if len(self.ws.conditional_formatting.parse_rules):
        self.ws.conditional_formatting.update(self.ws.conditional_formatting.parse_rules)
def parse(self):
    """Walk the worksheet XML and hand each recognised element to its parser.

    A tag -> bound-method table is built from the local element names below
    (qualified with ``SHEET_MAIN_NS``). Every element yielded by
    ``iterparse`` whose tag is in the table is passed to its handler and
    then cleared. Afterwards, collected conditional formatting rules are
    applied in one ``update`` call.
    """
    namespace = "{%s}" % SHEET_MAIN_NS
    handlers_by_name = [
        ("mergeCells", self.parse_merge),
        ("col", self.parse_column_dimensions),
        ("row", self.parse_row_dimensions),
        ("printOptions", self.parse_print_options),
        ("pageMargins", self.parse_margins),
        ("pageSetup", self.parse_page_setup),
        ("headerFooter", self.parse_header_footer),
        ("conditionalFormatting", self.parser_conditional_formatting),
        ("autoFilter", self.parse_auto_filter),
        ("sheetProtection", self.parse_sheet_protection),
        ("dataValidations", self.parse_data_validation),
        ("sheetPr", self.parse_properties),
        ("legacyDrawing", self.parse_legacy_drawing),
    ]
    dispatcher = dict(
        (namespace + local_name, handler)
        for local_name, handler in handlers_by_name
    )

    elements = iterparse(_get_xml_iter(self.source), tag=dispatcher.keys())
    for _, element in elements:
        handler = dispatcher.get(element.tag)
        if handler is None:
            continue
        handler(element)
        element.clear()

    # Handle parsed conditional formatting rules together.
    formatting = self.ws.conditional_formatting
    if len(formatting.parse_rules):
        formatting.update(formatting.parse_rules)
def get_squared_range(self, min_col, min_row, max_col, max_row):
    """
    Yield the rows of the requested range, one item per row.

    The source worksheet file may have rows missing; each missing row is
    yielded as a placeholder: a tuple of ``EMPTY_CELL`` spanning
    ``min_col..max_col``, or an empty list when ``max_col`` is ``None``.
    Rows that are present are yielded as ``tuple(self._get_row(...))``.
    Iteration stops at the first row whose id exceeds ``max_row`` (when
    ``max_row`` is not ``None``).
    """
    if max_col is None:
        empty_row = []
    else:
        empty_row = tuple(EMPTY_CELL for _ in range(min_col, max_col + 1))

    next_row = min_row
    row_events = iterparse(self.xml_source, tag=[ROW_TAG], remove_blank_text=True)
    for _event, element in row_events:
        if element.tag != ROW_TAG:
            continue

        # Rows without an explicit "r" attribute take the running counter.
        row_id = int(element.get("r", next_row))

        # got all the rows we need
        if max_row is not None and row_id > max_row:
            break

        # some rows are missing: pad until the counter reaches this row
        while next_row < row_id:
            next_row += 1
            yield empty_row

        # return cells from a row
        if min_row <= row_id:
            cells = self._get_row(element, min_col, max_col, row_counter=next_row)
            yield tuple(cells)
            next_row += 1

        element.clear()
def _cells_by_row(self, min_col, min_row, max_col, max_row):
    """
    Yield the rows ``min_row..max_row`` of the worksheet, one item per row.

    The source worksheet file may have columns or rows missing. A missing
    row is yielded as a placeholder (a tuple of ``EMPTY_CELL`` spanning
    ``min_col..max_col``, or an empty list when ``max_col`` is ``None``).

    On first use the row elements are read from ``self.xml_source`` and
    cached in ``self._iter_parse``, keyed by the integer value of their
    ``r`` attribute (``-1`` when the attribute is absent).

    When ``max_row`` is ``None`` the highest cached row id is used as the
    upper bound.
    """
    if max_col is not None:
        empty_row = tuple(EMPTY_CELL for _ in range(min_col, max_col + 1))
    else:
        empty_row = []

    if self._iter_parse is None:
        p = iterparse(self.xml_source, tag=[ROW_TAG], remove_blank_text=True)
        self._iter_parse = {
            int(element.get("r", -1)): (_event, element)
            for _event, element in p
            if element.tag == ROW_TAG
        }

    if max_row is None:
        # Unbounded request: stop at the last row actually present.
        max_row = max(self._iter_parse) if self._iter_parse else min_row - 1

    for row_id in range(min_row, max_row + 1):
        result_obj = self._iter_parse.get(row_id)

        # some rows are missing
        if result_obj is None:
            yield empty_row
            # Nothing to unpack for a missing row; move on to the next id.
            continue

        _event, element = result_obj
        yield tuple(
            self._get_row(element, min_col, max_col, row_counter=row_id))
def _cells_by_row(self, min_col, min_row, max_col, max_row):
    """
    Generate the rows of the requested range in order.

    Rows absent from the worksheet XML are filled in with a placeholder
    row (``EMPTY_CELL`` repeated over ``min_col..max_col``, or ``[]`` when
    ``max_col`` is ``None``). Present rows are produced as
    ``tuple(self._get_row(...))``. The scan ends once a row id greater than
    ``max_row`` is seen (when ``max_row`` is not ``None``).
    """
    if max_col is None:
        placeholder = []
    else:
        placeholder = tuple(EMPTY_CELL for _ in range(min_col, max_col + 1))

    expected = min_row
    for _event, element in iterparse(
        self.xml_source, tag=[ROW_TAG], remove_blank_text=True
    ):
        if element.tag != ROW_TAG:
            continue

        # A row lacking "r" is assumed to sit at the expected position.
        row_id = int(element.get("r", expected))

        # got all the rows we need
        if max_row is not None and row_id > max_row:
            break

        # some rows are missing
        while expected < row_id:
            expected += 1
            yield placeholder

        # return cells from a row
        if min_row <= row_id:
            yield tuple(
                self._get_row(element, min_col, max_col, row_counter=expected)
            )
            expected += 1

        element.clear()
def get_squared_range(self, min_col, min_row, max_col, max_row):
    """
    Yield the rows of the requested range, one item per row.

    The source worksheet file may have columns or rows missing. A missing
    row is yielded as a placeholder: a tuple of ``EMPTY_CELL`` spanning
    ``min_col..max_col``, or an empty list when ``max_col`` is ``None``.
    Present rows are yielded as ``tuple(self._get_row(...))``.

    A row element without an ``r`` attribute is treated as sitting at the
    next expected row number instead of raising ``TypeError``.
    """
    if max_col is not None:
        empty_row = tuple(EMPTY_CELL for _ in range(min_col, max_col + 1))
    else:
        empty_row = []
    row_counter = min_row

    p = iterparse(self.xml_source, tag=[ROW_TAG], remove_blank_text=True)
    for _event, element in p:
        if element.tag == ROW_TAG:
            # "r" may be absent; fall back to the running counter.
            row_id = int(element.get("r", row_counter))

            # got all the rows we need
            if max_row is not None and row_id > max_row:
                break

            # some rows are missing
            while row_counter < row_id:
                row_counter += 1
                yield empty_row

            # return cells from a row
            if min_row <= row_id:
                yield tuple(self._get_row(element, min_col, max_col))
                row_counter += 1

        if element.tag in (CELL_TAG, VALUE_TAG, FORMULA_TAG):
            # sub-elements of rows should be skipped as handled within a cell
            continue
        element.clear()
def parse(self):
    """Walk the worksheet XML and hand each recognised element to its parser.

    A tag -> bound-method table is built from the local element names below
    (qualified with ``SHEET_MAIN_NS``). Every element yielded by
    ``iterparse`` whose tag is in the table is passed to its handler and
    then cleared. Finally ``self.ws._current_row`` is set to
    ``self.ws.max_row``.
    """
    namespace = '{%s}' % SHEET_MAIN_NS
    handlers_by_name = [
        ('mergeCells', self.parse_merge),
        ('col', self.parse_column_dimensions),
        ('row', self.parse_row_dimensions),
        ('printOptions', self.parse_print_options),
        ('pageMargins', self.parse_margins),
        ('pageSetup', self.parse_page_setup),
        ('headerFooter', self.parse_header_footer),
        ('conditionalFormatting', self.parser_conditional_formatting),
        ('autoFilter', self.parse_auto_filter),
        ('sheetProtection', self.parse_sheet_protection),
        ('dataValidations', self.parse_data_validation),
        ('sheetPr', self.parse_properties),
        ('legacyDrawing', self.parse_legacy_drawing),
        ('sheetViews', self.parse_sheet_views),
    ]
    dispatcher = dict(
        (namespace + local_name, handler)
        for local_name, handler in handlers_by_name
    )

    elements = iterparse(_get_xml_iter(self.source), tag=dispatcher.keys())
    for _, element in elements:
        handler = dispatcher.get(element.tag)
        if handler is None:
            continue
        handler(element)
        element.clear()

    worksheet = self.ws
    worksheet._current_row = worksheet.max_row
def get_cells(self, min_row, min_col, max_row, max_col):
    """
    Yield a ``ReadOnlyCell`` for each cell inside the requested bounds.

    ``max_row`` and ``max_col`` may be ``None``, meaning no upper limit on
    that axis. Scanning stops at the first row beyond ``max_row`` and,
    within a row, at the first cell beyond ``max_col``. When a cell has a
    formula and ``self.parent.data_only`` is false, the cell is reported
    with ``Cell.TYPE_FORMULA`` and the value ``"=<formula>"``.
    """
    row_events = iterparse(self.xml_source, tag=[ROW_TAG], remove_blank_text=True)
    for _event, element in row_events:
        if element.tag == ROW_TAG:
            row = int(element.get("r"))
            if max_row is not None and row > max_row:
                break
            if min_row <= row:
                for cell in safe_iterator(element, CELL_TAG):
                    # The row number is re-read from the cell coordinate.
                    column_str, row = coordinate_from_string(cell.get('r'))
                    column = column_index_from_string(column_str)
                    if max_col is not None and column > max_col:
                        break
                    if not (min_col <= column):
                        continue

                    data_type = cell.get('t', 'n')
                    style_id = cell.get('s')
                    formula = cell.findtext(FORMULA_TAG)
                    value = cell.findtext(VALUE_TAG)
                    if formula is not None and not self.parent.data_only:
                        data_type = Cell.TYPE_FORMULA
                        value = "=%s" % formula
                    yield ReadOnlyCell(row, column_str, value, data_type, style_id)

        # sub-elements of rows should be skipped (left intact); everything
        # else is released once seen
        if element.tag not in (CELL_TAG, VALUE_TAG, FORMULA_TAG):
            element.clear()
def read_dimension(source):
    """
    Return ``(min_col, min_row, max_col, max_row)`` from the dimension element.

    Columns are returned as matched by ``ABSOLUTE_RE`` (column letters,
    presumably -- confirm against the pattern); rows are returned as ints.
    A single-cell reference yields identical min and max values.

    Returns ``None`` when ``source`` has an ``encode`` attribute, when the
    reference does not match ``ABSOLUTE_RE``, or when the sheet data element
    is reached before any dimension element.
    """
    if hasattr(source, "encode"):
        return None

    dimension_tag = '{%s}dimension' % SHEET_MAIN_NS
    data_tag = '{%s}sheetData' % SHEET_MAIN_NS

    for _event, element in iterparse(source, tag=[dimension_tag, data_tag]):
        tag = element.tag
        if tag == data_tag:
            # Dimensions missing
            break
        if tag != dimension_tag:
            element.clear()
            continue

        match = ABSOLUTE_RE.match(element.get("ref").upper())
        if match is None:
            return None

        first_col, first_row, _sep, last_col, last_row = match.groups()
        first_row = int(first_row)
        if last_col is None or last_row is None:
            # Single-cell reference: the range collapses to one cell.
            return first_col, first_row, first_col, first_row
        return first_col, first_row, last_col, int(last_row)

    return None
def parse(self):
    """Walk the worksheet XML and hand each recognised element to its parser.

    A tag -> bound-method table is built from the local element names below
    (qualified with ``SHEET_MAIN_NS``). Every element yielded by
    ``iterparse`` whose tag is in the table is passed to its handler and
    then cleared. Finally ``self.ws._current_row`` is set to
    ``self.ws.max_row``.
    """
    namespace = '{%s}' % SHEET_MAIN_NS
    handlers_by_name = [
        ('mergeCells', self.parse_merge),
        ('col', self.parse_column_dimensions),
        ('row', self.parse_row_dimensions),
        ('printOptions', self.parse_print_options),
        ('pageMargins', self.parse_margins),
        ('pageSetup', self.parse_page_setup),
        ('headerFooter', self.parse_header_footer),
        ('conditionalFormatting', self.parser_conditional_formatting),
        ('autoFilter', self.parse_auto_filter),
        ('sheetProtection', self.parse_sheet_protection),
        ('dataValidations', self.parse_data_validation),
        ('sheetPr', self.parse_properties),
        ('legacyDrawing', self.parse_legacy_drawing),
        ('sheetViews', self.parse_sheet_views),
        ('extLst', self.parse_extensions),
    ]
    dispatcher = dict(
        (namespace + local_name, handler)
        for local_name, handler in handlers_by_name
    )

    elements = iterparse(_get_xml_iter(self.source), tag=dispatcher.keys())
    for _, element in elements:
        handler = dispatcher.get(element.tag)
        if handler is None:
            continue
        handler(element)
        element.clear()

    worksheet = self.ws
    worksheet._current_row = worksheet.max_row
def get_squared_range(self, min_col, min_row, max_col, max_row):
    """
    Yield the rows of the requested range, one item per row.

    The source worksheet file may have columns or rows missing. A missing
    row is yielded as a placeholder: a tuple of ``EMPTY_CELL`` spanning
    ``min_col..max_col``, or an empty list when ``max_col`` is ``None``.
    Present rows are yielded as ``tuple(self._get_row(...))``. Iteration
    stops at the first row whose id exceeds ``max_row`` (when ``max_row``
    is not ``None``).
    """
    if max_col is not None:
        empty_row = tuple(EMPTY_CELL for _ in range(min_col, max_col + 1))
    else:
        # The placeholder must exist on this branch too, or padding a
        # missing row raises NameError.
        empty_row = []
    row_counter = min_row

    p = iterparse(self.xml_source, tag=[ROW_TAG], remove_blank_text=True)
    for _event, element in p:
        if element.tag == ROW_TAG:
            row_id = int(element.get("r"))

            # got all the rows we need
            if max_row is not None and row_id > max_row:
                break

            # some rows are missing: advance the counter all the way to
            # row_id so the row after a gap is not preceded by a spurious
            # empty row
            while row_counter < row_id:
                row_counter += 1
                yield empty_row

            # return cells from a row
            if min_row <= row_id:
                yield tuple(self._get_row(element, min_col, max_col))
                row_counter += 1

        if element.tag in (CELL_TAG, VALUE_TAG, FORMULA_TAG):
            # sub-elements of rows should be skipped as handled within a cell
            continue
        element.clear()
def get_cells(self, min_row, min_col, max_row, max_col):
    """
    Yield a ``ReadOnlyCell`` bound to ``self`` for each cell inside the bounds.

    ``max_row`` and ``max_col`` may be ``None``, meaning no upper limit on
    that axis. Scanning stops at the first row beyond ``max_row`` and,
    within a row, at the first cell beyond ``max_col``. When a cell has a
    formula and ``self.parent.data_only`` is false, the cell is reported
    with ``Cell.TYPE_FORMULA`` and the value ``"=<formula>"``.
    """
    row_events = iterparse(self.xml_source, tag=[ROW_TAG], remove_blank_text=True)
    for _event, element in row_events:
        if element.tag == ROW_TAG:
            row = int(element.get("r"))
            if max_row is not None and row > max_row:
                break
            if min_row <= row:
                for cell in safe_iterator(element, CELL_TAG):
                    # The row number is re-read from the cell coordinate.
                    column_str, row = coordinate_from_string(cell.get('r'))
                    column = column_index_from_string(column_str)
                    if max_col is not None and column > max_col:
                        break
                    if not (min_col <= column):
                        continue

                    data_type = cell.get('t', 'n')
                    style_id = cell.get('s')
                    formula = cell.findtext(FORMULA_TAG)
                    value = cell.findtext(VALUE_TAG)
                    if formula is not None and not self.parent.data_only:
                        data_type = Cell.TYPE_FORMULA
                        value = "=%s" % formula
                    yield ReadOnlyCell(self, row, column_str, value, data_type, style_id)

        # sub-elements of rows should be skipped (left intact); everything
        # else is released once seen
        if element.tag not in (CELL_TAG, VALUE_TAG, FORMULA_TAG):
            element.clear()
def read_dimension(source):
    """
    Return ``(min_col, min_row, max_col, max_row)`` from the dimension element.

    Columns are passed through ``column_index_from_string`` and rows are
    converted with ``int``. A single-cell reference yields identical min and
    max values.

    Returns ``None`` when ``source`` has an ``encode`` attribute, when the
    reference does not match ``ABSOLUTE_RE``, or when the sheet data element
    is reached before any dimension element.
    """
    if hasattr(source, "encode"):
        return None

    dimension_tag = '{%s}dimension' % SHEET_MAIN_NS
    data_tag = '{%s}sheetData' % SHEET_MAIN_NS

    for _event, element in iterparse(source, tag=[dimension_tag, data_tag]):
        tag = element.tag
        if tag == data_tag:
            # Dimensions missing
            break
        if tag != dimension_tag:
            element.clear()
            continue

        match = ABSOLUTE_RE.match(element.get("ref").upper())
        if match is None:
            return None

        first_col, first_row, _sep, last_col, last_row = match.groups()
        first_row = int(first_row)
        if last_col is None or last_row is None:
            # Single-cell reference: the range collapses to one cell.
            last_col = first_col
            last_row = first_row
        else:
            last_row = int(last_row)

        return (
            column_index_from_string(first_col),
            first_row,
            column_index_from_string(last_col),
            last_row,
        )

    return None
def read_string_table(xml_source):
    """Read in all shared strings in the table.

    Each ``si`` element is converted with ``Text.from_tree``; every
    occurrence of ``'x005F_'`` is stripped from its content. The collected
    strings are returned wrapped in an ``IndexedList``.
    """
    string_tag = '{%s}si' % SHEET_MAIN_NS
    collected = []

    for _, node in iterparse(_get_xml_iter(xml_source)):
        if node.tag != string_tag:
            continue
        content = Text.from_tree(node).content
        collected.append(content.replace('x005F_', ''))
        # Release the element only after its text has been extracted.
        node.clear()

    return IndexedList(collected)
def parse_dimensions(self):
    """
    Get worksheet dimensions if they are provided.

    Returns the ``boundaries`` of the ``SheetDimension`` built from the
    first dimension element, or ``None`` if the sheet data element (or the
    end of the document) is reached first.
    """
    for _event, element in iterparse(self.source):
        tag = element.tag
        if tag == DIMENSION_TAG:
            return SheetDimension.from_tree(element).boundaries
        if tag == DATA_TAG:
            # Dimensions missing
            break
        element.clear()
    return None
def read_string_table(xml_source):
    """Read in all shared strings in the table.

    Each ``si`` element is converted with ``Text.from_tree``; every
    occurrence of ``'x005F_'`` is stripped from its content. Returns the
    strings as a plain list, in document order.
    """
    string_tag = '{%s}si' % SHEET_MAIN_NS
    collected = []

    for _, node in iterparse(xml_source):
        if node.tag != string_tag:
            continue
        content = Text.from_tree(node).content.replace('x005F_', '')
        # Release the element once its text has been extracted.
        node.clear()
        collected.append(content)

    return collected
def _get_cells(self, min_row, min_col, max_row, max_col):
    """
    Yield ``(row_number, cells)`` for each row element within the row bounds.

    ``cells`` is ``tuple(self._get_row(element, min_col, max_col))``.
    Rows below ``min_row`` are skipped; iteration stops at the first row
    beyond ``max_row`` (when ``max_row`` is not ``None``).
    """
    p = iterparse(self.xml_source, tag=[ROW_TAG], remove_blank_text=True)
    for _event, element in p:
        if element.tag == ROW_TAG:
            row = int(element.get("r"))
            if max_row is not None and row > max_row:
                break
            if min_row <= row:
                yield row, tuple(self._get_row(element, min_col, max_col))
        if element.tag in (CELL_TAG, VALUE_TAG, FORMULA_TAG):
            # sub-elements of rows should be skipped
            continue
        element.clear()
def parse(self):
    """Populate ``self.ws`` from the worksheet XML in ``self.source``.

    Two lookup tables drive the parse:

    * ``dispatcher`` maps an element tag to a bound ``self.parse_*`` method
      that is called with the element.
    * ``properties`` maps an element tag to ``(attribute_name, cls)``; the
      element is converted with ``cls.from_tree`` and stored on ``self.ws``
      under ``attribute_name``.

    Handled elements are cleared afterwards. Finally
    ``self.ws._current_row`` is set to ``self.ws.max_row``.
    """
    dispatcher = {
        '{%s}mergeCells' % SHEET_MAIN_NS: self.parse_merge,
        '{%s}col' % SHEET_MAIN_NS: self.parse_column_dimensions,
        '{%s}row' % SHEET_MAIN_NS: self.parse_row,
        '{%s}conditionalFormatting' % SHEET_MAIN_NS: self.parser_conditional_formatting,
        '{%s}legacyDrawing' % SHEET_MAIN_NS: self.parse_legacy_drawing,
        '{%s}sheetProtection' % SHEET_MAIN_NS: self.parse_sheet_protection,
        '{%s}extLst' % SHEET_MAIN_NS: self.parse_extensions,
        '{%s}hyperlink' % SHEET_MAIN_NS: self.parse_hyperlinks,
        '{%s}tableParts' % SHEET_MAIN_NS: self.parse_tables,
    }

    properties = {
        '{%s}printOptions' % SHEET_MAIN_NS: ('print_options', PrintOptions),
        '{%s}pageMargins' % SHEET_MAIN_NS: ('page_margins', PageMargins),
        '{%s}pageSetup' % SHEET_MAIN_NS: ('page_setup', PrintPageSetup),
        '{%s}headerFooter' % SHEET_MAIN_NS: ('HeaderFooter', HeaderFooter),
        '{%s}autoFilter' % SHEET_MAIN_NS: ('auto_filter', AutoFilter),
        '{%s}dataValidations' % SHEET_MAIN_NS: ('data_validations', DataValidationList),
        #'{%s}sheet/{%s}sortState' % (SHEET_MAIN_NS, SHEET_MAIN_NS): ('sort_state', SortState),
        '{%s}sheetPr' % SHEET_MAIN_NS: ('sheet_properties', WorksheetProperties),
        '{%s}sheetViews' % SHEET_MAIN_NS: ('views', SheetViewList),
        '{%s}sheetFormatPr' % SHEET_MAIN_NS: ('sheet_format', SheetFormatProperties),
        '{%s}rowBreaks' % SHEET_MAIN_NS: ('page_breaks', PageBreak),
    }

    # Request both handler tags and property tags. NOTE(review): if
    # iterparse filters on ``tag`` (as lxml's does), passing only the
    # dispatcher keys would make the properties branch below unreachable --
    # confirm against the iterparse implementation in use.
    tags = list(dispatcher) + list(properties)
    stream = _get_xml_iter(self.source)
    it = iterparse(stream, tag=tags)

    for _, element in it:
        tag_name = element.tag
        if tag_name in dispatcher:
            dispatcher[tag_name](element)
            element.clear()
        elif tag_name in properties:
            attribute, cls = properties[tag_name]
            setattr(self.ws, attribute, cls.from_tree(element))
            element.clear()

    self.ws._current_row = self.ws.max_row
def parse(self):
    """Parse ``self.source`` incrementally, yielding each parsed row.

    Elements are matched by tag, in this order of precedence:

    1. tags in ``dispatcher`` are passed to the mapped ``self.parse_*``
       method;
    2. tags in ``properties`` are converted with the mapped class's
       ``from_tree`` and stored on ``self`` under the mapped attribute name;
    3. ``ROW_TAG`` elements are passed to ``self.parse_row`` and the result
       is yielded.

    A matched element is cleared once handled; unmatched elements are left
    untouched.
    """
    dispatcher = {
        COL_TAG: self.parse_column_dimensions,
        PROT_TAG: self.parse_sheet_protection,
        EXT_TAG: self.parse_extensions,
        CF_TAG: self.parse_formatting,
        LEGACY_TAG: self.parse_legacy,
        ROW_BREAK_TAG: self.parse_row_breaks,
        COL_BREAK_TAG: self.parse_col_breaks,
        CUSTOM_VIEWS_TAG: self.parse_custom_views,
    }

    properties = {
        PRINT_TAG: ('print_options', PrintOptions),
        MARGINS_TAG: ('page_margins', PageMargins),
        PAGE_TAG: ('page_setup', PrintPageSetup),
        HEADER_TAG: ('HeaderFooter', HeaderFooter),
        FILTER_TAG: ('auto_filter', AutoFilter),
        VALIDATION_TAG: ('data_validations', DataValidationList),
        PROPERTIES_TAG: ('sheet_properties', WorksheetProperties),
        VIEWS_TAG: ('views', SheetViewList),
        FORMAT_TAG: ('sheet_format', SheetFormatProperties),
        SCENARIOS_TAG: ('scenarios', ScenarioList),
        TABLE_TAG: ('tables', TablePartList),
        HYPERLINK_TAG: ('hyperlinks', HyperlinkList),
        MERGE_TAG: ('merged_cells', MergeCells),
    }

    # add a finaliser to close the source when this becomes possible
    elements = iterparse(self.source)

    for _, element in elements:
        tag_name = element.tag

        handler = dispatcher.get(tag_name)
        if handler is not None:
            handler(element)
            element.clear()
            continue

        if tag_name in properties:
            attribute, factory = properties[tag_name]
            setattr(self, attribute, factory.from_tree(element))
            element.clear()
            continue

        if tag_name == ROW_TAG:
            row = self.parse_row(element)
            # Clear before yielding so the element is released even if the
            # consumer never resumes the generator.
            element.clear()
            yield row
def read_dimension(source):
    """
    Return the worksheet boundaries declared by the dimension element.

    The result is the ``boundaries`` attribute of the ``SheetDimension``
    built from the first dimension element found.

    Returns ``None`` when ``source`` has an ``encode`` attribute, or when
    the sheet data element (or the end of the document) is reached before
    any dimension element.
    """
    if hasattr(source, "encode"):
        return None

    DIMENSION_TAG = '{%s}dimension' % SHEET_MAIN_NS
    DATA_TAG = '{%s}sheetData' % SHEET_MAIN_NS

    it = iterparse(source, tag=[DIMENSION_TAG, DATA_TAG])
    for _event, element in it:
        if element.tag == DIMENSION_TAG:
            dim = SheetDimension.from_tree(element)
            return dim.boundaries
        elif element.tag == DATA_TAG:
            # Dimensions missing
            break
        element.clear()
    return None
def read_dimension(source):
    """
    Return ``(min_col, min_row, max_col, max_row)`` from the dimension element.

    The ``ref`` attribute is split on ``':'`` into a start and a stop
    coordinate (a reference without ``':'`` is used for both), and each is
    decomposed with ``coordinate_from_string``.

    Returns ``None`` when the sheet data element (or the end of the
    document) is reached before any dimension element.
    """
    dimension_tag = '{%s}dimension' % SHEET_MAIN_NS
    data_tag = '{%s}sheetData' % SHEET_MAIN_NS

    for _event, element in iterparse(source, tag=[dimension_tag, data_tag]):
        tag = element.tag
        if tag == data_tag:
            # Dimensions missing
            break
        if tag != dimension_tag:
            element.clear()
            continue

        ref = element.get("ref")
        start, stop = ref.split(':') if ':' in ref else (ref, ref)
        first_col, first_row = coordinate_from_string(start)
        last_col, last_row = coordinate_from_string(stop)
        return first_col, first_row, last_col, last_row

    return None
def parse(self):
    """Populate ``self.ws`` from the worksheet XML in ``self.source``.

    Two lookup tables drive the parse:

    * ``dispatcher`` maps an element tag to a bound ``self.parse_*`` method
      that is called with the element.
    * ``properties`` maps an element tag to ``(attribute_name, cls)``; the
      element is converted with ``cls.from_tree`` and stored on ``self.ws``
      under ``attribute_name``.

    Handled elements are cleared afterwards. Finally
    ``self.ws._current_row`` is set to ``self.ws.max_row``.
    """
    dispatcher = {
        '{%s}mergeCells' % SHEET_MAIN_NS: self.parse_merge,
        '{%s}col' % SHEET_MAIN_NS: self.parse_column_dimensions,
        '{%s}row' % SHEET_MAIN_NS: self.parse_row,
        '{%s}conditionalFormatting' % SHEET_MAIN_NS: self.parser_conditional_formatting,
        '{%s}legacyDrawing' % SHEET_MAIN_NS: self.parse_legacy_drawing,
        '{%s}sheetProtection' % SHEET_MAIN_NS: self.parse_sheet_protection,
        '{%s}extLst' % SHEET_MAIN_NS: self.parse_extensions,
        '{%s}hyperlink' % SHEET_MAIN_NS: self.parse_hyperlinks,
        '{%s}tableParts' % SHEET_MAIN_NS: self.parse_tables,
    }

    properties = {
        '{%s}printOptions' % SHEET_MAIN_NS: ('print_options', PrintOptions),
        '{%s}pageMargins' % SHEET_MAIN_NS: ('page_margins', PageMargins),
        '{%s}pageSetup' % SHEET_MAIN_NS: ('page_setup', PrintPageSetup),
        '{%s}headerFooter' % SHEET_MAIN_NS: ('HeaderFooter', HeaderFooter),
        '{%s}autoFilter' % SHEET_MAIN_NS: ('auto_filter', AutoFilter),
        '{%s}dataValidations' % SHEET_MAIN_NS: ('data_validations', DataValidationList),
        #'{%s}sheet/{%s}sortState' % (SHEET_MAIN_NS, SHEET_MAIN_NS): ('sort_state', SortState),
        '{%s}sheetPr' % SHEET_MAIN_NS: ('sheet_properties', WorksheetProperties),
        '{%s}sheetViews' % SHEET_MAIN_NS: ('views', SheetViewList),
        '{%s}sheetFormatPr' % SHEET_MAIN_NS: ('sheet_format', SheetFormatProperties),
        '{%s}rowBreaks' % SHEET_MAIN_NS: ('page_breaks', PageBreak),
    }

    # Request both handler tags and property tags. NOTE(review): if
    # iterparse filters on ``tag`` (as lxml's does), passing only the
    # dispatcher would make the properties branch below unreachable --
    # confirm against the iterparse implementation in use.
    tags = list(dispatcher) + list(properties)
    stream = _get_xml_iter(self.source)
    it = iterparse(stream, tag=tags)

    for _, element in it:
        tag_name = element.tag
        if tag_name in dispatcher:
            dispatcher[tag_name](element)
            element.clear()
        elif tag_name in properties:
            attribute, cls = properties[tag_name]
            setattr(self.ws, attribute, cls.from_tree(element))
            element.clear()

    self.ws._current_row = self.ws.max_row