示例#1
0
    def __init__(self, filename=None, excel=None):

        self.eval = None

        if excel:
            # if we are running as an excel addin, this gets passed to us
            self.excel = excel
            self.filename = excel.filename
            self.hash = None
        else:
            # TODO: use a proper interface so we can (eventually) support
            # loading from file (much faster)  Still need to find a good lib.
            self.excel = ExcelOpxWrapper(filename=filename)
            self.excel.connect()
            self.filename = filename

        # grab a copy of the current hash
        self._excel_file_md5_digest = self._compute_excel_file_md5_digest

        self.log = logging.getLogger('pycel')

        # directed graph for cell dependencies
        self.dep_graph = nx.DiGraph()

        # cell address to Cell mapping, cells and ranges already built
        self.cell_map = {}

        # cells, ranges and graph_edges that need to be built
        self.graph_todos = []
        self.range_todos = []

        self.extra_data = None
        self._formula_cells_list = None
示例#2
0
    def _get_cell(self, address):
        cell_value = self.cell_map.get(str(address))

        if cell_value is None:
            return ExcelOpxWrapper.RangeData(address, '', None)

        elif isinstance(cell_value, str) and cell_value.startswith('='):
            return ExcelOpxWrapper.RangeData(address, cell_value, None)

        else:
            return ExcelOpxWrapper.RangeData(address, '', cell_value)
示例#3
0
    def __init__(self, filename=None, excel=None, plugins=None, cycles=False):
        """ Build a compiler instance to organize the formula for a workbook

        :param filename: Excel filename to load from (xlsx or `to_file`)
        :param excel: Opened instance of ExcelWrapper
        :param plugins: module paths for plugin lib functions
        :param cycles: workbook has calculation cycles
        """

        self._eval = None

        if excel:
            # if we are running as an excel addin, this gets passed to us
            self.excel = excel
            self.filename = excel.filename
            self.hash = None
        else:
            # TODO: use a proper interface so we can (eventually) support
            # loading from file (much faster)  Still need to find a good lib.
            self.excel = ExcelOpxWrapper(filename=filename)
            self.excel.connect()
            self.filename = filename

        # grab a copy of the current hash
        self._excel_file_md5_digest = self._compute_excel_file_md5_digest

        self.log = logging.getLogger('pycel')

        # directed graph for cell dependencies
        self.dep_graph = nx.DiGraph()

        # cell address to Cell mapping, cells and ranges already built
        self.cell_map = {}

        # cells, ranges and graph_edges that need to be built
        self.graph_todos = []
        self.range_todos = []

        self.extra_data = None
        self.conditional_formats = {}
        self._formula_cells_dict = {}
        self._plugin_modules = plugins

        # Setup to be able to evaluate circular references
        self.cycles = cycles
        self.Cell = _CycleCell if cycles else _Cell
        self.evaluate = (self._evaluate_iterative
                         if cycles else self._evaluate_non_iterative)
示例#4
0
    def get_range(self, address):
        cell = self._get_cell(address)

        if not address.is_range:
            return cell

        elif address.is_unbounded_range:
            # this is a unbounded range to range mapping, disassemble
            formula = cell.formula
            assert formula.startswith(REF_START)
            assert formula.endswith(REF_END)
            ref_addr = formula[len(REF_START):-len(REF_END)]
            return self.get_range(AddressRange(ref_addr))

        elif cell.formula:
            return cell

        else:
            # need to map col or row ranges to a specific range
            addresses = address.resolve_range

            cells = [[self._get_cell(addr) for addr in row]
                     for row in addresses]
            values = [[c.values for c in row] for row in cells]

            return ExcelOpxWrapper.RangeData(address, None, values)
示例#5
0
    def __init__(self, filename=None, excel=None, plugins=None):
        """ Build a compiler instance to organize the formula for a workbook

        :param filename: Excel filename to load from (xlsx or `to_file`)
        :param excel: Opened instance of ExcelWrapper
        :param plugins: module paths for plugin lib functions
        """

        self._eval = None

        if excel:
            # if we are running as an excel addin, this gets passed to us
            self.excel = excel
            self.filename = excel.filename
            self.hash = None
        else:
            # TODO: use a proper interface so we can (eventually) support
            # loading from file (much faster)  Still need to find a good lib.
            self.excel = ExcelOpxWrapper(filename=filename)
            self.excel.connect()
            self.filename = filename

        # grab a copy of the current hash
        self._excel_file_md5_digest = self._compute_excel_file_md5_digest

        self.log = logging.getLogger('pycel')

        # directed graph for cell dependencies
        self.dep_graph = nx.DiGraph()

        # cell address to Cell mapping, cells and ranges already built
        self.cell_map = {}

        # cells, ranges and graph_edges that need to be built
        self.graph_todos = []
        self.range_todos = []

        self.extra_data = None
        self._formula_cells_dict = {}
        self._plugin_modules = plugins
示例#6
0
class ExcelCompiler:
    """Class responsible for taking an Excel spreadsheet and compiling it
    to an instance that can be serialized to disk, and executed
    independently of excel.
    """

    save_file_extensions = ('pkl', 'pickle', 'yml', 'yaml', 'json')

    def __init__(self, filename=None, excel=None):

        self.eval = None

        if excel:
            # if we are running as an excel addin, this gets passed to us
            self.excel = excel
            self.filename = excel.filename
            self.hash = None
        else:
            # TODO: use a proper interface so we can (eventually) support
            # loading from file (much faster)  Still need to find a good lib.
            self.excel = ExcelOpxWrapper(filename=filename)
            self.excel.connect()
            self.filename = filename

        # grab a copy of the current hash
        self._excel_file_md5_digest = self._compute_excel_file_md5_digest

        self.log = logging.getLogger('pycel')

        # directed graph for cell dependencies
        self.dep_graph = nx.DiGraph()

        # cell address to Cell mapping, cells and ranges already built
        self.cell_map = {}

        # cells, ranges and graph_edges that need to be built
        self.graph_todos = []
        self.range_todos = []

        self.extra_data = None
        self._formula_cells_list = None

    def __getstate__(self):
        # code objects are not serializable
        state = dict(self.__dict__)
        for to_remove in 'eval excel log graph_todos range_todos'.split():
            if to_remove in state:  # pragma: no branch
                state[to_remove] = None
        return state

    def __setstate__(self, d):
        self.__dict__.update(d)
        self.log = logging.getLogger('pycel')

    @staticmethod
    def _compute_file_md5_digest(filename):
        if not os.path.exists(filename):
            return None
        else:
            hash_md5 = hashlib.md5()
            with open(filename, "rb") as f:
                for chunk in iter(lambda: f.read(4096), b""):
                    hash_md5.update(chunk)
            return hash_md5.hexdigest()

    @property
    def _compute_excel_file_md5_digest(self):
        return self._compute_file_md5_digest(self.filename)

    @property
    def hash_matches(self):
        current_hash = self._compute_excel_file_md5_digest
        return self._excel_file_md5_digest == current_hash

    @classmethod
    def _filename_has_extension(cls, filename):
        return next((extension for extension in cls.save_file_extensions
                     if filename.endswith(extension)), None)

    def _to_text(self, filename=None, is_json=False):
        """Serialize to a json/yaml file"""
        extra_data = {} if self.extra_data is None else self.extra_data

        def cell_value(a_cell):
            if a_cell.formula and a_cell.formula.python_code:
                return '=' + a_cell.formula.python_code
            else:
                return a_cell.value

        extra_data.update(
            dict(
                excel_hash=self._excel_file_md5_digest,
                cell_map=dict(
                    sorted(((addr, cell_value(cell))
                            for addr, cell in self.cell_map.items()
                            if cell.serialize),
                           key=lambda x: AddressRange(x[0]).sort_key)),
            ))
        if not filename:
            filename = self.filename + ('.json' if is_json else '.yml')

        # hash the current file to see if this function makes any changes
        existing_hash = (self._compute_file_md5_digest(filename)
                         if os.path.exists(filename) else None)

        if not is_json:
            with open(filename, 'w') as f:
                ymlo = YAML()
                ymlo.width = 120
                ymlo.dump(extra_data, f)
        else:
            with open(filename, 'w') as f:
                json.dump(extra_data, f, indent=4)

        del extra_data['cell_map']

        # hash the newfile, return True if it changed, this is only reliable
        # on pythons which have ordered dict (CPython 3.6 & python 3.7+)
        return (existing_hash is None
                or existing_hash != self._compute_file_md5_digest(filename))

    @classmethod
    def _from_text(cls, filename, is_json=False):
        """deserialize from a json/yaml file"""

        if not is_json:
            if not filename.split('.')[-1].startswith('y'):
                filename += '.yml'
        else:
            if not filename.endswith('.json'):  # pragma: no branch
                filename += '.json'

        with open(filename, 'r') as f:
            data = YAML().load(f)

        excel = _CompiledImporter(filename, data)
        excel_compiler = cls(excel=excel)
        excel.compiler = excel_compiler

        def add_line_numbers(cell_addr, line_number):
            formula = excel_compiler.cell_map[cell_addr].formula
            if formula is not None:
                formula.lineno = line_number
                formula.filename = filename

        # populate the cells
        range_todos = []
        for address, python_code in data['cell_map'].items():
            lineno = data['cell_map'].lc.data[address][0] + 1
            address = AddressRange(address)
            if address.is_range:
                range_todos.append((address, lineno))
            else:
                excel_compiler._make_cells(address)
                add_line_numbers(address.address, lineno)

        # populate the ranges and dependant graph
        for address, lineno in range_todos:
            excel_compiler._make_cells(address)
            add_line_numbers(address.address, lineno)

        excel_compiler._process_gen_graph()
        del data['cell_map']

        # process the rest of the data from the file
        excel_compiler._excel_file_md5_digest = data['excel_hash']
        del data['excel_hash']
        excel_compiler.extra_data = data

        # remove "excel" file references for GC
        excel_compiler.excel = None
        return excel_compiler

    def to_file(self, filename=None, file_types=('pkl', 'yml')):
        """ Save the spreadsheet to a file so it can be loaded later w/o excel

        :param filename: filename to save as, defaults to xlsx_name + file_type
        :param file_types: one or more of: pkl, pickle, yml, yaml, json

        If the filename has one of the expected extensions, then this
        parameter is ignored.

        The text file formats (yaml and json) provide the benefits of:
            1. Can `diff` subsequent version of xlsx to monitor changes.
            2. Can "debug" the generated code.
                Since the compiled code is marked with the line number in
                the text file and will be shown by debuggers and stack traces.
            3. The file size on disk is somewhat smaller than pickle files

        The pickle file format provides the benefits of:
            1. Much faster to load (5x to 10x)
            2. ...  (no #2, speed is the thing)
        """

        filename = filename or self.filename
        extension = self._filename_has_extension(filename)
        if extension:
            file_types = (extension, )
        elif isinstance(file_types, str):
            file_types = (file_types, )

        unknown_types = tuple(ft for ft in file_types
                              if ft not in self.save_file_extensions)
        if unknown_types:
            raise ValueError('Unknown file types: {}'.format(
                ' '.join(unknown_types)))

        pickle_extension = next(
            (ft for ft in file_types if ft.startswith('p')), None)
        non_pickle_extension = next(
            (ft for ft in file_types if not ft.startswith('p')), None)
        extra_extensions = tuple(ft for ft in file_types
                                 if ft not in (pickle_extension,
                                               non_pickle_extension))

        if extra_extensions:
            raise ValueError(
                'Only allowed one pickle extension and one text extension. '
                'Extras: {}'.format(extra_extensions))

        is_json = non_pickle_extension and non_pickle_extension[0] == 'j'

        # round trip through yaml/json to strip out junk
        text_name = filename
        if not text_name.endswith(non_pickle_extension or '.yml'):
            text_name += '.' + (non_pickle_extension or 'yml')
        text_changed = self._to_text(text_name, is_json=is_json)

        # save pickle file if requested and has changed
        if pickle_extension:
            if not filename.endswith(pickle_extension):
                filename += '.' + pickle_extension

            if text_changed or not os.path.exists(filename):
                excel_compiler = self._from_text(text_name, is_json=is_json)
                if non_pickle_extension not in file_types:
                    os.unlink(text_name)

                with open(filename, 'wb') as f:
                    pickle.dump(excel_compiler, f)

    @classmethod
    def from_file(cls, filename):
        """ Load the spreadsheet saved by `to_file`

        :param filename: filename to load from, can be xlsx_name
        """

        extension = cls._filename_has_extension(filename) or next(
            (ext for ext in cls.save_file_extensions
             if os.path.exists(filename + '.' + ext)), None)

        if not extension:
            raise ValueError(
                "Unrecognized file type or compiled file not found"
                ": '{}'".format(filename))

        if not filename.endswith(extension):
            filename += '.' + extension

        if extension[0] == 'p':
            with open(filename, 'rb') as f:
                excel_compiler = pickle.load(f)
        else:
            excel_compiler = cls._from_text(filename,
                                            is_json=extension == 'json')

        return excel_compiler

    def export_to_dot(self, filename=None):
        try:
            # test pydot is importable  (optionally installed)
            import pydot  # noqa: F401
        except ImportError:
            raise ImportError("Package 'pydot' is not installed")

        from networkx.drawing.nx_pydot import write_dot
        filename = filename or (self.filename + '.dot')
        write_dot(self.dep_graph, filename)

    def export_to_gexf(self, filename=None):
        from networkx.readwrite.gexf import write_gexf
        filename = filename or (self.filename + '.gexf')
        write_gexf(self.dep_graph, filename)

    def plot_graph(self, layout_type='spring_layout'):
        try:
            # test matplotlib is importable  (optionally installed)
            import matplotlib.pyplot as plt
        except ImportError:
            raise ImportError("Package 'matplotlib' is not installed")

        pos = getattr(nx, layout_type)(self.dep_graph, iterations=2000)
        nx.draw_networkx_nodes(self.dep_graph, pos)
        nx.draw_networkx_edges(self.dep_graph, pos, arrows=True)
        nx.draw_networkx_labels(self.dep_graph, pos)
        plt.show()

    def set_value(self, address, value):
        """ Set the value of one or more cells or ranges

        :param address: `str`, `AddressRange`, `AddressCell` or a tuple, list
            or an iterable of these three
        :param value: value to set.  This can be a value or a tuple/list
            which matches the shapes needed for the given address/addresses
        """

        if list_like(value):
            value = tuple(flatten(value))
            if list_like(address):
                address = (AddressCell(addr) for addr in flatten(address))
            else:
                address = flatten(AddressRange(address).resolve_range)
            address = tuple(address)
            assert len(address) == len(value)
            for addr, val in zip(address, value):
                self.set_value(addr, val)
            return

        elif address not in self.cell_map:
            address = AddressRange.create(address).address
            assert address in self.cell_map

        cell_or_range = self.cell_map[address]

        if cell_or_range.value != value:  # pragma: no branch
            # need to be able to 'set' an empty cell
            if cell_or_range.value is None:
                cell_or_range.value = value

            # reset the node + its dependencies
            self._reset(cell_or_range)

            # set the value
            cell_or_range.value = value

    def _reset(self, cell):
        if cell.value is None:
            return
        self.log.info("Resetting {}".format(cell.address))
        cell.value = None

        if cell in self.dep_graph:
            for child_cell in self.dep_graph.successors(cell):
                if child_cell.value is not None:
                    self._reset(child_cell)

    def value_tree_str(self, address, indent=0):
        """Generator which returns a formatted dependency graph"""
        cell = self.cell_map[address]
        yield "{}{} = {}".format(" " * indent, address, cell.value)
        for children in sorted(self.dep_graph.predecessors(cell),
                               key=lambda a: a.address.address):
            yield from self.value_tree_str(children.address.address,
                                           indent + 1)

    def recalculate(self):
        """Recalculate all of the known cells"""
        for cell in self.cell_map.values():
            if isinstance(cell, _CellRange) or cell.formula:
                cell.value = None

        for cell in self.cell_map.values():
            if isinstance(cell, _CellRange):
                self._evaluate_range(cell.address.address)
            else:
                self._evaluate(cell.address.address)

    def trim_graph(self, input_addrs, output_addrs):
        """Remove unneeded cells from the graph"""
        input_addrs = tuple(AddressRange(addr).address for addr in input_addrs)
        output_addrs = tuple(AddressRange(addr) for addr in output_addrs)

        # 1) build graph for all needed outputs
        self._gen_graph(output_addrs)

        # 2) walk the dependant tree (from the inputs) and find needed cells
        needed_cells = set()

        def walk_dependents(cell):
            """passed in a _Cell or _CellRange"""
            for child_cell in self.dep_graph.successors(cell):
                child_addr = child_cell.address.address
                if child_addr not in needed_cells:
                    needed_cells.add(child_addr)
                    walk_dependents(child_cell)

        try:
            for addr in input_addrs:
                if addr in self.cell_map:
                    walk_dependents(self.cell_map[addr])
                else:
                    self.log.warning(
                        'Address {} not found in cell_map'.format(addr))
        except nx.exception.NetworkXError as exc:
            if AddressRange(addr) not in output_addrs:
                raise ValueError('{}: which usually means no outputs '
                                 'are dependant on it.'.format(exc))

        # even unconnected output addresses are needed
        for addr in output_addrs:
            needed_cells.add(addr.address)

        # 3) walk the precedent tree (from the output) and trim unneeded cells
        processed_cells = set()

        def walk_precedents(cell):
            for child_address in (a.address for a in cell.needed_addresses):
                if child_address not in processed_cells:  # pragma: no branch
                    processed_cells.add(child_address)
                    child_cell = self.cell_map[child_address]
                    if child_address in needed_cells or ':' in child_address:
                        walk_precedents(child_cell)
                    else:
                        # trim this cell, now we will need only its value
                        needed_cells.add(child_address)
                        child_cell.formula = None
                        self.log.debug('Trimming {}'.format(child_address))

        for addr in output_addrs:
            walk_precedents(self.cell_map[addr.address])

        # 4) check for any buried (not leaf node) inputs
        for addr in input_addrs:
            cell = self.cell_map.get(addr)
            if cell and getattr(cell, 'formula', None):
                self.log.info("{} is not a leaf node".format(addr))

        # 5) remove unneeded cells
        cells_to_remove = tuple(addr for addr in self.cell_map
                                if addr not in needed_cells)
        for addr in cells_to_remove:
            del self.cell_map[addr]

    def validate_calcs(self, output_addrs=None):
        """For each address, calc the value, and verify that it matches

        This is a debugging tool which will show which cells evaluate
        differently than they do for excel.

        :param output_addrs: The cells to evaluate from (defaults to all)
        :return: dict of addresses with good/bad values that failed to verify
        """
        def close_enough(val1, val2):
            import pytest
            if isinstance(val1, (int, float)) and \
                    isinstance(val2, (int, float)):
                return val2 == pytest.approx(val1)
            else:
                return val1 == val2

        Mismatch = collections.namedtuple('Mismatch',
                                          'original calced formula')

        if output_addrs is None:
            to_verify = self._formula_cells
        elif list_like(output_addrs):
            to_verify = [AddressCell(addr) for addr in flatten(output_addrs)]
        else:
            to_verify = [AddressCell(output_addrs)]

        verified = set()
        failed = {}
        while to_verify:
            addr = to_verify.pop()
            try:
                self._gen_graph(addr)
                cell = self.cell_map[addr.address]
                if isinstance(cell, _Cell) and cell.python_code:
                    original_value = cell.value
                    if original_value == str(cell.formula):
                        self.log.debug("No Orig data?: {}: {}".format(
                            addr, cell.value))
                        continue

                    cell.value = None
                    self._evaluate(addr.address)

                    # pragma: no branch
                    if not close_enough(original_value, cell.value):
                        failed.setdefault('mismatch',
                                          {})[str(addr)] = Mismatch(
                                              original_value, cell.value,
                                              cell.formula.base_formula)
                        print('{} mismatch  {} -> {}  {}'.format(
                            addr, original_value, cell.value,
                            cell.formula.base_formula))

                        # do it again to allow easy breakpointing
                        cell.value = None
                        self._evaluate(cell.address.address)

                verified.add(addr)
                for addr in cell.needed_addresses:
                    if addr not in verified:  # pragma: no branch
                        to_verify.append(addr)
            except Exception as exc:
                cell = self.cell_map.get(addr.address, None)
                formula = cell and cell.formula.base_formula
                exc_str = str(exc)
                exc_str_split = exc_str.split('\n')

                if 'has not been implemented' in exc_str:
                    exc_str_key = exc_str.split('has not been implemented')[0]
                    exc_str_key = exc_str_key.strip().rsplit(' ', 1)[1].upper()
                    not_implemented = True

                else:
                    if len(exc_str_split) == 1:
                        exc_str_key = '{}: {}'.format(
                            type(exc).__name__, exc_str)
                    else:
                        exc_str_key = exc_str_split[-2]  # pragma: no cover
                    not_implemented = exc_str_key.startswith(
                        'NotImplementedError: ')

                if not_implemented:
                    failed.setdefault('not-implemented',
                                      {}).setdefault(exc_str_key, []).append(
                                          (str(addr), formula, exc_str))
                else:
                    failed.setdefault('exceptions',
                                      {}).setdefault(exc_str_key, []).append(
                                          (str(addr), formula, exc_str))

        return failed

    @property
    def _formula_cells(self):
        """Iterate all cells and find cells with formulas"""

        if self._formula_cells_list is None:
            self._formula_cells_list = [
                AddressCell.create(cell.coordinate, ws.title)
                for ws in self.excel.workbook for row in ws.iter_rows()
                for cell in row
                if isinstance(getattr(cell, 'value', None), str)
                and cell.value.startswith('=')
            ]
        return self._formula_cells_list

    def _make_cells(self, address):
        """Given an AddressRange or AddressCell generate compiler Cells"""

        # from here don't build cells that are already in the cell_map
        assert address.address not in self.cell_map

        def add_node_to_graph(node):
            self.dep_graph.add_node(node)
            self.dep_graph.node[node]['sheet'] = node.sheet
            self.dep_graph.node[node]['label'] = node.address.coordinate

            # stick in queue to add edges
            self.graph_todos.append(node)

        def build_cell(excel_cell):
            a_cell = _Cell(excel_cell.address,
                           value=excel_cell.values,
                           formula=excel_cell.formulas,
                           excel=self.excel)
            self.cell_map[str(excel_cell.address)] = a_cell
            return a_cell

        def build_range(excel_range):
            a_range = _CellRange(excel_range)
            excel_cells = zip(a_range, flatten(excel_range.formulas),
                              flatten(excel_range.values))

            added_nodes = [a_range]
            for cell_address, f, value in excel_cells:
                assert isinstance(cell_address, AddressCell)
                if str(cell_address) not in self.cell_map:
                    if (f, value) != ('', None):
                        a_cell = _Cell(cell_address,
                                       value=value,
                                       formula=f,
                                       excel=self.excel)
                        self.cell_map[str(cell_address)] = a_cell
                        added_nodes.append(a_cell)

            self.cell_map[str(excel_range.address)] = a_range
            return added_nodes

        excel_data = self.excel.get_range(address)
        if address.is_range:
            if excel_data.address != address:
                # if the actual data returned is not the same as the address
                # given, then use a reference
                self.cell_map[str(address)] = _Cell(address,
                                                    formula=REF_FORMAT.format(
                                                        excel_data.address))

            self.range_todos.append(str(excel_data.address))
            new_nodes = build_range(excel_data)
        else:
            new_nodes = [build_cell(excel_data)]

        for node in new_nodes:
            if isinstance(node, _CellRange) or node.formula:
                # nodes to analyze: only ranges and formulas have precedents
                add_node_to_graph(node)

    def _evaluate_range(self, address):
        """Evaluate a range"""
        cell_range = self.cell_map.get(address)
        if cell_range is None:
            # we don't save the _CellRange values in the text format files
            assert '!' in address, "{} missing sheetname".format(address)
            self._gen_graph(address)
            cell_range = self.cell_map[address]

        if cell_range.value is None:
            self.log.debug("Evaluating: {}".format(cell_range.address))
            addresses = cell_range.addresses

            data = tuple(
                tuple(self._evaluate(addr.address) for addr in row)
                for row in addresses)

            cell_range.value = data

        return cell_range.value

    def _evaluate(self, address):
        """Evaluate a single cell"""
        cell = self.cell_map[address]

        # calculate the cell value for formulas and ranges
        if cell.value is None:
            if isinstance(cell, _CellRange):
                self._evaluate_range(cell.address.address)

            elif cell.python_code:
                self.log.debug("Evaluating: {}, {}".format(
                    cell.address, cell.python_code))
                if self.eval is None:
                    self.eval = ExcelFormula.build_eval_context(
                        self._evaluate, self._evaluate_range, self.log)
                value = self.eval(cell.formula)
                self.log.info("Cell %s evaluated to '%s' (%s)" %
                              (cell.address, value, type(value).__name__))
                cell.value = VALUE_ERROR if list_like(value) else value

        if isinstance(cell.value, AddressRange):
            # If the cell returns a reference, then dereference
            return self._evaluate(str(cell.value))

        return cell.value

    def evaluate(self, address):
        """ evaluate a cell or cells in the spreadsheet

        :param address: str, AddressRange, AddressCell or a tuple or list
            or iterable of these three
        :return: evaluated value/values
        """

        if str(address) not in self.cell_map:
            if list_like(address):
                if not isinstance(address, (tuple, list)):
                    address = tuple(address)

                # process a tuple or list of addresses
                return type(address)(self.evaluate(c) for c in address)

            address = AddressRange.create(address)

            # get the sheet if not specified
            if not address.has_sheet:
                address = AddressRange(
                    address, sheet=self.excel.get_active_sheet_name())

            if address.address not in self.cell_map:
                self._gen_graph(address.address)

        return self._evaluate(str(address))

    def _gen_graph(self, seed, recursed=False):
        """Given a starting point (e.g., A6, or A3:B7) on a particular sheet,
        generate a Spreadsheet instance that captures the logic and control
        flow of the equations.
        """
        if not isinstance(seed, (AddressRange, AddressCell)):
            if isinstance(seed, str):
                seed = AddressRange(seed)
            elif isinstance(seed, collections.Iterable):
                for s in seed:
                    self._gen_graph(s, recursed=True)
                self._process_gen_graph()
                return
            else:
                raise ValueError('Unknown seed: {}'.format(seed))

        # get/set the current sheet
        if not seed.has_sheet:
            seed = AddressRange(seed, sheet=self.excel.get_active_sheet_name())

        if seed.address in self.cell_map:
            # already did this cell/range
            return

        # process the seed
        self._make_cells(seed)

        if not recursed:
            # if not entered to process one cell / cellrange process other work
            self._process_gen_graph()

    def _process_gen_graph(self):

        while self.graph_todos:
            # connect the dependant cells in the graph
            dependant = self.graph_todos.pop()

            self.log.debug("Handling {}".format(dependant.address))

            for precedent_address in dependant.needed_addresses:
                if precedent_address.address not in self.cell_map:
                    self._gen_graph(precedent_address, recursed=True)

                self.dep_graph.add_edge(
                    self.cell_map[precedent_address.address], dependant)

        # calc the values for ranges
        for range_todo in reversed(self.range_todos):
            self._evaluate_range(range_todo)
        self.range_todos = []

        self.log.info("Graph construction done, %s nodes, "
                      "%s edges, %s self.cell_map entries" %
                      (len(self.dep_graph.nodes()), len(
                          self.dep_graph.edges()), len(self.cell_map)))
示例#7
0
    def __init__(self, filename=None, excel=None, plugins=None, cycles=None):
        """ Build a compiler instance to organize the formula for a workbook

        :param filename: Excel filename to load from (xlsx or `to_file`)
        :param excel: Opened instance of ExcelWrapper or openpyxl workbook
        :param plugins: module paths for plugin lib functions
        :param cycles: Override workbook iterative calculation settings
        """

        self._eval = None

        if excel:
            # if we are running as an excel addin, this gets passed to us
            if not isinstance(excel, (ExcelOpxWrapper, _CompiledImporter)):
                excel = ExcelOpxWrapperNoData(excel)
            self.excel = excel
            self.filename = excel.filename
            self.hash = None
        else:
            self.excel = ExcelOpxWrapper(filename=filename)
            self.excel.load()
            self.filename = filename

        # grab a copy of the current hash
        self._excel_file_md5_digest = self._compute_excel_file_md5_digest

        self.log = pycel_logger

        # directed graph for cell dependencies
        self.dep_graph = nx.DiGraph()

        # cell address to Cell mapping, cells and ranges already built
        self.cell_map = {}

        # cells, ranges and graph_edges that need to be built
        self.graph_todos = []
        self.range_todos = []

        self.extra_data = None
        self.conditional_formats = {}
        self._formula_cells_dict = {}
        self._plugin_modules = plugins

        # Setup to be able to evaluate circular references
        self.cycles = cycles
        if hasattr(self.excel, 'workbook'):
            if self.excel.workbook.calculation is None:
                wb_cycles = False
            else:
                wb_cycles = bool(self.excel.workbook.calculation.iterate)
            if self.cycles is None:
                self.cycles = wb_cycles
            elif wb_cycles != bool(self.cycles):
                self.log.warning(
                    f"Initialized with cycles: {self.cycles}, while workbook says: {wb_cycles}"
                )

            if self.cycles:
                self.cycles = dict(
                    iterations=self.excel.workbook.calculation.iterateCount,
                    tolerance=self.excel.workbook.calculation.iterateDelta,
                )

        elif self.cycles:
            assert isinstance(self.cycles, dict)
            assert self.cycles.keys() == {'iterations', 'tolerance'}

        self.Cell = _CycleCell if self.cycles else _Cell
        self.evaluate = (self._evaluate_iterative
                         if self.cycles else self._evaluate_non_iterative)
示例#8
0
class ExcelCompiler:
    """Class responsible for taking an Excel spreadsheet and compiling it
    to an instance that can be serialized to disk, and executed
    independently of excel.
    """

    save_file_extensions = ('pkl', 'pickle', 'yml', 'yaml', 'json')

    def __init__(self, filename=None, excel=None, plugins=None, cycles=None):
        """ Build a compiler instance to organize the formula for a workbook

        :param filename: Excel filename to load from (xlsx or `to_file`)
        :param excel: Opened instance of ExcelWrapper or openpyxl workbook
        :param plugins: module paths for plugin lib functions
        :param cycles: Override workbook iterative calculation settings
        """

        self._eval = None

        if excel:
            # if we are running as an excel addin, this gets passed to us
            if not isinstance(excel, (ExcelOpxWrapper, _CompiledImporter)):
                excel = ExcelOpxWrapperNoData(excel)
            self.excel = excel
            self.filename = excel.filename
            self.hash = None
        else:
            self.excel = ExcelOpxWrapper(filename=filename)
            self.excel.load()
            self.filename = filename

        # grab a copy of the current hash
        self._excel_file_md5_digest = self._compute_excel_file_md5_digest

        self.log = pycel_logger

        # directed graph for cell dependencies
        self.dep_graph = nx.DiGraph()

        # cell address to Cell mapping, cells and ranges already built
        self.cell_map = {}

        # cells, ranges and graph_edges that need to be built
        self.graph_todos = []
        self.range_todos = []

        self.extra_data = None
        self.conditional_formats = {}
        self._formula_cells_dict = {}
        self._plugin_modules = plugins

        # Setup to be able to evaluate circular references
        self.cycles = cycles
        if hasattr(self.excel, 'workbook'):
            if self.excel.workbook.calculation is None:
                wb_cycles = False
            else:
                wb_cycles = bool(self.excel.workbook.calculation.iterate)
            if self.cycles is None:
                self.cycles = wb_cycles
            elif wb_cycles != bool(self.cycles):
                self.log.warning(
                    f"Initialized with cycles: {self.cycles}, while workbook says: {wb_cycles}"
                )

            if self.cycles:
                self.cycles = dict(
                    iterations=self.excel.workbook.calculation.iterateCount,
                    tolerance=self.excel.workbook.calculation.iterateDelta,
                )

        elif self.cycles:
            assert isinstance(self.cycles, dict)
            assert self.cycles.keys() == {'iterations', 'tolerance'}

        self.Cell = _CycleCell if self.cycles else _Cell
        self.evaluate = (self._evaluate_iterative
                         if self.cycles else self._evaluate_non_iterative)

    def __getstate__(self):
        # code objects are not serializable
        state = dict(self.__dict__)
        to_removes = '_eval excel log graph_todos range_todos ' \
                     'conditional_formats'.split()
        for to_remove in to_removes:
            if to_remove in state:  # pragma: no branch
                state[to_remove] = None
        return state

    def __setstate__(self, d):
        self.__dict__.update(d)
        self.log = pycel_logger

    @staticmethod
    def _compute_file_md5_digest(filename):
        if not os.path.exists(filename):
            return None
        else:
            hash_md5 = hashlib.md5()
            with open(filename, "rb") as f:
                for chunk in iter(lambda: f.read(4096), b""):
                    hash_md5.update(chunk)
            return hash_md5.hexdigest()

    @property
    def _compute_excel_file_md5_digest(self):
        return self._compute_file_md5_digest(self.filename)

    @property
    def hash_matches(self):
        current_hash = self._compute_excel_file_md5_digest
        return self._excel_file_md5_digest == current_hash

    @property
    def eval(self):
        if self._eval is None:
            eval_ctx = ExcelFormula.build_eval_context(
                self._evaluate,
                self._evaluate_range,
                self.log,
                plugins=self._plugin_modules)

            if self.cycles:

                def _eval(cell, cse_array_address=None):
                    cell.start_calcs()
                    return eval_ctx(cell.formula,
                                    cse_array_address=cse_array_address)

            else:

                def _eval(cell, cse_array_address=None):
                    return eval_ctx(cell.formula,
                                    cse_array_address=cse_array_address)

            self._eval = _eval

        return self._eval

    @classmethod
    def _filename_has_extension(cls, filename):
        return next((extension for extension in cls.save_file_extensions
                     if filename.endswith(extension)), None)

    def _to_text(self, filename=None, is_json=False):
        """Serialize to a json/yaml file"""
        extra_data = {} if self.extra_data is None else self.extra_data

        def cell_value(a_cell):
            if a_cell.formula and a_cell.formula.python_code:
                return '=' + a_cell.formula.python_code
            elif isinstance(a_cell.value, np.float64):
                return float(a_cell.value)
            else:
                return a_cell.value

        extra_data.update(
            dict(
                cycles=self.cycles,
                excel_hash=self._excel_file_md5_digest,
                cell_map=dict(
                    sorted(((addr, cell_value(cell))
                            for addr, cell in self.cell_map.items()
                            if cell.serialize),
                           key=lambda x: AddressRange(x[0]).sort_key)),
                # serialize the workbook filename (not the serialization path)
                filename=self.filename,
            ))
        if not filename:
            filename = self.filename + ('.json' if is_json else '.yml')

        # hash the current file to see if this function makes any changes
        existing_hash = (self._compute_file_md5_digest(filename)
                         if os.path.exists(filename) else None)

        if not is_json:
            with open(filename, 'w') as f:
                ymlo = YAML()
                ymlo.width = 120
                ymlo.dump(extra_data, f)
        else:
            with open(filename, 'w') as f:
                json.dump(extra_data, f, indent=4)

        del extra_data['cell_map']

        # hash the newfile, return True if it changed, this is only reliable
        # on pythons which have ordered dict (CPython 3.6 & python 3.7+)
        return (existing_hash is None
                or existing_hash != self._compute_file_md5_digest(filename))

    @classmethod
    def _from_text(cls, filename, is_json=False):
        """deserialize from a json/yaml file"""

        if not is_json:
            if not filename.split('.')[-1].startswith('y'):
                filename += '.yml'
        else:
            if not filename.endswith('.json'):  # pragma: no branch
                filename += '.json'

        with open(filename, 'r') as f:
            data = YAML().load(f)

        excel = _CompiledImporter(filename, data)
        excel_compiler = cls(excel=excel, cycles=data.pop('cycles', False))
        excel.compiler = excel_compiler

        def add_line_numbers(cell_addr, line_number):
            formula = excel_compiler.cell_map[cell_addr].formula
            if formula is not None:
                formula.lineno = line_number
                formula.filename = filename

        # populate the cells
        range_todos = []
        for address, python_code in data['cell_map'].items():
            lineno = data['cell_map'].lc.data[address][0] + 1
            address = AddressRange(address)
            if address.is_range:
                range_todos.append((address, lineno))
            else:
                excel_compiler._make_cells(address)
                add_line_numbers(address.address, lineno)

        # populate the ranges and dependant graph
        for address, lineno in range_todos:
            excel_compiler._make_cells(address)
            add_line_numbers(address.address, lineno)

        excel_compiler._process_gen_graph()
        del data['cell_map']

        # process the rest of the data from the file
        excel_compiler._excel_file_md5_digest = data['excel_hash']
        del data['excel_hash']
        excel_compiler.extra_data = data

        # remove "excel" file references for GC
        excel_compiler.excel = None
        return excel_compiler

    def to_file(self, filename=None, file_types=('pkl', 'yml')):
        """ Save the spreadsheet to a file so it can be loaded later w/o excel

        :param filename: filename to save as, defaults to xlsx_name + file_type
        :param file_types: one or more of: pkl, pickle, yml, yaml, json

        If the filename has one of the expected extensions, then this
        parameter is ignored.

        The text file formats (yaml and json) provide the benefits of:
            1. Can `diff` subsequent version of xlsx to monitor changes.
            2. Can "debug" the generated code.
                Since the compiled code is marked with the line number in
                the text file and will be shown by debuggers and stack traces.
            3. The file size on disk is somewhat smaller than pickle files

        The pickle file format provides the benefits of:
            1. Much faster to load (5x to 10x)
            2. ...  (no #2, speed is the thing)
        """

        filename = filename or self.filename
        extension = self._filename_has_extension(filename)
        if extension:
            file_types = (extension, )
        elif isinstance(file_types, str):
            file_types = (file_types, )

        unknown_types = tuple(ft for ft in file_types
                              if ft not in self.save_file_extensions)
        if unknown_types:
            raise ValueError(f"Unknown file types: {' '.join(unknown_types)}")

        pickle_extension = next(
            (ft for ft in file_types if ft.startswith('p')), None)
        non_pickle_extension = next(
            (ft for ft in file_types if not ft.startswith('p')), None)
        extra_extensions = tuple(ft for ft in file_types
                                 if ft not in (pickle_extension,
                                               non_pickle_extension))

        if extra_extensions:
            raise ValueError(
                'Only allowed one pickle extension and one text extension. '
                f'Extras: {extra_extensions}')

        is_json = non_pickle_extension and non_pickle_extension[0] == 'j'

        # round trip through yaml/json to strip out junk
        text_name = filename
        if not text_name.endswith(non_pickle_extension or '.yml'):
            text_name += '.' + (non_pickle_extension or 'yml')
        text_changed = self._to_text(text_name, is_json=is_json)

        # save pickle file if requested and has changed
        if pickle_extension:
            if not filename.endswith(pickle_extension):
                filename += '.' + pickle_extension

            if text_changed or not os.path.exists(filename):
                excel_compiler = self._from_text(text_name, is_json=is_json)
                if non_pickle_extension not in file_types:
                    os.unlink(text_name)

                with open(filename, 'wb') as f:
                    pickle.dump(excel_compiler, f)

    @classmethod
    def from_file(cls, filename, plugins=None):
        """ Load the spreadsheet saved by `to_file`

        :param filename: filename to load from, can be xlsx_name
        :param plugins: module paths for plugin lib functions
        """

        extension = cls._filename_has_extension(filename) or next(
            (ext for ext in cls.save_file_extensions
             if os.path.exists(filename + '.' + ext)), None)

        if not extension:
            raise ValueError(
                f"Unrecognized file type or compiled file not found: '{filename}'"
            )

        if not filename.endswith(extension):
            filename += '.' + extension

        if extension[0] == 'p':
            with open(filename, 'rb') as f:
                excel_compiler = pickle.load(f)
        else:
            excel_compiler = cls._from_text(filename,
                                            is_json=extension == 'json')

        excel_compiler.excel = _CompiledImporter(
            '', {
                'filename': excel_compiler.filename,
                'cell_map': excel_compiler.cell_map,
            })
        excel_compiler.range_todos = []
        excel_compiler.graph_todos = []

        excel_compiler._plugin_modules = plugins
        return excel_compiler

    def export_to_dot(self, filename=None):
        try:
            # test pydot is importable  (optionally installed)
            import pydot  # noqa: F401
        except ImportError:
            raise ImportError("Package 'pydot' is not installed")

        from networkx.drawing.nx_pydot import write_dot
        filename = filename or (self.filename + '.dot')
        write_dot(self.dep_graph, filename)

    def export_to_gexf(self, filename=None):
        from networkx.readwrite.gexf import write_gexf
        filename = filename or (self.filename + '.gexf')
        write_gexf(self.dep_graph, filename)

    def plot_graph(self, layout_type='spring_layout'):
        try:
            # test matplotlib is importable  (optionally installed)
            import matplotlib.pyplot as plt
        except ImportError:
            raise ImportError("Package 'matplotlib' is not installed")

        pos = getattr(nx, layout_type)(self.dep_graph, iterations=2000)
        nx.draw_networkx_nodes(self.dep_graph, pos)
        nx.draw_networkx_edges(self.dep_graph, pos, arrows=True)
        nx.draw_networkx_labels(self.dep_graph, pos)
        plt.show()

    def set_value(self, address, value, set_as_range=False):
        """ Set the value of one or more cells or ranges

        :param address: `str`, `AddressRange`, `AddressCell` or a tuple, list
            or an iterable of these three
        :param value: value to set.  This can be a value or a tuple/list
            which matches the shapes needed for the given address/addresses
        :param set_as_range: With a single range address and a list like value,
            set to true to set the entire rnage to the inserted list.
        """

        if list_like(value) and not set_as_range:
            value = tuple(flatten(value))
            if list_like(address):
                address = (AddressCell(addr) for addr in flatten(address))
            else:
                address = flatten(AddressRange(address).resolve_range)
            address = tuple(address)
            assert len(address) == len(value)
            for addr, val in zip(address, value):
                self.set_value(addr, val)
            return

        elif address not in self.cell_map:
            address = AddressRange.create(address).address
            assert address in self.cell_map, (
                f'Address "{address}" not found in the cell map. Evaluate the '
                'address, or an address that references it, to place it in the cell map.'
            )

        if set_as_range and list_like(value) and not (value
                                                      and list_like(value[0])):
            value = (value, )

        cell_or_range = self.cell_map[address]

        if cell_or_range.value != value:  # pragma: no branch
            # need to be able to 'set' an empty cell, set to not None
            cell_or_range.value = value

            # reset the node + its dependencies
            if not self.cycles:
                self._reset(cell_or_range)

            # set the value
            cell_or_range.value = value

    def _reset(self, cell):
        if cell.needs_calc:
            return
        self.log.info(f"Resetting {cell.address}")
        cell.value = None

        if cell in self.dep_graph:
            for child_cell in self.dep_graph.successors(cell):
                if child_cell.value is not None:
                    self._reset(child_cell)

    def value_tree_str(self, address, indent=0):
        iterative_eval_tracker.inc_iteration_number()
        yield from self._value_tree_str(address)

    def _value_tree_str(self, address, indent=0):
        """Generator which returns a formatted dependency graph"""
        cell = self.cell_map[address]

        if iterative_eval_tracker.is_calced(address):
            yield f'{" " * indent}{address} <- cycle'
        else:
            iterative_eval_tracker.calced(address)
            yield f'{" " * indent}{address} = {cell.value}'
            for children in sorted(self.dep_graph.predecessors(cell),
                                   key=lambda a: a.address.address):
                yield from self._value_tree_str(children.address.address,
                                                indent + 1)

    def recalculate(self):
        """Recalculate all of the known cells"""
        for cell in self.cell_map.values():
            if isinstance(cell, _CellRange) or cell.formula:
                cell.value = None

        for cell in self.cell_map.values():
            self.evaluate(cell.address.address)

    def trim_graph(self, input_addrs, output_addrs):
        """Remove unneeded cells from the graph"""
        input_addrs = tuple(AddressRange(addr).address for addr in input_addrs)
        output_addrs = tuple(AddressRange(addr) for addr in output_addrs)

        # 1) build graph for all needed outputs
        self._gen_graph(output_addrs)

        # 2) walk the dependant tree (from the inputs) and find needed cells
        needed_cells = set()

        def walk_dependents(cell):
            """passed in a _Cell or _CellRange"""
            for child_cell in self.dep_graph.successors(cell):
                child_addr = child_cell.address.address
                if child_addr not in needed_cells:
                    needed_cells.add(child_addr)
                    walk_dependents(child_cell)

        missing_dependants = set()
        for addr in input_addrs:
            try:
                if addr in self.cell_map:
                    walk_dependents(self.cell_map[addr])
                    msg = ''
                else:
                    msg = 'warning', f'Address {addr} not found in cell_map'
            except nx.exception.NetworkXError as exc:
                if AddressRange(addr) not in output_addrs:
                    msg = 'error', f'{exc}: which usually means no outputs are dependant on it.'
                else:
                    msg = 'warning', str(exc)
            if msg:
                missing_dependants.add((addr, *msg))
        if missing_dependants:
            for addr, level, msg in missing_dependants:
                getattr(self.log, level)(f"Input address {addr}: {msg}")

        if any(m[1] != 'warning' for m in missing_dependants):
            raise ValueError('\n' + '\n'.join(
                map(str, sorted(missing_dependants, key=lambda x: x[2]))))

        # even unconnected output addresses are needed
        for addr in output_addrs:
            needed_cells.add(addr.address)

        # 3) walk the precedent tree (from the output) and trim unneeded cells
        processed_cells = set()

        def walk_precedents(cell):
            for child_address in (a.address for a in cell.needed_addresses):
                if child_address not in processed_cells:  # pragma: no branch
                    processed_cells.add(child_address)
                    child_cell = self.cell_map[child_address]
                    if child_address in needed_cells or ':' in child_address:
                        walk_precedents(child_cell)
                    else:
                        # trim this cell, now we will need only its value
                        needed_cells.add(child_address)
                        child_cell.formula = None
                        self.log.debug(f'Trimming {child_address}')

        for addr in output_addrs:
            walk_precedents(self.cell_map[addr.address])

        # 4) check for any buried (not leaf node) inputs
        for addr in input_addrs:
            cell = self.cell_map.get(addr)
            if cell and getattr(cell, 'formula', None):
                self.log.info(f"{addr} is not a leaf node")

        # 5) remove unneeded cells
        cells_to_remove = tuple(addr for addr in self.cell_map
                                if addr not in needed_cells)
        for addr in cells_to_remove:
            del self.cell_map[addr]

    def validate_serialized(self, **kwargs):
        assert self.excel, "validate_serialized() needs to be run on the compiler"
        failed = self.validate_calcs(**kwargs)
        if failed != {}:
            return failed

        self.to_file(file_types='json')
        compiled = ExcelCompiler.from_file(self.filename + '.json')
        compiled.recalculate()
        for addr in self.cell_map:
            if not self.cell_map[addr].close_enough(
                    compiled.cell_map[addr].value,
                    tol=kwargs.get('tolerance')):
                failed[addr] = Mismatch(self.cell_map[addr].value,
                                        compiled.cell_map[addr].value,
                                        str(self.cell_map[addr].formula))
                print('{} mismatch  {} -> {}  {}'.format(
                    addr, self.cell_map[addr].value, self.cell_map[addr].value,
                    str(self.cell_map[addr].formula)))
        return failed

    def validate_calcs(self,
                       output_addrs=None,
                       sheet=None,
                       verify_tree=True,
                       tolerance=None,
                       raise_exceptions=False):
        """For each address, calc the value, and verify that it matches

        This is a debugging tool which will show which cells evaluate
        differently than they do for excel.

        :param output_addrs: The cells to evaluate from (defaults to all)
        :param sheet: The sheet to evaluate from (defaults to all)
        :param verify_tree: Follow the tree to any precedent nodes
        :return: dict of addresses with good/bad values that failed to verify
        """
        if output_addrs is None:
            to_verify = list(self.formula_cells(sheet))
            print(f'Found {len(to_verify)} formulas to evaluate')
        elif list_like(output_addrs):
            to_verify = [AddressCell(addr) for addr in flatten(output_addrs)]
        else:
            to_verify = [AddressCell(output_addrs)]

        verified = set()
        failed = {}
        if self.cycles:
            iterative_eval_tracker(**self.cycles)
        while to_verify:
            addr = to_verify.pop()
            if len(to_verify) % 100 == 0:
                print(f"{len(to_verify)} formulas left to process")
            try:
                self._gen_graph(addr)
                cell = self.cell_map[addr.address]
                if isinstance(cell, _Cell) and cell.python_code and (
                        not cell.address.is_unbounded_range):
                    original_value = cell.value
                    if original_value == str(cell.formula):
                        self.log.debug(f"No Orig data?: {addr}: {cell.value}")
                        continue

                    cell.value = None
                    self.evaluate(addr.address)

                    if not (original_value is None or cell.close_enough(
                            original_value, tol=tolerance)):
                        failed.setdefault('mismatch',
                                          {})[str(addr)] = Mismatch(
                                              original_value, cell.value,
                                              cell.formula.base_formula)
                        print('{} mismatch  {} -> {}  {}'.format(
                            addr, original_value, cell.value,
                            cell.formula.base_formula))

                        # do it again to allow easy break-pointing
                        cell.value = None
                        self.evaluate(cell.address.address)

                verified.add(addr)
                if verify_tree:  # pragma: no branch
                    for addr in cell.needed_addresses:
                        if addr not in verified:  # pragma: no branch
                            to_verify.append(addr)
            except Exception as exc:
                if raise_exceptions:
                    raise
                cell = self.cell_map.get(addr.address, None)
                formula = cell and cell.formula.base_formula
                exc_str = str(exc)
                exc_str_split = exc_str.split('\n')

                if 'is not implemented' in exc_str:
                    exc_str_key = exc_str.split('is not implemented')[0]
                    exc_str_key = exc_str_key.strip().rsplit(' ', 1)[1].upper()
                    not_implemented = True

                else:
                    if len(exc_str_split) == 1:
                        exc_str_key = f'{type(exc).__name__}: {exc_str}'
                    else:
                        exc_str_key = exc_str_split[-2]  # pragma: no cover
                    not_implemented = exc_str_key.startswith(
                        'NotImplementedError: ')

                if not_implemented:
                    failed.setdefault('not-implemented',
                                      {}).setdefault(exc_str_key, []).append(
                                          (str(addr), formula, exc_str))
                else:
                    failed.setdefault('exceptions',
                                      {}).setdefault(exc_str_key, []).append(
                                          (str(addr), formula, exc_str))

        return failed

    def formula_cells(self, sheet=None):
        """Iterate all cells and find cells with formulas"""
        if sheet is None:
            return list(
                it.chain.from_iterable(
                    self.formula_cells(sheet.title)
                    for sheet in self.excel.workbook))

        if sheet not in self._formula_cells_dict:
            if sheet in self.excel.workbook:
                self._formula_cells_dict[sheet] = tuple(
                    AddressCell.create(cell.coordinate, sheet)
                    for row in self.excel.workbook[sheet].iter_rows()
                    for cell in row
                    if isinstance(getattr(cell, 'value', None), str)
                    and cell.value.startswith('='))
            else:
                self._formula_cells_dict[sheet] = tuple()

        return self._formula_cells_dict[sheet]

    def _make_cells(self, address):
        """Given an AddressRange or AddressCell generate compiler Cells"""

        # from here don't build cells that are already in the cell_map
        assert address.address not in self.cell_map

        def add_node_to_graph(node):
            self.dep_graph.add_node(node)
            self.dep_graph.nodes[node]['sheet'] = node.sheet
            self.dep_graph.nodes[node]['label'] = node.address.coordinate

            # stick in queue to add edges
            self.graph_todos.append(node)

        def build_cell(excel_cell):
            a_cell = self.Cell(excel_cell.address,
                               value=excel_cell.values,
                               formula=excel_cell.formula,
                               excel=self.excel)
            self.cell_map[str(excel_cell.address)] = a_cell
            return [a_cell]

        def build_range(excel_range):
            a_range = _CellRange(excel_range, excel=self.excel)
            self.cell_map[str(excel_range.address)] = a_range

            added = [a_range]
            if isinstance(excel_range.formula, tuple):
                for addr, value, formula in a_range.cells_to_build(
                        excel_range):
                    if addr.address not in self.cell_map:
                        a_cell = self.Cell(addr, value, formula, self.excel)
                        self.cell_map[addr.address] = a_cell
                        added.append(a_cell)
            else:
                for addr in a_range.needed_addresses:
                    if addr.address not in self.cell_map:
                        self._make_cells(addr)
            return added

        self.log.debug(f'_make_cells: {address}')
        excel_data = self.excel.get_range(address)
        if address.is_range:
            if excel_data.address != address:
                # if the actual data returned is not the same as the address
                # given, then use a reference
                self.cell_map[str(address)] = self.Cell(
                    address,
                    formula=REF_FORMAT.format(excel_data.address),
                    excel=self.excel)

            self.range_todos.append(str(excel_data.address))
            new_nodes = build_range(excel_data)
        else:
            new_nodes = build_cell(excel_data)

        for new_node in new_nodes:
            if isinstance(new_node, _CellRange) or new_node.formula:
                # nodes to analyze: only ranges and formulas have precedents
                add_node_to_graph(new_node)

    def _evaluate_range(self, address):
        """Evaluate a range"""
        if address in ERROR_CODES:
            return address

        cell_range = self.cell_map.get(address)
        if cell_range is None:
            # we don't save the _CellRange values in the text format files
            assert '!' in address, f"{address} missing sheetname"
            self._gen_graph(address)
            cell_range = self.cell_map[address]

        if cell_range.needs_calc:
            self.log.debug(
                f"Evaluating: {cell_range.address}, {cell_range.python_code}")
            if cell_range.address.is_unbounded_range:
                bounded_addr = str(self.eval(cell_range))
                bounded_addr_cell = self.cell_map.get(bounded_addr)
                if bounded_addr_cell.value is None:
                    self._evaluate_range(bounded_addr)
                data = bounded_addr_cell.value

            elif cell_range.formula is None:
                data = tuple(
                    tuple(self._evaluate(addr.address) for addr in row)
                    for row in cell_range.addresses)
            else:
                # CSE Array Formula
                data = self.eval(cell_range, cell_range.address)
            self.log.info(f"Range {cell_range.address} evaluated to '{data}'")

            cell_range.value = data

        return cell_range.value

    def _evaluate(self, address):
        """Evaluate a single cell"""
        if address not in self.cell_map:
            # INDIRECT() and OFFSET() can produce addresses we don't already have loaded
            self._gen_graph(address)
        cell = self.cell_map[address]

        # calculate the cell value for formulas and ranges
        if cell.needs_calc:
            if isinstance(cell, _CellRange) or cell.address.is_unbounded_range:
                self._evaluate_range(cell.address.address)

            elif cell.python_code:
                self.log.debug(f"Evaluating: {address}, {cell.python_code}")
                value = self.eval(cell)
                if is_address(value):
                    # eval produced an address (aka: a reference)
                    if value.is_range:
                        # complain as we are not going to do any spilling
                        self.log.warning(
                            f"Cell {address} evaluated to '{value}',"
                            f" truncating to '{value.start}'")
                        value = value.start
                    else:
                        self.log.info(
                            f"Cell {address} evaluated to address '{value}'")

                    # fetch the value for this cell, if it exists
                    ref_addr = value.address
                    if ref_addr not in self.cell_map and getattr(
                            self, 'excel', None):
                        # INDIRECT() can produce addresses we don't already have loaded
                        self._gen_graph(ref_addr)

                    value = self.cell_map[ref_addr].value
                else:
                    self.log.info(
                        f"Cell {cell.address} evaluated to '{value}' ({type(value).__name__})"
                    )
                cell.value = (value[0][0] if list_like(value[0]) else
                              value[0]) if list_like(value) else value

        return cell.value

    def _evaluate_non_iterative(self, address):
        """ evaluate a cell or cells in the spreadsheet

        :param address: str, AddressRange, AddressCell or a tuple or list
            or iterable of these three
        :return: evaluated value/values
        """
        if str(address) not in self.cell_map:
            if list_like(address):
                if not isinstance(address, (tuple, list)):
                    address = tuple(address)

                # process a tuple or list of addresses
                return type(address)(self._evaluate_non_iterative(c)
                                     for c in address)

            address = AddressRange.create(address)

            # get the sheet if not specified
            if not address.has_sheet:
                address = AddressRange(
                    address, sheet=self.excel.get_active_sheet_name())

            if address.address not in self.cell_map:
                self._gen_graph(address)

        result = self._evaluate(str(address))
        if isinstance(result, tuple):
            # trim excess dimensions
            if len(result[0]) == 1:
                result = tuple(row[0] for row in result)
            if len(result) == 1:
                result = result[0]
        return result

    def _evaluate_iterative(self, address, iterations=None, tolerance=None):
        """ evaluate a cell or cells in a spreadsheet with cycles

        reference: https://support.microsoft.com/en-us/office/
                    8540bd0f-6e97-4483-bcf7-1b49cd50d123

        :param address: str, AddressRange, AddressCell or a tuple or list
            or iterable of these three
        :param iterations: maximum number of iterations to compute. If not
            specified use the value from the workbook.
        :param tolerance: maximum change, if any calculated value changes by
            more than this, another iteration will be performed. If not
            specified use the value from the workbook.
        :return: evaluated value/values
        """

        iterations = iterations or self.cycles['iterations'] or 10000
        tolerance = tolerance or self.cycles['tolerance'] or 0.01

        progress_tracker = iterative_eval_tracker(iterations, tolerance)
        while True:
            progress_tracker.inc_iteration_number()
            results = self._evaluate_non_iterative(address)
            if progress_tracker.done:
                return results

    def _gen_graph(self, seed, recursed=False):
        """Given a starting point (e.g., A6, or A3:B7) on a particular sheet,
        generate a Spreadsheet instance that captures the logic and control
        flow of the equations.
        """
        if not is_address(seed):
            if isinstance(seed, str):
                seed = AddressRange(seed)
            elif isinstance(seed, collections.abc.Iterable):
                for s in seed:
                    self._gen_graph(s, recursed=True)
                self._process_gen_graph()
                return
            else:
                raise ValueError(f'Unknown seed: {seed}')

        # get/set the current sheet
        if not seed.has_sheet:
            seed = AddressRange(seed, sheet=self.excel.get_active_sheet_name())

        if '[' in seed.sheet:
            raise NotImplementedError('Linked SheetNames')

        if seed.address in self.cell_map:
            # already did this cell/range
            return

        # process the seed
        self._make_cells(seed)

        if not recursed:
            # if not entered to process one cell / cellrange process other work
            self._process_gen_graph()

    def _process_gen_graph(self):

        while self.graph_todos:
            # connect the dependant cells in the graph
            dependant = self.graph_todos.pop()

            self.log.debug(f"Handling {dependant.address}")

            for precedent_address in dependant.needed_addresses:
                if precedent_address.address not in self.cell_map:
                    self._gen_graph(precedent_address, recursed=True)

                self.dep_graph.add_edge(
                    self.cell_map[precedent_address.address], dependant)

        # calc the values for ranges
        for range_todo in reversed(self.range_todos):
            self._evaluate_range(range_todo)
        self.range_todos = []

        self.log.info(
            f"Graph construction done, {len(self.dep_graph.nodes())} nodes, "
            f"{len(self.dep_graph.edges())} edges, "
            f"{len(self.cell_map)} self.cell_map entries")

    def eval_conditional_formats(self, address):
        """Evaluate the conditional format (formulas) for a cell or cells

        returns the conditional format id which is the key for the dict:
          ExcelCompiler.conditional_formats

        NOTE: conditional_formats are not saved in the persistent formats.
              If needed they can be hand serialized into "extra_data"

        :param address: str, AddressRange, AddressCell or a tuple or list
            or iterable of these three
        :return: evaluated objects ids
        """
        if list_like(address):
            if not isinstance(address, (tuple, list)):
                address = tuple(address)

            # process a tuple or list of addresses
            return type(address)(self.eval_conditional_formats(c)
                                 for c in address)

        address = AddressRange.create(address)

        # get the sheet if not specified
        if not address.has_sheet:
            address = AddressRange(address,
                                   sheet=self.excel.get_active_sheet_name())

        if address.is_range:
            return tuple(
                tuple(self.eval_conditional_formats(addr) for addr in row)
                for row in address.rows)

        cf_addr = str(address).replace('!', '.cf!')

        if cf_addr not in self.cell_map:
            phony_cell = _Cell(address)
            formats = self.excel.conditional_format(address)
            format_strs = []
            for f in formats:
                excel_formula = ExcelFormula(f.formula, cell=phony_cell)
                python_code = excel_formula.python_code
                format_strs.append(
                    f'({python_code}, {f.dxf_id}, {int(bool(f.stop_if_true))})'
                )
                self.conditional_formats[f.dxf_id] = f.dxf

            python_code = f"=conditional_format_ids({', '.join(format_strs)})"
            a_cell = _Cell(address, formula=python_code)
            self.cell_map[cf_addr] = a_cell
            self._gen_graph(a_cell.formula.needed_addresses)

        return self.eval(self.cell_map[cf_addr])
示例#9
0
class ExcelCompiler:
    """Class responsible for taking an Excel spreadsheet and compiling it
    to an instance that can be serialized to disk, and executed
    independently of excel.
    """

    save_file_extensions = ('pkl', 'pickle', 'yml', 'yaml', 'json')

    def __init__(self, filename=None, excel=None, plugins=None):
        """ Build a compiler instance to organize the formula for a workbook

        :param filename: Excel filename to load from (xlsx or `to_file`)
        :param excel: Opened instance of ExcelWrapper
        :param plugins: module paths for plugin lib functions
        """

        self._eval = None

        if excel:
            # if we are running as an excel addin, this gets passed to us
            self.excel = excel
            self.filename = excel.filename
            self.hash = None
        else:
            # TODO: use a proper interface so we can (eventually) support
            # loading from file (much faster)  Still need to find a good lib.
            self.excel = ExcelOpxWrapper(filename=filename)
            self.excel.connect()
            self.filename = filename

        # grab a copy of the current hash
        self._excel_file_md5_digest = self._compute_excel_file_md5_digest

        self.log = logging.getLogger('pycel')

        # directed graph for cell dependencies
        self.dep_graph = nx.DiGraph()

        # cell address to Cell mapping, cells and ranges already built
        self.cell_map = {}

        # cells, ranges and graph_edges that need to be built
        self.graph_todos = []
        self.range_todos = []

        self.extra_data = None
        self._formula_cells_dict = {}
        self._plugin_modules = plugins

    def __getstate__(self):
        # code objects are not serializable
        state = dict(self.__dict__)
        for to_remove in '_eval excel log graph_todos range_todos'.split():
            if to_remove in state:    # pragma: no branch
                state[to_remove] = None
        return state

    def __setstate__(self, d):
        self.__dict__.update(d)
        self.log = logging.getLogger('pycel')

    @staticmethod
    def _compute_file_md5_digest(filename):
        if not os.path.exists(filename):
            return None
        else:
            hash_md5 = hashlib.md5()
            with open(filename, "rb") as f:
                for chunk in iter(lambda: f.read(4096), b""):
                    hash_md5.update(chunk)
            return hash_md5.hexdigest()

    @property
    def _compute_excel_file_md5_digest(self):
        return self._compute_file_md5_digest(self.filename)

    @property
    def hash_matches(self):
        current_hash = self._compute_excel_file_md5_digest
        return self._excel_file_md5_digest == current_hash

    @property
    def eval(self):
        if self._eval is None:
            self._eval = ExcelFormula.build_eval_context(
                self._evaluate, self._evaluate_range,
                self.log, plugins=self._plugin_modules)
        return self._eval

    @classmethod
    def _filename_has_extension(cls, filename):
        return next((extension for extension in cls.save_file_extensions
                     if filename.endswith(extension)), None)

    def _to_text(self, filename=None, is_json=False):
        """Serialize to a json/yaml file"""
        extra_data = {} if self.extra_data is None else self.extra_data

        def cell_value(a_cell):
            if a_cell.formula and a_cell.formula.python_code:
                return '=' + a_cell.formula.python_code
            else:
                return a_cell.value

        extra_data.update(dict(
            excel_hash=self._excel_file_md5_digest,
            cell_map=dict(sorted(
                ((addr, cell_value(cell))
                 for addr, cell in self.cell_map.items() if cell.serialize),
                key=lambda x: AddressRange(x[0]).sort_key
            )),
        ))
        if not filename:
            filename = self.filename + ('.json' if is_json else '.yml')

        # hash the current file to see if this function makes any changes
        existing_hash = (self._compute_file_md5_digest(filename)
                         if os.path.exists(filename) else None)

        if not is_json:
            with open(filename, 'w') as f:
                ymlo = YAML()
                ymlo.width = 120
                ymlo.dump(extra_data, f)
        else:
            with open(filename, 'w') as f:
                json.dump(extra_data, f, indent=4)

        del extra_data['cell_map']

        # hash the newfile, return True if it changed, this is only reliable
        # on pythons which have ordered dict (CPython 3.6 & python 3.7+)
        return (existing_hash is None or
                existing_hash != self._compute_file_md5_digest(filename))

    @classmethod
    def _from_text(cls, filename, is_json=False):
        """deserialize from a json/yaml file"""

        if not is_json:
            if not filename.split('.')[-1].startswith('y'):
                filename += '.yml'
        else:
            if not filename.endswith('.json'):  # pragma: no branch
                filename += '.json'

        with open(filename, 'r') as f:
            data = YAML().load(f)

        excel = _CompiledImporter(filename, data)
        excel_compiler = cls(excel=excel)
        excel.compiler = excel_compiler

        def add_line_numbers(cell_addr, line_number):
            formula = excel_compiler.cell_map[cell_addr].formula
            if formula is not None:
                formula.lineno = line_number
                formula.filename = filename

        # populate the cells
        range_todos = []
        for address, python_code in data['cell_map'].items():
            lineno = data['cell_map'].lc.data[address][0] + 1
            address = AddressRange(address)
            if address.is_range:
                range_todos.append((address, lineno))
            else:
                excel_compiler._make_cells(address)
                add_line_numbers(address.address, lineno)

        # populate the ranges and dependant graph
        for address, lineno in range_todos:
            excel_compiler._make_cells(address)
            add_line_numbers(address.address, lineno)

        excel_compiler._process_gen_graph()
        del data['cell_map']

        # process the rest of the data from the file
        excel_compiler._excel_file_md5_digest = data['excel_hash']
        del data['excel_hash']
        excel_compiler.extra_data = data

        # remove "excel" file references for GC
        excel_compiler.excel = None
        return excel_compiler

    def to_file(self, filename=None, file_types=('pkl', 'yml')):
        """ Save the spreadsheet to a file so it can be loaded later w/o excel

        :param filename: filename to save as, defaults to xlsx_name + file_type
        :param file_types: one or more of: pkl, pickle, yml, yaml, json

        If the filename has one of the expected extensions, then this
        parameter is ignored.

        The text file formats (yaml and json) provide the benefits of:
            1. Can `diff` subsequent version of xlsx to monitor changes.
            2. Can "debug" the generated code.
                Since the compiled code is marked with the line number in
                the text file and will be shown by debuggers and stack traces.
            3. The file size on disk is somewhat smaller than pickle files

        The pickle file format provides the benefits of:
            1. Much faster to load (5x to 10x)
            2. ...  (no #2, speed is the thing)
        """

        filename = filename or self.filename
        extension = self._filename_has_extension(filename)
        if extension:
            file_types = (extension, )
        elif isinstance(file_types, str):
            file_types = (file_types, )

        unknown_types = tuple(ft for ft in file_types
                              if ft not in self.save_file_extensions)
        if unknown_types:
            raise ValueError('Unknown file types: {}'.format(
                ' '.join(unknown_types)))

        pickle_extension = next((ft for ft in file_types
                                 if ft.startswith('p')), None)
        non_pickle_extension = next((ft for ft in file_types
                                     if not ft.startswith('p')), None)
        extra_extensions = tuple(ft for ft in file_types if ft not in (
            pickle_extension, non_pickle_extension))

        if extra_extensions:
            raise ValueError(
                'Only allowed one pickle extension and one text extension. '
                'Extras: {}'.format(extra_extensions))

        is_json = non_pickle_extension and non_pickle_extension[0] == 'j'

        # round trip through yaml/json to strip out junk
        text_name = filename
        if not text_name.endswith(non_pickle_extension or '.yml'):
            text_name += '.' + (non_pickle_extension or 'yml')
        text_changed = self._to_text(text_name, is_json=is_json)

        # save pickle file if requested and has changed
        if pickle_extension:
            if not filename.endswith(pickle_extension):
                filename += '.' + pickle_extension

            if text_changed or not os.path.exists(filename):
                excel_compiler = self._from_text(text_name, is_json=is_json)
                if non_pickle_extension not in file_types:
                    os.unlink(text_name)

                with open(filename, 'wb') as f:
                    pickle.dump(excel_compiler, f)

    @classmethod
    def from_file(cls, filename):
        """ Load the spreadsheet saved by `to_file`

        :param filename: filename to load from, can be xlsx_name
        """

        extension = cls._filename_has_extension(filename) or next(
            (ext for ext in cls.save_file_extensions
             if os.path.exists(filename + '.' + ext)), None)

        if not extension:
            raise ValueError("Unrecognized file type or compiled file not found"
                             ": '{}'".format(filename))

        if not filename.endswith(extension):
            filename += '.' + extension

        if extension[0] == 'p':
            with open(filename, 'rb') as f:
                excel_compiler = pickle.load(f)
        else:
            excel_compiler = cls._from_text(
                filename, is_json=extension == 'json')

        return excel_compiler

    def export_to_dot(self, filename=None):
        try:
            # test pydot is importable  (optionally installed)
            import pydot  # noqa: F401
        except ImportError:
            raise ImportError("Package 'pydot' is not installed")

        from networkx.drawing.nx_pydot import write_dot
        filename = filename or (self.filename + '.dot')
        write_dot(self.dep_graph, filename)

    def export_to_gexf(self, filename=None):
        from networkx.readwrite.gexf import write_gexf
        filename = filename or (self.filename + '.gexf')
        write_gexf(self.dep_graph, filename)

    def plot_graph(self, layout_type='spring_layout'):
        try:
            # test matplotlib is importable  (optionally installed)
            import matplotlib.pyplot as plt
        except ImportError:
            raise ImportError("Package 'matplotlib' is not installed")

        pos = getattr(nx, layout_type)(self.dep_graph, iterations=2000)
        nx.draw_networkx_nodes(self.dep_graph, pos)
        nx.draw_networkx_edges(self.dep_graph, pos, arrows=True)
        nx.draw_networkx_labels(self.dep_graph, pos)
        plt.show()

    def set_value(self, address, value, set_as_range=False):
        """ Set the value of one or more cells or ranges

        :param address: `str`, `AddressRange`, `AddressCell` or a tuple, list
            or an iterable of these three
        :param value: value to set.  This can be a value or a tuple/list
            which matches the shapes needed for the given address/addresses
        :param set_as_range: With a single range address and a list like value,
            set to true to set the entire rnage to the inserted list.
        """

        if list_like(value) and not set_as_range:
            value = tuple(flatten(value))
            if list_like(address):
                address = (AddressCell(addr) for addr in flatten(address))
            else:
                address = flatten(AddressRange(address).resolve_range)
            address = tuple(address)
            assert len(address) == len(value)
            for addr, val in zip(address, value):
                self.set_value(addr, val)
            return

        elif address not in self.cell_map:
            address = AddressRange.create(address).address
            assert address in self.cell_map

        if set_as_range and list_like(value) and not (
                value and list_like(value[0])):
            value = (value, )

        cell_or_range = self.cell_map[address]

        if cell_or_range.value != value:  # pragma: no branch
            # need to be able to 'set' an empty cell
            if cell_or_range.value is None:
                cell_or_range.value = value

            # reset the node + its dependencies
            self._reset(cell_or_range)

            # set the value
            cell_or_range.value = value

    def _reset(self, cell):
        if cell.value is None:
            return
        self.log.info("Resetting {}".format(cell.address))
        cell.value = None

        if cell in self.dep_graph:
            for child_cell in self.dep_graph.successors(cell):
                if child_cell.value is not None:
                    self._reset(child_cell)

    def value_tree_str(self, address, indent=0):
        """Generator which returns a formatted dependency graph"""
        cell = self.cell_map[address]
        yield "{}{} = {}".format(" " * indent, address, cell.value)
        for children in sorted(self.dep_graph.predecessors(cell),
                               key=lambda a: a.address.address):
            yield from self.value_tree_str(children.address.address, indent + 1)

    def recalculate(self):
        """Recalculate all of the known cells"""
        for cell in self.cell_map.values():
            if isinstance(cell, _CellRange) or cell.formula:
                cell.value = None

        for cell in self.cell_map.values():
            if isinstance(cell, _CellRange):
                self._evaluate_range(cell.address.address)
            else:
                self._evaluate(cell.address.address)

    def trim_graph(self, input_addrs, output_addrs):
        """Remove unneeded cells from the graph"""
        input_addrs = tuple(AddressRange(addr).address for addr in input_addrs)
        output_addrs = tuple(AddressRange(addr) for addr in output_addrs)

        # 1) build graph for all needed outputs
        self._gen_graph(output_addrs)

        # 2) walk the dependant tree (from the inputs) and find needed cells
        needed_cells = set()

        def walk_dependents(cell):
            """passed in a _Cell or _CellRange"""
            for child_cell in self.dep_graph.successors(cell):
                child_addr = child_cell.address.address
                if child_addr not in needed_cells:
                    needed_cells.add(child_addr)
                    walk_dependents(child_cell)

        missing_dependants = set()
        for addr in input_addrs:
            try:
                if addr in self.cell_map:
                    walk_dependents(self.cell_map[addr])
                    msg = ''
                else:
                    msg = ('warning',
                           'Address {} not found in cell_map'.format(addr))
            except nx.exception.NetworkXError as exc:
                if AddressRange(addr) not in output_addrs:
                    msg = 'error', '{}: which usually means no outputs ' \
                                   'are dependant on it.'.format(exc)
                else:
                    msg = 'warning', str(exc)
            if msg:
                missing_dependants.add((addr, *msg))
        if missing_dependants:
            for addr, level, msg in missing_dependants:
                getattr(self.log, level)("Input address {}: {}".format(
                    addr, msg
                ))
        if any(m[1] != 'warning' for m in missing_dependants):
            raise ValueError('\n' + '\n'.join(
                map(str, sorted(missing_dependants, key=lambda x: x[2]))))

        # even unconnected output addresses are needed
        for addr in output_addrs:
            needed_cells.add(addr.address)

        # 3) walk the precedent tree (from the output) and trim unneeded cells
        processed_cells = set()

        def walk_precedents(cell):
            for child_address in (a.address for a in cell.needed_addresses):
                if child_address not in processed_cells:  # pragma: no branch
                    processed_cells.add(child_address)
                    child_cell = self.cell_map[child_address]
                    if child_address in needed_cells or ':' in child_address:
                        walk_precedents(child_cell)
                    else:
                        # trim this cell, now we will need only its value
                        needed_cells.add(child_address)
                        child_cell.formula = None
                        self.log.debug('Trimming {}'.format(child_address))

        for addr in output_addrs:
            walk_precedents(self.cell_map[addr.address])

        # 4) check for any buried (not leaf node) inputs
        for addr in input_addrs:
            cell = self.cell_map.get(addr)
            if cell and getattr(cell, 'formula', None):
                self.log.info("{} is not a leaf node".format(addr))

        # 5) remove unneeded cells
        cells_to_remove = tuple(addr for addr in self.cell_map
                                if addr not in needed_cells)
        for addr in cells_to_remove:
            del self.cell_map[addr]

    def validate_calcs(self, output_addrs=None, sheet=None, verify_tree=True):
        """For each address, calc the value, and verify that it matches

        This is a debugging tool which will show which cells evaluate
        differently than they do for excel.

        :param output_addrs: The cells to evaluate from (defaults to all)
        :param sheet: The sheet to evaluate from (defaults to all)
        :param verify_tree: Follow the tree to any precedent nodes
        :return: dict of addresses with good/bad values that failed to verify
        """
        def close_enough(val1, val2):
            import pytest
            if isinstance(val1, (int, float)) and \
                    isinstance(val2, (int, float)):
                return val2 == pytest.approx(val1)
            else:
                return val1 == val2

        Mismatch = collections.namedtuple('Mismatch', 'original calced formula')

        if output_addrs is None:
            to_verify = list(self.formula_cells(sheet))
            print('Found {} formulas to evaluate'.format(len(to_verify)))
        elif list_like(output_addrs):
            to_verify = [AddressCell(addr) for addr in flatten(output_addrs)]
        else:
            to_verify = [AddressCell(output_addrs)]

        verified = set()
        failed = {}
        while to_verify:
            addr = to_verify.pop()
            if len(to_verify) % 100 == 0:
                print("{} formulas left to process".format(len(to_verify)))
            try:
                self._gen_graph(addr)
                cell = self.cell_map[addr.address]
                if isinstance(cell, _Cell) and cell.python_code:
                    original_value = cell.value
                    if original_value == str(cell.formula):
                        self.log.debug(
                            "No Orig data?: {}: {}".format(addr, cell.value))
                        continue

                    cell.value = None
                    self._evaluate(addr.address)

                    if not (original_value is None or
                            close_enough(original_value, cell.value)):
                        failed.setdefault('mismatch', {})[str(addr)] = Mismatch(
                            original_value, cell.value,
                            cell.formula.base_formula)
                        print('{} mismatch  {} -> {}  {}'.format(
                            addr, original_value, cell.value,
                            cell.formula.base_formula))

                        # do it again to allow easy breakpointing
                        cell.value = None
                        self._evaluate(cell.address.address)

                verified.add(addr)
                if verify_tree:  # pragma: no branch
                    for addr in cell.needed_addresses:
                        if addr not in verified:  # pragma: no branch
                            to_verify.append(addr)
            except Exception as exc:
                cell = self.cell_map.get(addr.address, None)
                formula = cell and cell.formula.base_formula
                exc_str = str(exc)
                exc_str_split = exc_str.split('\n')

                if 'is not implemented' in exc_str:
                    exc_str_key = exc_str.split('is not implemented')[0]
                    exc_str_key = exc_str_key.strip().rsplit(' ', 1)[1].upper()
                    not_implemented = True

                else:
                    if len(exc_str_split) == 1:
                        exc_str_key = '{}: {}'.format(
                            type(exc).__name__, exc_str)
                    else:
                        exc_str_key = exc_str_split[-2]  # pragma: no cover
                    not_implemented = exc_str_key.startswith(
                        'NotImplementedError: ')

                if not_implemented:
                    failed.setdefault('not-implemented', {}).setdefault(
                        exc_str_key, []).append((str(addr), formula, exc_str))
                else:
                    failed.setdefault('exceptions', {}).setdefault(
                        exc_str_key, []).append((str(addr), formula, exc_str))

        return failed

    def formula_cells(self, sheet=None):
        """Iterate all cells and find cells with formulas"""
        if sheet is None:
            return list(it.chain.from_iterable(
                self.formula_cells(sheet.title)
                for sheet in self.excel.workbook))

        if sheet not in self._formula_cells_dict:
            if sheet in self.excel.workbook:
                self._formula_cells_dict[sheet] = tuple(
                    AddressCell.create(cell.coordinate, sheet)
                    for row in self.excel.workbook[sheet].iter_rows()
                    for cell in row
                    if isinstance(getattr(cell, 'value', None), str) and
                    cell.value.startswith('=')
                )
            else:
                self._formula_cells_dict[sheet] = tuple()

        return self._formula_cells_dict[sheet]

    def _make_cells(self, address):
        """Given an AddressRange or AddressCell generate compiler Cells"""

        # from here don't build cells that are already in the cell_map
        assert address.address not in self.cell_map

        def add_node_to_graph(node):
            self.dep_graph.add_node(node)
            self.dep_graph.node[node]['sheet'] = node.sheet
            self.dep_graph.node[node]['label'] = node.address.coordinate

            # stick in queue to add edges
            self.graph_todos.append(node)

        def build_cell(excel_cell):
            a_cell = _Cell(excel_cell.address, value=excel_cell.values,
                           formula=excel_cell.formula, excel=self.excel)
            self.cell_map[str(excel_cell.address)] = a_cell
            return [a_cell]

        def build_range(excel_range):
            a_range = _CellRange(excel_range, excel=self.excel)
            self.cell_map[str(excel_range.address)] = a_range

            added = [a_range]
            if isinstance(excel_range.formula, tuple):
                for addr, value, formula in a_range.cells_to_build(excel_range):
                    if addr.address not in self.cell_map:
                        a_cell = _Cell(addr, value, formula, self.excel)
                        self.cell_map[addr.address] = a_cell
                        added.append(a_cell)
            else:
                for addr in a_range.needed_addresses:
                    if addr.address not in self.cell_map:
                        self._make_cells(addr)
            return added

        self.log.debug('_make_cells: {}'.format(address))
        excel_data = self.excel.get_range(address)
        if address.is_range:
            if excel_data.address != address:
                # if the actual data returned is not the same as the address
                # given, then use a reference
                self.cell_map[str(address)] = _Cell(
                    address, formula=REF_FORMAT.format(excel_data.address),
                    excel=self.excel)

            self.range_todos.append(str(excel_data.address))
            new_nodes = build_range(excel_data)
        else:
            new_nodes = build_cell(excel_data)

        for new_node in new_nodes:
            if isinstance(new_node, _CellRange) or new_node.formula:
                # nodes to analyze: only ranges and formulas have precedents
                add_node_to_graph(new_node)

    def _evaluate_range(self, address):
        """Evaluate a range"""
        if address == 'None':
            return NULL_ERROR

        cell_range = self.cell_map.get(address)
        if cell_range is None:
            # we don't save the _CellRange values in the text format files
            assert '!' in address, "{} missing sheetname".format(address)
            self._gen_graph(address)
            cell_range = self.cell_map[address]

        if cell_range.value is None:
            self.log.debug("Evaluating: {}, {}".format(
                cell_range.address, cell_range.python_code))
            if cell_range.formula is None:
                data = tuple(
                    tuple(self._evaluate(addr.address) for addr in row)
                    for row in cell_range.addresses
                )
            else:
                # CSE Array Formula
                data = self.eval(cell_range.formula, cell_range.address)
            self.log.info("Range %s evaluated to '%s'" % (
                cell_range.address, data))

            cell_range.value = data

        return cell_range.value

    def _evaluate(self, address):
        """Evaluate a single cell"""
        cell = self.cell_map[address]

        # calculate the cell value for formulas and ranges
        if cell.value is None:
            if isinstance(cell, _CellRange):
                self._evaluate_range(cell.address.address)

            elif cell.python_code:
                self.log.debug(
                    "Evaluating: {}, {}".format(cell.address, cell.python_code))
                value = self.eval(cell.formula)
                self.log.info("Cell %s evaluated to '%s' (%s)" % (
                    cell.address, value, type(value).__name__))
                cell.value = VALUE_ERROR if list_like(value) else value

        if isinstance(cell.value, AddressRange):
            # If the cell returns a reference, then dereference
            return self._evaluate(str(cell.value))

        return cell.value

    def evaluate(self, address):
        """ evaluate a cell or cells in the spreadsheet

        :param address: str, AddressRange, AddressCell or a tuple or list
            or iterable of these three
        :return: evaluated value/values
        """

        if str(address) not in self.cell_map:
            if list_like(address):
                if not isinstance(address, (tuple, list)):
                    address = tuple(address)

                # process a tuple or list of addresses
                return type(address)(self.evaluate(c) for c in address)

            address = AddressRange.create(address)

            # get the sheet if not specified
            if not address.has_sheet:
                address = AddressRange(
                    address, sheet=self.excel.get_active_sheet_name())

            if address.address not in self.cell_map:
                self._gen_graph(address.address)

        result = self._evaluate(str(address))
        if isinstance(result, tuple):
            # trim excess dimensions
            if len(result[0]) == 1:
                result = tuple(row[0] for row in result)
            if len(result) == 1:
                result = result[0]
        return result

    def _gen_graph(self, seed, recursed=False):
        """Given a starting point (e.g., A6, or A3:B7) on a particular sheet,
        generate a Spreadsheet instance that captures the logic and control
        flow of the equations.
        """
        if not isinstance(seed, (AddressRange, AddressCell)):
            if isinstance(seed, str):
                seed = AddressRange(seed)
            elif isinstance(seed, collections.abc.Iterable):
                for s in seed:
                    self._gen_graph(s, recursed=True)
                self._process_gen_graph()
                return
            else:
                raise ValueError('Unknown seed: {}'.format(seed))

        # get/set the current sheet
        if not seed.has_sheet:
            seed = AddressRange(seed, sheet=self.excel.get_active_sheet_name())

        if '[' in seed.sheet:
            raise NotImplementedError('Linked SheetNames')

        if seed.address in self.cell_map:
            # already did this cell/range
            return

        # process the seed
        self._make_cells(seed)

        if not recursed:
            # if not entered to process one cell / cellrange process other work
            self._process_gen_graph()

    def _process_gen_graph(self):

        while self.graph_todos:
            # connect the dependant cells in the graph
            dependant = self.graph_todos.pop()

            self.log.debug("Handling {}".format(dependant.address))

            for precedent_address in dependant.needed_addresses:
                if precedent_address.address not in self.cell_map:
                    self._gen_graph(precedent_address, recursed=True)

                self.dep_graph.add_edge(
                    self.cell_map[precedent_address.address], dependant)

        # calc the values for ranges
        for range_todo in reversed(self.range_todos):
            self._evaluate_range(range_todo)
        self.range_todos = []

        self.log.info(
            "Graph construction done, %s nodes, "
            "%s edges, %s self.cell_map entries" % (
                len(self.dep_graph.nodes()),
                len(self.dep_graph.edges()),
                len(self.cell_map))
        )
示例#10
0
    def __init__(self, filename=None, excel=None, plugins=None, cycles=None):
        """ Build a compiler instance to organize the formula for a workbook

        :param filename: Excel filename to load from (xlsx or `to_file`)
        :param excel: Opened instance of ExcelWrapper
        :param plugins: module paths for plugin lib functions
        :param cycles: Override workbook iterative calculation settings
        """

        self._eval = None

        if excel:
            # if we are running as an excel addin, this gets passed to us
            self.excel = excel
            self.filename = excel.filename
            self.hash = None
        else:
            # TODO: use a proper interface so we can (eventually) support
            # loading from file (much faster)  Still need to find a good lib.
            self.excel = ExcelOpxWrapper(filename=filename)
            self.excel.connect()
            self.filename = filename

        # grab a copy of the current hash
        self._excel_file_md5_digest = self._compute_excel_file_md5_digest

        self.log = pycel_logger

        # directed graph for cell dependencies
        self.dep_graph = nx.DiGraph()

        # cell address to Cell mapping, cells and ranges already built
        self.cell_map = {}

        # cells, ranges and graph_edges that need to be built
        self.graph_todos = []
        self.range_todos = []

        self.extra_data = None
        self.conditional_formats = {}
        self._formula_cells_dict = {}
        self._plugin_modules = plugins

        # Setup to be able to evaluate circular references
        self.cycles = cycles
        if hasattr(self.excel, 'workbook'):
            wb_cycles = bool(self.excel.workbook.calculation.iterate)
            if self.cycles is None:
                self.cycles = wb_cycles
            elif wb_cycles != bool(self.cycles):
                msg = "Initialized with cycles: {}, while workbook says: {}"
                self.log.warning(msg.format(self.cycles, wb_cycles))

            if self.cycles:
                self.cycles = dict(
                    iterations=self.excel.workbook.calculation.iterateCount,
                    tolerance=self.excel.workbook.calculation.iterateDelta,
                )

        self.Cell = _CycleCell if self.cycles else _Cell
        self.evaluate = (self._evaluate_iterative
                         if self.cycles else self._evaluate_non_iterative)