Пример #1
    def check_external_relationships(self):
        """
        Check whether this file has external relationships (remote template, OLE object, etc).

        The check only applies to OpenXML files; for any other file type the
        indicator is returned with its initial value of 0.

        :returns: :py:class:`Indicator`
        """
        # NOTE(review): the constructor call was truncated in this snippet; the
        # identifier, initial value, type and initial risk below are
        # reconstructed and should be confirmed against the Indicator class.
        # If callers expect the indicator to be registered on the instance
        # (e.g. appended to a list of indicators), that line is missing too.
        ext_rels = Indicator(
            _id='ext_rels',
            value=0,
            _type=int,
            name='External Relationships',
            description='External relationships such as remote templates, remote OLE objects, etc',
            risk=RISK.NONE,
        )
        # this check only works for OpenXML files
        if not self.ftg.is_openxml():
            return ext_rels
        # to collect relationship types:
        rel_types = set()
        # open an XmlParser, using a BytesIO instead of filename (to work in memory)
        xmlparser = ooxml.XmlParser(self.data_bytesio)
        for rel_type, target in oleobj.find_external_relationships(xmlparser):
            log.debug('External relationship: type={} target={}'.format(
                rel_type, target))
            # remember the type so it can be listed in the description below
            rel_types.add(rel_type)
            ext_rels.value += 1
        if ext_rels.value > 0:
            # sorted() keeps the description stable between runs
            ext_rels.description = 'External relationships found: {} - use oleobj for details'.format(
                ', '.join(sorted(rel_types)))
            ext_rels.risk = RISK.HIGH
        return ext_rels
Пример #2
    def test_iter_subfiles(self):
        """ test that limitation on few subfiles works

        Restricts iter_xml to two subfiles and checks that no other subfile is
        yielded and that each requested subfile is completed exactly once.
        """
        testfile = join(DATA_BASE_DIR, 'msodde', 'dde-test.xlsx')
        subfiles = ['xl/theme/theme1.xml', 'docProps/app.xml']
        parser = ooxml.XmlParser(testfile)
        for subfile, elem, depth in parser.iter_xml(subfiles):
            if self.DO_DEBUG:
                print(u'{0} {1}{2}'.format(subfile, '  ' * depth,
                                           ooxml.debug_str(elem)))
            if subfile not in subfiles:
                self.fail('should have been skipped: {0}'.format(subfile))
            if depth == 0:
                # depth 0 marks the end of a subfile: tick it off so that the
                # final assertion can detect subfiles that were never yielded
                try:
                    subfiles.remove(subfile)
                except ValueError:
                    self.fail('returned more than once: {0}'.format(subfile))

        self.assertEqual(subfiles, [],
                         'missed subfile(s) {0}'.format(subfiles))
Пример #3
def process_xlsx(filepath):
    """ process an OOXML excel file (e.g. .xlsx or .xlsb or .xlsm)

    Collects DDE links from the xml parts (``ddeLink`` elements) and from the
    binary record parts (e.g. in .xlsb files).

    :param filepath: file to parse, handed to :py:class:`ooxml.XmlParser`
    :returns: all links found, one per line, each prefixed with ``DDE-Link``
    """
    dde_links = []
    parser = ooxml.XmlParser(filepath)
    for _, elem, _ in parser.iter_xml():
        tag = elem.tag.lower()
        if tag == 'ddelink' or tag.endswith('}ddelink'):
            # we have found a dde link. Try to get more info about it
            link_info = ['DDE-Link']
            if 'ddeService' in elem.attrib:
                link_info.append(elem.attrib['ddeService'])
            if 'ddeTopic' in elem.attrib:
                link_info.append(elem.attrib['ddeTopic'])
            dde_links.append(u' '.join(link_info))

    # binary parts, e.g. contained in .xlsb
    for subfile, content_type, handle in parser.iter_non_xml():
        try:
            logging.info(
                'Parsing non-xml subfile {0} with content type {1}'.format(
                    subfile, content_type))
            for record in xls_parser.parse_xlsb_part(handle, content_type,
                                                     subfile):
                logging.debug('{0}: {1}'.format(subfile, record))
                if isinstance(record, xls_parser.XlsbBeginSupBook) and \
                        record.link_type == \
                        xls_parser.XlsbBeginSupBook.LINK_TYPE_DDE:
                    dde_links.append('DDE-Link ' + record.string1 + ' ' +
                                     record.string2)
        except Exception:
            # best effort: a part that cannot be parsed must not abort the
            # whole file; only the log level depends on the content type
            if content_type.startswith(('application/vnd.ms-excel.',
                                        'application/vnd.ms-office.')):
                # should really be able to parse these either as xml or records
                log_func = logging.warning
            elif content_type.startswith('image/') or content_type == \
                    'application/vnd.openxmlformats-officedocument.' + \
                    'spreadsheetml.printerSettings':
                # understandable that these are not record-base
                log_func = logging.debug
            else:  # default
                log_func = logging.info
            log_func('Failed to parse {0} of content type {1}'.format(
                subfile, content_type))
            # in any case: continue with next

    return u'\n'.join(dde_links)
Пример #4
def process_xlsx(filepath):
    """process an OOXML excel file (e.g. .xlsx or .xlsb or .xlsm)

    Collects DDE links from the xml parts (``ddeLink`` elements) and from the
    binary record parts (e.g. in .xlsb files).

    :param filepath: file to parse, handed to :py:class:`ooxml.XmlParser`
    :returns: all links found, one per line
    """
    dde_links = []
    parser = ooxml.XmlParser(filepath)
    for _, elem, _ in parser.iter_xml():
        tag = elem.tag.lower()
        if tag == "ddelink" or tag.endswith("}ddelink"):
            # we have found a dde link. Try to get more info about it
            link_info = []
            if "ddeService" in elem.attrib:
                link_info.append(elem.attrib["ddeService"])
            if "ddeTopic" in elem.attrib:
                link_info.append(elem.attrib["ddeTopic"])
            dde_links.append(u" ".join(link_info))

    # binary parts, e.g. contained in .xlsb
    for subfile, content_type, handle in parser.iter_non_xml():
        try:
            logger.info("Parsing non-xml subfile {0} with content type {1}".format(subfile, content_type))
            for record in xls_parser.parse_xlsb_part(handle, content_type, subfile):
                logger.debug("{0}: {1}".format(subfile, record))
                if isinstance(record, xls_parser.XlsbBeginSupBook) and record.link_type == xls_parser.XlsbBeginSupBook.LINK_TYPE_DDE:
                    dde_links.append(record.string1 + " " + record.string2)
        except Exception as exc:
            # best effort: a part that cannot be parsed must not abort the
            # whole file; only the log level depends on the content type
            if content_type.startswith(("application/vnd.ms-excel.", "application/vnd.ms-office.")):
                # should really be able to parse these either as xml or records
                log_func = logger.warning
            elif (
                content_type.startswith("image/")
                or content_type == "application/vnd.openxmlformats-officedocument." + "spreadsheetml.printerSettings"
            ):
                # understandable that these are not record-base
                log_func = logger.debug
            else:  # default
                log_func = logger.info
            log_func('Failed to parse {0} of content type {1} ("{2}")'.format(subfile, content_type, str(exc)))
            # in any case: continue with next

    return u"\n".join(dde_links)
Пример #5
    def test_iter_tags(self):
        """ test that limitation to tags works

        Only paragraph elements may be yielded, each of them must still carry
        its children, and exactly 7 of them are expected in the test file.
        """
        testfile = join(DATA_BASE_DIR, 'msodde', 'harmless-clean.docm')
        nmspc = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
        tag = '{' + nmspc + '}p'

        parser = ooxml.XmlParser(testfile)
        n_found = 0
        for _, elem, _ in parser.iter_xml(tags=tag):
            n_found += 1
            self.assertEqual(elem.tag, tag)

            # also check that children are present
            n_children = 0
            for child in elem:
                n_children += 1
                self.assertFalse(child.tag == '')
            self.assertTrue(
                n_children > 0,
                'no children for elem {0}'.format(ooxml.debug_str(elem)))

        self.assertEqual(n_found, 7)
Пример #6
def process_excel_xml(filepath):
    """ find dde links in xml files created with excel 2003 or excel 2007+

    TODO: did not manage to create dde-link in the 2007+-xml-format. Find out
          whether this is possible at all. If so, extend this function

    :param filepath: file to parse, handed to :py:class:`ooxml.XmlParser`
    :returns: all links found, one per line; each line is built from the first
              two groups matched by XML_DDE_FORMAT
    """
    dde_links = []
    parser = ooxml.XmlParser(filepath)
    for _, elem, _ in parser.iter_xml():
        tag = elem.tag.lower()
        if tag != 'cell' and not tag.endswith('}cell'):
            continue   # we are only interested in cells
        formula = None
        for key in elem.keys():
            # attribute name may or may not carry a namespace prefix
            if key.lower() == 'formula' or key.lower().endswith('}formula'):
                formula = elem.get(key)
        if formula is None:
            continue   # cell without formula cannot contain a dde link
        log.debug('found cell with formula {0}'.format(formula))
        match = re.match(XML_DDE_FORMAT, formula)
        if match:
            dde_links.append(u' '.join(match.groups()[:2]))
    return u'\n'.join(dde_links)
Пример #7
    def test_iter_all(self):
        """ test iter_xml without args

        Counts the elements yielded per subfile and compares them with the
        expected numbers; also checks that no expected subfile is left out.
        """
        expect_subfiles = {
            '[Content_Types].xml': 11,
            '_rels/.rels': 4,
            'word/_rels/document.xml.rels': 6,
            'word/document.xml': 102,
            'word/theme/theme1.xml': 227,
            'word/settings.xml': 40,
            'word/fontTable.xml': 25,
            'word/webSettings.xml': 3,
            'docProps/app.xml': 26,
            'docProps/core.xml': 10,
            'word/styles.xml': 441,
        }
        n_elems = 0
        testfile = join(DATA_BASE_DIR, 'msodde', 'harmless-clean.docx')
        for subfile, _, depth in ooxml.XmlParser(testfile).iter_xml():
            n_elems += 1
            if depth > 0:
                continue

            # now depth == 0; should occur once at end of every subfile
            if subfile not in expect_subfiles:
                self.fail('Subfile {0} not expected'.format(subfile))
            self.assertEqual(
                n_elems, expect_subfiles[subfile],
                'wrong number of elems ({0}) yielded from {1}'.format(
                    n_elems, subfile))
            # tick off this subfile and start counting anew for the next one
            _ = expect_subfiles.pop(subfile)
            n_elems = 0

        self.assertEqual(
            len(expect_subfiles), 0,
            'Forgot to iterate through subfile(s) {0}'.format(
                list(expect_subfiles)))
Пример #8
def process_xlsx(filepath, filed_filter_mode=None):
    """ process an OOXML excel file (e.g. .xlsx or .xlsb or .xlsm)

    Collects DDE links from the xml parts (``ddeLink`` elements) and from the
    binary record parts (e.g. in .xlsb files).

    :param filepath: file to parse, handed to :py:class:`ooxml.XmlParser`
    :param filed_filter_mode: not used by this function; the (misspelled)
                              name is kept as it is part of the signature
    :returns: all links found, one per line, each prefixed with ``DDE-Link``
    """
    dde_links = []
    parser = ooxml.XmlParser(filepath)
    for _, elem, _ in parser.iter_xml():
        tag = elem.tag.lower()
        if tag == 'ddelink' or tag.endswith('}ddelink'):
            # we have found a dde link. Try to get more info about it
            link_info = ['DDE-Link']
            if 'ddeService' in elem.attrib:
                link_info.append(elem.attrib['ddeService'])
            if 'ddeTopic' in elem.attrib:
                link_info.append(elem.attrib['ddeTopic'])
            dde_links.append(u' '.join(link_info))

    # binary parts, e.g. contained in .xlsb
    for subfile, content_type, handle in parser.iter_non_xml():
        if content_type == 'application/vnd.openxmlformats-officedocument.' + \
                'spreadsheetml.printerSettings':
            continue   # printer settings
        if not content_type.startswith(('application/vnd.ms-excel.',
                                        'application/vnd.ms-office.')):
            logging.warning('Unexpected content type: ' + content_type)
            # try parsing anyway

        logging.info('Parsing non-xml subfile {0} with content type {1}'
                     .format(subfile, content_type))
        for record in xls_parser.parse_xlsb_part(handle, content_type, subfile):
            logging.debug('{0}: {1}'.format(subfile, record))
            if isinstance(record, xls_parser.XlsbBeginSupBook) and \
                    record.link_type == \
                    xls_parser.XlsbBeginSupBook.LINK_TYPE_DDE:
                dde_links.append('DDE-Link ' + record.string1 + ' ' +
                                 record.string2)

    return u'\n'.join(dde_links)
Пример #9
def process_docx(filepath, field_filter_mode=None):
    """ find dde-links (and other fields) in Word 2007+ files

    Simple fields contribute their instruction attribute directly. Complex
    fields are assembled from the instruction texts found between a "begin"
    and the matching "end" field character.

    :param filepath: file to parse, handed to :py:class:`ooxml.XmlParser`
    :param field_filter_mode: FIELD_FILTER_ALL or None (return every field),
                              FIELD_FILTER_DDE (only fields matching
                              FIELD_DDE_REGEX) or FIELD_FILTER_BLACKLIST (drop
                              fields for which field_is_blacklisted is true)
    :returns: the selected fields, one per line
    :raises BadOOXML: if an element of a paragraph turns out to be None
    :raises ValueError: if field_filter_mode has an unexpected value
    """
    parser = ooxml.XmlParser(filepath)
    all_fields = []
    level = 0
    ddetext = u''
    for _, subs, depth in parser.iter_xml(tags=TAG_W_P + TAG_W_FLDSIMPLE):
        if depth == 0:   # at end of subfile:
            level = 0    # reset
        if subs.tag in TAG_W_FLDSIMPLE:
            # concatenate the attribute of the field, if present:
            # NOTE(review): second lookup reconstructed from the truncated
            # "or" continuation; assumes ATTR_W_INSTR holds two spellings
            attrib_instr = subs.attrib.get(ATTR_W_INSTR[0]) or \
                subs.attrib.get(ATTR_W_INSTR[1])
            if attrib_instr is not None:
                all_fields.append(unquote(attrib_instr))
            continue

        # have a TAG_W_P
        for curr_elem in subs:
            # check if w:r; parse children to pull out first FLDCHAR/INSTRTEXT
            elem = None
            if curr_elem.tag in TAG_W_R:
                for child in curr_elem:
                    if child.tag in TAG_W_FLDCHAR or \
                            child.tag in TAG_W_INSTRTEXT:
                        elem = child
                        break   # only the first one is of interest
                if elem is None:
                    continue   # no fldchar or instrtext in this w:r
            else:
                elem = curr_elem
            if elem is None:
                raise BadOOXML(filepath, 'Got "None"-Element from iter_xml')

            # check if FLDCHARTYPE and whether "begin" or "end" tag
            # NOTE(review): second lookup reconstructed, see ATTR_W_INSTR above
            attrib_type = elem.attrib.get(ATTR_W_FLDCHARTYPE[0]) or \
                elem.attrib.get(ATTR_W_FLDCHARTYPE[1])
            if attrib_type is not None:
                if attrib_type == "begin":
                    level += 1
                if attrib_type == "end":
                    level -= 1
                    if level == 0 or level == -1:  # edge-case; level gets -1
                        # outermost field is complete: store its text
                        all_fields.append(ddetext)
                        ddetext = u''
                        level = 0  # reset edge-case

            # concatenate the text of the field, if present:
            if elem.tag in TAG_W_INSTRTEXT and elem.text is not None:
                # expand field code if QUOTED
                ddetext += unquote(elem.text)

    # apply field command filter
    log.debug('filtering with mode "{0}"'.format(field_filter_mode))
    if field_filter_mode in (FIELD_FILTER_ALL, None):
        clean_fields = all_fields
    elif field_filter_mode == FIELD_FILTER_DDE:
        clean_fields = [field for field in all_fields
                        if FIELD_DDE_REGEX.match(field)]
    elif field_filter_mode == FIELD_FILTER_BLACKLIST:
        # check if fields are acceptable and should not be returned
        clean_fields = [field for field in all_fields
                        if not field_is_blacklisted(field.strip())]
    else:
        raise ValueError('Unexpected field_filter_mode: "{0}"'
                         .format(field_filter_mode))

    return u'\n'.join(clean_fields)