示例#1
0
def process_xlsx(filepath):
    """Collect DDE links from an OOXML excel file (.xlsx, .xlsb, .xlsm, ...).

    First every xml element yielded by the parser is inspected for a
    ``ddeLink`` tag (with or without namespace prefix); afterwards all
    non-xml parts are handed to the xlsb record parser and searched for
    external-book records of DDE link type.

    :param filepath: handed unchanged to ``ooxml.XmlParser``
    :returns: unicode text with one line per link found, each line starting
              with ``DDE-Link``; empty if no link was found
    """
    found = []
    parser = ooxml.XmlParser(filepath)

    # pass 1: xml parts
    for _, elem, _ in parser.iter_xml():
        tag = elem.tag.lower()
        if tag != 'ddelink' and not tag.endswith('}ddelink'):
            continue
        # this is a dde link; add service and topic if the element has them
        parts = ['DDE-Link']
        for attr_name in ('ddeService', 'ddeTopic'):
            if attr_name in elem.attrib:
                parts.append(elem.attrib[attr_name])
        found.append(u' '.join(parts))

    # pass 2: binary parts, e.g. contained in .xlsb
    for subfile, content_type, handle in parser.iter_non_xml():
        try:
            logging.info(
                'Parsing non-xml subfile {0} with content type {1}'.format(
                    subfile, content_type))
            records = xls_parser.parse_xlsb_part(handle, content_type, subfile)
            for record in records:
                logging.debug('{0}: {1}'.format(subfile, record))
                if not isinstance(record, xls_parser.XlsbBeginSupBook):
                    continue
                if record.link_type == \
                        xls_parser.XlsbBeginSupBook.LINK_TYPE_DDE:
                    found.append('DDE-Link ' + record.string1 + ' ' +
                                 record.string2)
        except Exception:
            # how loud we complain depends on what kind of part failed
            if content_type.startswith(('application/vnd.ms-excel.',
                                        'application/vnd.ms-office.')):
                # should really be able to parse these as xml or as records
                report = logging.warning
            elif content_type.startswith('image/') or content_type == (
                    'application/vnd.openxmlformats-officedocument.'
                    'spreadsheetml.printerSettings'):
                # understandable that these are not record-based
                report = logging.debug
            else:
                # anything else
                report = logging.info
            report('Failed to parse {0} of content type {1}'.format(
                subfile, content_type))
            # whatever happened: go on with the next part

    return u'\n'.join(found)
示例#2
0
def process_xlsx(filepath):
    """Process an OOXML excel file (e.g. .xlsx or .xlsb or .xlsm).

    Looks for DDE links in two places: ``ddeLink`` elements in the xml
    parts of the file, and DDE-type external-book records in the non-xml
    (binary) parts such as those contained in .xlsb files.

    A binary part that cannot be parsed is logged (together with the
    exception text) and skipped; it never aborts the scan.

    :param filepath: handed unchanged to ``ooxml.XmlParser``
    :returns: unicode text with one line per link found, each line starting
              with ``DDE-Link``; empty if no link was found
    """
    dde_links = []
    parser = ooxml.XmlParser(filepath)
    for _, elem, _ in parser.iter_xml():
        tag = elem.tag.lower()
        if tag == 'ddelink' or tag.endswith('}ddelink'):
            # we have found a dde link. Try to get more info about it
            link_info = ['DDE-Link']
            if 'ddeService' in elem.attrib:
                link_info.append(elem.attrib['ddeService'])
            if 'ddeTopic' in elem.attrib:
                link_info.append(elem.attrib['ddeTopic'])
            dde_links.append(u' '.join(link_info))

    # binary parts, e.g. contained in .xlsb
    for subfile, content_type, handle in parser.iter_non_xml():
        try:
            logger.info('Parsing non-xml subfile {0} with content type {1}'
                        .format(subfile, content_type))
            for record in xls_parser.parse_xlsb_part(handle, content_type,
                                                     subfile):
                logger.debug('{0}: {1}'.format(subfile, record))
                if isinstance(record, xls_parser.XlsbBeginSupBook) and \
                        record.link_type == \
                        xls_parser.XlsbBeginSupBook.LINK_TYPE_DDE:
                    dde_links.append('DDE-Link ' + record.string1 + ' ' +
                                     record.string2)
        # The exception must be bound to a name: the log call below formats
        # it. Without "as exc" the handler itself fails with NameError and
        # the whole scan is aborted instead of continuing with the next part.
        except Exception as exc:
            if content_type.startswith(('application/vnd.ms-excel.',
                                        'application/vnd.ms-office.')):
                # should really be able to parse these either as xml or records
                log_func = logger.warning
            elif content_type.startswith('image/') or content_type == \
                    'application/vnd.openxmlformats-officedocument.' + \
                    'spreadsheetml.printerSettings':
                # understandable that these are not record-based
                log_func = logger.debug
            else:   # default
                log_func = logger.info
            log_func('Failed to parse {0} of content type {1} ("{2}")'
                     .format(subfile, content_type, str(exc)))
            # in any case: continue with next

    return u'\n'.join(dde_links)
示例#3
0
def process_xlsx(filepath):
    """Collect DDE link descriptions from an OOXML excel file.

    Handles the xml parts (``ddeLink`` elements, with or without namespace
    prefix) as well as binary parts such as those in .xlsb files (external
    book records of DDE link type). Binary parts that fail to parse are
    logged together with the exception text and then skipped.

    :param filepath: handed unchanged to ``ooxml.XmlParser``
    :returns: unicode text with one line per link; each line consists of
              the space-separated service and topic strings that were found
    """
    found = []
    parser = ooxml.XmlParser(filepath)

    # xml parts
    for _, elem, _ in parser.iter_xml():
        tag = elem.tag.lower()
        if tag != "ddelink" and not tag.endswith("}ddelink"):
            continue
        # a dde link: take service and topic if the element carries them
        parts = []
        for attr_name in ("ddeService", "ddeTopic"):
            if attr_name in elem.attrib:
                parts.append(elem.attrib[attr_name])
        found.append(u" ".join(parts))

    # binary parts, e.g. contained in .xlsb
    for subfile, content_type, handle in parser.iter_non_xml():
        try:
            logger.info("Parsing non-xml subfile {0} with content type {1}".format(subfile, content_type))
            records = xls_parser.parse_xlsb_part(handle, content_type, subfile)
            for record in records:
                logger.debug("{0}: {1}".format(subfile, record))
                if not isinstance(record, xls_parser.XlsbBeginSupBook):
                    continue
                if record.link_type == xls_parser.XlsbBeginSupBook.LINK_TYPE_DDE:
                    found.append(record.string1 + " " + record.string2)
        except Exception as exc:
            # severity of the message depends on the kind of part that failed
            if content_type.startswith(("application/vnd.ms-excel.", "application/vnd.ms-office.")):
                # should really be able to parse these either as xml or records
                report = logger.warning
            elif content_type.startswith("image/") or content_type == (
                "application/vnd.openxmlformats-officedocument." "spreadsheetml.printerSettings"
            ):
                # understandable that these are not record-based
                report = logger.debug
            else:
                # anything else
                report = logger.info
            report('Failed to parse {0} of content type {1} ("{2}")'.format(subfile, content_type, str(exc)))
            # whatever happened: go on with the next part

    return u"\n".join(found)
示例#4
0
def process_xlsx(filepath, filed_filter_mode=None):
    """Collect DDE links from an OOXML excel file (.xlsx, .xlsb, .xlsm, ...).

    The xml parts are searched for ``ddeLink`` elements; the non-xml parts
    (except printer settings, which are skipped) are handed to the xlsb
    record parser and searched for external-book records of DDE link type.
    Errors raised while parsing a binary part are not caught here.

    :param filepath: handed unchanged to ``ooxml.XmlParser``
    :param filed_filter_mode: accepted but not used by this function
    :returns: unicode text with one line per link found, each line starting
              with ``DDE-Link``; empty if no link was found
    """
    found = []
    parser = ooxml.XmlParser(filepath)

    # xml parts
    for _, elem, _ in parser.iter_xml():
        tag = elem.tag.lower()
        if tag != 'ddelink' and not tag.endswith('}ddelink'):
            continue
        # a dde link: add service and topic if the element carries them
        parts = ['DDE-Link']
        for attr_name in ('ddeService', 'ddeTopic'):
            if attr_name in elem.attrib:
                parts.append(elem.attrib[attr_name])
        found.append(u' '.join(parts))

    # binary parts, e.g. contained in .xlsb
    for subfile, content_type, handle in parser.iter_non_xml():
        if content_type == ('application/vnd.openxmlformats-officedocument.'
                            'spreadsheetml.printerSettings'):
            # printer settings: nothing to find in there
            continue
        if not content_type.startswith(('application/vnd.ms-excel.',
                                        'application/vnd.ms-office.')):
            # not what we expect in an excel file, but try parsing anyway
            logging.warning('Unexpected content type: ' + content_type)

        logging.info('Parsing non-xml subfile {0} with content type {1}'
                     .format(subfile, content_type))
        records = xls_parser.parse_xlsb_part(handle, content_type, subfile)
        for record in records:
            logging.debug('{0}: {1}'.format(subfile, record))
            if not isinstance(record, xls_parser.XlsbBeginSupBook):
                continue
            if record.link_type == xls_parser.XlsbBeginSupBook.LINK_TYPE_DDE:
                found.append('DDE-Link ' + record.string1 + ' ' +
                             record.string2)

    return u'\n'.join(found)