Python remove_invalid_xml示例，hscommon.util.remove_invalid_xml Python示例

示例#1

0

显示文件

文件： app_pe.py 项目： badcure/dupeguru

def get_iphoto_or_aperture_pictures(plistpath: Path, photo_class):
    """Return a ``photo_class`` instance for every image listed in an iPhoto/Aperture plist.

    The structure of iPhoto and Aperture libraries for the base photo list are exactly the same,
    so this single function serves both.

    :param plistpath: path of the library's XML plist; it is read as UTF-8 text.
    :param photo_class: callable invoked as ``photo_class(photo_path, key)`` for each entry of
        ``"Master Image List"`` whose ``"MediaType"`` is ``"Image"``.
    :returns: list of the created photo objects, or ``[]`` when ``plistpath`` doesn't exist.
    :raises Exception: any parser error is logged (with ``parser.lastdata``) and re-raised.
    """
    if not plistpath.exists():
        return []
    # Read through a context manager so the file handle is closed deterministically.
    with plistpath.open("rt", encoding="utf-8") as fp:
        s = fp.read()
    # There was a case where a guy had 0x10 chars in his plist, causing expat errors on loading
    s = remove_invalid_xml(s, replace_with="")
    # It seems that iPhoto sometimes doesn't properly escape & chars. The regexp below is to find
    # any & char that is not a &-based entity (&amp;, &quot;, etc.). based on TextMate's XML
    # bundle's regexp. The alternatives are grouped so that the terminating ";" is required for
    # *every* entity form; ungrouped, it only applied to the hexadecimal form and a bare "&name"
    # slipped through.
    s, count = re.subn(r"&(?!(?:[a-zA-Z0-9_-]+|#[0-9]+|#x[0-9a-fA-F]+);)", "", s)
    if count:
        logging.warning("%d invalid XML entities replacement made", count)
    parser = IPhotoPlistParser()
    try:
        plist = parser.parse(io.BytesIO(s.encode("utf-8")))
    except Exception:
        logging.warning("iPhoto plist parsing choked on data: %r", parser.lastdata)
        raise
    # Only entries flagged as images are of interest; other media types are skipped.
    return [
        photo_class(Path(photo_data["ImagePath"]), key)
        for key, photo_data in plist["Master Image List"].items()
        if photo_data["MediaType"] == "Image"
    ]

示例#2

0

显示文件

文件： app_pe.py 项目： legwak/dupeguru

def get_iphoto_or_aperture_pictures(plistpath: Path, photo_class):
    """Build the list of pictures described by an iPhoto or Aperture library plist.

    The structure of iPhoto and Aperture libraries for the base photo list are exactly the same.

    :param plistpath: path of the XML plist to read (decoded as UTF-8).
    :param photo_class: called as ``photo_class(photo_path, key)`` for each item of the plist's
        ``'Master Image List'`` whose ``'MediaType'`` equals ``'Image'``.
    :returns: list of ``photo_class`` results; empty list if ``plistpath`` does not exist.
    :raises Exception: parsing failures are logged along with ``parser.lastdata``, then re-raised.
    """
    if not plistpath.exists():
        return []
    # The "with" block guarantees the plist file is closed once its content has been read.
    with plistpath.open('rt', encoding='utf-8') as fp:
        s = fp.read()
    # There was a case where a guy had 0x10 chars in his plist, causing expat errors on loading
    s = remove_invalid_xml(s, replace_with='')
    # It seems that iPhoto sometimes doesn't properly escape & chars. The regexp below is to find
    # any & char that is not a &-based entity (&amp;, &quot;, etc.). based on TextMate's XML
    # bundle's regexp. The (?:...) group makes the trailing ";" mandatory for named, decimal and
    # hexadecimal entities alike; without it, only the hexadecimal alternative required it.
    s, count = re.subn(r'&(?!(?:[a-zA-Z0-9_-]+|#[0-9]+|#x[0-9a-fA-F]+);)', '', s)
    if count:
        logging.warning("%d invalid XML entities replacement made", count)
    parser = IPhotoPlistParser()
    try:
        plist = parser.parse(io.BytesIO(s.encode('utf-8')))
    except Exception:
        logging.warning("iPhoto plist parsing choked on data: %r",
                        parser.lastdata)
        raise
    result = []
    for key, photo_data in plist['Master Image List'].items():
        # Entries that aren't images are ignored.
        if photo_data['MediaType'] != 'Image':
            continue
        photo_path = Path(photo_data['ImagePath'])
        result.append(photo_class(photo_path, key))
    return result

示例#3

0

显示文件

def fix_text(text):
    """Return ``text`` cleaned of invalid XML characters, ``&dquo;`` markers and stray spaces.

    The steps run in a fixed order: invalid XML removal, ``&dquo;`` replacement, then the two
    whitespace substitutions (``RE_MULTIPLE_SPACES`` followed by ``RE_NEWLINE_AND_SPACE``).
    """
    # If we don't remove invalid XML characters, we'll get crashes on ebook creation and reloading
    # of masherproj files.
    cleaned = remove_invalid_xml(text)

    # Heuristic discovered from sample pdfs: "&dquo;" shows up where the pdf had quotes. It looks
    # like an html escape but it isn't one, so it is simply turned into a plain double quote.
    unquoted = cleaned.replace('&dquo;', '"')

    # Normalize spaces: runs matched by RE_MULTIPLE_SPACES become a single space, then matches
    # of RE_NEWLINE_AND_SPACE become a bare newline.
    single_spaced = RE_MULTIPLE_SPACES.sub(' ', unquoted)
    return RE_NEWLINE_AND_SPACE.sub('\n', single_spaced)

示例#4

0

显示文件

文件： pdf.py 项目： hsoft/pdfmasher

def fix_text(text):
    """Sanitize text coming from a pdf before it is stored or exported.

    Invalid XML characters are removed, ``&dquo;`` is replaced by ``"`` and whitespace is
    normalized through ``RE_MULTIPLE_SPACES`` and ``RE_NEWLINE_AND_SPACE``, in that order.
    """
    # Invalid XML characters have to go first, otherwise ebook creation and reloading of
    # masherproj files crash. "&dquo;" looks like an html escape but isn't: sample pdfs showed it
    # stands for a quote, so it is replaced by one.
    result = remove_invalid_xml(text).replace('&dquo;', '"')

    # Space normalization as an ordered list of (pattern, replacement) passes: multiple spaces
    # first, then spaces around newlines.
    whitespace_passes = (
        (RE_MULTIPLE_SPACES, ' '),
        (RE_NEWLINE_AND_SPACE, '\n'),
    )
    for pattern, replacement in whitespace_passes:
        result = pattern.sub(replacement, result)
    return result

示例#5

0

显示文件

文件： app_me.py 项目： LJnotions/dupeguru

def get_itunes_songs(plistpath):
    """Return an ``ITunesSong`` for each file track of an iTunes XML library that exists on disk.

    :param plistpath: path object of the iTunes library plist; it is read as UTF-8 text.
    :returns: list of ``ITunesSong`` built from the plist's ``'Tracks'`` values whose
        ``'Track Type'`` is ``'File'`` and whose ``song.path.exists()`` is true. An empty list is
        returned when ``plistpath`` itself doesn't exist.
    """
    if not plistpath.exists():
        return []
    # Read through a context manager so the file handle doesn't leak.
    with plistpath.open('rt', encoding='utf-8') as fp:
        s = fp.read()
    # iTunes sometimes produces XML files with invalid characters in it.
    s = remove_invalid_xml(s, replace_with='')
    # plistlib.readPlistFromBytes() was deprecated in Python 3.4 and removed in 3.9. Prefer its
    # replacement, plistlib.loads(), and only fall back on interpreters that predate it.
    load_plist = getattr(plistlib, 'loads', None) or plistlib.readPlistFromBytes
    plist = load_plist(s.encode('utf-8'))
    result = []
    for song_data in plist['Tracks'].values():
        try:
            if song_data['Track Type'] != 'File':
                continue
            song = ITunesSong(song_data)
        except KeyError: # No "Track Type", "Location" or "Track ID" key in track
            continue
        if song.path.exists():
            result.append(song)
    return result

示例#6

0

显示文件

def get_itunes_songs(plistpath):
    """Load the songs referenced by an iTunes XML library.

    :param plistpath: path object of the library plist (decoded as UTF-8).
    :returns: list of ``ITunesSong`` instances, one per entry of the plist's ``'Tracks'`` whose
        ``'Track Type'`` is ``'File'`` and whose path exists; ``[]`` if ``plistpath`` is missing.
        Tracks for which a ``KeyError`` is raised while being read are skipped.
    """
    if not plistpath.exists():
        return []
    # The "with" block closes the library file as soon as its content is in memory.
    with plistpath.open('rt', encoding='utf-8') as fp:
        s = fp.read()
    # iTunes sometimes produces XML files with invalid characters in it.
    s = remove_invalid_xml(s, replace_with='')
    # readPlistFromBytes() no longer exists as of Python 3.9; loads() is its replacement
    # (available since 3.4). Keep the old function as a fallback for older interpreters.
    load_plist = getattr(plistlib, 'loads', None) or plistlib.readPlistFromBytes
    plist = load_plist(s.encode('utf-8'))
    result = []
    for song_data in plist['Tracks'].values():
        try:
            if song_data['Track Type'] != 'File':
                continue
            song = ITunesSong(song_data)
        except KeyError:  # No "Track Type", "Location" or "Track ID" key in track
            continue
        if song.path.exists():
            result.append(song)
    return result

示例#7

0

显示文件

文件： app_pe.py 项目： astrofrog/dupeguru

def get_iphoto_or_aperture_pictures(plistpath, photo_class):
    """Return a ``photo_class`` instance for every image listed in an iPhoto/Aperture plist.

    The structure of iPhoto and Aperture libraries for the base photo list are exactly the same.

    NOTE(review): ``io`` must provide ``exists(path)`` here, which the standard library ``io``
    module does not -- presumably a project-local module; confirm against the file's imports.

    :param plistpath: path of the library's XML plist; it is read as UTF-8 text.
    :param photo_class: callable invoked as ``photo_class(photo_path, key)`` for each entry of
        ``'Master Image List'`` whose ``'MediaType'`` is ``'Image'``.
    :returns: list of the created photo objects, or ``[]`` when ``plistpath`` doesn't exist.
    """
    if not io.exists(plistpath):
        return []
    # Close the file deterministically (assumes io.open() returns a regular file object that
    # supports the context manager protocol -- TODO confirm).
    with io.open(plistpath, 'rt', encoding='utf-8') as fp:
        s = fp.read()
    # There was a case where a guy had 0x10 chars in his plist, causing expat errors on loading
    s = remove_invalid_xml(s, replace_with='')
    # It seems that iPhoto sometimes doesn't properly escape & chars. The regexp below is to find
    # any & char that is not a &-based entity (&amp;, &quot;, etc.). based on TextMate's XML
    # bundle's regexp. The alternatives are grouped so that the terminating ";" is required for
    # every entity form; ungrouped, it only applied to the hexadecimal one.
    s, count = re.subn(r'&(?!(?:[a-zA-Z0-9_-]+|#[0-9]+|#x[0-9a-fA-F]+);)', '', s)
    if count:
        logging.warning("%d invalid XML entities replacement made", count)
    # plistlib.readPlistFromBytes() was deprecated in Python 3.4 and removed in 3.9. Prefer its
    # replacement, plistlib.loads(), and only fall back on interpreters that predate it.
    load_plist = getattr(plistlib, 'loads', None) or plistlib.readPlistFromBytes
    plist = load_plist(s.encode('utf-8'))
    result = []
    for key, photo_data in plist['Master Image List'].items():
        # Entries that aren't images are ignored.
        if photo_data['MediaType'] != 'Image':
            continue
        photo_path = Path(photo_data['ImagePath'])
        photo = photo_class(photo_path, key)
        result.append(photo)
    return result

示例#8

0

显示文件

文件： native.py 项目： tautonic/moneyguru

def save(filename, document_id, properties, accounts, groups, transactions,
         schedules, budgets):
    """Serialize a document to ``filename`` as a ``moneyguru-file`` XML tree.

    Every attribute value of the tree goes through ``remove_invalid_xml()`` before writing.

    :param filename: destination path; ``ensure_folder()`` is called on its parent folder.
    :param document_id: stored as the root element's ``document_id`` attribute.
    :param properties: mapping saved as attributes of the ``properties`` element. The
        ``default_currency`` value is saved through its ``.code``, others through ``str()``.
    :param accounts: iterable saved as ``account`` elements.
    :param groups: iterable saved as ``group`` elements (``name`` and ``type``).
    :param transactions: iterable saved as ``transaction`` elements with ``split`` children.
    :param schedules: iterable of recurrences; only those with a true ``is_alive`` are saved.
    :param budgets: iterable saved as ``budget`` elements.
    """
    def date2str(date):
        return date.strftime('%Y-%m-%d')

    def handle_newlines(s):
        # etree doesn't correctly save newlines. In fields that allow it, we have to escape them so
        # that we can restore them during load.
        # XXX It seems like newer version of etree do escape newlines. When we use Python 3.2, we
        # can probably remove this.
        if not s:
            return s
        return s.replace('\n', '\\n')

    def setattrib(attribs, attribname, value):
        # Falsy values (None, empty string) are simply not written.
        if value:
            attribs[attribname] = value

    def write_transaction_element(parent_element, transaction):
        transaction_element = ET.SubElement(parent_element, 'transaction')
        attrib = transaction_element.attrib
        attrib['date'] = date2str(transaction.date)
        setattrib(attrib, 'description', transaction.description)
        setattrib(attrib, 'payee', transaction.payee)
        setattrib(attrib, 'checkno', transaction.checkno)
        setattrib(attrib, 'notes', handle_newlines(transaction.notes))
        attrib['mtime'] = str(int(transaction.mtime))
        for split in transaction.splits:
            split_element = ET.SubElement(transaction_element, 'split')
            attrib = split_element.attrib
            attrib['account'] = split.account_name
            attrib['amount'] = format_amount(split.amount)
            setattrib(attrib, 'memo', split.memo)
            setattrib(attrib, 'reference', split.reference)
            if split.reconciliation_date is not None:
                attrib['reconciliation_date'] = date2str(
                    split.reconciliation_date)

    root = ET.Element('moneyguru-file')
    root.attrib['document_id'] = document_id
    props_element = ET.SubElement(root, 'properties')
    for name, value in properties.items():
        if name == 'default_currency':
            value = value.code
        else:
            value = str(value)
        props_element.attrib[name] = value
    for group in groups:
        group_element = ET.SubElement(root, 'group')
        attrib = group_element.attrib
        attrib['name'] = group.name
        attrib['type'] = group.type
    for account in accounts:
        account_element = ET.SubElement(root, 'account')
        attrib = account_element.attrib
        attrib['name'] = account.name
        attrib['currency'] = account.currency.code
        attrib['type'] = account.type
        if account.group:
            attrib['group'] = account.group.name
        if account.reference is not None:
            attrib['reference'] = account.reference
        if account.account_number:
            attrib['account_number'] = account.account_number
        if account.inactive:
            attrib['inactive'] = 'y'
        if account.notes:
            attrib['notes'] = handle_newlines(account.notes)
    for transaction in transactions:
        write_transaction_element(root, transaction)
    # the functionality of the line below is untested because it's an optimisation
    scheduled = [s for s in schedules if s.is_alive]
    for recurrence in scheduled:
        recurrence_element = ET.SubElement(root, 'recurrence')
        attrib = recurrence_element.attrib
        attrib['type'] = recurrence.repeat_type
        attrib['every'] = str(recurrence.repeat_every)
        if recurrence.stop_date is not None:
            attrib['stop_date'] = date2str(recurrence.stop_date)
        for date, change in recurrence.date2globalchange.items():
            change_element = ET.SubElement(recurrence_element, 'change')
            change_element.attrib['date'] = date2str(date)
            if change is not None:
                write_transaction_element(change_element, change)
        for date, exception in recurrence.date2exception.items():
            exception_element = ET.SubElement(recurrence_element, 'exception')
            exception_element.attrib['date'] = date2str(date)
            if exception is not None:
                write_transaction_element(exception_element, exception)
        write_transaction_element(recurrence_element, recurrence.ref)
    for budget in budgets:
        budget_element = ET.SubElement(root, 'budget')
        attrib = budget_element.attrib
        attrib['account'] = budget.account.name
        attrib['type'] = budget.repeat_type
        attrib['every'] = str(budget.repeat_every)
        attrib['amount'] = format_amount(budget.amount)
        attrib['notes'] = budget.notes
        if budget.target is not None:
            attrib['target'] = budget.target.name
        attrib['start_date'] = date2str(budget.start_date)
        if budget.stop_date is not None:
            attrib['stop_date'] = date2str(budget.stop_date)
    # Element.getiterator() was removed in Python 3.9; Element.iter() is its replacement. The
    # old name is only used on interpreters whose etree predates iter().
    iter_elements = getattr(root, 'iter', None) or root.getiterator
    for elem in iter_elements():
        attrib = elem.attrib
        # Iterate over a snapshot since the mapping is written to inside the loop.
        for key, value in list(attrib.items()):
            attrib[key] = remove_invalid_xml(value)
    tree = ET.ElementTree(root)
    ensure_folder(op.dirname(filename))
    # The context manager flushes and closes the file even if serialization fails midway.
    with open(filename, 'wt', encoding='utf-8') as fp:
        fp.write('<?xml version="1.0" encoding="utf-8"?>\n')
        # This 'unicode' encoding thing is only available (and necessary) from Python 3.2.
        # Compare the whole version tuple: checking the minor number alone misfires as soon as
        # the major version changes.
        if sys.version_info >= (3, 2):
            tree.write(fp, encoding='unicode')
        else:
            tree.write(fp)

示例#9

0

显示文件

文件： native.py 项目： daleathan/moneyguru

def save(filename, document_id, properties, accounts, groups, transactions, schedules, budgets):
    """Write the given document content to ``filename`` in the ``moneyguru-file`` XML format.

    All attribute values are passed through ``remove_invalid_xml()`` right before the tree is
    written, and the parent folder of ``filename`` is handed to ``ensure_folder()``.

    :param filename: path of the file to write (UTF-8 text, with an XML declaration).
    :param document_id: value of the root element's ``document_id`` attribute.
    :param properties: mapping of document properties; ``default_currency`` is saved as its
        ``.code``, every other value as ``str(value)``.
    :param accounts: iterable of accounts, each saved as an ``account`` element.
    :param groups: iterable of groups, each saved as a ``group`` element.
    :param transactions: iterable of transactions, each saved as a ``transaction`` element
        holding one ``split`` child per split.
    :param schedules: iterable of recurrences; those whose ``is_alive`` is false are skipped.
    :param budgets: iterable of budgets, each saved as a ``budget`` element.
    """
    def date2str(date):
        return date.strftime('%Y-%m-%d')

    def handle_newlines(s):
        # etree doesn't correctly save newlines. In fields that allow it, we have to escape them so
        # that we can restore them during load.
        # XXX It seems like newer version of etree do escape newlines. When we use Python 3.2, we
        # can probably remove this.
        if not s:
            return s
        return s.replace('\n', '\\n')

    def setattrib(attribs, attribname, value):
        # Only truthy values are written; None and empty strings leave the attribute out.
        if value:
            attribs[attribname] = value

    def write_transaction_element(parent_element, transaction):
        transaction_element = ET.SubElement(parent_element, 'transaction')
        attrib = transaction_element.attrib
        attrib['date'] = date2str(transaction.date)
        setattrib(attrib, 'description', transaction.description)
        setattrib(attrib, 'payee', transaction.payee)
        setattrib(attrib, 'checkno', transaction.checkno)
        setattrib(attrib, 'notes', handle_newlines(transaction.notes))
        attrib['mtime'] = str(int(transaction.mtime))
        for split in transaction.splits:
            split_element = ET.SubElement(transaction_element, 'split')
            attrib = split_element.attrib
            attrib['account'] = split.account_name
            attrib['amount'] = format_amount(split.amount)
            setattrib(attrib, 'memo', split.memo)
            setattrib(attrib, 'reference', split.reference)
            if split.reconciliation_date is not None:
                attrib['reconciliation_date'] = date2str(split.reconciliation_date)

    root = ET.Element('moneyguru-file')
    root.attrib['document_id'] = document_id
    props_element = ET.SubElement(root, 'properties')
    for name, value in properties.items():
        if name == 'default_currency':
            value = value.code
        else:
            value = str(value)
        props_element.attrib[name] = value
    for group in groups:
        group_element = ET.SubElement(root, 'group')
        attrib = group_element.attrib
        attrib['name'] = group.name
        attrib['type'] = group.type
    for account in accounts:
        account_element = ET.SubElement(root, 'account')
        attrib = account_element.attrib
        attrib['name'] = account.name
        attrib['currency'] = account.currency.code
        attrib['type'] = account.type
        if account.group:
            attrib['group'] = account.group.name
        if account.reference is not None:
            attrib['reference'] = account.reference
        if account.account_number:
            attrib['account_number'] = account.account_number
        if account.inactive:
            attrib['inactive'] = 'y'
        if account.notes:
            attrib['notes'] = handle_newlines(account.notes)
    for transaction in transactions:
        write_transaction_element(root, transaction)
    # the functionality of the line below is untested because it's an optimisation
    scheduled = [s for s in schedules if s.is_alive]
    for recurrence in scheduled:
        recurrence_element = ET.SubElement(root, 'recurrence')
        attrib = recurrence_element.attrib
        attrib['type'] = recurrence.repeat_type
        attrib['every'] = str(recurrence.repeat_every)
        if recurrence.stop_date is not None:
            attrib['stop_date'] = date2str(recurrence.stop_date)
        for date, change in recurrence.date2globalchange.items():
            change_element = ET.SubElement(recurrence_element, 'change')
            change_element.attrib['date'] = date2str(date)
            if change is not None:
                write_transaction_element(change_element, change)
        for date, exception in recurrence.date2exception.items():
            exception_element = ET.SubElement(recurrence_element, 'exception')
            exception_element.attrib['date'] = date2str(date)
            if exception is not None:
                write_transaction_element(exception_element, exception)
        write_transaction_element(recurrence_element, recurrence.ref)
    for budget in budgets:
        budget_element = ET.SubElement(root, 'budget')
        attrib = budget_element.attrib
        attrib['account'] = budget.account.name
        attrib['type'] = budget.repeat_type
        attrib['every'] = str(budget.repeat_every)
        attrib['amount'] = format_amount(budget.amount)
        attrib['notes'] = budget.notes
        if budget.target is not None:
            attrib['target'] = budget.target.name
        attrib['start_date'] = date2str(budget.start_date)
        if budget.stop_date is not None:
            attrib['stop_date'] = date2str(budget.stop_date)
    # Element.getiterator() is gone as of Python 3.9, so walk the tree with Element.iter() and
    # keep getiterator() only as a fallback for an etree old enough not to have iter().
    iter_elements = getattr(root, 'iter', None) or root.getiterator
    for elem in iter_elements():
        attrib = elem.attrib
        # Work on a snapshot of the items because the mapping is assigned to while looping.
        for key, value in list(attrib.items()):
            attrib[key] = remove_invalid_xml(value)
    tree = ET.ElementTree(root)
    ensure_folder(op.dirname(filename))
    # Using "with" makes sure the file is flushed and closed, including on a write error.
    with open(filename, 'wt', encoding='utf-8') as fp:
        fp.write('<?xml version="1.0" encoding="utf-8"?>\n')
        # This 'unicode' encoding thing is only available (and necessary) from Python 3.2. The
        # full (major, minor) tuple is compared; the minor number alone is not a reliable test.
        if sys.version_info >= (3, 2):
            tree.write(fp, encoding='unicode')
        else:
            tree.write(fp)