Example #1
# Imports needed to run this excerpt on its own.
import unicodedata

from ftfy import fixes


def fix_text_segment(
    text,
    *,
    fix_entities='auto',
    remove_terminal_escapes=True,
    fix_encoding=True,
    fix_latin_ligatures=True,
    fix_character_width=True,
    uncurl_quotes=True,
    fix_line_breaks=True,
    fix_surrogates=True,
    remove_control_chars=True,
    remove_bom=True,
    normalization='NFC'
):
    """
    Apply fixes to text in a single chunk. This could be a line of text
    within a larger run of `fix_text`, or it could be a larger amount
    of text that you are certain is in a consistent encoding.

    See `fix_text` for a description of the parameters.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    if fix_entities == 'auto' and '<' in text and '>' in text:
        fix_entities = False
    while True:
        origtext = text
        if remove_terminal_escapes:
            text = fixes.remove_terminal_escapes(text)
        if fix_encoding:
            text = fixes.fix_encoding(text)
        if fix_entities:
            text = fixes.unescape_html(text)
        if fix_latin_ligatures:
            text = fixes.fix_latin_ligatures(text)
        if fix_character_width:
            text = fixes.fix_character_width(text)
        if uncurl_quotes:
            text = fixes.uncurl_quotes(text)
        if fix_line_breaks:
            text = fixes.fix_line_breaks(text)
        if fix_surrogates:
            text = fixes.fix_surrogates(text)
        if remove_control_chars:
            text = fixes.remove_control_chars(text)
        if remove_bom and not remove_control_chars:
            # Skip this step if we've already done `remove_control_chars`,
            # because it would be redundant.
            text = fixes.remove_bom(text)
        if normalization is not None:
            text = unicodedata.normalize(normalization, text)
        if text == origtext:
            return text
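
For reference, a minimal usage sketch (the mojibake sample comes from ftfy's own documentation; each keyword flag above can be switched off per call):

    >>> fix_text_segment('âœ” No problems')
    '✔ No problems'
    >>> fix_text_segment('&ldquo;smart quotes&rdquo;')
    '"smart quotes"'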
Example #2
File: parse.py  Project: ssunqf/mana
import re

import numpy as np
from ftfy import fixes

# `patterns` (tuples of field name, type, regex, normalizer) and `set_field`
# are defined elsewhere in parse.py and are not part of this excerpt.


def parse(file_name: str):
    file_name = fixes.fix_character_width(file_name)

    infos = {}
    masks = np.zeros(len(file_name), dtype='bool')
    clean_name = file_name.replace('_', ' ')
    # For each known field pattern, mask the matched span in the cleaned
    # name and record the normalized field value.
    for name, val_type, pattern, norm_func in patterns:
        matches = re.findall(pattern, clean_name)
        if len(matches) == 0:
            continue

        for match in matches:
            # Mask the first occurrence of the matched span so it is
            # excluded from phrase extraction below.
            index = clean_name.index(match[0])
            masks[index:index + len(match[0])] = True

            set_field(infos, name, val_type, norm_func(match[1]))

    def normalize(part: str):
        # Keep the text before any parenthetical, drop a leading '- ', and
        # undo '.'- and '_'-as-separator naming conventions.
        title = part.split('(')[0]
        if title.startswith('- '):
            title = title[2:]
        if '.' in title and ' ' not in title:
            title = title.replace('.', ' ')
        title = title.replace('_', ' ')
        return title

    # Mask bracket and quote characters (ASCII and full-width) so they act
    # as phrase boundaries.
    for i, c in enumerate(file_name):
        if c in '()[]()【】“”"':
            masks[i] = True

    start = 0
    phrases = []
    # Walk the unmasked runs of the name; each run becomes a candidate phrase.
    while start < len(file_name):
        if not masks[start]:
            end = start + 1
            while end < len(masks) and not masks[end]:
                end += 1
            phrase = normalize(file_name[start:end])
            phrases.append(phrase.strip('._-@[]【 】()()'))
            start = end
        else:
            start += 1
    if phrases:
        # Heuristic: take the longest leftover phrase as the title.
        set_field(infos, 'Title', str, max(phrases, key=len))

    return infos, [p for p in phrases if len(p) > 0]
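
A hypothetical invocation (the file name is made up; the fields actually extracted depend on the module-level `patterns` table, which this excerpt does not show):

    infos, phrases = parse('【Group】Some.Show.2019.mkv')
    # `infos` maps field names to extracted values, with 'Title' set to the
    # longest leftover phrase; `phrases` lists the non-empty leftover fragments.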
Example #3
from ftfy import fixes  # `tokenize` and `stopwords` are defined elsewhere in the project

def make_tsquery(query: str):
    # Width-normalize, tokenize, drop stop words and non-alphanumeric tokens,
    # then AND the terms together in PostgreSQL tsquery syntax.
    words = tokenize(fixes.fix_character_width(query))
    return ' & '.join(
        [word for word in words if word not in stopwords and word.isalnum()])
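
A quick sketch of the intended effect (exact output depends on how the project's `tokenize` and `stopwords` are defined):

    make_tsquery('ＦＵＬＬＷＩＤＴＨ search in the index')
    # fix_character_width narrows 'ＦＵＬＬＷＩＤＴＨ' to 'FULLWIDTH'; after
    # tokenizing and stop-word removal this yields something like
    # 'FULLWIDTH & search & index', ready for PostgreSQL's to_tsquery().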