Python remove_excessive_whitespace示例

编程语言: Python

命名空间/包名称: typotools

方法/功能: remove_excessive_whitespace

hotexamples.com的示例: 2

Python remove_excessive_whitespace - 已找到2个示例。这些是从开源项目中提取的最受好评的typotools.remove_excessive_whitespace现实Python示例。您可以评价示例，以帮助我们提高示例质量。

示例#1

显示文件

文件： chgk_composer.py 项目： peczony/chgksuite

def parse_4s_elem(s):
    """
    Split a line of markup into a list of ``[label, text]`` parts.

    Processing steps, in order:

    1. Underscores inside every ``re_url`` match are backslash-escaped so
       they are not mistaken for emphasis markers.
    2. Every ``re_perc`` match is percent-decoded (longest matches first).
    3. The text is scanned for unescaped ``_`` and for ``(img`` openers;
       the positions found are handed to ``partition``.
    4. Each part is labelled: ``'em'`` for ``_..._`` spans, ``'img'`` for
       ``(img ...)`` spans, ``''`` for everything else.  Escaped ``\\_``
       sequences are finally turned back into plain ``_``.

    :param s: the text to parse (a unicode string under Python 2).
    :return: list of two-element lists ``[label, text]``.
    """

    def find_next_unescaped(ss, index):
        """
        Return the index of the next occurrence of ``ss[index]`` after
        ``index`` that is not preceded by a backslash, or -1 if none.
        """
        j = index + 1
        while j < len(ss):
            if ss[j] == '\\':
                # Skip the backslash together with the character it
                # escapes, then re-examine from scratch: the next
                # character may itself be a backslash, and an escaped
                # delimiter at the very end must not count as a closer.
                j += 2
                continue
            if ss[j] == ss[index]:
                return j
            j += 1
        return -1

    # finditer keeps iterating over the string it was given, so rebinding
    # `s` inside the loop does not disturb the iteration.
    for gr in re_url.finditer(s):
        gr0 = gr.group(0)
        s = s.replace(gr0, gr0.replace('_', '\\_'))

    # for gr in re_scaps.finditer(s):
    #     gr0 = gr.group(0)
    #     s = s.replace(gr0, '(sc '+gr0.lower()+')')

    # Longest matches first, so a shorter match that is a substring of a
    # longer one cannot break the longer one before it is replaced.
    grs = sorted((match.group(0) for match in re_perc.finditer(s)),
                 key=len, reverse=True)
    for gr in grs:
        try:
            s = s.replace(
                gr, urllib.unquote(gr.encode('utf8')).decode('utf8'))
        except Exception:
            # Best effort: an undecodable sequence is left untouched.
            debug_print('error decoding on line {}: {}\n'
                .format(gr, traceback.format_exc()))

    s = list(s)
    i = 0
    topart = []  # indices at which the text will be split
    while i < len(s):
        if s[i] == '_' and (i == 0 or s[i-1] != '\\'):
            debug_print('found _ at {} of line {}'
                .format(i, s))
            topart.append(i)
            closing = find_next_unescaped(s, i)
            if closing != -1:
                topart.append(closing + 1)
                i = closing + 2
                continue
        if (s[i] == '(' and i + len('(img') < len(s)
                and ''.join(s[i:i+len('(img')]) == '(img'):
            debug_print('img candidate')
            topart.append(i)
            closing = typotools.find_matching_closing_bracket(s, i)
            if closing is not None:
                topart.append(closing + 1)
                # NOTE(review): unlike the underscore branch there is no
                # `continue` here, so the `i += 1` below also runs and
                # scanning resumes at closing + 3.  Preserved from the
                # original -- confirm whether this is intended.
                i = closing + 2
        # if (s[i] == '(' and i + len('(sc') < len(s) and ''.join(s[i:
        #                     i+len('(sc')])=='(sc'):
        #     debug_print('sc candidate')
        #     topart.append(i)
        #     if not typotools.find_matching_closing_bracket(s, i) is None:
        #         topart.append(
        #             typotools.find_matching_closing_bracket(s, i)+1)
        #         i = typotools.find_matching_closing_bracket(s, i)+2
        i += 1

    topart = sorted(topart)

    # presumably `partition` cuts `s` at the given indices -- verify
    # against its definition.
    parts = [['', ''.join(x)] for x in partition(s, topart)]
    debug_print(pprint.pformat(parts).decode('unicode_escape'))

    for part in parts:
        if part == ['', '']:
            continue
        try:
            text = part[1]
            # A part is emphasis only when it ends with an *unescaped*
            # underscore.  A part that merely starts with one (an
            # unmatched opener) is deliberately left as literal text, as
            # in the original.  A trailing escaped '\_' is not a closer:
            # treating it as one used to drop the first character and
            # leave a stray backslash behind.
            if text.endswith('_') and not text.endswith('\\_'):
                part[0] = 'em'
                if text.startswith('_'):
                    text = text[1:]
                if text.endswith('_'):
                    text = text[:-1]
                part[1] = text
            if len(part[1]) > 4 and part[1][:4] == '(img':
                if part[1][-1] != ')':
                    part[1] = part[1] + ')'
                part[1] = typotools.remove_excessive_whitespace(
                    part[1][4:-1])
                part[0] = 'img'
                debug_print('found img at {}'
                    .format(pprint.pformat(part[1])))
            # NOTE(review): this branch can never fire -- a four-character
            # slice is compared against the three-character '(sc'.  The
            # matching scanner code above is commented out, so small-caps
            # support looks intentionally disabled; left unchanged.
            if len(part[1]) > 3 and part[1][:4] == '(sc':
                if part[1][-1] != ')':
                    part[1] = part[1] + ')'
                part[1] = typotools.remove_excessive_whitespace(
                    part[1][3:-1])
                part[0] = 'sc'
                debug_print('found img at {}'
                    .format(pprint.pformat(part[1])))
            part[1] = part[1].replace('\\_', '_')
        except Exception:
            # Best effort: report the failing part and keep going.
            sys.stderr.write('Error on part {}: {}'
                .format(pprint.pformat(part).decode('unicode_escape'),
                traceback.format_exc() ))

    return parts

示例#2

显示文件

文件： chgksuitelib.py 项目： peczony/chgksuite

    def __init__(self, text):
        """
        Parse raw ``text`` and store the result in ``self.structure``.

        Parsing rationale: every Question has two required fields: 'question'
        and the immediately following 'answer'. All the rest are optional,
        as is the order of these fields. On the other hand, everything
        except the 'question' is obligatorily marked, while the 'question' is
        optionally marked. But IF the question is not marked, 'meta' comments
        between Questions will not be parsed as 'meta' but will be merged to
        'question's.
        Parsing is done by regexes in the following steps:

        1. Identify all the fields you can, mark them with their respective
            labels, mark all the others with ''
        2. Merge fields inside Question with '' lines between them
        3. Ensure every 'answer' has a 'question'
        4. Mark all remaining '' fields as 'meta'
        5. Prettify input
        6. Pack Questions into dicts
        7. Store the resulting structure as self.structure

        :param text: the whole source text; it is split into lines on
            ``\\r?\\n`` and empty lines are dropped.
        """

        self.structure = []

        # 1.

        for x in re.split(r'\r?\n', text):
            if x != '':
                self.structure.append(['', remove_excessive_whitespace(x)])

        i = 0
        st = self.structure  # alias of the same list object
        while i < len(st):
            # Set of (label, start offset) for every regex found in the line.
            matching_regexes = {
                (regex, self.regexes[regex].search(st[i][1]).start(0))
                 for regex in self.regexes
                 if self.regexes[regex].search(st[i][1])}

            # If more than one regex matches string, split it and
            # insert into structure separately.

            if len(matching_regexes) == 1:
                st[i][0] = matching_regexes.pop()[0]
            elif len(matching_regexes) > 1:
                sorted_r = sorted(matching_regexes, key=lambda x: x[1])
                slices = []
                for j in range(1, len(sorted_r)):
                    slices.append(
                        [sorted_r[j][0], st[i][1][
                            sorted_r[j][1]
                             :
                            sorted_r[j+1][1] if j+1 < len(sorted_r)
                                                    else len(st[i][1])]])
                # Insert each slice right after the previous one so that
                # they keep their text order.  (Inserting all of them at
                # i+1 would leave three or more slices reversed.)
                for offset, slice_ in enumerate(slices, 1):
                    self.structure.insert(i + offset, slice_)
                st[i][0] = sorted_r[0][0]
                st[i][1] = st[i][1][:sorted_r[1][1]]
            i += 1

        # 2.

        self.merge_y_to_x('question','answer')
        self.merge_to_x_until_nextfield('answer')
        self.merge_to_x_until_nextfield('comment')

        # 3.

        i = 0
        while i < len(self.structure):
            # `i == 0` is tested explicitly: structure[-1] would otherwise
            # wrap around and compare against the LAST element.
            if (self.structure[i][0] == 'answer'
                and (i == 0 or self.structure[i-1][0] not in ('question',
                    'newquestion'))):
                self.structure.insert(i,['newquestion',''])
                # Restart the scan; together with the increment below it
                # resumes from index 1.
                i = 0
            i += 1

        i = 0
        while i < len(self.structure) - 1:
            if (self.structure[i][0] == ''
                and self.structure[i+1][0] == 'newquestion'):
                self.merge_to_next(i)
                # NOTE(review): at i == 0 the `i-1` below refers to the last
                # element of the structure -- confirm this is acceptable.
                if (re_number.search(
                    remove_excessive_whitespace(self.structure[i][1]))
                and not re_number.search(
                    remove_excessive_whitespace(self.structure[i-1][1]))):
                    self.structure[i][0] = 'question'
                    self.structure[i][1] = re_number.sub(
                        '',remove_excessive_whitespace(self.structure[i][1]))
                    # NOTE(review): the search below runs on text from which
                    # re_number matches were just removed, so it normally
                    # finds nothing and the resulting error is swallowed.
                    # Extracting the number before the substitution is
                    # probably what was meant; left unchanged because it
                    # would add 'number' entries that later steps may not
                    # expect.
                    try:
                        self.structure.insert(i,
                            ['number', int(re_number.search(
                                remove_excessive_whitespace(
                                    self.structure[i][1])
                                ).group(0))])
                    except Exception:
                        pass # TODO: figure out what this means
                i = 0
            i += 1

        for element in self.structure:
            if element[0] == 'newquestion':
                element[0] = 'question'

        self.dirty_merge_to_x_until_nextfield('source')

        for idx, element in enumerate(self.structure):
            if (element[0] == 'author' and re.search(r'^{}$'.format(re_author.
                pattern),
                remove_excessive_whitespace(element[1]))
                and idx + 1 < len(self.structure)):
                # NOTE(review): the original called a bare
                # `merge_to_previous`, while every other merge helper here
                # is invoked through `self`; this assumes a method of that
                # name exists on the class -- confirm.
                self.merge_to_previous(idx+1)

        self.merge_to_x_until_nextfield('accept')
        self.merge_to_x_until_nextfield('reject')

        # 4.

        self.structure = [x for x in self.structure
            if [x[0], remove_excessive_whitespace(x[1])]
            != ['', '']]

        # NOTE(review): raises IndexError when nothing survived the filter
        # above (e.g. empty input) -- confirm callers never pass such text.
        if self.structure[0][0] == '' and re_number.search(
            remove_excessive_whitespace(self.structure[0][1])):
            self.merge_to_next(0)

        for idx, element in enumerate(self.structure):
            if element[0] == '':
                element[0] = 'meta'
            if (element[0] in self.regexes
                and element[0] not in ['tour', 'tourrev']):
                if element[0] == 'question':
                    # Inserting at idx shifts the current element to idx+1,
                    # so the loop yields it once more on the next
                    # iteration; by then its label text has been removed
                    # by the substitution below.
                    try:
                        num = re_question.search(element[1]).group(1)
                        self.structure.insert(idx, ['number', num])
                    except Exception:
                        pass
                element[1] = self.regexes[element[0]].sub('', element[1])

        # 5.

        for idx, element in enumerate(self.structure):

            # typogrify

            if element[0] != 'date':
                element[1] = recursive_typography(element[1])

            # remove question numbers

            if element[0] == 'question':
                # As in step 4, the insert shifts the current element so it
                # is visited again on the next iteration.
                try:
                    num = re_question.search(element[1]).group(1)
                    self.structure.insert(idx, ['number', num])
                except Exception:
                    pass
                element[1] = re_number.sub('', element[1])

            # detect inner lists

            mo = {m for m
                in re.finditer(r'(\s+|^)(\d+)[\.\)]\s*(?!\d)',
                element[1], re.U)}
            if len(mo) > 1:
                sorted_up = sorted(mo, key=lambda m: int(m.group(2)))
                j = 0
                list_candidate = []
                # Collect markers only while they count 1, 2, 3, ... without
                # a gap.
                while j == int(sorted_up[j].group(2)) - 1:
                    list_candidate.append((j+1, sorted_up[j].group(0),
                        sorted_up[j].start()))
                    if j+1 < len(sorted_up):
                        j += 1
                    else:
                        break
                if len(list_candidate) > 1:
                    # Questions are split into a list only when their text
                    # contains one of the two keywords below.
                    if (element[0] != 'question' or
                        (element[0] == 'question'
                            and 'дуплет' in element[1].lower()
                                or 'блиц' in element[1].lower())):
                        part = partition(element[1], [x[2] for x in
                            list_candidate])
                        lc = 0
                        while lc < len(list_candidate):
                            part[lc+1] = part[lc+1].replace(
                                list_candidate[lc][1], '')
                            lc += 1
                        element[1] = ([part[0], part[1:]] if part[0] != ''
                                                else part[1:])

            # turn source into list if necessary
            if (element[0] == 'source' and isinstance(element[1], basestring)
                        and len(re.split(r'\r?\n', element[1])) > 1):
                element[1] = [re_number.sub('', remove_excessive_whitespace(x))
                    for x in re.split(r'\r?\n', element[1])]

        # 6.
        final_structure = []
        current_question = {}

        for element in self.structure:
            # These labels close the question collected so far.
            if element[0] in set(['tour', 'question', 'meta']):
                if current_question != {}:
                    check_question(current_question)
                    final_structure.append(Question(**current_question))
                    current_question = {}
            if element[0] in QUESTION_LABELS:
                if element[0] in current_question:
                    try:
                        current_question[element[0]] += SEP + element[1]
                    except Exception:
                        print('{}'.format(
                            current_question).decode('unicode_escape'))
                        # NOTE(review): interactive debugger left in the
                        # parsing path; kept so behaviour is unchanged --
                        # consider replacing with logging.
                        pdb.set_trace()
                else:
                    current_question[element[0]] = element[1]
            else:
                final_structure.append([element[0], element[1]])
        if current_question != {}:
            check_question(current_question)
            final_structure.append(Question(**current_question))

        # 7.
        self.structure = final_structure