def sanatize_paragraph(paragraph) -> list:
    """Remove punctuation, non-letter characters and excess whitespace.

    :param paragraph: list of strings (may be empty)
    :return: cleaned list of lowercased strings; an empty list when nothing
        usable remains (or when only a single item survives cleaning)
    """
    # Guard: the original post-loop checks referenced the loop variables
    # `itm`/`ind`, which raises NameError when `paragraph` is empty.
    if not paragraph:
        return list()
    sanatized = list()
    for ind, itm in enumerate(paragraph):
        if itm:
            itm = re.sub(r"\.", " ", itm)  # Replace dots with whitespaces
            # \P{L} (non-letter) requires the `regex` module (aliased as `re`).
            itm = re.sub(r"(?=[^ ])\P{L}", " ", itm)  # Replace all non-word chars with whitespaces
            itm = re.sub(r" {2,}", " ", itm)  # Strip excess whitespaces
            # Drop items that became empty or too short to be meaningful.
            if not itm or len(itm.strip()) <= 3:
                continue
            sanatized.append(itm.strip().lower())
    # A paragraph whose first (and only processed) item was falsy is discarded.
    if not itm and ind == 0:
        return list()
    # A single surviving line is not considered a paragraph.
    if len(sanatized) == 1:
        return list()
    return sanatized
def format_transcript(path):
    """Convert a transcript to uppercase and strip special characters.

    Keeps letters, whitespace and newlines (plus digits and hyphens when the
    transcript is indexed), as per the agreed-upon convention.

    Parameters:
        path (string): path to the transcript .txt file

    Returns:
        None. Creates a new '<name>-formatted.txt' file containing the
        formatted contents of the original.
    """
    # Context managers guarantee both handles are closed even on error
    # (the original leaked both handles if an exception occurred mid-way).
    with open(path, mode='r', encoding='utf-8') as src:
        transcript = src.read()
    transcript_array = [
        element.split(' ', 1) for element in transcript.strip().split('\n')
    ]
    # \p{L} (Unicode letter) requires the `regex` module (aliased as `re`).
    if is_indexed(transcript_array):
        transcript = re.sub('[^\\p{L} \n\\d-]', '', transcript)
    else:
        transcript = re.sub('[^\\p{L} \n]', '', transcript)
    transcript = transcript.upper()
    with open(path[:-4] + '-formatted.txt', mode='w', encoding='utf-8') as dst:
        dst.write(transcript)
def __init__(self, necessary_paths={config.hidden_folder: ["tex_data", "cache", "log", "topics"]}):
    # Build the converter graph: each converter becomes an edge, then
    # compatible converter pairs are chained into derived edges.
    # NOTE(review): mutable default argument — safe only if no caller ever
    # mutates it; consider a None sentinel. TODO confirm.
    make_dirs_recursive(necessary_paths)
    self.G = nx.MultiDiGraph()
    # Register every converter as an edge plus its "starred" variants, and
    # give each converter a back-reference to this graph owner.
    for _from, _to, functional_object in ____CONVERTERS____:
        self.add_edge(_from, _to, functional_object)
        self.add_starred(_from, _to, functional_object, ____CONVERTERS____)
        functional_object.ant = self
    # For every ordered pair of converters, if the output pattern of one
    # matches the input pattern of the other, add a derived edge whose
    # target name has the matched suffix rewritten.
    for (_froms1, _tos1, functional_object1), \
            (_froms2, _tos2, functional_object2) \
            in itertools.permutations(____CONVERTERS____, 2):
        for (_to1, _from1, _to2, _from2) in list_or_values(_tos1, _froms1, _tos2, _froms2):
            # A missing "from" means the converter starts from scratch.
            if _from1 == None:
                _from1 = OUT_OF_THE_BOX
            if _from2 == None:
                _from2 = OUT_OF_THE_BOX
            try:
                if match(_to1, _from2):
                    self.add_edge(_to1, regex.sub(_from2 + '$', _to2, _to1), functional_object2)
                if match(_to2, _from1):
                    self.add_edge(_to2, regex.sub(_from1 + '$', _to1, _to2), functional_object1)
            except Exception as e:
                # Patterns come from converter declarations and may be
                # invalid regexes; log and continue with the next pair.
                logging.error(f"_to1 = {_to1}")
                logging.error(
                    f"failing to compare {_to1} and {_to2} and {_from1} and {_from2} as regexes because {e}")
def fix_errors_in_citation(citation):
    """Normalize common formatting inconsistencies in a citation string.

    Applied fixes: runs of whitespace collapse to a single space, '§'
    directly followed by a digit gains a space, and ', bis ' becomes
    ' bis '.
    """
    fixes = (
        (r"\s+", " "),
        (r"§(?=\d)", "§ "),
        (r",\sbis\s", " bis "),
    )
    result = citation
    for pattern, replacement in fixes:
        result = regex.sub(pattern, replacement, result)
    return result
def extract(
    self,
    token: str,
    current_idx: int,
    relative_idx: int,
    tokens: Sequence[str],
    features: Dict[str, float],
):
    """Record a word-shape feature for *token* at *relative_idx*.

    Digits are masked to '0', lowercase letters to 'x' and uppercase
    letters to 'X'; the resulting shape is stored with weight 1.0.
    """
    digits_masked = re.sub(DIGIT_RE, '0', token)
    lower_masked = regex.sub(LOWERCASE_RE, 'x', digits_masked)
    shape = regex.sub(UPPERCASE_RE, 'X', lower_masked)
    features["shape[" + str(relative_idx) + "]=" + shape] = 1.0
def remove_common_sub(domains):
    """ Remove www. and m. subdomains, returning the unique domains as a set. """
    leading_sub = re.compile(r"^(?>www\.|m\.)")
    stripped = (re.sub(leading_sub, "", domain, concurrent=True) for domain in domains)
    return set(stripped)
def add_whitespace_after_punctuation_marks(s: Text) -> Text:
    """Insert a space after every punctuation mark that follows letters.

    >>> add_whitespace_after_punctuation_marks('Живи еще хоть четверть века—Всё будет так.Исхода нет.')
    'Живи еще хоть четверть века— Всё будет так. Исхода нет. '
    """
    pattern = r'(\p{L}+' + f'[{PUNCTUATION_MARKS_REGEX}])'
    return regex.sub(pattern, r'\g<1> ', s)
def delete_quote_links(text: str, tweet):
    """Remove the quoted-status URL from a quote tweet's text.

    Non-quote tweets are returned unchanged.
    """
    if is_quote(tweet):
        # The URL is literal text, not a pattern: escape it so regex
        # metacharacters like '.' and '?' cannot match arbitrary characters
        # (an unescaped '?' would even make the pattern mean something else).
        text = regex.sub(regex.escape(get_tweet_url(tweet.quoted_status)),
                         '',
                         text,
                         flags=regex.IGNORECASE)
    return text
def tojson(self):
    """Serialize this item to a JSON-ready dict; title whitespace is collapsed."""
    normalized_title = regex.sub(' +', ' ', self.title.strip())
    return {
        'title': normalized_title,
        'link': self.link,
        'cover': self.imageUrl,
        'details': self.details,
        'screens': self.screens,
        'links': self.links,
    }
def extract_abp(content):
    """Extracts blocked and unblocked domains from ABP style content.

    Returns a 4-tuple: (blocked_domains, unblocked_domains, unblock_rules,
    regex_rules); regex_rules is currently always empty.
    Requires the `regex` module (aliased as `re`): atomic groups `(?>...)`,
    the V1 flag and `concurrent=True` are not stdlib features.
    """
    # Rules containing '/' or '=' use filter features we do not support.
    pattern_unsupported = re.compile(r"\S+(?>\/|\=)\S+", re.V1)
    # Supported block rules: "||domain^" optionally followed by a
    # recognized option list (first-party/1p/third-party/3p/document/all).
    pattern_supported_block = re.compile(
        r"^\|\|.+\^(?>$|.+(?:"
        r"\bfirst-party\b|"
        r"\b1p\b|"
        r"\bthird-party\b|"
        r"\b3p\b|"
        r"\bdocument\b|"
        r"\ball\b"
        # r"\ball\b|"
        # r"\bpopup\b"
        r"))",
        re.V1,
    )
    # Pieces to strip from a matched block rule, leaving the bare domain.
    pattern_scrub_blocked_list = [
        r"^\|\|",
        r"\^($|.+(?>"
        r"\bfirst-party\b|"
        r"\b1p\b|"
        r"\bthird-party\b|"
        r"\b3p\b|\bdocument\b|"
        r"\ball\b|"
        r"\bpopup\b|"
        r"\S+))",
    ]
    pattern_scrub_blocked = re.compile(
        "|".join(f"(?:{p})" for p in pattern_scrub_blocked_list), re.V1
    )
    block_rules = [
        x
        for x in content
        if re.match(pattern_supported_block, x, concurrent=True)
        and not re.match(pattern_unsupported, x, concurrent=True)
    ]
    blocked_domains = [
        re.sub(pattern_scrub_blocked, "", x, concurrent=True) for x in block_rules
    ]
    # Keep only entries that survive scrubbing as syntactically valid domains.
    blocked_domains = [x for x in blocked_domains if valid_domain(x)]
    # Exception (allow) rules have the exact shape "@@||domain^".
    pattern_supported_unblock = re.compile(r"@@\|\|.+\^$")
    unblock_rules = [
        x
        for x in content
        if re.match(pattern_supported_unblock, x, concurrent=True)
        and not re.match(pattern_unsupported, x, concurrent=True)
    ]
    unblocked_domains = [
        x.replace("@@||", "").replace("^", "").replace("$important", "")
        for x in unblock_rules
    ]
    regex_rules = []
    return blocked_domains, unblocked_domains, unblock_rules, regex_rules
def concat_category(out_file):
    """Concatenate category README.md files into *out_file*.

    Files are sorted name-ascending, then files containing 'regional' are
    pushed after the rest, then files containing 'main' are pulled to the
    front (the three stable sorts compose). Markdown headings are demoted
    one level ('#' -> '##') on the way through.
    """
    files = glob(f"{DirPath.input}/*/*.md")
    files = sorted(files, key=lambda x: x)
    files = sorted(files, key=lambda x: x.__contains__("regional"))
    files = sorted(files, key=lambda x: x.__contains__("main"), reverse=True)
    # Compile once instead of re-parsing the pattern for every line.
    heading_pattern = re.compile(r"^#{0,6}+\s")
    # Open the output once, instead of re-opening it for every input file.
    with open(out_file, "a", encoding="utf-8") as file_output:
        for file in files:
            with open(file, encoding="utf-8") as file_input:
                lines = (re.sub(r"^#", r"##", x)
                         if re.match(heading_pattern, x) else x
                         for x in file_input)
                file_output.writelines(lines)
def fix_escape_characters(text: str):
    """Backslash-escape characters that Discord treats as markup.

    Underscores are escaped only when not preceded by an @-mention,
    because Twitter user names may legitimately contain them.
    """
    # Raw strings: '\&' etc. are invalid escape sequences in ordinary
    # string literals (DeprecationWarning today, SyntaxError in future
    # Python). The byte values produced are identical to the originals.
    text = text.replace('&', r'\&')
    text = text.replace('<', r'\<')
    text = text.replace('>', r'\>')
    # Escape Discord's markdown
    text = text.replace('`', r'\`')
    text = text.replace('*', r'\*')
    text = text.replace('~', r'\~')
    # Special exception for underscore because Twitter user names may contain them
    # (the variable-length lookbehind requires the `regex` module).
    text = regex.sub(r'(?<!@\S*)_', r'\_', text)
    return text
def _parse_infobox(self, text, title):
    """Extract dated entries from a wiki infobox.

    Each '|'-prefixed infobox row containing a recognizable date yields an
    Index whose info is the field name (the text before the first '=').
    """
    # Normalize "\n |" to "\n|" so splitting on "\n|" yields one field per row.
    normalized = regex.sub(r'\n ?\|', '\n|', text)
    entries = []
    for row in normalized.split('\n|'):
        found_date = self.find_date(row)
        if not found_date:
            continue
        field_parts = [part.strip() for part in row.split('=')]
        entries.append(
            Index(token=title,
                  date=found_date.date,
                  info=field_parts[0].replace('\n', '')))
    return entries
def on_edit(self, instance, value):
    """Show or hide an inline text-input overlay for this label.

    *value* is truthy while entering edit mode; a falsy value tears the
    overlay down again.
    """
    if not value:
        # Leaving edit mode: remove the overlay widget if one exists.
        if self.textinput:
            self.remove_widget(self.textinput)
        return
    # Strip the BBCode markup so the user edits plain text.
    unformatted_text = regex.sub(self.unformat_bbcode, "", self.text)
    self.textinput = t = SelectableLabel(text=unformatted_text,
                                         size_hint=(None, None),
                                         font_size=self.font_size,
                                         font_name=self.font_name,
                                         pos=self.pos,
                                         size=self.size,
                                         multiline=False)
    # Keep the overlay glued to this label's position and size.
    self.bind(pos=t.setter('pos'), size=t.setter('size'))
    self.add_widget(self.textinput)
    t.bind(on_text_validate=self.on_text_validate, focus=self.on_text_focus)
def bills():
    """Yield (bill_text, title, journal_year, position, file_id) for every
    .txt bill file under the data directory.

    When the header regex does not match, placeholder values
    ("", "", "", "f") are yielded instead of parsed fields.
    """
    data_dir = '../../lab1/data'
    for directory in os.listdir(data_dir):
        if directory.endswith('txt'):
            # print("directory: " + directory)
            # NOTE(review): file handle is never closed explicitly — relies
            # on garbage collection; consider a `with` block.
            bill = open(os.path.join(data_dir, directory), encoding='UTF-8').read()
            # Collapse runs of horizontal whitespace (incl. non-breaking
            # spaces) so the header regex can match reflowed text.
            text = regex.sub(r"[ \t\r\f\v ][ \t\r\f\v ]+", "", bill)
            # print(text[:400])
            # Header pattern: optional journal citation (Dz.U. ... poz. ...),
            # then the word "ustawa" in its spelling variants, the date line
            # with the year, the title, and the first structural marker
            # (Rozdział/Art./TYTUŁ/Dział/część ogólna).
            r = regex.match(
                r'\s*(Dz\.U\.\s*z\s*(?P<journal_year>\d+)\s*r\.\s*(N|n)r\s*(?P<journal_number>\d+),?\s*?poz.\s*(?P<position>\d+).?\s*)?([a-żA-Ż \d\.\(\)]*\s?){0,4}\s*(ustawa|USTAWA|U S T A W A|Ustawa|ustawA|USTAWa)[^\n]*\n[^\n]*\s*z\s*dnia\s*\d{1,2}\s*[a-żA-Ź]*\s*(?P<year>\d{4})\s*r\.\s*(?P<title>[\s\S]*?)\n\s*(Rozdział\s*(1|I)|Art.\s*(1|l)[^\d]|TYTUŁ\s*I|Dział\s*I|część\s*ogólna)',
                text)
            if r is None:
                yield bill, "", "", "", "f"
            else:
                yield bill, r.group("title"), r.group("journal_year"), r.group("position"), directory.split('.')[0]
def extract_hosts(content, list_type):
    """Extracts blocked or unblocked domains from hosts/domains style content.

    Depending on *list_type* ("block" or "unblock"), the valid domains end
    up in the first or second element of the returned pair.
    """
    # Strip comments, leading whitespace, localhost lines, IP prefixes and
    # the common www./m. subdomains in a single combined pattern.
    scrub_parts = [
        r"(?>\#|\!|\s+\#|\s+\!).*",
        r"^\s",
        r".*\blocalhost\b.*",
        r"^\d*\.\d*\.\d*\.\d*\s*(?>\s|www\.|m\.)",
        r"^(?>www\.|m\.)",
    ]
    scrub = re.compile("|".join(f"(?:{p})" for p in scrub_parts), re.V1)
    cleaned = [re.sub(scrub, "", entry, concurrent=True) for entry in content]
    valid = [entry for entry in cleaned if valid_domain(entry)]
    if list_type == "unblock":
        return [], valid
    if list_type == "block":
        return valid, []
    return [], []
def replace_hashtag_with_link(text: str, hashtag_entities=None):
    """Replace hashtags in *text* with named links.

    When Twitter-supplied *hashtag_entities* are given, replacement is done
    by character index; otherwise hashtags are detected with a regex and
    replaced textually.
    """
    if hashtag_entities is not None:
        # Process from the rightmost hashtag first so that inserting longer
        # link text does not shift the indices of entities not yet handled.
        hashtags_sorted = sorted(hashtag_entities,
                                 key=lambda x: x['indices'][0],
                                 reverse=True)
        for hashtag in hashtags_sorted:
            start, end = hashtag['indices']
            # text[start] is either '#' or '#', so this preserves the original character used
            hashtag_text = text[start] + hashtag['text']
            text = text[0:start] + get_named_link(
                hashtag_text, get_hashtag_url(hashtag_text)) + text[end:]
    else:
        # Fallback: '#' or full-width '#' followed by a non-digit word char.
        hashtags = regex.findall(r'(?:[#|#])[^\d\W][\w]*', text)
        for hashtag in hashtags:
            # NOTE(review): this substitutes every occurrence of the
            # hashtag, including inside link text inserted by earlier
            # iterations (e.g. '#a' matching within '#ab') — TODO confirm
            # this is acceptable for the inputs seen in practice.
            text = regex.sub(
                regex.escape(hashtag),
                fr'{get_named_link(hashtag, get_hashtag_url(hashtag))}', text)
    return text
def replace_mention_with_link(text: str,
                              user_mentions_entities,
                              in_reply_to_screen_name: str = None):
    """Turn @mentions into named links; drop the mention of the reply target."""
    if not user_mentions_entities:
        return text
    for mention in user_mentions_entities:
        screen_name = mention['screen_name']
        mention_text = '@' + screen_name
        is_reply_target = (in_reply_to_screen_name
                           and screen_name == in_reply_to_screen_name)
        if is_reply_target:
            # The mention of the user being replied to is removed, not linked.
            text = regex.sub(regex.escape(mention_text),
                             '',
                             text,
                             flags=regex.IGNORECASE)
        else:
            profile_link = get_named_link(
                mention_text, get_profile_url(screen_name=screen_name))
            text = text.replace(mention_text, profile_link)
    return text
def parse_text(self):
    """Scan the article text for sentences containing dates and build Index
    entries whose info is a cleaned context window around each date.
    """
    results = []
    # Strip <ref> footnotes, wikitable blocks and {{cite}} templates.
    self.text = regex.sub(r'<ref.*\n?.*</ref>', repl="", string=self.text)
    self.text = regex.sub(r'{\| class=\"wikitable.*\|}',
                          repl="",
                          string=self.text,
                          flags=regex.DOTALL)
    self.text = regex.sub(r'{{[cC]ite.*}}',
                          repl="",
                          string=self.text,
                          flags=regex.DOTALL)
    # Cut the article at the first trailing section marker that is present.
    if self.paragraph_splitter(sep='== See also =='):
        pass
    elif self.paragraph_splitter(sep='==Notes=='):
        pass
    elif self.paragraph_splitter(sep='==References=='):
        pass
    elif self.paragraph_splitter(sep='== Bibliography =='):
        pass
    elif self.paragraph_splitter(sep='== External links =='):
        pass
    elif self.paragraph_splitter(sep='=== Sources ==='):
        pass
    # Sentence heuristic: capital letter, at least 6 chars, terminator.
    sentences_reg = regex.finditer(
        r'(^| )[A-Z][^\.!?]{5,}[\.!?]',
        self.text)  # possibly [A-Z][^\.!?]{5,}[\.!?] for performance
    for sentence_it in sentences_reg:
        sentence = sentence_it.group(0)
        date_in_text = self.find_date(sentence)
        if date_in_text:
            # Context window: up to 60 chars before / 30 chars after the date.
            look_before = 60
            look_after = 30
            start = date_in_text.start - look_before if date_in_text.start >= look_before else 0
            end = date_in_text.end + look_after if date_in_text.end + look_after < len(
                sentence) else len(sentence)
            if date_in_text.end + look_after > len(sentence):
                token = self.find_token(sentence[start:], date_in_text.start,
                                        date_in_text.end)
            else:
                token = self.find_token(
                    sentence[start:date_in_text.end + look_after],
                    date_in_text.start, date_in_text.end)
            token_context = sentence[start:end]
            # Extend the window left to the nearest word boundary (max 8 chars).
            i = start
            counter = 0
            while True:
                i -= 1
                counter += 1
                if i < 0 or counter > 8:
                    break
                if not (sentence[i].isalpha() or sentence[i].isdigit()):
                    token_context = sentence[i + 1:start] + token_context
                    break
            # Extend the window right to the nearest word boundary (max 8 chars).
            i = end
            counter = 0
            while True:
                i += 1
                counter += 1
                if i > len(sentence) - 1 or counter > 8:
                    break
                if not (sentence[i].isalpha() or sentence[i].isdigit()):
                    token_context += sentence[end:end + counter]
                    break
            token_context = token_context.replace('\n', ' ')
            # NOTE(review): '1-9' omits '0' from the kept characters; the
            # sibling variant of this method uses '0-9' — possibly a typo
            # here. TODO confirm intent before changing.
            token_context = regex.sub(r'[^a-zA-Z1-9.!?:%$ ]', '',
                                      token_context)
            token_context = token_context.strip()
            results.append(
                Index(token=token if token else self.title,
                      date=date_in_text.date,
                      info=token_context))
    return results
from typing import List, Callable, Text from regex import regex PUNCTUATION_MARKS_REGEX = r'\.,!?:;\"\-—' assert regex.sub(rf'[{PUNCTUATION_MARKS_REGEX}]', '', PUNCTUATION_MARKS_REGEX.replace('\\', '')) == '' def add_whitespace_after_punctuation_marks(s: Text) -> Text: """ >>> add_whitespace_after_punctuation_marks('Живи еще хоть четверть века—Всё будет так.Исхода нет.') 'Живи еще хоть четверть века— Всё будет так. Исхода нет. ' """ return regex.sub(r'(\p{L}+' + f'[{PUNCTUATION_MARKS_REGEX}])', r'\g<1> ', s) def remove_punctuation_marks(s: Text) -> Text: """ >>> remove_punctuation_marks('Good morning, gentlemen! word.word:word;word? "Text"') 'Good morning gentlemen wordwordwordword Text' """ return regex.sub(rf'[{PUNCTUATION_MARKS_REGEX}]', '', s) def keep_only_words(s: Text): # TODO: not working yet return regex.sub(rf'\W+|\S+', '', s) def remove_extra_whitespaces(s: Text) -> Text: """
def normalize(in_data):
    """Cleans the filterlist file.

    Strips carriage returns, collapses runs of newlines, removes the
    checksum comment, and returns the cleaned text.
    """
    # str is immutable: the original discarded the results of the first two
    # re.sub calls, so \r characters and blank lines were never removed.
    in_data = re.sub(r"\r", "", in_data)
    in_data = re.sub(r"\n+", "\n", in_data)
    return re.sub(checksum_pattern, "", in_data)
def keep_only_words(s: Text):
    r"""Remove everything except word characters and whitespace.

    The previous pattern ``\W+|\S+`` matched every character (each char is
    either a non-word char or a non-space char), so the function always
    returned '' — hence the original "not working yet" TODO.
    """
    return regex.sub(r'[^\w\s]+', '', s)
def remove_punctuation_marks(s: Text) -> Text:
    """Delete all configured punctuation marks from *s*.

    >>> remove_punctuation_marks('Good morning, gentlemen! word.word:word;word? "Text"')
    'Good morning gentlemen wordwordwordword Text'
    """
    punctuation_class = rf'[{PUNCTUATION_MARKS_REGEX}]'
    return regex.sub(punctuation_class, '', s)
def format_filename(string):
    """Build a filesystem-safe name: lowercase, forbidden characters
    (<>:'"/|?.*) removed, and spaces turned into underscores."""
    lowered = string.lower()
    without_specials = re.sub(r"[<>:\'\"\/\|?.*]", "", lowered)
    return without_specials.replace(" ", "_")
data_dir = '../data' for directory in os.listdir(data_dir): if directory.endswith('txt'): # print("directory: " + directory) yield open(os.path.join(data_dir, directory), encoding='UTF-8').read() if __name__ == '__main__': b = {} for year in range(1900, 2500): b[str(year)] = {} for bill in bills(): text = regex.sub(r"[ \t\r\f\v ][ \t\r\f\v ]+", "", bill) # print(text[:400]) r = regex.match(r'\s*(Dz\.U\.\s*z\s*(?P<journal_year>\d+)\s*r\.\s*(N|n)r\s*(?P<journal_number>\d+),?\s*?poz.\s*(?P<position>\d+).?\s*)?([a-żA-Ż \d\.\(\)]*\s?){0,4}\s*(ustawa|USTAWA|U S T A W A|Ustawa|ustawA|USTAWa)[^\n]*\n[^\n]*\s*z\s*dnia\s*\d{1,2}\s*[a-żA-Ź]*\s*(?P<year>\d{4})\s*r\.\s*(?P<title>[\s\S]*?)\n\s*(Rozdział\s*(1|I)|Art.\s*(1|l)[^\d]|TYTUŁ\s*I|Dział\s*I|część\s*ogólna)', text) # r = regex.match(r'\n*(Dz\.U\.z(?P<journal_year>\d+)r\.(N|n)r(?P<journal_number>\d+),?poz.(?P<position>\d+).?)?([a-żA-Ż\d\.\(\)]*\n?){0,4}\n*(ustawa|USTAWA|Ustawa|ustawA|USTAWa)[^\n]*\n[^\n]*\n*z\n?dnia\n?\d{1,2}\n?[a-żA-Ź]*\n?(?P<year>\d{4})\n?r\.\n*(?P<title>(.+\n)*?)\n*?(?P<title2>(.+\n)*?)\n*?(Rozdział(1|I)|Art.\n?(1|l)[^\d]|TYTUŁI|DziałI|częśćogólna)', text) # print(title.group()) position = r.group("position") year = r.group("journal_year") or r.group("year") b[year][position] = {} b[year][position]["counter"] = 0 b[year][position]["title"] = r.group("title") b[year][position]["journal_number"] = r.group("journal_number") b[year][position]["journal_year"] = r.group("journal_year") b[year][position]["year"] = r.group("year") b[year][position]["position"] = position
def parse_text(self):
    """Scan the article text for sentences containing dates and build Index
    entries (token = article title, info = cleaned context window).
    """
    results = []
    # Strip <ref> footnotes, wikitable blocks and {{cite}} templates.
    self.text = regex.sub(r'<ref.*\n?.*</ref>', repl="", string=self.text)
    self.text = regex.sub(r'{\| class=\"wikitable.*\|}',
                          repl="",
                          string=self.text,
                          flags=regex.DOTALL)
    self.text = regex.sub(r'{{[cC]ite.*}}',
                          repl="",
                          string=self.text,
                          flags=regex.DOTALL)
    # Cut the article at the first trailing section marker that is present.
    if self.paragraph_splitter(sep='== See also =='):
        pass
    elif self.paragraph_splitter(sep='==Notes=='):
        pass
    elif self.paragraph_splitter(sep='==References=='):
        pass
    elif self.paragraph_splitter(sep='== Bibliography =='):
        pass
    elif self.paragraph_splitter(sep='== External links =='):
        pass
    elif self.paragraph_splitter(sep='=== Sources ==='):
        pass
    # Sentence heuristic: capital letter, at least 6 chars, terminator.
    sentences_reg = regex.finditer(
        r'(^| )[A-Z][^\.!?]{5,}[\.!?]',
        self.text)  # possibly [A-Z][^\.!?]{5,}[\.!?] for performance
    for sentence_it in sentences_reg:
        sentence = sentence_it.group(0)
        date_in_text = self.find_date(sentence)
        if date_in_text:
            # Context window: up to 60 chars before / 30 chars after the date.
            look_before = 60
            look_after = 30
            start = date_in_text.start - look_before if date_in_text.start >= look_before else 0
            end = date_in_text.end + look_after if date_in_text.end + look_after < len(
                sentence) else len(sentence)
            # if date_in_text.end + look_after > len(sentence):
            #     token = self.find_token(sentence[start:], date_in_text.start, date_in_text.end)
            # else:
            #     token = self.find_token(sentence[start:date_in_text.end + look_after], date_in_text.start, date_in_text.end)
            token_context = sentence[start:end]
            # Extend the window left to the nearest word boundary (max 8 chars).
            i = start
            counter = 0
            while True:
                i -= 1
                counter += 1
                if i < 0 or counter > 8:
                    break
                if not (sentence[i].isalpha() or sentence[i].isdigit()):
                    token_context = sentence[i + 1:start] + token_context
                    break
            # Extend the window right to the nearest word boundary (max 8 chars).
            i = end
            counter = 0
            while True:
                i += 1
                counter += 1
                if i > len(sentence) - 1 or counter > 8:
                    break
                if not (sentence[i].isalpha() or sentence[i].isdigit()):
                    token_context += sentence[end:end + counter]
                    break
            token_context = token_context.replace('\n', ' ')
            token_context = regex.sub(r'[^a-zA-Z0-9.!?:%$;, ]', '',
                                      token_context)
            token_context = token_context.strip()
            results.append(
                Index(token=self.title,
                      date=date_in_text.date,
                      info=token_context))
            # I couldnt find best word that explain the purpose, often the result was meaningful, therefore I
            # decided not to use it.
            # tokenized = nltk.pos_tag(nltk.word_tokenize(sentence))
            #
            # proper_nouns = []
            # nouns = []
            # for (word, pos) in tokenized:
            #     if pos == 'NNP':
            #         proper_nouns.append(word)
            #     elif pos == 'NN':
            #         nouns.append(word)
            #
            # results.append(Index(token=proper_nouns[0] if proper_nouns else "title", date=date_in_text.date, info=proper_nouns[1] if
            #                len(proper_nouns) > 1 else nouns[0] if nouns else ""))
    return results
def split_citation_part(string: str): """ A string a tokenizes. Tokens are identified as units or values. Pairs are built to connect the units with their respective values. If the unit cannot be indentified (and must be inferred later) None is returned. Args: string: A string that is part of a reference and cites *one* part a statute. Retruns: As a generator tuples are returned, each containing the unit (or None) and the respecive value. """ # Tokenization # fmt: off string = regex.sub( r"(" r"\d+(?>\.\d+)?[a-z]?|" r"\b[ivx]+|" r"\b[a-z]\)?" r")" r"(\sff?\.|\sff\b)", r"\1ff.", string, flags=regex.IGNORECASE, ) # fmt: on tokens = split_unit_number_pattern.split(string, ) # Building pairs of units with their resp. values while len(tokens) > 0: token = tokens.pop(0) if StatutesParser.is_unit(token): if len(tokens) > 0: unit = StatutesParser.stem_unit(token) token = tokens.pop(0) numb = token assert StatutesParser.is_numb(numb), numb else: # when citation ends with unit print( f"Citation {string} ends with unit {token}. Ignoring last unit." ) break elif StatutesParser.is_pre_numb(token): numb = token token = tokens.pop(0) if not StatutesParser.is_unit(token): print(token, "is not a unit in", string) continue # to fix citation "§ 30 DRITTER ABSCHNITT" # Last part in now ignored, # but reference areas can still be improved. unit = StatutesParser.stem_unit(token) elif StatutesParser.is_numb(token): unit = None numb = token else: raise StringCaseException(token, "in", string) numb = regex.sub(r"(ff?\.|ff|\))$", "", numb) yield [unit, numb]
def remove_extra_whitespaces(s: Text) -> Text:
    """Collapse each run of two or more whitespace characters into one space.

    Single whitespace characters (including leading/trailing ones) are
    left untouched.
    """
    multi_whitespace = r'\s{2,}'
    return regex.sub(multi_whitespace, ' ', s)
def zeroize(sample):
    """Rewrite every CoNLL line of *sample* so both tag columns read 'O O'."""
    zeroized_lines = []
    for line in sample.split('\n'):
        zeroized_lines.append(regex.sub(CONLL_LINE, r"\1 \2 O O", line))
    return "\n".join(zeroized_lines)
def clean(self, text: str) -> str:
    """Normalize whitespace and drop characters outside self.allowed_chars.

    Whitespace is collapsed both before and after filtering (removing a
    disallowed character can make two spaces adjacent), and the result is
    stripped.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    filtered = "".join(ch for ch in collapsed if ch in self.allowed_chars)
    return re.sub(r'\s+', ' ', filtered).strip()