def sanatize_paragraph(paragraph) -> list:
    """Remove punctuation, non-letter characters and excess whitespace.

    :param paragraph: list of strings (may be empty)
    :return: cleaned list of lowercased strings; an empty list when nothing
        usable remains (or when only a single item survives cleaning)
    """
    # Guard: the original post-loop checks referenced the loop variables
    # `itm`/`ind`, which raises NameError when `paragraph` is empty.
    if not paragraph:
        return list()
    sanatized = list()
    for ind, itm in enumerate(paragraph):
        if itm:
            itm = re.sub(r"\.", " ", itm)  # Replace dots with whitespaces
            # \P{L} (non-letter) requires the `regex` module (aliased as `re`).
            itm = re.sub(r"(?=[^ ])\P{L}", " ", itm)  # Replace all non-word chars with whitespaces
            itm = re.sub(r" {2,}", " ", itm)  # Strip excess whitespaces
            # Drop items that became empty or too short to be meaningful.
            if not itm or len(itm.strip()) <= 3:
                continue
            sanatized.append(itm.strip().lower())
    # A paragraph whose first (and only processed) item was falsy is discarded.
    if not itm and ind == 0:
        return list()
    # A single surviving line is not considered a paragraph.
    if len(sanatized) == 1:
        return list()
    return sanatized
def format_transcript(path):
    """Convert a transcript to uppercase and strip special characters.

    Keeps letters, whitespace and newlines (plus digits and hyphens when the
    transcript is indexed), as per the agreed-upon convention.

    Parameters:
        path (string): path to the transcript .txt file

    Returns:
        None. Creates a new '<name>-formatted.txt' file containing the
        formatted contents of the original.
    """
    # Context managers guarantee both handles are closed even on error
    # (the original leaked both handles if an exception occurred mid-way).
    with open(path, mode='r', encoding='utf-8') as src:
        transcript = src.read()
    transcript_array = [
        element.split(' ', 1) for element in transcript.strip().split('\n')
    ]
    # \p{L} (Unicode letter) requires the `regex` module (aliased as `re`).
    if is_indexed(transcript_array):
        transcript = re.sub('[^\\p{L} \n\\d-]', '', transcript)
    else:
        transcript = re.sub('[^\\p{L} \n]', '', transcript)
    transcript = transcript.upper()
    with open(path[:-4] + '-formatted.txt', mode='w', encoding='utf-8') as dst:
        dst.write(transcript)
def __init__(self, necessary_paths={config.hidden_folder: ["tex_data", "cache", "log", "topics"]}):
    # Build the converter graph: each converter becomes an edge, then
    # compatible converter pairs are chained into derived edges.
    # NOTE(review): mutable default argument — safe only if no caller ever
    # mutates it; consider a None sentinel. TODO confirm.
    make_dirs_recursive(necessary_paths)
    self.G = nx.MultiDiGraph()
    # Register every converter as an edge plus its "starred" variants, and
    # give each converter a back-reference to this graph owner.
    for _from, _to, functional_object in ____CONVERTERS____:
        self.add_edge(_from, _to, functional_object)
        self.add_starred(_from, _to, functional_object, ____CONVERTERS____)
        functional_object.ant = self
    # For every ordered pair of converters, if the output pattern of one
    # matches the input pattern of the other, add a derived edge whose
    # target name has the matched suffix rewritten.
    for (_froms1, _tos1, functional_object1), \
            (_froms2, _tos2, functional_object2) \
            in itertools.permutations(____CONVERTERS____, 2):
        for (_to1, _from1, _to2, _from2) in list_or_values(_tos1, _froms1, _tos2, _froms2):
            # A missing "from" means the converter starts from scratch.
            if _from1 == None:
                _from1 = OUT_OF_THE_BOX
            if _from2 == None:
                _from2 = OUT_OF_THE_BOX
            try:
                if match(_to1, _from2):
                    self.add_edge(_to1, regex.sub(_from2 + '$', _to2, _to1), functional_object2)
                if match(_to2, _from1):
                    self.add_edge(_to2, regex.sub(_from1 + '$', _to1, _to2), functional_object1)
            except Exception as e:
                # Patterns come from converter declarations and may be
                # invalid regexes; log and continue with the next pair.
                logging.error(f"_to1 = {_to1}")
                logging.error(
                    f"failing to compare {_to1} and {_to2} and {_from1} and {_from2} as regexes because {e}")
def fix_errors_in_citation(citation):
    """Normalize common formatting inconsistencies in a citation string.

    Applied fixes: runs of whitespace collapse to a single space, '§'
    directly followed by a digit gains a space, and ', bis ' becomes
    ' bis '.
    """
    fixes = (
        (r"\s+", " "),
        (r"§(?=\d)", "§ "),
        (r",\sbis\s", " bis "),
    )
    result = citation
    for pattern, replacement in fixes:
        result = regex.sub(pattern, replacement, result)
    return result
def extract(
    self,
    token: str,
    current_idx: int,
    relative_idx: int,
    tokens: Sequence[str],
    features: Dict[str, float],
):
    """Record a word-shape feature for *token* at *relative_idx*.

    Digits are masked to '0', lowercase letters to 'x' and uppercase
    letters to 'X'; the resulting shape is stored with weight 1.0.
    """
    digits_masked = re.sub(DIGIT_RE, '0', token)
    lower_masked = regex.sub(LOWERCASE_RE, 'x', digits_masked)
    shape = regex.sub(UPPERCASE_RE, 'X', lower_masked)
    features["shape[" + str(relative_idx) + "]=" + shape] = 1.0
def remove_common_sub(domains):
    """ Remove www. and m. subdomains, returning the unique domains as a set. """
    leading_sub = re.compile(r"^(?>www\.|m\.)")
    stripped = (re.sub(leading_sub, "", domain, concurrent=True) for domain in domains)
    return set(stripped)
def add_whitespace_after_punctuation_marks(s: Text) -> Text:
    """Insert a space after every punctuation mark that follows letters.

    >>> add_whitespace_after_punctuation_marks('Живи еще хоть четверть века—Всё будет так.Исхода нет.')
    'Живи еще хоть четверть века— Всё будет так. Исхода нет. '
    """
    pattern = r'(\p{L}+' + f'[{PUNCTUATION_MARKS_REGEX}])'
    return regex.sub(pattern, r'\g<1> ', s)
def delete_quote_links(text: str, tweet):
    """Remove the quoted-status URL from a quote tweet's text.

    Non-quote tweets are returned unchanged.
    """
    if is_quote(tweet):
        # The URL is literal text, not a pattern: escape it so regex
        # metacharacters like '.' and '?' cannot match arbitrary characters
        # (an unescaped '?' would even make the pattern mean something else).
        text = regex.sub(regex.escape(get_tweet_url(tweet.quoted_status)),
                         '',
                         text,
                         flags=regex.IGNORECASE)
    return text
def tojson(self):
    """Serialize this item to a JSON-ready dict; title whitespace is collapsed."""
    normalized_title = regex.sub(' +', ' ', self.title.strip())
    return {
        'title': normalized_title,
        'link': self.link,
        'cover': self.imageUrl,
        'details': self.details,
        'screens': self.screens,
        'links': self.links,
    }
def extract_abp(content):
    """Extracts blocked and unblocked domains from ABP style content.

    Returns a 4-tuple: (blocked_domains, unblocked_domains, unblock_rules,
    regex_rules); regex_rules is currently always empty.
    Requires the `regex` module (aliased as `re`): atomic groups `(?>...)`,
    the V1 flag and `concurrent=True` are not stdlib features.
    """
    # Rules containing '/' or '=' use filter features we do not support.
    pattern_unsupported = re.compile(r"\S+(?>\/|\=)\S+", re.V1)
    # Supported block rules: "||domain^" optionally followed by a
    # recognized option list (first-party/1p/third-party/3p/document/all).
    pattern_supported_block = re.compile(
        r"^\|\|.+\^(?>$|.+(?:"
        r"\bfirst-party\b|"
        r"\b1p\b|"
        r"\bthird-party\b|"
        r"\b3p\b|"
        r"\bdocument\b|"
        r"\ball\b"
        # r"\ball\b|"
        # r"\bpopup\b"
        r"))",
        re.V1,
    )
    # Pieces to strip from a matched block rule, leaving the bare domain.
    pattern_scrub_blocked_list = [
        r"^\|\|",
        r"\^($|.+(?>"
        r"\bfirst-party\b|"
        r"\b1p\b|"
        r"\bthird-party\b|"
        r"\b3p\b|\bdocument\b|"
        r"\ball\b|"
        r"\bpopup\b|"
        r"\S+))",
    ]
    pattern_scrub_blocked = re.compile(
        "|".join(f"(?:{p})" for p in pattern_scrub_blocked_list), re.V1
    )
    block_rules = [
        x
        for x in content
        if re.match(pattern_supported_block, x, concurrent=True)
        and not re.match(pattern_unsupported, x, concurrent=True)
    ]
    blocked_domains = [
        re.sub(pattern_scrub_blocked, "", x, concurrent=True) for x in block_rules
    ]
    # Keep only entries that survive scrubbing as syntactically valid domains.
    blocked_domains = [x for x in blocked_domains if valid_domain(x)]
    # Exception (allow) rules have the exact shape "@@||domain^".
    pattern_supported_unblock = re.compile(r"@@\|\|.+\^$")
    unblock_rules = [
        x
        for x in content
        if re.match(pattern_supported_unblock, x, concurrent=True)
        and not re.match(pattern_unsupported, x, concurrent=True)
    ]
    unblocked_domains = [
        x.replace("@@||", "").replace("^", "").replace("$important", "")
        for x in unblock_rules
    ]
    regex_rules = []
    return blocked_domains, unblocked_domains, unblock_rules, regex_rules
def concat_category(out_file):
    """Concatenate category README.md files into *out_file*.

    Files are sorted name-ascending, then files containing 'regional' are
    pushed after the rest, then files containing 'main' are pulled to the
    front (the three stable sorts compose). Markdown headings are demoted
    one level ('#' -> '##') on the way through.
    """
    files = glob(f"{DirPath.input}/*/*.md")
    files = sorted(files, key=lambda x: x)
    files = sorted(files, key=lambda x: x.__contains__("regional"))
    files = sorted(files, key=lambda x: x.__contains__("main"), reverse=True)
    # Compile once instead of re-parsing the pattern for every line.
    heading_pattern = re.compile(r"^#{0,6}+\s")
    # Open the output once, instead of re-opening it for every input file.
    with open(out_file, "a", encoding="utf-8") as file_output:
        for file in files:
            with open(file, encoding="utf-8") as file_input:
                lines = (re.sub(r"^#", r"##", x)
                         if re.match(heading_pattern, x) else x
                         for x in file_input)
                file_output.writelines(lines)
def fix_escape_characters(text: str):
    """Backslash-escape characters that Discord treats as markup.

    Underscores are escaped only when not preceded by an @-mention,
    because Twitter user names may legitimately contain them.
    """
    # Raw strings: '\&' etc. are invalid escape sequences in ordinary
    # string literals (DeprecationWarning today, SyntaxError in future
    # Python). The byte values produced are identical to the originals.
    text = text.replace('&', r'\&')
    text = text.replace('<', r'\<')
    text = text.replace('>', r'\>')
    # Escape Discord's markdown
    text = text.replace('`', r'\`')
    text = text.replace('*', r'\*')
    text = text.replace('~', r'\~')
    # Special exception for underscore because Twitter user names may contain them
    # (the variable-length lookbehind requires the `regex` module).
    text = regex.sub(r'(?<!@\S*)_', r'\_', text)
    return text
def _parse_infobox(self, text, title):
    """Extract dated entries from a wiki infobox.

    Each '|'-prefixed infobox row containing a recognizable date yields an
    Index whose info is the field name (the text before the first '=').
    """
    # Normalize "\n |" to "\n|" so splitting on "\n|" yields one field per row.
    normalized = regex.sub(r'\n ?\|', '\n|', text)
    entries = []
    for row in normalized.split('\n|'):
        found_date = self.find_date(row)
        if not found_date:
            continue
        field_parts = [part.strip() for part in row.split('=')]
        entries.append(
            Index(token=title,
                  date=found_date.date,
                  info=field_parts[0].replace('\n', '')))
    return entries
def on_edit(self, instance, value):
    """Show or hide an inline text-input overlay for this label.

    *value* is truthy while entering edit mode; a falsy value tears the
    overlay down again.
    """
    if not value:
        # Leaving edit mode: remove the overlay widget if one exists.
        if self.textinput:
            self.remove_widget(self.textinput)
        return
    # Strip the BBCode markup so the user edits plain text.
    unformatted_text = regex.sub(self.unformat_bbcode, "", self.text)
    self.textinput = t = SelectableLabel(text=unformatted_text,
                                         size_hint=(None, None),
                                         font_size=self.font_size,
                                         font_name=self.font_name,
                                         pos=self.pos,
                                         size=self.size,
                                         multiline=False)
    # Keep the overlay glued to this label's position and size.
    self.bind(pos=t.setter('pos'), size=t.setter('size'))
    self.add_widget(self.textinput)
    t.bind(on_text_validate=self.on_text_validate, focus=self.on_text_focus)
def bills():
    """Yield (bill_text, title, journal_year, position, file_id) for every
    .txt bill file under the data directory.

    When the header regex does not match, placeholder values
    ("", "", "", "f") are yielded instead of parsed fields.
    """
    data_dir = '../../lab1/data'
    for directory in os.listdir(data_dir):
        if directory.endswith('txt'):
            # print("directory: " + directory)
            # NOTE(review): file handle is never closed explicitly — relies
            # on garbage collection; consider a `with` block.
            bill = open(os.path.join(data_dir, directory), encoding='UTF-8').read()
            # Collapse runs of horizontal whitespace (incl. non-breaking
            # spaces) so the header regex can match reflowed text.
            text = regex.sub(r"[ \t\r\f\v ][ \t\r\f\v ]+", "", bill)
            # print(text[:400])
            # Header pattern: optional journal citation (Dz.U. ... poz. ...),
            # then the word "ustawa" in its spelling variants, the date line
            # with the year, the title, and the first structural marker
            # (Rozdział/Art./TYTUŁ/Dział/część ogólna).
            r = regex.match(
                r'\s*(Dz\.U\.\s*z\s*(?P<journal_year>\d+)\s*r\.\s*(N|n)r\s*(?P<journal_number>\d+),?\s*?poz.\s*(?P<position>\d+).?\s*)?([a-żA-Ż \d\.\(\)]*\s?){0,4}\s*(ustawa|USTAWA|U S T A W A|Ustawa|ustawA|USTAWa)[^\n]*\n[^\n]*\s*z\s*dnia\s*\d{1,2}\s*[a-żA-Ź]*\s*(?P<year>\d{4})\s*r\.\s*(?P<title>[\s\S]*?)\n\s*(Rozdział\s*(1|I)|Art.\s*(1|l)[^\d]|TYTUŁ\s*I|Dział\s*I|część\s*ogólna)',
                text)
            if r is None:
                yield bill, "", "", "", "f"
            else:
                yield bill, r.group("title"), r.group("journal_year"), r.group("position"), directory.split('.')[0]
def extract_hosts(content, list_type):
    """Extracts blocked or unblocked domains from hosts/domains style content.

    Depending on *list_type* ("block" or "unblock"), the valid domains end
    up in the first or second element of the returned pair.
    """
    # Strip comments, leading whitespace, localhost lines, IP prefixes and
    # the common www./m. subdomains in a single combined pattern.
    scrub_parts = [
        r"(?>\#|\!|\s+\#|\s+\!).*",
        r"^\s",
        r".*\blocalhost\b.*",
        r"^\d*\.\d*\.\d*\.\d*\s*(?>\s|www\.|m\.)",
        r"^(?>www\.|m\.)",
    ]
    scrub = re.compile("|".join(f"(?:{p})" for p in scrub_parts), re.V1)
    cleaned = [re.sub(scrub, "", entry, concurrent=True) for entry in content]
    valid = [entry for entry in cleaned if valid_domain(entry)]
    if list_type == "unblock":
        return [], valid
    if list_type == "block":
        return valid, []
    return [], []
def replace_hashtag_with_link(text: str, hashtag_entities=None):
    """Replace hashtags in *text* with named links.

    When Twitter-supplied *hashtag_entities* are given, replacement is done
    by character index; otherwise hashtags are detected with a regex and
    replaced textually.
    """
    if hashtag_entities is not None:
        # Process from the rightmost hashtag first so that inserting longer
        # link text does not shift the indices of entities not yet handled.
        hashtags_sorted = sorted(hashtag_entities,
                                 key=lambda x: x['indices'][0],
                                 reverse=True)
        for hashtag in hashtags_sorted:
            start, end = hashtag['indices']
            # text[start] is either '#' or '#', so this preserves the original character used
            hashtag_text = text[start] + hashtag['text']
            text = text[0:start] + get_named_link(
                hashtag_text, get_hashtag_url(hashtag_text)) + text[end:]
    else:
        # Fallback: '#' or full-width '#' followed by a non-digit word char.
        hashtags = regex.findall(r'(?:[#|#])[^\d\W][\w]*', text)
        for hashtag in hashtags:
            # NOTE(review): this substitutes every occurrence of the
            # hashtag, including inside link text inserted by earlier
            # iterations (e.g. '#a' matching within '#ab') — TODO confirm
            # this is acceptable for the inputs seen in practice.
            text = regex.sub(
                regex.escape(hashtag),
                fr'{get_named_link(hashtag, get_hashtag_url(hashtag))}', text)
    return text
def replace_mention_with_link(text: str,
                              user_mentions_entities,
                              in_reply_to_screen_name: str = None):
    """Turn @mentions into named links; drop the mention of the reply target."""
    if not user_mentions_entities:
        return text
    for mention in user_mentions_entities:
        screen_name = mention['screen_name']
        mention_text = '@' + screen_name
        is_reply_target = (in_reply_to_screen_name
                           and screen_name == in_reply_to_screen_name)
        if is_reply_target:
            # The mention of the user being replied to is removed, not linked.
            text = regex.sub(regex.escape(mention_text),
                             '',
                             text,
                             flags=regex.IGNORECASE)
        else:
            profile_link = get_named_link(
                mention_text, get_profile_url(screen_name=screen_name))
            text = text.replace(mention_text, profile_link)
    return text
def parse_text(self):
    """Scan the article text for sentences containing dates and build Index
    entries whose info is a cleaned context window around each date.
    """
    results = []
    # Strip <ref> footnotes, wikitable blocks and {{cite}} templates.
    self.text = regex.sub(r'<ref.*\n?.*</ref>', repl="", string=self.text)
    self.text = regex.sub(r'{\| class=\"wikitable.*\|}',
                          repl="",
                          string=self.text,
                          flags=regex.DOTALL)
    self.text = regex.sub(r'{{[cC]ite.*}}',
                          repl="",
                          string=self.text,
                          flags=regex.DOTALL)
    # Cut the article at the first trailing section marker that is present.
    if self.paragraph_splitter(sep='== See also =='):
        pass
    elif self.paragraph_splitter(sep='==Notes=='):
        pass
    elif self.paragraph_splitter(sep='==References=='):
        pass
    elif self.paragraph_splitter(sep='== Bibliography =='):
        pass
    elif self.paragraph_splitter(sep='== External links =='):
        pass
    elif self.paragraph_splitter(sep='=== Sources ==='):
        pass
    # Sentence heuristic: capital letter, at least 6 chars, terminator.
    sentences_reg = regex.finditer(
        r'(^| )[A-Z][^\.!?]{5,}[\.!?]',
        self.text)  # possibly [A-Z][^\.!?]{5,}[\.!?] for performance
    for sentence_it in sentences_reg:
        sentence = sentence_it.group(0)
        date_in_text = self.find_date(sentence)
        if date_in_text:
            # Context window: up to 60 chars before / 30 chars after the date.
            look_before = 60
            look_after = 30
            start = date_in_text.start - look_before if date_in_text.start >= look_before else 0
            end = date_in_text.end + look_after if date_in_text.end + look_after < len(
                sentence) else len(sentence)
            if date_in_text.end + look_after > len(sentence):
                token = self.find_token(sentence[start:], date_in_text.start,
                                        date_in_text.end)
            else:
                token = self.find_token(
                    sentence[start:date_in_text.end + look_after],
                    date_in_text.start, date_in_text.end)
            token_context = sentence[start:end]
            # Extend the window left to the nearest word boundary (max 8 chars).
            i = start
            counter = 0
            while True:
                i -= 1
                counter += 1
                if i < 0 or counter > 8:
                    break
                if not (sentence[i].isalpha() or sentence[i].isdigit()):
                    token_context = sentence[i + 1:start] + token_context
                    break
            # Extend the window right to the nearest word boundary (max 8 chars).
            i = end
            counter = 0
            while True:
                i += 1
                counter += 1
                if i > len(sentence) - 1 or counter > 8:
                    break
                if not (sentence[i].isalpha() or sentence[i].isdigit()):
                    token_context += sentence[end:end + counter]
                    break
            token_context = token_context.replace('\n', ' ')
            # NOTE(review): '1-9' omits '0' from the kept characters; the
            # sibling variant of this method uses '0-9' — possibly a typo
            # here. TODO confirm intent before changing.
            token_context = regex.sub(r'[^a-zA-Z1-9.!?:%$ ]', '',
                                      token_context)
            token_context = token_context.strip()
            results.append(
                Index(token=token if token else self.title,
                      date=date_in_text.date,
                      info=token_context))
    return results
from typing import List, Callable, Text from regex import regex PUNCTUATION_MARKS_REGEX = r'\.,!?:;\"\-—' assert regex.sub(rf'[{PUNCTUATION_MARKS_REGEX}]', '', PUNCTUATION_MARKS_REGEX.replace('\\', '')) == '' def add_whitespace_after_punctuation_marks(s: Text) -> Text: """ >>> add_whitespace_after_punctuation_marks('Живи еще хоть четверть века—Всё будет так.Исхода нет.') 'Живи еще хоть четверть века— Всё будет так. Исхода нет. ' """ return regex.sub(r'(\p{L}+' + f'[{PUNCTUATION_MARKS_REGEX}])', r'\g<1> ', s) def remove_punctuation_marks(s: Text) -> Text: """ >>> remove_punctuation_marks('Good morning, gentlemen! word.word:word;word? "Text"') 'Good morning gentlemen wordwordwordword Text' """ return regex.sub(rf'[{PUNCTUATION_MARKS_REGEX}]', '', s) def keep_only_words(s: Text): # TODO: not working yet return regex.sub(rf'\W+|\S+', '', s) def remove_extra_whitespaces(s: Text) -> Text: """
def normalize(in_data):
    """Cleans the filterlist file.

    Strips carriage returns, collapses runs of newlines, removes the
    checksum comment, and returns the cleaned text.
    """
    # str is immutable: the original discarded the results of the first two
    # re.sub calls, so \r characters and blank lines were never removed.
    in_data = re.sub(r"\r", "", in_data)
    in_data = re.sub(r"\n+", "\n", in_data)
    return re.sub(checksum_pattern, "", in_data)
def keep_only_words(s: Text):
    r"""Remove everything except word characters and whitespace.

    The previous pattern ``\W+|\S+`` matched every character (each char is
    either a non-word char or a non-space char), so the function always
    returned '' — hence the original "not working yet" TODO.
    """
    return regex.sub(r'[^\w\s]+', '', s)
def remove_punctuation_marks(s: Text) -> Text:
    """Delete all configured punctuation marks from *s*.

    >>> remove_punctuation_marks('Good morning, gentlemen! word.word:word;word? "Text"')
    'Good morning gentlemen wordwordwordword Text'
    """
    punctuation_class = rf'[{PUNCTUATION_MARKS_REGEX}]'
    return regex.sub(punctuation_class, '', s)
def format_filename(string):
    """Build a filesystem-safe name: lowercase, forbidden characters
    (<>:'"/|?.*) removed, and spaces turned into underscores."""
    lowered = string.lower()
    without_specials = re.sub(r"[<>:\'\"\/\|?.*]", "", lowered)
    return without_specials.replace(" ", "_")
data_dir = '../data' for directory in os.listdir(data_dir): if directory.endswith('txt'): # print("directory: " + directory) yield open(os.path.join(data_dir, directory), encoding='UTF-8').read() if __name__ == '__main__': b = {} for year in range(1900, 2500): b[str(year)] = {} for bill in bills(): text = regex.sub(r"[ \t\r\f\v ][ \t\r\f\v ]+", "", bill) # print(text[:400]) r = regex.match(r'\s*(Dz\.U\.\s*z\s*(?P<journal_year>\d+)\s*r\.\s*(N|n)r\s*(?P<journal_number>\d+),?\s*?poz.\s*(?P<position>\d+).?\s*)?([a-żA-Ż \d\.\(\)]*\s?){0,4}\s*(ustawa|USTAWA|U S T A W A|Ustawa|ustawA|USTAWa)[^\n]*\n[^\n]*\s*z\s*dnia\s*\d{1,2}\s*[a-żA-Ź]*\s*(?P<year>\d{4})\s*r\.\s*(?P<title>[\s\S]*?)\n\s*(Rozdział\s*(1|I)|Art.\s*(1|l)[^\d]|TYTUŁ\s*I|Dział\s*I|część\s*ogólna)', text) # r = regex.match(r'\n*(Dz\.U\.z(?P<journal_year>\d+)r\.(N|n)r(?P<journal_number>\d+),?poz.(?P<position>\d+).?)?([a-żA-Ż\d\.\(\)]*\n?){0,4}\n*(ustawa|USTAWA|Ustawa|ustawA|USTAWa)[^\n]*\n[^\n]*\n*z\n?dnia\n?\d{1,2}\n?[a-żA-Ź]*\n?(?P<year>\d{4})\n?r\.\n*(?P<title>(.+\n)*?)\n*?(?P<title2>(.+\n)*?)\n*?(Rozdział(1|I)|Art.\n?(1|l)[^\d]|TYTUŁI|DziałI|częśćogólna)', text) # print(title.group()) position = r.group("position") year = r.group("journal_year") or r.group("year") b[year][position] = {} b[year][position]["counter"] = 0 b[year][position]["title"] = r.group("title") b[year][position]["journal_number"] = r.group("journal_number") b[year][position]["journal_year"] = r.group("journal_year") b[year][position]["year"] = r.group("year") b[year][position]["position"] = position
def parse_text(self):
    """Scan the article text for sentences containing dates and build Index
    entries (token = article title, info = cleaned context window).
    """
    results = []
    # Strip <ref> footnotes, wikitable blocks and {{cite}} templates.
    self.text = regex.sub(r'<ref.*\n?.*</ref>', repl="", string=self.text)
    self.text = regex.sub(r'{\| class=\"wikitable.*\|}',
                          repl="",
                          string=self.text,
                          flags=regex.DOTALL)
    self.text = regex.sub(r'{{[cC]ite.*}}',
                          repl="",
                          string=self.text,
                          flags=regex.DOTALL)
    # Cut the article at the first trailing section marker that is present.
    if self.paragraph_splitter(sep='== See also =='):
        pass
    elif self.paragraph_splitter(sep='==Notes=='):
        pass
    elif self.paragraph_splitter(sep='==References=='):
        pass
    elif self.paragraph_splitter(sep='== Bibliography =='):
        pass
    elif self.paragraph_splitter(sep='== External links =='):
        pass
    elif self.paragraph_splitter(sep='=== Sources ==='):
        pass
    # Sentence heuristic: capital letter, at least 6 chars, terminator.
    sentences_reg = regex.finditer(
        r'(^| )[A-Z][^\.!?]{5,}[\.!?]',
        self.text)  # possibly [A-Z][^\.!?]{5,}[\.!?] for performance
    for sentence_it in sentences_reg:
        sentence = sentence_it.group(0)
        date_in_text = self.find_date(sentence)
        if date_in_text:
            # Context window: up to 60 chars before / 30 chars after the date.
            look_before = 60
            look_after = 30
            start = date_in_text.start - look_before if date_in_text.start >= look_before else 0
            end = date_in_text.end + look_after if date_in_text.end + look_after < len(
                sentence) else len(sentence)
            # if date_in_text.end + look_after > len(sentence):
            #     token = self.find_token(sentence[start:], date_in_text.start, date_in_text.end)
            # else:
            #     token = self.find_token(sentence[start:date_in_text.end + look_after], date_in_text.start, date_in_text.end)
            token_context = sentence[start:end]
            # Extend the window left to the nearest word boundary (max 8 chars).
            i = start
            counter = 0
            while True:
                i -= 1
                counter += 1
                if i < 0 or counter > 8:
                    break
                if not (sentence[i].isalpha() or sentence[i].isdigit()):
                    token_context = sentence[i + 1:start] + token_context
                    break
            # Extend the window right to the nearest word boundary (max 8 chars).
            i = end
            counter = 0
            while True:
                i += 1
                counter += 1
                if i > len(sentence) - 1 or counter > 8:
                    break
                if not (sentence[i].isalpha() or sentence[i].isdigit()):
                    token_context += sentence[end:end + counter]
                    break
            token_context = token_context.replace('\n', ' ')
            token_context = regex.sub(r'[^a-zA-Z0-9.!?:%$;, ]', '',
                                      token_context)
            token_context = token_context.strip()
            results.append(
                Index(token=self.title,
                      date=date_in_text.date,
                      info=token_context))
            # I couldnt find best word that explain the purpose, often the result was meaningful, therefore I
            # decided not to use it.
            # tokenized = nltk.pos_tag(nltk.word_tokenize(sentence))
            #
            # proper_nouns = []
            # nouns = []
            # for (word, pos) in tokenized:
            #     if pos == 'NNP':
            #         proper_nouns.append(word)
            #     elif pos == 'NN':
            #         nouns.append(word)
            #
            # results.append(Index(token=proper_nouns[0] if proper_nouns else "title", date=date_in_text.date, info=proper_nouns[1] if
            #                len(proper_nouns) > 1 else nouns[0] if nouns else ""))
    return results
def split_citation_part(string: str): """ A string a tokenizes. Tokens are identified as units or values. Pairs are built to connect the units with their respective values. If the unit cannot be indentified (and must be inferred later) None is returned. Args: string: A string that is part of a reference and cites *one* part a statute. Retruns: As a generator tuples are returned, each containing the unit (or None) and the respecive value. """ # Tokenization # fmt: off string = regex.sub( r"(" r"\d+(?>\.\d+)?[a-z]?|" r"\b[ivx]+|" r"\b[a-z]\)?" r")" r"(\sff?\.|\sff\b)", r"\1ff.", string, flags=regex.IGNORECASE, ) # fmt: on tokens = split_unit_number_pattern.split(string, ) # Building pairs of units with their resp. values while len(tokens) > 0: token = tokens.pop(0) if StatutesParser.is_unit(token): if len(tokens) > 0: unit = StatutesParser.stem_unit(token) token = tokens.pop(0) numb = token assert StatutesParser.is_numb(numb), numb else: # when citation ends with unit print( f"Citation {string} ends with unit {token}. Ignoring last unit." ) break elif StatutesParser.is_pre_numb(token): numb = token token = tokens.pop(0) if not StatutesParser.is_unit(token): print(token, "is not a unit in", string) continue # to fix citation "§ 30 DRITTER ABSCHNITT" # Last part in now ignored, # but reference areas can still be improved. unit = StatutesParser.stem_unit(token) elif StatutesParser.is_numb(token): unit = None numb = token else: raise StringCaseException(token, "in", string) numb = regex.sub(r"(ff?\.|ff|\))$", "", numb) yield [unit, numb]
def remove_extra_whitespaces(s: Text) -> Text:
    """Collapse each run of two or more whitespace characters into one space.

    Single whitespace characters (including leading/trailing ones) are
    left untouched.
    """
    multi_whitespace = r'\s{2,}'
    return regex.sub(multi_whitespace, ' ', s)
def zeroize(sample):
    """Rewrite every CoNLL line of *sample* so both tag columns read 'O O'."""
    zeroized_lines = []
    for line in sample.split('\n'):
        zeroized_lines.append(regex.sub(CONLL_LINE, r"\1 \2 O O", line))
    return "\n".join(zeroized_lines)
def clean(self, text: str) -> str:
    """Normalize whitespace and drop characters outside self.allowed_chars.

    Whitespace is collapsed both before and after filtering (removing a
    disallowed character can make two spaces adjacent), and the result is
    stripped.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    filtered = "".join(ch for ch in collapsed if ch in self.allowed_chars)
    return re.sub(r'\s+', ' ', filtered).strip()