def get_dict_law_name_len(self, test_str):
    """
    Check whether ``test_str`` begins with a law name known to
    ``self.match_law_name``.

    The lookup runs on the stemmed text; the matched span is then mapped back
    onto the original text by counting tokens.

    Returns:
        Length (in characters of the original text) of the matched law name,
        or 0 when nothing matches.
    """
    token_pattern = r"[\w']+|[\W']+"

    # Law names are stored stemmed, so the candidate text is stemmed as well.
    stemmed_text = stem_law_name(test_str)

    matched_name = self.match_law_name(stemmed_text)
    if not matched_name:
        return 0

    # Map the match back onto the original text: split both strings into
    # alternating word / non-word tokens and take as many tokens from the
    # original as the stemmed match consists of.
    original_tokens = regex.findall(token_pattern, test_str)
    matched_tokens = regex.findall(token_pattern, matched_name)
    token_count = len(matched_tokens)

    assert len(original_tokens[0].strip()) > 0, (matched_name, test_str, stemmed_text)

    # The last word of the match must equal the stem of the corresponding
    # original word; otherwise the original word continues past the match and
    # the hit is rejected.
    # TODO look for other matches before returning no match
    stemmed_last_original = stem_law_name(original_tokens[token_count - 1])
    if matched_tokens[-1] != stemmed_last_original:
        return 0

    return len("".join(original_tokens[:token_count]))
def match_regex(domains, regex_rules):
    """Return the domains that match at least one of the regex rules.

    Args:
        domains: Iterable of domain strings to test.
        regex_rules: Iterable of rule strings. The first and last character
            of every rule are dropped before compiling (presumably
            surrounding delimiters such as ``/.../`` -- confirm with caller).

    Returns:
        List of the domains, in input order, in which the combined pattern is
        found anywhere. Empty when ``regex_rules`` is empty.
    """
    regex_list = [rule[1:-1] for rule in regex_rules]

    # Joining zero rules yields the empty pattern, which matches every
    # string; "no rules" must mean "no matches" instead.
    if not regex_list:
        return []

    pattern = re.compile("|".join(regex_list))

    # search() stops at the first hit; only the existence of a match matters.
    return [domain for domain in domains if pattern.search(domain, concurrent=True)]
def _extract_time(video_id, content):
    """Return the last ``HH:MM:SS.mmm`` timestamp found in ``content``.

    Args:
        video_id: Identifier used only as the log prefix.
        content: Text to scan for timestamps.

    Returns:
        The last timestamp string occurring in ``content``.

    Raises:
        IndexError: If ``content`` contains no timestamp.
    """
    logger.info("Extract time of the video", prefix=f"{video_id} >> ")

    # This is an approximation of the video length based on the last timestamp
    # TODO: Find a way to get the real video length
    # without having to download the video clip
    # The dot before the milliseconds is escaped so that only a literal "."
    # is accepted there.
    pattern = r"\d{2}:\d{2}:\d{2}\.\d{3}"
    res = regex.findall(pattern, content, overlapped=True)[-1]

    logger.debug(f"Extracted time of {res}")
    return res
def _extract_timestamps(video_id, content, word_to_extract):
    """Collect the time spans of every spoken word matching ``word_to_extract``.

    ``content`` is scanned for ``<start>word<end>`` triples whose timestamps
    have the form ``HH:MM:SS.mmm``. Overlapping matches are enabled because
    the closing timestamp of one word can be the opening timestamp of the
    next.

    Args:
        video_id: Identifier used only as the log prefix.
        content: Text containing the timestamped words.
        word_to_extract: Regular expression applied with ``regex.match`` to
            the lower-cased, stripped word, i.e. anchored at the start of the
            word but not at its end.

    Returns:
        List of ``(start, word, end)`` tuples, where ``word`` is lower-cased
        and stripped.
    """
    logger.info(
        f"Extract timestamps where the word {word_to_extract} is pronounced",
        prefix=f"{video_id} >> ")

    # Dots before the milliseconds are escaped so only a literal "." matches.
    pattern = r"<(\d{2}:\d{2}:\d{2}\.\d{3})>([^<]+)<(\d{2}:\d{2}:\d{2}\.\d{3})>"

    res = []
    for start, word, end in regex.findall(pattern, content, overlapped=True):
        spoken = word.lower().strip()
        if regex.match(word_to_extract, spoken):
            res.append((start, spoken, end))

    logger.debug(f"Extracted {len(res)} words")
    return res
def get_hashtags(text: str):
    """Map every hashtag position in ``text`` to the hashtag found there.

    A hashtag is ``#`` (or the full-width ``\uFF03``) followed by a word
    character that is not a digit and then any number of word characters.

    Args:
        text: Text to scan.

    Returns:
        A ``dict`` items view of ``(start_index, hashtag)`` pairs. When one
        hashtag is a prefix of another at the same position, the longer one
        is reported.
    """
    # NOTE: the previous class "[#|#]" also contained a literal "|", so
    # "|word" was reported as a hashtag.
    found = regex.findall(r'[#\uFF03][^\d\W]\w*', text)

    # Shortest first, so a longer hashtag starting at the same index
    # overwrites its own prefix. dict.fromkeys drops repeats while keeping
    # first-seen order, so each distinct hashtag is searched only once.
    idx_hashtag_map = {}
    for hashtag in sorted(dict.fromkeys(found), key=len):
        for occurrence in regex.finditer(regex.escape(hashtag), text):
            idx_hashtag_map[occurrence.start(0)] = hashtag
    return idx_hashtag_map.items()
def get_mentions(text: str):
    """Map every mention position in ``text`` to the mention found there.

    A mention is ``@`` (or the full-width ``\uFF20``) followed by a word
    character that is not a digit and then any number of word characters.

    Args:
        text: Text to scan.

    Returns:
        A ``dict`` items view of ``(start_index, mention)`` pairs. When one
        mention is a prefix of another at the same position, the longer one
        is reported.
    """
    # NOTE: the previous class "[@|@]" also contained a literal "|", so
    # "|word" was reported as a mention.
    found = regex.findall(r'[@\uFF20][^\d\W]\w*', text)

    # Shortest first, so a longer mention starting at the same index
    # overwrites its own prefix. dict.fromkeys drops repeats while keeping
    # first-seen order, so each distinct mention is searched only once.
    idx_mention_map = {}
    for mention in sorted(dict.fromkeys(found), key=len):
        for occurrence in regex.finditer(regex.escape(mention), text):
            idx_mention_map[occurrence.start(0)] = mention
    return idx_mention_map.items()
def replace_hashtag_with_link(text: str, hashtag_entities=None):
    """Replace every hashtag in ``text`` with a named link.

    Args:
        text: Text containing the hashtags.
        hashtag_entities: Optional iterable of dicts with an ``'indices'``
            pair ``(start, end)`` into ``text`` and a ``'text'`` entry holding
            the hashtag without its leading hash character. When ``None``,
            hashtags are located with a regular expression instead.

    Returns:
        ``text`` with each hashtag replaced by
        ``get_named_link(hashtag, get_hashtag_url(hashtag))``.
    """
    if hashtag_entities is not None:
        # Replace from the end of the text backwards so the indices of the
        # entities still to be processed stay valid.
        by_start_desc = sorted(
            hashtag_entities, key=lambda x: x['indices'][0], reverse=True)
        for hashtag in by_start_desc:
            start, end = hashtag['indices']
            # text[start] is the hash character actually used in the text,
            # so the original character is preserved in the link name.
            hashtag_text = text[start] + hashtag['text']
            text = (text[:start]
                    + get_named_link(hashtag_text, get_hashtag_url(hashtag_text))
                    + text[end:])
        return text

    def _link(found):
        tag = found.group(0)
        return get_named_link(tag, get_hashtag_url(tag))

    # One pass with a callable replacement: every hashtag is linked exactly
    # once at its own position (no re-linking inside links that were already
    # generated, no partial replacement of longer hashtags sharing a prefix),
    # and the link text is inserted verbatim rather than being interpreted as
    # a replacement template. The class accepts "#" and the full-width
    # "\uFF03"; the former "[#|#]" also matched a literal "|".
    return regex.sub(r'[#\uFF03][^\d\W]\w*', _link, text)
b[year][position]["year"] = r.group("year") b[year][position]["position"] = position counter = 0 current_year = 0 current_text = '' current_number = '' current_number_text = '' errors = 0 for bill in bills(): text = regex.sub(r"[ \t\r\f\v\n ]*", "", bill) r = regex.findall( r'(.{30}(?P<year>\d{4})r\.?.{30}|.{30}poz\.(?P<position>\d+).{30}|.{30}(N|n)r(?P<number>\d+).{30})', text, overlapped=True) for match in r: if match[2] != '': position = match[2] try: b[current_year][position]["counter"] += 1 if current_number != b[current_year][position]["journal_number"] and int(current_year) < 2012: # print("error") # print(match) # print(current_year) # print(current_text) # print(position) # print(b[current_year][position]["journal_number"]) b[current_year][position]["journal_number"] = current_number
def tokenize(input_text: str) -> list[str]:
    """Yield the BPE tokens of ``input_text`` one at a time.

    The text is split with ``SPLITTING_REGEX``; each piece is UTF-8 encoded,
    every byte is looked up in ``TOKEN_UNICODEIFIER`` and the results are
    concatenated; ``break_token`` then returns a space-separated string whose
    parts are yielded in order.

    Note that this is a generator function, so callers receive an iterator.
    """
    for raw_piece in re.findall(SPLITTING_REGEX, input_text):
        mapped_chars = [TOKEN_UNICODEIFIER[byte] for byte in raw_piece.encode("utf-8")]
        merged = break_token("".join(mapped_chars))
        for bpe_token in merged.split(" "):
            yield bpe_token
def _tally_reference(reference, default_article):
    """Walk one reference string and update the reference counters.

    The string is split into numbers, ``-N`` range ends and the markers
    ``art.``, ``ust.``, ``§`` and ``pkt``. A marker selects what the following
    numbers count towards: ``art.`` increments ``article_references``,
    ``ust.``/``§`` increment ``bill_references`` keyed by
    ``(current article, number)``, and ``pkt`` numbers are not counted.
    A ``-N`` token counts every number after the last seen one up to ``N``.

    Args:
        reference: Text of a single matched reference.
        default_article: Article used for ``ust.`` numbers until an ``art.``
            number is seen in this reference.

    Returns:
        The list of tokens extracted from ``reference``.
    """
    tokens = regex.findall(r'(\d+|-\d+|art\.|ust\.|§|pkt)', reference)
    current_choice = None
    current_number = 0
    current_article = default_article
    for token in tokens:
        if token == 'art.':
            current_choice = 'art'
        elif token == 'ust.' or token == '§':
            current_choice = 'ust'
        elif token.startswith('-'):
            # End of a range such as "3-5": count everything after the last
            # plain number up to and including the range end.
            if is_int(token[1:]):
                range_end = int(token[1:])
                if current_choice == 'art':
                    for it in range(current_number + 1, range_end + 1):
                        article_references[it] += 1
                elif current_choice == 'ust':
                    for it in range(current_number + 1, range_end + 1):
                        bill_references[(current_article, it)] += 1
        elif token == 'pkt':
            current_choice = 'pkt'
        elif is_int(token):
            current_number = int(token)
            if current_choice == 'art':
                current_article = current_number
                article_references[current_article] += 1
            elif current_choice == 'ust':
                bill_references[(current_article, current_number)] += 1
    return tokens


def process_bill(article_number, bill_number, bill_text):
    """Count the article and paragraph references contained in ``bill_text``.

    Nothing is counted when the text contains one of three fixed phrases
    (presumably wording that marks amending provisions -- confirm).

    Two passes are made over the text:

    1. References starting with ``art.`` are tallied into
       ``article_references`` and ``bill_references``.
    2. References containing ``ust. N`` but no ``art.`` are tallied into
       ``bill_references`` against ``article_number``; these references and
       their tokens are also printed.

    Args:
        article_number: Article that article-less ``ust.`` references are
            attributed to.
        bill_number: Not read; kept so existing callers keep working.
        bill_text: Text to scan.

    Returns:
        None. The module-level counters are updated in place.
    """
    skip_phrases = (
        'wprowadza się następujące zmiany',
        'dodaje się art',
        'racą moc',
    )
    if any(phrase in bill_text for phrase in skip_phrases):
        return

    # No ``multiline=True`` here: it is not a flag argument of
    # ``regex.findall`` (extra keywords are treated as named lists, and
    # unused ones are rejected), and the patterns contain no ^/$ anchors.
    article_refs = regex.findall(
        r'(art\.(\d+\w?(-\s*\d+\w?)?(?!\))|pkt|ust\.|§|\si\s|oraz|,|\s|z\szastrzeżeniem)*)',
        bill_text)
    for groups in article_refs:
        # Group 0 is the whole reference; articles start uncounted at 0.
        _tally_reference(groups[0], 0)

    paragraph_refs = regex.findall(
        r'((art\.)?(\d+\w?(-\s*\d+\w?)?(?!\))|pkt|ust\.|§|\si\s|oraz|,|\s|z\szastrzeżeniem)*ust\.\s*\d+(\d+\w?(-\s*\d+\w?)?(?!\))|pkt|ust\.|§|\si\s|oraz|,|\s|z\szastrzeżeniem)*)',
        bill_text)
    for groups in paragraph_refs:
        reference = groups[0]
        # References naming an article were already handled in the first pass.
        if 'art.' in reference:
            continue
        print(reference)
        print(article_number)
        tokens = _tally_reference(reference, article_number)
        print(tokens)
        print("\n\n")