示例#1
0
    def get_dict_law_name_len(self, test_str):
        """
        Measure the law name that test_str begins with, if there is one.

        The text is stemmed with stem_law_name and handed to
        self.match_law_name; the returned (stemmed) name is then mapped back
        onto the unstemmed text token by token.

        Returns: The number of leading characters of test_str covered by the
        law name, or 0 when there is no usable match.
        """
        # Law names are matched in their stemmed form, so stem the input too.
        stemmed_text = stem_law_name(test_str)

        match = self.match_law_name(stemmed_text)
        if not match:
            return 0

        # The match was found in the stemmed text, so its character length
        # cannot be applied to the original directly. Both strings are cut
        # into runs of word / non-word characters (apostrophes are allowed in
        # either kind of run) and the original contributes as many leading
        # tokens as the match consists of.
        token_pattern = r"[\w']+|[\W']+"
        original_tokens = regex.findall(token_pattern, test_str)
        matched_tokens = regex.findall(token_pattern, match)
        token_count = len(matched_tokens)
        match_raw = "".join(original_tokens[:token_count])
        assert len(original_tokens[0].strip()) > 0, (match, test_str,
                                                     stemmed_text)

        # The last token of the original must stem to exactly the last token
        # of the match; otherwise the word continues beyond the law name and
        # the match is discarded.
        # TODO look for other matches before returning no match
        last_original_stem = stem_law_name(original_tokens[token_count - 1])
        if matched_tokens[-1] != last_original_stem:
            return 0

        return len(match_raw)
示例#2
0
def match_regex(domains, regex_rules):
    """
    Return the domains that match at least one of the given regex rules.

    Args:
        domains: Iterable of domain strings to test.
        regex_rules: Iterable of rule strings. The first and the last
            character of every rule are stripped before the remainder is
            used as a regular expression (presumably they are delimiters
            such as "/.../" - confirm against the caller).

    Returns:
        A list with the matching domains, in input order. Empty when there
        is no usable rule.
    """
    # Rules that are empty once the delimiters are removed are dropped: an
    # empty alternative would match every domain.
    expressions = [rule[1:-1] for rule in regex_rules]
    expressions = [expression for expression in expressions if expression]
    if not expressions:
        # "|".join([]) is the empty pattern, which matches everything; no
        # rules must mean no matches.
        return []

    pattern = re.compile("|".join(expressions))
    # Only the existence of a match matters, so search() is enough.
    # NOTE(review): `concurrent` is a keyword of the third-party `regex`
    # module; presumably `re` is bound to it in this file - confirm.
    return [
        domain for domain in domains
        if pattern.search(domain, concurrent=True)
    ]
示例#3
0
def _extract_time(video_id, content):
    """
    Return the last timestamp that occurs in content.

    video_id is only used as the prefix of the log message. Raises
    IndexError when content holds no timestamp at all.
    """
    logger.info("Extract time of the video", prefix=f"{video_id} >> ")

    # The last timestamp only approximates the length of the video.
    # TODO: Find a way to get the real video length
    #       without having to download the video clip
    timestamp_pattern = r"\d{2}:\d{2}:\d{2}.\d{3}"
    timestamps = regex.findall(timestamp_pattern, content, overlapped=True)
    last_timestamp = timestamps[-1]

    logger.debug(f"Extracted time of {last_timestamp}")
    return last_timestamp
示例#4
0
def _extract_timestamps(video_id, content, word_to_extract):
    """
    Collect the timed words in content that match word_to_extract.

    content is scanned for "<start>word<end>" triples. Each word is
    lower-cased and stripped, and kept when word_to_extract - used as a
    regular expression - matches at its beginning.

    Returns a list of (start, word, end) tuples; video_id only serves as the
    prefix of the log message.
    """
    logger.info(
        f"Extract timestamps where the word {word_to_extract} is pronounced",
        prefix=f"{video_id} >> ")

    pattern = r"<(\d{2}:\d{2}:\d{2}.\d{3})>([^<]+)<(\d{2}:\d{2}:\d{2}.\d{3})>"
    hits = []
    # overlapped=True lets the closing timestamp of one word act as the
    # opening timestamp of the next one.
    for start, word, end in regex.findall(pattern, content, overlapped=True):
        spoken = word.lower().strip()
        if regex.match(word_to_extract, spoken):
            hits.append((start, spoken, end))

    logger.debug(f"Extracted {len(hits)} words")
    return hits
def get_hashtags(text: str):
    """
    Locate every hashtag in text.

    A hashtag is a number sign - ASCII '#' or the fullwidth form U+FF03 -
    followed by a word character that is not a digit and then any number of
    word characters.

    Returns:
        A dict items view of (start index, hashtag) pairs.
    """
    # Inside a character class '|' is a literal pipe, not an alternation, so
    # the class lists just the two marker characters. Otherwise "|word" would
    # be reported as a hashtag.
    hashtags = sorted(regex.findall(r'[#\uFF03][^\d\W]\w*', text), key=len)

    idx_hashtag_map = {}

    # Shortest first: a short hashtag is also found as the prefix of a longer
    # one, and the longer hashtag handled later overwrites that entry.
    for hashtag in hashtags:
        for occurrence in regex.finditer(regex.escape(hashtag), text):
            idx_hashtag_map[occurrence.start(0)] = hashtag

    return idx_hashtag_map.items()
def get_mentions(text: str):
    """
    Locate every mention in text.

    A mention is an at sign - ASCII '@' or the fullwidth form U+FF20 -
    followed by a word character that is not a digit and then any number of
    word characters.

    Returns:
        A dict items view of (start index, mention) pairs.
    """
    # Inside a character class '|' is a literal pipe, not an alternation, so
    # the class lists just the two marker characters. Otherwise "|word" would
    # be reported as a mention.
    mentions = sorted(regex.findall(r'[@\uFF20][^\d\W]\w*', text), key=len)

    idx_mention_map = {}

    # Shortest first: a short mention is also found as the prefix of a longer
    # one, and the longer mention handled later overwrites that entry.
    for mention in mentions:
        for occurrence in regex.finditer(regex.escape(mention), text):
            idx_mention_map[occurrence.start(0)] = mention

    return idx_mention_map.items()
示例#7
0
def replace_hashtag_with_link(text: str, hashtag_entities=None):
    """
    Replace the hashtags in text with named links.

    Args:
        text: The text to rewrite.
        hashtag_entities: Optional iterable of mappings that carry 'indices'
            (a start / end pair into text) and 'text' (the hashtag without
            its marker character). When it is None the hashtags are detected
            in text with a regular expression instead.

    Returns:
        The text with every hashtag replaced by
        get_named_link(hashtag, get_hashtag_url(hashtag)).
    """
    if hashtag_entities is not None:
        # Go through the entities back to front so that the indices of the
        # ones still to be handled are not shifted by earlier replacements.
        entities_from_end = sorted(hashtag_entities,
                                   key=lambda entity: entity['indices'][0],
                                   reverse=True)
        for entity in entities_from_end:
            start, end = entity['indices']

            # Reuse the marker character that is actually present in the
            # text instead of assuming a plain ASCII '#'.
            hashtag_text = text[start] + entity['text']
            link = get_named_link(hashtag_text, get_hashtag_url(hashtag_text))
            text = text[:start] + link + text[end:]
        return text

    def _link_for(match):
        hashtag = match.group(0)
        return get_named_link(hashtag, get_hashtag_url(hashtag))

    # A single pass with a callable replacement: every hashtag is replaced
    # exactly once at its own position. Generated links are never scanned
    # again (so repeated or prefix-sharing hashtags cannot be rewritten
    # inside a link), and the link text is inserted verbatim rather than
    # being parsed as a replacement template.
    # '|' is a literal inside a character class, so it is not listed.
    return regex.sub(r'[#\uFF03][^\d\W]\w*', _link_for, text)
示例#8
0
        b[year][position]["year"] = r.group("year")
        b[year][position]["position"] = position

    counter = 0


    current_year = 0
    current_text = ''
    current_number = ''
    current_number_text = ''
    errors = 0

    for bill in bills():
        text = regex.sub(r"[ \t\r\f\v\n ]*", "", bill)
        r = regex.findall(
            r'(.{30}(?P<year>\d{4})r\.?.{30}|.{30}poz\.(?P<position>\d+).{30}|.{30}(N|n)r(?P<number>\d+).{30})',
            text, overlapped=True)

        for match in r:
            if match[2] != '':
                position = match[2]
                try:
                    b[current_year][position]["counter"] += 1
                    if current_number != b[current_year][position]["journal_number"] and int(current_year) < 2012:
                        # print("error")
                        # print(match)
                        # print(current_year)
                        # print(current_text)
                        # print(position)
                        # print(b[current_year][position]["journal_number"])
                        b[current_year][position]["journal_number"] = current_number
示例#9
0
def tokenize(input_text: str) -> list[str]:
    """
    Yield the BPE tokens of input_text one at a time.

    The text is cut into pieces with SPLITTING_REGEX. Every piece is encoded
    as UTF-8, each byte is mapped through TOKEN_UNICODEIFIER, and the joined
    result is passed to break_token, whose output is split on single spaces.

    NOTE(review): the function is a generator although it is annotated as
    returning list[str].
    """
    for piece in re.findall(SPLITTING_REGEX, input_text):
        mapped = "".join(
            TOKEN_UNICODEIFIER[byte] for byte in piece.encode("utf-8"))
        yield from break_token(mapped).split(" ")
示例#10
0
def _tally_reference(reference, current_article):
    """
    Count the article and paragraph numbers named in one reference string.

    The counts are added to the module-level article_references (keyed by
    article number) and bill_references (keyed by (article, paragraph)).
    NOTE(review): both are presumably Counter / defaultdict(int) objects -
    confirm where they are defined.

    Args:
        reference: Text of a single reference.
        current_article: Article that 'ust.' / '§' numbers belong to until
            an 'art.' number inside the reference replaces it.

    Returns:
        The list of tokens the reference was split into.
    """
    current_choice = None
    current_number = 0

    tokens = regex.findall(r'(\d+|-\d+|art\.|ust\.|§|pkt)', reference)
    for token in tokens:
        if token == 'art.':
            current_choice = 'art'
        elif token in ('ust.', '§'):
            current_choice = 'ust'
        elif token.startswith('-'):
            # "-N" closes a range that started at the previous number, which
            # has already been counted, so counting resumes one above it.
            if is_int(token[1:]):
                range_end = int(token[1:])
                if current_choice == 'art':
                    for number in range(current_number + 1, range_end + 1):
                        article_references[number] += 1
                elif current_choice == 'ust':
                    for number in range(current_number + 1, range_end + 1):
                        bill_references[(current_article, number)] += 1
        elif token == 'pkt':
            # Numbers that follow 'pkt' are remembered but not counted.
            current_choice = 'pkt'
        elif is_int(token):
            current_number = int(token)
            if current_choice == 'art':
                current_article = current_number
                article_references[current_article] += 1
            elif current_choice == 'ust':
                bill_references[(current_article, current_number)] += 1

    return tokens


def process_bill(article_number, bill_number, bill_text):
    """
    Count the references to articles and paragraphs found in bill_text.

    Nothing is counted when bill_text contains one of the phrases
    'wprowadza się następujące zmiany', 'dodaje się art' or 'racą moc'.
    Otherwise two passes are made:

    1. References that start with 'art.' are counted with the article
       taken from the reference itself.
    2. References built around 'ust.' that contain no 'art.' are counted
       against article_number; these are also printed.

    Args:
        article_number: Article used for references that name none.
        bill_number: Not used; kept for interface compatibility.
        bill_text: Text to scan.

    Returns:
        None. The results are accumulated in the module-level
        article_references and bill_references.
    """
    skip_phrases = ('wprowadza się następujące zmiany', 'dodaje się art',
                    'racą moc')
    if any(phrase in bill_text for phrase in skip_phrases):
        return

    # Repeated run of numbers, ranges, unit markers and connecting words that
    # may follow (or surround) the anchor of a reference.
    tail = (r'(\d+\w?(-\s*\d+\w?)?(?!\))|pkt|ust\.|§|\si\s|oraz|,|\s'
            r'|z\szastrzeżeniem)*')

    # `multiline` is not a parameter of regex.findall: it is taken for a
    # named-list keyword, and current releases of `regex` reject unused ones
    # with a ValueError. The patterns contain no ^ / $ anchors, so no flag is
    # needed in its place.
    article_led = regex.findall(r'(art\.' + tail + ')', bill_text)
    for groups in article_led:
        _tally_reference(groups[0], 0)

    paragraph_led = regex.findall(
        r'((art\.)?' + tail + r'ust\.\s*\d+' + tail + ')', bill_text)
    for groups in paragraph_led:
        reference = groups[0]
        if 'art.' in reference:
            # Already handled by the first pass.
            continue

        # Diagnostic output, kept as in the original implementation.
        print(reference)
        print(article_number)
        tokens = _tally_reference(reference, article_number)
        print(tokens)
        print("\n\n")