Example No. 1
    def __call__(self, i_url):
        for url, m in i_url:
            if url.startswith('http') and regex.match(self.http_regex, url):
                path = f"{config.hidden_folder}/pdfs/{urllib.parse.quote_plus(url)}.pdf"
                path = path.replace("(", "")
                path = path.replace(")", "")


                if not os.path.exists(path):
                    os.system(f"chromium  --headless \
                                          --disable-gpu \
                                          --disable-translate \
                                          --disable-extensions \
                                          --disable-background-networking \
                                          --safebrowsing-disable-auto-update \
                                          --disable-sync \
                                          --metrics-recording-only \
                                          --disable-default-apps \
                                          --no-first-run \
                                          --mute-audio \
                                          --hide-scrollbars \
                                          --disable-software-rasterizer "
                                        f"--print-to-pdf={path} {url}")
                yield path, m
            elif os.path.exists(url) and regex.match(self.file_regex, url) is not None:
                yield url, m
            else:
                logging.error(f"{url} is not a valid url/path")
Example No. 2
    def get_suffix_and_law_name(self, string: str):
        """
        Returns: A tuple containing

            1. the length of the article between the numbers and the law name (e.g. " der "),
            2. the length of the law name as it appears in the given string,
            3. the type of the reference.

            If nothing is found, the lengths are 0.
        """
        suffix_match = regex.match(r"^,?\s+?de[sr]\s+", string)

        if suffix_match:

            suffix_len = suffix_match.end()
            law_test = string[suffix_len:suffix_len + 1000]

            dict_suffix_len = self.get_dict_law_name_len(law_test)
            if dict_suffix_len:
                return suffix_len, dict_suffix_len, "dict"

            sgb_suffix_len = self.get_sgb_law_name_len(law_test)
            if sgb_suffix_len:
                return suffix_len, sgb_suffix_len, "sgb"

            eu_suffix_len = self.get_eu_law_name_len(law_test)
            if eu_suffix_len:
                return suffix_len, eu_suffix_len, "eu"

            ignore_suffix_len = self.get_ignore_law_name_len(law_test)
            if ignore_suffix_len:
                return suffix_len, ignore_suffix_len, "ignore"

            return suffix_len, 0, "unknown"

        else:  # no der/des suffix
            suffix_match = regex.match(r"^[\s\n]+", string[:1000])
            if suffix_match:
                suffix_len = len(suffix_match[0])
                law_test = string[suffix_len:1000]

                dict_suffix_len = self.get_dict_law_name_len(law_test)
                if dict_suffix_len:
                    return suffix_len, dict_suffix_len, "dict"

                sgb_suffix_len = self.get_sgb_law_name_len(law_test)
                if sgb_suffix_len:
                    return suffix_len, sgb_suffix_len, "sgb"

                ignore_no_suffix_len = self.get_no_suffix_ignore_law_name_len(
                    law_test)
                if ignore_no_suffix_len:
                    return suffix_len, ignore_no_suffix_len, "ignore"

            return 0, 0, "internal"
Example No. 3
def extract_abp(content):
    """Extracts blocked and unblocked domains from ABP style content."""
    # Requires the third-party "regex" package imported as re: re.V1 and
    # concurrent=True are not available in the standard-library re module.
    pattern_unsupported = re.compile(r"\S+(?>\/|\=)\S+", re.V1)
    pattern_supported_block = re.compile(
        r"^\|\|.+\^(?>$|.+(?:"
        r"\bfirst-party\b|"
        r"\b1p\b|"
        r"\bthird-party\b|"
        r"\b3p\b|"
        r"\bdocument\b|"
        r"\ball\b"
        # r"\ball\b|"
        # r"\bpopup\b"
        r"))",
        re.V1,
    )
    pattern_scrub_blocked_list = [
        r"^\|\|",
        r"\^($|.+(?>"
        r"\bfirst-party\b|"
        r"\b1p\b|"
        r"\bthird-party\b|"
        r"\b3p\b|\bdocument\b|"
        r"\ball\b|"
        r"\bpopup\b|"
        r"\S+))",
    ]
    pattern_scrub_blocked = re.compile(
        "|".join(f"(?:{p})" for p in pattern_scrub_blocked_list), re.V1
    )
    block_rules = [
        x
        for x in content
        if re.match(pattern_supported_block, x, concurrent=True)
        and not re.match(pattern_unsupported, x, concurrent=True)
    ]

    blocked_domains = [
        re.sub(pattern_scrub_blocked, "", x, concurrent=True) for x in block_rules
    ]
    blocked_domains = [x for x in blocked_domains if valid_domain(x)]
    pattern_supported_unblock = re.compile(r"@@\|\|.+\^$")
    unblock_rules = [
        x
        for x in content
        if re.match(pattern_supported_unblock, x, concurrent=True)
        and not re.match(pattern_unsupported, x, concurrent=True)
    ]
    unblocked_domains = [
        x.replace("@@||", "").replace("^", "").replace("$important", "")
        for x in unblock_rules
    ]
    regex_rules = []
    return blocked_domains, unblocked_domains, unblock_rules, regex_rules
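A minimal usage sketch, not part of the original source: it assumes the module above imports the third-party regex package as re, and it stubs a hypothetical valid_domain() helper in place of whatever validator the original project provides. Run it below the extract_abp definition.

import regex as re  # assumption: the snippet's "re" is the third-party regex module

def valid_domain(domain):  # hypothetical stand-in for the project's own validator
    return re.match(r"^(?:[a-z0-9-]+\.)+[a-z]{2,}$", domain) is not None

content = [
    "||ads.example.com^",      # supported block rule
    "@@||cdn.example.com^",    # supported unblock rule
    "example.com/page?q=1",    # has a path/query, so it is not extracted
]
blocked, unblocked, unblock_rules, regex_rules = extract_abp(content)
# blocked   -> ["ads.example.com"]
# unblocked -> ["cdn.example.com"]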
Example No. 4
def find_references(soup, pattern, attrs):
    """
    Finds the references in the soup and marks each of them with a <reference> tag.
    """
    logs = []  # For debug

    text_tags = list(soup.find_all("text"))
    for text_tag in text_tags:
        for text_tag_string in list(text_tag.contents):
            if type(text_tag_string) is not bs4.element.NavigableString:
                continue
            tag_cursor = text_tag_string
            last_match_end = 0
            matches = pattern.finditer(text_tag_string)
            for match in list(matches):
                if regex.match(r"\s?,?of\b", text_tag_string[match.end():]):
                    continue
                ref_tag = soup.new_tag("reference", **attrs)
                pre_text, ref_tag, post_text = add_tag(text_tag_string,
                                                       match.start(),
                                                       match.end(), ref_tag)

                pre_text = pre_text[last_match_end:]
                last_match_end = match.end()

                tag_cursor.replace_with(ref_tag)
                ref_tag.insert_before(pre_text)
                ref_tag.insert_after(post_text)
                tag_cursor = post_text

                logs.append(f"{post_text[:50]} --- {match[0]}")  # For debug

    return logs  # For debug
Example No. 5
def worker_unmatched_item(item, pattern):
    """Worker for remove_redundant via ThreadPoolExecutor
    to get unmatched subdomains from subdomains.
    """
    if not re.match(pattern, item, concurrent=True):
        return item
    return None
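A hedged usage sketch, not from the source: the pattern and subdomain list are illustrative, and it assumes the surrounding module imports the third-party regex package as re so that concurrent=True is accepted. Run it below the worker_unmatched_item definition.

from concurrent.futures import ThreadPoolExecutor
from functools import partial

import regex as re  # assumption: the snippet's "re" is the third-party regex module

subdomains = ["a.example.com", "b.example.com", "other.org"]
# Hypothetical pattern covering subdomains already implied by a kept parent domain.
pattern = re.compile(r"^\S+\.example\.com$")

with ThreadPoolExecutor() as executor:
    results = executor.map(partial(worker_unmatched_item, pattern=pattern), subdomains)

unmatched = [x for x in results if x is not None]
# unmatched -> ["other.org"]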
Example No. 6
    async def show(self, ctx, url):
        result = regex.match(self.URL_REGEX, url)
        if result:
            url = result.group(0)
        else:
            raise commands.BadArgument("Invalid Instagram URL.")

        await self.show_media(ctx, url)
Example No. 7
def extract_rules(content):
    pattern_supported_block = re.compile(r"^\|\|.+(\^|\^\$important)$")
    block_rules = [
        x for x in content
        if re.match(pattern_supported_block, x, concurrent=True)
    ]
    pattern_supported_unblock = re.compile(r"^@@.+(\^(\$important)?|\/)$")
    unblock_rules = [
        x for x in content
        if re.match(pattern_supported_unblock, x, concurrent=True)
    ]
    pattern_supported_regex = re.compile(r"^\/.*\/$")
    regex_rules = [
        x for x in content
        if re.match(pattern_supported_regex, x, concurrent=True)
    ]
    return block_rules, unblock_rules, regex_rules
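A short, hedged usage sketch, not from the source; the sample rules are made up and it assumes the module imports the third-party regex package as re. Run it below the extract_rules definition.

import regex as re  # assumption: the snippet's "re" is the third-party regex module

content = [
    "||ads.example.com^",            # block rule
    "||ads.example.com^$important",  # block rule with the $important modifier
    "@@||cdn.example.com^",          # unblock rule
    r"/banner\d+/",                  # regex rule
    "! comment line",                # matches none of the patterns
]
block_rules, unblock_rules, regex_rules = extract_rules(content)
# block_rules   -> ["||ads.example.com^", "||ads.example.com^$important"]
# unblock_rules -> ["@@||cdn.example.com^"]
# regex_rules   -> [r"/banner\d+/"]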
Example No. 8
def extract_regex(content):
    """
    Extracts regex rules enclosed between two '/' characters.
    """
    pattern_if_regexp = re.compile(r"^\/.*\/$", re.V1)
    regex_rules = [
        x for x in content if re.match(pattern_if_regexp, x, concurrent=True)
    ]
    return regex_rules
Example No. 9
def check_boxes(text_box: str, pattern: str):
    """
    Used for validating text against a regex pattern.

    :param text_box: Plain text to validate.
    :param pattern: Regex pattern for validation.

    :return: A truthy match object if text_box is valid, otherwise None.
    """
    return regex.match(pattern, text_box)
Example No. 10
    def is_valid(self) -> bool:
        """
        :return: Whether this variable's properties are valid for use with the Terraform API.
        """

        key_pattern = r"^[a-zA-Z0-9_-]+$"
        key_valid = regex.match(key_pattern, self.key) is not None
        category_valid = self.category in ["terraform", "env"]
        return key_valid and category_valid
Example No. 11
def _extract_timestamps(video_id, content, word_to_extract):
    logger.info(
        f"Extract timestamps where the word {word_to_extract} is pronounced",
        prefix=f"{video_id} >> ")

    pattern = r"<(\d{2}:\d{2}:\d{2}.\d{3})>([^<]+)<(\d{2}:\d{2}:\d{2}.\d{3})>"
    res = [(start, word.lower().strip(), end) for start, word, end in
           regex.findall(pattern, content, overlapped=True)
           if regex.match(word_to_extract,
                          word.lower().strip())]
    logger.debug(f"Extracted {len(res)} words")
    return res
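A small illustrative sketch, not from the source, of why overlapped=True matters here: in word-level caption timing, consecutive words share a boundary timestamp, so non-overlapping matching would drop every second word. The sample content string is made up.

import regex

pattern = r"<(\d{2}:\d{2}:\d{2}.\d{3})>([^<]+)<(\d{2}:\d{2}:\d{2}.\d{3})>"
content = "<00:00:01.000>hello <00:00:01.500>world <00:00:02.000>"

print(regex.findall(pattern, content))
# [('00:00:01.000', 'hello ', '00:00:01.500')]  -- "world" would be skipped

print(regex.findall(pattern, content, overlapped=True))
# [('00:00:01.000', 'hello ', '00:00:01.500'),
#  ('00:00:01.500', 'world ', '00:00:02.000')]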
Example No. 12
def concat_category(out_file):
    """Concatenate category README.md files"""
    files = glob(f"{DirPath.input}/*/*.md")
    files = sorted(files, key=lambda x: x)
    files = sorted(files, key=lambda x: x.__contains__("regional"))
    files = sorted(files, key=lambda x: x.__contains__("main"), reverse=True)
    for file in files:
        with open(file, encoding="utf-8") as file_input:
            with open(out_file, "a", encoding="utf-8") as file_output:
                # Demote markdown headings one level ("# ..." -> "## ...") before concatenating.
                lines = (re.sub(r"^#", r"##", x)
                         if re.match(r"^#{0,6}+\s", x) else x
                         for x in file_input)
                file_output.writelines(lines)
Example No. 13
def bills():
    data_dir = '../../lab1/data'
    for directory in os.listdir(data_dir):
        if directory.endswith('txt'):
            # print("directory: " + directory)

            bill = open(os.path.join(data_dir, directory), encoding='UTF-8').read()
            text = regex.sub(r"[ \t\r\f\v ][ \t\r\f\v ]+", "", bill)
            # print(text[:400])

            # Pull journal metadata (year, number, position) and the act's title
            # out of the bill header.
            r = regex.match(
                r'\s*(Dz\.U\.\s*z\s*(?P<journal_year>\d+)\s*r\.\s*(N|n)r\s*(?P<journal_number>\d+),?\s*?poz.\s*(?P<position>\d+).?\s*)?([a-żA-Ż \d\.\(\)]*\s?){0,4}\s*(ustawa|USTAWA|U S T A W A|Ustawa|ustawA|USTAWa)[^\n]*\n[^\n]*\s*z\s*dnia\s*\d{1,2}\s*[a-żA-Ź]*\s*(?P<year>\d{4})\s*r\.\s*(?P<title>[\s\S]*?)\n\s*(Rozdział\s*(1|I)|Art.\s*(1|l)[^\d]|TYTUŁ\s*I|Dział\s*I|część\s*ogólna)',
                text)

            if r is None:
                yield bill, "", "", "", "f"
            else:
                yield bill, r.group("title"), r.group("journal_year"), r.group("position"), directory.split('.')[0]
Example No. 14
            # print("directory: " + directory)
            yield open(os.path.join(data_dir, directory), encoding='UTF-8').read()



if __name__ == '__main__':
    b = {}
    for year in range(1900, 2500):
        b[str(year)] = {}

    for bill in bills():

        text = regex.sub(r"[ \t\r\f\v ][ \t\r\f\v ]+", "", bill)
        # print(text[:400])

        r = regex.match(r'\s*(Dz\.U\.\s*z\s*(?P<journal_year>\d+)\s*r\.\s*(N|n)r\s*(?P<journal_number>\d+),?\s*?poz.\s*(?P<position>\d+).?\s*)?([a-żA-Ż \d\.\(\)]*\s?){0,4}\s*(ustawa|USTAWA|U S T A W A|Ustawa|ustawA|USTAWa)[^\n]*\n[^\n]*\s*z\s*dnia\s*\d{1,2}\s*[a-żA-Ź]*\s*(?P<year>\d{4})\s*r\.\s*(?P<title>[\s\S]*?)\n\s*(Rozdział\s*(1|I)|Art.\s*(1|l)[^\d]|TYTUŁ\s*I|Dział\s*I|część\s*ogólna)', text)
        # r = regex.match(r'\n*(Dz\.U\.z(?P<journal_year>\d+)r\.(N|n)r(?P<journal_number>\d+),?poz.(?P<position>\d+).?)?([a-żA-Ż\d\.\(\)]*\n?){0,4}\n*(ustawa|USTAWA|Ustawa|ustawA|USTAWa)[^\n]*\n[^\n]*\n*z\n?dnia\n?\d{1,2}\n?[a-żA-Ź]*\n?(?P<year>\d{4})\n?r\.\n*(?P<title>(.+\n)*?)\n*?(?P<title2>(.+\n)*?)\n*?(Rozdział(1|I)|Art.\n?(1|l)[^\d]|TYTUŁI|DziałI|częśćogólna)', text)
        # print(title.group())

        position = r.group("position")
        year = r.group("journal_year") or r.group("year")
        b[year][position] = {}
        b[year][position]["counter"] = 0
        b[year][position]["title"] = r.group("title")
        b[year][position]["journal_number"] = r.group("journal_number")
        b[year][position]["journal_year"] = r.group("journal_year")
        b[year][position]["year"] = r.group("year")
        b[year][position]["position"] = position

    counter = 0