Example #1
0
    def parse_page(self, page):
        title = regex.search(r'(?<=<title>).*(?=<\/title>)', page).group(0)
        text = regex.search(r'(?<=<text).*(?=<\/text>)',
                            page,
                            flags=regex.DOTALL).group(0)
        infobox = None

        infobox_regex = regex.search(r'(?=\{Infobox)(\{([^{}]|(?1))*\})', text)
        text_start_index = 0
        if infobox_regex:
            text_start_index = infobox_regex.end()
            infobox = infobox_regex.group(1)  # group 1 is the balanced {Infobox ...} block

        page = Page(title, infobox, text[text_start_index:])
        return page.get_parsed_date_tokens()
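
A self-contained sketch of what the recursive (?1) pattern matches; the sample wiki text is invented:

import regex

sample = "{Infobox person | name = Ada {birth {year}} | born = 1815} article body"
m = regex.search(r'(?=\{Infobox)(\{([^{}]|(?1))*\})', sample)
print(m.group(1))  # the whole {Infobox ...} block, nested braces included
print(m.end())     # index just past the closing brace, where the remaining text begins
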
Example #2
0
 def get_lang(string):
     # TODO: use nltk language detection instead of this heuristic
     has_cyrillic = regex.search(r'\p{IsCyrillic}', string)
     if has_cyrillic:
         return "russian"
     else:
         return "english"
Example #3
0
 def parse_number_of_jobs_found(input_html: str) -> int:
     """
     Im oberen Teil der Ergebnisseite wird angegeben, wie viele Postings für die Suchanfrage gefunden wurden.
     Diese Information wird durch diese Funktion geparst.
     :param input_html: HTML String der Ergebnisseite
     :return: Anzahl der gefundenen Job Postings
     """
     return int(
         re.search(r"(?<=\()[][0-9]{2,4}(?= Jobs gefunden\))",
                   input_html).group())
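
A standalone illustration of the lookarounds; the results-page text is invented:

import re

snippet = "Suchergebnis (1234 Jobs gefunden)"
print(int(re.search(r"(?<=\()[0-9]{2,4}(?= Jobs gefunden\))", snippet).group()))  # 1234
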
Example #4
0
    async def event_message(self, ctx):
        if ctx.author.name.lower() == self.nick.lower():
            # Commands only for the bot user
            await self.handle_commands(ctx)

        if self.active and ctx.author.name.lower() == self.target_user.lower():
            # reply logic only for the targeted user
            for rule in self.rules:
                if regex.search(rule.get('pattern'), ctx.content):
                    await asyncio.sleep(0.5)  # non-blocking pause; time.sleep would stall the event loop
                    await ctx.channel.send(rule.get('reply'))
Example #5
0
    def find_date(text):
        date = regex.search(
            r'(( [a-zA-Z]{3,8}|\d{1,2})[ ]\d{1,2}([ ]|(\,? ))\d{1,4})|(([a-zA-Z]{3,8}|in) \d{4})(?<=[^\}])',
            text)

        if date:
            start = date.start()
            end = date.end()
            date = date.group(0)

            return DateInText(date, start, end)

        return None
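
A few illustrative inputs, run through regex.search directly so the sketch does not depend on DateInText:

import regex

pattern = r'(( [a-zA-Z]{3,8}|\d{1,2})[ ]\d{1,2}([ ]|(\,? ))\d{1,4})|(([a-zA-Z]{3,8}|in) \d{4})(?<=[^\}])'
print(regex.search(pattern, "Born on March 14, 1879 in Ulm.").group(0))  # ' March 14, 1879' (leading space comes from the first branch)
print(regex.search(pattern, "The treaty was signed in 1648.").group(0))  # 'in 1648'
print(regex.search(pattern, "no dates here"))                            # None
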
Example #6
0
def get_data_from_row(row):
    try:
        text = row.text
    except AttributeError:
        return
    match = re.search(r"(.+) (\(.+\)) (\(.+\))", text)
    try:
        name = match.group(1).strip()
        year = match.group(2).strip("()")
        cat = match.group(3).strip("()")
    except AttributeError:
        return
    if "series" not in cat.lower():
        return
    # print(name, year, cat)
    try:
        link = row.select_one(".result_text a").get("href").strip()
        link_match = re.search(r"title\/(tt\d+)", link)
        show_id = link_match.group(1)
    except AttributeError:
        return
    return dict(showname=name, category=cat, year=year, id=show_id)
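
Illustrative row text in the "<name> (<year>) (<category>)" shape the first regex expects; the title string is made up:

import re

m = re.search(r"(.+) (\(.+\)) (\(.+\))", "Breaking Bad (2008) (TV Series)")
print(m.group(1), m.group(2).strip("()"), m.group(3).strip("()"))  # Breaking Bad 2008 TV Series
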
Example #7
0
 def __merge_links(self, links):
     """
     Fügt links nur dann der Objekt-liste hinzu, wenn ihre Job-Id noch nicht im job_id set vorhanden ist.
     :param links: Liste an URLs als Strings
     """
     for link in links:
         try:
             # stdlib re has no POSIX classes, so spell out the intended hex-digit set
             job_id = re.search(r"[0-9A-Fa-f-]{9,}", link).group()
             if job_id not in self.job_ids:
                 self.links.append(link)
                 self.job_ids.add(job_id)
         except (TypeError, AttributeError):
             # link was not a string, or it contained no job id
             pass
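
A small sketch of the job-id extraction; the URL format is an assumption, not taken from the source:

import re

link = "https://example.com/jobs/view/3a7f9c2d-1b4e"  # hypothetical link with a hex job id
print(re.search(r"[0-9A-Fa-f-]{9,}", link).group())    # 3a7f9c2d-1b4e
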
Example #8
0
    def parse_page(self, page):
        tree = ET.fromstring(page)
        title = tree.find('title').text
        text = tree.find('revision').find('text').text
        infobox = None

        # search the extracted text so end() lines up with the slice of text below
        infobox_regex = regex.search(r'(?=\{Infobox)(\{([^{}]|(?1))*\})', text)
        text_start_index = 0
        if infobox_regex:
            text_start_index = infobox_regex.end()
            infobox = infobox_regex.group(1)  # group 1 is the balanced {Infobox ...} block

        page = Page(title, infobox, text[text_start_index:])
        return page.get_parsed_date_tokens()
Example #9
0
 def __vectorize(self, sentences, training=False):
     sentenceKeys = list(sentences[0].keys())
     for sentence in sentences:
         for name in sentenceKeys:
             if name == "NER_IOBX":
                 continue
             if name.startswith("NER_"):
                 if name not in self.__mappings and training:
                     self.__mappings[name] = {"O": 1}
                     if self.__special_labels:
                         self.__mappings[name]["[CLS]"] = len(
                             self.__mappings[name]) + 1
                         self.__mappings[name]["[SEP]"] = len(
                             self.__mappings[name]) + 1
                 for (id_, item) in enumerate(sentence[name]):
                     if item not in self.__mappings[name]:
                         if training:
                             self.__mappings[name][item] = len(
                                 self.__mappings[name]) + 1
                         else:
                             print(f"Issue with the label {item} in {name}")
                             exit(1)
                     sentence[name][id_] = self.__mappings[name][item]
             if name == "tokens" and training and self.__tokenizer is not None:
                 for token in sentence["tokens"]:
                     bert_tokens = self.__tokenizer.tokenize(token)
                     if self.__validateBertTokens(
                             token, bert_tokens
                     ) == 1 and token not in self.__add_tokens:
                         if regex.search("\p{P}|\p{S}", token):
                             new_tokens = list(
                                 filter(None,
                                        regex.split("(\p{P}|\p{S})",
                                                    token)))
                             for sub_token in new_tokens:
                                 bert_tokens = self.__tokenizer.tokenize(
                                     sub_token)
                                 if self.__validateBertTokens(
                                         sub_token, bert_tokens) == 1:
                                     self.__add_tokens.add(sub_token)
                         else:
                             self.__add_tokens.add(token)
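
A quick illustration of the punctuation/symbol split applied to problematic tokens above; the token is invented:

import regex

print(list(filter(None, regex.split(r"(\p{P}|\p{S})", "state-of-the-art!"))))
# ['state', '-', 'of', '-', 'the', '-', 'art', '!']
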
Example #10
0
 def _get_additional_details(details):
     remove_whitespace = lambda x: x.strip()
     additional_data = list(map(remove_whitespace, details.text.split("|")))
     maturity = ""
     if len(additional_data) == 4:
         maturity = additional_data[0]
         additional_data = additional_data[1:]
     try:
         ep_time, tags, date = additional_data
     except ValueError:  # additional_data did not have exactly three fields
         return {}
     else:
         result = re.search(r".+\((.+)\)", date)
         date = result.group(1)
     tags = list(map(remove_whitespace, tags.split(",")))
     return {
         "tags": tags,
         "time_per_episode": ep_time,
         "running_date": date,
         "maturity": maturity,
     }
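
Illustration of the running-date extraction; the field format here is a guess at the "<month year> (<start>-<end>)" shape the code expects:

import re

print(re.search(r".+\((.+)\)", "Sep 2008 (2008-2013)").group(1))  # 2008-2013
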
Example #11
0
    def compiles(self, tex_file_path, n=1, clean=False):
        path, filename, extension, filename_without_extension = get_path_filename_extension(
            tex_file_path)

        if clean:
            # pass the command through the shell so the glob patterns are expanded
            subprocess.run('rm -f *.pdf.html *.pdf *.aux *.log',
                           shell=True,
                           stdout=subprocess.DEVNULL,
                           stderr=subprocess.DEVNULL)

        for i in range(n):
            print(f"trying to compile {path} + {filename}")
            process = subprocess.Popen(
                f'cd {path}  && echo $(pwd) && pdflatex -interaction=nonstopmode -halt-on-error -file-line-error {filename}',
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                shell=True)
            time.sleep(self.timeout_sec)
            process.send_signal(signal.SIGINT)
            output = process.stdout.read().decode('utf-8', errors="ignore")
            #print(output)
            errors = process.stderr.read().decode('utf-8', errors="ignore")
            #print(errors)
            if (any(error in output.lower()
                    for error in ["latex error", "fatal error"])):
                where = output.lower().index('error')
                error_msg_at = output[where - 150:where + 150]
                self.path_spec.logger.error(
                    f'{tex_file_path} -->> compilation failed on \n""" {error_msg_at}"""'
                )
                line_number_match = regex.search(r":(\d+):", error_msg_at)
                if line_number_match:
                    line_number = int(line_number_match.groups(1)[0])
                    try:
                        with open(path + "/" + filename) as f:
                            lines = f.readlines()

                    except UnicodeDecodeError:
                        self.path_spec.logger.error(
                            "Could not read latex file because of encoding")
                        break
                    faulty_code = "\n".join(
                        lines[max(0, line_number -
                                  1):min(len(lines), line_number + 1)])
                    self.path_spec.logger.error(
                        f'  --->  see file {tex_file_path}: """\n{faulty_code}"""'
                    )
                return None

        if process.returncode:
            print(errors)
            return None
        self.path_spec.logger.info(f"{tex_file_path} compiled")
        pdf_path = path + "/" + filename_without_extension + ".pdf"
        return pdf_path
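
A tiny illustration of the ":<line>:" extraction applied to pdflatex's file-line-error output; the log line is fabricated:

import regex

log_line = "./paper.tex:42: Undefined control sequence."
m = regex.search(r":(\d+):", log_line)
print(int(m.group(1)))  # 42
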
Example #12
0
def hh_parse(burl, hdr):
    jobs = []
    urls = [burl]
    soup = soup_content(burl, hdr)

    pagination = soup.find_all('a', attrs={'data-qa': 'pager-page'})
    count = int(pagination[-1].text)
    for i in range(count):
        url = base_url.format(page_num=i)
        if url not in urls:
            urls.append(url)

    for url in urls:
        soup = soup_content(url, hdr)
        divs = soup.find_all('div', attrs={'class': 'vacancy-serp-item'})

        for div in divs:
            location = ""
            salary = ""
            title = div.find('a',
                             attrs={
                                 'data-qa': 'vacancy-serp__vacancy-title'
                             }).text
            href = div.find('a',
                            attrs={'data-qa':
                                   'vacancy-serp__vacancy-title'})['href']
            try:
                company = div.find('a',
                                   attrs={
                                       'data-qa':
                                       'vacancy-serp__vacancy-employer'
                                   }).text
            except Exception as e:
                print(e)
            try:
                salary = div.find('span',
                                  attrs={
                                      'data-qa':
                                      'vacancy-serp__vacancy-compensation'
                                  }).text
            except Exception as e:
                print(e)
            try:
                location = div.find('span', attrs={'class': 'metro-station'}).text
            except Exception as e:
                print(e)
            text1 = div.find('div',
                             attrs={
                                 'data-qa':
                                 'vacancy-serp__vacancy_snippet_responsibility'
                             }).text
            text2 = div.find('div',
                             attrs={
                                 'data-qa':
                                 'vacancy-serp__vacancy_snippet_requirement'
                             }).text
            content = text1 + ' ' + text2
            if regex.search(r"\L<words>", title, words=template):
                jobs.append({
                    'title': title,
                    'href': href,
                    'company': company,
                    'location': location,
                    'salary': salary,
                    'content': content
                })
                print("Найдено вакансий:", len(jobs))
    return jobs
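
A sketch of the named-list title filter used above; the template keyword list here is hypothetical:

import regex

template = ["python", "django"]
print(bool(regex.search(r"\L<words>", "Junior python developer", words=template)))  # True
print(bool(regex.search(r"\L<words>", "Data analyst", words=template)))             # False
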
Example #13
0
def get_id_from_link(link):
    result = re.search(r"\/title\/([A-Za-z0-9]+)\/", link)
    if result:
        return result.group(1)
    return ""