def test_insert_tag(self):
    """Inserting a tag must splice it into the sibling/element chains of
    its neighbours, not merely add it to the parent's child list."""
    builder = self.default_builder
    soup = self.soup(
        "<a><b>Find</b><c>lady!</c><d></d></a>", builder=builder)
    magic_tag = Tag(soup, builder, 'magictag')
    magic_tag.insert(0, "the")
    soup.a.insert(1, magic_tag)

    # Rendered output first: the new tag sits between <b> and <c>.
    self.assertEqual(
        soup.decode(),
        self.document_for(
            "<a><b>Find</b><magictag>the</magictag><c>lady!</c><d></d></a>"))

    # Make sure all the relationships are hooked up correctly.
    b_tag = soup.b
    self.assertEqual(b_tag.next_sibling, magic_tag)
    self.assertEqual(magic_tag.previous_sibling, b_tag)

    # The element chain threads through text nodes as well as tags.
    find = b_tag.find(text="Find")
    self.assertEqual(find.next_element, magic_tag)
    self.assertEqual(magic_tag.previous_element, find)

    c_tag = soup.c
    self.assertEqual(magic_tag.next_sibling, c_tag)
    self.assertEqual(c_tag.previous_sibling, magic_tag)

    # The inserted tag's own child text is wired to its parent and to
    # the element that follows the whole <magictag> subtree.
    the = magic_tag.find(text="the")
    self.assertEqual(the.parent, magic_tag)
    self.assertEqual(the.next_element, c_tag)
    self.assertEqual(c_tag.previous_element, the)
def insert_tag(self, tag_dict):
    """Append a new tag element built from *tag_dict*.

    ``tag_dict['name']`` becomes the tag name (and is popped from the
    dict); every remaining entry becomes an attribute of the new tag.
    A <TAGS> container is created under the root on first use.
    """
    tag = Tag(name=tag_dict.pop('name'))
    tag.attrs = tag_dict
    if not self.findAll('TAGS'):
        self.root.append(Tag(name='TAGS'))
    self.TAGS.append(tag)
    self.TAGS.append('\n')  # newline keeps the serialized XML readable
def reset(self):
    """Return the soup to a pristine state so a new document can be parsed."""
    # Re-run Tag's initializer on ourselves: the soup object doubles as
    # the root tag of the parse tree.
    Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
    self.hidden = 1  # the root tag itself is never rendered in output
    self.builder.reset()
    self.current_data = []   # text chunks accumulated for the current node
    self.currentTag = None   # tag currently being populated
    self.tagStack = []       # stack of currently-open tags
    self.preserve_whitespace_tag_stack = []  # open whitespace-preserving tags
    self.pushTag(self)  # the root is always the bottom of the stack
def parse_monster_bbb_tag(tag: Tag, *args):
    """Parse one monster table row (<tr>) into a plain dict.

    The layout is positional: cells[0] holds name/level/image, the other
    cells hold stats and drops; some cells are further split into <div>
    lists.  ``convert`` coerces cell text (see its definition).
    """
    cells = tag.find_all('td')
    # Cells that contain a list of <div> sub-values.
    equipment = cells[7].find_all('div')
    stats = cells[11].find_all('div')
    magic = cells[12].find_all('div')
    special = cells[13].find_all('div')
    statuses = cells[14].find_all('div')
    monster = dict(hiddenstreet_alias=tag.get('id'),
                   name=cells[0].strong.string.strip(),
                   level=cells[0].contents[-1].strip(),
                   health_points=convert(cells[1], int, True),
                   mana_points=convert(cells[2], int, True),
                   experience=convert(cells[3], int, True),
                   mesos=convert(cells[4], int, True),
                   knockback=convert(cells[5], int),
                   etc_drop=convert(cells[6]),
                   common_equipment=convert(equipment[0]),
                   warrior_equipment=convert(equipment[1]),
                   magician_equipment=convert(equipment[2]),
                   bowman_equipment=convert(equipment[3]),
                   thief_equipment=convert(equipment[4]),
                   pirate_equipment=convert(equipment[5]),
                   ore_drop=convert(cells[8]),
                   maker_item=convert(cells[9]),
                   useable_drop=convert(cells[10]),
                   weapon_attack=convert(stats[0], int),
                   magic_attack=convert(stats[1], int),
                   weapon_defence=convert(stats[2], int),
                   magic_defence=convert(stats[3], int),
                   # NOTE(review): key is misspelled ("phisical") but is
                   # part of the produced schema -- do not rename casually.
                   phisical_dmg_reduction=convert(stats[4], int),
                   magical_dmg_reduction=convert(stats[5], int),
                   speed=convert(stats[6], int),
                   accuracy=convert(stats[7], int),
                   avoidability=convert(stats[8], int),
                   weakness_to_magic=convert(magic[0]),
                   normal_to_magic=convert(magic[1]),
                   resistance_to_magic=convert(magic[2]),
                   immune_to_magic=convert(magic[3]),
                   unique_attack=convert(special[0]),
                   health_points_recovery=convert(special[1], int),
                   mana_points_recovery=convert(special[2], int),
                   immune_against_status=convert(statuses[0]),
                   inflict_status=convert(statuses[1]),
                   common_location=None)
    # Image is optional; store the src URL when an <img> exists.
    monster['image_url'] = cells[0].find('img')
    if monster['image_url']:
        monster['image_url'] = monster['image_url'].get('src')
    # Level text is not always numeric; fall back to None.
    try:
        monster['level'] = int(monster['level'])
    except ValueError:
        monster['level'] = None
    return monster
def knowledge_panel_title_parser(body: element.Tag) -> List[Dict]:
    """Collect clickable knowledge-panel titles as row dicts."""
    titles = body.find_all(attrs={"data-ru_q": True})
    return [element_to_dict(title, category='link-knowledge_panel_title')
            for title in titles]
def substitute_special_paragraphs(soup):
    """Apply the prefix->class paragraph substitutions, then wrap every
    comment/question/doubt element in a <details>/<summary> block."""
    for prefix, klass in prefix2class.items():
        substitute_special_paragraph(soup, prefix, klass)

    for css_class in ('comment', 'question', 'doubt'):
        for target in list(soup.select('.%s' % css_class)):
            summary = Tag(name='summary')
            summary.append(css_class)  # summary text is the class name

            details = Tag(name='details')
            add_class(details, css_class)
            details.append(summary)
            details.append(target.__copy__())  # original element as the body

            target.replace_with(details)
def conversion_parser(body: element.Tag) -> List[Dict]:
    """Unit-converter widgets (e.g. "how many ounces in a cup")."""
    headers = body.find_all('h2', text='Unit Converter')
    return [element_to_dict(header.parent, category='answer-unit_converter')
            for header in headers]
def parse_news_item(self, row: element.Tag, base_url: str) -> NewsItem:
    """Build a NewsItem from one table row (date cell + info cell).

    Raises FormatError when the row yields no URL or no title.
    """
    date_cell, info_cell = row.find_all('td')
    date = parse_datetime(date_cell.get_text(strip=True))

    # Two layouts exist: either the title is plain text followed by one
    # link per language, or the title itself is the only link.
    english_link = info_cell.find('a', string='English')
    if english_link:
        url = english_link['href']
        title = first_text_in_element(info_cell)
        # Titles in this layout sometimes end with a colon; drop it
        # (surrounding whitespace is already stripped).
        title = title.strip(':') if title else title
    else:
        title_link = info_cell.find('a')
        url = title_link['href']
        title = title_link.get_text(strip=True)

    if not url:
        raise FormatError('No URL found')
    url = urljoin(base_url, url)

    if not title:
        raise FormatError('No title content found')

    return NewsItem(id=url, url=url, title=title, date_published=date)
def _get_votes_number(self, post: Tag) -> Optional[int]:
    """Return the vote count displayed on *post*, or ``None`` when the
    element carries no votes <div> (i.e. it is not actually a post).

    Counts abbreviated in thousands are expanded; fractional values
    such as "1.5k" are handled (the previous ``int(text[:-1])`` raised
    ValueError on them).
    """
    votes_number_div = post.find('div', class_=self.VOTES_NUMBER_CLASS)
    if votes_number_div is None:
        # not a post
        return None
    text = votes_number_div.text
    if 'k' in text:
        # votes specified in thousands, possibly fractional
        return int(float(text[:-1]) * 1000)
    return int(text)
def dict_def_parser(body: element.Tag) -> List[Dict]:
    """Dictionary-definition cards, captured whole."""
    cards = body.find_all('div', attrs={'id': 'dictionary-modules'})
    return [element_to_dict(card, category='answer-dictionary')
            for card in cards]
def ads_aria_parser(body: element.Tag) -> List[Dict]:
    """Ads exposed through accessibility (ARIA) attributes."""
    ads = body.find_all(attrs={'aria-label': 'Ad'})
    return [element_to_dict(ad, category='ads-aria') for ad in ads]
def parse_truyenfull_chapters(self, soup: Tag):
    """Fetch the full chapter list for a truyenfull.vn novel.

    Reads the hidden <input> fields that identify the novel, then
    requests every chapter-list page via the AJAX endpoint and feeds the
    returned links to ``parse_all_links``.
    """
    truyen_id = self.__select_value(soup, 'input#truyen-id', 'value')
    total_page = self.__select_value(soup, 'input#total-page', 'value')
    truyen_ascii = self.__select_value(soup, 'input#truyen-ascii', 'value')
    assert truyen_id, 'No truen novel id found'

    total_page = int(str(total_page))
    logger.info('Total page count: %d', total_page)

    # Submit one request per page first so the downloads run
    # concurrently; results are gathered in a second pass below.
    futures: List[Future] = []
    for page in range(total_page):
        params = urlencode({
            'type': 'list_chapter',
            'tid': int(truyen_id),
            'tascii': truyen_ascii,
            'tname': self.novel_title,
            'page': page + 1,  # endpoint pages are 1-based
            'totalp': total_page,
        })
        url = 'https://truyenfull.vn/ajax.php?' + params
        logger.info('Getting chapters: %s', url)
        f = self.executor.submit(self.get_json, url)
        futures.append(f)
    # end for

    for f in futures:
        data = f.result()
        # The endpoint returns an HTML fragment under 'chap_list'.
        soup = self.make_soup(data['chap_list'])
        self.parse_all_links(soup.select('.list-chapter a'))
def replace_emoji_imgs(element: Tag) -> None:
    """Swap <img> tags whose src matches EMOJI_IMG_SRC_RX for the
    corresponding unicode emoji from EMOJI_MAP, when one is known."""
    for img in element.find_all('img'):
        src_match = EMOJI_IMG_SRC_RX.match(img.get('src', ''))
        if not src_match:
            continue
        replacement = EMOJI_MAP.get(src_match.group(1))
        if replacement:
            img.replace_with(replacement)
def __init__(self, table_row: Tag) -> None:
    """Initialize a contest instance from one table row (<tr>) of the
    contest list page.

    Args:
        table_row (Tag): The row tag to parse.
    """
    table_data_list: List[Tag] = table_row.select('td')

    # Column 0: start time, always with a fixed +0900 (JST) offset.
    time_str: str = table_data_list[0].get_text()
    self.time = datetime.strptime(time_str, '%Y-%m-%d %H:%M:%S+0900')
    self.time_unix = int(self.time.timestamp())

    # Column 1: contest link; the slug is extracted from its href.
    contest_tag: Tag = table_data_list[1].find('a')
    contest_href_match: Optional[
        Match[str]] = self.contest_href_pattern.search(
            contest_tag['href'])
    assert contest_href_match is not None
    self.contest_slug = contest_href_match.group(1)
    self.contest_name = contest_tag.get_text()

    # Column 2: duration text, parsed as hours and minutes.
    duration_str: str = table_data_list[2].get_text()
    duration_match: Optional[
        Match[str]] = self.duration_pattern.search(duration_str)
    assert duration_match is not None
    hours: int = int(duration_match.group(1))
    minutes: int = int(duration_match.group(2))
    self.duration_minutes = hours * 60 + minutes
def set_number_available(self, raw_line: element.Tag):
    """Sets number_available class attribute from raw DOM element.

    Args:
        raw_line (element.Tag): Raw DOM element.

    Raises:
        _CUSTOM_ERRORS.CouldNotParseInfo: If the info cannot be found.
            Missing info will be mentionned in error message.
    """
    value = raw_line.find("td")
    # A missing <td> and a <td> without the expected "N available"
    # pattern are the same failure from the caller's point of view,
    # so both funnel into one error path (previously duplicated).
    if value is not None:
        number = re.findall("([0-9]+) available", value.get_text())
    else:
        number = []
    if not number:
        self.logger.write(
            log_level="error",
            message="Could not parse number of books available on book page.",
        )
        raise _CUSTOM_ERRORS.CouldNotParseInfo(
            title=self.title, info="Number available", url=self.url
        )
    self.number_available = int(number[0])
def stats_per_10_mins(career: Tag, ID: str) -> dict:
    """Compute per-10-minute averages for the stat category *ID*.

    First pass collects every stat's raw text into ``all_stats`` and
    marks which stats need a per-10 value; second pass divides each
    marked stat's career total by total playtime / 10.
    """
    all_stats = {}
    stats_10 = {}
    per_10 = ' - Avg per 10 Min'
    stats_soup = career.find('div', attrs = {'data-category-id': ID})
    for table in stats_soup.children:
        for tr in table.tbody.children:
            stat = tr.contents[0].text.replace(' Done', '').strip()
            info = tr.contents[1].text
            # Normalize singular labels ("Final Blow" -> "Final Blows").
            for word in ('Blow', 'Kill'):
                if word in stat and f'{word}s' not in stat:
                    stat = stat.replace(word, f'{word}s').strip()
            all_stats[stat] = info
            # Mark stats to be averaged: explicit per-10 rows (keyed by
            # their base name) and accuracy percentages (copied as-is).
            if per_10 in stat or ('Accuracy' in stat and 'Best' not in stat):
                stats_10[stat.replace(per_10, '').strip()] = None
            if stat == 'Time Played':
                total_time = clock_to_mins(info) / 10
    # NOTE(review): total_time is only bound when a 'Time Played' row
    # exists in the category -- confirm upstream guarantees that.
    for stat in stats_10:
        if '%' in all_stats[stat]:
            # Percentages are already rates; keep verbatim.
            stats_10[stat] = all_stats[stat]
        elif ':' in all_stats[stat]:
            # Clock-formatted totals: divide as minutes, re-format.
            stats_10[stat] = mins_to_clock(clock_to_mins(all_stats[stat]) / total_time)
        else:
            stats_10[stat] = round(float(all_stats[stat]) / total_time, 2)
    return stats_10
def get_title(row: element.Tag) -> Optional[str]:
    """Get the title of a row.

    Returns:
        The stripped anchor text, or ``None`` when the row has no
        ``elco-anchor`` link (deliberately best-effort; the failure is
        logged at debug level).  The previous ``-> str`` annotation did
        not reflect the ``None`` path.
    """
    try:
        return row.find("a", {"class": "elco-anchor"}).text.strip()
    except Exception as e:
        logger.debug("Function get_title for row %s : %s", row, e)
        return None
def _extract_metadata_xml(root_el: bs4e.Tag) -> SessionMetadata:
    """Read session number, legislative period and start/end timestamps
    from the XML root element."""
    kopfdaten = root_el.vorspann.kopfdaten
    sv_el = root_el.sitzungsverlauf  # kept for parity; not used below
    session_no_el = kopfdaten.find("sitzungsnr")
    period_el = kopfdaten.find("wahlperiode")

    date_str = root_el.get("sitzung-datum")
    return SessionMetadata(
        session_no=get_session_id_safe(period_el.getText(),
                                       session_no_el.getText()),
        legislative_period=int(period_el.getText()),
        start=build_datetime(date_str, root_el.get("sitzung-start-uhrzeit")),
        end=build_datetime(date_str, root_el.get("sitzung-ende-uhrzeit")))
def tab_parser(body: element.Tag) -> List[Dict]:
    """Tabs, typically attached to knowledge panels. # check this"""
    tabs = body.find_all(role='tab')  # what if we get rid of 'a'
    return [element_to_dict(tab, category='link-knowledge_panel_tab')
            for tab in tabs]
def fullpage_popup_parser(body: element.Tag) -> List[Dict]:
    """Full-page clickthroughs (e.g. event cards like "New Years Eve Party")."""
    items = body.find_all('li', attrs={'data-encoded-docid': True})
    return [element_to_dict(item, category='link-fullpage') for item in items]
def get_item_info(item: Tag) -> dict:
    """Collect vacancy name, employer, location and salary from one
    search-result card."""
    name_div = item.find('div', {'class': 'search-item-name'})
    employer_div = item.find('div', {'class': 'vacancy-serp-item__meta-info'})
    location_span = item.find('span', {'class': 'vacancy-serp-item__meta-info'})
    salary = item.find('div', {'class': 'vacancy-serp-item__sidebar'}).find('div')

    return {
        'vacancy_name': name_div.find('a').text,
        'employer': employer_div.find('a').text,
        'location': location_span.text,
        # Placeholder text when the vacancy lists no salary.
        'salary': 'з/п не указана' if salary is None else salary.text,
    }
def check_if_removed_from_bugblog(bbt: Match, b: Tag, issue: Issue) -> None:
    """Close *issue* when its bug-blog entry has disappeared from table
    *b*; if the entry is still present but its autocard notation
    changed, update the issue body instead."""
    if bbt is not None:
        text = strings.remove_smartquotes(bbt.group(1).strip())
        for row in b.find_all('tr'):
            data = row.find_all('td')
            rowtext = strings.remove_smartquotes(data[1].text.strip())
            if rowtext == text:
                # Entry still present verbatim; nothing to do.
                break
            # Same text modulo [autocard] brackets: sync the issue body.
            if strip_squarebrackets(rowtext) == strip_squarebrackets(text):
                # Fix this
                print(
                    "Issue #{id}'s bug blog text has differing autocard notation."
                    .format(id=issue.number))
                old_bbt = strings.get_body_field(issue.body, 'Bug Blog Text')
                body = re.sub(BBT_REGEX,
                              'Bug Blog Text: {0}'.format(rowtext),
                              issue.body, flags=re.MULTILINE)
                new_bbt = strings.get_body_field(body, 'Bug Blog Text')
                issue.edit(body=body)
                print('Updated to `{0}`'.format(rowtext))
                issue.create_comment(
                    f'Changed bug blog text from `{old_bbt}` to `{new_bbt}`')
                break
        else:
            # No row matched: the bug left the blog, so close the issue.
            print('{id} is fixed!'.format(id=issue.number))
            repo.create_comment(
                issue, 'This bug has been removed from the bug blog!')
            issue.edit(state='closed')
def ads_local_parser(body: element.Tag) -> List[Dict]:
    """Localized ads (list items whose class starts with "ads-")."""
    ads = body.find_all('li', attrs={'class': re.compile("^ads-")})
    return [element_to_dict(ad, category='ads-text') for ad in ads]
def __init__(self, name: str, title: str, td: Tag):
    """Pull the zoom-image link out of *td*; placeholder photos yield None."""
    super().__init__(title)
    img = td.find('img')
    src = img.attrs.get('src', '')
    self.link: OptionalStr = None if 'NoPhoto' in src else img.attrs.get('zoomimg')
def ebook_parser(body: element.Tag) -> List[Dict]:
    """E-book expandable cards; buttons labelled "Google Play Books"
    get their own category, everything else counts as organic."""
    data = []
    expandables = body.find_all('g-expandable-content', attrs={
        'jscontroller': True,
        'jsaction': True,
        'jsshadow': True,
        'aria-hidden': True,
        'data-eb': True,
        'data-mt': True,
        'data-quie': True,
        'data-ved': True
    })
    for expandable in expandables:
        buttons = expandable.find_all('div', recursive=True, attrs={
            'class': True,
            'jsname': True,
            'role': 'button',
            'aria-haspopup': True,
            'tabindex': True,
            'jsaction': True
        })
        for button in buttons:
            for label in button.find_all('div', text=True):
                category = ('link-google_play_books'
                            if label.text == 'Google Play Books'
                            else 'organic')
                data.append(element_to_dict(button, category=category))
    return data
def _parse_row(row: element.Tag, header: list) -> List[object]:
    """Zip one table row's cell texts against *header*.

    NOTE(review): the annotation says ``List[object]`` but a dict is
    returned -- confirm against callers before changing it.
    """
    cell_texts = (cell.text for cell in row.find_all('td'))
    return dict(zip(header, cell_texts))
def parsePcThreadHeader(liTag: Tag):
    """Build a ThreadHeader from the JSON blob in the <li>'s
    ``data-field`` attribute; returns None when no thread id is found."""
    import json
    metadata = json.loads(liTag.attrs.get("data-field"))

    thread = ThreadHeader()
    # Straight copies from the metadata blob onto the header object.
    for attr, key in (
            ("kz", "id"),
            ("author_name", "author_name"),
            ("author_nickname", "author_nickname"),
            ("author_portrait", "author_portrait"),
            ("first_post_id", "first_post_id"),
            ("reply", "reply_num"),
            ("bakan", "is_bakan"),
            ("vid", "vid"),
            ("good", "is_good"),
            ("top", "is_top"),
            ("protal", "is_protal"),
            ("membertop", "is_membertop"),
            ("multi_forum", "is_multi_forum"),
            ("frs_tpoint", "frs_tpoint"),
    ):
        setattr(thread, attr, metadata.get(key))

    titleATag = liTag.select_one(".j_th_tit a")
    if titleATag:
        thread.title = titleATag.text
    thread.mod_date = datetime.now()

    # Without an id the header is useless.
    if not thread.kz:
        return None
    return thread
def elementClass(self, name, namespace):
    """Create a Tag for *name* wrapped in an Element; BeautifulSoup
    cannot represent namespaces, so a non-None one only warns."""
    if namespace is not None:
        warnings.warn(
            "BeautifulSoup cannot represent elements in any namespace",
            DataLossWarning)
    tag = Tag(self.soup, self.soup.builder, name)
    return Element(tag, self.soup, namespace)
def check_for_missing_bugs(b: Tag) -> None:
    """Scan bug-blog table *b*; label known issues with their
    metacategory and open a new issue for every untracked row."""
    for row in b.find_all('tr'):
        data = row.find_all('td')
        row_text = data[1].text.strip()
        if row_text == 'Description':
            # BS4 is bad.  (The header row comes back like a data row.)
            continue
        issue = find_issue_by_code(row_text)
        if issue:
            # Known bug: ensure it carries a metacategory label.
            labels = [c.name for c in issue.labels]
            categories = [c for c in labels if c in strings.METACATS]
            if categories:
                continue
            # Try to derive the category from the row's category cell.
            bbcat = re.match(strings.REGEX_BBCAT, data[2].text.strip())
            if bbcat is None:
                continue
            g1 = bbcat.group(1).strip()
            if g1 in strings.METACATS:
                issue.add_to_labels(g1)
                continue
            if bbcat.group(2) is not None:
                g2 = bbcat.group(2).strip()
                if g2 in strings.METACATS:
                    issue.add_to_labels(g2)
                    continue
            print(f'Unknown BBCat: {bbcat.group(0)}')
            continue
        # Untracked bug: file a new issue from the blog text.
        print('Could not find issue for `{row}`'.format(row=row_text))
        text = 'From Bug Blog.\nBug Blog Text: {0}'.format(row_text)
        repo.get_repo().create_issue(strings.remove_smartquotes(row_text),
                                     body=strings.remove_smartquotes(text),
                                     labels=['From Bug Blog'])
def get_rss_item_tags(self, item: Tag) -> list[str]:
    """Return the item's comma-separated tags, stripped; empty list when
    the tags element is absent."""
    tags = item.find(self.rss_tags_name)
    if tags is None:
        return []
    return [tag.strip() for tag in tags.text.split(",")]
def _match_end_tag(tag: Tag) -> bool:
    """Matches `tag` if its class value is in `SEARCH_END_TAG_ATTRS`
    or the tag is table."""
    classes = tag.get("class", ())
    return tag.name == "table" or any(
        attr in classes for attr in SEARCH_END_TAG_ATTRS)
def getRecommendationGrade(recommendation: Tag) -> str:
    """Extract the grade for a given recommendation.

    Arguments:
        recommendation {Tag} -- Recommendation tag to be parsed.

    Raises:
        Exception -- Raised when grade is not found.

    Returns:
        str -- Grade of the recommendation.
    """
    # Change if additional grades are added
    possible_grades = ['A', 'B', 'C', 'E']

    # Grades are rendered in bold; scan every <strong> for a grade letter.
    strong_objects = recommendation.find_all('strong')
    for strong in strong_objects:
        if strong.text in possible_grades:
            return strong.text

    # No recommendation grade found -- include a message so callers'
    # logs show why (the previous bare `raise Exception` gave no context).
    raise Exception('No recommendation grade found')
def filter_parser(body: element.Tag) -> List[Dict]:
    """Filter toggles (pressable buttons)."""
    filters = body.find_all(attrs={"role": "button", "aria-pressed": True})
    return [element_to_dict(f, category='link-filter') for f in filters]
def extractinfo(info: Tag) -> None:
    """Append a Workinfo parsed from *info* to the module-level `wrks`
    list; a None link is silently skipped."""
    if info is None:
        return
    workid = str(info['href']).split('=')[1]
    paragraphs = info.find_all('p')
    # strip('<p>/') removes the literal markup characters around the text.
    title = str(paragraphs[0]).strip('<p>/')
    author = str(paragraphs[1]).strip('<p>/')
    wrks.append(Workinfo(workid, title, author))
def embed_css_files(soup):
    """ Look for <link> elements of CSS and embed them if they are local files"""
    # <link href="..." rel="stylesheet" type="text/css"/>
    for link in list(
            soup.findAll('link', attrs={
                'rel': 'stylesheet',
                'href': True
            })):
        href = link.attrs['href']
        if href.startswith('/'):  # not on windows?
            logger.info('Embedding %r' % href)
            # Context manager closes the handle promptly; the previous
            # bare open().read() leaked the file descriptor.
            with open(href) as css_file:
                data = css_file.read()
            style = Tag(name='style')
            style.attrs['type'] = 'text/css'
            style.string = data
            link.replace_with(style)
def _get_report_type_and_subject_variants(form: Tag):
    """Build every (report type, subject) pair from the form's two
    <select> drop-downs.

    The server form holds two drop-down lists; to generate a report the
    client must submit every combination of a value from the first list
    with a value from the second -- i.e. the Cartesian product of the
    two option sets, which this method constructs.

    :param form: the form in which to look for the <select> elements
    :type form: Tag
    :rtype: list
    """
    from itertools import product

    select_subject_attr_name_value = setting['base-param-for-form']['name-field-select-subject']
    select_report_type_attr_name_value = setting['base-param-for-form']['name-field-select-report']

    # Locate both <select> elements and fail fast when either is
    # missing.  (The original asserted only *after* dereferencing them,
    # so a missing element raised AttributeError instead of showing the
    # intended assertion message.)
    select_report_type = form.find(attrs={'name': select_report_type_attr_name_value})
    select_subject = form.find(attrs={'name': select_subject_attr_name_value})
    assert select_report_type and select_subject, 'Не удалось найти теги <select>'

    # Report-type options.
    options_select_report_type = select_report_type.find_all('option')
    # Subject (subdivision) options.
    select_subject_report_type = select_subject.find_all('option')

    # `selects` holds one dict per combination, keyed by the two form
    # field names (e.g. ctl00$ContentPlaceHolder1$ddlTip and
    # ctl00$ContentPlaceHolder1$Subjects) -- the Cartesian product of
    # the two option sets.
    selects = []
    for type_option, subject in product(options_select_report_type, select_subject_report_type):
        selects.append({
            select_report_type['name']: type_option['value'],
            select_subject['name']: subject['value']
        })
    return selects
def create_question(self, element: Tag) -> Question:
    """Build a Question from one search-result element.

    Numeric fields are parsed from their display text; reputation may
    be abbreviated in thousands (e.g. "15.2k").
    """
    question_id = element.attrs["id"]

    title_element = element.select(Scrapper.QUESTION_TITLE)[0]
    title = title_element.text
    link = Scrapper.WEBSITE + title_element.attrs["href"]

    user_name = element.select(Scrapper.USER_NAME)[0].text
    # Reputation looks like "12,345" or "15.2k".  The previous
    # replace("k", "000") turned "15.2k" into "15.2000" (= 15.2);
    # scale by 1000 instead so fractional abbreviations are correct.
    reputation_string = element.select(Scrapper.REPUTATION)[0].text.replace(",", "")
    if reputation_string.endswith("k"):
        reputation = float(reputation_string[:-1]) * 1000
    else:
        reputation = float(reputation_string)

    tags = [i.text for i in element.select(Scrapper.TAG)]
    votes = int(element.select(Scrapper.VOTES)[0].text)
    answers = int(element.select(Scrapper.ANSWERS)[0].text)
    views = int(element.select(Scrapper.VIEWS)[0].text.replace("views", ""))
    time = element.select(Scrapper.TIME)[0].text

    user = User(user_name, reputation)
    return Question(question_id, title, user, link, votes, answers, views, tags, time)
def get_xml(self):
    """Serialize the task as an XML string (Python 2: returns unicode).

    Structure: a root element named after the task, containing the raw
    text in a CDATA-wrapped <TEXT>, the <TAGS> element, and the
    tokenizer's <TOKENS> element.
    """
    xml = u'<?xml version="1.0" encoding="UTF-8" ?>\n'
    root = Tag(name=self.task)
    text = Tag(name='TEXT')
    # CDATA section so the raw text needs no XML escaping.
    text.append(CData(self.text()))
    tags = self.TAGS
    tokens = (BS(
        self.tokenizer.get_tokenized_as_xml().encode('utf-8'), 'xml'
    )).TOKENS
    # Interleave newlines purely for readable output.
    elements = [u'\n', text, u'\n', tags, u'\n', tokens, u'\n']
    for element in elements:
        if element:  # if missing tags, system will crash
            root.append(element)
    xml += unicode(root)
    return xml
def convert(tag: "Tag", field_type: type = str, strip_comma: bool = False):
    """Pull a text value out of a table cell and coerce it.

    Looks first for a ``div.field-item``, then an inline field label,
    then falls back to the cell's last content node.

    :param tag: the table cell (BeautifulSoup Tag) to read
    :param field_type: target type; for ``str``, '-' and '?' become None.
        (The previous annotation ``Generic(str, int)`` was invalid and
        raised at definition time.)
    :param strip_comma: drop thousands separators before converting
    :return: the converted value, or None when absent/unparseable
    """
    result = tag.find('div', class_='field-item')
    if result:
        result = result.string
    elif tag.div and 'field-label-inline' in tag.div.get('class'):
        result = tag.div.contents[-1].strip()
    else:
        try:
            result = tag.contents[-1].strip()
        except AttributeError:
            result = None

    if field_type == str:
        # Placeholder glyphs mean "no value".
        if result == '-' or result == '?':
            result = None
    else:
        if strip_comma and result is not None:
            result = result.replace(',', '')
        try:
            result = field_type(result)
        except (ValueError, TypeError):
            # TypeError covers result=None, which previously escaped the
            # bare `except ValueError` and crashed (e.g. int(None)).
            result = None
    return result
def parse_weapon_tag(tag: Tag, weapon_type):
    """Parse one weapon table row (<tr>) into a dict.

    Cell layout is positional; ``convert`` coerces cell text.  The
    "sold for" cell arrives as text like "1,000 mesos" and is reduced
    to its integer amount (0 when absent or non-numeric).
    """
    cells = tag.find_all('td')
    weapon = dict(name=cells[1].strong.a.string,
                  weapon_type=weapon_type.value,
                  required_level=convert(cells[2], int),
                  required_stats=convert(cells[3]),
                  weapon_attack=convert(cells[4], int),
                  attack_speed=convert(cells[5]),
                  job=convert(cells[6]),
                  effects=convert(cells[7]),
                  available_upgrades=convert(cells[8], int),
                  sold_for=convert(cells[9]),
                  dropped_by=convert(cells[10]),
                  available_from=None,
                  remarks=None)
    try:
        tmp = weapon['sold_for'].index(' ')
        weapon['sold_for'] = int(weapon['sold_for'][:tmp].replace(',', ''))
    except (ValueError, AttributeError):
        # ValueError: no space / non-numeric prefix.
        # AttributeError: convert() returned None (this case previously
        # escaped the bare `except ValueError` and crashed).
        weapon['sold_for'] = 0
    return weapon
def setProperty(self, name, value):
    """Set configuration property *name* to *value*, creating the
    <property><name/><value/></property> node when it does not exist.

    Returns True on success, False when the arguments are invalid or
    fail verification.
    """
    # isinstance instead of `type(...) != str`: also accepts str subclasses.
    if not isinstance(name, str) or not isinstance(value, str):
        print_error('key and value must be str')
        return False
    if not verify(name, value):
        return False
    ele = self.soup.find(name=NAME, text=name)
    if ele is None:  # `is None`, not `== None`
        # Property does not exist yet: build and append a new node.
        p = Tag(name='property')
        n = Tag(name=NAME)
        n.string = name
        v = Tag(name=VALUE)
        v.string = value
        p.append(n)
        p.append(v)
        configuration_tag = self.soup.find('configuration')
        configuration_tag.append(p)
    else:
        # Property exists: overwrite its <value> text in place.
        ele.parent.find(VALUE).string = value
    return True
def extract_by_class(cls, review: Tag, class_name: str) -> Tag:
    """Return the first descendant of *review* carrying *class_name*
    (None when absent)."""
    match = review.find(class_=class_name)
    return match