def class_tr_to_str(tr: bs4.element.Tag) -> str:
    """Render one class-table row as a Markdown line: a backticked header
    plus an optional description converted from the row's HTML."""
    # Rewrite relative links to absolute API links (mutates the parsed tree).
    for anchor in tr.find_all("a"):
        anchor["href"] = BASE_API_URL + anchor["href"]

    header_td = tr.find("td", class_="header")
    description_td = tr.find("td", class_="description")
    name_span = header_td.find("span", class_="element-name")

    # A row describes either an attribute ("param") or a function.
    if header_td.find("span", class_="attribute-type") is not None:
        param_type = header_td.find("span", class_="param-type").text.strip()
        mode = header_td.find("span", class_="attribute-mode").text
        header = f"`{name_span.text} :: {param_type}` {mode}"
    else:
        header = f"`{name_span.text}`"

    contents = [item for item in description_td.contents if item != " "]
    if not contents:
        return header

    if len(contents) > 1 and "\n" not in contents[0]:
        # Multi-node description: convert all but the trailing node to Markdown.
        description = tomd.convert(
            f"<p>{''.join(str(item) for item in contents[:-1])}</p>"
        ).strip()
    else:
        description = contents[0].strip()
    return f"{header} - {description}"
def process_product(tag: bs4.element.Tag) -> models.GoodwillItem:
    """Build a GoodwillItem from one product listing tag.

    Returns None (after logging) when any part of the listing cannot be
    parsed, so one malformed listing does not abort the whole scrape.
    """
    try:
        # Title text may carry a trailing "Bids: N" suffix.
        pieces = tag.find("div", "title").text.split("Bids: ")
        bids = int(pieces[1].strip()) if len(pieces) > 1 else 0
        title = pieces[0].strip()

        item_number = tag.find("div", "product-number").contents[1]

        raw_price = tag.find("div", "price").text.replace("Buy It Now", "").strip()
        price = Decimal(re.sub(r"[^\d.]", "", raw_price))

        countdown = tag.find("div", "timer countdown product-countdown")
        end_date = dt.datetime.strptime(
            countdown.get("data-countdown"), "%m/%d/%Y %I:%M:%S %p")

        return models.GoodwillItem(
            title=title,
            item_number=item_number,
            price=price,
            end_date=end_date,
            bids=bids,
        )
    except Exception as e:
        # Best-effort: log and skip rather than crash the scrape.
        print(f"could not create GoodwillItem due to {e} for tag {tag}")
        return None
def _parse_one_person(self, person: bs4.element.Tag, _stage: str, i: int) -> Dict:
    """Parse one player anchor into a result dict.

    ``i`` is the player's index in the listing; for showdown modes it
    determines the (solo or duo) group number.
    """
    # The four small-font divs are: trophy count, (unused), level, name.
    trophy, _, level, name = [
        div.text.strip()
        for div in person.find_all("div", attrs={"style": self._style_font_xs})
    ]
    result = {
        "trophy": int(trophy),
        "level": int(level),
        "name": name,
        "hero": self.hero_map[person.find("img").get("src")],
        "playerId": person.get("href").split("/")[-1],
        "isTeammate": False,
    }
    if _stage == "Duo Showdown":
        # Players are listed in pairs, so two consecutive indices share a group.
        result["group"] = i // 2
        result["is_mvp"] = np.nan
    elif _stage == "Showdown":
        result["group"] = i
        result["is_mvp"] = np.nan
    else:
        result["group"] = np.nan
        result["is_mvp"] = person.find("img", attrs={"src": self._img_mvp}) is not None
    return result
def extract_stadium_data(stadium: bs.element.Tag) -> dict:
    """Pull stadium name, nickname and city out of one stadium tag.

    The first <a> holds the full name (title attribute) and nickname
    (link text); the second <a>'s text is the city.
    """
    main_link = stadium.find('a')
    all_links = stadium.find_all('a')
    return {
        "stadium_name": main_link.get("title"),
        "stadium_nick": main_link.text,
        "stadium_city": all_links[1].text,
    }
def from_tag(cls, tag: bs4.element.Tag):
    """Alternate constructor: build an instance from one search-result tag."""
    def _text(name, css_class):
        return tag.find(name, class_=css_class).text

    # Tuple (not list) keeps the value hashable for pandas.drop_duplicates.
    access = tuple(d.text for d in tag.select("li.cassetteitem_detail-col2 div"))
    raw_age, raw_floors = (d.text for d in tag.select("li.cassetteitem_detail-col3 div"))
    return cls(
        _text("div", "cassetteitem_content-label"),
        _text("div", "cassetteitem_content-title"),
        _text("li", "cassetteitem_detail-col1"),
        access,
        parse_age(raw_age),
        parse_floors(raw_floors),
    )
def get_bloger(self, tag: bs4.element.Tag):
    """Extract blogger name and id from a tag; both fall back to None."""
    try:
        link = tag.find("span", attrs={"class": "ell2"}).a
        user_name = link.text
        # The id is the last path segment of the profile URL.
        user_id = link["href"].split("/")[-1]
    except AttributeError as e:
        print(e)
        user_name = None
        user_id = None
    return User(user_name, user_id)
def _movie_item(item_soup: bs4.element.Tag) -> MovieItem:
    """Turn one search-result tag into a MovieItem."""
    litpic = item_soup.find('div', attrs={'class': 'litpic'})
    paragraphs = item_soup.find('div', attrs={'class': 'title'}).find_all('p')
    return MovieItem(
        litpic.a.img['src'],          # poster image URL
        _url(litpic.a['href']),       # absolute subject page URL
        paragraphs[0].b.getText(),    # title
        paragraphs[1].a.getText(),    # alternate title
        paragraphs[2].getText(),      # info line
        paragraphs[3].getText(),      # star rating text
    )
def offers_iter(element: bs4.element.Tag, offers: list) -> list:
    '''Get detail from offer, append it to *offers* when it matches the
    target product, and return *offers*.'''
    offer = {
        'name': element.h4.text.strip(),
        'price': get_price_fr_text(
            element.find('div', {'class': 'row-price'}).span.text),
        'shop': element.find('div', {'class': 'shopname'}).text,
        'url': element.a['href'],
    }
    logger.debug('\nName: %s \nPrice: %s \nShop: %s',
                 offer['name'], offer['price'], offer['shop'])
    # NOTE(review): relies on a module-level `target` — confirm the caller sets it.
    if same_product(target, offer):
        offers.append(offer)
    return offers
def _get_ambiguation(tag: bs4.element.Tag) -> str:
    """
    Get pretty ambiguation from example.

    :return: 'disambiguated' or 'not disambiguated' or 'Not found'.
    """
    span = (tag.find('span', {'class': 'off'})
            or tag.find('span', {'class': 'on'}))
    # Truthiness (not `is None`) on purpose: an empty bs4 Tag is falsy too.
    if not span:
        return 'Not found'
    # The text arrives wrapped in brackets, e.g. '[...]'; strip them.
    # TODO: use regexp here
    return span.text.strip()[1:-1].strip()
def get_pronounce(p: bs4.element.Tag):
    """Split the <ul> text into (word, [transcription]) tuples.

    Each whitespace-separated token is expected to look like
    ``word[transcription]``.
    """
    tokens = p.find('ul').text.strip().split()
    return [re.match(r'(.*)(\[.*\])', token).groups() for token in tokens]
def get_lecture(li: bs4.element.Tag, year: str = "2021") -> dict:
    """Extract one lecture's metadata and a download URL from a list item.

    Args:
        li: the <li> element; its <span> text ends with an "M-D" date and
            its <button> onclick carries the download identifiers.
        year: calendar year to prefix onto the parsed "M-D" date.
            (Generalized from the previously hard-coded 2021; default keeps
            the old behavior.)

    Returns:
        dict with title, date, data_seq, part_seq and remote_url.
    """
    title = li.span.text

    # The date is the last whitespace-separated token of the title, "M-D".
    date = title[title.rfind(" ") + 1:]
    if date.find("-") == 1:
        # Single-digit month: zero-pad it.
        date = f"{year}-0{date}"
    else:
        date = f"{year}-{date}"

    # onclick looks like fn('..', '..', 'DATA_SEQ', 'PART_SEQ'); grab the
    # quoted 3rd and 4th arguments.
    onclick = li.find("button")["onclick"]
    args = onclick[onclick.find("(") + 1:]
    args = args[:args.find(")")].split(",")
    data_seq = args[2].strip()[1:-1]  # per-file identifier
    part_seq = args[3].strip()[1:-1]  # parent (book) identifier

    remote_url = (
        "https://www.bookdonga.com/utility/download.donga"
        "?type=EXTRADATAFILE&fieldname=listen_flnm"
        f"&data_seq={data_seq}&part_seq={part_seq}"
    )
    print(f"{date},{remote_url},{title}")
    return {
        "title": title,
        "date": date,
        "data_seq": data_seq,
        "part_seq": part_seq,
        "remote_url": remote_url
    }
def extract_detail(item: bs4.element.Tag) -> DetailData:
    """Extract review count, star rating and product link from one item tag.

    Args:
        item (bs4.element.Tag): product markup

    Returns:
        DetailData(review count, star rating text, product link)
    """
    # Review count: with multiple matches, keep the first purely-numeric one.
    review_tags = item.findAll(**NUM_REVIEWS_PATTERN_ARGS)
    if len(review_tags) == 1:
        review_text = review_tags[0].text.strip()
    else:
        numeric = [t for t in review_tags if t.text.strip().isdecimal()]
        review_text = numeric[0].text.strip()

    # Star rating: with multiple matches, the last one belongs to the item.
    star_tags = item.findAll(**STAR_PATTERN_ARGS)
    if len(star_tags) == 1:
        star_text = star_tags[0].text.strip()
    else:
        star_text = star_tags[-1].text.strip()

    review_count = int(review_text.replace(',', ''))
    star = star_text.split(' ')[-1]
    link = item.find(**LINK_PATTERN_ARGS).get('href')
    return DetailData(review_count, star, link)
def _get_datetime(
    self,
    article: newspaper.article.Article,
    li: bs4.element.Tag,
) -> datetime:
    """Resolve an article's publish datetime.

    Prefers the datetime extracted by newspaper; otherwise parses the
    list item's "info" span, which holds either a relative Korean phrase
    ("N분/시간/일" = N minutes/hours/days ago) or an absolute "Y.m.d." date.

    Returns None when no date can be determined.
    """
    publish_date = None
    if article is not None:
        publish_date = article.publish_date
    if publish_date is None:
        date_list = li.find_all("span", {"class": "info"})
        if len(date_list) > 1:
            date = date_list[-1].text
        else:
            date = li.find("span", {"class": "info"}).text
        date = date.split()[0]
        if "분" in date:  # "N minutes ago"
            minutes = re.sub(r"[^\d+]", "", date)
            publish_date = datetime.now() - timedelta(minutes=int(minutes))
        elif "시간" in date:  # "N hours ago"
            hours = re.sub(r"[^\d+]", "", date)
            publish_date = datetime.now() - timedelta(hours=int(hours))
        elif "일" in date:  # "N days ago"
            days = re.sub(r"[^\d+]", "", date)
            publish_date = datetime.now() - timedelta(days=int(days))
        else:
            try:
                publish_date = datetime.strptime(date, "%Y.%m.%d.")
            except ValueError:
                # Unrecognised absolute format: leave publish_date as None.
                # (Was a bare `except:`, which also hid unrelated errors.)
                pass
    return publish_date
def _parse_one_block(self, block: bs4.element.Tag) -> Dict:
    """Parse one match block into a result dict including all players."""
    outcome = block.find("div", class_=self._class_res).text.strip()
    # Medium-font divs: stage name and trophy rewards.
    stage, rewards = [
        div.text.strip()
        for div in block.find_all("div", attrs={"style": self._style_font_m})
    ]
    # Small-font divs: match type, time and map.
    match_type, match_time, match_map = [
        div.text.strip()
        for div in block.find_all("div", attrs={"style": self._style_font_s})
    ]
    players = [
        self._parse_one_person(anchor, stage, idx)
        for idx, anchor in enumerate(block.find_all("a"))
    ]
    players = self._assign_teammates(players, stage)
    return {
        "match": outcome,
        "stage": stage,
        "map": match_map,
        "rewards": int(rewards),
        "type": match_type,
        "time": match_time,
        "players": players,
    }
def _group_activity_standings_results(
        eid: int, gid: int,
        table: bs4.element.Tag) -> Tuple[pd.DataFrame]:
    """Split one group's cross table into (group, activity, standings,
    results) data frames."""
    cross_table = _cross_table(eid, gid, table)

    # The last row may be a "Results from: <file>" footer rather than a
    # player row; if so, capture the file metadata and drop the row.
    last_row = cross_table.tail(1)
    results_from = str(last_row.iloc[0, 2])
    sep = 'Results from: '
    if results_from.startswith(sep):
        cross_table.drop(last_row.index, inplace=True)
        file_from, file_date, file_name = _results_from(
            results_from.split(sep)[1])
    else:
        file_from, file_date, file_name = [np.nan] * 3

    # Elo (1978) notation:
    # M = number of players
    # N = number of games (here: number of rounds)
    M, N = cross_table.filter(regex='Results').shape
    cross_table = cross_table.join(_unplayed_games(M, N, table))

    group = (_group(eid, gid, table.find('thead').find_all('tr'))
             .assign(M=M, N=N, file_from=file_from, file_date=file_date,
                     file_name=file_name))
    return (group, _activity(cross_table), _standings(cross_table),
            _results(cross_table))
def define_tr_to_str(tr: bs4.element.Tag) -> str:
    """Render one define-table row as a Markdown line: the backticked name
    plus an optional description."""
    # Make relative links absolute (mutates the parsed tree).
    for anchor in tr.find_all("a"):
        anchor["href"] = BASE_API_URL + anchor["href"]

    name = tr.find("td", class_="header").string
    contents = tr.find("td", class_="description").contents
    if not contents:
        return f"`{name}`"

    if len(contents) > 1 and "\n" not in contents[0]:
        # Multi-node description: convert all but the trailing node to Markdown.
        description = tomd.convert(
            f"<p>{''.join(str(item) for item in contents[:-1])}</p>"
        ).strip()
    else:
        # Single node: keep only its first line.
        description = contents[0].split('\n')[0].strip()
    return f"`{name}` - {description}"
def _get_graphic_url(content: bs4.element.Tag) -> Optional[str]:
    """
    Get URL to the graphic.

    Returns None when the link is absent or malformed.
    """
    # Fix: the annotation was `-> str or None`, which evaluates to just
    # `str`; Optional[str] states the actual contract.
    a = content.find('a', {'target': '_blank'})
    try:
        link = a['href']
    except (KeyError, TypeError, AttributeError):
        return None
    return f"{BASE_RNC_URL}/{link}"
def __init__(self, tag: bs4.element.Tag):
    """Populate a submission record from one row of the status table."""
    self.id: int = int(tag.find('td', {'class': 'id-cell'}).text)
    self.author: str = tag.find(
        'td', {'class': 'status-party-cell'}).text.strip().rstrip()

    parameters = tag.find_all('td', {'class': 'status-small'})
    self.date: datetime = datetime.strptime(
        parameters[0].text.strip().rstrip(), '%d.%m.%Y %H:%M')
    self.problem: str = parameters[1].text.strip().rstrip()

    # Time/memory cells may be absent (e.g. while the submission is judging).
    time_cell = tag.find('td', {'class': 'time-consumed-cell'})
    self.time: str = (
        time_cell.text.strip().rstrip() if time_cell is not None else '')
    memory_cell = tag.find('td', {'class': 'memory-consumed-cell'})
    self.memory: str = (
        memory_cell.text.strip().rstrip() if memory_cell is not None else '')

    status_cell = tag.find('td', {'class': 'status-cell'})
    if status_cell is None or status_cell.span is None:
        self.status: str = 'WAITING'
    else:
        self.status: str = status_cell.span['submissionverdict']
    # Fix: the original read `tmp.span` / `tmp['waiting']` here even when the
    # status cell was missing, raising AttributeError on None.
    if (status_cell is not None and status_cell.span is None
            and status_cell['waiting'] == 'false'):
        self.status = 'UNKNOWN'

    judged = (None if status_cell is None else
              status_cell.find('span', {'class': 'verdict-format-judged'}))
    self.test: int = -1 if judged is None else int(judged.text)
def get_text_title(quote: bs4.element.Tag) -> Optional[str]:
    """Return title from html tag class as string"""
    try:
        anchor = quote.find(class_="authorOrTitle")
        # The title text lives two siblings past the author/title anchor.
        raw = anchor.nextSibling.nextSibling.text
        return raw.replace("\n", "").strip()
    except AttributeError:
        # Anchor or siblings missing: no title available.
        return None
def get_text_author(quote: bs4.element.Tag) -> Optional[str]:
    """Return author from html tag class as string"""
    try:
        raw = quote.find(class_="authorOrTitle").text
        # Drop commas and newlines, then trim surrounding whitespace.
        return raw.replace(",", "").replace("\n", "").strip()
    except AttributeError:
        return None
def from_td_tag(cls, quality: Quality, tag: bs4.element.Tag):
    """Build PriceData from a listing <td>; returns None when parsing fails."""
    try:
        cell_text = tag.find('table').find('tr').find_all('td')[1].find('div').text
        match = _price_regex.match(cell_text)
        return PriceData(quality=quality,
                         quantity=int(match[1]),
                         price=float(match[2]))
    except Exception:
        # Best-effort parse: any structural mismatch yields None.
        pass
def get_text_tags(quote: bs4.element.Tag) -> Optional[list]:
    """Return the quote's tags as a list of strings (or None).

    The tag element's text is comma-separated; the first piece is the
    label/prefix and is dropped.
    """
    try:
        text = quote.find(class_="greyText smallText left").text
        tags = [piece.strip() for piece in text.split(',')]
        # Fix: the original ended with `return tags.strip()`, which raised
        # AttributeError on the list and made the function always return
        # None via the except clause. Return the list itself.
        return tags[1:]
    except AttributeError:
        return None
def downstream_points(ts: int, table1: bs4.element.Tag, table2: bs4.element.Tag) \
        -> typ.Generator[InfluxPoint, None, None]:
    """Yield one Influx point per downstream channel.

    The modem's codeword counters are cumulative, so per-interval values
    are derived from the module-level ``last_correct``/``last_uncorrect``
    snapshots, which this function updates as a side effect.
    """
    global last_correct, last_uncorrect

    # Upper table: signal-quality rows (first row is a header).
    _, channel, frequency, snr, modulation, power = \
        table1.find('tbody')('tr', recursive=False)
    channel_ids = [int(td.text) for td in datacells(channel)]
    snrs = [float(td.text.split(' ')[0]) for td in datacells(snr)]
    modulations = [td.text.strip() for td in datacells(modulation)]
    power_levels = [float(td.text.split(' ')[0]) for td in datacells(power)]

    # Lower table: cumulative codeword counts.
    _, _, _, correctable, uncorrectable = table2.find('tbody')('tr',
                                                               recursive=False)
    correctables = [int(td.text) for td in datacells(correctable)]
    uncorrectables = [int(td.text) for td in datacells(uncorrectable)]

    # Interval = current cumulative - previous snapshot (0 for unseen channels).
    int_correctables = [
        v - last_correct.get(channel_ids[i], v)
        for i, v in enumerate(correctables)
    ]
    int_uncorrectables = [
        v - last_uncorrect.get(channel_ids[i], v)
        for i, v in enumerate(uncorrectables)
    ]
    last_correct = {channel_ids[i]: v for i, v in enumerate(correctables)}
    last_uncorrect = {channel_ids[i]: v for i, v in enumerate(uncorrectables)}

    def field_set(i: int) -> InfluxSet:
        return {
            'snr_db': snrs[i],
            'modulation': modulations[i],
            'power_dbmv': power_levels[i],
            'interval_correctable_codewords': int_correctables[i],
            'interval_uncorrectable_codewords': int_uncorrectables[i]
        }

    yield from (InfluxPoint(measurement='downstream',
                            tag_set={'channel_id': cid},
                            field_set=field_set(i),
                            timestamp=ts)
                for i, cid in enumerate(channel_ids))
def _parse_previous_price(product: bs4.element.Tag) -> decimal.Decimal:
    """
    Parse the fragment with the previous product price.

    If such fragment doesn't exist, assume there is no price reduction
    and fall back to the current price.
    """
    tag = product.find("span", class_="as-price-previousprice")
    # Truthiness (not `is None`) on purpose: an empty bs4 Tag is falsy too.
    if not tag:
        return _parse_current_price(product)
    return _extract_price(tag.get_text())
def _parse_media(self, media: bs4.element.Tag) -> Tuple[str, str]:
    """
    Get link to the media file and filepath.

    Returns (media URL, destination path under MEDIA_FOLDER). Propagates
    whatever `media.find('a')['href']` raises on malformed markup — the
    original wrapped this in a no-op `try/except Exception: raise`.
    """
    media_link = media.find('a')['href']
    # The URL's `?name=` query part doubles as the local filename.
    media_link, filename = media_link.split('?name=')
    return media_link, self.MEDIA_FOLDER / filename
def contains_more_details(self, element: bs4.element.Tag):
    """True when the element's detail div has a "Докладніше" (read more) span."""
    detail_div = element.find("div", class_="TJUuge")
    if detail_div is None:
        return False
    return any(span.text == "Докладніше" for span in detail_div.findAll("span"))
def get_noti_id(self, bs4_item_tag: bs4.element.Tag):
    """Extract the post number (nttSn) from the item's <link> URL."""
    noti_url = bs4_item_tag.find('link').get_text()
    # Walk the query parameters after '?' looking for nttSn.
    for param in noti_url.split('?')[1].split('&'):
        if 'nttSn' in param:
            return int(param.split('=')[1])
    raise ValueError('Could not find post number (sttSn) in Link')
def get_other_page_urls_from_overview_page_stepbridge_my_results(page_soup: bs4.element.Tag) -> list:
    """Collect the unique pagination hrefs from a results overview page.

    Returns [] when the page has no pagination block.
    """
    try:
        pagination = page_soup.find('ul', {'class': 'pagination'})
        anchors = [
            item.find('a')
            for item in pagination.find_all('li', {'class': 'page-item'})
            if item.find('a') is not None
        ]
        hrefs = [anchor['href'] for anchor in anchors]
        # Dedupe while preserving first-seen order.
        return list(OrderedDict.fromkeys(hrefs))
    except AttributeError:
        return []
def get_event_description(div: bs4.element.Tag) -> str:
    """Join the event's non-empty <p> paragraphs, converted to Markdown,
    one paragraph per output line."""
    # Make relative links absolute (mutates the parsed tree).
    for anchor in div.find_all("a"):
        anchor["href"] = BASE_API_URL + anchor["href"]

    lines = []
    for paragraph in div.select("div.element-content > p"):
        contents = paragraph.contents
        # Skip paragraphs whose sole content is whitespace.
        if len(contents) == 1 and len(contents[0].strip()) == 0:
            continue
        markdown = html.unescape(tomd.convert(str(paragraph)))
        lines.append(markdown.strip().replace("\n", ""))
    return "\n".join(lines)
def _parse_doc(self, doc: bs4.element.Tag) -> List[Any]:
    """
    Parse the documents to examples.
    """
    try:
        media = doc.find('td', {'valign': 'top'})
        example = doc.find('td', {'class': 'murco-snippet'})
    except ValueError:
        return []

    media_url, filename = self._parse_media(media)
    new_ex = self.ex_type(*self._parse_example(example), media_url, filename)
    new_ex.mark_found_words(self.marker)
    self._add_wordforms(new_ex.found_wordforms)
    return [new_ex]
def __parse_author(self, review_item_el: bs4.element.Tag):
    """Return the review author's profile href, or None when absent."""
    author_el = review_item_el.find("a", class_="AMrStc")
    if author_el is not None and "href" in author_el.attrs:
        return author_el.attrs['href']
    return None