from __future__ import annotations  # lets annotations reference classes defined later

import html
import re
from collections import Counter
from datetime import datetime
from typing import List, Optional

import bs4
import tomd  # third-party HTML-to-Markdown converter used by get_event_description


def parse_thread_page(el: bs4.element.Tag) -> AttrDict:
    # Parse a single phpBB-style forum post: author, body (HTML and text), date
    out = AttrDict()
    out.user = el.select('.postprofile dt')[0].text.strip()
    out.body_html = str(el.select('.content')[0]).strip()
    out.body_text = el.select('.content')[0].text.strip()
    out.date = el.select('.postbody .author')[0].text.strip()
    return out
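
# AttrDict is not defined in these snippets. A minimal sketch of the usual
# dict-with-attribute-access pattern the code implies (an assumption, not
# the original implementation):
class AttrDict(dict):
    """dict whose items can also be read and written as attributes."""

    def __getattr__(self, name):
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name)

    def __setattr__(self, name, value):
        self[name] = value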

def parse_link(link: bs4.element.Tag, domain: str) -> AttrDict:
    # 'Zugriffe' / 'Antworten' are the German board's "views" / "replies" labels
    out = AttrDict()
    out.title = link.select('a:nth-of-type(1)')[0].text
    out.views = link.select('.views')[0].text.replace('Zugriffe', '').strip()
    out.answers = link.select('.posts')[0].text.replace('Antworten', '').strip()
    out.date = link.select('a:nth-of-type(3)')[0].text
    out.url = domain + link.select('a:nth-of-type(1)')[0].attrs['href'].replace('./', '/')
    return out
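
# Usage sketch for parse_link: the sample row below is hand-written to match
# the selectors above (the real board markup may differ).
_row = bs4.BeautifulSoup(
    '<li><a href="./viewtopic.php?t=1">Titel</a>'
    '<span class="posts">3 Antworten</span>'
    '<span class="views">42 Zugriffe</span>'
    '<a href="#">u</a><a href="#">Mo 1. Apr 2019</a></li>',
    'html.parser').li
# parse_link(_row, 'https://forum.example') ->
#     {'title': 'Titel', 'views': '42', 'answers': '3',
#      'date': 'Mo 1. Apr 2019', 'url': 'https://forum.example/viewtopic.php?t=1'}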

@classmethod
def from_tag(cls, tag: bs4.element.Tag):
    category = tag.find("div", class_="cassetteitem_content-label").text
    title = tag.find("div", class_="cassetteitem_content-title").text
    address = tag.find("li", class_="cassetteitem_detail-col1").text
    # Use a tuple to avoid an unhashable-type error during pandas.drop_duplicates
    transportation = tuple(div.text for div in tag.select("li.cassetteitem_detail-col2 div"))
    age, floors = (div.text for div in tag.select("li.cassetteitem_detail-col3 div"))
    return cls(category, title, address, transportation, parse_age(age), parse_floors(floors))
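
# parse_age and parse_floors are helpers that are not shown here. Hypothetical
# sketches, assuming SUUMO-style listing text such as '築15年' (15 years old)
# and '3階建' (3 storeys):
def parse_age(text: str) -> int:
    """Hypothetical: extract the building age in years ('新築' -> 0)."""
    m = re.search(r'\d+', text)
    return int(m.group()) if m else 0


def parse_floors(text: str) -> int:
    """Hypothetical: extract the number of storeys."""
    m = re.search(r'\d+', text)
    return int(m.group()) if m else 1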

def get_explain(e: bs4.element.Tag):
    def f(ks):
        return ('pos' if 'pos_button' in ks
                else 'explain' if 'dictionaryExplanation' in ks
                else '?')

    return [(f(m.attrs['class']), m.text)
            for n in e.select('ul > li')
            for m in n.select('div')]
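
# Quick demonstration of get_explain on hand-written markup (the sample
# classes mirror the selectors above; the markup itself is an assumption):
_card = bs4.BeautifulSoup(
    '<ul><li><div class="pos_button">noun</div>'
    '<div class="dictionaryExplanation">a round fruit</div></li></ul>',
    'html.parser')
assert get_explain(_card) == [('pos', 'noun'), ('explain', 'a round fruit')]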

def parse_review(review: bs4.element.Tag) -> dict:
    """
    INPUT:
        review: HTML segment that contains all relevant review information
    OUTPUT:
        d: dictionary of relevant review information
    """
    d = {}
    if review.select_one("div.rating-10 span"):
        d['rating'] = int(review.select_one("div.rating-10 span").text)
    d['headline'] = review.select_one("h2.text_header").text
    try:
        d['country'] = review.select_one('h3.text_sub_header').text\
            .replace(')', '(').split('(')[1]
    except IndexError:
        d['country'] = 'None'
    d['body'] = review.select_one("div.text_content").text.strip()
    rows = review.select('tr')
    for row in rows:
        if row.select('td')[1].attrs['class'][0] == 'review-rating-stars':
            # Star rating rows: the value is the text of the 'star fill' spans
            for x in row.select('span'):
                try:
                    if x.attrs['class'] == ['star', 'fill']:
                        num = int(x.text)
                        d[row.td.attrs['class'][1]] = num
                except KeyError:
                    continue
        else:
            d[row.td.attrs['class'][1]] = row.select('td')[1].text
    return d
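
# Usage sketch (hedged): the selectors above match Skytrax-style airline
# review pages; the 'article[itemprop="review"]' container is an assumption.
# soup = bs4.BeautifulSoup(page_html, 'html.parser')
# reviews = [parse_review(r) for r in soup.select('article[itemprop="review"]')]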

def parse_event(event: bs4.element.Tag):
    """Extract the details of a single event from its list element."""
    url = event.select_one('.events-list-item-title h3 a').get('href')
    community = event.select_one('.events-list-item-group a')
    community = community.text if community else None
    # thumbnail = event.select_one('.event_thumbnail img').get('src')
    # thumbnail = ''
    # if re.search(r'/no_image_', thumbnail):
    #     thumbnail = None
    return Event(
        id=int(re.match(r'.+/(\d+)/?', url)[1]),
        title=event.select_one('.events-list-item-title h3 a span').text,
        url=url,
        dt_start=datetime.strptime(
            event.select_one('time').get('datetime'),
            '%Y-%m-%dT%H:%M:%S%z'),  # e.g. 2019-10-12T13:00:00+09:00
        # NOTE: dt_end parses the same (first) <time> element as dt_start
        dt_end=datetime.strptime(
            event.select_one('time').get('datetime'),
            '%Y-%m-%dT%H:%M:%S%z'),
        # amount=event.select_one('.amount').text,
        # thumbnail=thumbnail,
        community=community,
        owner=community,
        place=''.join(
            map(lambda x: x.text,
                event.select('.events-list-item-venue > span'))))
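
# The Event container class is not included in this snippet; a minimal
# dataclass sketch matching the keyword arguments used above (hypothetical):
from dataclasses import dataclass


@dataclass
class Event:
    id: int
    title: str
    url: str
    dt_start: datetime
    dt_end: datetime
    community: Optional[str]
    owner: Optional[str]
    place: str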

def __init__(self, item: bs4.element.Tag):
    self.name = item.select_one('span.txt').text
    sub_menu = item.select('li.listItem a')
    if len(sub_menu) == 0:
        # Leaf node: resolve its own link and id
        self.link, self.id = solve_link(item.select_one('a')['href'])
        return
    # Branch node: append one sub_industry per child link
    # (the enclosing class evidently subclasses list)
    for sub_item in sub_menu:
        self.append(sub_industry(sub_item))
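
# Hypothetical usage, assuming the enclosing class is something like
# 'Industry(list)' and each top-level menu entry nests li.listItem children;
# solve_link (not shown) is assumed to split an href into (link, id):
# soup = bs4.BeautifulSoup(menu_html, 'html.parser')
# industries = [Industry(li) for li in soup.select('ul.menu > li')]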

def __find_link(article: bs4.element.Tag) -> str:
    links: Counter = Counter()
    first_link = ''
    # A link inside a heading wins outright
    for header in ['h1', 'h2', 'h3']:
        header_link = article.select(f'{header} a[href]')
        if header_link:
            return header_link[0].attrs.get('href')
    # Otherwise count every href, remembering the first one seen
    for a_element in article.select('a[href]'):
        if not first_link:
            first_link = a_element.attrs.get('href')
        links.update([a_element.attrs.get('href')])
    if len(links) == 0:
        return ''
    # Prefer the most repeated href over the first one only when it is
    # strictly more frequent and looks like a usable path or URL
    most_common = links.most_common()[0][0]
    return (most_common
            if (links.most_common()[0][1] > links.get(first_link, 0)
                and (most_common.startswith('/')
                     or most_common.startswith('http')))
            else first_link)
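
# Behaviour demonstration on hand-written markup: no heading link, so the
# twice-repeated '/b' beats the first-seen '/a'.
_article = bs4.BeautifulSoup(
    '<article><p><a href="/a">x</a> <a href="/b">y</a> <a href="/b">z</a></p></article>',
    'html.parser').article
assert __find_link(_article) == '/b'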

def get_event_description(div: bs4.element.Tag) -> str:
    # Absolutise relative links before converting to Markdown
    for a in div.find_all("a"):
        a["href"] = BASE_API_URL + a["href"]
    data = (div.select("div.element-content > p"),
            div.find("div", class_="detail-content"))  # data[1] is unused here
    paragraphs = []
    for p in data[0]:
        contents = p.contents
        # Skip paragraphs that contain only whitespace
        if not (len(contents) == 1 and len(contents[0].strip()) == 0):
            paragraphs.append(html.unescape(tomd.convert(str(p))))
    return "\n".join([p.strip().replace("\n", "") for p in paragraphs])
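
# BASE_API_URL is a module-level constant (not shown) that prefixes the
# relative hrefs; tomd.convert turns the HTML fragment into Markdown and
# html.unescape then decodes entities. A hypothetical value for illustration:
# BASE_API_URL = 'https://www.example.org'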

def get_event_contents(div: bs4.element.Tag) -> List[str]:
    contains = div.select("div.detail-content > div")
    props = []
    for prop in contains:
        # Normalise non-breaking spaces before matching
        text = prop.text.replace("\xa0", " ")
        searchResult = propEx.search(text).groups()
        if searchResult[2] is not None:
            props.append(f"`{searchResult[0]}` - {searchResult[2]}")
        else:
            props.append(f"`{searchResult[0]}`")
    return props
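
# propEx is a module-level compiled regex that is not shown. From the usage
# above it must expose three groups with the third one optional; a purely
# hypothetical shape:
# propEx = re.compile(r'(\S+)(\s*-\s*(.+))?')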

def empirical_dispersion_parser(parsed_html: bs4.element.Tag) -> dict:
    EMPIRICAL_DISPERSION = 'EmpiricalDispersion'
    result = {EMPIRICAL_DISPERSION: []}
    for item in [str(span.text)
                 for span in parsed_html.select('span[class="kwit"]')]:
        if EMPIRICAL_DISPERSION in item:
            if '=' in item:
                # 'EmpiricalDispersion=GD3' -> record the value 'GD3'
                result[EMPIRICAL_DISPERSION].append(item.split('=')[-1])
            else:
                result[item] = []
    return result
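
# Demonstration on a minimal keyword span (hand-written markup that mimics
# the keyword-documentation pages these parsers appear to target):
_kw = bs4.BeautifulSoup(
    '<span class="kwit">EmpiricalDispersion=GD3</span>', 'html.parser')
assert empirical_dispersion_parser(_kw) == {'EmpiricalDispersion': ['GD3']}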

def extract_all_tags(self, tag: str, node: bs4.element.Tag):
    '''
    Use BeautifulSoup's select operator to return the list of all @tag
    tags inside the node @node. Any node can be chosen, e.g.:

        xml_code = "<xml><function><if>if <condition>(<expr><name>var</name></expr>)</condition></if></function></xml>"
        parser = SrcmlParser(xml_code)
        tags = parser.extract_all_tags("if", parser.soup)
        print(len(tags))
    '''
    tags = node.select(tag)
    return tags

def pure_functionals_parser(parsed_html: bs4.element.Tag) -> list:
    def exchange_functionals(section: bs4.element.Tag):
        ul = section.find_next_sibling('ul')
        for item in ul.select('span[class="kwit"]'):
            yield item.text

    def correlation_functionals(section: bs4.element.Tag):
        # The correlation part spans two consecutive <ul> sections
        ul = section.find_next_sibling('ul')
        for item in ul.select('span[class="kwit"]'):
            yield item.text
        ul = ul.find_next_sibling('ul')
        for item in ul.select('span[class="kwit"]'):
            yield item.text

    def standalone_functionals(section: bs4.element.Tag):
        ul = section.find_next_sibling('ul')
        for item in ul.select('span[class="kwit"]'):
            yield item.text

    exchange = []
    correlation = []
    standalone = []
    mapper = {
        'Exchange Functionals': (exchange_functionals, exchange),
        'Correlation Functionals': (correlation_functionals, correlation),
        'Standalone Pure Functionals': (standalone_functionals, standalone)
    }
    sections = parsed_html.select('h3[class="ksection"]')
    for section in sections:
        func, store = mapper[str(section.text)]
        store.extend(list(func(section)))
    # Combine: exchange * correlation
    result = []
    # XXX we have to consider those stand-alone exchange functionals, like
    # S/HFS, XA/XAlpha, B/HFB, etc.
    for exchange_item in exchange:
        for correlation_item in correlation:
            result.append(exchange_item + correlation_item)
    result.extend(standalone)
    return result
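
# Worked example of the combination step above: with exchange ['B'] and
# correlation ['LYP'], the cross product yields 'BLYP'; standalone entries
# such as 'M06' are then appended unchanged (names are illustrative).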

def extract_birthdays(ultag: bs4.element.Tag) -> list:
    '''Collect every birthday from the <li> elements inside the given <ul> tag and return them as a list.'''
    ret = []
    li_elms = ultag.select("li")
    for litag in li_elms:
        brtdy: birthday.Birthday = birthday.parse_birthday(litag.get_text())
        if brtdy is None:
            continue
        yearstr = (brtdy.yearstr if brtdy.year == birthday.YEAR_UNKNOWN
                   else str(brtdy.year))
        print(yearstr + ": " + brtdy.name + " (" + brtdy.occupation + ")")
        ret.append(brtdy)
    return ret
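
# The birthday module is not shown. From the usage above, parse_birthday
# returns None or an object exposing .year, .yearstr, .name and .occupation,
# with birthday.YEAR_UNKNOWN marking entries whose year could not be parsed.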

@classmethod
def from_html(cls, tr: bs4.element.Tag) -> "NsdiLandUsingInfo":
    td_list = tr.select("td")
    data_type = td_list[0].text.strip()
    city_type = td_list[1].text.strip()
    name_type = td_list[2].text.strip()
    base_date = td_list[3].text.strip()
    file_size = td_list[4].text.strip()
    button_value = td_list[5].select("button")[0]
    table_data = NsdiLandUsingInfo.NsdiTableData.from_html(button_value)
    return cls(data_type=data_type,
               city_type=city_type,
               name_type=name_type,
               base_date=base_date,
               file_size=file_size,
               table_data=table_data,
               raw_data=str(tr))
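
# Usage sketch (the row selector is an assumption about the NSDI listing page):
# rows = bs4.BeautifulSoup(page_html, 'html.parser').select('table tbody tr')
# infos = [NsdiLandUsingInfo.from_html(tr) for tr in rows]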

def get_grammar(d: bs4.element.Tag):
    s = ('div#web ol.searchCenterMiddle '
         'div.dictionaryWordCard > ul > li')
    return list(map(text, d.select(s)))
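
# 'text' is a helper defined elsewhere in the original module; judging from
# its use in map(), a hypothetical equivalent would be:
# text = lambda node: node.text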

def hybrid_functionals_parser(parsed_html: bs4.element.Tag) -> list:
    return [str(span.text)
            for span in parsed_html.select('span[class="kwit"]')
            if 'IOp' not in span.text]
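
# Demonstration on hand-written markup: IOp(...) overlay spans are filtered
# out, plain functional keywords are kept verbatim.
_hyb = bs4.BeautifulSoup(
    '<span class="kwit">B3LYP</span><span class="kwit">IOp(3/76)</span>',
    'html.parser')
assert hybrid_functionals_parser(_hyb) == ['B3LYP']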