def core_box_extraction(div_id, edit_funcs):
    """Populate ``main_dict`` with the fields found in the ``div_id`` box.

    ``MobyGamesExtractor.headers[div_id]`` maps each output field name to the
    label text of a ``div`` inside the box.  For every label that is present,
    the text of each element found under the label's next sibling is collected
    and folded through ``edit_funcs`` (each called as ``func(terms, field)``);
    the final value is stored in ``main_dict[field]``.

    Reads ``b`` and writes ``main_dict`` from the enclosing scope
    (``b`` is presumably the parsed page soup -- confirm against the caller).

    :param div_id: id attribute of the container ``div`` to search in.
    :param edit_funcs: iterable of callables applied in order to the term list.
    :return: None; results are written into ``main_dict``.
    """
    labels = MobyGamesExtractor.headers[div_id]
    # The container lookup does not depend on the field, so do it once.
    container = b.find('div', id=div_id)
    if container is None:
        # Page has no such box: nothing to extract, same as a missing label.
        return
    for field, div_string in labels.items():
        core_div = container.find('div', string=div_string)
        if core_div:
            # Calling a Tag is bs4 shorthand for find_all() on that tag.
            terms = [replace_xa0(x.text) for x in core_div.next_sibling()]
            main_dict[field] = reduce(
                lambda acc, func: func(acc, field), edit_funcs, terms)
def scrape_rating_page(page_data):
    """Parse a ratings page into ``{'rating_platform': {heading: {key: value}}}``.

    Every ``h2`` on the page becomes one entry keyed by its text.  The entry
    is built from the ``tr`` rows found under the heading's next sibling,
    each row's text being split once on ``:`` into a key/value pair.

    :param page_data: response-like object exposing ``.text``; a falsy value
        yields an empty dict.
    :return: dict as described above, or ``{}`` when there is no page data.
    """
    result = {}
    if not page_data:
        return result

    soup = bs4.BeautifulSoup(page_data.text, 'html.parser')
    by_platform = {}
    result['rating_platform'] = by_platform
    for heading in soup.find_all('h2'):
        rows = heading.next_sibling.find_all('tr')
        pairs = (tuple(replace_xa0(row.text).split(u':', 1)) for row in rows)
        by_platform[replace_xa0(heading.text)] = dict(pairs)
    return result
def scrape_specs_page(page_data):
    """Parse a specs page into ``{'specs': [table_dict, ...]}``.

    One dict is produced per ``table`` with class ``techInfo``.  Its keys are
    the snake-cased label cells, its values the list of link texts found in
    the matching value cell, plus a ``'platform'`` key holding the text of the
    table's ``thead``.

    :param page_data: response-like object exposing ``.text``; a falsy value
        yields an empty dict.
    :return: dict as described above, or ``{}`` when there is no page data.
    """
    result = {}
    if not page_data:
        return result

    soup = bs4.BeautifulSoup(page_data.text, 'html.parser')
    spec_tables = []
    result['specs'] = spec_tables
    # All spec information in platform specific tables
    for tech_table in soup.find_all('table', class_='techInfo'):
        # First td is table header title
        cells = tech_table.find_all('td')[1:]
        entry = {}
        for label_cell, value_cell in pairwise(cells):
            key = snake_case(replace_xa0(label_cell.text))
            entry[key] = [replace_xa0(link.text)
                          for link in value_cell.find_all('a')]
        entry['platform'] = replace_xa0(tech_table.find('thead').text)
        spec_tables.append(entry)
    return result
def extract_credits_table(table_object):
    """Convert a credits ``table`` into ``{section: {role: {...}}}``.

    A row with exactly one ``td`` starts a new section named after the cell
    text (a section called ``Credits`` is stored as ``General``).  Any other
    row is a credit line: the first cell is the role, the second cell gives
    ``{'names': [...], 'full_text': ...}``.

    Rows without any ``td`` are ignored, and credit lines that appear before
    the first section row are filed under ``General`` instead of raising.

    :param table_object: element exposing ``find_all`` (a bs4 table tag).
    :return: nested dict of sections, roles and credited names.
    """
    credits_dict = {}
    # Default section for credit rows that precede any section header row.
    cred_header = u'General'
    for tr in table_object.find_all('tr'):
        tds = tr.find_all('td')
        if not tds:
            # No td cells in this row: there is no role to read.
            continue
        if len(tds) == 1:
            cred_header = replace_xa0(tds[0].text)
            if cred_header == u'Credits':
                cred_header = u'General'
            credits_dict[cred_header] = {}
        else:
            h_text = replace_xa0(tds[0].text)
            # This searches by linked names, so does not catch additional text
            # not in an <a> tag. So if <a>{person name}</a>{other text}, only
            # person name is captured in 'names'; 'full_text' keeps everything.
            v = {'names': [replace_xa0(a.text) for a in tds[1].find_all('a')],
                 'full_text': replace_xa0(tds[1].text)}
            # setdefault only creates the section when no header row did.
            credits_dict.setdefault(cred_header, {})[h_text] = v
    return credits_dict
def get_wiki_table(cls, soup_object):
    """Extract the known rows of a page's ``infobox`` table.

    Each ``tr`` of the first ``table`` with class ``infobox`` is scanned for
    header/value ``td`` pairs.  When the header text is a key of
    ``WikipediaExtractor.headers_to_terms`` the value cell's text is split on
    newlines and stored, without empty strings, under the mapped term.

    :param soup_object: parsed page (or element) exposing ``find``.
    :return: dict mapping term -> list of non-empty text lines; empty when the
        page has no infobox table.
    """
    infobox = soup_object.find('table', class_='infobox')
    if infobox is None:
        # No infobox on this page: report "nothing found" rather than crash.
        return {}

    table = {}
    for tr in infobox.find_all('tr'):
        # Look at each pair of td tags, since infobox rows that we care about
        # have two columns
        for header_cell, value_cell in pairwise(tr.find_all('td')):
            h_text = replace_xa0(header_cell.text)
            if h_text not in WikipediaExtractor.headers_to_terms:
                continue
            comp_term = WikipediaExtractor.headers_to_terms[h_text]
            # removes blank lines and blank indexes
            table[comp_term] = [line
                                for line in re.split('\n+', value_cell.text)
                                if line]
    return table
def create_div_tuple(div):
    """Classify one element and return it as a ``(kind, payload)`` tuple.

    The first matching rule wins:

    * ``h2`` element              -> ``('header', text)``
    * ``b`` element               -> ``('patch', None)``
    * class contains ``relInfo``  -> ``('patch_rel_info', {title: details})``
    * class contains ``floatholder`` -> ``('attr', {name: [link text, ...]})``
    * has a ``relInfo`` descendant -> ``('rel_info', {title: details, ...})``
    * anything else               -> ``(None, None)``

    :param div: element exposing ``name``, ``attrs``, ``find`` and ``find_all``.
    :return: two-tuple as listed above.
    """
    tag_name = div.name
    if tag_name == u'h2':
        # ('header', platform name)
        return 'header', replace_xa0(div.text)
    if tag_name == u'b':
        # ('patch', None)
        return 'patch', None

    # Looked up only after the tag-name rules, as those never need attributes.
    css_classes = div.attrs.get('class', ())

    if u'relInfo' in css_classes:
        # ('patch_rel_info', {relInfoTitle: relInfoDetails, ...})
        title = replace_xa0(div.find(class_='relInfoTitle').text)
        details = replace_xa0(div.find(class_='relInfoDetails').text)
        return 'patch_rel_info', {title: details}

    if u'floatholder' in css_classes:
        # ('attr', {attr_name: [value, ...]})
        attr_name = snake_case(replace_xa0(div.find(class_='fl').text))
        return 'attr', {attr_name: [a.text for a in div.find_all('a')]}

    if div.find(class_='relInfo'):
        # ('rel_info', {relInfoTitle: relInfoDetails, ...})
        return 'rel_info', {
            replace_xa0(r.find(class_='relInfoTitle').text):
                replace_xa0(r.find(class_='relInfoDetails').text)
            for r in div.find_all(class_='relInfo')
        }

    return None, None