示例#1
0
 def core_box_extraction(div_id, edit_funcs):
     """Store the values of one core-box section in ``main_dict``.

     For every ``field -> label`` pair in ``MobyGamesExtractor.headers[div_id]``
     the ``div`` whose string equals the label is looked up inside the ``div``
     with id ``div_id``. When it exists, the text of the elements returned by
     calling its next sibling is cleaned with ``replace_xa0`` and passed
     through ``edit_funcs``; the result is stored under ``main_dict[field]``.

     ``b`` (the parsed page) and ``main_dict`` are not parameters; they are
     taken from the surrounding scope.

     :param div_id: id of the container ``div``; also the key into
         ``MobyGamesExtractor.headers``.
     :param edit_funcs: iterable of callables applied in order, each invoked
         as ``func(terms, field)`` and returning the terms for the next one.
     """
     headers = MobyGamesExtractor.headers[div_id]
     # The container does not depend on the field, so look it up only once.
     container = b.find('div', id=div_id)
     for field, div_string in headers.items():
         core_div = container.find('div', string=div_string)
         if core_div:
             # Collect the cleaned sibling text a single time and use it as
             # the starting value of the edit pipeline.
             # NOTE(review): calling the sibling relies on it being a bs4 Tag
             # (call == find_all); a text-node sibling would fail here.
             terms = [replace_xa0(x.text) for x in core_div.next_sibling()]
             main_dict[field] = reduce(lambda acc, func: func(acc, field),
                                       edit_funcs,
                                       terms)
示例#2
0
    def scrape_rating_page(page_data):
        """Parse a rating page into one rating dictionary per ``h2`` heading.

        :param page_data: object whose ``text`` attribute holds the page HTML;
            any falsy value produces an empty result.
        :return: ``{}`` for falsy input, otherwise
            ``{'rating_platform': {<h2 text>: {<label>: <value>, ...}, ...}}``.
        """
        if not page_data:
            return {}

        soup = bs4.BeautifulSoup(page_data.text, 'html.parser')
        platforms = {}

        for heading in soup.find_all('h2'):
            # The rows live in the element right after the heading. Each row's
            # text is split on the first colon only, so any further colons
            # stay in the value part.
            rows = heading.next_sibling.find_all('tr')
            pairs = [tuple(replace_xa0(row.text).split(u':', 1)) for row in rows]
            platforms[replace_xa0(heading.text)] = dict(pairs)

        return {'rating_platform': platforms}
示例#3
0
    def scrape_specs_page(page_data):
        """Parse a specs page into one dictionary per ``techInfo`` table.

        :param page_data: object whose ``text`` attribute holds the page HTML;
            any falsy value produces an empty result.
        :return: ``{}`` for falsy input, otherwise ``{'specs': [...]}`` where
            each entry maps a snake-cased cell label to the list of link texts
            of its paired cell, plus a ``'platform'`` key holding the table's
            ``thead`` text.
        """
        if not page_data:
            return {}

        soup = bs4.BeautifulSoup(page_data.text, 'html.parser')
        specs = []

        # All spec information sits in platform-specific tables.
        for spec_table in soup.find_all('table', class_='techInfo'):
            # The first td is the table header title, so it is skipped.
            cells = spec_table.find_all('td')[1:]
            entry = {}
            for label_cell, value_cell in pairwise(cells):
                key = snake_case(replace_xa0(label_cell.text))
                entry[key] = [replace_xa0(link.text)
                              for link in value_cell.find_all('a')]
            entry['platform'] = replace_xa0(spec_table.find('thead').text)
            specs.append(entry)

        return {'specs': specs}
示例#4
0
 def extract_credits_table(table_object):
     """Convert a credits table into ``{section: {role: details}}``.

     A row with exactly one ``td`` starts a section named after that cell's
     text (a section titled ``Credits`` is stored as ``General``). Any other
     non-empty row is a role: the first cell is the role name and the second
     cell supplies the people.

     :param table_object: element exposing ``find_all``; its ``tr`` rows are
         read in order.
     :return: dict mapping section name to a dict of
         ``{role: {'names': [...], 'full_text': ...}}``.
     """
     credits_dict = {}
     # Role rows that appear before any section header are filed under
     # 'General', the same name used for the generic 'Credits' section.
     cred_header = u'General'
     for tr in table_object.find_all('tr'):
         tds = tr.find_all('td')
         if not tds:
             # A row without td cells has nothing to read; indexing it would
             # raise IndexError.
             continue
         if len(tds) == 1:
             cred_header = replace_xa0(tds[0].text)
             if cred_header == u'Credits':
                 cred_header = u'General'
             # setdefault keeps the roles already collected when the same
             # section header shows up more than once.
             credits_dict.setdefault(cred_header, {})
             continue

         h_text = replace_xa0(tds[0].text)
         # This searches by linked names, so does not catch additional text not in an <a> tag
         # So if <a>{person name}</a>{other text}, only person name is captured
         v = {'names': [replace_xa0(a.text) for a in tds[1].find_all('a')],
              'full_text': replace_xa0(tds[1].text)}
         credits_dict.setdefault(cred_header, {})[h_text] = v
     return credits_dict
示例#5
0
    def get_wiki_table(cls, soup_object):
        """Extract the recognised rows of a page's infobox table.

        :param soup_object: parsed page exposing ``find``.
        :return: dict mapping each term from
            ``WikipediaExtractor.headers_to_terms`` to the list of non-empty
            lines of the matching value cell. Empty when the page has no
            ``table`` with class ``infobox``.
        """
        infobox = soup_object.find('table', class_='infobox')
        if infobox is None:
            # No infobox on this page: nothing to extract.
            return {}

        table = {}
        for tr in infobox.find_all('tr'):
            # Look at each pair of td tags, since infobox rows that we care about have two columns
            for h, v in pairwise(tr.find_all('td')):
                h_text = replace_xa0(h.text)
                if h_text in WikipediaExtractor.headers_to_terms:
                    comp_term = WikipediaExtractor.headers_to_terms[h_text]
                    # Split on runs of newlines and drop the empty pieces left
                    # by leading/trailing newlines.
                    table[comp_term] = [x for x in re.split(r'\n+', v.text) if x != u'']

        return table
示例#6
0
 def create_div_tuple(div):
     """Classify an element as a ``(kind, payload)`` tuple.

     The checks run in this order and the first match wins:

     * ``h2`` tag                     -> ``('header', <cleaned text>)``
     * ``b`` tag                      -> ``('patch', None)``
     * has class ``relInfo``          -> ``('patch_rel_info', {title: details})``
     * has class ``floatholder``      -> ``('attr', {attr_name: [link text, ...]})``
     * contains a ``relInfo`` element -> ``('rel_info', {title: details, ...})``
     * anything else                  -> ``(None, None)``

     :param div: element to classify.
     :return: two-item tuple as listed above.
     """
     tag_name = div.name
     if tag_name == u'h2':
         return 'header', replace_xa0(div.text)
     if tag_name == u'b':
         return 'patch', None

     # Read the class list once; elements without a class attribute get an
     # empty tuple so the membership tests below are simply False.
     css_classes = div['class'] if 'class' in div.attrs else ()

     if u'relInfo' in css_classes:
         title = replace_xa0(div.find(class_='relInfoTitle').text)
         details = replace_xa0(div.find(class_='relInfoDetails').text)
         return 'patch_rel_info', {title: details}

     if u'floatholder' in css_classes:
         attr_name = snake_case(replace_xa0(div.find(class_='fl').text))
         # Link texts are taken as-is here (not passed through replace_xa0).
         return 'attr', {attr_name: [a.text for a in div.find_all('a')]}

     if div.find(class_='relInfo'):
         rel_info = {}
         for entry in div.find_all(class_='relInfo'):
             title = replace_xa0(entry.find(class_='relInfoTitle').text)
             details = replace_xa0(entry.find(class_='relInfoDetails').text)
             rel_info[title] = details
         return 'rel_info', rel_info

     return None, None