def google_light_parse(gresult):
    """Light-weight parse of a google result page: extract only the
    result-stats component.

    :param gresult: soup, html string, or filename (anything parse_util.x_to_soup accepts)
    :return: dict with key '_resultStats' (parsed result count) when the
        'resultStats' div is present, else an empty dict.
    """
    gresult = parse_util.x_to_soup(gresult)
    parse_dict = dict()
    # BUGFIX: original searched on the builtin `input` (AttributeError at
    # runtime) instead of the parsed `gresult` soup.
    resultStats = gresult.find(name='div', attrs={'id': 'resultStats'})
    if resultStats:
        parse_dict['_resultStats'] = google.parse_number_of_results(resultStats)
    # BUGFIX: original built parse_dict but fell off the end, returning None.
    return parse_dict
def element_presence_dict(input):
    """Return a dict of presence flags / counts for notable elements of a
    google result page.

    :param input: soup, html string, or filename (anything putil.x_to_soup accepts)
    :return: dict with keys 'isa_search_term_redirect', 'did_you_mean',
        'number_of_results', 'isa_number_of_results', 'num_of_center_col_children'.
        NOTE: 'number_of_results' is a list of matched strings when results
        were found, but an int (0 or -1) otherwise -- kept as-is for callers.
    """
    input = putil.x_to_soup(input)
    text = input.get_text(' ')
    d = dict()
    # search_term_redirect (when google shows results for a different search
    # term than the entered--should be in _spell)
    d['isa_search_term_redirect'] = bool(
        re.search('(Showing results for)|(Including results for)', text))
    # did_you_mean
    d['did_you_mean'] = bool(re.search('Did you mean', text))
    # number of results
    d['number_of_results'] = re.findall('About (' + RE_NUMBER + ') results', text)
    if len(d['number_of_results']) > 0:
        d['isa_number_of_results'] = True
    else:
        d['isa_number_of_results'] = False  # set in both branches for a consistent schema
        # BUGFIX: original re.search call was missing the `text` argument
        # (TypeError whenever no result count was found).
        if bool(re.search('Your search .* did not match any documents.', text)):
            d['number_of_results'] = 0
        else:
            d['number_of_results'] = -1
    # number of center_col_elements
    center_col = input.find('div', {'id': 'center_col'})
    if center_col:
        d['num_of_center_col_children'] = len([x for x in center_col.children])
    else:
        d['num_of_center_col_children'] = 0
    # BUGFIX: original built d but fell off the end, returning None.
    return d
def num_of_results_soup01(soup):
    """ num_of_results using Beautifulsoup """
    soup = util.x_to_soup(soup)
    assert isinstance(soup, BeautifulSoup), "hey, I'm expecting soup!"
    # look for the count in the appbar/topabar section, keep the last match
    section = get_section(soup, [{'id': 'appbar'}, {'id': 'topabar'}])
    matches = re.findall("(" + RE_NUMBER + ").*?result", section.text)
    digits_only = re.sub(RE_NUM_SEP, "", matches[-1])
    return int(digits_only)
def num_of_results_soup01(soup):
    """ num_of_results using Beautifulsoup """
    soup = util.x_to_soup(soup)
    assert isinstance(soup, BeautifulSoup), "hey, I'm expecting soup!"
    # find every "<number> ... result" occurrence in the appbar/topabar section
    section = get_section(soup, [{'id': 'appbar'}, {'id': 'topabar'}])
    hits = re.findall("(" + RE_NUMBER + ").*?result", section.text)
    # strip the thousands separators from the last hit and cast to int
    return int(re.sub(RE_NUM_SEP, "", hits[-1]))
def google_webResults_soup(source):
    """ google_webResults using Beautifulsoup """
    soup = util.x_to_soup(source)
    results = []
    # each organic web result sits in an <li class="g"> element
    for pos, li in enumerate(soup.findAll('li', attrs={'class': 'g'}), start=1):
        results.append({
            'pos': pos,
            'title': li.find('a'),
            'text': li.find('span', attrs={'class': 'st'}),
        })
    return results
def get_pois_near_hotel_location(html):
    """Extract the points-of-interest strings from the hotel page's
    'location-distances' div (entries are separated by <br/>)."""
    html = parse_util.x_to_soup(html)
    html = html.find('div', attrs={'id': "location-distances"}).renderContents()
    # prefix each chunk with '. ' so the POI pattern can anchor on it
    chunks = ['. ' + chunk for chunk in html.split('<br/>')]
    hits = (re.search(pois_near_hotel_exp_0, chunk) for chunk in chunks)
    return [hit.group(0).strip() for hit in hits if hit]
def rh_ads(input):
    """ right hand side ads

    :param input: soup, html string, or filename (anything util.x_to_soup accepts)
    :return: list of <li> ad elements rendered as strings, or None (implicit)
        when no rhs block / no <ol> section is present -- callers appear to
        rely on the falsy return, so this is kept.
    """
    # NOTE(review): dropped the original's unused local `output = []`.
    input = util.x_to_soup(input)
    rhs_block_element = rhs_block(input)
    if len(rhs_block_element) != 0:
        ol_section = rhs_block_element[0].findAll('ol')
        if len(ol_section):
            return [str(li) for li in ol_section[0].findAll('li')]  # TODO: do we want string, unicode, or the tag itself as is?
def google_webResults_soup(source):
    """ google_webResults using Beautifulsoup """
    soup = util.x_to_soup(source)
    # organic results live in <li class="g"> elements; 'pos' is 1-based rank
    result_items = soup.findAll('li', attrs={'class': 'g'})
    make_entry = lambda rank, item: {
        'pos': rank + 1,
        'title': item.find('a'),
        'text': item.find('span', attrs={'class': 'st'}),
    }
    return [make_entry(rank, item) for rank, item in enumerate(result_items)]
def rh_ads(input):
    """ right hand side ads

    :param input: soup, html string, or filename (anything util.x_to_soup accepts)
    :return: list of <li> ad elements rendered as strings, or None (implicit)
        when no rhs block / no <ol> section is present -- kept as-is since
        both are falsy for callers doing a truthiness check.
    """
    # NOTE(review): dropped the original's unused local `output = []`.
    input = util.x_to_soup(input)
    rhs_block_element = rhs_block(input)
    if len(rhs_block_element) != 0:
        ol_section = rhs_block_element[0].findAll('ol')
        if len(ol_section):
            return [str(li) for li in ol_section[0].findAll('li')]  # TODO: do we want string, unicode, or the tag itself as is?
def element_presence_dict(input):
    """Return a dict of presence flags / counts for notable elements of a
    google result page.

    :param input: soup, html string, or filename (anything putil.x_to_soup accepts)
    :return: dict with keys 'isa_search_term_redirect', 'did_you_mean',
        'number_of_results', 'isa_number_of_results', 'num_of_center_col_children'.
        NOTE: 'number_of_results' is a list of matched strings when results
        were found, but an int (0 or -1) otherwise -- kept as-is for callers.
    """
    input = putil.x_to_soup(input)
    text = input.get_text(' ')
    d = dict()
    # search_term_redirect (when google shows results for a different search
    # term than the entered--should be in _spell)
    d['isa_search_term_redirect'] = bool(
        re.search('(Showing results for)|(Including results for)', text))
    # did_you_mean
    d['did_you_mean'] = bool(re.search('Did you mean', text))
    # number of results
    d['number_of_results'] = re.findall('About (' + RE_NUMBER + ') results', text)
    if len(d['number_of_results']) > 0:
        d['isa_number_of_results'] = True
    else:
        d['isa_number_of_results'] = False  # set in both branches for a consistent schema
        # BUGFIX: original re.search call was missing the `text` argument
        # (TypeError whenever no result count was found).
        if bool(re.search('Your search .* did not match any documents.', text)):
            d['number_of_results'] = 0
        else:
            d['number_of_results'] = -1
    # number of center_col_elements
    center_col = input.find('div', {'id': 'center_col'})
    if center_col:
        d['num_of_center_col_children'] = len([x for x in center_col.children])
    else:
        d['num_of_center_col_children'] = 0
    # BUGFIX: original built d but fell off the end, returning None.
    return d
def google_light_parse(gresult):
    """Light-weight parse of a google result page: extract only the
    result-stats component.

    :param gresult: soup, html string, or filename (anything parse_util.x_to_soup accepts)
    :return: dict with key '_resultStats' (parsed result count) when the
        'resultStats' div is present, else an empty dict.
    """
    gresult = parse_util.x_to_soup(gresult)
    parse_dict = dict()
    # BUGFIX: original searched on the builtin `input` (AttributeError at
    # runtime) instead of the parsed `gresult` soup.
    resultStats = gresult.find(name='div', attrs={'id': 'resultStats'})
    if resultStats:
        parse_dict['_resultStats'] = google.parse_number_of_results(resultStats)
    # BUGFIX: original built parse_dict but fell off the end, returning None.
    return parse_dict
def get_info_dict_01(input):
    # NOTE(review): converts the argument to soup and then does nothing with
    # it, implicitly returning None -- looks unfinished; confirm intent
    # against callers before relying on this function.
    input = putil.x_to_soup(input)
def top_elements(input):
    """ the top ads, hotel finder and other elements coming before webResults """
    # the 'taw' span holds everything google renders above the web results
    soup = util.x_to_soup(input)
    return soup.findAll('span', attrs={'id': 'taw'})
def root_dict(soup):
    """Convert the input to soup and extract the tag dict rooted at 'root'."""
    parsed = util.x_to_soup(soup)
    return extract_tag_dict_from_node(parsed, 'root')
def top_elements(input):
    """ the top ads, hotel finder and other elements coming before webResults """
    parsed = util.x_to_soup(input)
    # everything shown above the web results lives in <span id="taw">
    return parsed.findAll('span', attrs={'id': 'taw'})
def mk_gresult_tag_dict(input):
    """Take a soup, html string, or filename of a google result page and
    return a dict containing the html components we're interested in
    (keys all start with '_': '_resultStats', '_center_col', '_tads', ...).

    NOTE(review): every lookup guards its own assignment, but child lookups
    index d['_parent'] unconditionally (e.g. d['_center_col'].find(...)), so
    a page missing an expected section raises KeyError instead of returning
    a partial dict -- confirm callers rely on this fail-fast behavior.
    """
    input = util.x_to_soup(input)
    d = dict()
    # number of results
    resultStats = input.find(name='div', attrs={'id': 'resultStats'})
    if resultStats:
        d['_resultStats'] = resultStats
    # center_col
    center_col = input.find(name='div', attrs={'id': 'center_col'})
    if center_col:
        d['_center_col'] = center_col
    # tads
    tads = d['_center_col'].find(name='div', attrs={'id': 'tads'})
    if tads:
        d['_tads'] = tads
    # top_ads
    top_ads = d['_tads'].findAll('li')
    if top_ads:
        d['_top_ads_list'] = top_ads
    # res
    res = d['_center_col'].find(name='div', attrs={'id': 'res'})
    if res:
        d['_res'] = res
    # searchInstead
    topstuff = d['_res'].find(name='div', attrs={'id': 'topstuff'})
    if topstuff:
        d['_topstuff'] = topstuff
    # spell
    spell = d['_topstuff'].find(name='a', attrs={'class': 'spell'})
    if spell:
        d['_spell'] = spell
    # search
    search = d['_res'].find(name='div', attrs={'id': 'search'})
    if search:
        d['_search'] = search
    # ires
    ires = d['_search'].find(name='div', attrs={'id': 'ires'})
    if ires:
        d['_ires'] = ires
    # organicResults
    organic_results = d['_ires'].findAll('li')
    if organic_results:
        d['_organic_results_list'] = organic_results
    # related_search (first table in the sibling following 'res')
    after_res = d['_res'].nextSibling
    if after_res:
        related_search = after_res.find('table')
        if related_search:
            d['_related_search'] = related_search
    # rhs_block
    rhs_block = input.find(name='div', attrs={'id': 'rhs_block'})
    if rhs_block:
        d['_rhs_block'] = rhs_block
    rhs_ads = []  # initializing
    # rhs_ads from mbEnd
    mbEnd = d['_rhs_block'].find(name='div', attrs={'id': 'mbEnd'})
    if mbEnd:
        d['_mbEnd'] = mbEnd
        rhs_ads = rhs_ads + mbEnd.findAll('li')
    # rhs_ads from nobr
    nobr = d['_rhs_block'].find(name='ol', attrs={'class': 'nobr'})
    if nobr:
        d['_nobr'] = nobr
        rhs_ads = rhs_ads + nobr.findAll('li')
    # puting rhs_ads in the dict
    if rhs_ads:
        d['_rhs_ads_list'] = rhs_ads
    # Okay, no more parsing wishes, return the dict d
    return d
def mk_gresult_tag_dict(input):
    """Take a soup, html string, or filename of a google result page and
    return a dict containing the html components we're interested in
    (keys all start with '_': '_resultStats', '_center_col', '_tads', ...).

    NOTE(review): every lookup guards its own assignment, but child lookups
    index d['_parent'] unconditionally (e.g. d['_center_col'].find(...)), so
    a page missing an expected section raises KeyError instead of returning
    a partial dict -- confirm callers rely on this fail-fast behavior.
    """
    input = putil.x_to_soup(input)
    d = dict()
    # number of results
    resultStats = input.find(name='div', attrs={'id': 'resultStats'})
    if resultStats:
        d['_resultStats'] = resultStats
    # center_col
    center_col = input.find(name='div', attrs={'id': 'center_col'})
    if center_col:
        d['_center_col'] = center_col
    # tads (center_col.taw.tvcap.tads)
    tads = d['_center_col'].find(name='div', attrs={'id': 'tads'})
    if tads:
        d['_tads'] = tads
    # top_ads
    top_ads = d['_tads'].findAll('li')
    if top_ads:
        d['_top_ads_list'] = top_ads
    # c (center_col.taw.tvcap.c)
    c = d['_center_col'].find(name='div', attrs={'class': 'c'})
    if c:
        d['_c'] = c
    # c_list
    c_list = d['_c'].findAll('li')
    if c_list:
        d['_c_list'] = c_list
    # res
    res = d['_center_col'].find(name='div', attrs={'id': 'res'})
    if res:
        d['_res'] = res
    # searchInstead
    topstuff = d['_res'].find(name='div', attrs={'id': 'topstuff'})
    if topstuff:
        d['_topstuff'] = topstuff
    # used to contain spell,
    # but then realized the spell appeared in other places some times, so
    # moved it to child of _center_col
    # spell
    spell = d['_center_col'].find(name='a', attrs={'class': 'spell'})
    if spell:
        d['_spell'] = spell
    # search
    search = d['_res'].find(name='div', attrs={'id': 'search'})
    if search:
        d['_search'] = search
    # ires
    ires = d['_search'].find(name='div', attrs={'id': 'ires'})
    if ires:
        d['_ires'] = ires
    # organicResults
    organic_results = d['_ires'].findAll('li')
    if organic_results:
        d['_organic_results_list'] = organic_results
    # related_search (first table in the sibling following 'res')
    after_res = d['_res'].nextSibling
    if after_res:
        related_search = after_res.find('table')
        if related_search:
            d['_related_search'] = related_search
    # rhs_block
    rhs_block = input.find(name='div', attrs={'id': 'rhs_block'})
    if rhs_block:
        d['_rhs_block'] = rhs_block
    # lu_pinned_rhs (where some hotel finder, maps, specific hotels might be)
    lu_pinned_rhs = d['_rhs_block'].find(name='div', attrs={'id': 'lu_pinned_rhs'})
    if lu_pinned_rhs:
        d['_lu_pinned_rhs'] = lu_pinned_rhs
    # knop (another place where some hotel finder, maps, specific hotels might be)
    knop = d['_rhs_block'].find(name='div', attrs={'id': 'knop'})
    if knop:
        d['_knop'] = knop
    rhs_ads = []  # initializing
    # rhs_ads from mbEnd
    mbEnd = d['_rhs_block'].find(name='div', attrs={'id': 'mbEnd'})
    if mbEnd:
        d['_mbEnd'] = mbEnd
        rhs_ads = rhs_ads + mbEnd.findAll('li')
    # rhs_ads from nobr
    nobr = d['_rhs_block'].find(name='ol', attrs={'class': 'nobr'})
    if nobr:
        d['_nobr'] = nobr
        rhs_ads = rhs_ads + nobr.findAll('li')
    # puting rhs_ads in the dict
    if rhs_ads:
        d['_rhs_ads_list'] = rhs_ads
    # Okay, no more parsing wishes, return the dict d
    return d
def root_dict(soup): soup = util.x_to_soup(soup) return extract_tag_dict_from_node(soup,'root')
def mk_gresult_tag_dict(input):
    """Take a soup, html string, or filename of a google result page and
    return a dict containing the html components we're interested in
    (keys all start with '_': '_resultStats', '_center_col', '_tads', ...).

    NOTE(review): every lookup guards its own assignment, but child lookups
    index d['_parent'] unconditionally (e.g. d['_center_col'].find(...)), so
    a page missing an expected section raises KeyError instead of returning
    a partial dict -- confirm callers rely on this fail-fast behavior.
    """
    input = putil.x_to_soup(input)
    d = dict()
    # number of results
    resultStats = input.find(name='div', attrs={'id': 'resultStats'})
    if resultStats:
        d['_resultStats'] = resultStats
    # center_col
    center_col = input.find(name='div', attrs={'id': 'center_col'})
    if center_col:
        d['_center_col'] = center_col
    # tads (center_col.taw.tvcap.tads)
    tads = d['_center_col'].find(name='div', attrs={'id': 'tads'})
    if tads:
        d['_tads'] = tads
    # top_ads
    top_ads = d['_tads'].findAll('li')
    if top_ads:
        d['_top_ads_list'] = top_ads
    # c (center_col.taw.tvcap.c)
    c = d['_center_col'].find(name='div', attrs={'class': 'c'})
    if c:
        d['_c'] = c
    # c_list
    c_list = d['_c'].findAll('li')
    if c_list:
        d['_c_list'] = c_list
    # res
    res = d['_center_col'].find(name='div', attrs={'id': 'res'})
    if res:
        d['_res'] = res
    # searchInstead
    topstuff = d['_res'].find(name='div', attrs={'id': 'topstuff'})
    if topstuff:
        d['_topstuff'] = topstuff
    # used to contain spell,
    # but then realized the spell appeared in other places some times, so
    # moved it to child of _center_col
    # spell
    spell = d['_center_col'].find(name='a', attrs={'class': 'spell'})
    if spell:
        d['_spell'] = spell
    # search
    search = d['_res'].find(name='div', attrs={'id': 'search'})
    if search:
        d['_search'] = search
    # ires
    ires = d['_search'].find(name='div', attrs={'id': 'ires'})
    if ires:
        d['_ires'] = ires
    # organicResults
    organic_results = d['_ires'].findAll('li')
    if organic_results:
        d['_organic_results_list'] = organic_results
    # related_search (first table in the sibling following 'res')
    after_res = d['_res'].nextSibling
    if after_res:
        related_search = after_res.find('table')
        if related_search:
            d['_related_search'] = related_search
    # rhs_block
    rhs_block = input.find(name='div', attrs={'id': 'rhs_block'})
    if rhs_block:
        d['_rhs_block'] = rhs_block
    # lu_pinned_rhs (where some hotel finder, maps, specific hotels might be)
    lu_pinned_rhs = d['_rhs_block'].find(name='div', attrs={'id': 'lu_pinned_rhs'})
    if lu_pinned_rhs:
        d['_lu_pinned_rhs'] = lu_pinned_rhs
    # knop (another place where some hotel finder, maps, specific hotels might be)
    knop = d['_rhs_block'].find(name='div', attrs={'id': 'knop'})
    if knop:
        d['_knop'] = knop
    rhs_ads = []  # initializing
    # rhs_ads from mbEnd
    mbEnd = d['_rhs_block'].find(name='div', attrs={'id': 'mbEnd'})
    if mbEnd:
        d['_mbEnd'] = mbEnd
        rhs_ads = rhs_ads + mbEnd.findAll('li')
    # rhs_ads from nobr
    nobr = d['_rhs_block'].find(name='ol', attrs={'class': 'nobr'})
    if nobr:
        d['_nobr'] = nobr
        rhs_ads = rhs_ads + nobr.findAll('li')
    # puting rhs_ads in the dict
    if rhs_ads:
        d['_rhs_ads_list'] = rhs_ads
    # Okay, no more parsing wishes, return the dict d
    return d
def parse_hotel_info_page(html):
    """Parse a (venere-style) hotel info page into a flat dict.

    Extracted keys (each set only when its element is found): 'hotel_name',
    'hotel_address', 'hotel_street_address', 'hotel_locality', 'currency',
    'average_price', 'facebook_likes', 'num_of_photos', 'hotel_description',
    'average_venere_rating', 'facilities', 'alternate_names',
    'overview_reviews', 'poi_and_distances'.

    :param html: soup, html string, or filename (anything parse_util.x_to_soup accepts)
    :return: dict of parsed fields.
    """
    html = parse_util.x_to_soup(html)
    d = dict()
    # hotel name
    d = parse_bsoup.add_text_to_parse_dict(soup=html, parse_dict=d, key='hotel_name',
                                           name='h1', attrs={'property': 'v:name'},
                                           text_transform=parse_util.strip_spaces)
    # hotel address
    tag = html.find(name='p', attrs={'id': 'property-address'})
    if tag:
        d['hotel_address'] = pstr_trans.strip(tag.text)
        d = parse_bsoup.add_text_to_parse_dict(soup=tag, parse_dict=d, key='hotel_street_address',
                                               name='span', attrs={'property': "v:street-address"},
                                               text_transform=parse_util.strip_spaces)
        d = parse_bsoup.add_text_to_parse_dict(soup=tag, parse_dict=d, key='hotel_locality',
                                               name='span', attrs={'property': "v:locality"},
                                               text_transform=parse_util.strip_spaces)
    # average price: integer part in 'avgPriceEl0', decimals in 'avgPriceDecimals'
    d = parse_bsoup.add_text_to_parse_dict(soup=html, parse_dict=d, key='currency',
                                           name='span', attrs={'id': 'currency-symbol'},
                                           text_transform=parse_util.strip_spaces)
    avgPriceEl0 = html.find(name='span', attrs={'id': 'avgPriceEl0'})
    avgPriceDecimals = html.find(name='sup', attrs={'id': 'avgPriceDecimals'})
    if avgPriceEl0:
        d['average_price'] = avgPriceEl0.text
        if avgPriceDecimals:
            # decimals are concatenated as text before the float() conversion;
            # assumes they include the decimal separator -- TODO confirm
            d['average_price'] = d['average_price'] + avgPriceDecimals.text
        d['average_price'] = float(d['average_price'])
    # facebook likes
    d = parse_bsoup.add_text_to_parse_dict(soup=html, parse_dict=d, key='facebook_likes',
                                           name='span', attrs={'class': 'pluginCountTextDisconnected'},
                                           text_transform=float)
    # num_of_photos
    tag = html.find(name='div', attrs={'id': 'photo_gallery'})
    if tag:
        d['num_of_photos'] = len(tag.findAll(name='li'))
    # hotel description
    d = parse_bsoup.add_text_to_parse_dict(soup=html, parse_dict=d, key='hotel_description',
                                           name='div', attrs={'id': 'hotel-description-body'},
                                           text_transform=parse_util.strip_spaces)
    # average_venere_rating
    tag = html.find(name='div', attrs={'id': 'avg_guest_rating'})
    if tag:
        # NOTE(review): raises AttributeError if the inner <b property="v:rating">
        # is missing while the outer div exists -- confirm pages guarantee it.
        d['average_venere_rating'] = float(tag.find(name='b', attrs={'property': 'v:rating'}).text)
    # facilities
    tag = html.find(name='div', attrs={'id': 'facilities'})
    if tag:
        facilities = tag.findAll(name='li')
        if facilities:
            d['facilities'] = [parse_util.strip_spaces(x.text) for x in facilities]
    # alternate names (one per <br> inside the paragraph)
    tag = html.find(name='div', attrs={'id': 'also_known_as'})
    if tag:
        tag = tag.find(name='p')
        if tag:
            t = [parse_util.strip_spaces(x) for x in tag.renderContents().split('<br>')]
            t = [parse_util.strip_tags(x) for x in t]
            d['alternate_names'] = t
    # overview_reviews: map each bar's label (<p>) to its score (<b>)
    tag = html.find(name='div', attrs={'id': 'reviews-overview-hbar-box'})
    if tag:
        tagg = tag.findAll(name='div', attrs={'class': 'reviews-overview-horizzontalbar'})
        if tagg:
            d['overview_reviews'] = dict()
            for t in tagg:
                d['overview_reviews'][t.find(name='p').text] = float(t.find(name='b').text)
    # location_distances: strip the leading <h2> header, split entries on <br/>,
    # then pull (poi, km, mi) tuples out with pois_near_hotel_exp
    tag = html.find(name='div', attrs={'id': 'location-distances'})
    if tag:
        t = re.sub("^[^<]+<h2>.+</h2>", "", tag.renderContents()).split('<br/>')
        tt = [re.findall(pois_near_hotel_exp, x) for x in t]
        tt = [x[0] for x in tt if x]
        d['poi_and_distances'] = [{'poi': parse_util.strip_spaces(x[0].replace('"', '')),
                                   'km': float(x[1]),
                                   'mi': float(x[2])} for x in tt]
    return d
def mk_gresult_tag_dict(input):
    """Take a soup, html string, or filename of a google result page and
    return a dict containing the html components we're interested in
    (keys all start with '_': '_resultStats', '_center_col', '_tads', ...).

    NOTE(review): every lookup guards its own assignment, but child lookups
    index d['_parent'] unconditionally (e.g. d['_center_col'].find(...)), so
    a page missing an expected section raises KeyError instead of returning
    a partial dict -- confirm callers rely on this fail-fast behavior.
    """
    input = util.x_to_soup(input)
    d = dict()
    # number of results
    resultStats = input.find(name='div', attrs={'id': 'resultStats'})
    if resultStats:
        d['_resultStats'] = resultStats
    # center_col
    center_col = input.find(name='div', attrs={'id': 'center_col'})
    if center_col:
        d['_center_col'] = center_col
    # tads
    tads = d['_center_col'].find(name='div', attrs={'id': 'tads'})
    if tads:
        d['_tads'] = tads
    # top_ads
    top_ads = d['_tads'].findAll('li')
    if top_ads:
        d['_top_ads_list'] = top_ads
    # res
    res = d['_center_col'].find(name='div', attrs={'id': 'res'})
    if res:
        d['_res'] = res
    # searchInstead
    topstuff = d['_res'].find(name='div', attrs={'id': 'topstuff'})
    if topstuff:
        d['_topstuff'] = topstuff
    # spell
    spell = d['_topstuff'].find(name='a', attrs={'class': 'spell'})
    if spell:
        d['_spell'] = spell
    # search
    search = d['_res'].find(name='div', attrs={'id': 'search'})
    if search:
        d['_search'] = search
    # ires
    ires = d['_search'].find(name='div', attrs={'id': 'ires'})
    if ires:
        d['_ires'] = ires
    # organicResults
    organic_results = d['_ires'].findAll('li')
    if organic_results:
        d['_organic_results_list'] = organic_results
    # related_search (first table in the sibling following 'res')
    after_res = d['_res'].nextSibling
    if after_res:
        related_search = after_res.find('table')
        if related_search:
            d['_related_search'] = related_search
    # rhs_block
    rhs_block = input.find(name='div', attrs={'id': 'rhs_block'})
    if rhs_block:
        d['_rhs_block'] = rhs_block
    rhs_ads = []  # initializing
    # rhs_ads from mbEnd
    mbEnd = d['_rhs_block'].find(name='div', attrs={'id': 'mbEnd'})
    if mbEnd:
        d['_mbEnd'] = mbEnd
        rhs_ads = rhs_ads + mbEnd.findAll('li')
    # rhs_ads from nobr
    nobr = d['_rhs_block'].find(name='ol', attrs={'class': 'nobr'})
    if nobr:
        d['_nobr'] = nobr
        rhs_ads = rhs_ads + nobr.findAll('li')
    # puting rhs_ads in the dict
    if rhs_ads:
        d['_rhs_ads_list'] = rhs_ads
    # Okay, no more parsing wishes, return the dict d
    return d