示例#1
0
def google_light_parse(gresult):
    """
    Lightweight parse of a google result page.

    gresult is converted with parse_util.x_to_soup; when the resulting soup
    holds a <div id="resultStats">, the value of
    google.parse_number_of_results(resultStats) is stored in parse_dict
    under the key '_resultStats'.

    NOTE(review): the visible code ends without returning parse_dict, so the
    definition looks truncated -- confirm against the full source.
    """
    gresult = parse_util.x_to_soup(gresult)
    parse_dict = dict()
    # Search the converted soup. The previous code called .find on the
    # builtin `input` function, which has no such attribute.
    resultStats = gresult.find(name='div', attrs={'id': 'resultStats'})
    if resultStats:
        parse_dict['_resultStats'] = google.parse_number_of_results(
            resultStats)
示例#2
0
def element_presence_dict(input):
    """
    Collect flags and counts describing what a google result page contains.

    input is converted with putil.x_to_soup, and the page text (get_text
    with ' ' as separator) is searched with regular expressions. The dict d
    is filled with:
      'isa_search_term_redirect'   -- True when the text contains
                                      'Showing results for' or
                                      'Including results for'
      'did_you_mean'               -- True when the text contains
                                      'Did you mean'
      'number_of_results'          -- the re.findall matches of
                                      'About (<RE_NUMBER>) results'; when
                                      there are none, 0 if the text says the
                                      search did not match any documents,
                                      else -1
      'isa_number_of_results'      -- set to True only when the 'About ...'
                                      pattern matched
      'num_of_center_col_children' -- number of children of
                                      <div id="center_col"> (0 if absent)

    NOTE(review): the visible code ends without returning d, so the
    definition looks truncated -- confirm against the full source.
    """
    input = putil.x_to_soup(input)
    # NOTE(review): html is not used by the visible code; kept in case the
    # (apparently truncated) remainder of the function relies on it.
    html = input.renderContents()
    text = input.get_text(' ')
    d = dict()
    # search_term_redirect (when google shows results for a different search
    # term than the entered--should be in _spell)
    d['isa_search_term_redirect'] = bool(
        re.search('(Showing results for)|(Including results for)', text))
    # did_you_mean
    d['did_you_mean'] = bool(re.search('Did you mean', text))
    # number of results
    d['number_of_results'] = re.findall('About (' + RE_NUMBER + ') results',
                                        text)
    if len(d['number_of_results']) > 0:
        d['isa_number_of_results'] = True
    else:
        # re.search needs the string to scan as its second argument; the
        # previous call passed only the pattern and raised TypeError.
        if re.search('Your search .* did not match any documents.', text):
            d['number_of_results'] = 0
        else:
            d['number_of_results'] = -1
    # number of center_col_elements
    center_col = input.find('div', {'id': 'center_col'})
    if center_col:
        d['num_of_center_col_children'] = len(list(center_col.children))
    else:
        d['num_of_center_col_children'] = 0
示例#3
0
文件: google_bak01.py 项目: yz-/ut
def num_of_results_soup01(soup):
    """
    Number of results of a result page, extracted with Beautifulsoup.

    The input is converted with util.x_to_soup and must come out as a
    BeautifulSoup object. The section returned by get_section for the ids
    'appbar' / 'topabar' is scanned for numbers followed by 'result'; the
    last number found is stripped of RE_NUM_SEP separators and returned as
    an int.
    """
    parsed = util.x_to_soup(soup)
    assert isinstance(parsed, BeautifulSoup), "hey, I'm expecting soup!"
    section = get_section(parsed, [{'id': 'appbar'}, {'id': 'topabar'}])
    number_pattern = "(" + RE_NUMBER + ").*?result"
    matches = re.findall(number_pattern, section.text)
    # the last match is the one that gets reported
    last_number = re.sub(RE_NUM_SEP, "", matches[-1])
    return int(last_number)
示例#4
0
def num_of_results_soup01(soup):
    """
    Number of results of a result page, extracted with Beautifulsoup.

    The input is converted with util.x_to_soup and must come out as a
    BeautifulSoup object. The section returned by get_section for the ids
    'appbar' / 'topabar' is scanned for numbers followed by 'result'; the
    last number found is stripped of RE_NUM_SEP separators and returned as
    an int.
    """
    parsed = util.x_to_soup(soup)
    assert isinstance(parsed, BeautifulSoup), "hey, I'm expecting soup!"
    section = get_section(parsed, [{'id': 'appbar'}, {'id': 'topabar'}])
    number_pattern = "(" + RE_NUMBER + ").*?result"
    matches = re.findall(number_pattern, section.text)
    # the last match is the one that gets reported
    last_number = re.sub(RE_NUM_SEP, "", matches[-1])
    return int(last_number)
示例#5
0
文件: google_bak01.py 项目: yz-/ut
def google_webResults_soup(source):
    """
    Web results of a google result page, extracted with Beautifulsoup.

    Returns a list with one dict per <li class="g"> element, in document
    order. Each dict holds:
      'pos'   -- 1-based position of the item
      'title' -- result of item.find('a')
      'text'  -- result of item.find('span', attrs={'class': 'st'})
    """
    soup = util.x_to_soup(source)
    results = []
    for rank, item in enumerate(soup.findAll('li', attrs={'class': 'g'}), 1):
        entry = {
            'pos': rank,
            'title': item.find('a'),
            'text': item.find('span', attrs={'class': 'st'}),
        }
        results.append(entry)
    return results
示例#6
0
def get_pois_near_hotel_location(html):
    """
    Extract the points of interest listed in the 'location-distances' div.

    html is converted with parse_util.x_to_soup. The rendered contents of
    <div id="location-distances"> are split on '<br/>', every piece is
    prefixed with '. ' and searched with pois_near_hotel_exp_0; the stripped
    full match of each piece that matched is returned, in page order.

    Returns an empty list when the page has no 'location-distances' div
    (previously this case raised AttributeError on None).
    """
    html = parse_util.x_to_soup(html)
    distances = html.find('div', attrs={'id': "location-distances"})
    if distances is None:
        # nothing to extract from a page without the section
        return []
    pieces = distances.renderContents().split('<br/>')
    matches = [re.search(pois_near_hotel_exp_0, '. ' + piece)
               for piece in pieces]
    return [m.group(0).strip() for m in matches if m]
示例#7
0
文件: google_bak01.py 项目: yz-/ut
def rh_ads(input):
    """
    right hand side ads

    input is converted with util.x_to_soup and handed to rhs_block. The <li>
    items of the first <ol> inside the first element returned by rhs_block
    are returned as a list of str(li).

    Returns an empty list when rhs_block yields nothing or when the block
    holds no <ol>. These paths used to fall through and return None
    implicitly, although an empty `output` list was prepared for them.
    """
    input = util.x_to_soup(input)
    rhs_block_element = rhs_block(input)
    if len(rhs_block_element) == 0:
        return []
    ol_section = rhs_block_element[0].findAll('ol')
    if not ol_section:
        return []
    # TODO: do we want string, unicode, or the tag itself as is?
    return [str(li) for li in ol_section[0].findAll('li')]
示例#8
0
def google_webResults_soup(source):
    """
    Web results of a google result page, extracted with Beautifulsoup.

    Returns a list with one dict per <li class="g"> element, in document
    order. Each dict holds:
      'pos'   -- 1-based position of the item
      'title' -- result of item.find('a')
      'text'  -- result of item.find('span', attrs={'class': 'st'})
    """
    soup = util.x_to_soup(source)
    results = []
    for rank, item in enumerate(soup.findAll('li', attrs={'class': 'g'}), 1):
        entry = {
            'pos': rank,
            'title': item.find('a'),
            'text': item.find('span', attrs={'class': 'st'}),
        }
        results.append(entry)
    return results
示例#9
0
def rh_ads(input):
    """
    right hand side ads

    input is converted with util.x_to_soup and handed to rhs_block. The <li>
    items of the first <ol> inside the first element returned by rhs_block
    are returned as a list of str(li).

    Returns an empty list when rhs_block yields nothing or when the block
    holds no <ol>. These paths used to fall through and return None
    implicitly, although an empty `output` list was prepared for them.
    """
    input = util.x_to_soup(input)
    rhs_block_element = rhs_block(input)
    if len(rhs_block_element) == 0:
        return []
    ol_section = rhs_block_element[0].findAll('ol')
    if not ol_section:
        return []
    # TODO: do we want string, unicode, or the tag itself as is?
    return [str(li) for li in ol_section[0].findAll('li')]
示例#10
0
文件: google.py 项目: yz-/ut
def element_presence_dict(input):
    """
    Collect flags and counts describing what a google result page contains.

    input is converted with putil.x_to_soup, and the page text (get_text
    with ' ' as separator) is searched with regular expressions. The dict d
    is filled with:
      'isa_search_term_redirect'   -- True when the text contains
                                      'Showing results for' or
                                      'Including results for'
      'did_you_mean'               -- True when the text contains
                                      'Did you mean'
      'number_of_results'          -- the re.findall matches of
                                      'About (<RE_NUMBER>) results'; when
                                      there are none, 0 if the text says the
                                      search did not match any documents,
                                      else -1
      'isa_number_of_results'      -- set to True only when the 'About ...'
                                      pattern matched
      'num_of_center_col_children' -- number of children of
                                      <div id="center_col"> (0 if absent)

    NOTE(review): the visible code ends without returning d, so the
    definition looks truncated -- confirm against the full source.
    """
    input = putil.x_to_soup(input)
    # NOTE(review): html is not used by the visible code; kept in case the
    # (apparently truncated) remainder of the function relies on it.
    html = input.renderContents()
    text = input.get_text(' ')
    d = dict()
    # search_term_redirect (when google shows results for a different search
    # term than the entered--should be in _spell)
    d['isa_search_term_redirect'] = bool(
        re.search('(Showing results for)|(Including results for)', text))
    # did_you_mean
    d['did_you_mean'] = bool(re.search('Did you mean', text))
    # number of results
    d['number_of_results'] = re.findall('About (' + RE_NUMBER + ') results',
                                        text)
    if len(d['number_of_results']) > 0:
        d['isa_number_of_results'] = True
    else:
        # re.search needs the string to scan as its second argument; the
        # previous call passed only the pattern and raised TypeError.
        if re.search('Your search .* did not match any documents.', text):
            d['number_of_results'] = 0
        else:
            d['number_of_results'] = -1
    # number of center_col_elements
    center_col = input.find('div', {'id': 'center_col'})
    if center_col:
        d['num_of_center_col_children'] = len(list(center_col.children))
    else:
        d['num_of_center_col_children'] = 0
示例#11
0
文件: khan01_spike.py 项目: yz-/ut
def google_light_parse(gresult):
    """
    Lightweight parse of a google result page.

    gresult is converted with parse_util.x_to_soup; when the resulting soup
    holds a <div id="resultStats">, the value of
    google.parse_number_of_results(resultStats) is stored in parse_dict
    under the key '_resultStats'.

    NOTE(review): the visible code ends without returning parse_dict, so the
    definition looks truncated -- confirm against the full source.
    """
    gresult = parse_util.x_to_soup(gresult)
    parse_dict = dict()
    # Search the converted soup. The previous code called .find on the
    # builtin `input` function, which has no such attribute.
    resultStats = gresult.find(name='div', attrs={'id': 'resultStats'})
    if resultStats:
        parse_dict['_resultStats'] = google.parse_number_of_results(
            resultStats)
示例#12
0
def get_info_dict_01(input):
    """
    Convert input with putil.x_to_soup.

    NOTE(review): the visible body stops after the conversion and returns
    nothing; the definition looks truncated -- confirm against the full
    source before relying on it.
    """
    input = putil.x_to_soup(input)
示例#13
0
文件: google_bak01.py 项目: yz-/ut
def top_elements(input):
    """
    Elements shown above the webResults (top ads, hotel finder, ...).

    input is converted with util.x_to_soup; every <span id="taw"> found in
    the resulting soup is returned.
    """
    soup = util.x_to_soup(input)
    taw_spans = soup.findAll('span', attrs={'id': 'taw'})
    return taw_spans
示例#14
0
def root_dict(soup):
    """
    Tag dict of the whole document.

    soup is converted with util.x_to_soup and passed, together with the
    label 'root', to extract_tag_dict_from_node, whose result is returned.
    """
    root_node = util.x_to_soup(soup)
    tag_dict = extract_tag_dict_from_node(root_node, 'root')
    return tag_dict
示例#15
0
def top_elements(input):
    """
    Elements shown above the webResults (top ads, hotel finder, ...).

    input is converted with util.x_to_soup; every <span id="taw"> found in
    the resulting soup is returned.
    """
    soup = util.x_to_soup(input)
    taw_spans = soup.findAll('span', attrs={'id': 'taw'})
    return taw_spans
示例#16
0
def mk_gresult_tag_dict(input):
    """
    mk_result_dict(input)
    takes a soup, html string, or filename of google result htmls as an input
    and returns a dict containing components of the html we're interested in

    A key is only present when the corresponding element was found:
      '_resultStats'          <div id="resultStats">
      '_center_col'           <div id="center_col">
      '_tads'                 <div id="tads"> inside _center_col
      '_top_ads_list'         the <li> items of _tads
      '_res'                  <div id="res"> inside _center_col
      '_topstuff'             <div id="topstuff"> inside _res
      '_spell'                <a class="spell"> inside _topstuff
      '_search'               <div id="search"> inside _res
      '_ires'                 <div id="ires"> inside _search
      '_organic_results_list' the <li> items of _ires
      '_related_search'       first <table> in the sibling following _res
      '_rhs_block'            <div id="rhs_block">
      '_mbEnd'                <div id="mbEnd"> inside _rhs_block
      '_nobr'                 <ol class="nobr"> inside _rhs_block
      '_rhs_ads_list'         the <li> items of _mbEnd followed by _nobr's
    """
    input = util.x_to_soup(input)
    d = dict()

    # number of results
    resultStats = input.find(name='div', attrs={'id': 'resultStats'})
    if resultStats:
        d['_resultStats'] = resultStats

    # center_col
    center_col = input.find(name='div', attrs={'id': 'center_col'})
    if center_col:
        d['_center_col'] = center_col
        # tads
        tads = center_col.find(name='div', attrs={'id': 'tads'})
        if tads:
            d['_tads'] = tads
            # top_ads
            top_ads = tads.findAll('li')
            if top_ads:
                d['_top_ads_list'] = top_ads
        # res
        res = center_col.find(name='div', attrs={'id': 'res'})
        if res:
            d['_res'] = res
            # searchInstead
            topstuff = res.find(name='div', attrs={'id': 'topstuff'})
            if topstuff:
                d['_topstuff'] = topstuff
                # spell
                spell = topstuff.find(name='a', attrs={'class': 'spell'})
                if spell:
                    d['_spell'] = spell
            # search
            search = res.find(name='div', attrs={'id': 'search'})
            if search:
                d['_search'] = search
                # ires
                ires = search.find(name='div', attrs={'id': 'ires'})
                if ires:
                    d['_ires'] = ires
                    # organicResults
                    organic_results = ires.findAll('li')
                    if organic_results:
                        d['_organic_results_list'] = organic_results

            # related_search
            # Only looked up once res is known to exist: reading d['_res']
            # unconditionally raised KeyError for pages that have a
            # center_col but no res.
            after_res = res.nextSibling
            # A text-node sibling is a string, so its .find would be
            # str.find and hand back an int index instead of a tag; only
            # tag-like siblings (those offering findAll) are searched.
            if after_res is not None and hasattr(after_res, 'findAll'):
                related_search = after_res.find('table')
                if related_search:
                    d['_related_search'] = related_search

    # rhs_block
    rhs_block = input.find(name='div', attrs={'id': 'rhs_block'})
    if rhs_block:
        d['_rhs_block'] = rhs_block
        rhs_ads = []  # initializing
        # rhs_ads from mbEnd
        mbEnd = rhs_block.find(name='div', attrs={'id': 'mbEnd'})
        if mbEnd:
            d['_mbEnd'] = mbEnd
            rhs_ads = rhs_ads + mbEnd.findAll('li')
        # rhs_ads from nobr
        nobr = rhs_block.find(name='ol', attrs={'class': 'nobr'})
        if nobr:
            d['_nobr'] = nobr
            rhs_ads = rhs_ads + nobr.findAll('li')
        # putting rhs_ads in the dict
        if rhs_ads:
            d['_rhs_ads_list'] = rhs_ads
    # Okay, no more parsing wishes, return the dict d
    return d
示例#17
0
文件: google.py 项目: yz-/ut
def mk_gresult_tag_dict(input):
    """
    mk_result_dict(input)
    takes a soup, html string, or filename of google result htmls as an input
    and returns a dict containing components of the html we're interested in

    A key is only present when the corresponding element was found:
      '_resultStats'          <div id="resultStats">
      '_center_col'           <div id="center_col">
      '_tads'                 <div id="tads"> inside _center_col
      '_top_ads_list'         the <li> items of _tads
      '_c'                    <div class="c"> inside _center_col
      '_c_list'               the <li> items of _c
      '_res'                  <div id="res"> inside _center_col
      '_topstuff'             <div id="topstuff"> inside _res
      '_spell'                <a class="spell"> inside _center_col (only
                              looked up when _res exists)
      '_search'               <div id="search"> inside _res
      '_ires'                 <div id="ires"> inside _search
      '_organic_results_list' the <li> items of _ires
      '_related_search'       first <table> in the sibling following _res
      '_rhs_block'            <div id="rhs_block">
      '_lu_pinned_rhs'        <div id="lu_pinned_rhs"> inside _rhs_block
      '_knop'                 <div id="knop"> inside _rhs_block
      '_mbEnd'                <div id="mbEnd"> inside _rhs_block
      '_nobr'                 <ol class="nobr"> inside _rhs_block
      '_rhs_ads_list'         the <li> items of _mbEnd followed by _nobr's
    """
    input = putil.x_to_soup(input)
    d = dict()

    # number of results
    resultStats = input.find(name='div', attrs={'id': 'resultStats'})
    if resultStats:
        d['_resultStats'] = resultStats

    # center_col
    center_col = input.find(name='div', attrs={'id': 'center_col'})
    if center_col:
        d['_center_col'] = center_col
        # tads   (center_col.taw.tvcap.tads)
        tads = center_col.find(name='div', attrs={'id': 'tads'})
        if tads:
            d['_tads'] = tads
            # top_ads
            top_ads = tads.findAll('li')
            if top_ads:
                d['_top_ads_list'] = top_ads
        # c   (center_col.taw.tvcap.c)
        c = center_col.find(name='div', attrs={'class': 'c'})
        if c:
            d['_c'] = c
            # c_list
            c_list = c.findAll('li')
            if c_list:
                d['_c_list'] = c_list
        # res
        res = center_col.find(name='div', attrs={'id': 'res'})
        if res:
            d['_res'] = res
            # searchInstead
            topstuff = res.find(name='div', attrs={'id': 'topstuff'})
            if topstuff:
                # used to contain spell, but the spell appeared in other
                # places some times, so it is searched in _center_col
                d['_topstuff'] = topstuff
            # spell
            spell = center_col.find(name='a', attrs={'class': 'spell'})
            if spell:
                d['_spell'] = spell
            # search
            search = res.find(name='div', attrs={'id': 'search'})
            if search:
                d['_search'] = search
                # ires
                ires = search.find(name='div', attrs={'id': 'ires'})
                if ires:
                    d['_ires'] = ires
                    # organicResults
                    organic_results = ires.findAll('li')
                    if organic_results:
                        d['_organic_results_list'] = organic_results

            # related_search
            # (an alternative lookup through <div id="extrares"> existed
            # here as disabled code; the sibling-based lookup is the live
            # one)
            # Only looked up once res is known to exist: reading d['_res']
            # unconditionally raised KeyError for pages that have a
            # center_col but no res.
            after_res = res.nextSibling
            # A text-node sibling is a string, so its .find would be
            # str.find and hand back an int index instead of a tag; only
            # tag-like siblings (those offering findAll) are searched.
            if after_res is not None and hasattr(after_res, 'findAll'):
                related_search = after_res.find('table')
                if related_search:
                    d['_related_search'] = related_search

    # rhs_block
    rhs_block = input.find(name='div', attrs={'id': 'rhs_block'})
    if rhs_block:
        d['_rhs_block'] = rhs_block
        # lu_pinned_rhs (where some hotel finder, maps, specific hotels
        # might be)
        lu_pinned_rhs = rhs_block.find(name='div',
                                       attrs={'id': 'lu_pinned_rhs'})
        if lu_pinned_rhs:
            d['_lu_pinned_rhs'] = lu_pinned_rhs
        # knop (another place where some hotel finder, maps, specific hotels
        # might be)
        knop = rhs_block.find(name='div', attrs={'id': 'knop'})
        if knop:
            d['_knop'] = knop
        rhs_ads = []  # initializing
        # rhs_ads from mbEnd
        mbEnd = rhs_block.find(name='div', attrs={'id': 'mbEnd'})
        if mbEnd:
            d['_mbEnd'] = mbEnd
            rhs_ads = rhs_ads + mbEnd.findAll('li')
        # rhs_ads from nobr
        nobr = rhs_block.find(name='ol', attrs={'class': 'nobr'})
        if nobr:
            d['_nobr'] = nobr
            rhs_ads = rhs_ads + nobr.findAll('li')
        # putting rhs_ads in the dict
        if rhs_ads:
            d['_rhs_ads_list'] = rhs_ads
    # Okay, no more parsing wishes, return the dict d
    return d
示例#18
0
文件: google.py 项目: yz-/ut
def get_info_dict_01(input):
    """
    Convert input with putil.x_to_soup.

    NOTE(review): the visible body stops after the conversion and returns
    nothing; the definition looks truncated -- confirm against the full
    source before relying on it.
    """
    input = putil.x_to_soup(input)
示例#19
0
文件: google_bak01.py 项目: yz-/ut
def root_dict(soup):
    """
    Tag dict of the whole document.

    soup is converted with util.x_to_soup and passed, together with the
    label 'root', to extract_tag_dict_from_node, whose result is returned.
    """
    root_node = util.x_to_soup(soup)
    tag_dict = extract_tag_dict_from_node(root_node, 'root')
    return tag_dict
示例#20
0
def mk_gresult_tag_dict(input):
    """
    mk_result_dict(input)
    takes a soup, html string, or filename of google result htmls as an input
    and returns a dict containing components of the html we're interested in

    A key is only present when the corresponding element was found:
      '_resultStats'          <div id="resultStats">
      '_center_col'           <div id="center_col">
      '_tads'                 <div id="tads"> inside _center_col
      '_top_ads_list'         the <li> items of _tads
      '_c'                    <div class="c"> inside _center_col
      '_c_list'               the <li> items of _c
      '_res'                  <div id="res"> inside _center_col
      '_topstuff'             <div id="topstuff"> inside _res
      '_spell'                <a class="spell"> inside _center_col (only
                              looked up when _res exists)
      '_search'               <div id="search"> inside _res
      '_ires'                 <div id="ires"> inside _search
      '_organic_results_list' the <li> items of _ires
      '_related_search'       first <table> in the sibling following _res
      '_rhs_block'            <div id="rhs_block">
      '_lu_pinned_rhs'        <div id="lu_pinned_rhs"> inside _rhs_block
      '_knop'                 <div id="knop"> inside _rhs_block
      '_mbEnd'                <div id="mbEnd"> inside _rhs_block
      '_nobr'                 <ol class="nobr"> inside _rhs_block
      '_rhs_ads_list'         the <li> items of _mbEnd followed by _nobr's
    """
    input = putil.x_to_soup(input)
    d = dict()

    # number of results
    resultStats = input.find(name='div', attrs={'id': 'resultStats'})
    if resultStats:
        d['_resultStats'] = resultStats

    # center_col
    center_col = input.find(name='div', attrs={'id': 'center_col'})
    if center_col:
        d['_center_col'] = center_col
        # tads   (center_col.taw.tvcap.tads)
        tads = center_col.find(name='div', attrs={'id': 'tads'})
        if tads:
            d['_tads'] = tads
            # top_ads
            top_ads = tads.findAll('li')
            if top_ads:
                d['_top_ads_list'] = top_ads
        # c   (center_col.taw.tvcap.c)
        c = center_col.find(name='div', attrs={'class': 'c'})
        if c:
            d['_c'] = c
            # c_list
            c_list = c.findAll('li')
            if c_list:
                d['_c_list'] = c_list
        # res
        res = center_col.find(name='div', attrs={'id': 'res'})
        if res:
            d['_res'] = res
            # searchInstead
            topstuff = res.find(name='div', attrs={'id': 'topstuff'})
            if topstuff:
                # used to contain spell, but the spell appeared in other
                # places some times, so it is searched in _center_col
                d['_topstuff'] = topstuff
            # spell
            spell = center_col.find(name='a', attrs={'class': 'spell'})
            if spell:
                d['_spell'] = spell
            # search
            search = res.find(name='div', attrs={'id': 'search'})
            if search:
                d['_search'] = search
                # ires
                ires = search.find(name='div', attrs={'id': 'ires'})
                if ires:
                    d['_ires'] = ires
                    # organicResults
                    organic_results = ires.findAll('li')
                    if organic_results:
                        d['_organic_results_list'] = organic_results

            # related_search
            # (an alternative lookup through <div id="extrares"> existed
            # here as disabled code; the sibling-based lookup is the live
            # one)
            # Only looked up once res is known to exist: reading d['_res']
            # unconditionally raised KeyError for pages that have a
            # center_col but no res.
            after_res = res.nextSibling
            # A text-node sibling is a string, so its .find would be
            # str.find and hand back an int index instead of a tag; only
            # tag-like siblings (those offering findAll) are searched.
            if after_res is not None and hasattr(after_res, 'findAll'):
                related_search = after_res.find('table')
                if related_search:
                    d['_related_search'] = related_search

    # rhs_block
    rhs_block = input.find(name='div', attrs={'id': 'rhs_block'})
    if rhs_block:
        d['_rhs_block'] = rhs_block
        # lu_pinned_rhs (where some hotel finder, maps, specific hotels
        # might be)
        lu_pinned_rhs = rhs_block.find(name='div',
                                       attrs={'id': 'lu_pinned_rhs'})
        if lu_pinned_rhs:
            d['_lu_pinned_rhs'] = lu_pinned_rhs
        # knop (another place where some hotel finder, maps, specific hotels
        # might be)
        knop = rhs_block.find(name='div', attrs={'id': 'knop'})
        if knop:
            d['_knop'] = knop
        rhs_ads = []  # initializing
        # rhs_ads from mbEnd
        mbEnd = rhs_block.find(name='div', attrs={'id': 'mbEnd'})
        if mbEnd:
            d['_mbEnd'] = mbEnd
            rhs_ads = rhs_ads + mbEnd.findAll('li')
        # rhs_ads from nobr
        nobr = rhs_block.find(name='ol', attrs={'class': 'nobr'})
        if nobr:
            d['_nobr'] = nobr
            rhs_ads = rhs_ads + nobr.findAll('li')
        # putting rhs_ads in the dict
        if rhs_ads:
            d['_rhs_ads_list'] = rhs_ads
    # Okay, no more parsing wishes, return the dict d
    return d
示例#21
0
def parse_hotel_info_page(html):
    """
    Parse a hotel info page into a dict, field by field, best effort.

    html is converted with parse_util.x_to_soup. A key is only written when
    the element it comes from is found on the page:
      'hotel_name', 'hotel_street_address', 'hotel_locality', 'currency',
      'facebook_likes', 'hotel_description'
          -- filled through parse_bsoup.add_text_to_parse_dict (presumably
             the transformed text of the matching element -- confirm
             against that helper)
      'hotel_address'         -- stripped text of <p id="property-address">
      'average_price'         -- float built from the text of
                                 <span id="avgPriceEl0"> plus, when present,
                                 <sup id="avgPriceDecimals">
      'num_of_photos'         -- number of <li> in <div id="photo_gallery">
      'average_venere_rating' -- float text of <b property="v:rating">
                                 inside <div id="avg_guest_rating">
      'facilities'            -- cleaned text of each <li> in
                                 <div id="facilities">
      'alternate_names'       -- '<br>'-separated pieces of the <p> inside
                                 <div id="also_known_as">, spaces and tags
                                 stripped
      'overview_reviews'      -- dict mapping the <p> text of each
                                 'reviews-overview-horizzontalbar' div to
                                 the float text of its <b>
      'poi_and_distances'     -- list of {'poi', 'km', 'mi'} dicts taken
                                 from <div id="location-distances">

    A rating block without its <b>, or a review bar without its <p>/<b>, is
    skipped; it used to abort the whole parse with AttributeError.
    """
    html = parse_util.x_to_soup(html)
    d = dict()

    # hotel name
    d = parse_bsoup.add_text_to_parse_dict(
        soup=html, parse_dict=d, key='hotel_name', name='h1',
        attrs={'property': 'v:name'},
        text_transform=parse_util.strip_spaces)

    # hotel address
    tag = html.find(name='p', attrs={'id': 'property-address'})
    if tag:
        d['hotel_address'] = pstr_trans.strip(tag.text)
        d = parse_bsoup.add_text_to_parse_dict(
            soup=tag, parse_dict=d, key='hotel_street_address', name='span',
            attrs={'property': "v:street-address"},
            text_transform=parse_util.strip_spaces)
        d = parse_bsoup.add_text_to_parse_dict(
            soup=tag, parse_dict=d, key='hotel_locality', name='span',
            attrs={'property': "v:locality"},
            text_transform=parse_util.strip_spaces)

    # average price
    d = parse_bsoup.add_text_to_parse_dict(
        soup=html, parse_dict=d, key='currency', name='span',
        attrs={'id': 'currency-symbol'},
        text_transform=parse_util.strip_spaces)
    avgPriceEl0 = html.find(name='span', attrs={'id': 'avgPriceEl0'})
    avgPriceDecimals = html.find(name='sup', attrs={'id': 'avgPriceDecimals'})
    if avgPriceEl0:
        d['average_price'] = avgPriceEl0.text
        if avgPriceDecimals:
            d['average_price'] = d['average_price'] + avgPriceDecimals.text
        d['average_price'] = float(d['average_price'])

    # facebook likes
    d = parse_bsoup.add_text_to_parse_dict(
        soup=html, parse_dict=d, key='facebook_likes', name='span',
        attrs={'class': 'pluginCountTextDisconnected'},
        text_transform=float)

    # num_of_photos
    tag = html.find(name='div', attrs={'id': 'photo_gallery'})
    if tag:
        d['num_of_photos'] = len(tag.findAll(name='li'))

    # hotel description
    d = parse_bsoup.add_text_to_parse_dict(
        soup=html, parse_dict=d, key='hotel_description', name='div',
        attrs={'id': 'hotel-description-body'},
        text_transform=parse_util.strip_spaces)

    # average_venere_rating
    tag = html.find(name='div', attrs={'id': 'avg_guest_rating'})
    if tag:
        rating = tag.find(name='b', attrs={'property': 'v:rating'})
        # the rating block may come without its <b>; skip the field then
        if rating is not None:
            d['average_venere_rating'] = float(rating.text)

    # facilities
    tag = html.find(name='div', attrs={'id': 'facilities'})
    if tag:
        facilities = tag.findAll(name='li')
        if facilities:
            d['facilities'] = [parse_util.strip_spaces(x.text)
                               for x in facilities]

    # alternate names
    tag = html.find(name='div', attrs={'id': 'also_known_as'})
    if tag:
        tag = tag.find(name='p')
        if tag:
            t = [parse_util.strip_spaces(x)
                 for x in tag.renderContents().split('<br>')]
            t = [parse_util.strip_tags(x) for x in t]
            d['alternate_names'] = t

    # overview_reviews
    tag = html.find(name='div', attrs={'id': 'reviews-overview-hbar-box'})
    if tag:
        bars = tag.findAll(
            name='div', attrs={'class': 'reviews-overview-horizzontalbar'})
        if bars:
            d['overview_reviews'] = dict()
            for bar in bars:
                label = bar.find(name='p')
                score = bar.find(name='b')
                # a bar missing its label or score cannot be recorded
                if label is None or score is None:
                    continue
                d['overview_reviews'][label.text] = float(score.text)

    # location_distances
    tag = html.find(name='div', attrs={'id': 'location-distances'})
    if tag:
        # drop everything up to and including the <h2> heading, then split
        # the remainder into one piece per '<br/>'
        t = re.sub(r"^[^<]+<h2>.+</h2>", "",
                   tag.renderContents()).split('<br/>')
        tt = [re.findall(pois_near_hotel_exp, x) for x in t]
        # keep the first match of every piece that matched
        tt = [x[0] for x in tt if x]
        d['poi_and_distances'] = [
            {'poi': parse_util.strip_spaces(x[0].replace('"', '')),
             'km': float(x[1]),
             'mi': float(x[2])}
            for x in tt]
    return d
示例#22
0
文件: google_bak01.py 项目: yz-/ut
def mk_gresult_tag_dict(input):
    """
    mk_result_dict(input)
    takes a soup, html string, or filename of google result htmls as an input
    and returns a dict containing components of the html we're interested in

    A key is only present when the corresponding element was found:
      '_resultStats'          <div id="resultStats">
      '_center_col'           <div id="center_col">
      '_tads'                 <div id="tads"> inside _center_col
      '_top_ads_list'         the <li> items of _tads
      '_res'                  <div id="res"> inside _center_col
      '_topstuff'             <div id="topstuff"> inside _res
      '_spell'                <a class="spell"> inside _topstuff
      '_search'               <div id="search"> inside _res
      '_ires'                 <div id="ires"> inside _search
      '_organic_results_list' the <li> items of _ires
      '_related_search'       first <table> in the sibling following _res
      '_rhs_block'            <div id="rhs_block">
      '_mbEnd'                <div id="mbEnd"> inside _rhs_block
      '_nobr'                 <ol class="nobr"> inside _rhs_block
      '_rhs_ads_list'         the <li> items of _mbEnd followed by _nobr's
    """
    input = util.x_to_soup(input)
    d = dict()

    # number of results
    resultStats = input.find(name='div', attrs={'id': 'resultStats'})
    if resultStats:
        d['_resultStats'] = resultStats

    # center_col
    center_col = input.find(name='div', attrs={'id': 'center_col'})
    if center_col:
        d['_center_col'] = center_col
        # tads
        tads = center_col.find(name='div', attrs={'id': 'tads'})
        if tads:
            d['_tads'] = tads
            # top_ads
            top_ads = tads.findAll('li')
            if top_ads:
                d['_top_ads_list'] = top_ads
        # res
        res = center_col.find(name='div', attrs={'id': 'res'})
        if res:
            d['_res'] = res
            # searchInstead
            topstuff = res.find(name='div', attrs={'id': 'topstuff'})
            if topstuff:
                d['_topstuff'] = topstuff
                # spell
                spell = topstuff.find(name='a', attrs={'class': 'spell'})
                if spell:
                    d['_spell'] = spell
            # search
            search = res.find(name='div', attrs={'id': 'search'})
            if search:
                d['_search'] = search
                # ires
                ires = search.find(name='div', attrs={'id': 'ires'})
                if ires:
                    d['_ires'] = ires
                    # organicResults
                    organic_results = ires.findAll('li')
                    if organic_results:
                        d['_organic_results_list'] = organic_results

            # related_search
            # Only looked up once res is known to exist: reading d['_res']
            # unconditionally raised KeyError for pages that have a
            # center_col but no res.
            after_res = res.nextSibling
            # A text-node sibling is a string, so its .find would be
            # str.find and hand back an int index instead of a tag; only
            # tag-like siblings (those offering findAll) are searched.
            if after_res is not None and hasattr(after_res, 'findAll'):
                related_search = after_res.find('table')
                if related_search:
                    d['_related_search'] = related_search

    # rhs_block
    rhs_block = input.find(name='div', attrs={'id': 'rhs_block'})
    if rhs_block:
        d['_rhs_block'] = rhs_block
        rhs_ads = []  # initializing
        # rhs_ads from mbEnd
        mbEnd = rhs_block.find(name='div', attrs={'id': 'mbEnd'})
        if mbEnd:
            d['_mbEnd'] = mbEnd
            rhs_ads = rhs_ads + mbEnd.findAll('li')
        # rhs_ads from nobr
        nobr = rhs_block.find(name='ol', attrs={'class': 'nobr'})
        if nobr:
            d['_nobr'] = nobr
            rhs_ads = rhs_ads + nobr.findAll('li')
        # putting rhs_ads in the dict
        if rhs_ads:
            d['_rhs_ads_list'] = rhs_ads
    # Okay, no more parsing wishes, return the dict d
    return d