def get_link_to_pic_slideshow():
    """Return the absolute URL linked from the first interactive photo box.

    Fetches the page at ``get_url_to_pics()``, locates the first ``div`` with
    class ``photo-box--interactive`` and resolves the ``href`` of its first
    ``<a>`` tag against ``https://www.yelp.com``.

    Returns:
        str: The absolute URL taken from the first photo box's link.

    Raises:
        AttributeError: If no ``photo-box--interactive`` div is present
            (``find`` returns ``None`` in that case).
        TypeError: If the div contains no ``<a>`` tag.
        KeyError: If the ``<a>`` tag has no ``href`` attribute.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urljoin

    photos_markup = simple_get(get_url_to_pics())
    photos_soup = BeautifulSoup(photos_markup, 'html.parser')
    first_box = photos_soup.find("div", class_="photo-box--interactive")
    href = first_box.a["href"]
    # urljoin rather than plain concatenation: a root-relative href
    # ("/biz_photos/...") resolves exactly as before, while an href that is
    # already absolute, protocol-relative, or missing its leading slash no
    # longer produces a malformed URL.
    return urljoin('https://www.yelp.com', href)
def get_total_pic_pages():
    """Return the total number of photo pages as an int.

    Reads the text of the first ``div`` with class ``page-of-pages`` on the
    page at ``get_url_to_pics()`` and converts its fourth
    whitespace-separated token to an integer.

    NOTE: this is admittedly hacky; it may be better to look at how the
    page's JavaScript (or whatever else) generates the page numbers.
    Presumably the text reads like "Page 1 of N" -- TODO confirm.
    """
    photos_markup = simple_get(get_url_to_pics())
    photos_soup = BeautifulSoup(photos_markup, 'html.parser')
    pager = photos_soup.find("div", class_="page-of-pages")
    tokens = pager.text.strip().split()
    # The token at index 3 is taken to be the page total.
    return int(tokens[3])
def get_pictures():
    """Return the image ``src`` URLs found on the photos page.

    Fetches the page at ``get_url_to_pics()`` and, for every ``div`` with
    class ``photo-box--interactive``, takes the ``src`` attribute of its
    first ``<img>`` tag.

    Returns:
        list: One ``src`` value per matching photo box, in document order.
    """
    photos_markup = simple_get(get_url_to_pics())
    photos_soup = BeautifulSoup(photos_markup, 'html.parser')
    # Alternatives noted earlier: select the img tags with class
    # "photo-box-img" directly, or use the li elements carrying
    # 'data-photo-id'.
    photo_boxes = photos_soup.find_all("div", class_="photo-box--interactive")
    return [box.img["src"] for box in photo_boxes]
from bs4 import BeautifulSoup
# NOTE(review): presumably a project-local parser.py; the name collides with
# the stdlib `parser` module on Python < 3.10 -- verify which one is imported.
from parser import simple_get

# Module-level setup: the business page is fetched and parsed once, at import
# time. For offline work, read a saved copy instead:
#     raw_html = open('test.html').read()
raw_html = simple_get('https://www.yelp.com/biz/jacks-prime-san-mateo-4?osq=burger')
print("done")
html = BeautifulSoup(raw_html, 'html.parser')


class Business:
    """Placeholder class; it defines no attributes or methods yet."""
def get_total_pics():
    """Return the photo count shown on the photos page as an int.

    Reads the text of the first ``span`` with class ``tab-link_count`` on the
    page at ``get_url_to_pics()``, removes the wrapping parentheses and
    converts the remainder to an integer.

    Returns:
        int: The parsed photo count.

    Raises:
        AttributeError: If no ``tab-link_count`` span is present (``find``
            returns ``None`` in that case).
        ValueError: If the remaining text is not an integer.
    """
    photos_markup = simple_get(get_url_to_pics())
    photos_soup = BeautifulSoup(photos_markup, 'html.parser')
    count_text = photos_soup.find("span", class_="tab-link_count").text
    # Strip whitespace before the parentheses: strip('()') alone leaves
    # " (12) " untouched and int() then fails on the parentheses.
    cleaned = count_text.strip().strip('()')
    # Drop thousands separators so a count like "1,234" still parses.
    cleaned = cleaned.replace(',', '')
    return int(cleaned)