def get_all_urls_celeb_stats(base_url):
    """Collect the body-statistics page URLs listed in the document at base_url.

    The document is loaded through ``web_utils.load_html_data_from_url`` and
    parsed with BeautifulSoup. Every ``<loc>`` element whose text contains
    ``height-weight-body-statistics`` is kept.

    base_url -- address of the document to scan (presumably an XML sitemap,
                given the <loc> tags; confirm against the caller).

    Returns a list holding the text of each matching <loc> element, in the
    order BeautifulSoup reports them.
    """
    html_all_urls = web_utils.load_html_data_from_url(base_url)
    soup = BeautifulSoup(html_all_urls)

    urls = []
    for loc in soup.findAll('loc'):
        link = loc.string
        # .string is None for an empty <loc> or one holding several child
        # nodes; the substring test would raise TypeError on None, so such
        # entries are skipped instead of aborting the whole scan.
        if link is not None and "height-weight-body-statistics" in link:
            urls.append(link)
    return urls
def _tag_text(tag):
    """Return tag.string, or its stripped text fragments joined by spaces."""
    text = tag.string
    # .string is None for tags that hold several child nodes; fall back to
    # concatenating every text fragment found underneath the tag.
    if text is None:
        text = " ".join(tag.stripped_strings)
    return text


def get_stats_healthyceleb(url):
    """Scrape the attribute/value statistics from one celebrity page.

    The page is loaded through ``web_utils.load_html_data_from_url`` and
    parsed with BeautifulSoup. Inside the ``<div class="page-content">``
    element, each ``<h3>`` supplies an attribute name and the next ``<p>``
    that follows it supplies that attribute's value.

    url -- address of the page to crawl; it is also echoed to stdout.

    Returns a dict mapping attribute name to value. A value that is empty
    after stripping is replaced by the ``alt`` text of an ``<img>`` inside the
    paragraph, or by "NA" when there is no image or no ``alt`` attribute.
    An empty dict is returned when the page has no page-content div.
    """
    # Parenthesised single-argument form prints the same text on Python 2
    # and is also valid syntax on Python 3.
    print("Crawling URL .." + url)
    html_content = web_utils.load_html_data_from_url(url)
    soup = BeautifulSoup(html_content)
    celeb_stats = {}

    # First extract the main div containing all the page content.
    page_content = soup.find('div', {'class' : 'page-content'})
    if page_content is None:
        # Page layout differs from what is expected; nothing to extract.
        return celeb_stats

    # Now walk the relevant children:
    # <h3> represents attributes, <p> represents attribute values.
    key = EMPTY_STRING
    for child in page_content.findChildren():
        if child.name == "h3":
            # Same fallback as for values, so a heading with mixed content
            # does not end up stored under a None key.
            key = _tag_text(child)
        elif child.name == "p" and key != EMPTY_STRING:
            # strip() with no argument already removes newlines as well.
            value = _tag_text(child).strip()

            # If the value is still empty, the data is probably hidden in an
            # image. Equality (not identity) is the correct test here: two
            # equal strings are not guaranteed to be the same object.
            if value == EMPTY_STRING:
                image = child.find('img')
                if image is not None:
                    # Tolerate an <img> that carries no alt attribute.
                    value = image.get('alt', "NA")
                else:
                    value = "NA"

            celeb_stats[key] = value
            # Reset so that later <p> tags are ignored until the next <h3>.
            key = EMPTY_STRING
    return celeb_stats