def __init__(self, page_url):
    """Fetch page_url, retrying up to three times before giving up."""
    response = None
    self.url = None
    try:
        response = self.get_response_object(page_url)
    except Exception:
        try:
            # time.sleep(1)
            response = self.get_response_object(page_url)
        except Exception:
            try:
                # time.sleep(5)
                response = self.get_response_object(page_url)
            except Exception:
                print "\nTried 3 times. Cannot access url: %s. \nHence, cannot make HTML_page_Obj\n" % page_url
    if response is not None:
        self.url = page_url
        try:
            self.short_url = response.headers.getparam('Link')
        except Exception:
            self.short_url = ""
        self.charset = response.headers.getparam('charset')
        self.headers = {'charset': self.charset}
        for i in response.headers:
            self.headers[i] = response.headers[i].split(';')[0]
        self.html = response.read()  # the actual HTML
        self.html = text_manip.ensure_UTF8(self.html)
        self.html_soup = None
        self.link_dict = None
        self.all_style = None
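# Usage sketch (assumption: this __init__ belongs to HTML_page_Obj, as the
# error message above suggests; the URL below is only illustrative):
#
#   page = HTML_page_Obj("https://en.wikipedia.org/wiki/Main_Page")
#   if page.url is not None:  # url stays None when all three fetch attempts fail
#       print page.headers['charset']
#       print len(page.html)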
def __init__(self, wikipedia_image_url=None, direct_url=None):
    self.direct_image_link = None
    self.saved_path = None
    self.saved_filename = None
    if direct_url is not None:
        self.direct_image_link = direct_url
    elif wikipedia_image_url is not None:
        img_page = HTML_page_Obj(wikipedia_image_url)
        img_soup = img_page.make_soup()
        # print img_soup.find('div', class_="mw-filepage-resolutioninfo")
        self.main_img_link = img_soup.find(
            'div', class_="fullMedia").find(
            'a', class_="internal").get('href')
        if self.main_img_link.startswith("//upload."):
            self.main_img_link = "https:" + self.main_img_link
        img_links_soup = img_soup.find(
            "div", class_="mw-filepage-resolutioninfo")
        # Maps the product of width and height to tuples of each
        # variant's link and its pixel dimensions:
        self.img_px_to_links_map = {}
        for img_link in img_links_soup.find_all('a'):
            actual_link = img_link.get('href')
            if actual_link.startswith("//upload."):
                actual_link = "https:" + actual_link
            img_link_text = text_manip.ensure_UTF8(
                re.sub(",", "", img_link.get_text()))
            # print text_manip.ensure_ASCII("\n\nimg_link_text: %s" % img_link_text)
            img_dimensions = re.findall(r'\d+', img_link_text)
            if len(img_dimensions) == 2:
                # Only index into img_dimensions after confirming that both
                # width and height were found, to avoid an IndexError:
                width = int(img_dimensions[0])
                height = int(img_dimensions[1])
                self.img_px_to_links_map[width * height] = (
                    actual_link, width, height)
            else:
                print "Cannot use image %s, which is at %s with dimensions %s" % (
                    wikipedia_image_url, img_link.get('href'), img_dimensions)
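# Usage sketch (assumption: this __init__ belongs to the image class built on
# HTML_page_Obj; the class name and File: URL below are only illustrative).
# Since img_px_to_links_map is keyed by width*height, the highest-resolution
# variant is the entry with the largest key:
#
#   img = Wikipedia_Image_Obj(
#       wikipedia_image_url="https://en.wikipedia.org/wiki/File:Example.jpg")
#   best_link, width, height = img.img_px_to_links_map[
#       max(img.img_px_to_links_map)]
#   print "largest variant: %dx%d at %s" % (width, height, best_link)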
def html_soup_and_prettify(self):
    # Lazily build the soup once and cache it on the instance:
    if self.html_soup is None:
        self.html_soup = self.make_soup()
    self.html = text_manip.ensure_UTF8(self.html_soup.prettify())
    return self.html_soup
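# Usage sketch: repeated calls reuse the cached soup instead of re-parsing,
# so this is cheap to call whenever the prettified HTML is needed:
#
#   soup = page.html_soup_and_prettify()       # parses once, caches in html_soup
#   same_soup = page.html_soup_and_prettify()  # returns the cached soup
#   assert soup is same_soup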
def __init__(self, wikipedia_url, just_heading=False):
    # Inherit items from the HTML_page_Obj:
    self._page = HTML_page_Obj(wikipedia_url)
    self.url = self._page.url
    self.wiki_domain = text_manip.extract_website(
        self.url)  # gets en.wikipedia.org, etc., in different languages

    # Make the soup of the HTML and get the HTML, prettified:
    soup = self._page.html_soup_and_prettify()
    self.html = self._page.html  # prettified, in UTF-8 format

    # Get the heading of the Wikipedia page:
    self.heading = text_manip.ensure_UTF8(
        BeautifulSoup('''%s''' % soup.find(
            "h1", id="firstHeading",
            class_="firstHeading")).get_text().strip())

    if not just_heading:  # skip the heavy article processing when only the heading is needed
        # Make a soup of the article HTML, set the article HTML:
        article_soup = soup.find(
            "div", id="mw-content-text")  # might come in handy later
        self.article_html = text_manip.ensure_UTF8(article_soup.prettify())

        # Strip the reference list and the trailing "References", "See also",
        # "Further reading" and "External links" sections, so that only the
        # article body is parsed for links:
        temp_art_html = self.article_html
        temp_art_html = text_manip.remove_HTML_perfect(
            html=temp_art_html, tag="div", class_list=["reflist"])
        temp_art_html = text_manip.remove_after(
            regex='<span.*?id="References"', text=temp_art_html)
        temp_art_html = text_manip.remove_after(
            regex='<span.*?id="See_also"', text=temp_art_html)
        temp_art_html = text_manip.remove_after(
            regex='<span.*?id="Further_reading"', text=temp_art_html)
        temp_art_html = text_manip.remove_after(
            regex='<span.*?id="External_links"', text=temp_art_html)
        article_soup = BeautifulSoup(temp_art_html)

        # Get the links from the article HTML:
        self.article_links = []  # real links, from the article
        self.img_links = []      # real direct links, from the article
        self.direct_img_to_img_map = {}
        self.media_links = []    # real links, from the article
        not_allowed_in_url_from_article = [
            "/wiki/Special:", "/wiki/Template:", "/wiki/Portal:",
            "/wiki/Wikipedia:", "/wiki/Help:", "redlink=1"
        ]
        for link_tag in article_soup.find_all('a'):
            link = link_tag.get('href')
            if link is not None and link.startswith("/wiki/"):  # only keep internal Wikipedia links
                allowed_flag = True
                for allowed_test in not_allowed_in_url_from_article:
                    if allowed_test in link:
                        allowed_flag = False
                        break
                if allowed_flag:
                    link = text_manip.ensure_UTF8(link)
                    mapped_link = "https://" + self.wiki_domain + link
                    if not link.startswith("/wiki/File:"):
                        self.article_links.append(mapped_link)
                    elif link.endswith('.ogg'):  # video or audio file
                        self.media_links.append(mapped_link)
                    else:  # image file
                        img_tag = link_tag.img
                        # print "\n\n"
                        # print "link_tag :", link_tag
                        # print "\timg_tag :", img_tag
                        if img_tag is not None:
                            direct_img_link = img_tag.get('src')
                            # print "\t\tdirect_img_link :", direct_img_link
                            self.img_links.append(direct_img_link)
                            self.direct_img_to_img_map[
                                direct_img_link] = mapped_link

        # De-duplicate the collected links:
        self.article_links = list(set(self.article_links))
        self.img_links = list(set(self.img_links))
        self.media_links = list(set(self.media_links))

        # Replace the /wiki/ links with absolute links. Preserve the old
        # HTML, and use replaced_html from now on:
        self.replaced_html = self.html
        for link_tag in soup.find_all('a'):
            link = link_tag.get('href')
            if link is not None and link.startswith("/wiki/"):
                mapped_link = "https://" + self.wiki_domain + link
                self.replaced_html = self.replaced_html.replace(
                    'href="%s' % link, 'href="%s' % mapped_link)
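# Usage sketch (assumption: this __init__ belongs to the Wikipedia article
# class built on HTML_page_Obj; the class name and URL are only illustrative):
#
#   wiki = Wikipedia_page_Obj("https://en.wikipedia.org/wiki/Python_(programming_language)")
#   print wiki.heading                  # page title, UTF-8
#   print len(wiki.article_links)       # absolute article links, de-duplicated
#   for direct, file_page in wiki.direct_img_to_img_map.items():
#       print direct, "->", file_page   # <img src> mapped to its /wiki/File: page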