def _get_main_image_dict(self, html_string):
    """ Try to find the main image in the given html string """
    # Transform the html string into an lxml tree
    doc = build_doc(html_string)

    # Build list of img tags
    tags = list(self.tags(doc, 'img'))

    # Convert image tags into informational dicts
    images = [self._img_tag_to_dict(t) for t in tags]

    # If we have no images we return an empty dict
    if not images:
        return {}

    # If there is only one image then we will use it as the main image
    if len(images) == 1:
        return images[0]

    # If we make it here then we have more than 1 image. We will return the
    # largest image.
    largest_pix_area = 0
    largest_image_dict = {}
    for i in images:
        if i['pix-area'] > largest_pix_area:
            largest_image_dict = i
            largest_pix_area = i['pix-area']

    return largest_image_dict
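# The helper _img_tag_to_dict is not shown here. A minimal sketch of what it
# might look like, assuming it reads the width/height attributes off the tag
# and exposes the 'pix-area' key the selection loop above relies on; the
# attribute parsing and the default size of 0 are assumptions, not the
# original implementation:
import lxml.html

def _img_tag_to_dict(tag):
    def _to_int(value):
        try:
            return int(value)
        except (TypeError, ValueError):
            return 0

    width = _to_int(tag.get('width'))
    height = _to_int(tag.get('height'))
    return {
        'src': tag.get('src'),
        'width': width,
        'height': height,
        # 'pix-area' drives the "largest image wins" rule above
        'pix-area': width * height,
    }

# e.g. _img_tag_to_dict(lxml.html.fromstring('<img src="a.png" width="640" height="480"/>'))
# -> {'src': 'a.png', 'width': 640, 'height': 480, 'pix-area': 307200}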
def _parse(self, input):
    doc, self.encoding = build_doc(input)
    doc = html_cleaner.clean_html(doc)
    base_href = self.options.get("url", None)
    if base_href:
        doc.make_links_absolute(base_href, resolve_base_href=True)
    else:
        doc.resolve_base_href()
    return doc
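# The _parse above depends on build_doc returning both the tree and the
# detected encoding (the other snippets in this section call a variant that
# returns only the tree). A minimal sketch of the tuple-returning form,
# assuming it wraps lxml.html and that the input is already a decoded
# string; the real readability-lxml helper also handles byte input and
# charset detection, which this sketch skips:
import lxml.html

def build_doc(page):
    # Unicode input needs no charset sniffing; report utf-8 as a stand-in.
    doc = lxml.html.document_fromstring(page)
    return doc, 'utf-8'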
def _parse(self, input):
    doc = build_doc(input)
    doc = html_cleaner.clean_html(doc)
    # Use .get() so a missing 'url' option falls back to resolving the
    # document's own <base href> instead of raising KeyError.
    base_href = self.options.get('url')
    if base_href:
        doc.make_links_absolute(base_href, resolve_base_href=True)
    else:
        doc.resolve_base_href()
    return doc
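# Both _parse variants (and the constructor at the end of this section) run
# the tree through html_cleaner.clean_html before touching links. A plausible
# definition, assuming it is an lxml Cleaner along the lines of the one
# readability-lxml ships; the exact flag set here is a guess:
from lxml.html.clean import Cleaner

html_cleaner = Cleaner(
    scripts=True,         # drop <script> elements
    javascript=True,      # strip on* attributes and javascript: links
    comments=True,        # drop HTML comments
    style=True,           # drop <style> elements and style attributes
    forms=True,           # drop <form> elements
    page_structure=False, # keep html/head/body so callers can walk the tree
)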
import sys
import urllib
import urlparse

# Assumes module-level globals from the surrounding script: options, enc,
# visited_urls (a set), plus Document, build_doc and find_next_page.
def process_url(f, url, partial_html=False, keep_going=True):
    text_input = f.read()
    clean_doc = Document(text_input, debug=options.verbose, url=url)
    clean_doc.summary(partial_html=partial_html, encoding=enc)
    dirty_doc = build_doc(text_input)
    del text_input
    if keep_going:
        try:
            # find_next_page(...)[0] raises IndexError when there is no next
            # page link, which ends the pagination walk.
            next_page = urlparse.urljoin(url, find_next_page(dirty_doc)[0])
            if next_page not in visited_urls:
                if options.verbose:
                    sys.stderr.write(next_page + "\n")
                # Graft the next page's cleaned body onto this document so
                # multi-page articles come back as a single tree.
                if clean_doc.html.tag == 'html':
                    clean_doc.html[0][-1].append(
                        process_url(urllib.urlopen(next_page), next_page,
                                    partial_html=True).html[0])
                else:
                    clean_doc.html.append(
                        process_url(urllib.urlopen(next_page), next_page,
                                    partial_html=True).html[0])
        except IndexError:
            pass
        else:
            # The original visited_urls.insert([next_page]) was a bug:
            # list.insert needs an index, and sets have no insert. Assuming
            # visited_urls is a set, add() is the intended call.
            visited_urls.add(next_page)
    return clean_doc
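# A minimal driver for process_url, assuming the module-level names the
# function expects; visited_urls, enc and the options object below are
# stand-ins for whatever the surrounding script actually defines (Python 2,
# to match the urllib.urlopen/urlparse calls above):
import sys
import urllib
import lxml.html

visited_urls = set()
enc = 'utf-8'

class options(object):
    verbose = False  # stand-in for the script's optparse options

if __name__ == '__main__':
    url = sys.argv[1]
    doc = process_url(urllib.urlopen(url), url)
    # doc.html is the cleaned lxml tree with any follow-on pages grafted in
    sys.stdout.write(lxml.html.tostring(doc.html))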
def __init__(self, url, text=None, page=1, min_article_length=250,
             min_article_percentage=0.075):
    """
    :param url: the url of the document
    :param text: optionally, the string value of the page may be passed in
    :param page: if this is one in a series of documents in an article,
        set this to the page number
    :param min_article_length: a candidate shorter than this many
        characters is not considered an article
    :param min_article_percentage: an article must make up at least this
        fraction of the text on the page
    """
    self.url = url
    self.page = page
    self._article = None
    self.min_article_length = min_article_length
    self.min_article_percentage = min_article_percentage
    if text:
        self.text = text
    else:
        self.text = requests.get(url).text

    # Parse the HTML and clean it up, removing elements we don't want to
    # deal with (e.g. head, script, form).
    doc, self.encoding = build_doc(self.text)
    doc = html_cleaner.clean_html(doc)
    doc.make_links_absolute(self.url, resolve_base_href=True)
    self.html = doc
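# Hypothetical usage of the constructor above; 'Article' is a stand-in name
# for the class that owns this __init__, and the URL is a placeholder:
article = Article('http://example.com/story?page=2', page=2)
print(article.encoding)   # encoding reported by build_doc
print(article.html.tag)   # root of the cleaned, link-absolutized lxml tree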