Example #1
    def _get_main_image_dict(self, html_string):
        """ Try to find the main image in the given html string """

        # Transform the html string into an lxml tree
        doc = build_doc(html_string)

        # Build list of img tags
        tags = []
        for t in self.tags(doc, 'img'):
            tags.append(t)

        # Convert image tags into informational dicts
        images = [self._img_tag_to_dict(t) for t in tags]

        # If we have no images we return an empty dict
        if not images:
            return {}

        # If there is only one image then we will use it as the main image
        if len(images) == 1:
            return images[0]

        # If we make it here then we have more than 1 image. We will return the
        # largest image.
        largest_pix_area = 0
        largest_image_dict = {}
        for i in images:
            if i['pix-area'] > largest_pix_area:
                largest_image_dict = i
                largest_pix_area = i['pix-area']

        return largest_image_dict
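
The _img_tag_to_dict helper called above is not shown. A minimal sketch of what it might look like, assuming the 'pix-area' value is derived from the tag's width and height attributes; the helper body and the 'src'/'width'/'height' keys are assumptions here, only 'pix-area' is actually read above.

    def _img_tag_to_dict(self, tag):
        """ Sketch: summarize an <img> element as a dict carrying a 'pix-area' key """

        def to_int(value):
            # Width/height attributes may be missing or non-numeric
            try:
                return int(value)
            except (TypeError, ValueError):
                return 0

        width = to_int(tag.get('width'))
        height = to_int(tag.get('height'))
        return {
            'src': tag.get('src', ''),
            'width': width,
            'height': height,
            'pix-area': width * height,
        }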
Example #2
    def _get_main_image_dict(self, html_string):
        """ Try to find the main image in the given html string """

        # Transform the html string into an lxml tree
        doc = build_doc(html_string)

        # Build list of img tags
        tags = []
        for t in self.tags(doc, 'img'):
            tags.append(t)

        # Convert image tags into informational dicts
        images = [self._img_tag_to_dict(t) for t in tags]

        # If we have no images we return an empty dict
        if not images:
            return {}

        # If there is only one image then we will use it as the main image
        if len(images) == 1:
            return images[0]

        # If we make it here then we have more than 1 image. We will return the
        # largest image.
        largest_pix_area = 0
        largest_image_dict = {}
        for i in images:
            if i['pix-area'] > largest_pix_area:
                largest_image_dict = i
                largest_pix_area = i['pix-area']

        return largest_image_dict
Example #3
 def _parse(self, input):
     doc, self.encoding = build_doc(input)
     doc = html_cleaner.clean_html(doc)
     base_href = self.options.get("url", None)
     if base_href:
         doc.make_links_absolute(base_href, resolve_base_href=True)
     else:
         doc.resolve_base_href()
     return doc
Example #4
 def _parse(self, input):
     doc, self.encoding = build_doc(input)
     doc = html_cleaner.clean_html(doc)
     base_href = self.options.get('url', None)
     if base_href:
         doc.make_links_absolute(base_href, resolve_base_href=True)
     else:
         doc.resolve_base_href()
     return doc
Example #5
	def _parse(self, input):
		doc = build_doc(input)
		doc = html_cleaner.clean_html(doc)
		base_href = self.options['url']
		if base_href:
			doc.make_links_absolute(base_href, resolve_base_href=True)
		else:
			doc.resolve_base_href()
		return doc
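
The three _parse() variants above rely on build_doc and html_cleaner, which in stock readability-lxml live in readability.htmls and readability.cleaners (newer versions of build_doc return a (doc, encoding) tuple; older ones return just the tree, which would explain why Example #5 unpacks nothing). A hedged sketch of the same parse/clean/absolutize sequence outside a class, assuming that package layout; the file name and URL are placeholders:

    # Sketch only: imports assume the stock readability-lxml layout; the
    # snippets on this page may come from forks with a different structure.
    from readability.cleaners import html_cleaner
    from readability.htmls import build_doc

    with open('page.html', 'rb') as f:
        raw = f.read()

    doc, encoding = build_doc(raw)          # older versions return only the tree
    doc = html_cleaner.clean_html(doc)      # strips scripts, styles, comments, etc.
    doc.make_links_absolute('http://example.com/article', resolve_base_href=True)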
Example #6
 def process_url(f,url,partial_html=False,keep_going=True):
     # Read the raw page and run readability's Document over it to extract the
     # cleaned-up main content.
     text_input = f.read()
     clean_doc = Document(text_input,
         debug=options.verbose,
         url=url)
     clean_doc.summary(partial_html=partial_html,encoding=enc)
     # Keep an uncleaned parse of the same page so the next-page link can still
     # be found after readability has stripped navigation elements.
     dirty_doc = build_doc(text_input)
     del text_input
     if keep_going:
         try:
             # Resolve the next-page link (if any) and graft that page's
             # processed content onto the end of the current document.
             next_page = urlparse.urljoin(url,find_next_page(dirty_doc)[0])
             if next_page not in visited_urls:
                 if options.verbose: sys.stderr.write(next_page+"\n")
                 if clean_doc.html.tag == 'html':
                     clean_doc.html[0][-1].append(process_url(urllib.urlopen(next_page),next_page,partial_html=True).html[0])
                 else:
                     clean_doc.html.append(process_url(urllib.urlopen(next_page),next_page,partial_html=True).html[0])
         except IndexError:
             # find_next_page() returned no candidate links.
             pass
         else:
             visited_urls.append(next_page)
     return clean_doc
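
Example #6 is Python 2 code (urlparse.urljoin, urllib.urlopen). For reference, a hedged sketch of the same next-page step against the Python 3 standard library; open_next_page is an invented helper name, and candidates stands in for the result of find_next_page(dirty_doc) above:

    import sys
    from urllib.parse import urljoin      # replaces urlparse.urljoin
    from urllib.request import urlopen    # replaces urllib.urlopen

    def open_next_page(url, candidates, visited_urls):
        """ Sketch: resolve and fetch the first next-page candidate, Python 3 style """
        # Raises IndexError when there are no candidates, mirroring the
        # try/except IndexError structure in the example above.
        next_page = urljoin(url, candidates[0])
        if next_page in visited_urls:
            return None
        sys.stderr.write(next_page + "\n")
        visited_urls.append(next_page)    # assumes visited_urls is a plain list
        return urlopen(next_page)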
Example #7
    def __init__(self, url, text=None, page=1, min_article_length=250, min_article_percentage=0.075):
        """
        :param url: the url of the document
        :param text: optionally the string value of the page may be passed in
        :param page: if this document is one of several pages making up an article, this should be set to its page number
        :param min_article_length: if an article is shorter than this number of characters, it is not treated as an article
        :param min_article_percentage: an article must make up at least this proportion of the text on the page
        """
        self.url = url
        self.page = page
        self._article = None
        self.min_article_length = min_article_length
        self.min_article_percentage = min_article_percentage

        if text:
            self.text = text
        else:
            self.text = requests.get(url).text

        # Parse the HTML and clean it up, removing elements we don't want to deal with (e.g., head, script, form)
        doc, self.encoding = build_doc(self.text)
        doc = html_cleaner.clean_html(doc)
        doc.make_links_absolute(self.url, resolve_base_href=True)
        self.html = doc
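
A hedged usage sketch for the constructor above; Article is an invented class name (the example does not show the class statement), and the attributes read back are the ones set in __init__:

    # 'Article' is an assumed name for the class whose __init__ is shown above.
    article = Article('http://example.com/story', min_article_length=250)
    print(article.encoding)   # character encoding detected by build_doc()
    print(article.html.tag)   # cleaned lxml tree with links made absolute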