def get_first_three(soup, table):
    """Linearize the first three columns of ``table`` into a title and a list.

    Only the first three ``td`` cells of every row are looked at (empty cells
    still count towards that limit).  Each non-empty cell is renamed to
    ``span``.  The very first non-empty cell of the whole table becomes the
    title; every other non-empty cell is moved into an ``li`` for its row,
    followed by ``' - '`` unless it is the third cell of the row.

    Args:
        soup: parser object handed to ``Tag`` when creating new elements.
        table: the table element whose rows are linearized; its cells are
            renamed and moved out of it.

    Returns:
        A new ``div`` tag holding the title ``span`` (when one was found)
        followed by a ``ul`` with one ``li`` per row that had text.
    """
    enclose = Tag(soup, "div")
    title_taken = False
    for tr in table.findAll("tr"):
        li = Tag(soup, "li")
        # Slicing keeps the original rule: stop after the third cell of a row.
        for position, td in enumerate(tr.findAll("td")[:3]):
            try:
                text = ''.join(td.findAll(text=True)).strip()
                if text:
                    td.name = "span"
                    if not title_taken:
                        title_taken = True
                        enclose.append(td)
                    else:
                        # No separator after the third (last considered) cell.
                        if position != 2:
                            td.append(' - ')
                        li.append(td)
            except Exception:
                # Best effort: a cell that cannot be processed is skipped, but
                # KeyboardInterrupt/SystemExit are no longer swallowed.
                pass
        if ''.join(li.findAll(text=True)) != '':
            enclose.append(li)
    # The title was appended to ``enclose`` before any ``li``, so it is the
    # first span found.  It is None when the table had no non-empty cell.
    title = enclose.find("span")
    enclose.name = "ul"
    div = Tag(soup, "div")
    if title is not None:
        # Take the title out of the list before re-attaching it above it.
        title.replaceWith("")
        div.append(title)
    div.append(enclose)
    return div
def get_first_three(soup, table):
    """Linearize the first three columns of ``table`` into a title and a list.

    Only the first three ``td`` cells of every row are looked at (empty cells
    still count towards that limit).  Each non-empty cell is renamed to
    ``span``.  The very first non-empty cell of the whole table becomes the
    title; every other non-empty cell is moved into an ``li`` for its row,
    followed by ``' - '`` unless it is the third cell of the row.

    Args:
        soup: parser object handed to ``Tag`` when creating new elements.
        table: the table element whose rows are linearized; its cells are
            renamed and moved out of it.

    Returns:
        A new ``div`` tag holding the title ``span`` (when one was found)
        followed by a ``ul`` with one ``li`` per row that had text.
    """
    enclose = Tag(soup, "div")
    title_taken = False
    for tr in table.findAll("tr"):
        li = Tag(soup, "li")
        # Slicing keeps the original rule: stop after the third cell of a row.
        for position, td in enumerate(tr.findAll("td")[:3]):
            try:
                text = ''.join(td.findAll(text=True)).strip()
                if text:
                    td.name = "span"
                    if not title_taken:
                        title_taken = True
                        enclose.append(td)
                    else:
                        # No separator after the third (last considered) cell.
                        if position != 2:
                            td.append(' - ')
                        li.append(td)
            except Exception:
                # Best effort: a cell that cannot be processed is skipped, but
                # KeyboardInterrupt/SystemExit are no longer swallowed.
                pass
        if ''.join(li.findAll(text=True)) != '':
            enclose.append(li)
    # The title was appended to ``enclose`` before any ``li``, so it is the
    # first span found.  It is None when the table had no non-empty cell.
    title = enclose.find("span")
    enclose.name = "ul"
    div = Tag(soup, "div")
    if title is not None:
        # Take the title out of the list before re-attaching it above it.
        title.replaceWith("")
        div.append(title)
    div.append(enclose)
    return div
def linearize_rows_1_cols(soup, table):
    """Replace a ``linearize-rows-1-cols`` table with a ``ul`` of its rows.

    Does nothing unless ``table`` has ``id="linearize-rows-1-cols"``.
    Otherwise every row becomes an ``li``: each cell (and every ``p`` inside
    it) is renamed to ``span``, the first cell of the row is wrapped in
    ``<b>`` and every following cell is surrounded by ``[`` / ``]`` spans.
    The resulting list carries ``class="center"`` and replaces the table in
    the document.

    Args:
        soup: parser object handed to ``Tag`` when creating new elements.
        table: the table element to inspect and, if it matches, replace.

    Returns:
        None; the document is modified in place.
    """
    if table.get('id') != "linearize-rows-1-cols":
        return
    listing = Tag(soup, "div")
    listing["class"] = "center"
    for tr in table.findAll("tr"):
        li = Tag(soup, "li")
        for position, td in enumerate(tr.findAll("td")):
            for p in td.findAll("p"):
                p.name = "span"
            td.name = "span"
            # Decide by position, not by ``td == first_cell``: tag equality
            # compares markup, so a later cell with the same content as the
            # first one used to be bolded as well.
            if position == 0:
                fragment = BeautifulSoup('<b>' + td.prettify() + '</b>')
            else:
                fragment = BeautifulSoup('<span>[</span>' + td.prettify() + '<span>]</span>')
            li.append(fragment)
        listing.append(li)
    listing.name = "ul"
    table.replaceWith(listing)
Exemplo n.º 4
0
def linearize_rows_1_cols(soup, table):
    """Replace a ``linearize-rows-1-cols`` table with a ``ul`` of its rows.

    Does nothing unless ``table`` has ``id="linearize-rows-1-cols"``.
    Otherwise every row becomes an ``li``: each cell (and every ``p`` inside
    it) is renamed to ``span``, the first cell of the row is wrapped in
    ``<b>`` and every following cell is surrounded by ``[`` / ``]`` spans.
    The resulting list carries ``class="center"`` and replaces the table in
    the document.

    Args:
        soup: parser object handed to ``Tag`` when creating new elements.
        table: the table element to inspect and, if it matches, replace.

    Returns:
        None; the document is modified in place.
    """
    if table.get('id') != "linearize-rows-1-cols":
        return
    listing = Tag(soup, "div")
    listing["class"] = "center"
    for tr in table.findAll("tr"):
        li = Tag(soup, "li")
        for position, td in enumerate(tr.findAll("td")):
            for p in td.findAll("p"):
                p.name = "span"
            td.name = "span"
            # Decide by position, not by ``td == first_cell``: tag equality
            # compares markup, so a later cell with the same content as the
            # first one used to be bolded as well.
            if position == 0:
                fragment = BeautifulSoup('<b>' + td.prettify() + '</b>')
            else:
                fragment = BeautifulSoup('<span>[</span>' + td.prettify() + '<span>]</span>')
            li.append(fragment)
        listing.append(li)
    listing.name = "ul"
    table.replaceWith(listing)
Exemplo n.º 5
0
	def parse_html(self):
		"""Clean up ``self.html`` and derive the page title and body from it.

		Unless ``self.debug_path`` is set to some other page's path, the body
		markup is run through a sequence of clean-up passes (comments, header
		divs, layout tables, loose strings, deprecated attributes, images,
		attachments, line breaks, paragraphs, headings, internal links) and
		then re-serialized with normalized whitespace.  The soup is re-parsed
		from its own ``prettify()`` output after most passes so that tags
		marked ``hidden`` actually disappear from the tree.

		The title is picked from, in order of preference: the navigation
		include file, the shorter of content ``h1`` and ``<title>``, the
		``<title>``, the content ``h1``, or the capitalized slug.

		Reads ``self.html``, ``self.debug_path``, ``self.path``,
		``self.source_dir``, ``self.source_path``, ``self.slug`` and
		``self.level``.  Sets ``self.attachments``, ``self.title``,
		``self.body`` and, for some top-level pages, ``self.template``.
		Also rewrites ``self.html`` (nbsp and closing-tag fix-ups) and saves
		an ``Image`` for every local image that exists on disk.

		Raises:
			ImportError: when no body or no title could be determined.
			SystemExit: when the final body cannot be encoded as UTF-8.
		"""
		title = None
		body = None
		bodysoup = None

		nav_title = None
		meta_title = None
		content_title = None

		self.attachments = []

		if (not self.debug_path or self.debug_path == self.path):

			# Remove nbsps
			self.html = self.html.replace("&nbsp;", "")

			# Remove attributes from closing tags (!)
			self.html = re.sub(r"</([a-zA-Z]+) [^>]+>", r"</\1>", self.html)

			m = RE_BODY.search(self.html)
			if m and m.lastindex == 1:
				bodysoup = BeautifulSoup(m.group(1), convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
			else:
				try:
					bodysoup = BeautifulSoup(self.html, convertEntities=BeautifulStoneSoup.HTML_ENTITIES).html.body
					if not bodysoup:
						# No <body> tag: open one right after </head> and retry
						fixed_html = self.html.replace("</head>", "</head><body>")
						bodysoup = BeautifulSoup(fixed_html, convertEntities=BeautifulStoneSoup.HTML_ENTITIES).html.body
				except AttributeError:
					# No <html> element at all; reported as "No body" below
					pass

			if not bodysoup:
				raise ImportError("No body")

			if self.debug_path == self.path:
				# Single-argument parenthesised form prints the same on Python 2
				print("\n\n========= DEBUG =========\n")

			# Remove comments
			for comment in (bodysoup.findAll(text=lambda text:isinstance(text, Comment))):
				self.debug("Removed comment: <!-- %s -->" % comment)
				comment.extract()
			bodysoup = BeautifulSoup(bodysoup.prettify())

			# Convert header divs into h1, h2
			h1_found = False
			for tag in bodysoup.findAll("div"):
				if tag.get("class") == "BL-otsikko1" and not h1_found:
					h1_found = True
					tag.name = "h1"
					self.debug("Converted into H1: %s" % tag)
				elif tag.get("class") == "BL-otsikko2" or \
					(tag.get("class") == "BL-otsikko1" and h1_found):
					# Only the first BL-otsikko1 becomes h1; later ones are h2
					tag.name = "h2"
					self.debug("Converted into H2: %s" % tag)
				elif tag.get("class") == "BL-leivanmurut":
					tag.extract()
					self.debug("Removed breadcrumbs")
				else:
					tag.hidden = True
			bodysoup = BeautifulSoup(bodysoup.prettify())

			# Remove unwanted elements
			for tag in bodysoup.findAll(["style", "link"]):
				tag.extract()
				self.debug("Removed %s" % tag)
			bodysoup = BeautifulSoup(bodysoup.prettify())

			# Hide unnecessary elements
			for tag in bodysoup.findAll(["span", "div", "body", "font"]):
				self.debug("Set hidden: %s" % tag.name)
				tag.hidden = True
			bodysoup = BeautifulSoup(bodysoup.prettify())

			# Reformat forms (placeholder: nothing is done to forms yet)
			for form in bodysoup.findAll("form"):
				pass

			# Hide non-semantic tables
			for table in bodysoup.findAll("table"):
				if table.get("border") != "1" and len(table.findAll("tr", recursive=False)) < 100 and not has_ancestor(table, "form"):
					table.hidden = True
					for tr in table.findAll("tr", recursive=False):
						tr.hidden = True
						for td in tr.findAll(["td", "th"], recursive=False):
							text = td.find(text=re.compile(r"[^\s]+", re.U))
							if text and text.strip() != "":
								td.name = "p"
								self.debug("Converted th/td into p: %s" % td)
							else:
								td.hidden = True
					self.debug("Hid non-semantic table")
			bodysoup = BeautifulSoup(bodysoup.prettify())

			# Remove orphan th/td/tr
			# NOTE(review): with "or", any cell whose direct parent is a tr
			# already counts as an orphan - confirm whether "and" was intended.
			for el in bodysoup.findAll(["tr", "td", "th"]):
				if el.parent.name != "table" or (el.parent.parent and el.parent.parent.name != "table"):
					# Was ("td", "td"): th cells with text were hidden instead
					# of converted, contradicting the debug message below.
					if el.name in ("td", "th"):
						text = el.find(text=re.compile(r"[^\s]+", re.U))
						if text and text.strip() != "":
							el.name = "p"
							self.debug("Converted td/th into p: %s" % el)
						else:
							el.hidden = True
							self.debug("Hid orphan %s: %s" % (el.name, el))
					else:
						el.hidden = True
						self.debug("Hid orphan %s: %s" % (el.name, el))
			bodysoup = BeautifulSoup(bodysoup.prettify())

			# Wrap NavigableStrings in td into p
			for tag in bodysoup.findAll(text=lambda text:isinstance(text, NavigableString)):
				if tag.parent.name == "td" and tag.strip() != "":
					p = Tag(bodysoup, "p")
					p.insert(0, "%s" % tag)
					tag.replaceWith(p)
					self.debug("Moved from td into p: %s" % tag)
			bodysoup = BeautifulSoup(bodysoup.prettify())

			# Convert "loose" NavigableStrings into paragraphs
			for tag in bodysoup.findAll(text=lambda text:isinstance(text, NavigableString)):
				if len(tag.strip()) > 10 and tag.parent.name == "[document]":
					p = Tag(bodysoup, "p")
					p.insert(0, "%s" % tag)
					tag.replaceWith(p)
					self.debug("Moved loose string into p: %s" % tag)
			bodysoup = BeautifulSoup(bodysoup.prettify())

			# Move NavigableStrings after list into p before moving lists
			for ul in bodysoup.findAll("ul"):
				if ul.parent.name == "p":
					following = ul.nextSibling
					if isinstance(following, NavigableString):
						p = Tag(bodysoup, "p")
						p.insert(0, "%s" % following)
						following.replaceWith(p)
						self.debug("Moved NavigableString after list into p: %s" % p)
			bodysoup = BeautifulSoup(bodysoup.prettify())

			# Move blocks outside paragraphs
			for block in bodysoup.findAll(["p", "ul", "h1", "h2"]):
				parent = block.parent
				if parent.name == "p":
					if block.name in ("h1", "h2"):
						parent.parent.insert(parent.parent.index(parent), block)
						self.debug("Moved %s before p" % block.name)
					else:
						parent.parent.insert(parent.parent.index(parent) + 1, block)
						self.debug("Moved %s after p" % block.name)

			# Delete deprecated attributes
			for tag in bodysoup.findAll():
				for attr in ("align", "valign", "class", "style", "border", "vspace", "hspace", "cellpadding", "cellspacing"):
					del tag[attr]
				# Dimensions are kept on images only
				for attr in ("width", "height"):
					if tag.name != "img":
						del tag[attr]
				# Spans are kept on td/tr only
				for attr in ("colspan", "rowspan"):
					if not tag.name in ("td", "tr"):
						del tag[attr]
			bodysoup = BeautifulSoup(bodysoup.prettify())

			# Import images
			for tag in bodysoup.findAll("img"):
				src = tag.get("src")
				if src and src.endswith(".gif") and src.find("/tyhja-") != -1:
					# Drop gif images whose path contains "/tyhja-"
					tag.extract()
				elif src and not src.startswith("http://"):
					img_path = os.path.dirname(os.path.join(self.source_dir, self.source_path)) + "/" + src
					if os.path.exists(img_path) and os.path.isfile(img_path):
						img_title = tag.get("title") or tag.get("alt") or ""
						# Title is cut to its first 100 characters
						img = Image.create_from_file(img_path, img_title[0:100])
						img.tmp_orig_path = src
						img.save()
						tag["src"] = img.file.url
						self.debug("Imported image: %s" % tag["src"])
			bodysoup = BeautifulSoup(bodysoup.prettify())

			# Import external files into Attachment models
			for a in bodysoup.findAll("a", href=re.compile(".+")):
				href = a.get("href")
				path, ext = os.path.splitext(href)
				if not href.startswith("http://") and ext != "" and not ext in (".html", ".shtml", ".php", ".jpg", ".gif", ".png"):
					if href.startswith("/"):
						abspath = self.source_dir + href
					else:
						abspath = self.source_dir + os.path.dirname(self.source_path) + "/" + href
					if os.path.exists(abspath):
						self.debug("Found attachment: %s" % abspath)
						self.attachments.append(abspath) # store for later import as we don't have page id yet

			# Remove bad linebreaks
			for br in bodysoup.findAll("br"):
				if br.parent.name == "p":
					# A br at the edge of a paragraph (nothing or only
					# whitespace on one side) is removed
					for sib in (br.previousSibling, br.nextSibling):
						if not sib or (isinstance(sib, NavigableString) and sib.strip() == ""):
							self.debug("Removed linebreak at (%s, %s)" % (br.parent.index(br), br.parent))
							br.extract()
							break
				elif br.parent.name == "[document]":
					br.extract()
			bodysoup = BeautifulSoup(bodysoup.prettify())

			# Clean up paragraphs
			# A "non-sentence" is a string of 4..29 characters whose stripped
			# form does not end with a full stop.
			non_sentence = lambda s: s is not None and not s.strip().endswith(".") and 3 < len(s) < 30
			for p in bodysoup.findAll("p"):
				# Remove empty
				if p.string and p.string.strip() == "":
					self.debug("Removed empty p at (%s, %s)" % (p.parent.name, p.parent.index(p)))
					p.extract()
				# Hide if contains only tag(s)
				elif not p.findAll(text=re.compile(r"[^\s]+", re.U)):
					self.debug("Hid p with no text: %s" % p)
					p.hidden = True
				# Convert short one-liners into h3
				elif non_sentence(p.string) or (len(p.findAll(text=re.compile(r"[^\s]+", re.U))) == 1 and non_sentence(p.contents[0].string)):
					p.name = "h3"
					self.debug("Converted p into h3: %s" % p)
				# Remove bad styling
				else:
					tags = p.findAll(recursive=False)
					if len(tags) == 1 and tags[0].name in ("b", "u", "i"):
						# Only when the paragraph has no text of its own
						# outside that single styling tag
						if not p.findAll(text=re.compile(r"[^\s]+", re.U), recursive=False):
							tags[0].hidden = True
							self.debug("Hid %s from p, bad styling: %s" % (tags[0].name, p))
			bodysoup = BeautifulSoup(bodysoup.prettify())

			# Remove redundant information
			for text in bodysoup.findAll(text=re.compile(r"^\s*pdf-tiedosto [0-9]+ KB\s+$")):
				self.debug("Removed text: %s" % text)
				text.extract()
			bodysoup = BeautifulSoup(bodysoup.prettify())

			# Clean up headings
			for h in bodysoup.findAll(["h1", "h2", "h3", "h4", "h5", "h6"]):
				for el in h.findAll():
					# Remove styling elements (u, b, i, etc)
					if isinstance(el, Tag) and el.name != "a":
						el.hidden = True
						self.debug("Heading clean-up, hid %s in %s" % (el.name, h))
				try:
					# Move h1 at first
					if h.name == "h1" and h.parent.index(h) != 1:
						h.parent.insert(1, h)
						self.debug("Moved %s at first" % h.name)
					# Convert any heading at the beginning of document into h1
					elif h.name != "h1" and h.parent.name == "[document]" and not h.previousSibling:
						self.debug("Converted into h1: %s" % h)
						h.name = "h1"
				except IndexError:
					pass
			bodysoup = BeautifulSoup(bodysoup.prettify())

			# Convert internal links
			for a in bodysoup.findAll("a"):
				href = a.get("href")
				if href and not href.startswith("http://") and href.endswith(".shtml"):
					a["href"] = href.replace("/index.shtml", "").replace(".shtml", "").replace("_", "-")
					self.debug("Fixed link: %s -> %s" % (href, a["href"]))
			bodysoup = BeautifulSoup(bodysoup.prettify())

			# Parse content_title text
			h1 = bodysoup.find("h1")
			if h1:
				content_title = " ".join(h1.findAll(text=True))
				content_title = re.sub(r"[\s]+", " ", content_title).strip()

			# Reformat
			body = u"" + bodysoup.prettify().decode("UTF8")
			# Escape free-standing angle brackets (those surrounded by whitespace)
			body = re.sub(r"\s+>\s+", " &gt; ", body)
			body = re.sub(r"\s+<\s+", " &lt; ", body)
			# Collapse all whitespace, then strip it around tags
			body = re.sub(r"[\n\r]+", " ", body)
			body = re.sub(r"[ \t]+", " ", body)
			body = re.sub(r">\s+", ">", body)
			body = re.sub(r"\s+<", "<", body)
			# Re-insert line breaks after block-level closing tags
			body = re.sub(r"</(p|h1|h2|h3|h4|h5|h6|ul|ol|table|tr)>", r"</\1>\n\n", body)
			body = re.sub(r"</(li|td)>", r"</\1>\n", body)
			# Put spaces back around inline styling tags and links
			body = re.sub(r"<(u|b|i|em|strong)>\s*", r" <\1>", body)
			body = re.sub(r"\s*</(u|b|i|em|strong)>", r"</\1> ", body)
			body = re.sub(r"</a>([^\-])", r"</a> \1", body)
			body = re.sub(r"<a ", " <a ", body)
			# No whitespace before punctuation
			body = re.sub(r"\s+(\.|,|:|;|!|\?)", r"\1", body)

			# Is body valid UTF8?
			try:
				body.encode("UTF8")
			except UnicodeError:
				print("DAA")
				raise SystemExit()

		else:
			body = "(debug mode, no content parsed)"


		docsoup = BeautifulSoup(self.html, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)

		# nav title
		for text in docsoup.findAll(text=re.compile("^#include")):
			m = re.compile('"(.+)"').search(text)
			if m and m.lastindex == 1:
				include_path = m.group(1)
				if include_path.find("valikko") != -1:
					if not include_path.startswith("/"):
						# Resolve a relative include against the page's directory
						if self.source_path.endswith("/index.shtml"):
							include_path = os.path.join(self.path, include_path)
						else:
							include_path = os.path.join("/".join(self.path.split("/")[0:-1]), include_path)
					f = open(self.source_dir + include_path)
					try:
						# Close the handle even when reading fails
						nav_html = f.read()
					finally:
						f.close()
					navsoup = BeautifulSoup(nav_html, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
					for a in navsoup.findAll("a", href="/"+self.source_path):
						if a.get("class") == "valikon_tekstit" or a.parent.get("class") == "avattu_alavalikko":
							nav_title = a.find(text=re.compile(r"[^\s]+")).strip()
							break

		# meta title
		try:
			meta_title = docsoup.head.title.string
		except AttributeError:
			# No <head> or no <title>
			pass
		if meta_title:
			# Drop the site-wide parts of a " - " separated title
			valid_meta_title_parts = []
			for part in [part.strip() for part in meta_title.split(" - ")]:
				if part not in ("BirdLife Suomi", u"Yhdessä lintujen puolesta"):
					valid_meta_title_parts.append(part)
			meta_title = u" – ".join(valid_meta_title_parts)

		# choose best title
		self.debug("Titles: nav: '%s', meta: '%s', content: '%s'" % (nav_title, meta_title, content_title))
		if nav_title:
			title = nav_title
			self.debug("Title choice: nav_title: %s" % title)
		elif content_title and meta_title and len(content_title) < len(meta_title):
			title = content_title
			self.debug("Title choice: content_title (shorter): %s" % title)
		elif meta_title:
			title = meta_title
			self.debug("Title choice: meta_title: %s" % title)
		elif content_title:
			title = content_title
			self.debug("Title choice: content_title: %s" % title)
		else:
			title = "%s (autogen)" % self.slug.capitalize()
			self.debug("Title choice: autogenerated from slug: %s" % title)

		if not title:
			raise ImportError("No title")

		# Top-level pages get a dedicated template
		if self.level == 0:
			if self.slug in ("liity", "suojelu", "lintuharrastus", "julkaisut", "yhdistys"):
				self.template = "osio.html"
		if self.level == 0 and self.slug == "":
			self.template = "etusivu.html"

		self.title = title
		self.body = body