예제 #1
0
def clean_content(content):
    """Apply the additional filters and cleanups to ``content``.

    Args:
        content: The text to clean, or None.

    Returns:
        The result of passing ``content`` through ``convertStrAscii`` and
        then ``ignoreHtmlEntity``. Returns None when ``content`` is None or
        when a ``UnicodeError`` is raised during the cleanup; in the latter
        case the error is printed instead of being propagated.
    """
    if content is None:
        return None

    try:
        # Encode to simple ascii format, then handle HTML entities
        # (behavior of both helpers is defined elsewhere in the project).
        content = convertStrAscii(content)
        content = ignoreHtmlEntity(content)
    except UnicodeError as e:
        # Best-effort cleanup: report the failure and signal it with None.
        # "except ... as" and print(...) are valid on Python 2.6+ and 3.
        print(e)
        return None
    return content
def clean_content(content):
	"""Apply the additional filters and cleanups to ``content``.

	Args:
		content: The text to clean, or None.

	Returns:
		The result of passing ``content`` through ``convertStrAscii`` and
		then ``ignoreHtmlEntity``. Returns None when ``content`` is None or
		when a ``UnicodeError`` is raised during the cleanup; in the latter
		case the error is printed instead of being propagated.
	"""
	if content is None:
		return None

	try:
		# Encode to simple ascii format, then handle HTML entities
		# (behavior of both helpers is defined elsewhere in the project).
		content = convertStrAscii(content)
		content = ignoreHtmlEntity(content)
	except UnicodeError as e:
		# Best-effort cleanup: report the failure and signal it with None.
		# "except ... as" and print(...) are valid on Python 2.6+ and 3.
		print(e)
		return None
	return content
예제 #3
0
def doc_ignore_content(soup):
    """Remove comments, scripts and styles from ``soup`` and return its text.

    ``soup`` is modified in place: every comment node and every ``script``
    and ``style`` tag is extracted from the tree before the text is read.

    Args:
        soup: A parsed document exposing the BeautifulSoup-style API used
            here (``findAll``, calling the object with a tag name, and
            ``extract`` on the returned nodes).

    Returns:
        A string made of the remaining text nodes whose stripped length is
        greater than one character, each passed through ``convertStrAscii``
        and joined with newlines.
    """
    # Drop comment nodes.
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()

    # Remove SCRIPT and STYLE tags. Each matched tag is extracted directly
    # rather than re-looking-up soup.script / soup.style, which would be
    # None (and raise AttributeError) if an earlier extraction had already
    # taken a nested match out of the tree.
    for tag_name in ("script", "style"):
        for tag in soup(tag_name):
            tag.extract()

    # Only extract text content; text nodes that strip down to zero or one
    # character are skipped.
    text_nodes = soup.findAll(text=True)
    cleaned = [convertStrAscii(node)
               for node in text_nodes if len(node.strip()) > 1]
    return '\n'.join(cleaned)
def doc_ignore_content(soup):
	"""Remove comments, scripts and styles from ``soup`` and return its text.

	``soup`` is modified in place: every comment node and every ``script``
	and ``style`` tag is extracted from the tree before the text is read.

	Args:
		soup: A parsed document exposing the BeautifulSoup-style API used
			here (``findAll``, calling the object with a tag name, and
			``extract`` on the returned nodes).

	Returns:
		A string made of the remaining text nodes whose stripped length is
		greater than one character, each passed through ``convertStrAscii``
		and joined with newlines.
	"""
	# Drop comment nodes.
	for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
		comment.extract()

	# Remove SCRIPT and STYLE tags. Each matched tag is extracted directly
	# rather than re-looking-up soup.script / soup.style, which would be
	# None (and raise AttributeError) if an earlier extraction had already
	# taken a nested match out of the tree.
	for tag_name in ("script", "style"):
		for tag in soup(tag_name):
			tag.extract()

	# Only extract text content; text nodes that strip down to zero or one
	# character are skipped.
	text_nodes = soup.findAll(text=True)
	cleaned = [convertStrAscii(node)
			for node in text_nodes if len(node.strip()) > 1]
	return '\n'.join(cleaned)