def clean_content(content):
    #*****************************************
    # Additional filters and cleanups
    #*****************************************
    """Encode *content* to simple ASCII and strip HTML entities.

    Parameters:
        content: the raw string to clean, or None.

    Returns:
        The cleaned string, or None when *content* is None or when the
        conversion raises UnicodeError (the error is printed, not raised,
        preserving the original best-effort behavior).
    """
    # Guard clause: the original fell through to an implicit None.
    if content is None:
        return None
    try:
        # Encode to simple ascii format.
        content = convertStrAscii(content)
        content = ignoreHtmlEntity(content)
        return content
    except UnicodeError as e:
        # Fixed Python-2-only syntax (`except UnicodeError, e` / `print e`);
        # `as e` and print-with-parens work on both Python 2.6+ and 3.
        print(e)
        return None
def doc_ignore_content(soup):
    """Extract readable text from a BeautifulSoup document.

    Removes HTML comments and all <script>/<style> tags from *soup*
    (mutating it in place), then returns the remaining text nodes —
    those with more than one non-whitespace character — converted via
    convertStrAscii and joined with newlines.

    Parameters:
        soup: a BeautifulSoup parse tree.

    Returns:
        A newline-joined string of the document's text content.
    """
    # Drop HTML comment nodes.
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()
    # Remove SCRIPT and STYLE tags. The original extracted soup.script /
    # soup.style on every pass instead of the loop variable, which only
    # worked by accident (extract() removes the first match each time);
    # extract the matched tag itself, and use plain loops rather than
    # list comprehensions run purely for side effects.
    for script in soup("script"):
        script.extract()
    for style in soup("style"):
        style.extract()
    # Only extract text content longer than one stripped character.
    txt_lst = [convertStrAscii(n) for n in soup.findAll(text=True)
               if len(n.strip()) > 1]
    return '\n'.join(txt_lst)
def doc_ignore_content(soup):
    """Extract readable text from a BeautifulSoup document.

    NOTE(review): this is a verbatim duplicate of an identical
    `doc_ignore_content` defined earlier in this file; at import time
    this later definition shadows the earlier one. Consider deleting
    one of the two.

    Strips HTML comments and all <script>/<style> tags from *soup*
    (in place), then returns the surviving text nodes — those whose
    stripped length exceeds one character — passed through
    convertStrAscii and joined with newlines.

    Parameters:
        soup: a BeautifulSoup parse tree.

    Returns:
        A newline-joined string of the document's text content.
    """
    # Attempt to extract (remove) comment nodes.
    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    for node in comments:
        node.extract()
    # Remove SCRIPT and STYLE tags. Fixed: the original ignored the loop
    # variable and extracted soup.script / soup.style each iteration,
    # inside list comprehensions used only for their side effects.
    for tag in soup("script"):
        tag.extract()
    for tag in soup("style"):
        tag.extract()
    # Only keep meaningful text content.
    kept = [convertStrAscii(n) for n in soup.findAll(text=True)
            if len(n.strip()) > 1]
    return '\n'.join(kept)