import codecs
import os
import re
from collections import defaultdict

from bs4 import BeautifulSoup, Comment
from html.parser import HTMLParser  # note: unescape() was removed from HTMLParser in Python 3.9; html.unescape() is the modern replacement


def parse_html(d):
    """Tokenise the files WEBPAGES_RAW/<d>/0 .. <d>/499 and build an inverted
    index mapping each word to {"<d>/<f>": occurrence count}."""
    folder_dict = defaultdict(dict)
    for f in range(0, 500):
        #print("In file " + str(f))
        key = str(d) + "/" + str(f)
        file = "WEBPAGES_RAW/" + key
        if os.path.isfile(file):
            with codecs.open(file, "r", encoding="utf-8") as data:
                data = data.read().encode("ascii", "ignore")
                soup = BeautifulSoup(data, "html.parser")
                # Strip comments, scripts and styles before pulling out the text.
                for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
                    comment.extract()
                for script in soup("script"):
                    script.extract()
                for style in soup("style"):
                    style.extract()
                content = " ".join(item.strip() for item in soup.findAll(text=True))
                content = HTMLParser().unescape(content)
                content = content.encode("ascii", "ignore").decode("utf-8")
                # Collapse every run of non-alphanumerics to a space, lowercase, split.
                pattern = re.compile(r'[\W_]+')
                content = pattern.sub(' ', content).lower().split()
                for word in content:
                    if len(word) > 1:
                        if word not in folder_dict:
                            folder_dict[word] = defaultdict(int)
                        folder_dict[word][key] += 1
    return folder_dict
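A minimal usage sketch for parse_html(), assuming a WEBPAGES_RAW/0/ folder of raw pages exists on disk; the word "informatics" and the counts shown are only illustrative:

# Build the inverted index for folder "0" and inspect one posting list.
index = parse_html("0")
print(index.get("informatics", {}))   # illustrative shape: {"0/12": 3, "0/37": 1}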
def extract(self, url, html):
    '''Return (title, content) extracted from the page at `url`.
    Method of the same extractor class as get_main_block() below; assumes
    lxml.etree (as etree) and HTMLParser are imported.'''
    title, node = self.get_main_block(url, html)
    if node is None:
        print('\tno main block got !!!!!', url)
        # Return the same arity as the success path so callers can unpack safely.
        return title, ''
    # Debug only: serialise the main block and unescape its entities.
    text = etree.tostring(node)
    text = HTMLParser().unescape(text.decode())
    print(text)
    content = self.get_text(node)
    return title, content
import six
from html.parser import HTMLParser
from lxml.html import fragment_fromstring


def as_plain_text(value):
    """Unescape HTML entities in `value` and return the text of its top-level
    text nodes, joined with spaces."""
    if isinstance(value, six.binary_type):
        # Decode bytes first so unescape() always receives text.
        value = value.decode('utf-8')
    value = HTMLParser().unescape(value)
    value = "<div>%s</div>" % value
    el = fragment_fromstring(value)
    texts = el.xpath('text()')
    return ' '.join(texts)
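A quick illustration of what as_plain_text() returns, assuming the imports above; the input string is made up for the example:

# The entities are unescaped, the value is wrapped in a <div>, and only the
# direct text nodes are kept, so text nested in <b> is dropped.
print(as_plain_text("Tom &amp; Jerry <b>fight</b> again"))
# -> roughly "Tom & Jerry   again"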
async def insert_content(url: str, chapter_id: int) -> bool:
    """Fetch a chapter page, extract its body with the source's XPath, and store it.
    Assumes: import aiohttp; from lxml import etree, html; HTMLParser; and the
    Django models ChapterLists, BookSource and ChapterInfo."""
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            try:
                con = await resp.text()
            except UnicodeDecodeError:
                # Fall back to GBK for pages whose declared charset is wrong.
                con = await resp.text(encoding='gbk')
    data = etree.HTML(con)
    chapter = ChapterLists.objects.get(pk=chapter_id)
    source = BookSource.objects.get(pk=chapter.source_id)
    # source.book_chapter_info holds the XPath of the chapter-body node.
    contents = data.xpath(source.book_chapter_info)[0]
    contents = html.tostring(contents)  # lxml.html serialisation (bytes)
    contents = HTMLParser().unescape(contents.decode())
    words_number = len(contents)
    results = ChapterInfo.objects.create(content=contents,
                                         words_number=words_number,
                                         chapter_id=chapter_id)
    if results:
        return True
    else:
        return False
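A driver sketch for insert_content(), assuming Django settings are already configured; the URL and chapter id are hypothetical. Note that recent Django versions raise SynchronousOnlyOperation for the ORM calls above unless DJANGO_ALLOW_ASYNC_UNSAFE is set or they are wrapped with sync_to_async:

import asyncio

# Hypothetical URL and chapter id, for illustration only.
ok = asyncio.run(insert_content("https://example.com/book/1/chapter/1.html", chapter_id=1))
print("stored" if ok else "failed")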
def get_main_block(self, url, html, short_title=True):
    '''Return (title, etree_of_main_content_block).
    Method of the extractor class used by extract() above; assumes cchardet,
    traceback, lxml, lxml.html, lxml.etree (as etree), HtmlComment and
    HTMLParser are imported.'''
    if isinstance(html, bytes):
        encoding = cchardet.detect(html)['encoding']
        if encoding is None:
            return None, None
        html = html.decode(encoding, 'ignore')
    try:
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(base_url=url)
    except Exception:
        traceback.print_exc()
        return None, None
    self.title = self.get_title(doc)
    if short_title:
        self.title = self.shorten_title(self.title)
    body = doc.xpath('//body')
    # Guard before indexing body[0]; some pages have no <body> at all.
    if not body:
        return self.title, None
    # Debug only: dump the unescaped body markup.
    text = etree.tostring(body[0])
    text = HTMLParser().unescape(text.decode())
    print(text)
    # Breadth-first walk of the body, scoring candidate blocks by weighted text length.
    candidates = []
    nodes = body[0].getchildren()
    while nodes:
        node = nodes.pop(0)
        children = node.getchildren()
        tlen = 0
        for child in children:
            # Debug only: dump each child as unescaped markup.
            text1 = etree.tostring(child)
            text1 = HTMLParser().unescape(text1.decode())
            print(text1)
            if isinstance(child, HtmlComment):
                continue
            if child.tag in self.non_content_tag:
                continue
            if child.tag == 'a':
                continue
            if child.tag == 'textarea':
                # FIXME: this tag is only part of content?
                continue
            attr = '%s%s%s' % (child.get('class', ''),
                               child.get('id', ''),
                               child.get('style', ''))
            if 'display' in attr and 'none' in attr:
                continue
            nodes.append(child)
            # Paragraph text counts three times as much as other tags.
            if child.tag == 'p':
                weight = 3
            else:
                weight = 1
            text = '' if not child.text else child.text.strip()
            tail = '' if not child.tail else child.tail.strip()
            tlen += (len(text) + len(tail)) * weight
        if tlen < 10:
            continue
        weight = self.calc_node_weight(node)
        candidates.append((node, tlen * weight))
    if not candidates:
        return self.title, None
    candidates.sort(key=lambda a: a[1], reverse=True)
    good = candidates[0][0]
    # If the winner is a leaf-like text tag, climb up to its enclosing <div>.
    if good.tag in ['p', 'pre', 'code', 'blockquote']:
        for i in range(5):
            good = good.getparent()
            if good.tag == 'div':
                break
    good = self.clean_etree(good, url)
    return self.title, good