示例#1
0
def parse_html(d):
    """Build an inverted index for folder *d* of the WEBPAGES_RAW corpus.

    Reads files ``WEBPAGES_RAW/<d>/0`` .. ``WEBPAGES_RAW/<d>/499`` (skipping
    any that do not exist), strips comments/scripts/styles from the HTML,
    and counts word occurrences.

    :param d: folder name/number under ``WEBPAGES_RAW``.
    :return: mapping ``word -> {"<d>/<f>": count}``.
    """
    # BUG FIX: the original used defaultdict(dict) and a manual
    # "word not in folder_dict" check before replacing the value with a
    # defaultdict(int).  A nested defaultdict expresses the intent directly
    # and removes the membership test.
    folder_dict = defaultdict(lambda: defaultdict(int))
    # Hoisted out of the loop: compile the word-splitting regex once,
    # not once per file.
    non_word = re.compile(r'[\W_]+')
    for f in range(0, 500):
        key = str(d) + "/" + str(f)
        file = "WEBPAGES_RAW/" + key

        if os.path.isfile(file):
            with codecs.open(file, "r", encoding="utf-8") as data:
                # Force to ASCII early; non-ASCII text is deliberately dropped.
                raw = data.read().encode("ascii", "ignore")
                soup = BeautifulSoup(raw)
                # Remove comments, <script> and <style> so only visible
                # text is indexed.
                for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
                    comment.extract()
                for script in soup("script"):
                    script.extract()
                for style in soup("style"):
                    style.extract()
                content = " ".join(item.strip() for item in soup.findAll(text=True))
                content = HTMLParser().unescape(content)
                # Round-trip keeps only ASCII characters introduced by
                # entity unescaping.
                content = content.encode("ascii", "ignore").decode('utf-8')

            # Replace every non-word run with a space, lowercase, tokenize.
            for word in non_word.sub(' ', content).lower().split():
                # Single-character tokens carry no signal for the index.
                if len(word) > 1:
                    folder_dict[word][key] += 1
    return folder_dict
示例#2
0
 def extract(self, url, html):
     '''Return ``(title, content)`` extracted from *html*.

     :param url: page URL (passed through to ``get_main_block``).
     :param html: raw page markup.
     :return: 2-tuple ``(title, content)``; ``content`` is ``''`` when no
         main block could be located.
     '''
     title, node = self.get_main_block(url, html)
     if node is None:
         print('\tno main block got !!!!!', url)
         # BUG FIX: the original returned a 3-tuple (title, '', '') here
         # while the docstring and the success path promise a 2-tuple,
         # breaking any `title, content = extract(...)` caller.
         return title, ''
     text = etree.tostring(node)
     text = HTMLParser().unescape(text.decode())
     print(text)
     content = self.get_text(node)
     return title, content
示例#3
0
def as_plain_text(value):
    """Strip markup from *value* and return its concatenated text nodes.

    :param value: HTML fragment, ``str`` or UTF-8 ``bytes``.
    :return: the fragment's top-level text joined with single spaces.
    """
    # BUG FIX: decode bytes BEFORE unescaping.  The original called
    # HTMLParser().unescape on the raw value first, which fails for bytes
    # input under Python 3 (unescape operates on text).
    if isinstance(value, six.binary_type):
        value = value.decode('utf-8')

    value = HTMLParser().unescape(value)

    # Wrap in a single root element so lxml accepts an arbitrary fragment.
    value = "<div>%s</div>" % value
    el = fragment_fromstring(value)
    texts = el.xpath('text()')

    return ' '.join(texts)
示例#4
0
async def insert_content(url: str, chapter_id: int) -> bool:
    """Download a chapter page from *url* and store its text.

    Looks up the chapter and its book source, extracts the content node
    via the source's XPath, and creates a ChapterInfo row.

    :param url: chapter page URL.
    :param chapter_id: primary key of the ChapterLists row.
    :return: True when the ChapterInfo row was created.
    """
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            try:
                page = await resp.text()
            except UnicodeDecodeError:
                # Some sources serve GBK-encoded pages; retry with that codec.
                page = await resp.text(encoding='gbk')
            tree = etree.HTML(page)
            chapter = ChapterLists.objects.get(pk=chapter_id)
            source = BookSource.objects.get(pk=chapter.source_id)
            # The per-source XPath locates the chapter body node.
            node = tree.xpath(source.book_chapter_info)[0]
            body = HTMLParser().unescape(html.tostring(node).decode())
            record = ChapterInfo.objects.create(
                content=body,
                words_number=len(body),
                chapter_id=chapter_id,
            )
            return bool(record)
示例#5
0
    def get_main_block(self, url, html, short_title=True):
        '''Locate the main content block of a page.

        :param url: page URL, used to absolutize links.
        :param html: raw page, ``bytes`` or ``str``.
        :param short_title: shorten the extracted title when True.
        :return: ``(title, etree_of_main_content_block)``; the node (and on
            hard failures the title too) may be ``None``.
        '''
        if isinstance(html, bytes):
            encoding = cchardet.detect(html)['encoding']
            if encoding is None:
                # Undetectable encoding: give up rather than guess.
                return None, None
            html = html.decode(encoding, 'ignore')
        try:
            doc = lxml.html.fromstring(html)
            doc.make_links_absolute(base_url=url)
        except Exception:
            # Narrowed from a bare `except:` so Ctrl-C / SystemExit still
            # propagate; parse failures are logged and swallowed.
            traceback.print_exc()
            return None, None
        self.title = self.get_title(doc)
        if short_title:
            self.title = self.shorten_title(self.title)
        body = doc.xpath('//body')
        # BUG FIX: check emptiness BEFORE indexing.  The original serialized
        # body[0] (leftover debug print) ahead of this guard, raising
        # IndexError on pages without a <body>.  The per-child debug
        # tostring/print has been removed as well.
        if not body:
            return self.title, None
        candidates = []
        nodes = body[0].getchildren()

        # Breadth-first walk: score each node by the text length of its
        # direct children, then descend into them.
        while nodes:
            node = nodes.pop(0)
            children = node.getchildren()

            tlen = 0
            for child in children:
                if isinstance(child, HtmlComment):
                    continue
                if child.tag in self.non_content_tag:
                    continue
                if child.tag == 'a':
                    continue
                if child.tag == 'textarea':
                    # FIXME: this tag is only part of content?
                    continue
                # BUG FIX: default '' for 'style' too — get('style') could
                # return None, which %s rendered as the literal "None".
                attr = '%s%s%s' % (child.get('class', ''), child.get(
                    'id', ''), child.get('style', ''))
                if 'display' in attr and 'none' in attr:
                    continue
                nodes.append(child)
                # Paragraphs are the strongest content signal.
                weight = 3 if child.tag == 'p' else 1
                text = '' if not child.text else child.text.strip()
                tail = '' if not child.tail else child.tail.strip()
                tlen += (len(text) + len(tail)) * weight
            if tlen < 10:
                # Too little direct text to be the main block.
                continue
            weight = self.calc_node_weight(node)
            candidates.append((node, tlen * weight))
        if not candidates:
            return self.title, None
        candidates.sort(key=lambda a: a[1], reverse=True)
        good = candidates[0][0]
        if good.tag in ['p', 'pre', 'code', 'blockquote']:
            # Climb to the enclosing <div> (at most 5 levels) so siblings of
            # the best leaf are included in the extracted block.
            for i in range(5):
                good = good.getparent()
                if good.tag == 'div':
                    break
        good = self.clean_etree(good, url)
        return self.title, good