Exemplo n.º 1
0
    def parse(self):
        """Fetch the page if necessary and extract structured data from it.

        When ``self.html`` is unset, the page at ``self.url`` is downloaded
        through ``self.fetch_page``.  Image and plain-text responses are
        answered directly; any other response is handed to ``WebParser``.

        Returns:
            ``None`` when the fetched content is ``None``.  For image and
            text responses, a dict with the keys ``images``, ``url``,
            ``type``, ``description``, ``content`` and ``title`` (image
            results also carry ``provider_display``).

        Raises:
            PageFetchError: if an ``IOError`` is raised while fetching or
                building the direct image/text result.
            WebParseError: if normalizing or extracting the HTML fails; the
                message holds the formatted traceback, the stack and the
                original error.
        """
        if self.html is None:
            try:
                dw = self.fetch_page(self.url)
                self.html = dw['content']
                if self.html is None:
                    return None

                if dw['type'] == 'image':
                    # The result of urlparse() always has a ``netloc``
                    # attribute (an empty string when the URL has no host).
                    # The previous ``'netloc' in p`` check tested the tuple's
                    # *values*, so the host name was never reported.
                    p = urlparse(self.url)
                    return {
                        'images': [{'url': self.url}],
                        'provider_display': p.netloc.lower(),
                        'url': self.url,
                        'type': 'image',
                        'description': '',
                        'content': dw['content'],
                        'title': '',
                    }

                if dw['type'] == 'text':
                    content = dw['content'].strip()
                    return {
                        'images': [],
                        'url': self.url,
                        'type': 'article',
                        'description': self.summarize(content, 75),
                        'content': content,
                        'title': self.summarize(content, 10),
                    }

            except IOError:
                raise PageFetchError

        # NOTE(review): ``result`` is assigned below but no ``return``
        # follows in this block -- confirm the caller receives it.
        result = {}
        try:
            wp = WebParser(self.html, self.url)
            (self.dom_tree, self.html) = wp.normalize()
            result = wp.extract()
        except Exception as e:
            # Fold the traceback and the stack of the failing frame into the
            # message so the wrapped error remains diagnosable.
            tb = sys.exc_info()[2]
            ss = "".join(traceback.format_stack(tb.tb_frame))
            stb = "".join(traceback.format_tb(tb))
            raise WebParseError("{0}\n{1}\n{2}".format(stb, ss, e))
Exemplo n.º 2
0
    def extract_content(self):
        """Return the main content of the page together with a description.

        When ``self.html`` is unset, the page at ``self.url`` is downloaded
        through ``self.fetch_page``.  Image and plain-text responses are
        answered directly; any other response is normalized with
        ``WebParser`` (if ``self.dom_tree`` is unset) and passed to the
        ``SiteParser.Sites`` rule for the URL.

        Returns:
            ``None`` when the fetched content is ``None`` or when the site
            rule does not match the URL.  ``{'content': ...}`` for image and
            text responses.  Otherwise the dict returned by ``site.parse``
            with whitespace in ``content`` collapsed and a ``description``
            entry added.

        Raises:
            PageFetchError: if an ``IOError`` is raised while fetching.
            WebSummarizeError: if site parsing or summarizing fails; the
                message holds the formatted traceback, the stack and the
                original error.  Errors from ``WebParser.normalize`` are not
                wrapped.
        """
        if self.html is None:
            try:
                dw = self.fetch_page(self.url)
                self.html = dw['content']
                if self.html is None:
                    return None

                if dw['type'] == 'image':
                    return {'content': '<img src="{0}"/>'.format(self.url)}

                if dw['type'] == 'text':
                    return {'content': dw['content']}

            except IOError:
                raise PageFetchError

        if self.dom_tree is None:
            wp = WebParser(self.html, self.url)
            (self.dom_tree, self.html) = wp.normalize()

        # Imported at call time, as in the original code (presumably to
        # avoid an import cycle -- TODO confirm).
        import SiteParser
        try:
            site = SiteParser.Sites(self.url)
            if not site.is_match():
                return None

            result = site.parse(self.html, self.dom_tree)
            if 'content' in result:
                # Collapse every run of whitespace into a single space.
                result['content'] = re.sub(r'\s+', ' ', result['content'])

            # NOTE(review): ``result['content']`` is read unconditionally
            # here; if site.parse() omits it, the KeyError surfaces as a
            # WebSummarizeError -- confirm this is intended.
            soul_tree = lxml.html.fromstring(result['content'])
            soul_text_only = soul_tree.text_content()
            result['description'] = self.summarize(soul_text_only, 75)
            return result
        except Exception as e:
            # Fold the traceback and the stack of the failing frame into the
            # message so the wrapped error remains diagnosable.
            tb = sys.exc_info()[2]
            ss = "".join(traceback.format_stack(tb.tb_frame))
            stb = "".join(traceback.format_tb(tb))
            raise WebSummarizeError("{0}\n{1}\n{2}".format(stb, ss, e))
Exemplo n.º 3
0
    def parse(self):
        """Fetch the page if necessary and extract structured data from it.

        When ``self.html`` is unset, the page at ``self.url`` is downloaded
        through ``self.fetch_page``.  Image and plain-text responses are
        answered directly with a minimal dict; any other response is handed
        to ``WebParser`` and processed with ``extractV2``.

        Returns:
            ``None`` when the fetched content is ``None``.  For image and
            text responses, a dict with the keys ``images``, ``url``,
            ``type`` and ``title``.

        Raises:
            PageFetchError: if an ``IOError`` is raised while fetching.
            WebParseError: if normalizing or extracting the HTML fails; the
                message holds the formatted traceback, the stack and the
                original error.
        """
        if self.html is None:
            try:
                dw = self.fetch_page(self.url)
                self.html = dw['content']
                if self.html is None:
                    return None

                if dw['type'] == 'image':
                    return {
                        'images': [{'url': self.url}],
                        'url': self.url,
                        'type': 'image',
                        'title': '',
                    }

                if dw['type'] == 'text':
                    return {
                        'images': [],
                        'url': self.url,
                        'type': 'article',
                        'title': '',
                    }

            except IOError:
                raise PageFetchError

        # NOTE(review): confirm that ``result`` is handed back to the caller
        # after this step; no ``return`` is part of this block.
        result = {}
        try:
            wp = WebParser(self.html, self.url)
            (self.dom_tree, self.html) = wp.normalize()
            result = wp.extractV2()
        except Exception as e:
            # Fold the traceback and the stack of the failing frame into the
            # message so the wrapped error remains diagnosable.
            tb = sys.exc_info()[2]
            ss = "".join(traceback.format_stack(tb.tb_frame))
            stb = "".join(traceback.format_tb(tb))
            raise WebParseError("{0}\n{1}\n{2}".format(stb, ss, e))