def parse(self):
    """Fetch (when needed) and parse the page at ``self.url``.

    If ``self.html`` is not set, the page is downloaded with
    ``self.fetch_page`` and its ``'content'`` is stored in ``self.html``.
    Downloads whose ``'type'`` is ``'image'`` or ``'text'`` are answered
    directly with a small result dict; anything else (or HTML that was
    already present) is normalized and extracted with ``WebParser``, which
    also refreshes ``self.dom_tree`` and ``self.html``.

    Returns:
        ``None`` when the download has no content, otherwise a dict.  For
        images and text the dict has the keys ``images``, ``url``,
        ``type``, ``description``, ``content`` and ``title`` (images also
        carry ``provider_display``).  For other pages it is whatever
        ``WebParser.extract()`` returns.

    Raises:
        PageFetchError: if the fetch branch raises ``IOError``.
        WebParseError: if normalizing or extracting fails; the message
            holds the formatted traceback, the stack and the error.
    """
    if self.html is None:
        try:
            dw = self.fetch_page(self.url)
            self.html = dw['content']
            if self.html is None:
                return None

            if dw['type'] == 'image':
                # urlparse() returns a namedtuple, so the former
                # "'netloc' in p" test looked at the tuple's *values* and
                # was practically never true.  ``netloc`` always exists
                # (it is '' when the URL has none), so use it directly.
                return {
                    'images': [{'url': self.url}],
                    'provider_display': urlparse(self.url).netloc.lower(),
                    'url': self.url,
                    'type': 'image',
                    'description': '',
                    'content': dw['content'],
                    'title': '',
                }

            if dw['type'] == 'text':
                content = dw['content'].strip()
                return {
                    'images': [],
                    'url': self.url,
                    'type': 'article',
                    'description': self.summarize(content, 75),
                    'content': content,
                    'title': self.summarize(content, 10),
                }
        except IOError:
            raise PageFetchError

    try:
        wp = WebParser(self.html, self.url)
        (self.dom_tree, self.html) = wp.normalize()
        result = wp.extract()
    except Exception as e:
        # Fold the traceback and the stack of the failing frame into the
        # message so the wrapped error stays debuggable.
        exc_tb = sys.exc_info()[2]
        ss = "".join(traceback.format_stack(exc_tb.tb_frame))
        stb = "".join(traceback.format_tb(exc_tb))
        raise WebParseError("{0}\n{1}\n{2}".format(stb, ss, e))

    # The extracted result used to be computed and then dropped.
    return result
def extract_content(self):
    """Return the main content of the page using a site-specific parser.

    If ``self.html`` is not set, the page is downloaded with
    ``self.fetch_page``.  Image downloads are answered with an ``<img>``
    tag pointing at ``self.url`` and text downloads with the raw content.
    Otherwise the HTML is normalized with ``WebParser`` (only when
    ``self.dom_tree`` is missing) and handed to ``SiteParser.Sites``.

    Returns:
        ``None`` when the download has no content or no site parser
        matches ``self.url``; otherwise a dict.  When the dict has a
        ``'content'`` key, runs of whitespace in it are collapsed to one
        space and a ``'description'`` is added from
        ``self.summarize(text, 75)`` over the content's text.

    Raises:
        PageFetchError: if the fetch branch raises ``IOError``.
        WebSummarizeError: if site parsing or summarizing fails; the
            message holds the formatted traceback, the stack and the error.
    """
    if self.html is None:
        try:
            dw = self.fetch_page(self.url)
            self.html = dw['content']
            if self.html is None:
                return None
            if dw['type'] == 'image':
                # NOTE(review): self.url is interpolated into markup
                # without escaping -- confirm callers only pass trusted
                # or pre-validated URLs.
                return {'content': '<img src="{0}"/>'.format(self.url)}
            if dw['type'] == 'text':
                return {'content': dw['content']}
        except IOError:
            raise PageFetchError

    if self.dom_tree is None:
        wp = WebParser(self.html, self.url)
        (self.dom_tree, self.html) = wp.normalize()

    # Imported at call time, as in the original code.
    import SiteParser
    try:
        site = SiteParser.Sites(self.url)
        if not site.is_match():
            return None

        result = site.parse(self.html, self.dom_tree)
        if 'content' in result:
            # Collapse consecutive whitespace before summarizing.
            result['content'] = re.sub(r'\s+', ' ', result['content'])
            soul_tree = lxml.html.fromstring(result['content'])
            soul_text_only = soul_tree.text_content()
            result['description'] = self.summarize(soul_text_only, 75)
        # NOTE(review): a matching site whose result has no 'content' key
        # is returned as-is -- confirm this is the intended nesting.
        return result
    except Exception as e:
        # Fold the traceback and the stack of the failing frame into the
        # message so the wrapped error stays debuggable.
        exc_tb = sys.exc_info()[2]
        ss = "".join(traceback.format_stack(exc_tb.tb_frame))
        stb = "".join(traceback.format_tb(exc_tb))
        raise WebSummarizeError("{0}\n{1}\n{2}".format(stb, ss, e))
def parse(self):
    """Fetch (when needed) and parse the page at ``self.url`` (V2 extractor).

    If ``self.html`` is not set, the page is downloaded with
    ``self.fetch_page`` and its ``'content'`` is stored in ``self.html``.
    Downloads whose ``'type'`` is ``'image'`` or ``'text'`` are answered
    directly with a minimal result dict; anything else (or HTML that was
    already present) is normalized with ``WebParser`` and extracted with
    ``WebParser.extractV2()``, which also refreshes ``self.dom_tree`` and
    ``self.html``.

    Returns:
        ``None`` when the download has no content, otherwise a dict.  For
        images and text the dict has the keys ``images``, ``url``,
        ``type`` and ``title``.  For other pages it is whatever
        ``WebParser.extractV2()`` returns.

    Raises:
        PageFetchError: if the fetch branch raises ``IOError``.
        WebParseError: if normalizing or extracting fails; the message
            holds the formatted traceback, the stack and the error.
    """
    if self.html is None:
        try:
            dw = self.fetch_page(self.url)
            self.html = dw['content']
            if self.html is None:
                return None

            if dw['type'] == 'image':
                return {
                    'images': [{'url': self.url}],
                    'url': self.url,
                    'type': 'image',
                    'title': '',
                }

            if dw['type'] == 'text':
                return {
                    'images': [],
                    'url': self.url,
                    'type': 'article',
                    'title': '',
                }
        except IOError:
            raise PageFetchError

    try:
        wp = WebParser(self.html, self.url)
        (self.dom_tree, self.html) = wp.normalize()
        result = wp.extractV2()
    except Exception as e:
        # Fold the traceback and the stack of the failing frame into the
        # message so the wrapped error stays debuggable.
        exc_tb = sys.exc_info()[2]
        ss = "".join(traceback.format_stack(exc_tb.tb_frame))
        stb = "".join(traceback.format_tb(exc_tb))
        raise WebParseError("{0}\n{1}\n{2}".format(stb, ss, e))

    # The extracted result used to be computed and then dropped.
    return result