Пример #1
0
    def parseResponse(self, url, content):
        """Pick the story body out of a fetched page and pass it to saveStory.

        Args:
            url: Address of the page; forwarded to ``self.saveStory`` as is.
            content: HTML text of the page.

        Raises:
            scraper.FeedException: If the page does not contain exactly one
                ``<div class="body-text">`` element.
        """
        content = content.strip()
        # Empty out lines that hold nothing but whitespace.
        content = re.compile(r"^\s+$", flags=re.MULTILINE).sub("", content)
        # A copyright sign at the end of a line is replaced by a space.
        content = re.compile(r"©$", flags=re.MULTILINE).sub(" ", content)

        # Non-greedy capture: a greedy ``.*`` would run through to the last
        # ``</title>`` on the line when more than one is present.
        titleMatch = re.search(r"<title>(.*?)</title>", content)
        title = "Missing" if titleMatch is None else titleMatch.group(1)

        content = self.cleanScripts(content)

        soup = BeautifulSoup(content, 'html.parser')

        # Detach the listed (tag, class) elements before selecting the body.
        unneededText = (('div', 'tagContainer'), ('div', 'tags'),
                        ('div', 'moreStories'), ('ul', 'links'))
        for tagName, className in unneededText:
            for node in soup.findAll(tagName, {"class": className}):
                node.extract()

        results = soup.findAll("div", {"class": "body-text"})
        if len(results) != 1:
            raise scraper.FeedException(
                'Number of div class="body-text" in HTML is not 1. Count = %d'
                % len(results))

        self.saveStory(url, title, content, results[0])
    def parseResponse(self, url, content):
        """Pick the story body out of a fetched page and pass it to saveStory.

        Args:
            url: Address of the page; forwarded to ``self.saveStory`` as is.
            content: HTML text of the page.

        Raises:
            scraper.FeedException: If the page does not contain exactly one
                ``<div id="gridMainColumn">`` element.
        """
        content = content.strip()
        # Empty out lines that hold nothing but whitespace.
        content = re.compile(r"^\s+$", flags=re.MULTILINE).sub("", content)

        # Non-greedy capture: a greedy ``.*`` would run through to the last
        # ``</title>`` on the line when more than one is present.
        titleMatch = re.search(r"<title>(.*?)</title>", content)
        title = "Missing" if titleMatch is None else titleMatch.group(1)

        content = self.cleanScripts(content)

        soup = BeautifulSoup(content, 'html.parser')

        # Detach the listed (tag, id) elements before selecting the body.
        unneededText = (('div', 'MorebyThisAuthor'), ('div', 'RelatedStories'),
                        ('div', 'Comments'), ('div', 'ToolBarHorizontal'))
        for tagName, idName in unneededText:
            for node in soup.findAll(tagName, {"id": idName}):
                node.extract()

        results = soup.findAll('div', {"id": 'gridMainColumn'})
        if len(results) != 1:
            raise scraper.FeedException(
                'Number of primary-content ids in HTML is not 1. Count = %d' %
                len(results))

        self.saveStory(url, title, content, results[0])
Пример #3
0
    def parseResponse(self, url, content):
        """Pick the story body out of a fetched page and pass it to saveStory.

        Args:
            url: Address of the page; forwarded to ``self.saveStory`` as is.
            content: HTML text of the page.

        Raises:
            scraper.FeedException: If the page does not contain exactly one
                ``<div data-swiftype-name="body">`` element.
        """
        content = content.strip()
        # Empty out lines that hold nothing but whitespace.
        content = re.compile(r"^\s+$", flags=re.MULTILINE).sub("", content)

        # The closing tag may sit on the line after the title text, hence the
        # optional newline.  The capture is non-greedy so that it stops at the
        # first ``</title>`` rather than the last one on the line.
        titleMatch = re.search(r"<title>(.*?)\n?</title>", content)
        title = "Missing" if titleMatch is None else titleMatch.group(1)

        content = self.cleanScripts(content)

        soup = BeautifulSoup(content, 'html.parser')

        # Detach the listed (tag, id) elements before selecting the body.
        unneededText = (
            ('div', 'articleSocialBar'),
            ('div', 'pluck'),
        )
        for tagName, idName in unneededText:
            for node in soup.findAll(tagName, {"id": idName}):
                node.extract()

        results = soup.findAll('div', {"data-swiftype-name": 'body'})
        if len(results) != 1:
            raise scraper.FeedException(
                'Number of primary-content ids in HTML is not 1. Count = %d'
                % len(results))

        self.saveStory(url, title, content, results[0])
Пример #4
0
 def parseResponse(self, url, content):
     """Pick the story body out of a fetched page and pass it to saveStory.

     Args:
         url: Address of the page; forwarded to ``self.saveStory`` and
             included in the error message on failure.
         content: HTML text of the page.

     Raises:
         scraper.FeedException: If the page does not contain exactly one
             ``<div class="main-container">`` element.
     """
     content = content.strip()
     # Empty out lines that hold nothing but whitespace.
     content = re.compile(r"^\s+$", flags=re.MULTILINE).sub("", content)

     # Non-greedy capture: a greedy ``.*`` would run through to the last
     # ``</title>`` on the line when more than one is present.
     titleMatch = re.search(r"<title>(.*?)</title>", content)
     title = "Missing" if titleMatch is None else titleMatch.group(1)

     content = self.cleanScripts(content)

     soup = BeautifulSoup(content, 'html.parser')
     results = soup.findAll('div', {'class': 'main-container'})
     if len(results) != 1:
         raise scraper.FeedException(
             'Number of story-body ids in HTML is not 1. Count = %d URL = %s'
             % (len(results), url))

     self.saveStory(url, title, content, results[0])
    def parseResponse(self, url, content):
        """Pick the story body out of a fetched page and pass it to saveStory.

        Args:
            url: Address of the page; forwarded to ``self.saveStory`` as is.
            content: HTML text of the page.

        Raises:
            scraper.FeedException: If the page does not contain exactly one
                ``<div class="post">`` element.
        """
        content = content.strip()
        # Empty out lines that hold nothing but whitespace.
        content = re.compile(r"^\s+$", flags=re.MULTILINE).sub("", content)
        # Carriage returns become spaces; a literal replace is enough here.
        content = content.replace("\r", " ")

        # Non-greedy capture: a greedy ``.*`` would run through to the last
        # ``</title>`` on the line when more than one is present.
        titleMatch = re.search(r"<title>(.*?)</title>", content)
        title = "Missing" if titleMatch is None else titleMatch.group(1)

        content = self.cleanScripts(content)

        soup = BeautifulSoup(content, 'html.parser')

        # (tag, class) pairs to detach from the tree.
        unneededClassText = (
            ('div', 'commentsform'),
            ('section', 'clearfix'),
            ('h5', 'add-comment'),
            ('p', 'comments-disclaimer'),
            ('a', 'edit_from_site'),
            ('h2', ''),
            ('link', ''),
        )
        # (tag, id) pairs to detach from the tree.
        unneededIdText = (('span', 'topic'), )

        # Class-based removals run first, then id-based ones.
        for attrName, targets in (("class", unneededClassText),
                                  ("id", unneededIdText)):
            for tagName, attrValue in targets:
                for node in soup.findAll(tagName, {attrName: attrValue}):
                    node.extract()

        #results = soup.findAll("article", { "class" : "container" })
        results = soup.findAll("div", {"class": "post"})
        if len(results) != 1:
            raise scraper.FeedException(
                'Number of div class="body" in HTML is not 1. Count = %d' %
                len(results))

        self.saveStory(url, title, content, results[0])
    def parseResponse(self, url, content):
        """Pick the story body out of a fetched page and pass it to saveStory.

        Args:
            url: Address of the page; forwarded to ``self.saveStory`` as is.
            content: HTML text of the page.

        Raises:
            scraper.FeedException: If the page does not contain exactly one
                ``<div>`` whose id starts with ``single-post``.
        """
        content = content.strip()
        # Empty out lines that hold nothing but whitespace.
        content = re.compile(r"^\s+$", flags=re.MULTILINE).sub("", content)
        # Carriage returns become spaces; a literal replace is enough here.
        content = content.replace("\r", " ")

        # Non-greedy capture: a greedy ``.*`` would run through to the last
        # ``</title>`` on the line when more than one is present.
        titleMatch = re.search(r"<title>(.*?)</title>", content)
        title = "Missing" if titleMatch is None else titleMatch.group(1)

        content = self.cleanScripts(content)

        soup = BeautifulSoup(content, 'html.parser')

        def isSinglePostId(value):
            # The id value can be None, so guard before calling startswith.
            return value is not None and value.startswith("single-post")

        results = soup.findAll("div", {"id": isSinglePostId})
        if len(results) != 1:
            raise scraper.FeedException(
                'Number of div id="single-post-*" in HTML is not 1. Count = %d'
                % len(results))

        self.saveStory(url, title, content, results[0])
Пример #7
0
    def parseResponse(self, url, content):
        """Pick the story body out of a fetched page and pass it to saveStory.

        Args:
            url: Address of the page; forwarded to ``self.saveStory`` as is.
            content: HTML text of the page.

        Raises:
            scraper.FeedException: If the page does not contain exactly one
                ``<div id="WNContainerStory">`` element.
        """
        content = content.strip()
        # Empty out lines that hold nothing but whitespace.
        content = re.compile(r"^\s+$", flags=re.MULTILINE).sub("", content)

        # Non-greedy capture: a greedy ``.*`` would run through to the last
        # ``</title>`` on the line when more than one is present.
        titleMatch = re.search(r"<title>(.*?)</title>", content)
        title = "Missing" if titleMatch is None else titleMatch.group(1)

        content = self.cleanScripts(content)

        # Name the parser explicitly: without it BeautifulSoup picks whichever
        # parser happens to be installed, so the resulting tree (and the
        # element count checked below) could differ between machines.
        # NOTE(review): assumes bs4, as the sibling scrapers use -- confirm.
        soup = BeautifulSoup(content, 'html.parser')
        results = soup.findAll('div', {"id": 'WNContainerStory'})
        if len(results) != 1:
            raise scraper.FeedException(
                'Number of primary-content ids in HTML is not 1. Count = %d' %
                len(results))

        self.saveStory(url, title, content, results[0])
    def parseResponse(self, url, content):
        """Pick the story body out of a fetched page and pass it to saveStory.

        The body is looked up as ``<div class="postContent"
        itemprop="articleBody">``; when no such element exists the lookup
        falls back to ``<div id="bw-share">``.

        Args:
            url: Address of the page; forwarded to ``self.saveStory`` as is.
            content: HTML text of the page.

        Raises:
            scraper.FeedException: If the lookup that was used did not yield
                exactly one element.
        """
        content = content.strip()
        # Empty out lines that hold nothing but whitespace.
        content = re.compile(r"^\s+$", flags=re.MULTILINE).sub("", content)

        # Non-greedy capture: a greedy ``.*`` would run through to the last
        # ``</title>`` on the line when more than one is present.
        titleMatch = re.search(r"<title>(.*?)</title>", content)
        title = "Missing" if titleMatch is None else titleMatch.group(1)

        content = self.cleanScripts(content)

        soup = BeautifulSoup(content, 'html.parser')
        #results = soup.findAll(id=re.compile(r'\bprimary-content\b'))
        results = soup.findAll(
            'div', {'class': 'postContent', 'itemprop': 'articleBody'})
        if not results:
            # Primary selector found nothing; try the alternative container.
            results = soup.findAll('div', {'id': 'bw-share'})

        if len(results) != 1:
            raise scraper.FeedException(
                'Number of primary-content ids in HTML is not 1. Count = %d'
                % len(results))

        self.saveStory(url, title, content, results[0])
    def parseResponse(self, url, content):
        """Collect every article-body block of a page and pass it to saveStory.

        Unlike a single-container lookup, all ``<div class="article-body">``
        elements are kept: each one is wrapped in its own ``<div>``, the
        pieces are concatenated, and characters with a code point of 128 or
        above are dropped from the result.  The title comes from the single
        ``<div class="fp-newshead">`` element, or is ``"Missing"`` when there
        is not exactly one such element or it has no children.

        Args:
            url: Address of the page; forwarded to ``self.saveStory`` as is.
            content: HTML text of the page.

        Raises:
            scraper.FeedException: If the page has no
                ``<div class="article-body">`` element at all.
        """
        content = content.strip()
        # Empty out lines that hold nothing but whitespace.
        content = re.compile(r"^\s+$", flags=re.MULTILINE).sub("", content)
        # A ``&copy;`` entity at the end of a line is replaced by a space.
        content = re.compile(r"&copy;$", flags=re.MULTILINE).sub(" ", content)

        content = self.cleanScripts(content)

        # They don't know the difference between span and div
        # Their spans need to be converted for the soup to work
        content = content.replace("<span ", "<div ")
        content = content.replace("</span>", "</div>")

        soup = BeautifulSoup(content, 'html.parser')

        results = soup.findAll("div", {"class": "article-body"})
        if not results:
            # NOTE(review): the message wording does not match the check
            # (only a count of zero is rejected); kept verbatim in case
            # callers or logs match on it.
            raise scraper.FeedException(
                'Number of tables in HTML is not 1. Count = %d' % len(results))

        # Wrap each block in its own div, then keep ASCII characters only.
        resultHTML = "".join("<div>%s</div>" % str(r) for r in results)
        resultHTML = "".join(i for i in resultHTML if ord(i) < 128)

        titleResults = soup.findAll("div", {"class": "fp-newshead"})
        if len(titleResults) == 1 and titleResults[0].contents:
            # First child of the headline element, with any markup stripped.
            title = str(titleResults[0].contents[0])
            title = re.sub(r'<[^>]{1,}>', '', title)
        else:
            # Covers a missing/duplicated headline and an empty headline div
            # (the latter used to raise IndexError).
            title = "Missing"

        self.saveStory(url, title, content, resultHTML)
Пример #10
0
    def parseResponse(self, url, content):
        """Pick the story body out of a fetched page and pass it to saveStory.

        Args:
            url: Address of the page; forwarded to ``self.saveStory`` as is.
            content: HTML text of the page.

        Raises:
            scraper.FeedException: If the page does not contain exactly one
                ``<article>`` element.
        """
        content = content.strip()
        # Empty out lines that hold nothing but whitespace.
        content = re.compile(r"^\s+$", flags=re.MULTILINE).sub("", content)
        # A ``&copy;`` entity at the end of a line is replaced by a space.
        content = re.compile(r"&copy;$", flags=re.MULTILINE).sub(" ", content)

        content = self.cleanScripts(content)

        soup = BeautifulSoup(content, 'html.parser')

        results = soup.findAll('article')
        if len(results) != 1:
            raise scraper.FeedException(
                'Number of div class="body" in HTML is not 1. Count = %d' %
                len(results))

        # The title is read from the markup returned by cleanScripts.
        # Non-greedy capture: a greedy ``.*`` would run through to the last
        # ``</title>`` on the line when more than one is present.
        titleMatch = re.search(r"<title>(.*?)</title>", content)
        title = "Missing" if titleMatch is None else titleMatch.group(1)

        self.saveStory(url, title, content, results[0])