Exemplo n.º 1
0
    def update(self, amount = 0):
        """Look for new blog posts.

        Argument
        amount -- if greater than zero, the feed is requested with
                  max-results set to this number

        Entries are visited in feed order. Reading stops at the entry whose
        published value equals self.newestId, or as soon as
        self._addandcheckfunc returns a true value. Afterwards self.newestId
        is set to the published value of the first entry in the feed.

        Returns None. An IOError while opening the url and an
        xml.dom.DOMException while parsing are reported with print and end
        the method.

        """
        #url = r'http://' + self._blogname + '.blogspot.com/feeds/posts/default'

        if(amount > 0):
            # amount is numeric (it is compared with 0 above), so it must be
            # converted before it can be concatenated into the url.
            url = self._url + r'?redirect=false&max-results=' + str(amount)
        else:
            url = self._url

        try:
            blogobj = urllib.urlopen(url)
        except IOError:
            print('Error: could not read url ' + url)
            return

        try:
            rssdoc = parse(blogobj)
        except xml.dom.DOMException:
            print('Error: Could not read RSS')
            return
        finally:
            blogobj.close()

        nodes = rssdoc.getElementsByTagName(el_entry)
        newestId = getTextNodeValue(nodes.item(0), el_published)

        if(self.newestId == newestId):
            print('There are no new items; newest item @ ' + self.newestId)
            return

        for entry in nodes:
            published = getTextNodeValue(entry, el_published)

            # Everything from here on was seen during an earlier update
            if(published == self.newestId):
                break

            try:
                bpd = BlogPostData(entry)
                bpi = self._itemtype(self.itemarg, blogpostdata = bpd)
            except Exception as e:
                print('BlogspotHarvester: Could not create BlogspotItem; ' + str(e))
                continue

            done = self._addandcheckfunc(bpi)

            if(done):
                # Double quotes are needed here: 'don''t' is two adjacent
                # literals and would print "dont".
                print(self.id + ": I don't need to read more blog posts\n")
                break

        self.newestId = newestId
Exemplo n.º 2
0
    def update(self):
        """Look for new blog posts.

        Items of the first channel in the feed are visited in feed order.
        Reading stops at the item whose publication date equals
        self.newestId, or as soon as self._addandcheckfunc returns a true
        value. Afterwards self.newestId is set to the publication date of
        the first item in the feed.

        Returns None. An IOError while opening self._url and an
        xml.dom.DOMException while parsing are reported with print and end
        the method.

        Raises
        Exception -- if the feed contains no channel element

        """
        try:
            blogobj = urllib.urlopen(self._url)
        except IOError:
            print('Error: could not read url ' + self._url)
            return

        try:
            rssdoc = parse(blogobj)
        except xml.dom.DOMException:
            print('Error: Could not read RSS')
            return
        finally:
            blogobj.close()

        nodes = rssdoc.getElementsByTagName(el_channel)

        if (nodes.length <= 0):
            raise Exception('No channel found in rss feed')

        chnode = nodes[0]
        nodes = chnode.getElementsByTagName(el_rssitem)
        newestId = getTextNodeValue(nodes.item(0), el_pubdate)

        if (self.newestId == newestId):
            print('There are no new items; newest item @ ' + self.newestId)
            return

        for rssitem in nodes:
            pubdate = getTextNodeValue(rssitem, el_pubdate)

            # Everything from here on was seen during an earlier update
            if (pubdate == self.newestId):
                break

            try:
                item = WordPressItem(self.itemarg, rssxml=rssitem)
            except Exception:
                # Exception rather than a bare except, so that
                # KeyboardInterrupt and SystemExit are not swallowed.
                print('WordpressHarvester: Could not create WordPressItem')
                continue

            done = self._addandcheckfunc(item)

            if (done):
                break

        self.newestId = newestId
Exemplo n.º 3
0
    def update(self):
        """Look for new blog posts.

        Items of the first channel in the feed are visited in feed order.
        Reading stops at the item whose publication date equals
        self.newestId, or as soon as self._addandcheckfunc returns a true
        value. Afterwards self.newestId is set to the publication date of
        the first item in the feed.

        Returns None. An IOError while opening self._url and an
        xml.dom.DOMException while parsing are reported with print and end
        the method.

        Raises
        Exception -- if the feed contains no channel element

        """
        try:
            blogobj = urllib.urlopen(self._url)
        except IOError:
            print('Error: could not read url ' + self._url)
            return

        try:
            rssdoc = parse(blogobj)
        except xml.dom.DOMException:
            print('Error: Could not read RSS')
            return
        finally:
            blogobj.close()

        nodes = rssdoc.getElementsByTagName(el_channel)

        if(nodes.length <= 0):
            raise Exception('No channel found in rss feed')

        chnode = nodes[0]
        nodes = chnode.getElementsByTagName(el_rssitem)
        newestId = getTextNodeValue(nodes.item(0), el_pubdate)

        if(self.newestId == newestId):
            print('There are no new items; newest item @ ' + self.newestId)
            return

        for rssitem in nodes:
            pubdate = getTextNodeValue(rssitem, el_pubdate)

            # Everything from here on was seen during an earlier update
            if(pubdate == self.newestId):
                break

            try:
                item = WordPressItem(self.itemarg, rssxml = rssitem)
            except Exception:
                # Exception rather than a bare except, so that
                # KeyboardInterrupt and SystemExit are not swallowed.
                print('WordpressHarvester: Could not create WordPressItem')
                continue

            done = self._addandcheckfunc(item)

            if(done):
                break

        self.newestId = newestId
Exemplo n.º 4
0
    def _tryGetBookInfo(self, isbn, library):
        """Try to fetch book information for an ISBN from the search feed.

        Arguments
        isbn -- inserted between self._searchprefix and self._searchsuffix
                to form the search url
        library -- passed on unchanged to harvestBookInfo

        Returns True as soon as harvestBookInfo succeeds for one of the
        result links; the shelf is then selected from data.shelves and
        self.section is set from data.section. Returns False if the url
        could not be read, the RSS could not be parsed, or no result link
        could be harvested.

        """
        url = self._searchprefix + isbn + self._searchsuffix

        try:
            rssobj = urllib.urlopen(url)
        except IOError:
            print('Error: Could not read url ' + url)
            return False

        try:
            rssdoc = parse(rssobj)
        except xml.dom.DOMException:
            print('Error: Could not read RSS')
            return False
        finally:
            rssobj.close()

        nodes = rssdoc.getElementsByTagName(el_item)

        # A length is never negative; the empty result is the case to catch
        if(nodes.length <= 0):
            return False

        for i in nodes:
            link = getTextNodeValue(i, el_link)

            try:
                data = harvestBookInfo(link, library)
            except Exception as e:
                # Not every exception carries a 'value' attribute; fall back
                # to the exception itself so that reporting cannot raise.
                reason = getattr(e, 'value', e)
                print('BlogPostItem ' + self._rawtitle + ':\n  ' + str(reason))
                continue

            self._selectShelf(data.shelves)
            self.section = data.section
            return True

        return False
Exemplo n.º 5
0
 def __init__(self, entrynode):
     """Create BlogPostData from a feed entry

     Argument
     entrynode -- XML node the id, title, content and category terms are
                  read from

     """
     self.subjects = []

     self.id = getTextNodeValue(entrynode, el_id)
     self.title = getTextNodeValue(entrynode, el_title)
     self.content = getTextNodeValue(entrynode, el_content)

     # Every category element contributes the value of its term attribute
     for category in entrynode.getElementsByTagName(el_category):
         term = category.attributes[attr_term]
         self.subjects.append(term.value)
Exemplo n.º 6
0
 def _loadfromcache(self, xmlnode):
     """Restore the Item from a cache node

     Argument
     xmlnode -- XML node holding the title and uid attributes and the raw
                text element of this item

     """
     title_attr = xmlnode.attributes[attr_title]
     self._rawtitle = title_attr.value

     uid_attr = xmlnode.attributes[attr_uid]
     self.uid = uid_attr.value

     self._rawtext = getTextNodeValue(xmlnode, el_rawtext)
Exemplo n.º 7
0
        Arguments
        dir -- the cache directory
        dims -- tuple containing normal width and height of the image
        smalldims -- tuple containing small width and height of the image
        cachexml -- if not None the WordPressItem will be loaded from this cache node
        rssxml -- if not None the WordPressItem will be loaded from this RSS XML element
        
        """
        ImageItem.__init__(self, dims, smalldims)

        if (cachexml is not None):
            self._loadfromcache(cachexml, dir)
            self._setId()
            self._loadimage(dims, smalldims)
        elif (rssxml is not None):
            title = getTextNodeValue(rssxml, el_title)
            self._rawtitle = title
            self._rawtext = getCDataNodeValue(rssxml, el_rsscontent)
            self._setUid()
            self._setId()
            self._imagename = ''

        self._formattext()

        if (rssxml is not None):
            if (self._formattedtext is not None):
                imagesrc = self._formattedtext.getFirstImage()

                if (imagesrc is not None):
                    #TBD This is not good
                    imagesrc = imagesrc.split('?')[0]
Exemplo n.º 8
0
        Arguments
        dir -- the cache directory
        dims -- tuple containing normal width and height of the image
        smalldims -- tuple containing small width and height of the image
        cachexml -- if not None the WordPressItem will be loaded from this cache node
        rssxml -- if not None the WordPressItem will be loaded from this RSS XML element
        
        """
        ImageItem.__init__(self, dims, smalldims)

        if(cachexml is not None):
            self._loadfromcache(cachexml, dir)
            self._setId()
            self._loadimage(dims, smalldims)
        elif(rssxml is not None):
            title = getTextNodeValue(rssxml, el_title)
            self._rawtitle = title
            self._rawtext = getCDataNodeValue(rssxml, el_rsscontent)
            self._setUid()
            self._setId()
            self._imagename = ''
            
        self._formattext()
        
        if(rssxml is not None):
            if(self._formattedtext is not None):
                imagesrc = self._formattedtext.getFirstImage()
                
                if(imagesrc is not None):
                    #TBD This is not good
                    imagesrc = imagesrc.split('?')[0]
Exemplo n.º 9
0
    def _readRssChannel(self, url):
        """Extract books from an OPAC RSS channel until no more books need
        to be extracted

        Argument
        url -- url to the RSS channel

        Items of the first channel are visited in feed order. Reading stops
        at the item whose id equals self.newestId, or as soon as
        self._addandcheckfunc returns a true value. Only items whose
        library, as returned by self._getlibrary for the item description,
        equals self._library are turned into items. Unless the first item
        was already known, self.newestId is set to the id of the first item
        afterwards.

        Returns None. An IOError while opening the url and an
        xml.dom.DOMException while parsing are reported with print and end
        the method.

        Raises
        Exception -- if the feed contains no channel element

        """
        #Get rss data
        try:
            rssobj = urllib.urlopen(url)
        except IOError:
            print('Error: Could not read url ' + url)
            return

        try:
            rssdoc = parse(rssobj)
        except xml.dom.DOMException:
            print('Error: Could not read RSS')
            return
        finally:
            rssobj.close()

        nodes = rssdoc.getElementsByTagName(el_channel)

        # '<= 0', not '< 0': a length is never negative, so the old check
        # let an empty list through to nodes[0].
        if(nodes.length <= 0):
            raise Exception('No channel found in rss feed')

        chnode = nodes[0]
        nodes = chnode.getElementsByTagName(el_rssitem)

        if(nodes.length <= 0):
            return

        newestId = self._getId(nodes.item(0))

        if(newestId == self.newestId):
            print(self.id + ': No more new items')
            return

        #Extract items from rss data
        for i in nodes:
            if(i.nodeType != i.ELEMENT_NODE):
                continue

            #Check if the element is new
            if(self._getId(i) == self.newestId):
                print(self.id + ': No more new items')
                break

            #Check if the element comes from the correct library
            desc = getCDataNodeValue(i, el_rssdesc)

            if(self._getlibrary(desc) != self._library):
                continue

            link = getTextNodeValue(i, el_link)

            try:
                item = self._itemclass(self.itemarg, url = link.strip())
            except Exception:
                # Exception rather than a bare except, so that
                # KeyboardInterrupt and SystemExit are not swallowed.
                print('VarbergOpacHarvester: Could not create OpacBookItem')
                continue

            done = self._addandcheckfunc(item)

            if(done):
                # Double quotes are needed here: 'don''t' is two adjacent
                # literals and would print "dont".
                print(self.id + ": I don't need to read more items\n")
                break

        self.newestId = newestId
Exemplo n.º 10
0
    def _readRssChannel(self, url):
        """Extract books from an OPAC RSS channel until no more books need
        to be extracted

        Argument
        url -- url to the RSS channel

        Items of the first channel are visited in feed order. Reading stops
        at the item whose id equals self.newestId, or as soon as
        self._addandcheckfunc returns a true value. Only items whose
        library, as returned by self._getlibrary for the item description,
        equals self._library are turned into items. Unless the first item
        was already known, self.newestId is set to the id of the first item
        afterwards.

        Returns None. An IOError while opening the url and an
        xml.dom.DOMException while parsing are reported with print and end
        the method.

        Raises
        Exception -- if the feed contains no channel element

        """
        #Get rss data
        try:
            rssobj = urllib.urlopen(url)
        except IOError:
            print('Error: Could not read url ' + url)
            return

        try:
            rssdoc = parse(rssobj)
        except xml.dom.DOMException:
            print('Error: Could not read RSS')
            return
        finally:
            rssobj.close()

        nodes = rssdoc.getElementsByTagName(el_channel)

        # '<= 0', not '< 0': a length is never negative, so the old check
        # let an empty list through to nodes[0].
        if (nodes.length <= 0):
            raise Exception('No channel found in rss feed')

        chnode = nodes[0]
        nodes = chnode.getElementsByTagName(el_rssitem)

        if (nodes.length <= 0):
            return

        newestId = self._getId(nodes.item(0))

        if (newestId == self.newestId):
            print(self.id + ': No more new items')
            return

        #Extract items from rss data
        for i in nodes:
            if (i.nodeType != i.ELEMENT_NODE):
                continue

            #Check if the element is new
            if (self._getId(i) == self.newestId):
                print(self.id + ': No more new items')
                break

            #Check if the element comes from the correct library
            desc = getCDataNodeValue(i, el_rssdesc)

            if (self._getlibrary(desc) != self._library):
                continue

            link = getTextNodeValue(i, el_link)

            try:
                item = self._itemclass(self.itemarg, url=link.strip())
            except Exception:
                # Exception rather than a bare except, so that
                # KeyboardInterrupt and SystemExit are not swallowed.
                print('VarbergOpacHarvester: Could not create OpacBookItem')
                continue

            done = self._addandcheckfunc(item)

            if (done):
                # Double quotes are needed here: the former 'I don' 't need'
                # pair of literals printed "dont".
                print(self.id + ": I don't need to read more items\n")
                break

        self.newestId = newestId