Python filterHtml示例，formattedtext.filterHtml Python示例

示例#1

0

显示文件

文件： blogspotharvester.py 项目： d98mp/digitalaskyltar

    def _findISBN(self, rawtext):
        """Find an ISBN in a text.

        The text is first passed through filterHtml. The search starts right
        after the last occurrence of _isbnkey: whitespace and characters in
        _isbnseparators are skipped, then digits are collected (whitespace
        between them is ignored) until any other character or the end of the
        text is reached.

        Argument
        rawtext -- the text to search for ISBN

        Returns the collected digits as a string, or '' if the key is not
        present or nothing but whitespace/separators follows it (in the
        latter case a message is also printed).

        """
        text = filterHtml(rawtext)
        keyindex = text.rfind(_isbnkey)

        if keyindex < 0:
            return ''

        index = keyindex + len(_isbnkey)

        try:
            # Deliberately not bounds-checked: running off the end of the
            # text means no digits follow the key, which is reported below.
            while text[index].isspace() or text[index] in _isbnseparators:
                index += 1
        except IndexError:
            print('Illegal format in ISBN in blog post ' + self._rawtitle)
            return ''

        digits = []
        while index < len(text) and (text[index].isdigit() or text[index].isspace()):
            if text[index].isdigit():
                digits.append(text[index])
            index += 1

        return ''.join(digits)

示例#2

0

显示文件

文件： wpharvester.py 项目： d98mp/digitalaskyltar

    def _setUid(self):
        """Derive the unique identifier (uid) of this blog post.

        The result of filterHtml(self._rawtext) is searched with _yearRegexp.
        The uid is that text from its beginning up to _yearDigits characters
        past the first occurrence of the matched string.

        Raises Exception if _yearRegexp does not match.

        """
        plain = filterHtml(self._rawtext)
        match = re.search(_yearRegexp, plain)

        if match is None:
            raise Exception('Unable to parse year')

        # Position of the first occurrence of the matched year string.
        yearstart = plain.find(match.group(0))
        self.uid = plain[:yearstart + _yearDigits]

示例#3

0

显示文件

文件： item.py 项目： d98mp/digitalaskyltar

 def _formattext(self):
     """Format the texts of this Item.

     Sets self.title to replaceEscapeSequences(self._rawtitle) and tries to
     build a FormattedText from self._rawtext. If that fails,
     self._formattedtext is set to None and self._strippedtext is set to
     filterHtml(self._rawtext) as a fallback. Progress is printed to stdout.

     """
     self.title = replaceEscapeSequences(self._rawtitle)

     print('START '  + self.title)

     try:
         self._formattedtext = FormattedText(self._rawtext)
     except Exception:
         # Best-effort: any formatting failure falls back to the filtered
         # text. Exception (rather than a bare except) lets
         # KeyboardInterrupt and SystemExit propagate.
         self._formattedtext = None
         self._strippedtext = filterHtml(self._rawtext)
         print('FELSLUT '  + self.title)
     else:
         print('S**T '  + self.title)

示例#4

0

显示文件

文件： wpharvester.py 项目： d98mp/digitalaskyltar

    def _setUid(self):
        """Derive the unique identifier (uid) of this blog post.

        The result of filterHtml(self._rawtext) is searched with _yearRegexp.
        The uid is that text from its beginning up to _yearDigits characters
        past the first occurrence of the matched string.

        Raises Exception if _yearRegexp does not match.

        """
        plain = filterHtml(self._rawtext)
        match = re.search(_yearRegexp, plain)

        if match is None:
            raise Exception('Unable to parse year')

        # Position of the first occurrence of the matched year string.
        yearstart = plain.find(match.group(0))
        self.uid = plain[:yearstart + _yearDigits]

示例#5

0

显示文件

文件： item.py 项目： d98mp/digitalaskyltar

 def getPlainText(self):
     """Return this Item's raw text after it has been passed through filterHtml."""
     plaintext = filterHtml(self._rawtext)
     return plaintext

示例#6

0

显示文件

文件： itemharvester.py 项目： d98mp/digitalaskyltar

    def __init__(self, content, reqlibrary=None):
        """Parse the information about the book.

        Optionally the book must be available at library reqlibrary, or else
        a CouldNotReadBookError is raised.

        Arguments
        content -- markup string describing the book; it is searched for
                   literal markers such as "ctl00_lblDescr" and
                   "<td>Titel:</td>"
        reqlibrary -- if specified, only holdings whose library name equals
                      this value are read

        Attributes set
        rawtext, author, title, isbn -- strings sliced out of content
        shelves -- list with one filterHtml-processed shelf text per
                   accepted holding
        section, available -- taken from the last accepted holding
        subjects -- list of subject strings

        Raises
        CouldNotReadBookError -- if no description marker is found, or if
                                 no holding is accepted

        """
        # Extracting the description text
        startindex = content.find("ctl00_lblDescr")
        if startindex == -1:
            raise CouldNotReadBookError(
                'Can\'t read book from Libra: No description found')

        # stopindex is relative to startindex. The offset 16 is the
        # 14-character marker plus two more characters (presumably the
        # closing '">' of the tag -- confirm against a real page).
        stopindex = content[startindex:].find("</span>")
        self.rawtext = content[startindex + 16:startindex + stopindex]

        # Extracting author metadata
        startindex = content.find("/ff")
        if startindex != -1:
            # NOTE(review): the offset 5 presumably skips the rest of the
            # link markup after "/ff" -- confirm against a real page.
            stopindex = content[startindex + 5:].find("</a>")
            self.author = content[startindex + 5:startindex + stopindex + 5]
        else:
            self.author = ""

        # Extracting the title - upgrade with regex
        startindex = content.find("<td>Titel:</td>")
        if startindex != -1:
            # 15 == len("<td>Titel:</td>")
            stopindex = content[startindex + 15:].find("</td>")
            title = content[startindex + 15:startindex + stopindex + 15]
            # Drop everything up to and including the first '>'.
            title = title[title.find(">") + 1:]
            # Remove trailing '/' and ' '. rstrip is safe on an empty
            # title, where indexing title[-1] would raise IndexError.
            title = title.rstrip('/ ')
        else:
            title = ""

        self.title = title

        # Saving the ISBN of the title in question
        # NOTE(review): neither marker is checked for -1 here, so a page
        # without a cover image or without an "isbn=...&" parameter yields
        # an arbitrary or empty slice. The offset 21 presumably skips
        # 'ctl00_imgCover" src="' -- confirm against a real page.
        startindex = content.find("ctl00_imgCover")
        stopindex = content[startindex + 21:].find('"')
        imageUrl = content[startindex + 21:stopindex + startindex + 21]

        startindex = imageUrl.find("isbn=")
        stopindex = imageUrl[startindex + 5:].find("&")
        self.isbn = imageUrl[startindex + 5:startindex + 5 + stopindex]

        #Get library specific data
        startindex = content.find("lblBranchDepartment")
        finalstop = content.find("</table>", startindex)

        found = False
        self.shelves = []

        while 0 <= startindex < finalstop:
            rowstart = startindex

            #Get library
            libandsection = _getTextTag(content, startindex)
            delimiter = libandsection.find('&nbsp')
            library = libandsection[:delimiter]

            if reqlibrary is None or library == reqlibrary:
                #Get section
                delimiter = libandsection.rfind(';')
                self.section = libandsection[delimiter + 1:]

                #Get shelf
                startindex = content.find("lblShelf", startindex)
                shelf = _getTextTag(content, startindex)
                self.shelves.append(filterHtml(shelf))

                #Get availability
                startindex = content.find("lblAvailable", startindex)
                self.available = _getTextTag(content, startindex)
                found = True

            # Continue after the furthest position reached. If a lookup
            # above failed (startindex == -1), fall back to the start of
            # this row so the scan always moves forward instead of
            # restarting from the top of content and looping forever.
            startindex = content.find("lblBranchDepartment",
                                      max(startindex, rowstart) + 1)

        if not found:
            if reqlibrary is None:
                # Concatenating None would raise TypeError and hide the
                # real problem from callers.
                raise CouldNotReadBookError('Book not present at any library')
            raise CouldNotReadBookError('Book not present at library ' +
                                        reqlibrary)

        # Extract the books subjects
        self.subjects = []

        startindex = content.find(u"Ämnesord")
        startindex = content.find("<td", startindex)
        finalstop = content.find("</td>", startindex)
        startindex = content.find("<a", startindex)

        # Each subject is the text between the '>' closing an "<a" tag and
        # the next '<', inside the table cell found above.
        while 0 <= startindex < finalstop:
            startindex = content.find(">", startindex)
            stopindex = content.find("<", startindex)
            subject = content[startindex + 1:stopindex]
            self.subjects.append(subject.strip(subjectstrip))
            startindex = content.find("<a", stopindex)

示例#7

0

显示文件

文件： itemharvester.py 项目： d98mp/digitalaskyltar

    def __init__(self, content, reqlibrary=None):
        """Parse the information about the book.

        Optionally the book must be available at library reqlibrary, or else
        a CouldNotReadBookError is raised.

        Arguments
        content -- markup string describing the book; it is searched for
                   literal markers such as "ctl00_lblDescr" and
                   "<td>Titel:</td>"
        reqlibrary -- if specified, only holdings whose library name equals
                      this value are read

        Attributes set
        rawtext, author, title, isbn -- strings sliced out of content
        shelves -- list with one filterHtml-processed shelf text per
                   accepted holding
        section, available -- taken from the last accepted holding
        subjects -- list of subject strings

        Raises
        CouldNotReadBookError -- if no description marker is found, or if
                                 no holding is accepted

        """
        # Extracting the description text
        startindex = content.find("ctl00_lblDescr")
        if startindex == -1:
            raise CouldNotReadBookError(
                'Can\'t read book from Libra: No description found')

        # stopindex is relative to startindex. The offset 16 is the
        # 14-character marker plus two more characters (presumably the
        # closing '">' of the tag -- confirm against a real page).
        stopindex = content[startindex:].find("</span>")
        self.rawtext = content[startindex + 16:startindex + stopindex]

        # Extracting author metadata
        startindex = content.find("/ff")
        if startindex != -1:
            # NOTE(review): the offset 5 presumably skips the rest of the
            # link markup after "/ff" -- confirm against a real page.
            stopindex = content[startindex + 5:].find("</a>")
            self.author = content[startindex + 5:startindex + stopindex + 5]
        else:
            self.author = ""

        # Extracting the title - upgrade with regex
        startindex = content.find("<td>Titel:</td>")
        if startindex != -1:
            # 15 == len("<td>Titel:</td>")
            stopindex = content[startindex + 15:].find("</td>")
            title = content[startindex + 15:startindex + stopindex + 15]
            # Drop everything up to and including the first '>'.
            title = title[title.find(">") + 1:]
            # Remove trailing '/' and ' '. rstrip is safe on an empty
            # title, where indexing title[-1] would raise IndexError.
            title = title.rstrip('/ ')
        else:
            title = ""

        self.title = title

        # Saving the ISBN of the title in question
        # NOTE(review): neither marker is checked for -1 here, so a page
        # without a cover image or without an "isbn=...&" parameter yields
        # an arbitrary or empty slice. The offset 21 presumably skips
        # 'ctl00_imgCover" src="' -- confirm against a real page.
        startindex = content.find("ctl00_imgCover")
        stopindex = content[startindex + 21:].find('"')
        imageUrl = content[startindex + 21:stopindex + startindex + 21]

        startindex = imageUrl.find("isbn=")
        stopindex = imageUrl[startindex + 5:].find("&")
        self.isbn = imageUrl[startindex + 5:startindex + 5 + stopindex]

        #Get library specific data
        startindex = content.find("lblBranchDepartment")
        finalstop = content.find("</table>", startindex)

        found = False
        self.shelves = []

        while 0 <= startindex < finalstop:
            rowstart = startindex

            #Get library
            libandsection = _getTextTag(content, startindex)
            delimiter = libandsection.find('&nbsp')
            library = libandsection[:delimiter]

            if reqlibrary is None or library == reqlibrary:
                #Get section
                delimiter = libandsection.rfind(';')
                self.section = libandsection[delimiter + 1:]

                #Get shelf
                startindex = content.find("lblShelf", startindex)
                shelf = _getTextTag(content, startindex)
                self.shelves.append(filterHtml(shelf))

                #Get availability
                startindex = content.find("lblAvailable", startindex)
                self.available = _getTextTag(content, startindex)
                found = True

            # Continue after the furthest position reached. If a lookup
            # above failed (startindex == -1), fall back to the start of
            # this row so the scan always moves forward instead of
            # restarting from the top of content and looping forever.
            startindex = content.find("lblBranchDepartment",
                                      max(startindex, rowstart) + 1)

        if not found:
            if reqlibrary is None:
                # Concatenating None would raise TypeError and hide the
                # real problem from callers.
                raise CouldNotReadBookError('Book not present at any library')
            raise CouldNotReadBookError('Book not present at library ' +
                                        reqlibrary)

        # Extract the books subjects
        self.subjects = []

        startindex = content.find(u"Ämnesord")
        startindex = content.find("<td", startindex)
        finalstop = content.find("</td>", startindex)
        startindex = content.find("<a", startindex)

        # Each subject is the text between the '>' closing an "<a" tag and
        # the next '<', inside the table cell found above.
        while 0 <= startindex < finalstop:
            startindex = content.find(">", startindex)
            stopindex = content.find("<", startindex)
            subject = content[startindex + 1:stopindex]
            self.subjects.append(subject.strip(subjectstrip))
            startindex = content.find("<a", stopindex)