def _findISBN(self, rawtext):
    """Find an ISBN in a text.

    The text is passed through filterHtml and searched for the last
    occurrence of _isbnkey. Whitespace and characters found in
    _isbnseparators directly after the key are skipped; the digits that
    follow (optionally interleaved with whitespace) make up the ISBN.

    Argument
    rawtext -- the text to search for ISBN

    Returns the collected digits as a string (possibly empty), or ''
    when the key is absent or when nothing but separators follow the
    key. The latter case is also reported with a printed message.
    """
    text = filterHtml(rawtext)
    keypos = text.rfind(_isbnkey)
    if keypos < 0:
        return ''

    pos = keypos + len(_isbnkey)
    end = len(text)

    # Skip whatever separates the key from the number itself.
    while pos < end and (text[pos].isspace() or text[pos] in _isbnseparators):
        pos += 1
    if pos >= end:
        # The text ended before any ISBN character was reached.
        print('Illegal format in ISBN in blog post ' + self._rawtitle)
        return ''

    # Collect digits; whitespace inside the number is tolerated and dropped.
    digits = []
    while pos < end and (text[pos].isdigit() or text[pos].isspace()):
        if text[pos].isdigit():
            digits.append(text[pos])
        pos += 1
    return ''.join(digits)
def _setUid(self):
    """Derive the unique identifier of this blog post and store it in self.uid.

    The raw text is passed through filterHtml and searched with
    _yearRegexp. The uid is the filtered text from its beginning up to
    and including the first _yearDigits characters of the first
    occurrence of the matched year string.

    Raises Exception if _yearRegexp does not match the filtered text.
    """
    plain = filterHtml(self._rawtext)
    match = re.search(_yearRegexp, plain)
    if not match:
        raise Exception('Unable to parse year')
    # Cut right after the year: position of the matched string plus its width.
    cutoff = plain.find(match.group(0)) + _yearDigits
    self.uid = plain[:cutoff]
def _formattext(self):
    """Format the texts of this Item.

    Sets self.title to the raw title passed through
    replaceEscapeSequences and tries to build a FormattedText from the
    raw text. If that fails, self._formattedtext is set to None and
    self._strippedtext is set to the raw text passed through filterHtml.
    Progress markers are printed before and after the attempt.
    """
    self.title = replaceEscapeSequences(self._rawtitle)
    print('START ' + self.title)
    try:
        self._formattedtext = FormattedText(self._rawtext)
    except Exception:
        # Best-effort fallback to plain text. Only ordinary errors are
        # handled here so that KeyboardInterrupt and SystemExit still
        # propagate instead of being silently swallowed.
        self._formattedtext = None
        self._strippedtext = filterHtml(self._rawtext)
        print('FELSLUT ' + self.title)
    else:
        print('S**T ' + self.title)
def _setUid(self):
    """Compute the unique identifier for this blog post into self.uid.

    Searches the filterHtml-processed raw text with _yearRegexp and
    keeps everything from the start of that text through the first
    _yearDigits characters of the first occurrence of the matched year
    string.

    Raises Exception when no year can be found.
    """
    stripped = filterHtml(self._rawtext)
    found = re.search(_yearRegexp, stripped)
    if found is None:
        raise Exception('Unable to parse year')
    yeartext = found.group(0)
    # The uid ends where the year ends.
    self.uid = stripped[:stripped.find(yeartext) + _yearDigits]
def getPlainText(self):
    """Give back the raw text of this Item after passing it through filterHtml."""
    plain = filterHtml(self._rawtext)
    return plain
def __init__(self, content, reqlibrary=None):
    """Parse the information about the book.

    Optionally the book must be listed at library reqlibrary or else a
    CouldNotReadBookError will be thrown.

    Arguments
    content -- XML string describing the book
    reqlibrary -- if specified, a book will only be parsed if it is
                  listed at this library

    Sets the attributes rawtext, author, title, isbn, shelves, section,
    available and subjects.

    Raises CouldNotReadBookError if content holds no description, or if
    no holding matching reqlibrary is found.
    """
    # Extracting the description text
    descpos = content.find("ctl00_lblDescr")
    if descpos == -1:
        raise CouldNotReadBookError(
            'Can\'t read book from Libra: No description found')
    descend = content.find("</span>", descpos)
    # NOTE(review): offset 16 presumably skips the 14-character marker
    # plus the two characters closing the opening tag -- confirm.
    self.rawtext = content[descpos + 16:descend] if descend != -1 else ""

    # Extracting author metadata
    author = ""
    authorpos = content.find("/ff")
    if authorpos != -1:
        # NOTE(review): offset 5 presumably skips the remainder of the
        # link's opening tag -- confirm against the page markup.
        authorstart = authorpos + 5
        authorend = content.find("</a>", authorstart)
        if authorend != -1:
            author = content[authorstart:authorend]
    self.author = author

    # Extracting the title
    title = ""
    titlepos = content.find("<td>Titel:</td>")
    if titlepos != -1:
        cellstart = titlepos + 15  # length of the label cell itself
        cellend = content.find("</td>", cellstart)
        cell = content[cellstart:cellend] if cellend != -1 else ""
        # Drop the opening tag of the value cell, then trailing slashes
        # and spaces. rstrip is used because indexing title[-1] in a
        # loop raises IndexError once the title is (or becomes) empty.
        title = cell[cell.find(">") + 1:].rstrip("/ ")
    self.title = title

    # Saving the ISBN of the title in question, taken from the query
    # string of the cover image URL.
    imageUrl = ""
    coverpos = content.find("ctl00_imgCover")
    if coverpos != -1:
        # NOTE(review): offset 21 presumably skips the marker and the
        # attribute text in front of the URL -- confirm.
        urlstart = coverpos + 21
        urlend = content.find('"', urlstart)
        if urlend != -1:
            imageUrl = content[urlstart:urlend]
    isbn = ""
    isbnpos = imageUrl.find("isbn=")
    if isbnpos != -1:
        # partition keeps the whole value when "isbn=" is the last
        # parameter; slicing with find()'s -1 would yield ''.
        isbn = imageUrl[isbnpos + 5:].partition("&")[0]
    self.isbn = isbn

    # Get library specific data
    branchpos = content.find("lblBranchDepartment")
    finalstop = content.find("</table>", branchpos)
    found = False
    self.shelves = []
    while 0 <= branchpos < finalstop:
        # Get library: the text in front of the first delimiter.
        # NOTE(review): the use of rfind(';') below suggests the
        # delimiter may originally have been an HTML entity -- confirm
        # the literal.
        libandsection = _getTextTag(content, branchpos)
        library = libandsection.partition(' ')[0]
        cursor = branchpos
        if reqlibrary is None or library == reqlibrary:
            # Get section
            self.section = libandsection[libandsection.rfind(';') + 1:]
            # Get shelf
            shelfpos = content.find("lblShelf", branchpos)
            self.shelves.append(filterHtml(_getTextTag(content, shelfpos)))
            # Get availability
            availpos = content.find("lblAvailable", shelfpos)
            self.available = _getTextTag(content, availpos)
            found = True
            # Never move the cursor backwards: a failed lookup (-1)
            # would otherwise restart the scan from the top and loop
            # forever.
            cursor = max(branchpos, shelfpos, availpos)
        branchpos = content.find("lblBranchDepartment", cursor + 1)
    if not found:
        if reqlibrary is None:
            # Concatenating None to the message would raise TypeError.
            raise CouldNotReadBookError('Book not present at any library')
        raise CouldNotReadBookError('Book not present at library ' +
                                    reqlibrary)

    # Extract the books subjects: the text of every link inside the
    # table cell following the subject heading.
    self.subjects = []
    cursor = content.find(u"Ämnesord")
    cursor = content.find("<td", cursor)
    finalstop = content.find("</td>", cursor)
    cursor = content.find("<a", cursor)
    while 0 <= cursor < finalstop:
        textstart = content.find(">", cursor)
        textend = content.find("<", textstart)
        subject = content[textstart + 1:textend]
        self.subjects.append(subject.strip(subjectstrip))
        cursor = content.find("<a", textend)
def __init__(self, content, reqlibrary=None):
    """Parse the information about the book.

    Optionally the book must be listed at library reqlibrary or else a
    CouldNotReadBookError will be thrown.

    Arguments
    content -- XML string describing the book
    reqlibrary -- if specified, a book will only be parsed if it is
                  listed at this library

    Sets the attributes rawtext, author, title, isbn, shelves, section,
    available and subjects.

    Raises CouldNotReadBookError if content holds no description, or if
    no holding matching reqlibrary is found.
    """
    # Extracting the description text
    descpos = content.find("ctl00_lblDescr")
    if descpos == -1:
        raise CouldNotReadBookError(
            'Can\'t read book from Libra: No description found')
    descend = content.find("</span>", descpos)
    # NOTE(review): offset 16 presumably skips the 14-character marker
    # plus the two characters closing the opening tag -- confirm.
    self.rawtext = content[descpos + 16:descend] if descend != -1 else ""

    # Extracting author metadata
    author = ""
    authorpos = content.find("/ff")
    if authorpos != -1:
        # NOTE(review): offset 5 presumably skips the remainder of the
        # link's opening tag -- confirm against the page markup.
        authorstart = authorpos + 5
        authorend = content.find("</a>", authorstart)
        if authorend != -1:
            author = content[authorstart:authorend]
    self.author = author

    # Extracting the title
    title = ""
    titlepos = content.find("<td>Titel:</td>")
    if titlepos != -1:
        cellstart = titlepos + 15  # length of the label cell itself
        cellend = content.find("</td>", cellstart)
        cell = content[cellstart:cellend] if cellend != -1 else ""
        # Drop the opening tag of the value cell, then trailing slashes
        # and spaces. rstrip is used because indexing title[-1] in a
        # loop raises IndexError once the title is (or becomes) empty.
        title = cell[cell.find(">") + 1:].rstrip("/ ")
    self.title = title

    # Saving the ISBN of the title in question, taken from the query
    # string of the cover image URL.
    imageUrl = ""
    coverpos = content.find("ctl00_imgCover")
    if coverpos != -1:
        # NOTE(review): offset 21 presumably skips the marker and the
        # attribute text in front of the URL -- confirm.
        urlstart = coverpos + 21
        urlend = content.find('"', urlstart)
        if urlend != -1:
            imageUrl = content[urlstart:urlend]
    isbn = ""
    isbnpos = imageUrl.find("isbn=")
    if isbnpos != -1:
        # partition keeps the whole value when "isbn=" is the last
        # parameter; slicing with find()'s -1 would yield ''.
        isbn = imageUrl[isbnpos + 5:].partition("&")[0]
    self.isbn = isbn

    # Get library specific data
    branchpos = content.find("lblBranchDepartment")
    finalstop = content.find("</table>", branchpos)
    found = False
    self.shelves = []
    while 0 <= branchpos < finalstop:
        # Get library: the text in front of the first delimiter.
        # NOTE(review): the use of rfind(';') below suggests the
        # delimiter may originally have been an HTML entity -- confirm
        # the literal.
        libandsection = _getTextTag(content, branchpos)
        library = libandsection.partition(' ')[0]
        cursor = branchpos
        if reqlibrary is None or library == reqlibrary:
            # Get section
            self.section = libandsection[libandsection.rfind(';') + 1:]
            # Get shelf
            shelfpos = content.find("lblShelf", branchpos)
            self.shelves.append(filterHtml(_getTextTag(content, shelfpos)))
            # Get availability
            availpos = content.find("lblAvailable", shelfpos)
            self.available = _getTextTag(content, availpos)
            found = True
            # Never move the cursor backwards: a failed lookup (-1)
            # would otherwise restart the scan from the top and loop
            # forever.
            cursor = max(branchpos, shelfpos, availpos)
        branchpos = content.find("lblBranchDepartment", cursor + 1)
    if not found:
        if reqlibrary is None:
            # Concatenating None to the message would raise TypeError.
            raise CouldNotReadBookError('Book not present at any library')
        raise CouldNotReadBookError('Book not present at library ' +
                                    reqlibrary)

    # Extract the books subjects: the text of every link inside the
    # table cell following the subject heading.
    self.subjects = []
    cursor = content.find(u"Ämnesord")
    cursor = content.find("<td", cursor)
    finalstop = content.find("</td>", cursor)
    cursor = content.find("<a", cursor)
    while 0 <= cursor < finalstop:
        textstart = content.find(">", cursor)
        textend = content.find("<", textstart)
        subject = content[textstart + 1:textend]
        self.subjects.append(subject.strip(subjectstrip))
        cursor = content.find("<a", textend)