def get_book_info(fname):
    """Extract basic metadata from an ebook or OPF metadata file.

    Only handles epub, mobi and opf for now.  The extension is the text
    after the last '.' in fname and is matched case-sensitively.

    Returns a dict that may hold 'creator', 'title', 'language' and
    'identifier' (an ISBN).  An empty dict is returned when fname has no
    extension, the extension is not supported, the mobi file cannot be
    parsed, or the relevant XML has nothing to read.  Errors raised while
    opening or parsing an epub/opf file propagate to the caller.
    """
    res = {}
    if '.' not in fname:
        return res
    extn = fname.rsplit('.', 1)[1]

    if extn == "mobi":
        try:
            book = Mobi(fname)
            book.parse()
        except Exception:
            # best effort: an unreadable mobi simply yields no metadata
            return res
        res['creator'] = book.author()
        res['title'] = book.title()
        res['language'] = book.language()
        res['identifier'] = book.isbn()
        return res

    # PDFs are deliberately not handled: none of the pdfs in the author's
    # library had language or isbn, and most had no author or the wrong
    # author (set to the publisher, or to the software used).

    if extn == "epub":
        # the context manager guarantees the archive is closed again
        with zipfile.ZipFile(fname) as zipdata:
            # find the contents metafile
            tree = ElementTree.fromstring(
                zipdata.read('META-INF/container.xml'))
            if not len(tree):
                return res
            cfname = ""
            for node in tree[0]:
                # read the attribute directly instead of parsing str(attrib),
                # which depended on the ordering of the attribute dict
                if 'full-path' in node.attrib:
                    cfname = node.attrib['full-path']
            if not cfname:
                return res
            # grab the metadata block from the contents metafile
            txt = zipdata.read(cfname)
    elif extn == "opf":
        # read bytes so the XML parser honours the declared encoding
        with open(fname, 'rb') as handle:
            txt = handle.read()
    else:
        # same "nothing found" value as every other early exit
        return res

    # repackage the data
    tree = ElementTree.fromstring(txt)
    if not len(tree):
        return res
    for node in tree[0]:
        tag = str(node.tag).lower()
        if '}' in tag:
            # drop the '{namespace}' prefix; plain tags have none
            tag = tag.split('}')[1]
        txt = node.text
        attrib = str(node.attrib).lower()
        if 'title' in tag:
            res['title'] = txt
        elif 'language' in tag:
            res['language'] = txt
        elif 'creator' in tag:
            res['creator'] = txt
        elif 'identifier' in tag and 'isbn' in attrib:
            if formatter.is_valid_isbn(txt):
                # store the validated text itself
                res['identifier'] = txt
    return res
def get_book_info(fname):
    """Extract basic metadata from an ebook or OPF metadata file.

    Only handles epub, mobi and opf for now.  The extension is the text
    after the last '.' in fname and is matched case-sensitively.

    Returns a dict that may hold 'creator', 'title', 'language',
    'identifier' (an ISBN) and 'type' (the format that was read).  An
    empty dict is returned when fname has no extension, the extension is
    not supported, the mobi file cannot be parsed, or the epub container
    has nothing to read.  Errors raised while opening or parsing an
    epub/opf file propagate to the caller.
    """
    res = {}
    if '.' not in fname:
        return res
    extn = fname.rsplit('.', 1)[1]

    if extn == "mobi":
        try:
            book = Mobi(fname)
            book.parse()
        except Exception:
            # best effort: an unreadable mobi simply yields no metadata
            return res
        res['creator'] = book.author()
        res['title'] = book.title()
        res['language'] = book.language()
        res['identifier'] = book.isbn()
        res['type'] = "mobi"
        return res

    # PDFs are deliberately not handled: none of the pdfs in the author's
    # library had language or isbn, and most had no author or the wrong
    # author (set to the publisher, or to the software used).

    if extn == "epub":
        # the context manager guarantees the archive is closed again
        with zipfile.ZipFile(fname) as zipdata:
            # find the contents metafile
            tree = ElementTree.fromstring(
                zipdata.read('META-INF/container.xml'))
            if not len(tree):
                return res
            cfname = ""
            for node in tree[0]:
                if 'full-path' in node.attrib:
                    cfname = node.attrib['full-path']
            if not cfname:
                return res
            # grab the metadata block from the contents metafile
            txt = zipdata.read(cfname)
    elif extn == "opf":
        # read bytes so the XML parser honours the declared encoding
        with open(fname, 'rb') as handle:
            txt = handle.read()
    else:
        # same "nothing found" value as every other early exit
        return res

    tree = ElementTree.fromstring(txt)
    # extn is exactly "epub" or "opf" here; recorded only once the
    # metadata document has parsed
    res['type'] = extn

    # repackage the data
    if not len(tree):
        return res
    for node in tree[0]:
        tag = str(node.tag).lower()
        if '}' in tag:
            # drop the '{namespace}' prefix; plain tags have none
            tag = tag.split('}')[1]
        txt = node.text
        attrib = str(node.attrib).lower()
        if 'title' in tag:
            res['title'] = txt
        elif 'language' in tag:
            res['language'] = txt
        elif 'creator' in tag:
            res['creator'] = txt
        elif 'identifier' in tag and 'isbn' in attrib:
            if formatter.is_valid_isbn(txt):
                res['identifier'] = txt
    return res
def get_book_info(fname):
    """Extract basic metadata from an ebook or OPF metadata file.

    Only handles epub, mobi and opf for now.  The extension comes from
    os.path.splitext and is matched case-sensitively.

    Returns a dict that may hold 'type' (the format, set as soon as the
    extension is recognised), 'creator', 'title', 'language' and
    'identifier' (an ISBN).  An empty dict is returned for a missing or
    unsupported extension.  An unreadable mobi or zip file, or metadata
    that fails to parse, yields whatever was collected so far.
    """
    res = {}
    extn = os.path.splitext(fname)[1]
    if not extn:
        return res

    if extn == ".mobi":
        res['type'] = "mobi"
        try:
            book = Mobi(fname)
            book.parse()
        except Exception:
            # best effort: an unreadable mobi yields only its type
            return res
        res['creator'] = book.author()
        res['title'] = book.title()
        res['language'] = book.language()
        res['identifier'] = book.isbn()
        return res

    # PDFs are deliberately not handled: none of the pdfs in the author's
    # library had language or isbn, and most had no author or the wrong
    # author (set to the publisher, or to the software used).

    if extn == ".epub":
        res['type'] = "epub"
        # prepare to read from the .epub file
        try:
            zipdata = zipfile.ZipFile(fname)
        except Exception:
            return res
        # the context manager guarantees the archive is closed again
        with zipdata:
            # find the contents metafile
            tree = ElementTree.fromstring(
                zipdata.read('META-INF/container.xml'))
            if not len(tree):
                return res
            cfname = ""
            for node in tree[0]:
                if 'full-path' in node.attrib:
                    cfname = node.attrib['full-path']
                    break
            if not cfname:
                # container lists no package document to read
                return res
            # grab the metadata block from the contents metafile
            txt = zipdata.read(cfname)
    elif extn == ".opf":
        res['type'] = "opf"
        with open(fname) as handle:
            txt = handle.read()
        # sanitize any unmatched html tags or ElementTree won't parse
        dic = {'<br>': '', '</br>': ''}
        txt = replace_all(txt, dic)
    else:
        # unsupported extension: nothing to parse, so do not fall through
        # to the parser with no text and log a misleading error
        return res

    # repackage epub or opf metadata
    try:
        tree = ElementTree.fromstring(txt)
    except Exception as e:
        logger.error("Error parsing metadata from %s" % fname)
        logger.error(str(e))
        return res

    if not len(tree):
        return res
    for node in tree[0]:
        tag = str(node.tag).lower()
        if '}' in tag:
            # drop the '{namespace}' prefix
            tag = tag.split('}')[1]
        txt = node.text
        attrib = str(node.attrib).lower()
        if 'title' in tag:
            res['title'] = txt
        elif 'language' in tag:
            res['language'] = txt
        elif 'creator' in tag:
            res['creator'] = txt
        elif 'identifier' in tag and 'isbn' in attrib:
            if is_valid_isbn(txt):
                res['identifier'] = txt
    return res
def get_book_info(fname):
    """Extract basic metadata from an ebook or OPF metadata file.

    Only handles epub, mobi and opf for now.  The extension is the text
    after the last "." in fname (the whole name when there is no ".")
    and is matched case-sensitively.

    Returns a dict that may hold "creator", "title", "language" and
    "identifier", or the empty string "" when the extension is not
    supported.  For epub/opf, "identifier" is present whenever the
    metadata has an identifier element: it holds the ISBN-looking value,
    or "" when no identifier looks like an ISBN.  Errors raised while
    opening or parsing a file propagate to the caller.
    """
    extn = fname.rsplit(".", 1)[-1]

    if extn == "mobi":
        book = Mobi(fname)
        book.parse()
        res = {}
        res["creator"] = book.author()
        res["title"] = book.title()
        res["language"] = book.language()
        res["identifier"] = book.isbn()
        return res

    # PDFs are deliberately not handled: none of the pdfs in the author's
    # library had language or isbn, and most had no author or the wrong
    # author (set to the publisher, or to the software used).

    if extn == "epub":
        # the context manager guarantees the archive is closed again
        with zipfile.ZipFile(fname) as archive:
            # find the contents metafile
            tree = ElementTree.fromstring(
                archive.read("META-INF/container.xml"))
            cfname = ""
            if len(tree):
                for node in tree[0]:
                    # read the attribute directly instead of parsing the
                    # stringified dict, which depended on key ordering
                    if "full-path" in node.attrib:
                        cfname = node.attrib["full-path"]
            if not cfname:
                # container lists no package document: no metadata
                return {}
            # grab the metadata block from the contents metafile
            txt = archive.read(cfname)
    elif extn == "opf":
        # read bytes so the XML parser honours the declared encoding
        with open(fname, "rb") as handle:
            txt = handle.read()
    else:
        return ""
    tree = ElementTree.fromstring(txt)

    # repackage the data - there can be several "identifier" elements,
    # only one of which is an isbn.  The only test applied is length plus
    # digits: 13 digits, or 9 digits followed by a digit or 'X'.
    res = {}
    if not len(tree):
        return res
    for node in tree[0]:
        tag = str(node.tag).lower()
        if "}" in tag:
            # drop the '{namespace}' prefix; plain tags have none
            tag = tag.split("}")[1]
        txt = node.text
        if "title" in tag:
            res["title"] = txt
        elif "language" in tag:
            res["language"] = txt
        elif "creator" in tag:
            res["creator"] = txt
        elif "identifier" in tag:
            isbn = ""
            if txt:  # an empty element has text None
                if len(txt) == 13:
                    if txt.isdigit():
                        isbn = txt
                elif len(txt) == 10:
                    if txt[:9].isdigit() and (txt[9].isdigit()
                                              or txt[9] in "Xx"):
                        isbn = txt
            # never let a non-isbn identifier wipe out an isbn already found
            if isbn or "identifier" not in res:
                res["identifier"] = isbn
    return res
def get_book_info(fname):
    """Extract basic metadata from an ebook or OPF metadata file.

    Only handles epub, mobi and opf for now.  The extension comes from
    os.path.splitext and is matched case-sensitively.

    Returns a dict that may hold "creator", "title", "language",
    "identifier" (an ISBN) and "type" (the format that was read).  An
    empty dict is returned when fname has no extension, the extension is
    not supported, the mobi file cannot be parsed, or the epub container
    has nothing to read.  Errors raised while opening or parsing an
    epub/opf file propagate to the caller.
    """
    res = {}
    extn = os.path.splitext(fname)[1]
    if not extn:
        return res

    if extn == ".mobi":
        try:
            book = Mobi(fname)
            book.parse()
        except Exception:
            # best effort: an unreadable mobi simply yields no metadata
            return res
        res["creator"] = book.author()
        res["title"] = book.title()
        res["language"] = book.language()
        res["identifier"] = book.isbn()
        res["type"] = "mobi"
        return res

    # PDFs are deliberately not handled: none of the pdfs in the author's
    # library had language or isbn, and most had no author or the wrong
    # author (set to the publisher, or to the software used).

    if extn == ".epub":
        # the context manager guarantees the archive is closed again
        with zipfile.ZipFile(fname) as zipdata:
            # find the contents metafile
            tree = ElementTree.fromstring(
                zipdata.read("META-INF/container.xml"))
            if not len(tree):
                return res
            cfname = ""
            for node in tree[0]:
                if "full-path" in node.attrib:
                    cfname = node.attrib["full-path"]
            if not cfname:
                return res
            # grab the metadata block from the contents metafile
            txt = zipdata.read(cfname)
    elif extn == ".opf":
        # read bytes so the XML parser honours the declared encoding
        with open(fname, "rb") as handle:
            txt = handle.read()
    else:
        # same "nothing found" value as every other early exit
        return res

    tree = ElementTree.fromstring(txt)
    # extn is exactly ".epub" or ".opf" here; recorded without the dot,
    # and only once the metadata document has parsed
    res["type"] = extn[1:]

    # repackage the data
    if not len(tree):
        return res
    for node in tree[0]:
        tag = str(node.tag).lower()
        if "}" in tag:
            # drop the '{namespace}' prefix; plain tags have none
            tag = tag.split("}")[1]
        txt = node.text
        attrib = str(node.attrib).lower()
        if "title" in tag:
            res["title"] = txt
        elif "language" in tag:
            res["language"] = txt
        elif "creator" in tag:
            res["creator"] = txt
        elif "identifier" in tag and "isbn" in attrib:
            if formatter.is_valid_isbn(txt):
                res["identifier"] = txt
    return res
def get_book_info(fname):
    """Extract basic metadata from an ebook or OPF metadata file.

    Only handles epub, mobi, azw3 and opf for now.  The extension comes
    from os.path.splitext and is matched case-sensitively.

    Returns a dict that may hold 'type' (the format, set as soon as the
    extension is recognised), 'creator', 'title', 'language' and
    'identifier' (an ISBN).  An empty dict is returned for a missing or
    unsupported extension.  An unreadable mobi/azw3 or zip file, or
    metadata that fails to parse, is logged and yields whatever was
    collected so far.
    """
    res = {}
    extn = os.path.splitext(fname)[1]
    if not extn:
        return res

    if extn in (".mobi", ".azw3"):
        res['type'] = extn[1:]
        try:
            book = Mobi(fname)
            book.parse()
        except Exception as e:
            logger.debug('Unable to parse mobi in %s, %s' % (fname, str(e)))
            return res
        res['creator'] = book.author()
        res['title'] = book.title()
        res['language'] = book.language()
        res['identifier'] = book.isbn()
        return res

    # PDFs are deliberately not handled: none of the pdfs in the author's
    # library had language or isbn, and most had no author or the wrong
    # author (set to the publisher, or to the software used).

    if extn == ".epub":
        res['type'] = "epub"
        # prepare to read from the .epub file
        try:
            zipdata = zipfile.ZipFile(fname)
        except Exception as e:
            logger.debug('Unable to parse zipfile %s, %s' % (fname, str(e)))
            return res
        # the context manager guarantees the archive is closed again
        with zipdata:
            # find the contents metafile
            tree = ElementTree.fromstring(
                zipdata.read('META-INF/container.xml'))
            if not len(tree):
                return res
            cfname = ""
            for node in tree[0]:
                if 'full-path' in node.attrib:
                    cfname = node.attrib['full-path']
                    break
            if not cfname:
                # container lists no package document to read
                return res
            # grab the metadata block from the contents metafile
            txt = zipdata.read(cfname)
    elif extn == ".opf":
        res['type'] = "opf"
        with open(fname) as handle:
            txt = handle.read()
        # sanitize any unmatched html tags or ElementTree won't parse
        dic = {'<br>': '', '</br>': ''}
        txt = replace_all(txt, dic)
    else:
        # unsupported extension: nothing to parse, so do not fall through
        # to the parser with no text and log a misleading error
        return res

    # repackage epub or opf metadata
    try:
        tree = ElementTree.fromstring(txt)
    except Exception as e:
        logger.error("Error parsing metadata from %s, %s" % (fname, str(e)))
        return res

    if not len(tree):
        return res
    for node in tree[0]:
        tag = str(node.tag).lower()
        if '}' in tag:
            # drop the '{namespace}' prefix
            tag = tag.split('}')[1]
        txt = node.text
        attrib = str(node.attrib).lower()
        if 'title' in tag:
            res['title'] = txt
        elif 'language' in tag:
            res['language'] = txt
        elif 'creator' in tag:
            res['creator'] = txt
        elif 'identifier' in tag and 'isbn' in attrib:
            if is_valid_isbn(txt):
                res['identifier'] = txt
    return res