def ocrPublication(pathToMemex, citationKey, language):
    """OCR the PDF of one publication and store the page texts as JSON.

    Args:
        pathToMemex: root folder handed to ``functions.generatePublPath``.
        citationKey: citation key; also the base name of the ``.pdf`` and
            ``.json`` files inside the publication folder.
        language: value passed to tesseract as ``lang``.

    Side effects:
        Writes one binarized ``pages/NNNN.png`` per page and a
        ``<citationKey>.json`` file mapping ``"NNNN"`` to the recognized
        text. If the JSON file already exists only a message is printed.
    """
    # generate necessary paths
    publPath = functions.generatePublPath(pathToMemex, citationKey)
    pdfFile = os.path.join(publPath, citationKey + ".pdf")
    jsonFile = os.path.join(publPath, citationKey + ".json")  # OCR results will be saved here
    saveToPath = os.path.join(publPath, "pages")  # we will save processed images here

    # Check first whether this publication has already been processed, so
    # that no temporary PDF is built just to be deleted again.
    if os.path.isfile(jsonFile):
        print("\t>>> %s has already been OCR-ed..." % citationKey)
        return

    # generate CLEAN pdf (necessary if you added highlights and comments to your PDFs)
    pdfFileTemp = removeCommentsFromPDF(pdfFile)
    try:
        # let's make sure that saveToPath also exists
        os.makedirs(saveToPath, exist_ok=True)

        # start process images and extract text
        print("\t>>> OCR-ing: %s" % citationKey)
        textResults = {}
        images = pdf2image.convert_from_path(pdfFileTemp)
        pageTotal = len(images)
        for pageCount, image in enumerate(images, start=1):
            image = image.convert('1')  # binarizes image, reducing its size
            finalPath = os.path.join(saveToPath, "%04d.png" % pageCount)
            image.save(finalPath, optimize=True, quality=10)
            # the text is extracted from the binarized image
            text = pytesseract.image_to_string(image, lang=language)
            textResults["%04d" % pageCount] = text
            print("\t\t%04d/%04d pages" % (pageCount, pageTotal))

        with open(jsonFile, 'w', encoding='utf8') as f9:
            json.dump(textResults, f9, sort_keys=True, indent=4, ensure_ascii=False)
    finally:
        # delete the temporary pdf file even when OCR failed half-way
        os.remove(pdfFileTemp)
def createIndex(pathToMemex):
    """Render ``searchesInterface.html`` inside *pathToMemex*.

    The page is the index template with ``@SEARCHES@`` replaced by the
    output of ``formatSearches`` and ``@PUBLICATIONS@`` replaced by the
    publication template filled with one row per bibliography record.
    """
    bibliography = functions.loadBib(settings["bib_all"])
    with open(settings["template_index"], "r", encoding="utf8") as templateFile:
        pageHtml = templateFile.read()

    rowTemplate = (
        "<tr><td><li><a href="
        "@PATHTOPUBL@/pages/DETAILS.html>"
        "[@CITEKEY@]</a></td><td> @AUTHOR@</td> <td>(@DATE@)</td> - <td><i>@TITLE@</i></td></li></tr>"
    )
    # placeholder -> bib field; a missing field is rendered as "MISSING"
    optionalFields = (
        ("@AUTHOR@", "author"),
        ("@DATE@", "year"),
        ("@TITLE@", "title"),
    )

    rows = []
    for citeKey, record in bibliography.items():
        publicationPath = functions.generatePublPath(memexPath, citeKey)
        row = rowTemplate.replace("@PATHTOPUBL@", publicationPath)
        row = row.replace("@CITEKEY@", citeKey)
        for placeholder, field in optionalFields:
            if field in record:
                row = row.replace(placeholder, record[field])
            else:
                row = row.replace(placeholder, "MISSING")
        rows.append(row)
    rows.sort()

    listing = "\n<ul>\n%s\n</ul>" % "\n".join(rows)
    # strip the curly brackets left over from BibTeX values
    listing = listing.replace("{", "").replace("}", "")

    searchesHtml = formatSearches(pathToMemex)
    pageHtml = pageHtml.replace("@SEARCHES@", searchesHtml)
    publicationsHtml = publTemplate.replace("@TABLECONTENTS@", listing)
    pageHtml = pageHtml.replace("@PUBLICATIONS@", publicationsHtml)

    targetFile = os.path.join(pathToMemex, "searchesInterface.html")
    with open(targetFile, "w", encoding="utf8") as pageFile:
        pageFile.write(pageHtml)
def generateMemexStartingPages(pathToMemex):
    """Write ``index.html`` and ``contents.html`` into *pathToMemex*.

    Both pages are the index template with ``@MAINCONTENT@`` replaced:
    ``index.html`` receives the text of ``settings["content_index"]``,
    ``contents.html`` a sorted list with one entry per record found in the
    ``.bib`` files below *pathToMemex*.
    """
    # load index template
    with open(settings["template_index"], "r", encoding="utf8") as ft:
        template = ft.read()

    # add index.html
    with open(settings["content_index"], "r", encoding="utf8") as fi:
        indexData = fi.read()
    with open(os.path.join(pathToMemex, "index.html"), "w", encoding="utf8") as f9:
        f9.write(template.replace("@MAINCONTENT@", indexData))

    # load bibliographical data for processing
    publicationDic = {}  # key = citationKey; value = recordDic
    for subdir, _dirs, files in os.walk(pathToMemex):
        for fileName in files:
            if fileName.endswith(".bib"):
                pathWhereBibIs = os.path.join(subdir, fileName)
                publicationDic.update(functions.loadBib(pathWhereBibIs))

    # generate data for the main CONTENTS
    singleItemTemplate = '<li><a href="@RELATIVEPATH@/pages/DETAILS.html">[@CITATIONKEY@]</a> @AUTHOROREDITOR@ (@DATE@) - <i>@TITLE@</i></li>'
    contentsList = []
    for citeKey, bibRecord in publicationDic.items():
        relativePath = functions.generatePublPath(pathToMemex, citeKey).replace(pathToMemex, "")

        # the author takes precedence over the editor
        authorOrEditor = "[No data]"
        if "editor" in bibRecord:
            authorOrEditor = bibRecord["editor"]
        if "author" in bibRecord:
            authorOrEditor = bibRecord["author"]

        if "date" in bibRecord:
            # only the first four characters (presumably the year) are shown
            date = bibRecord["date"][:4]
        else:
            # A record without a date used to raise KeyError right after
            # this message; fall back to a placeholder instead.
            print("nodate")
            date = "nodate"

        title = bibRecord["title"]

        # forming a record
        recordToAdd = singleItemTemplate
        recordToAdd = recordToAdd.replace("@RELATIVEPATH@", relativePath)
        recordToAdd = recordToAdd.replace("@CITATIONKEY@", citeKey)
        recordToAdd = recordToAdd.replace("@AUTHOROREDITOR@", authorOrEditor)
        recordToAdd = recordToAdd.replace("@DATE@", date)
        recordToAdd = recordToAdd.replace("@TITLE@", title)
        recordToAdd = recordToAdd.replace("{", "").replace("}", "")
        contentsList.append(recordToAdd)

    contents = "\n<ul>\n%s\n</ul>" % "\n".join(sorted(contentsList))
    mainContent = "<h1>CONTENTS of MEMEX</h1>\n\n" + contents

    # save the CONTENTS page
    with open(os.path.join(pathToMemex, "contents.html"), "w", encoding="utf8") as f9:
        f9.write(template.replace("@MAINCONTENT@", mainContent))
def generateDoclLink(bibTexCode, pageVal, distance):
    """Return one HTML table row that links to a publication or a page range.

    Args:
        bibTexCode: citation key; the record is loaded from
            ``<publication folder>/<bibTexCode>.bib``.
        pageVal: ``0`` links to ``DETAILS.html``; any other value is the
            last page of a range that starts five pages earlier.
        distance: number rendered with ``%f`` in the second column.

    Returns:
        The ``<tr>...</tr>`` string for this publication.
    """
    memexRoot = settings["path_to_memex"]
    pathToPubl = functions.generatePublPath(memexRoot, bibTexCode)
    bib = functions.loadBib(os.path.join(pathToPubl, "%s.bib" % bibTexCode))
    bib = bib[bibTexCode]

    # the author takes precedence over the editor
    author = "N.d."
    if "editor" in bib:
        author = bib["editor"]
    if "author" in bib:
        author = bib["author"]

    reference = "%s (%s). <i>%s</i>" % (author, bib["date"][:4], bib["title"])
    # ASCII-only copy of the reference for the hidden search text.
    # encode() returns bytes, so decode again: formatting bytes with %s
    # would embed the literal "b'...'" repr in the page.
    search = unicodedata.normalize('NFKD', reference).encode('ascii', 'ignore').decode('ascii')
    search = " <div class='hidden'>%s</div>" % search

    # relative location of the publication folder as seen from the page
    relativePubl = pathToPubl.replace(memexRoot, "../../../../")

    if pageVal == 0:
        # link to the start of the publication
        htmlLink = os.path.join(relativePubl, "pages", "DETAILS.html")
        htmlLink = "<a href='%s'><i>read</i></a>" % (htmlLink)
        page = ""
        startPage = 0
    else:
        startPage = pageVal - 5
        endPage = pageVal
        # never link below the first page (presumably page files start at
        # 0001 -- verify); this also covers pageVal values smaller than 5
        if startPage < 1:
            startPage = 1
        htmlLink = os.path.join(relativePubl, "pages", "%04d.html" % startPage)
        htmlLink = "<a href='%s'><i>read</i></a>" % (htmlLink)
        page = ", pdfPp. %d-%d</i></a>" % (startPage, endPage)

    publicationInfo = reference + page + search
    publicationInfo = publicationInfo.replace("{", "").replace("}", "")
    singleItemTemplate = '<tr><td>%s</td><td>%f</td><td data-order="%s%05d">%s</td></tr>' % (
        htmlLink, distance, bibTexCode, startPage, publicationInfo)
    return singleItemTemplate
def checkPageNumbers(bib, bibTexCode, startPage):
    """Map an OCR page index onto the page range of the bib record.

    Args:
        bib: bibliographic record (mapping); its optional ``pages`` field
            is expected to look like ``"first--last"`` or ``"first"``
            after ``functions.prettifyBib``.
        bibTexCode: citation key, used to locate the OCR JSON file.
        startPage: page index to translate.

    Returns:
        *startPage* unchanged when the record has no ``pages`` field;
        otherwise *startPage* shifted by the first page of the range. When
        the OCR results hold more pages than the range covers, the shift
        is one smaller and page 1 is reported as the string ``"TITLE"``
        (presumably an unnumbered title page -- confirm).
    """
    if "pages" not in bib:
        return startPage

    bibPages = functions.prettifyBib(bib["pages"])
    bibPagesList = [int(i) for i in bibPages.split("--")]
    firstPage = bibPagesList[0]
    # a single-page record has no second bound; treat it as first == last
    lastPage = bibPagesList[1] if len(bibPagesList) > 1 else firstPage

    pathToPubl = functions.generatePublPath(memexPath, bibTexCode)
    jsonFile = os.path.join(pathToPubl, "%s.json" % bibTexCode)
    # explicit encoding so the read does not depend on the platform
    # default (the OCR JSON presumably is UTF-8 with non-ASCII text)
    with open(jsonFile, encoding="utf8") as jsonData:
        ocred = json.load(jsonData)

    if len(ocred) > (lastPage - firstPage + 1):
        if startPage == 1:
            return "TITLE"
        return startPage + firstPage - 2
    return startPage + firstPage - 1
def processAllClouds(filename):
    """Call ``createwordCloud`` for every non-empty entry of a JSON file.

    Args:
        filename: path to a UTF-8 JSON file holding a mapping from
            citation key to a value; keys with a falsy value are skipped.
    """
    # use a context manager so the file handle is closed after loading
    with open(filename, "r", encoding="utf8") as docFile:
        docData = json.load(docFile)

    for k, v in docData.items():
        savePath = functions.generatePublPath(memexPath, k)
        # NOTE(review): hard-coded Windows separator; kept as-is
        savePath = savePath + "\\" + k
        if v:
            # NOTE(review): the key `k`, not the value `v`, is passed as
            # the second argument -- confirm this is intended
            createwordCloud(savePath, k)
def formatPublList(pathToMemex):
    """Return the publications template filled with one table row per item.

    Only keys that have both an OCR results file and a ``.bib`` file (as
    reported by ``functions.dicOfRelevantFiles``) are listed.

    Args:
        pathToMemex: root folder of the memex.

    Returns:
        ``publicationsTemplate`` with ``@TABLECONTENTS@`` replaced by the
        sorted rows.
    """
    ocrFiles = functions.dicOfRelevantFiles(pathToMemex, settings["ocr_results"])
    bibFiles = functions.dicOfRelevantFiles(pathToMemex, ".bib")

    contentsList = []
    for key in ocrFiles:
        if key not in bibFiles:
            continue

        # load the bibliographical data for this item
        bibRecord = functions.loadBib(bibFiles[key])
        bibRecord = bibRecord[key]

        relativePath = functions.generatePublPath(pathToMemex, key).replace(pathToMemex, "")

        # the author takes precedence over the editor
        authorOrEditor = "[No data]"
        if "editor" in bibRecord:
            authorOrEditor = bibRecord["editor"]
        if "author" in bibRecord:
            authorOrEditor = bibRecord["author"]

        # a record without a year no longer raises KeyError
        date = bibRecord["year"][:4] if "year" in bibRecord else "nodate"
        title = bibRecord["title"]

        # formatting template
        citeKey = '<div class="ID">[%s]</div>' % key
        publication = '%s (%s) <i>%s</i>' % (authorOrEditor, date, title)
        # ASCII-only copy (diacritics stripped) for the hidden search text;
        # decode the bytes again, otherwise %s embeds the "b'...'" repr
        search = unicodedata.normalize('NFKD', publication).encode('ascii', 'ignore').decode('ascii')
        publication += " <div class='hidden'>%s</div>" % search
        link = '<a href="%s/pages/DETAILS.html"><i>read</i></a>' % relativePath

        singleItemTemplate = '<tr><td>%s</td><td>%s %s</td></tr>' % (link, citeKey, publication)
        # remove curly brackets
        recordToAdd = singleItemTemplate.replace("{", "").replace("}", "")
        contentsList.append(recordToAdd)

    contents = "\n".join(sorted(contentsList))
    final = publicationsTemplate.replace("@TABLECONTENTS@", contents)
    return final
def ocrPublication(pathToMemex, citationKey, language):
    """OCR every page of a publication's PDF unless results already exist.

    The recognized text goes to ``<citationKey>.json`` (page label ->
    text) in the publication folder; a binarized PNG of each page goes to
    its ``pages`` sub-folder.
    """
    publicationDir = functions.generatePublPath(pathToMemex, citationKey)
    sourcePdf = os.path.join(publicationDir, citationKey + ".pdf")
    resultsJson = os.path.join(publicationDir, citationKey + ".json")
    pagesDir = os.path.join(publicationDir, "pages")

    ## an existing json file means the publication has been ocr-ed already
    if os.path.isfile(resultsJson):
        print("\t>>> %s has already been OCR-ed..." % citationKey)
        return

    if not os.path.exists(pagesDir):
        os.makedirs(pagesDir)

    print("\t>>> OCR-ing: %s" % citationKey)
    pageImages = pdf2image.convert_from_path(sourcePdf)
    pageTotal = len(pageImages)

    ocrText = {}
    for pageNumber, pageImage in enumerate(pageImages, start=1):
        ## text is read from the unmodified page image
        ocrText["%04d" % pageNumber] = pytesseract.image_to_string(
            pageImage, lang=language)

        ## the saved copy is binarized, reducing its size
        binarized = pageImage.convert('1')
        pngPath = os.path.join(pagesDir, "%04d.png" % pageNumber)
        binarized.save(pngPath, optimize=True, quality=10)

        print("\t\t%04d/%04d pages" % (pageNumber, pageTotal))

    with open(resultsJson, 'w', encoding='utf8') as jsonOut:
        json.dump(ocrText, jsonOut, sort_keys=True, indent=4,
                  ensure_ascii=False)
def processAllclouds(filename):
    """Call ``createWordCloud`` for every non-empty entry of a JSON file.

    Args:
        filename: path to a UTF-8 JSON file holding a mapping from
            citation key to a value (per the original comment, tf-idf
            data); keys with a falsy value are skipped.
    """
    ## load the file; the context manager closes the handle afterwards
    with open(filename, "r", encoding="utf8") as docFile:
        docData = json.load(docFile)

    for k, v in docData.items():
        savePath = functions.generatePublPath(memexPath, k)
        ## NOTE(review): hard-coded Windows separator; kept as-is
        savePath = savePath + "\\" + k
        if v:
            createWordCloud(savePath, v)
def ocrPublication(citationKey, language, pageLimit):
    """OCR a publication's PDF unless it is done already or is too long.

    Args:
        citationKey: citation key; base name of the ``.pdf`` and ``.json``
            files inside the publication folder.
        language: value passed to tesseract as ``lang``.
        pageLimit: maximum number of pages to process; an int or a string
            that ``int()`` accepts.

    Side effects:
        Writes ``pages/NNNN.png`` per page and ``<citationKey>.json``
        (page label -> text). Publications with more than *pageLimit*
        pages are skipped with a message and no JSON file is written.
    """
    # generate and create necessary paths
    publPath = functions.generatePublPath(settings["path_to_memex"], citationKey)
    pdfFile = os.path.join(publPath, citationKey + ".pdf")
    jsonFile = os.path.join(publPath, citationKey + ".json")  # OCR results will be saved here
    saveToPath = os.path.join(publPath, "pages")  # we will save processed images here

    # first we need to check whether this publication has been already processed
    if os.path.isfile(jsonFile):
        print("\t>>> %s has already been OCR-ed..." % citationKey)
        return

    # let's make sure that saveToPath also exists
    if not os.path.exists(saveToPath):
        os.makedirs(saveToPath)

    # start process images and extract text
    print("\t>>> OCR-ing: %s" % citationKey)
    images = pdf2image.convert_from_path(pdfFile)
    pageTotal = len(images)

    # Convert once and reuse: the message below formats the limit with %d,
    # which raises TypeError when given the raw string.
    limit = int(pageLimit)
    if pageTotal > limit:
        print("\t%d: the length of the publication exceeds current limit (%d)" % (pageTotal, limit))
        print("\tIncrease `page_limit` in settings to process this publication.")
        return

    textResults = {}
    for pageCount, image in enumerate(images, start=1):
        text = pytesseract.image_to_string(image, lang=language)
        textResults["%04d" % pageCount] = text

        image = image.convert('1')  # binarizes image, reducing its size
        finalPath = os.path.join(saveToPath, "%04d.png" % pageCount)
        image.save(finalPath, optimize=True, quality=10)

        print("\t\t%04d/%04d pages" % (pageCount, pageTotal))

    with open(jsonFile, 'w', encoding='utf8') as f9:
        json.dump(textResults, f9, sort_keys=True, indent=4, ensure_ascii=False)
def generateReferenceSimple(bibTexCode):
    """Return ``"<creator> (<year>). <i><title></i>"`` for a citation key.

    The creator is the record's author, else its editor, else ``"N.d."``;
    the year is the first four characters of the ``date`` field. Curly
    brackets are removed from the result.
    """
    memexRoot = settings["path_to_memex"]
    publicationDir = functions.generatePublPath(memexRoot, bibTexCode)
    bibPath = os.path.join(publicationDir, "%s.bib" % bibTexCode)
    record = functions.loadBib(bibPath)[bibTexCode]

    if "author" in record:
        creator = record["author"]
    elif "editor" in record:
        creator = record["editor"]
    else:
        creator = "N.d."

    year = record["date"][:4]
    citation = "%s (%s). <i>%s</i>" % (creator, year, record["title"])
    for bracket in ("{", "}"):
        citation = citation.replace(bracket, "")
    return citation
def genConnectedTexts(citeKey):
    """Return HTML table rows for the publications related to *citeKey*.

    The relations are read from ``cosineTableDic_filtered.txt`` (JSON) in
    the current working directory: a mapping from citation key to a
    mapping of related key -> value.

    Args:
        citeKey: citation key whose related publications are listed.

    Returns:
        The concatenated ``<tr>`` rows, or an empty string when the table
        is empty or has no entry for *citeKey*. Previously those two cases
        raised UnboundLocalError and KeyError respectively.
    """
    # the context manager closes the file handle after loading
    with open("cosineTableDic_filtered.txt", "r", encoding="utf8") as simFile:
        similarities = json.load(simFile)

    contentTemp = "<tr><td><i><a href='@link@'>read</a></i></td><td>@Sim@</td><td>@Publication@</td></tr>"

    rows = []
    for k, v in similarities.get(citeKey, {}).items():
        # NOTE(review): Windows-style separators are hard-coded in the link
        link = "..\\..\\..\\..\\." + functions.generatePublPath(
            memexPath, k) + "\\pages\\DETAILS.html"
        row = contentTemp.replace("@Publication@", k)
        row = row.replace("@Sim@", str(v))
        row = row.replace("@link@", link)
        rows.append(row)

    return "".join(rows)
def formatPublList(pathToMemex):
    """Return the publications template filled with one table row per item.

    Only keys that have both an OCR results file and a ``.bib`` file (as
    reported by ``functions.dicOfRelevantFiles``) are listed.

    Args:
        pathToMemex: root folder of the memex.

    Returns:
        ``publicationsTemplate`` with ``@TABLECONTENTS@`` replaced by the
        sorted rows.
    """
    ocrFiles = functions.dicOfRelevantFiles(pathToMemex, settings["ocr_results"])
    bibFiles = functions.dicOfRelevantFiles(pathToMemex, ".bib")

    contentsList = []
    for key in ocrFiles:
        if key not in bibFiles:
            continue

        bibRecord = functions.loadBib(bibFiles[key])
        bibRecord = bibRecord[key]

        relativePath = functions.generatePublPath(pathToMemex, key).replace(
            pathToMemex, "")

        # the author takes precedence over the editor
        authorOrEditor = "[No data]"
        if "editor" in bibRecord:
            authorOrEditor = bibRecord["editor"]
        if "author" in bibRecord:
            authorOrEditor = bibRecord["author"]

        date = "nodate"
        if "year" in bibRecord:
            date = bibRecord["year"]

        title = bibRecord["title"]

        # formatting template
        citeKey = '<div class="ID">[%s]</div>' % key
        publication = '%s (%s) <i>%s</i>' % (authorOrEditor, date, title)
        # ASCII-only copy (diacritics stripped) for the hidden search text;
        # decode the bytes again, otherwise %s embeds the "b'...'" repr
        search = unicodedata.normalize('NFKD', publication).encode(
            'ascii', 'ignore').decode('ascii')
        publication += " <div class='hidden'>%s</div>" % search
        link = '<a href="%s/pages/DETAILS.html"><i>read</i></a>' % relativePath

        singleItemTemplate = '<tr><td>%s</td><td>%s %s</td></tr>' % (
            link, citeKey, publication)
        recordToAdd = singleItemTemplate.replace("{", "").replace("}", "")
        contentsList.append(recordToAdd)

    contents = "\n".join(sorted(contentsList))
    final = publicationsTemplate.replace("@TABLECONTENTS@", contents)
    return final
def generateDoclLink(bibTexCode, pageVal, distance):
    """Return one HTML table row that links to a publication or a page range.

    Args:
        bibTexCode: citation key; the record is loaded from
            ``<publication folder>/<bibTexCode>.bib``.
        pageVal: ``0`` links to ``DETAILS.html``; any other value is the
            last page of a range that starts five pages earlier.
        distance: number rendered with ``{:f}`` in the second column.

    Returns:
        The ``<tr>...</tr>`` string for this publication. For page ranges
        the visible page numbers come from ``checkPageNumbers``, while the
        link target and the sort key use the raw start page.
    """
    pathToPubl = functions.generatePublPath(memexPath, bibTexCode)
    bib = functions.loadBib(os.path.join(pathToPubl, "%s.bib" % bibTexCode))
    bib = bib[bibTexCode]

    # the author takes precedence over the editor
    author = "N.d."
    if "editor" in bib:
        author = bib["editor"]
    if "author" in bib:
        author = bib["author"]

    reference = "%s (%s). <i>%s</i>" % (author, bib["date"][:4], bib["title"])
    # ASCII-only copy of the reference for the hidden search text.
    # encode() returns bytes, so decode again: formatting bytes with %s
    # would embed the literal "b'...'" repr in the page.
    search = unicodedata.normalize('NFKD', reference).encode('ascii', 'ignore').decode('ascii')
    search = " <div class='hidden'>%s</div>" % search

    # relative location of the publication folder as seen from the page
    relativePubl = pathToPubl.replace(memexPath, "../../../../")

    if pageVal == 0:
        # link to the start of the publication
        htmlLink = os.path.join(relativePubl, "pages", "DETAILS.html")
        htmlLink = "<a href='{0}'>[{1}]</a>".format(htmlLink, bibTexCode)
        page = ""
        startPage = 0
    else:
        startPage = pageVal - 5
        endPage = pageVal
        # never link below the first page (presumably page files start at
        # 0001 -- verify); this also covers pageVal values smaller than 5
        if startPage < 1:
            startPage = 1

        realStartPage = checkPageNumbers(bib, bibTexCode, startPage)
        realEndPage = checkPageNumbers(bib, bibTexCode, endPage)

        htmlLink = os.path.join(relativePubl, "pages", "%04d.html" % startPage)
        htmlLink = "<a href='{0}'>[{1},{2}]</a>".format(
            htmlLink, bibTexCode, realStartPage)
        page = ", pp. {0}-{1}</i></a>".format(realStartPage, realEndPage)

    publicationInfo = reference + page + search
    publicationInfo = publicationInfo.replace("{", "").replace("}", "")
    singleItemTemplate = '<tr><td data-order="{1}{2:05d}"><div class="ID">{3}</div> {4}</td><td>{0:f}</td></tr>'.format(
        distance, bibTexCode, startPage, htmlLink, publicationInfo)
    return singleItemTemplate