def generateDoclLink(bibTexCode, pageVal, distance):
    """Build one HTML table row that links to a publication or a page cluster.

    Args:
        bibTexCode: citation key of the publication.
        pageVal: 0 links to the DETAILS page; any other value is used as the
            last page of a cluster whose first page is ``pageVal - 5``.
        distance: number rendered with ``%f`` in the second column.

    Returns:
        A ``<tr>...</tr>`` string.

    Raises:
        KeyError: if the record has no ``date`` or ``title`` field.
    """
    memexRoot = settings["path_to_memex"]
    pathToPubl = functions.generatePublPath(memexRoot, bibTexCode)
    bib = functions.loadBib(os.path.join(pathToPubl, "%s.bib" % bibTexCode))
    bib = bib[bibTexCode]

    # an author, when present, wins over an editor
    author = "N.d."
    if "editor" in bib:
        author = bib["editor"]
    if "author" in bib:
        author = bib["author"]

    reference = "%s (%s). <i>%s</i>" % (author, bib["date"][:4], bib["title"])

    # encode() returns bytes; decode again so the hidden search text is not
    # rendered as "b'...'" when interpolated with %s
    asciiReference = unicodedata.normalize("NFKD", reference)
    asciiReference = asciiReference.encode("ascii", "ignore").decode("ascii")
    search = " <div class='hidden'>%s</div>" % asciiReference

    relativeRoot = pathToPubl.replace(memexRoot, "../../../../")
    if pageVal == 0:
        # link to the start of the publication
        targetFile = "DETAILS.html"
        page = ""
        startPage = 0
    else:
        startPage = pageVal - 5
        endPage = pageVal
        if startPage == 0:
            # page 0 does not exist; the first page file is 0001.html
            startPage += 1
        targetFile = "%04d.html" % startPage
        # NOTE(review): the closing </i></a> has no opening tag in this cell
        page = ", pdfPp. %d-%d</i></a>" % (startPage, endPage)

    htmlLink = os.path.join(relativeRoot, "pages", targetFile)
    htmlLink = "<a href='%s'><i>read</i></a>" % (htmlLink)

    # strip BibTeX curly brackets
    publicationInfo = (reference + page + search).replace("{", "").replace("}", "")
    singleItemTemplate = '<tr><td>%s</td><td>%f</td><td data-order="%s%05d">%s</td></tr>' % (
        htmlLink, distance, bibTexCode, startPage, publicationInfo)
    return singleItemTemplate
def createIndex(pathToMemex):
    """Write ``searchesInterface.html`` listing all publications and searches.

    Args:
        pathToMemex: root folder of the memex; publication paths are built
            from it and the output file is written into it.
    """
    bibData = functions.loadBib(settings["bib_all"])
    with open(settings["template_index"], "r", encoding="utf8") as ft:
        template = ft.read()

    entryTemplate = ("<tr><td><li><a href=" + "@PATHTOPUBL@/pages/DETAILS.html>"
                     + "[@CITEKEY@]</a></td><td> @AUTHOR@</td> <td>(@DATE@)</td> - <td><i>@TITLE@</i></td></li></tr>")
    # placeholder -> bib field; a missing field is rendered as "MISSING"
    fieldPlaceholders = (("@AUTHOR@", "author"), ("@DATE@", "year"), ("@TITLE@", "title"))

    completeList = []
    for citeKey, record in bibData.items():
        # use the function's own argument rather than the global memexPath,
        # so the links agree with the folder the page is written to
        path = functions.generatePublPath(pathToMemex, citeKey)
        entry = entryTemplate.replace("@PATHTOPUBL@", path)
        entry = entry.replace("@CITEKEY@", citeKey)
        for placeholder, field in fieldPlaceholders:
            entry = entry.replace(placeholder, record[field] if field in record else "MISSING")
        completeList.append(entry)

    content = "\n<ul>\n%s\n</ul>" % "\n".join(sorted(completeList))
    # strip BibTeX curly brackets
    content = content.replace("{", "").replace("}", "")

    toc = formatSearches(pathToMemex)
    template = template.replace("@SEARCHES@", toc)
    template = template.replace(
        "@PUBLICATIONS@", publTemplate.replace("@TABLECONTENTS@", content))

    with open(os.path.join(pathToMemex, "searchesInterface.html"), "w", encoding="utf8") as f9:
        f9.write(template)
def generateMemexStartingPages(pathToMemex):
    """Write ``index.html`` and ``contents.html`` into the memex folder.

    Args:
        pathToMemex: root folder that is walked for ``.bib`` files and that
            receives the two generated pages.

    Raises:
        KeyError: if a record has no ``title`` field.
    """
    # load index template
    with open(settings["template_index"], "r", encoding="utf8") as ft:
        template = ft.read()

    # add index.html
    with open(settings["content_index"], "r", encoding="utf8") as fi:
        indexData = fi.read()
    with open(os.path.join(pathToMemex, "index.html"), "w", encoding="utf8") as f9:
        f9.write(template.replace("@MAINCONTENT@", indexData))

    # load bibliographical data for processing
    publicationDic = {}  # key = citationKey; value = recordDic
    for subdir, _dirs, files in os.walk(pathToMemex):
        for fileName in files:
            if fileName.endswith(".bib"):
                pathWhereBibIs = os.path.join(subdir, fileName)
                publicationDic.update(functions.loadBib(pathWhereBibIs))

    # generate data for the main CONTENTS
    singleItemTemplate = '<li><a href="@RELATIVEPATH@/pages/DETAILS.html">[@CITATIONKEY@]</a> @AUTHOROREDITOR@ (@DATE@) - <i>@TITLE@</i></li>'
    contentsList = []
    for citeKey, bibRecord in publicationDic.items():
        relativePath = functions.generatePublPath(pathToMemex, citeKey).replace(pathToMemex, "")

        # an author, when present, wins over an editor
        authorOrEditor = "[No data]"
        if "editor" in bibRecord:
            authorOrEditor = bibRecord["editor"]
        if "author" in bibRecord:
            authorOrEditor = bibRecord["author"]

        if "date" in bibRecord:
            date = bibRecord["date"][:4]
        else:
            # previously this branch still indexed bibRecord["date"] and
            # raised KeyError; use a visible placeholder instead
            print("nodate")
            date = "nodate"

        title = bibRecord["title"]

        # forming a record
        recordToAdd = singleItemTemplate
        recordToAdd = recordToAdd.replace("@RELATIVEPATH@", relativePath)
        recordToAdd = recordToAdd.replace("@CITATIONKEY@", citeKey)
        recordToAdd = recordToAdd.replace("@AUTHOROREDITOR@", authorOrEditor)
        recordToAdd = recordToAdd.replace("@DATE@", date)
        recordToAdd = recordToAdd.replace("@TITLE@", title)
        # strip BibTeX curly brackets
        recordToAdd = recordToAdd.replace("{", "").replace("}", "")
        contentsList.append(recordToAdd)

    contents = "\n<ul>\n%s\n</ul>" % "\n".join(sorted(contentsList))
    mainContent = "<h1>CONTENTS of MEMEX</h1>\n\n" + contents

    # save the CONTENTS page
    with open(os.path.join(pathToMemex, "contents.html"), "w", encoding="utf8") as f9:
        f9.write(template.replace("@MAINCONTENT@", mainContent))
def processAllEntries(pathToMemex):
    """Generate the HTML interface for every record of the main bibliography.

    Args:
        pathToMemex: root folder of the memex, used to locate each
            publication's folder.
    """
    import os  # local import: this block did not previously depend on os

    bibData = functions.loadBib(settings["bib_all"])  # loads the bib file
    for citeKey in bibData:
        publFolder = functions.generatePublPath(pathToMemex, citeKey)
        # os.path.join instead of a hard-coded "\\" keeps this portable
        bibPath = os.path.join(publFolder, citeKey + ".bib")
        interface.generatePublicationInterface(citeKey, bibPath)
def processAllFiles(pathToMemex):
    """OCR every publication listed in the main bibliography.

    Args:
        pathToMemex: root folder of the memex, passed on to the OCR routine.
    """
    allRecords = functions.loadBib(settings["bib_all"])
    for citeKey, record in allRecords.items():
        # language of the record, falling back to the module default
        ocrLanguage = checkLangId(record, defaultLang)
        functions.ocrPublication(pathToMemex, citeKey, ocrLanguage)
def generateDoclLink(bibTexCode, pageVal, distance):
    """Build one HTML table row that links to a publication or a page cluster.

    Args:
        bibTexCode: citation key of the publication.
        pageVal: 0 links to the DETAILS page; any other value is used as the
            last page of a cluster whose first page is ``pageVal - 5``.
        distance: number rendered with ``%f`` in the second column.

    Returns:
        A ``<tr>...</tr>`` string.

    Raises:
        KeyError: if the record has no ``year`` or ``title`` field.
    """
    memexRoot = settings["path_to_memex"]
    pathToPubl = functions.generatePublPath(memexRoot, bibTexCode)
    bib = functions.loadBib(os.path.join(pathToPubl, "%s.bib" % bibTexCode))
    bib = bib[bibTexCode]

    # no information on the author by default; an author wins over an editor
    author = "N.d."
    if "editor" in bib:
        author = bib["editor"]
    if "author" in bib:
        author = bib["author"]

    reference = "%s (%s). <i>%s</i>" % (author, bib["year"][:4], bib["title"])

    # ASCII-only copy of the reference, hidden in the page.
    # encode() returns bytes; decode again so the text is not rendered as
    # "b'...'" when interpolated with %s
    asciiReference = unicodedata.normalize("NFKD", reference)
    asciiReference = asciiReference.encode("ascii", "ignore").decode("ascii")
    search = " <div class='hidden'>%s</div>" % asciiReference

    relativeRoot = pathToPubl.replace(memexRoot, "../../../../")
    if pageVal == 0:
        # link to the start of the publication
        targetFile = "DETAILS.html"
        page = ""
        startPage = 0
    else:
        startPage = pageVal - 5
        endPage = pageVal
        if startPage == 0:
            # page 0 does not exist; the first page file is 0001.html
            startPage += 1
        targetFile = "%04d.html" % startPage
        # NOTE(review): the closing </i></a> has no opening tag in this cell
        page = ", pdfPp. %d-%d</i></a>" % (startPage, endPage)

    htmlLink = os.path.join(relativeRoot, "pages", targetFile)
    htmlLink = "<a href='%s'><i>read</i></a>" % (htmlLink)

    # strip BibTeX curly brackets
    publicationInfo = (reference + page + search).replace("{", "").replace("}", "")
    singleItemTemplate = '<tr><td>%s</td><td>%f</td><td data-order="%s%05d">%s</td></tr>' % (
        htmlLink, distance, bibTexCode, startPage, publicationInfo)
    return singleItemTemplate
def generateContentsList():
    """Build the publications table for all ``bib`` files under ``memexPath``.

    Returns:
        ``publicationsTemplate`` with ``@TABLECONTENTS@`` replaced by one
        table row per publication, sorted as strings.

    Raises:
        KeyError: if a record has no ``date`` or ``title`` field.
    """
    relDic = functions.dicOfRelevantFiles(memexPath, "bib")

    rows = []
    for rawKey, bibPath in relDic.items():
        # the dictionary key carries one extra trailing character
        # NOTE(review): presumably a "." left over because the extension is
        # passed without its dot - confirm against dicOfRelevantFiles
        citeKey = rawKey[:-1]
        record = functions.loadBib(bibPath)[citeKey]

        # an author, when present, wins over an editor
        authorOrEditor = "[No data]"
        if "editor" in record:
            authorOrEditor = record["editor"]
        if "author" in record:
            authorOrEditor = record["author"]

        publication = "{0} ({1}) <i>{2}</i>".format(authorOrEditor, record["date"], record["title"])

        # encode() returns bytes; decode again so the hidden search text is
        # not rendered as "b'...'" by str.format
        search = unicodedata.normalize("NFKD", publication)
        search = search.encode("ascii", "ignore").decode("ascii")
        publication += " <div class=\"hidden\">{0}</div>".format(search)

        rows.append(
            "<tr><td><div class=\"ID\"><a href=\"{0}/pages/DETAILS.html\">[{1}]</a></div> {2}</td></tr>"
            .format(os.path.join(citeKey[0], citeKey[:2], citeKey), citeKey, publication))

    contentsList = "".join(sorted(rows))
    mainElement = publicationsTemplate.replace("@TABLECONTENTS@", contentsList)
    return mainElement
def formatPublList(pathToMemex):
    """Format the table of publications that have both OCR results and a bib file.

    Args:
        pathToMemex: root folder of the memex.

    Returns:
        ``publicationsTemplate`` with ``@TABLECONTENTS@`` replaced by the
        sorted table rows.

    Raises:
        KeyError: if a listed record has no ``year`` or ``title`` field.
    """
    ocrFiles = functions.dicOfRelevantFiles(pathToMemex, settings["ocr_results"])
    bibFiles = functions.dicOfRelevantFiles(pathToMemex, ".bib")

    contentsList = []
    for key in ocrFiles:
        if key not in bibFiles:
            continue

        bibRecord = functions.loadBib(bibFiles[key])[key]
        relativePath = functions.generatePublPath(pathToMemex, key).replace(pathToMemex, "")

        # no information on the author by default; an author wins over an editor
        authorOrEditor = "[No data]"
        if "editor" in bibRecord:
            authorOrEditor = bibRecord["editor"]
        if "author" in bibRecord:
            authorOrEditor = bibRecord["author"]

        date = bibRecord["year"][:4]
        title = bibRecord["title"]

        # formatting template
        citeKey = '<div class="ID">[%s]</div>' % key
        publication = '%s (%s) <i>%s</i>' % (authorOrEditor, date, title)

        # encode() returns bytes; decode again so the hidden search text is
        # not rendered as "b'...'" when interpolated with %s
        search = unicodedata.normalize("NFKD", publication)
        search = search.encode("ascii", "ignore").decode("ascii")
        publication += " <div class='hidden'>%s</div>" % search

        link = '<a href="%s/pages/DETAILS.html"><i>read</i></a>' % relativePath
        singleItemTemplate = '<tr><td>%s</td><td>%s %s</td></tr>' % (link, citeKey, publication)

        # strip BibTeX curly brackets
        contentsList.append(singleItemTemplate.replace("{", "").replace("}", ""))

    contents = "\n".join(sorted(contentsList))
    final = publicationsTemplate.replace("@TABLECONTENTS@", contents)
    return final
def generateContentPage():
    """Unfinished draft of a contents-page generator.

    Plan left by the author (translated):
        - open the template;
        - fill @PATHTOPUBL@, [@CITEKEY@], @AUTHOR@ (@DATE@) @TITLE@ plus a
          link to the DETAILS(.html) page of the text;
        - keep these variables in a dictionary;
        - loop over one citekey/text at a time;
        - input: citekey, path to the file, etc. -> output: a list.

    NOTE(review): ``pNums``, ``pathToBibFile`` and ``citeKey`` are neither
    parameters nor locals; they have to exist at module level for this to
    run. The filled template is built per page but never written or returned.
    """
    detailfileDic = functions.generatePageLinks(pNums)

    # load page template - template_index.html is the one needed here
    with open(settings["template_index"], "r", encoding="utf8") as ft:
        template = ft.read()

    # load individual bib record
    bibDic = functions.loadBib(pathToBibFile)
    bibForHTML = functions.prettifyBib(bibDic[citeKey]["complete"])  # not used yet

    # the links were previously looked up in an undefined ``pageDic``; the
    # dictionary built above is ``detailfileDic``
    for pageLinks in detailfileDic.values():
        pageTemp = template
        pageTemp = pageTemp.replace("@PAGELINKS@", pageLinks)
        pageTemp = pageTemp.replace("@PATHTOFILE@", "")
        pageTemp = pageTemp.replace("@CITATIONKEY@", citeKey)
def generateContents():
    """Unfinished draft of a generator for ``contents.html``.

    The signature was written as ``(...)``, which is not valid Python; it is
    now an empty parameter list.

    Plan left by the author (partly translated):
        - generate a list of publications with links and join it with the
          index page template; an item should look like
          <li><a href="@PATHTOPUBL@/pages/DETAILS.html">[@CITEKEY@]</a> @AUTHOR@ (@DATE@) - <i>@TITLE@</i></li>
        - load the bib file to get all the values;
        - load the index template file;
        - write the <li> items into the content, replacing @AUTHOR@ etc.;
        - save as contents.html;
        - keep the variables in a dictionary and loop over one
          citekey/text at a time;
        - input: citekey, path to the file, etc. -> output: a list.

    NOTE(review): ``pNums``, ``pathToBibFile`` and ``citeKey`` are neither
    parameters nor locals; they have to exist at module level for this to
    run. The filled template is built per page but never written or returned.
    """
    detailfileDic = functions.generatePageLinks(pNums)

    # load page template - template_index.html is the one needed here
    with open(settings["template_index"], "r", encoding="utf8") as ft:
        template = ft.read()

    # load individual bib record
    bibDic = functions.loadBib(pathToBibFile)
    bibForHTML = functions.prettifyBib(bibDic[citeKey]["complete"])  # not used yet

    # the links were previously looked up in an undefined ``pageDic``; the
    # dictionary built above is ``detailfileDic``
    for pageLinks in detailfileDic.values():
        pageTemp = template
        pageTemp = pageTemp.replace("@PAGELINKS@", pageLinks)
        pageTemp = pageTemp.replace("@PATHTOFILE@", "")
        pageTemp = pageTemp.replace("@CITATIONKEY@", citeKey)
def generateReferenceSimple(bibTexCode):
    """Return a short HTML reference: ``Author (YYYY). <i>Title</i>``.

    Args:
        bibTexCode: citation key of the publication.

    Returns:
        The reference string with BibTeX curly brackets removed.

    Raises:
        KeyError: if the record has no ``title`` field.
    """
    pathToPubl = functions.generatePublPath(settings["path_to_memex"], bibTexCode)
    bib = functions.loadBib(os.path.join(pathToPubl, "%s.bib" % bibTexCode))
    bib = bib[bibTexCode]

    # an author, when present, wins over an editor
    author = "N.d."
    if "editor" in bib:
        author = bib["editor"]
    if "author" in bib:
        author = bib["author"]

    # a record without a date used to raise KeyError; fall back to "N.d."
    date = bib["date"][:4] if "date" in bib else "N.d."

    reference = "%s (%s). <i>%s</i>" % (author, date, bib["title"])
    return reference.replace("{", "").replace("}", "")
def generateReferenceSimple(bibTexCode):
    """Return a short HTML reference: ``Author (YYYY). <i>Title</i>``.

    Args:
        bibTexCode: citation key of the publication.

    Returns:
        The reference string with BibTeX curly brackets removed.

    Raises:
        KeyError: if the record has no ``title`` field.
    """
    pathToPubl = functions.generatePublPath(settings["path_to_memex"], bibTexCode)
    bib = functions.loadBib(os.path.join(pathToPubl, "%s.bib" % bibTexCode))
    bib = bib[bibTexCode]

    # no information on the author by default; an author wins over an editor
    author = "N.d."
    if "editor" in bib:
        author = bib["editor"]
    if "author" in bib:
        author = bib["author"]

    # a record without a year used to raise KeyError; fall back to "N.d."
    year = bib["year"][:4] if "year" in bib else "N.d."

    reference = "%s (%s). <i>%s</i>" % (author, year, bib["title"])
    return reference.replace("{", "").replace("}", "")
def processAllRecords(bibDataFile):
    """Process and OCR every record of a bib file in random order.

    Args:
        bibDataFile: path of the bib file to load.
    """
    records = functions.loadBib(bibDataFile)

    # random order lets several instances of the script work side by side
    shuffledKeys = list(records.keys())
    random.shuffle(shuffledKeys)

    for citeKey in shuffledKeys:
        record = records[citeKey]
        functions.processBibRecord(settings["path_to_memex"], record)
        ocrLanguage = functions.identifyLanguage(record["rCite"], "eng")
        ocrPublication(record["rCite"], ocrLanguage)

    # status report per file type once everything has been processed
    for extension in (".pdf", ".bib", ".png", ".json"):
        functions.memexStatusUpdates(settings["path_to_memex"], extension)
def createIndex(pathToMemex):
    """Write ``index.html`` listing all publications and searches.

    Args:
        pathToMemex: root folder of the memex; publication paths are built
            from it and it is passed to ``formatSearches``.

    NOTE(review): the page is written to ``index.html`` in the current
    working directory, not into ``pathToMemex`` - confirm this is intended.
    """
    bibData = functions.loadBib(settings["bib_all"])
    with open(settings["template_index"], "r", encoding="utf8") as ft:
        template = ft.read()

    # one table row per publication (the <td> cells were added to the <li>)
    entryTemplate = ("<tr><td><li><a href=" + "@PATHTOPUBL@/pages/DETAILS.html>"
                     + "[@CITEKEY@]</a></td><td> @AUTHOR@</td> <td>(@DATE@)</td> - <td><i>@TITLE@</i></td></li></tr>")
    # placeholder -> bib field; a missing field is rendered as "MISSING"
    fieldPlaceholders = (("@AUTHOR@", "author"), ("@DATE@", "date"), ("@TITLE@", "title"))

    completeList = []
    for citeKey, record in bibData.items():
        # use the function's own argument rather than the global memexPath
        path = functions.generatePublPath(pathToMemex, citeKey)
        entry = entryTemplate.replace("@PATHTOPUBL@", path)
        entry = entry.replace("@CITEKEY@", citeKey)
        for placeholder, field in fieldPlaceholders:
            entry = entry.replace(placeholder, record[field] if field in record else "MISSING")
        completeList.append(entry)

    # join all entries into one string and strip BibTeX curly brackets
    content = "\n<ul>\n%s\n</ul>" % "\n".join(sorted(completeList))
    content = content.replace("{", "").replace("}", "")

    # table of contents for the search result pages
    toc = formatSearches(pathToMemex)
    template = template.replace("@SEARCHES@", toc)
    # table of publications, built the same way as the searches table
    template = template.replace(
        "@PUBLICATIONS@", publTemplate.replace("@TABLECONTENTS@", content))

    with open("index.html", "w", encoding="utf8") as f9:
        f9.write(template)
def processAllRecords(bibDataFile):
    """Process and OCR every record of a bib file in random order.

    Args:
        bibDataFile: path of the bib file to load.
    """
    # load the bib file as a dictionary
    bibData = functions.loadBib(bibDataFile)

    # shuffle the citation keys so that parallel runs start on different records
    keys = list(bibData.keys())
    random.shuffle(keys)

    # a bare ``print`` is a no-op expression in Python 3; call it to emit the
    # intended blank line
    print()
    print(str(keys))

    # process each record by citation key
    for key in keys:
        bibRecord = bibData[key]

        # create the publication folder with pdf and bib files, if not there yet
        functions.processBibRecord(settings["path_to_memex"], bibRecord)

        language = functions.identifyLanguage(bibRecord, "eng")
        # save the OCRed text as json and create a .png image for each page
        ocrPublication(bibRecord["rCite"], language)
def generateContentPage(citeKey, pathToBibFile):
    """Load the index template and one bib record (draft; nothing is written).

    Args:
        citeKey: citation key of the record to load.
        pathToBibFile: path of the bib file that holds the record.
    """
    # progress output
    for line in ("=" * 80, citeKey, pathToBibFile):
        print(line)

    # load page template
    with open(settings["template_index"], "r", encoding="utf8") as ft:
        template = ft.read()

    # load individual bib record
    bibFile = pathToBibFile
    record = functions.loadBib(bibFile)[citeKey]
    bibForHTML = bibText.prettifyBib(record["complete"])

    pageTemp = template
def formatPublList(pathToMemex):
    """Format the table of publications that have both OCR results and a bib file.

    Args:
        pathToMemex: root folder of the memex.

    Returns:
        ``publicationsTemplate`` with ``@TABLECONTENTS@`` replaced by the
        sorted table rows.

    Raises:
        KeyError: if a listed record has no ``title`` field.
    """
    ocrFiles = functions.dicOfRelevantFiles(pathToMemex, settings["ocr_results"])
    bibFiles = functions.dicOfRelevantFiles(pathToMemex, ".bib")

    contentsList = []
    for key in ocrFiles:
        if key not in bibFiles:
            continue

        bibRecord = functions.loadBib(bibFiles[key])[key]
        relativePath = functions.generatePublPath(pathToMemex, key).replace(pathToMemex, "")

        # an author, when present, wins over an editor
        authorOrEditor = "[No data]"
        if "editor" in bibRecord:
            authorOrEditor = bibRecord["editor"]
        if "author" in bibRecord:
            authorOrEditor = bibRecord["author"]

        date = bibRecord["year"] if "year" in bibRecord else "nodate"
        title = bibRecord["title"]

        # formatting template
        citeKey = '<div class="ID">[%s]</div>' % key
        publication = '%s (%s) <i>%s</i>' % (authorOrEditor, date, title)

        # encode() returns bytes; decode again so the hidden search text is
        # not rendered as "b'...'" when interpolated with %s
        search = unicodedata.normalize("NFKD", publication)
        search = search.encode("ascii", "ignore").decode("ascii")
        publication += " <div class='hidden'>%s</div>" % search

        link = '<a href="%s/pages/DETAILS.html"><i>read</i></a>' % relativePath
        singleItemTemplate = '<tr><td>%s</td><td>%s %s</td></tr>' % (link, citeKey, publication)

        # strip BibTeX curly brackets
        contentsList.append(singleItemTemplate.replace("{", "").replace("}", ""))

    contents = "\n".join(sorted(contentsList))
    final = publicationsTemplate.replace("@TABLECONTENTS@", contents)
    return final
def processAllFiles(pathToMemex):
    """OCR every publication, choosing the OCR language from its bib record.

    The ``language`` field of each record is mapped to an OCR abbreviation
    through ``./_bib/language_keys.yml``; records without the field, or with
    a language missing from that file, are processed as ``"eng"``.

    Args:
        pathToMemex: root folder of the memex, passed on to ``ocrPublication``.
    """
    bibData = functions.loadBib(settings["bib_all"])  # loads the bib file

    languageKeysFile = "./_bib/language_keys.yml"
    with open(languageKeysFile) as keysFile:
        # an empty file loads as None; treat it as "no languages known"
        languages = yaml.load(keysFile, Loader=yaml.FullLoader) or {}

    for citeKey, record in bibData.items():
        tempLang = "eng"  # default
        if "language" in record:
            recordLanguage = record["language"]
            if recordLanguage in languages:
                # take the proper OCR abbreviation for the language
                tempLang = languages[recordLanguage]
            else:
                print(recordLanguage + " is not in the " + languageKeysFile
                      + " file, please add. Will try with english as default")
        print(tempLang)

        # pass the language chosen for this record, not the whole mapping
        ocrPublication(pathToMemex, citeKey, tempLang)
def generateDoclLink(bibTexCode, pageVal, distance):
    """Build one HTML table row that links to a publication or a page cluster.

    Args:
        bibTexCode: citation key of the publication.
        pageVal: 0 links to the DETAILS page; any other value is used as the
            last page of a cluster whose first page is ``pageVal - 5``.
        distance: number rendered with ``{:f}`` in the second column.

    Returns:
        A ``<tr>...</tr>`` string.

    Raises:
        KeyError: if the record has no ``date`` or ``title`` field.
    """
    pathToPubl = functions.generatePublPath(memexPath, bibTexCode)
    bib = functions.loadBib(os.path.join(pathToPubl, "%s.bib" % bibTexCode))
    bib = bib[bibTexCode]

    # an author, when present, wins over an editor
    author = "N.d."
    if "editor" in bib:
        author = bib["editor"]
    if "author" in bib:
        author = bib["author"]

    reference = "%s (%s). <i>%s</i>" % (author, bib["date"][:4], bib["title"])

    # encode() returns bytes; decode again so the hidden search text is not
    # rendered as "b'...'" when interpolated with %s
    asciiReference = unicodedata.normalize("NFKD", reference)
    asciiReference = asciiReference.encode("ascii", "ignore").decode("ascii")
    search = " <div class='hidden'>%s</div>" % asciiReference

    relativeRoot = pathToPubl.replace(memexPath, "../../../../")
    if pageVal == 0:
        # link to the start of the publication
        htmlLink = os.path.join(relativeRoot, "pages", "DETAILS.html")
        htmlLink = "<a href='{0}'>[{1}]</a>".format(htmlLink, bibTexCode)
        page = ""
        startPage = 0
    else:
        startPage = pageVal - 5
        endPage = pageVal
        if startPage == 0:
            # page 0 does not exist; the first page file is 0001.html
            startPage += 1

        # page numbers shown to the reader come from checkPageNumbers;
        # the link itself still targets the file named after startPage
        realStartPage = checkPageNumbers(bib, bibTexCode, startPage)
        realEndPage = checkPageNumbers(bib, bibTexCode, endPage)

        htmlLink = os.path.join(relativeRoot, "pages", "%04d.html" % startPage)
        htmlLink = "<a href='{0}'>[{1},{2}]</a>".format(
            htmlLink, bibTexCode, realStartPage)
        # NOTE(review): the closing </i></a> has no opening tag in this cell
        page = ", pp. {0}-{1}</i></a>".format(realStartPage, realEndPage)

    # strip BibTeX curly brackets
    publicationInfo = (reference + page + search).replace("{", "").replace("}", "")
    singleItemTemplate = '<tr><td data-order="{1}{2:05d}"><div class="ID">{3}</div> {4}</td><td>{0:f}</td></tr>'.format(
        distance, bibTexCode, startPage, htmlLink, publicationInfo)
    return singleItemTemplate
def processAllRecordsSTR(pathToMemex):
    """OCR the publications found under the memex folder, in random order.

    A publication whose ``pagetotal`` field exceeds ``settings["page_limit"]``
    is skipped; one without the field, or with a value that is not a plain
    integer, is processed.

    Args:
        pathToMemex: root folder that is searched for ``.bib`` files.
    """
    files = functions.dicOfRelevantFiles(pathToMemex, ".bib")
    citeKeys = list(files.keys())
    random.shuffle(citeKeys)

    for citeKey in citeKeys:
        print(citeKey)
        # loadBib returns a dictionary keyed by citation key; the fields live
        # one level down, so "pagetotal" has to be looked up in the record
        bibRecord = functions.loadBib(files[citeKey])[citeKey]

        if "pagetotal" in bibRecord:
            try:
                pageTotal = int(bibRecord["pagetotal"])
            except ValueError:
                # values such as "xii+345" cannot be compared; treat as unknown
                pageTotal = None
            if pageTotal is not None and pageTotal > int(settings["page_limit"]):
                continue

        language = functions.identifyLanguage(bibRecord, "eng")
        ocrPublication(citeKey, language, settings["page_limit"])

    # status report per file type once everything has been processed
    for extension in (".pdf", ".bib", ".png", ".json"):
        functions.memexStatusUpdates(settings["path_to_memex"], extension)
def generateReferenceSimple(bibTexCode):
    """Return a short HTML reference: ``Author (date). <i>Title</i>``.

    Args:
        bibTexCode: citation key of the publication.

    Returns:
        The reference string with curly brackets removed. "N.d." stands in
        for a missing author/editor and for a missing date.
    """
    publFolder = functions.generatePublPath(settings["path_to_memex"], bibTexCode)
    record = functions.loadBib(os.path.join(publFolder, "%s.bib" % bibTexCode))[bibTexCode]

    # author first, editor second, placeholder last
    if "author" in record:
        creator = record["author"]
    elif "editor" in record:
        creator = record["editor"]
    else:
        creator = "N.d."

    issued = record["date"] if "date" in record else "N.d."

    formatted = "%s (%s). <i>%s</i>" % (creator, issued, record["title"])
    for bracket in ("{", "}"):
        formatted = formatted.replace(bracket, "")
    return formatted
def generateContentsPage():
    """Write ``contents.html`` with one link per ``bib`` file under ``memexPath``.

    Raises:
        KeyError: if a record has no ``author``, ``date`` or ``title`` field.
    """
    # load contents template
    with open(settings["template_contents"], "r", encoding="utf8") as ft:
        pageTemp = ft.read()

    relevantFiles = functions.dicOfRelevantFiles(memexPath, "bib")

    links = []
    for rawKey, bibPath in relevantFiles.items():
        # drop the last character of the dictionary key
        citeKey = rawKey[:-1]
        record = functions.loadBib(bibPath)[citeKey]
        publFolder = os.path.join(citeKey[0], citeKey[:2], citeKey)
        links.append(
            "<a href=\"{0}/pages/DETAILS.html\">[{1}]</a> {2} ({3}) - <i>{4}</i>"
            .format(publFolder, citeKey, record["author"], record["date"], record["title"]))

    # sorted links, separated so that each one ends up in its own list item
    links.sort()
    pageTemp = pageTemp.replace("@MAINCONTENT@", "</li><li>".join(links))

    # create the file contents.html
    with open(os.path.join(memexPath, "contents.html"), "w", encoding="utf8") as f2:
        f2.write(pageTemp)
def processAllRecords(bibDataFile):
    """Process and OCR every record of a bib file in random order.

    Args:
        bibDataFile: path of the bib file to load.
    """
    records = functions.loadBib(bibDataFile)

    # randomize the order in which publications are OCRed
    shuffledKeys = list(records.keys())
    random.shuffle(shuffledKeys)

    for citeKey in shuffledKeys:
        record = records[citeKey]
        functions.processBibRecord(settings["path_to_memex"], record)
        ocrLanguage = functions.identifyLanguage(record["rCite"], "eng")
        ocrPublication(record["rCite"], ocrLanguage, int(settings["page_limit"]))

    # status report per file type once everything has been processed
    for extension in (".pdf", ".bib", ".png", ".json"):
        functions.memexStatusUpdates(settings["path_to_memex"], extension)
def generatePublicationInterface(citeKey, pathToBibFile):
    """Write one HTML page per OCRed page of a publication, plus DETAILS.

    Args:
        citeKey: citation key of the publication.
        pathToBibFile: path of the publication's bib file; the OCR results
            are read from the same path with ``.json`` instead of ``.bib``.
    """
    print("=" * 80)
    print(citeKey)

    with open(pathToBibFile.replace(".bib", ".json"), encoding="utf8") as jsonData:
        ocred = json.load(jsonData)
    pageDic = functions.generatePageLinks(ocred.keys())

    # load page template
    with open(settings["template_page"], "r", encoding="utf8") as ft:
        template = ft.read()

    # load individual bib record
    bibForHTML = functions.prettifyBib(functions.loadBib(pathToBibFile)[citeKey]["complete"])

    orderedPages = list(pageDic.keys())
    lastPosition = len(orderedPages) - 1

    for position, pageId in enumerate(orderedPages):
        isDetails = pageId == "DETAILS"

        pageTemp = template.replace("@PAGELINKS@", pageDic[pageId])
        pageTemp = pageTemp.replace("@PATHTOFILE@", "")
        pageTemp = pageTemp.replace("@CITATIONKEY@", citeKey)

        if isDetails:
            # bib record followed by the word cloud, no OCR text
            mainElement = '<div class="bib">%s</div>' % bibForHTML.replace("\n", "<br> ")
            mainElement += '\n<img src="wordcloud.jpg" width="100%" alt="wordcloud">'
            ocrText = ""
        else:
            # page image followed by its OCR text
            mainElement = '<img src="@PAGEFILE@" width="100%" alt="">'.replace(
                "@PAGEFILE@", "%s.png" % pageId)
            ocrText = ocred[pageId].replace("\n", "<br>")
        pageTemp = pageTemp.replace("@MAINELEMENT@", mainElement)
        pageTemp = pageTemp.replace("@OCREDCONTENT@", ocrText)

        # @NEXTPAGEHTML@ and @PREVIOUSPAGEHTML@
        if isDetails:
            nextPage, prevPage = "0001.html", ""
        elif pageId == "0001":
            nextPage, prevPage = "0002.html", "DETAILS.html"
        else:
            prevPage = orderedPages[position - 1] + ".html"
            if position == lastPosition:
                nextPage = ""
            else:
                nextPage = orderedPages[position + 1] + ".html"
        pageTemp = pageTemp.replace("@NEXTPAGEHTML@", nextPage)
        pageTemp = pageTemp.replace("@PREVIOUSPAGEHTML@", prevPage)

        pagePath = os.path.join(
            pathToBibFile.replace(citeKey + ".bib", ""), "pages", "%s.html" % pageId)
        with open(pagePath, "w", encoding="utf8") as f9:
            f9.write(pageTemp)
# PROCESS ALL RECORDS: APPROACH 2 #########################
###########################################################
# Why this way? Our computers are now quite powerful; they
# often have multiple cores and we can take advantage of this.
# If we process our data in the manner coded below --- we shuffle
# our publications and process them in random order --- we can
# run multiple instances of the same script and data will
# be produced in parallel. You can run as many instances as
# your machine allows (you need to check how many cores
# your machine has). Even running two scripts will cut
# processing time roughly in half.
def processAllRecords(bibData):
    """Create the folders for, and OCR, every record in random order.

    Args:
        bibData: dictionary of bib records keyed by citation key.
    """
    # a fresh random order on every run, so parallel instances start on
    # different PDFs
    shuffledKeys = list(bibData.keys())
    random.shuffle(shuffledKeys)

    for citeKey in shuffledKeys:
        record = bibData[citeKey]

        # 1. create folders, copy files
        functions.processBibRecord(memexPath, record)

        # 2. OCR the file in the language of the record ("eng" as fallback)
        ocrLanguage = identifyLanguage(record, "eng")
        ocrPublication(memexPath, record["rCite"], ocrLanguage)


# load the bibliography and process all of its records
bibData = functions.loadBib(settings["bib_all"])
processAllRecords(bibData)
import functions
import yaml

# Counts how often each language key (``langid``) occurs in the bibliography.
settingsFile = "./settings.yml"
# yaml.load() without a Loader raises TypeError on current PyYAML;
# safe_load reads plain settings, and the with-block closes the file.
with open(settingsFile) as settingsStream:
    settings = yaml.safe_load(settingsStream)

bibData = functions.loadBib(settings["bib_all"])


def getLang(bibData):
    """Count the records per ``langid`` value.

    Args:
        bibData: dictionary of bib records keyed by citation key.

    Returns:
        One line per language key, ``<count padded to 10 digits>\\t<key>``,
        joined by newlines and sorted in descending string order. (The
        result was previously built and then discarded.)

    Raises:
        KeyError: if a record has no ``langid`` field.
    """
    tempDic = {}
    for record in bibData.values():
        langKey = record["langid"]
        tempDic[langKey] = tempDic.get(langKey, 0) + 1

    results = ["%010d\t%s" % (count, langKey) for langKey, count in tempDic.items()]
    results = "\n".join(sorted(results, reverse=True))
    return results
def generatePublicationInterface(citeKey, pathToBibFile):
    """Write the HTML pages (DETAILS plus one per OCRed page) of a publication.

    The OCR results are read from the JSON file whose path is the bib path
    with ``.bib`` replaced by ``.json``. The page template is filled once for
    every key returned by ``functions.generatePageLinks`` and each result is
    saved as ``pages/<key>.html`` in the folder of the bib file.

    Relies on module-level names that are not defined in this function:
    ``settings``, ``bibText``, ``pageConnData``, ``publConnData``,
    ``connectionsTemplate``, ``ocrTemplate``, ``generalTemplate``,
    ``roundUp`` and ``generateReferenceSimple``.

    Args:
        citeKey: citation key of the publication.
        pathToBibFile: path to the ``.bib`` file of the publication.

    Returns:
        None; the pages are written to disk.
    """
    print("=" * 80)
    print(citeKey)

    # OCR results: page key -> OCRed text
    jsonFile = pathToBibFile.replace(".bib", ".json")
    with open(jsonFile, encoding="utf8") as jsonData:
        ocred = json.load(jsonData)

    # page key -> HTML with the links to all pages of the publication
    pageDic = functions.generatePageLinks(ocred.keys())

    # load page template
    with open(settings["template_page"], "r", encoding="utf8") as ft:
        template = ft.read()

    # load individual bib record
    bibDic = functions.loadBib(pathToBibFile)
    bibForHTML = bibText.prettifyBib(bibDic[citeKey]["complete"])

    orderedPages = list(pageDic.keys())
    lastIndex = len(orderedPages) - 1

    # values that do not change from page to page
    pagesFolder = os.path.join(pathToBibFile.replace(citeKey + ".bib", ""), "pages")
    emptyResults = '<tr><td><i>%s</i></td><td><i>%s</i></td><td><i>%s</i></td></tr>'
    noDataRow = emptyResults % ("no data", "no data", "no data")

    for o, k in enumerate(orderedPages):
        pageTemp = template
        pageTemp = pageTemp.replace("@PAGELINKS@", pageDic[k])
        pageTemp = pageTemp.replace("@PATHTOFILE@", "")
        pageTemp = pageTemp.replace("@CITATIONKEY@", citeKey)

        if k != "DETAILS":
            # regular page: page image, connected page clusters, OCRed text
            mainElement = '<img src="@PAGEFILE@" width="100%" alt="">'.replace("@PAGEFILE@", "%s.png" % k)

            # NOTE(review): presumably roundUp maps the page number onto the
            # key of the page cluster it belongs to -- confirm against roundUp
            pageKey = citeKey + "_%05d" % roundUp(int(k), 5)
            if pageKey in pageConnData:
                formattedResults = "\n".join(pageConnData[pageKey])
            else:
                formattedResults = noDataRow

            mainElement += connectionsTemplate.replace("@CONNECTEDTEXTSTEMP@", formattedResults)
            mainElement += ocrTemplate.replace("@OCREDCONTENTTEMP@", ocred[k].replace("\n", "<br>"))
        else:
            # DETAILS page: reference, bib record, word cloud, connected texts
            reference = generateReferenceSimple(citeKey)
            mainElement = "<h3>%s</h3>\n\n" % reference

            bibElement = '<div class="bib">%s</div>' % bibForHTML.replace("\n", "<br> ")
            bibElement = generalTemplate.replace("@ELEMENTCONTENT@", bibElement)
            bibElement = bibElement.replace("@ELEMENTHEADER@", "BibTeX Bibliographical Record")
            mainElement += bibElement + "\n\n"

            wordCloud = '\n<img src="../' + citeKey + '_wCloud.jpg" width="100%" alt="wordcloud">'
            wordCloud = generalTemplate.replace("@ELEMENTCONTENT@", wordCloud)
            wordCloud = wordCloud.replace("@ELEMENTHEADER@", "WordCloud of Keywords (<i>tf-idf</i>)")
            mainElement += wordCloud + "\n\n"

            if citeKey in publConnData:
                formattedResults = "\n".join(publConnData[citeKey])
            else:
                formattedResults = noDataRow
            mainElement += connectionsTemplate.replace("@CONNECTEDTEXTSTEMP@", formattedResults)

        pageTemp = pageTemp.replace("@MAINELEMENT@", mainElement)

        # @NEXTPAGEHTML@ and @PREVIOUSPAGEHTML@
        # The fixed targets 0001.html / 0002.html are only used when that page
        # exists; otherwise a publication without pages or with a single page
        # would get a link to a file that is never written.
        if k == "DETAILS":
            prevPage = ""
            nextPage = "0001.html" if "0001" in pageDic else ""
        elif k == "0001":
            prevPage = "DETAILS.html"
            nextPage = "0002.html" if "0002" in pageDic else ""
        elif o == lastIndex:
            prevPage = orderedPages[o - 1] + ".html"
            nextPage = ""
        else:
            prevPage = orderedPages[o - 1] + ".html"
            nextPage = orderedPages[o + 1] + ".html"

        pageTemp = pageTemp.replace("@NEXTPAGEHTML@", nextPage)
        pageTemp = pageTemp.replace("@PREVIOUSPAGEHTML@", prevPage)

        pagePath = os.path.join(pagesFolder, "%s.html" % k)
        with open(pagePath, "w", encoding="utf8") as f9:
            f9.write(pageTemp)
def generatePublicationInterface(citeKey, pathToBibFile):
    """Write the HTML pages (DETAILS plus one per OCRed page) of a publication.

    The OCR results are read from the JSON file whose path is the bib path
    with ``.bib`` replaced by ``.json``. The page template is filled once for
    every key returned by ``functions.generatePageLinks`` and each result is
    saved as ``pages/<key>.html`` in the folder of the bib file.

    Relies on the module-level names ``settings`` and ``functions``.

    Args:
        citeKey: citation key of the publication.
        pathToBibFile: path to the ``.bib`` file of the publication.

    Returns:
        None; the pages are written to disk.
    """
    print("=" * 80)
    print(citeKey)

    # OCR results: page key -> OCRed text
    jsonFile = pathToBibFile.replace(".bib", ".json")
    with open(jsonFile, encoding="utf8") as jsonData:
        ocred = json.load(jsonData)

    # page key -> HTML with the links that make the publication navigable
    pageDic = functions.generatePageLinks(ocred.keys())

    # load page template
    with open(settings["template_page"], "r", encoding="utf8") as ft:
        template = ft.read()

    # load individual bib record and make it more readable for this view
    bibDic = functions.loadBib(pathToBibFile)
    bibForHTML = functions.prettifyBib(bibDic[citeKey]["complete"])

    orderedPages = list(pageDic.keys())
    lastIndex = len(orderedPages) - 1
    # the output folder is the same for every page
    pagesFolder = os.path.join(pathToBibFile.replace(citeKey + ".bib", ""), "pages")

    for o, k in enumerate(orderedPages):
        pageTemp = template
        pageTemp = pageTemp.replace("@PAGELINKS@", pageDic[k])
        pageTemp = pageTemp.replace("@PATHTOFILE@", "")
        pageTemp = pageTemp.replace("@CITATIONKEY@", citeKey)

        if k != "DETAILS":
            # regular page: page image plus its OCRed text
            mainElement = '<img src="@PAGEFILE@" width="100%" alt="">'.replace("@PAGEFILE@", "%s.png" % k)
            ocredContent = ocred[k].replace("\n", "<br>")
        else:
            # DETAILS page: bib record (class "bib" for the style sheet) and
            # the word cloud image; no OCRed text
            mainElement = '<div class="bib">%s</div>' % bibForHTML.replace("\n", "<br> ")
            mainElement += '\n<img src="wordcloud.jpg" width="100%" alt="wordcloud">'
            ocredContent = ""

        pageTemp = pageTemp.replace("@MAINELEMENT@", mainElement)
        pageTemp = pageTemp.replace("@OCREDCONTENT@", ocredContent)

        # @NEXTPAGEHTML@ and @PREVIOUSPAGEHTML@
        # The fixed targets 0001.html / 0002.html are only used when that page
        # exists; otherwise a publication without pages or with a single page
        # would get a link to a file that is never written.
        if k == "DETAILS":
            prevPage = ""
            nextPage = "0001.html" if "0001" in pageDic else ""
        elif k == "0001":
            prevPage = "DETAILS.html"
            nextPage = "0002.html" if "0002" in pageDic else ""
        elif o == lastIndex:
            prevPage = orderedPages[o - 1] + ".html"
            nextPage = ""
        else:
            prevPage = orderedPages[o - 1] + ".html"
            nextPage = orderedPages[o + 1] + ".html"

        pageTemp = pageTemp.replace("@NEXTPAGEHTML@", nextPage)
        pageTemp = pageTemp.replace("@PREVIOUSPAGEHTML@", prevPage)

        # save the page
        pagePath = os.path.join(pagesFolder, "%s.html" % k)
        with open(pagePath, "w", encoding="utf8") as f9:
            f9.write(pageTemp)
def generatePublicationInterface(citeKey, pathToBibFile):
    """Write the HTML pages (DETAILS plus one per OCRed page) of a publication.

    The OCR results are read from the JSON file whose path is the bib path
    with ``.bib`` replaced by ``.json``. The page template is filled once for
    every key returned by ``functions.generatePageLinks`` and each result is
    saved as ``pages/<key>.html`` in the folder of the bib file.

    Relies on module-level names that are not defined in this function:
    ``settings``, ``functions``, ``pageConnData``, ``publConnData``,
    ``connectionsTemplate``, ``ocrTemplate``, ``generalTemplate``,
    ``roundUp`` and ``generateReferenceSimple``.

    Args:
        citeKey: citation key of the publication.
        pathToBibFile: path to the ``.bib`` file of the publication.

    Returns:
        None; the pages are written to disk.
    """
    print("=" * 80)
    print(citeKey)

    # OCR results: page key -> OCRed text
    jsonFile = pathToBibFile.replace(".bib", ".json")
    with open(jsonFile, "r", encoding="utf8") as jsonData:
        ocred = json.load(jsonData)

    # page key -> HTML with the links to all pages of the publication
    pageDic = functions.generatePageLinks(ocred.keys())

    # load page template
    with open(settings["template_page"], "r", encoding="utf8") as ft:
        template = ft.read()

    # load individual bib record
    bibDic = functions.loadBib(pathToBibFile)
    bibForHTML = functions.prettifyBib(bibDic[citeKey]["complete"])

    orderedPages = list(pageDic.keys())
    lastIndex = len(orderedPages) - 1

    # values that do not change from page to page
    pagesFolder = os.path.join(pathToBibFile.replace(citeKey + ".bib", ""), "pages")
    emptyResults = '<tr><td><i>%s</i></td><td><i>%s</i></td><td><i>%s</i></td></tr>'
    noDataRow = emptyResults % ("no data", "no data", "no data")

    for o, k in enumerate(orderedPages):
        pageTemp = template
        pageTemp = pageTemp.replace("@PAGELINKS@", pageDic[k])
        pageTemp = pageTemp.replace("@PATHTOFILE@", "")
        pageTemp = pageTemp.replace("@CITATIONKEY@", citeKey)

        if k != "DETAILS":
            # regular page: page image, connected page clusters, OCRed text
            mainElement = '<img src="@PAGEFILE@" width="100%" alt="">'.replace("@PAGEFILE@", "%s.png" % k)

            # NOTE(review): presumably roundUp maps the page number onto the
            # key of the page cluster it belongs to -- confirm against roundUp
            pageKey = citeKey + "_%05d" % roundUp(int(k), 5)
            if pageKey in pageConnData:
                formattedResults = "\n".join(pageConnData[pageKey])
            else:
                formattedResults = noDataRow

            mainElement += connectionsTemplate.replace("@CONNECTEDTEXTSTEMP@", formattedResults)
            mainElement += ocrTemplate.replace("@OCREDCONTENTTEMP@", ocred[k].replace("\n", "<br>"))
        else:
            # DETAILS page: reference, bib record, word cloud, connected texts
            reference = generateReferenceSimple(citeKey)
            mainElement = "<h3>%s</h3>\n\n" % reference

            bibElement = '<div class="bib">%s</div>' % bibForHTML.replace("\n", "<br> ")
            bibElement = generalTemplate.replace("@ELEMENTCONTENT@", bibElement)
            bibElement = bibElement.replace("@ELEMENTHEADER@", "BibTeX Bibliographical Record")
            mainElement += bibElement + "\n\n"

            wordCloud = '\n<img src="../' + citeKey + '_wCloud.jpg" width="100%" alt="wordcloud">'
            wordCloud = generalTemplate.replace("@ELEMENTCONTENT@", wordCloud)
            wordCloud = wordCloud.replace("@ELEMENTHEADER@", "WordCloud of Keywords (<i>tf-idf</i>)")
            mainElement += wordCloud + "\n\n"

            if citeKey in publConnData:
                formattedResults = "\n".join(publConnData[citeKey])
            else:
                formattedResults = noDataRow
            mainElement += connectionsTemplate.replace("@CONNECTEDTEXTSTEMP@", formattedResults)

        pageTemp = pageTemp.replace("@MAINELEMENT@", mainElement)

        # @NEXTPAGEHTML@ and @PREVIOUSPAGEHTML@
        # The fixed targets 0001.html / 0002.html are only used when that page
        # exists; otherwise a publication without pages or with a single page
        # would get a link to a file that is never written.
        if k == "DETAILS":
            prevPage = ""
            nextPage = "0001.html" if "0001" in pageDic else ""
        elif k == "0001":
            prevPage = "DETAILS.html"
            nextPage = "0002.html" if "0002" in pageDic else ""
        elif o == lastIndex:
            prevPage = orderedPages[o - 1] + ".html"
            nextPage = ""
        else:
            prevPage = orderedPages[o - 1] + ".html"
            nextPage = orderedPages[o + 1] + ".html"

        pageTemp = pageTemp.replace("@NEXTPAGEHTML@", nextPage)
        pageTemp = pageTemp.replace("@PREVIOUSPAGEHTML@", prevPage)

        pagePath = os.path.join(pagesFolder, "%s.html" % k)
        with open(pagePath, "w", encoding="utf8") as f9:
            f9.write(pageTemp)
def generatePublicationInterface(citeKey, pathToBibFile):
    """Write the HTML pages (DETAILS plus one per OCRed page) of a publication.

    The OCR results are read from the JSON file whose path is the bib path
    with ``.bib`` replaced by ``.json``. The page template is filled once for
    every key returned by ``functions.generatePageLinks`` and each result is
    saved as ``pages/<key>.html`` in the folder of the bib file.

    Relies on the module-level names ``settings`` and ``functions``.

    Args:
        citeKey: citation key of the publication.
        pathToBibFile: path to the ``.bib`` file of the publication.

    Returns:
        None; the pages are written to disk.
    """
    print("=" * 80)
    print(citeKey)

    # OCR results: page key -> OCRed text. The encoding is given explicitly,
    # as for the other files here, so that reading does not depend on the
    # default encoding of the platform.
    jsonFile = pathToBibFile.replace(".bib", ".json")
    with open(jsonFile, encoding="utf8") as jsonData:
        ocred = json.load(jsonData)

    # page key -> HTML with the links to all pages of the publication
    pageDic = functions.generatePageLinks(ocred.keys())

    # load page template
    with open(settings["template_page"], "r", encoding="utf8") as ft:
        template = ft.read()

    # load individual bib record and structure it for display
    bibDic = functions.loadBib(pathToBibFile)
    bibForHTML = functions.prettifyBib(bibDic[citeKey]["complete"])

    orderedPages = list(pageDic.keys())
    lastIndex = len(orderedPages) - 1
    # the output folder is the same for every page
    pagesFolder = os.path.join(pathToBibFile.replace(citeKey + ".bib", ""), "pages")

    for o, k in enumerate(orderedPages):
        pageTemp = template
        pageTemp = pageTemp.replace("@PAGELINKS@", pageDic[k])
        pageTemp = pageTemp.replace("@PATHTOFILE@", "")
        pageTemp = pageTemp.replace("@CITATIONKEY@", citeKey)

        if k != "DETAILS":
            # regular page: page image plus its OCRed text
            mainElement = '<img src="@PAGEFILE@" width="100%" alt="">'.replace("@PAGEFILE@", "%s.png" % k)
            ocredContent = ocred[k].replace("\n", "<br>")
        else:
            # DETAILS page: bib record and the word cloud image; no OCRed text
            mainElement = '<div class="bib">%s</div>' % bibForHTML.replace("\n", "<br> ")
            mainElement += '\n<img src="wordcloud.jpg" width="100%" alt="wordcloud">'
            ocredContent = ""

        pageTemp = pageTemp.replace("@MAINELEMENT@", mainElement)
        pageTemp = pageTemp.replace("@OCREDCONTENT@", ocredContent)

        # @NEXTPAGEHTML@ and @PREVIOUSPAGEHTML@
        # The fixed targets 0001.html / 0002.html are only used when that page
        # exists; otherwise a publication without pages or with a single page
        # would get a link to a file that is never written.
        if k == "DETAILS":
            prevPage = ""
            nextPage = "0001.html" if "0001" in pageDic else ""
        elif k == "0001":
            prevPage = "DETAILS.html"
            nextPage = "0002.html" if "0002" in pageDic else ""
        elif o == lastIndex:
            prevPage = orderedPages[o - 1] + ".html"
            nextPage = ""
        else:
            prevPage = orderedPages[o - 1] + ".html"
            nextPage = orderedPages[o + 1] + ".html"

        pageTemp = pageTemp.replace("@NEXTPAGEHTML@", nextPage)
        pageTemp = pageTemp.replace("@PREVIOUSPAGEHTML@", prevPage)

        # save the page
        pagePath = os.path.join(pagesFolder, "%s.html" % k)
        with open(pagePath, "w", encoding="utf8") as f9:
            f9.write(pageTemp)