示例#1
0
def generateDoclLink(bibTexCode, pageVal, distance):
    """Build one HTML table row that links to a publication or to a page in it.

    Args:
        bibTexCode: citation key; also the name of the publication's .bib file.
        pageVal: last page of the matched page cluster. ``0`` links to the
            publication's DETAILS.html instead of a numbered page.
        distance: number rendered with ``%f`` in the second column.

    Returns:
        A ``<tr>...</tr>`` string with the link, the distance and the
        formatted reference.
    """
    pathToPubl = functions.generatePublPath(settings["path_to_memex"], bibTexCode)
    bib = functions.loadBib(os.path.join(pathToPubl, "%s.bib" % bibTexCode))
    bib = bib[bibTexCode]

    # the author takes precedence over the editor when both are present
    author = bib.get("author", bib.get("editor", "N.d."))

    # a record without a date gets a placeholder instead of raising KeyError
    reference = "%s (%s). <i>%s</i>" % (author, bib.get("date", "N.d.")[:4], bib["title"])

    # ASCII-folded copy of the reference for the hidden div; decode back to
    # str, otherwise Python 3 would embed the bytes repr (b'...') in the HTML
    folded = unicodedata.normalize('NFKD', reference).encode('ascii', 'ignore').decode('ascii')
    search = " <div class='hidden'>%s</div>" % folded

    pagesDir = os.path.join(pathToPubl.replace(settings["path_to_memex"], "../../../../"), "pages")

    if pageVal == 0:  # link to the start of the publication
        target = "DETAILS.html"
        page = ""
        startPage = 0
    else:
        # the cluster starts five pages back, but never before page 1
        startPage = max(pageVal - 5, 1)
        endPage = pageVal
        target = "%04d.html" % startPage
        page = ", pdfPp. %d-%d</i></a>" % (startPage, endPage)

    htmlLink = "<a href='%s'><i>read</i></a>" % os.path.join(pagesDir, target)

    publicationInfo = reference + page + search
    publicationInfo = publicationInfo.replace("{", "").replace("}", "")  # strip BibTeX braces
    singleItemTemplate = '<tr><td>%s</td><td>%f</td><td data-order="%s%05d">%s</td></tr>' % (
        htmlLink, distance, bibTexCode, startPage, publicationInfo)

    return singleItemTemplate
示例#2
0
def createIndex(pathToMemex):
    """Render the publications/searches page and save it in ``pathToMemex``.

    One row is produced per record of the ``settings["bib_all"]`` bibliography;
    absent ``author``/``year``/``title`` fields are shown as ``MISSING``. The
    result is written to ``searchesInterface.html``.
    """
    records = functions.loadBib(settings["bib_all"])
    with open(settings["template_index"], "r", encoding="utf8") as templateFile:
        pageHtml = templateFile.read()

    fieldPlaceholders = (("@AUTHOR@", "author"), ("@DATE@", "year"), ("@TITLE@", "title"))

    rows = []
    for citeKey, record in records.items():
        publPath = functions.generatePublPath(memexPath, citeKey)
        row = ("<tr><td><li><a href=" + "@PATHTOPUBL@/pages/DETAILS.html>" +
               "[@CITEKEY@]</a></td><td> @AUTHOR@</td> <td>(@DATE@)</td> - <td><i>@TITLE@</i></td></li></tr>")
        row = row.replace("@PATHTOPUBL@", publPath)
        row = row.replace("@CITEKEY@", citeKey)
        for placeholder, field in fieldPlaceholders:
            value = record[field] if field in record else "MISSING"
            row = row.replace(placeholder, value)
        rows.append(row)

    rows.sort()
    listing = "\n<ul>\n%s\n</ul>" % "\n".join(rows)
    # strip BibTeX braces from the whole listing
    listing = listing.replace("{", "").replace("}", "")

    searchesToc = formatSearches(pathToMemex)
    pageHtml = pageHtml.replace("@SEARCHES@", searchesToc)
    pageHtml = pageHtml.replace("@PUBLICATIONS@",
                                publTemplate.replace("@TABLECONTENTS@", listing))

    outputPath = os.path.join(pathToMemex, "searchesInterface.html")
    with open(outputPath, "w", encoding="utf8") as outFile:
        outFile.write(pageHtml)
示例#3
0
def generateMemexStartingPages(pathToMemex):
    """Write ``index.html`` and ``contents.html`` into ``pathToMemex``.

    ``index.html`` is the index template filled with the text of
    ``settings["content_index"]``. ``contents.html`` is the same template
    filled with a sorted list of every record found in the ``.bib`` files
    below ``pathToMemex``.
    """
    # load index template
    with open(settings["template_index"], "r", encoding="utf8") as ft:
        template = ft.read()

    # add index.html
    with open(settings["content_index"], "r", encoding="utf8") as fi:
        indexData = fi.read()
    with open(os.path.join(pathToMemex, "index.html"), "w", encoding="utf8") as f9:
        f9.write(template.replace("@MAINCONTENT@", indexData))

    # load bibliographical data for processing
    publicationDic = {}  # key = citationKey; value = recordDic
    for subdir, _dirs, files in os.walk(pathToMemex):
        for file in files:
            if file.endswith(".bib"):
                publicationDic.update(functions.loadBib(os.path.join(subdir, file)))

    # generate data for the main CONTENTS
    singleItemTemplate = '<li><a href="@RELATIVEPATH@/pages/DETAILS.html">[@CITATIONKEY@]</a> @AUTHOROREDITOR@ (@DATE@) - <i>@TITLE@</i></li>'
    contentsList = []

    for citeKey, bibRecord in publicationDic.items():
        relativePath = functions.generatePublPath(pathToMemex, citeKey).replace(pathToMemex, "")

        # the author takes precedence over the editor when both are present
        authorOrEditor = bibRecord.get("author", bibRecord.get("editor", "[No data]"))

        if "date" in bibRecord:
            date = bibRecord["date"][:4]
        else:
            # previously this case was reported and then crashed with a
            # KeyError on the unconditional lookup; use a placeholder instead
            print("nodate")
            date = "nodate"

        title = bibRecord["title"]

        # forming a record
        recordToAdd = singleItemTemplate
        recordToAdd = recordToAdd.replace("@RELATIVEPATH@", relativePath)
        recordToAdd = recordToAdd.replace("@CITATIONKEY@", citeKey)
        recordToAdd = recordToAdd.replace("@AUTHOROREDITOR@", authorOrEditor)
        recordToAdd = recordToAdd.replace("@DATE@", date)
        recordToAdd = recordToAdd.replace("@TITLE@", title)

        # strip BibTeX braces
        recordToAdd = recordToAdd.replace("{", "").replace("}", "")

        contentsList.append(recordToAdd)

    contents = "\n<ul>\n%s\n</ul>" % "\n".join(sorted(contentsList))
    mainContent = "<h1>CONTENTS of MEMEX</h1>\n\n" + contents

    # save the CONTENTS page
    with open(os.path.join(pathToMemex, "contents.html"), "w", encoding="utf8") as f9:
        f9.write(template.replace("@MAINCONTENT@", mainContent))
示例#4
0
def processAllEntries(pathToMemex):
    """Generate the publication interface for every record in the bibliography.

    For each citation key in ``settings["bib_all"]`` the path of the record's
    own ``<key>.bib`` file is built and handed to
    ``interface.generatePublicationInterface``.

    NOTE(review): ``pathToMemex`` is accepted but the publication path is built
    from the module-level ``memexPath`` - confirm both always agree.
    """
    import os  # local import: this block did not previously depend on os

    bibData = functions.loadBib(settings["bib_all"])  # loads the bib file

    for k in bibData:
        # os.path.join instead of a hard-coded "\\" so the path is also valid
        # on non-Windows systems
        path = os.path.join(functions.generatePublPath(memexPath, k), k + ".bib")
        interface.generatePublicationInterface(k, path)
示例#5
0
def processAllFiles(pathToMemex):
    """Run ``functions.ocrPublication`` for every record of the bibliography.

    The language passed along for each record is whatever ``checkLangId``
    returns for that record and the module-level ``defaultLang``.
    """
    records = functions.loadBib(settings["bib_all"])

    for citeKey in records:
        record = records[citeKey]
        ocrLang = checkLangId(record, defaultLang)
        functions.ocrPublication(pathToMemex, citeKey, ocrLang)
def generateDoclLink(bibTexCode, pageVal, distance):
    """Build one HTML table row that links to a publication or to a page in it.

    Args:
        bibTexCode: citation key; also the name of the publication's .bib file.
        pageVal: last page of the matched page cluster. ``0`` links to the
            publication's DETAILS.html instead of a numbered page.
        distance: number rendered with ``%f`` in the second column.

    Returns:
        A ``<tr>...</tr>`` string with the link, the distance and the
        formatted reference.
    """
    pathToPubl = functions.generatePublPath(settings["path_to_memex"], bibTexCode)
    bib = functions.loadBib(os.path.join(pathToPubl, "%s.bib" % bibTexCode))
    bib = bib[bibTexCode]  # keep only the record of this publication

    # the author takes precedence over the editor when both are present
    author = bib.get("author", bib.get("editor", "N.d."))

    # a record without a year gets a placeholder instead of raising KeyError
    reference = "%s (%s). <i>%s</i>" % (author, bib.get("year", "N.d.")[:4], bib["title"])

    # ASCII-folded copy of the reference for the hidden div; decode back to
    # str, otherwise Python 3 would embed the bytes repr (b'...') in the HTML
    folded = unicodedata.normalize('NFKD', reference).encode('ascii', 'ignore').decode('ascii')
    search = " <div class='hidden'>%s</div>" % folded

    pagesDir = os.path.join(pathToPubl.replace(settings["path_to_memex"], "../../../../"), "pages")

    if pageVal == 0:  # link to the start of the publication
        target = "DETAILS.html"
        page = ""
        startPage = 0
    else:
        # the cluster starts five pages back, but never before page 1
        startPage = max(pageVal - 5, 1)
        endPage = pageVal
        target = "%04d.html" % startPage
        page = ", pdfPp. %d-%d</i></a>" % (startPage, endPage)

    htmlLink = "<a href='%s'><i>read</i></a>" % os.path.join(pagesDir, target)

    publicationInfo = reference + page + search
    publicationInfo = publicationInfo.replace("{", "").replace("}", "")  # strip BibTeX braces
    singleItemTemplate = '<tr><td>%s</td><td>%f</td><td data-order="%s%05d">%s</td></tr>' % (
        htmlLink, distance, bibTexCode, startPage, publicationInfo)

    return singleItemTemplate
示例#7
0
def generateContentsList():
    """Return the publications table: one row per .bib file found in the memex.

    Each row links to the publication's DETAILS.html and carries the formatted
    reference plus a hidden ASCII-folded copy of it. The sorted rows are
    inserted into ``publicationsTemplate`` in place of ``@TABLECONTENTS@``.
    """
    relDic = functions.dicOfRelevantFiles(memexPath, "bib")
    contentsList = []

    for k, v in relDic.items():
        # drop the last character of the key - presumably the dot left over
        # because the extension is given as "bib"; TODO confirm against
        # dicOfRelevantFiles
        k = k[:-1]
        record = functions.loadBib(v)[k]

        # the author takes precedence over the editor when both are present
        authorOrEditor = record.get("author", record.get("editor", "[No data]"))

        publication = "{0} ({1}) <i>{2}</i>".format(authorOrEditor,
                                                    record["date"],
                                                    record["title"])
        # decode back to str, otherwise Python 3 would embed the bytes repr
        # (b'...') in the hidden div
        search = unicodedata.normalize('NFKD', publication).encode(
            'ascii', 'ignore').decode('ascii')
        publication += " <div class=\"hidden\">{0}</div>".format(search)
        contentsList.append(
            "<tr><td><div class=\"ID\"><a href=\"{0}/pages/DETAILS.html\">[{1}]</a></div> {2}</td></tr>"
            .format(os.path.join(k[0], k[:2], k), k, publication))

    tableContents = "".join(sorted(contentsList))
    mainElement = publicationsTemplate.replace("@TABLECONTENTS@", tableContents)
    return mainElement
def formatPublList(pathToMemex):
    """Return the publications table for every publication that has OCR results.

    Only keys present both among the ``settings["ocr_results"]`` files and
    among the ``.bib`` files are listed. The sorted rows are inserted into
    ``publicationsTemplate`` in place of ``@TABLECONTENTS@``.
    """
    ocrFiles = functions.dicOfRelevantFiles(pathToMemex, settings["ocr_results"])
    bibFiles = functions.dicOfRelevantFiles(pathToMemex, ".bib")

    contentsList = []

    for key in ocrFiles:
        if key not in bibFiles:
            continue

        bibRecord = functions.loadBib(bibFiles[key])[key]

        relativePath = functions.generatePublPath(pathToMemex, key).replace(pathToMemex, "")

        # the author takes precedence over the editor when both are present
        authorOrEditor = bibRecord.get("author", bibRecord.get("editor", "[No data]"))

        # a record without a year gets a placeholder instead of raising KeyError
        date = bibRecord["year"][:4] if "year" in bibRecord else "nodate"
        title = bibRecord["title"]

        # formatting template
        citeKey = '<div class="ID">[%s]</div>' % key
        publication = '%s (%s) <i>%s</i>' % (authorOrEditor, date, title)
        # ASCII-folded copy for the hidden div; decode back to str, otherwise
        # Python 3 would embed the bytes repr (b'...') in the HTML
        search = unicodedata.normalize('NFKD', publication).encode(
            'ascii', 'ignore').decode('ascii')
        publication += " <div class='hidden'>%s</div>" % search
        link = '<a href="%s/pages/DETAILS.html"><i>read</i></a>' % relativePath

        singleItemTemplate = '<tr><td>%s</td><td>%s %s</td></tr>' % (
            link, citeKey, publication)
        # strip BibTeX braces
        contentsList.append(singleItemTemplate.replace("{", "").replace("}", ""))

    contents = "\n".join(sorted(contentsList))
    final = publicationsTemplate.replace("@TABLECONTENTS@", contents)
    return final
示例#9
0
def generateContentPage():
    """Work in progress: fill the index template once per page-link entry.

    NOTE(review): this function reads ``pNums``, ``pathToBibFile``,
    ``citeKey`` and ``pageDic`` without defining them or receiving them as
    parameters, and the filled template is neither written nor returned.

    Planned design (translated from the original notes):
      - open the template
      - placeholders: @PATHTOPUBL@, [@CITEKEY@], @AUTHOR@ (@DATE@) @TITLE@,
        plus a link to the DETAILS.html of the text
      - keep these values in a dictionary
      - loop over one citekey/text at a time
      - input: citekey, path to the file, etc. -> output: list
    """
    pageLinks = functions.generatePageLinks(pNums)

    # the index template (template_index.html) is the one needed here,
    # not the per-page template
    with open(settings["template_index"], "r", encoding="utf8") as templateFile:
        template = templateFile.read()

    # load individual bib record
    bibFile = pathToBibFile
    bibDic = functions.loadBib(bibFile)
    bibForHTML = functions.prettifyBib(bibDic[citeKey]["complete"])

    for pageKey in list(pageLinks.keys()):
        links = pageDic[pageKey]

        pageTemp = template.replace("@PAGELINKS@", links)
        pageTemp = pageTemp.replace("@PATHTOFILE@", "")
        pageTemp = pageTemp.replace("@CITATIONKEY@", citeKey)
示例#10
0
def generateContents():
    """Work in progress: build the contents page from the index template.

    Intended result: a list of publications with links, joined with the index
    page template and saved as contents.html. Each entry should look like
    ``<li><a href="@PATHTOPUBL@/pages/DETAILS.html">[@CITEKEY@]</a> @AUTHOR@
    (@DATE@) - <i>@TITLE@</i></li>``, with the values taken from the bib file.

    NOTE(review): the original signature was the placeholder ``(...)``, which
    is a SyntaxError; it is replaced by an empty parameter list. The body
    still reads ``pNums``, ``pathToBibFile``, ``citeKey`` and ``pageDic``
    without defining them, and the filled template is neither written nor
    returned.

    Planned design (translated from the original notes):
      - open the template
      - placeholders: @PATHTOPUBL@, [@CITEKEY@], @AUTHOR@ (@DATE@) @TITLE@,
        plus a link to the DETAILS.html of the text
      - keep these values in a dictionary
      - loop over one citekey/text at a time
      - input: citekey, path to the file, etc. -> output: list
    """
    detailfileDic = functions.generatePageLinks(pNums)

    # the index template (template_index.html) is the one needed here,
    # not the per-page template
    with open(settings["template_index"], "r", encoding="utf8") as ft:
        template = ft.read()

    # load individual bib record
    bibFile = pathToBibFile
    bibDic = functions.loadBib(bibFile)
    bibForHTML = functions.prettifyBib(bibDic[citeKey]["complete"])

    orderedPages = list(detailfileDic.keys())

    for k in orderedPages:
        v = pageDic[k]

        pageTemp = template
        pageTemp = pageTemp.replace("@PAGELINKS@", v)
        pageTemp = pageTemp.replace("@PATHTOFILE@", "")
        pageTemp = pageTemp.replace("@CITATIONKEY@", citeKey)
示例#11
0
def generateReferenceSimple(bibTexCode):
    """Return a short HTML reference: ``Author (YYYY). <i>Title</i>``.

    The record is read from the publication's own ``<bibTexCode>.bib`` file.
    The author is used when present, otherwise the editor, otherwise
    ``N.d.``. BibTeX braces are removed from the result.
    """
    pathToPubl = functions.generatePublPath(settings["path_to_memex"], bibTexCode)
    bib = functions.loadBib(os.path.join(pathToPubl, "%s.bib" % bibTexCode))
    bib = bib[bibTexCode]

    author = bib.get("author", bib.get("editor", "N.d."))
    # a record without a date gets a placeholder instead of raising KeyError
    year = bib.get("date", "N.d.")[:4]

    reference = "%s (%s). <i>%s</i>" % (author, year, bib["title"])
    reference = reference.replace("{", "").replace("}", "")
    return reference
def generateReferenceSimple(bibTexCode):
    """Return a short HTML reference: ``Author (YYYY). <i>Title</i>``.

    The record is read from the publication's own ``<bibTexCode>.bib`` file.
    The author is used when present, otherwise the editor, otherwise
    ``N.d.``. BibTeX braces are removed from the result.
    """
    pathToPubl = functions.generatePublPath(settings["path_to_memex"], bibTexCode)
    bib = functions.loadBib(os.path.join(pathToPubl, "%s.bib" % bibTexCode))
    bib = bib[bibTexCode]  # keep only the record of this publication

    author = bib.get("author", bib.get("editor", "N.d."))
    # a record without a year gets a placeholder instead of raising KeyError
    year = bib.get("year", "N.d.")[:4]

    reference = "%s (%s). <i>%s</i>" % (author, year, bib["title"])
    reference = reference.replace("{", "").replace("}", "")
    return reference
示例#13
0
def processAllRecords(bibDataFile):
    """Process and OCR every record of ``bibDataFile`` in random order.

    Each record goes through ``functions.processBibRecord`` and then
    ``ocrPublication``, with the language returned by
    ``functions.identifyLanguage`` (fallback argument ``"eng"``). Afterwards
    ``functions.memexStatusUpdates`` is called for four file extensions.
    """
    records = functions.loadBib(bibDataFile)
    shuffledKeys = list(records.keys())
    random.shuffle(shuffledKeys)

    for citeKey in shuffledKeys:
        record = records[citeKey]
        functions.processBibRecord(settings["path_to_memex"], record)
        ocrLang = functions.identifyLanguage(record["rCite"], "eng")
        ocrPublication(record["rCite"], ocrLang)

    for extension in (".pdf", ".bib", ".png", ".json"):
        functions.memexStatusUpdates(settings["path_to_memex"], extension)
示例#14
0
def createIndex(pathToMemex):
    """Render the publications/searches page and save it as ``index.html``.

    One row is produced per record of the ``settings["bib_all"]`` bibliography;
    absent ``author``/``date``/``title`` fields are shown as ``MISSING``. The
    file is written relative to the current working directory.
    """
    records = functions.loadBib(settings["bib_all"])

    with open(settings["template_index"], "r", encoding="utf8") as templateFile:
        pageHtml = templateFile.read()

    fieldPlaceholders = (("@AUTHOR@", "author"), ("@DATE@", "date"), ("@TITLE@", "title"))

    rows = []
    for citeKey, record in records.items():
        publPath = functions.generatePublPath(memexPath, citeKey)

        # every value sits in its own table cell
        row = ("<tr><td><li><a href=" + "@PATHTOPUBL@/pages/DETAILS.html>" +
               "[@CITEKEY@]</a></td><td> @AUTHOR@</td> <td>(@DATE@)</td> - <td><i>@TITLE@</i></td></li></tr>")
        row = row.replace("@PATHTOPUBL@", publPath)
        row = row.replace("@CITEKEY@", citeKey)
        for placeholder, field in fieldPlaceholders:
            value = record[field] if field in record else "MISSING"
            row = row.replace(placeholder, value)

        rows.append(row)

    rows.sort()
    listing = "\n<ul>\n%s\n</ul>" % "\n".join(rows)
    # strip BibTeX braces from the whole listing
    listing = listing.replace("{", "").replace("}", "")

    # table of contents of the saved searches
    searchesToc = formatSearches(pathToMemex)
    pageHtml = pageHtml.replace("@SEARCHES@", searchesToc)

    # table of publications
    pageHtml = pageHtml.replace("@PUBLICATIONS@",
                                publTemplate.replace("@TABLECONTENTS@", listing))

    with open("index.html", "w", encoding="utf8") as outFile:
        outFile.write(pageHtml)
示例#15
0
def processAllRecords(bibDataFile):
    """Process and OCR every record of ``bibDataFile`` in random order.

    The shuffled list of citation keys is printed first. Each record then goes
    through ``functions.processBibRecord`` and ``ocrPublication``, with the
    language returned by ``functions.identifyLanguage`` (fallback argument
    ``"eng"``).
    """
    # load the bib file as a dictionary
    bibData = functions.loadBib(bibDataFile)
    # shuffle the citation keys so records are handled in random order
    keys = list(bibData.keys())
    random.shuffle(keys)
    # (a bare `print` expression stood here; it is a no-op in Python 3 and
    # has been dropped - the visible output is unchanged)
    print(str(keys))

    for key in keys:
        bibRecord = bibData[key]
        functions.processBibRecord(settings["path_to_memex"], bibRecord)
        language = functions.identifyLanguage(bibRecord, "eng")
        ocrPublication(bibRecord["rCite"], language)
def generateContentPage(citeKey, pathToBibFile):
    """Load the index template and the prettified bib record of ``citeKey``.

    Prints a separator line, the citation key and the bib path first.
    NOTE(review): nothing is returned or written yet; the loaded values stay
    in local variables only.
    """
    for line in ("=" * 80, citeKey, pathToBibFile):
        print(line)

    # load page template
    with open(settings["template_index"], "r", encoding="utf8") as templateFile:
        template = templateFile.read()

    # load individual bib record
    bibDic = functions.loadBib(pathToBibFile)
    bibForHTML = bibText.prettifyBib(bibDic[citeKey]["complete"])

    pageTemp = template
示例#17
0
def formatPublList(pathToMemex):
    """Return the publications table for every publication that has OCR results.

    Only keys present both among the ``settings["ocr_results"]`` files and
    among the ``.bib`` files are listed. The sorted rows are inserted into
    ``publicationsTemplate`` in place of ``@TABLECONTENTS@``.
    """
    ocrFiles = functions.dicOfRelevantFiles(pathToMemex,
                                            settings["ocr_results"])
    bibFiles = functions.dicOfRelevantFiles(pathToMemex, ".bib")

    contentsList = []

    for key in ocrFiles:
        if key not in bibFiles:
            continue

        bibRecord = functions.loadBib(bibFiles[key])[key]

        relativePath = functions.generatePublPath(pathToMemex,
                                                  key).replace(pathToMemex, "")

        # the author takes precedence over the editor when both are present
        authorOrEditor = bibRecord.get("author",
                                       bibRecord.get("editor", "[No data]"))
        date = bibRecord.get("year", "nodate")
        title = bibRecord["title"]

        # formatting template
        citeKey = '<div class="ID">[%s]</div>' % key
        publication = '%s (%s) <i>%s</i>' % (authorOrEditor, date, title)
        # ASCII-folded copy for the hidden div; decode back to str, otherwise
        # Python 3 would embed the bytes repr (b'...') in the HTML
        search = unicodedata.normalize('NFKD', publication).encode(
            'ascii', 'ignore').decode('ascii')
        publication += " <div class='hidden'>%s</div>" % search
        link = '<a href="%s/pages/DETAILS.html"><i>read</i></a>' % relativePath

        singleItemTemplate = '<tr><td>%s</td><td>%s %s</td></tr>' % (
            link, citeKey, publication)
        # strip BibTeX braces
        contentsList.append(
            singleItemTemplate.replace("{", "").replace("}", ""))

    contents = "\n".join(sorted(contentsList))
    final = publicationsTemplate.replace("@TABLECONTENTS@", contents)

    return final
def processAllFiles(pathToMemex):
    """OCR every record of the bibliography with its mapped OCR language.

    The ``language`` field of each record is looked up in
    ``./_bib/language_keys.yml``; the mapped value is passed to
    ``ocrPublication``. Records without a ``language`` field, or with a
    language missing from the YAML file, fall back to ``"eng"``.
    """
    languageFile = "./_bib/language_keys.yml"

    bibData = functions.loadBib(settings["bib_all"])  # loads the bib file

    # NOTE(review): yaml.FullLoader is kept as in the original; this is a
    # project-local file - switch to yaml.safe_load if it can ever come from
    # an untrusted source.
    with open(languageFile, encoding="utf8") as fl:
        # an empty YAML file loads as None; treat that as "no mappings"
        languages = yaml.load(fl, Loader=yaml.FullLoader) or {}

    for k, v in bibData.items():
        recordLanguage = v.get("language")
        if recordLanguage is None:
            tempLang = "eng"  # default
            print(tempLang)
        elif recordLanguage in languages:
            # take the proper OCR abbreviation for the language
            tempLang = languages[recordLanguage]
        else:
            # the old warning concatenated a str with the dict, raised
            # TypeError and was silently swallowed by a bare except
            print("%s is not in the %s file, please add. Will try with english as default"
                  % (recordLanguage, languageFile))
            tempLang = "eng"  # default
        # pass the language chosen for this record, not the whole mapping
        ocrPublication(pathToMemex, k, tempLang)
示例#19
0
def generateDoclLink(bibTexCode, pageVal, distance):
    """Build one HTML table row that links to a publication or to a page in it.

    Args:
        bibTexCode: citation key; also the name of the publication's .bib file.
        pageVal: last page of the matched page cluster. ``0`` links to the
            publication's DETAILS.html instead of a numbered page.
        distance: number rendered with ``{:f}`` in the second column.

    Returns:
        A ``<tr>...</tr>`` string with the linked reference and the distance.
    """
    pathToPubl = functions.generatePublPath(memexPath, bibTexCode)
    bib = functions.loadBib(os.path.join(pathToPubl, "%s.bib" % bibTexCode))
    bib = bib[bibTexCode]

    # the author takes precedence over the editor when both are present
    author = bib.get("author", bib.get("editor", "N.d."))

    # a record without a date gets a placeholder instead of raising KeyError
    reference = "%s (%s). <i>%s</i>" % (author, bib.get("date", "N.d.")[:4], bib["title"])

    # ASCII-folded copy of the reference for the hidden div; decode back to
    # str, otherwise Python 3 would embed the bytes repr (b'...') in the HTML
    folded = unicodedata.normalize('NFKD', reference).encode('ascii', 'ignore').decode('ascii')
    search = " <div class='hidden'>%s</div>" % folded

    pagesDir = os.path.join(pathToPubl.replace(memexPath, "../../../../"), "pages")

    if pageVal == 0:  # link to the start of the publication
        htmlLink = os.path.join(pagesDir, "DETAILS.html")
        htmlLink = "<a href='{0}'>[{1}]</a>".format(htmlLink, bibTexCode)
        page = ""
        startPage = 0
    else:
        # the cluster starts five pages back, but never before page 1
        startPage = max(pageVal - 5, 1)
        endPage = pageVal

        # page numbers shown to the reader come from checkPageNumbers; the
        # link itself still uses the raw startPage
        realStartPage = checkPageNumbers(bib, bibTexCode, startPage)
        realEndPage = checkPageNumbers(bib, bibTexCode, endPage)

        htmlLink = os.path.join(pagesDir, "%04d.html" % startPage)
        htmlLink = "<a href='{0}'>[{1},{2}]</a>".format(
            htmlLink, bibTexCode, realStartPage)
        page = ", pp. {0}-{1}</i></a>".format(realStartPage, realEndPage)

    publicationInfo = reference + page + search
    publicationInfo = publicationInfo.replace("{", "").replace("}", "")  # strip BibTeX braces
    singleItemTemplate = '<tr><td data-order="{1}{2:05d}"><div class="ID">{3}</div> {4}</td><td>{0:f}</td></tr>'.format(
        distance, bibTexCode, startPage, htmlLink, publicationInfo)

    return singleItemTemplate
示例#20
0
def processAllRecordsSTR(pathToMemex):
    """OCR every publication found under ``pathToMemex``, skipping long ones.

    Publications are handled in random order. A record whose ``pagetotal``
    exceeds ``settings["page_limit"]`` is skipped; records without a usable
    ``pagetotal`` are always processed. Afterwards
    ``functions.memexStatusUpdates`` is called for four file extensions.
    """
    files = functions.dicOfRelevantFiles(pathToMemex, ".bib")
    citeKeys = list(files.keys())
    random.shuffle(citeKeys)

    for citeKey in citeKeys:
        print(citeKey)
        # loadBib returns a dict keyed by citation key, so "pagetotal" must be
        # looked up in the record, not in that outer dict (where the check was
        # always False and the limit never applied)
        bibRecord = functions.loadBib(files[citeKey])[citeKey]

        if "pagetotal" in bibRecord:
            try:
                pageTotal = int(bibRecord["pagetotal"])
            except ValueError:
                pageTotal = None  # non-numeric value: treat as unknown
            if pageTotal is not None and pageTotal > int(settings["page_limit"]):
                continue

        language = functions.identifyLanguage(bibRecord, "eng")
        ocrPublication(citeKey, language, settings["page_limit"])

    for extension in (".pdf", ".bib", ".png", ".json"):
        functions.memexStatusUpdates(settings["path_to_memex"], extension)
def generateReferenceSimple(bibTexCode):
    """Return a short HTML reference: ``Author (date). <i>Title</i>``.

    The record is read from the publication's own ``<bibTexCode>.bib`` file.
    The author is used when present, otherwise the editor; a missing
    author/editor or date is rendered as ``N.d.``. BibTeX braces are removed.
    """
    publDir = functions.generatePublPath(settings["path_to_memex"], bibTexCode)
    bibPath = os.path.join(publDir, "%s.bib" % bibTexCode)
    record = functions.loadBib(bibPath)[bibTexCode]

    if "author" in record:
        creator = record["author"]
    elif "editor" in record:
        creator = record["editor"]
    else:
        creator = "N.d."

    published = record["date"] if "date" in record else "N.d."

    formatted = "%s (%s). <i>%s</i>" % (creator, published, record["title"])
    for brace in ("{", "}"):
        formatted = formatted.replace(brace, "")
    return formatted
示例#22
0
def generateContentsPage():
    """Write ``contents.html`` with one link per .bib file found in the memex.

    The sorted links are joined with ``</li><li>`` and inserted into the
    ``settings["template_contents"]`` template in place of ``@MAINCONTENT@``;
    the template is expected to supply the surrounding list markup.
    """
    # load contents template
    with open(settings["template_contents"], "r", encoding="utf8") as ft:
        template = ft.read()

    relDic = functions.dicOfRelevantFiles(memexPath, "bib")
    linkList = []

    for k, v in relDic.items():
        # drop the last character of the key - presumably the dot left over
        # because the extension is given as "bib"; TODO confirm against
        # dicOfRelevantFiles
        k = k[:-1]
        record = functions.loadBib(v)[k]

        # fall back to the editor (then a placeholder) so that records
        # without an author no longer abort the page with a KeyError
        authorOrEditor = record.get("author", record.get("editor", "[No data]"))

        linkList.append(
            "<a href=\"{0}/pages/DETAILS.html\">[{1}]</a> {2} ({3}) - <i>{4}</i>"
            .format(os.path.join(k[0], k[:2], k), k, authorOrEditor,
                    record["date"], record["title"]))

    # sorted links, separated by the list-item boundary
    mainContent = "</li><li>".join(sorted(linkList))
    pageTemp = template.replace("@MAINCONTENT@", mainContent)

    # create the file contents.html in the memex root
    with open(os.path.join(memexPath, "contents.html"), "w", encoding="utf8") as f2:
        f2.write(pageTemp)
示例#23
0
def processAllRecords(bibDataFile):
    """Process and OCR every record of ``bibDataFile`` in random order.

    Each record goes through ``functions.processBibRecord`` and then
    ``ocrPublication``, which receives the record's ``rCite`` value, the
    language returned by ``functions.identifyLanguage`` (fallback argument
    ``"eng"``) and ``settings["page_limit"]`` as an int. Afterwards
    ``functions.memexStatusUpdates`` is called for four file extensions.
    """
    records = functions.loadBib(bibDataFile)
    shuffledKeys = list(records.keys())
    random.shuffle(shuffledKeys)  # process the records in random order

    for citeKey in shuffledKeys:
        record = records[citeKey]
        functions.processBibRecord(settings["path_to_memex"], record)
        ocrLang = functions.identifyLanguage(record["rCite"], "eng")
        ocrPublication(record["rCite"], ocrLang, int(settings["page_limit"]))

    for extension in (".pdf", ".bib", ".png", ".json"):
        functions.memexStatusUpdates(settings["path_to_memex"], extension)
def generatePublicationInterface(citeKey, pathToBibFile):
    """Write the HTML pages of one publication: DETAILS.html plus one per OCRed page.

    Args:
        citeKey: citation key of the publication; used to pick its record
            from the bib file and substituted for @CITATIONKEY@.
        pathToBibFile: path to the publication's ``.bib`` file. The OCR
            results are read from the same path with ``.json`` instead of
            ``.bib``; the pages are written to the ``pages`` folder located
            where the bib file is.

    The page names are the keys of the dictionary returned by
    ``functions.generatePageLinks``. The page named "DETAILS" shows the
    bibliographic record, every other page shows its PNG image and OCR text.
    """
    print("=" * 80)
    print(citeKey)

    # OCR results: page name -> recognised text. The file is closed as soon
    # as it is parsed instead of staying open while all pages are written.
    jsonFile = pathToBibFile.replace(".bib", ".json")
    with open(jsonFile, encoding="utf8") as jsonData:
        ocred = json.load(jsonData)

    # page name -> HTML substituted for @PAGELINKS@ on that page
    pageDic = functions.generatePageLinks(ocred.keys())

    # load page template
    with open(settings["template_page"], "r", encoding="utf8") as ft:
        template = ft.read()

    # load individual bib record
    bibDic = functions.loadBib(pathToBibFile)
    bibForHTML = functions.prettifyBib(bibDic[citeKey]["complete"])

    orderedPages = list(pageDic.keys())
    lastIndex = len(orderedPages) - 1
    pagesFolder = os.path.join(
        pathToBibFile.replace(citeKey + ".bib", ""), "pages")

    for o, k in enumerate(orderedPages):
        pageTemp = template
        pageTemp = pageTemp.replace("@PAGELINKS@", pageDic[k])
        pageTemp = pageTemp.replace("@PATHTOFILE@", "")
        pageTemp = pageTemp.replace("@CITATIONKEY@", citeKey)

        if k != "DETAILS":
            # regular page: scanned image plus its OCR text
            mainElement = '<img src="@PAGEFILE@" width="100%" alt="">'.replace(
                "@PAGEFILE@", "%s.png" % k)
            pageTemp = pageTemp.replace("@MAINELEMENT@", mainElement)
            pageTemp = pageTemp.replace("@OCREDCONTENT@",
                                        ocred[k].replace("\n", "<br>"))
        else:
            # details page: bib record plus word cloud, no OCR text
            mainElement = bibForHTML.replace("\n", "<br> ")
            mainElement = '<div class="bib">%s</div>' % mainElement
            mainElement += '\n<img src="wordcloud.jpg" width="100%" alt="wordcloud">'
            pageTemp = pageTemp.replace("@MAINELEMENT@", mainElement)
            pageTemp = pageTemp.replace("@OCREDCONTENT@", "")

        # @NEXTPAGEHTML@ and @PREVIOUSPAGEHTML@
        # The fixed targets 0001/0002 are only linked when that page is
        # actually generated; otherwise a publication with one OCRed page
        # (or none) would get a dead "next" link.
        if k == "DETAILS":
            nextPage = "0001.html" if "0001" in pageDic else ""
            prevPage = ""
        elif k == "0001":
            nextPage = "0002.html" if "0002" in pageDic else ""
            prevPage = "DETAILS.html"
        elif o == lastIndex:
            nextPage = ""
            prevPage = orderedPages[o - 1] + ".html"
        else:
            nextPage = orderedPages[o + 1] + ".html"
            prevPage = orderedPages[o - 1] + ".html"

        pageTemp = pageTemp.replace("@NEXTPAGEHTML@", nextPage)
        pageTemp = pageTemp.replace("@PREVIOUSPAGEHTML@", prevPage)

        pagePath = os.path.join(pagesFolder, "%s.html" % k)
        with open(pagePath, "w", encoding="utf8") as f9:
            f9.write(pageTemp)
示例#25
0
# PROCESS ALL RECORDS: APPROACH 2 #########################
###########################################################

# Why this way? Our computers are now quite powerful; they
# often have multiple cores and we can take advantage of this;
# if we process our data in the manner coded below --- we shuffle
# our publications and process them in random order --- we can
# run multiple instances of the same script and data will
# be produced in parallel. You can run as many instances as
# your machine allows (you need to check how many cores
# your machine has). Even running two scripts will cut
# processing time roughly in half.

def processAllRecords(bibData):
    """Set up and OCR every publication in *bibData*, in random order.

    Args:
        bibData: mapping of citation key -> bib record; each record must
            provide an "rCite" entry.

    The keys are shuffled so that several instances of the script can run
    side by side, each starting with a different publication.
    """
    order = list(bibData.keys())
    random.shuffle(order)

    for record in (bibData[citeKey] for citeKey in order):
        # 1. create folders, copy files (per the original author's note on
        #    functions.processBibRecord)
        functions.processBibRecord(memexPath, record)

        # 2. OCR the file
        # NOTE(review): "eng" looks like the fallback language -- confirm
        # against identifyLanguage.
        lang = identifyLanguage(record, "eng")
        ocrPublication(memexPath, record["rCite"], lang)


bibData = functions.loadBib(settings["bib_all"])  # load the bibliography at settings["bib_all"] via loadBib from functions.py
processAllRecords(bibData)  # run processAllRecords over every record of that bibliography
示例#26
0
import functions
import yaml

# Script setup for the language count: load the project settings and the
# complete bibliography.

settingsFile = "./settings.yml"
# yaml.load() without an explicit Loader raises TypeError on PyYAML >= 6 and
# can construct arbitrary objects on older versions; safe_load parses plain
# settings. The with-block also closes the file handle that a bare open()
# would leak.
with open(settingsFile, encoding="utf8") as settingsStream:
    settings = yaml.safe_load(settingsStream)

bibData = functions.loadBib(settings["bib_all"])


def getLang(bibData):
    """Count the bibliography records per language and return a sorted report.

    Args:
        bibData: mapping of citation key -> record dict; every record must
            contain a "langid" entry.

    Returns:
        A string with one line per language, formatted as the count
        zero-padded to 10 digits, a TAB, then the langid. Lines are sorted
        in descending order (highest count first; equal counts in reverse
        alphabetical order of langid) and joined with newlines. An empty
        bibData yields an empty string.

    Raises:
        KeyError: if a record has no "langid" entry.
    """
    counts = {}
    for record in bibData.values():
        langid = record["langid"]
        counts[langid] = counts.get(langid, 0) + 1

    # Zero-padding makes the plain string sort agree with the numeric order.
    lines = ["%010d\t%s" % (count, langid) for langid, count in counts.items()]
    lines.sort(reverse=True)

    # The report used to be built and then dropped (the function returned
    # None); hand it back so the caller can print or save it.
    return "\n".join(lines)
def generatePublicationInterface(citeKey, pathToBibFile):
    """Write the HTML pages of one publication: DETAILS.html plus one per OCRed page.

    Args:
        citeKey: citation key of the publication; used to pick its record
            from the bib file and substituted for @CITATIONKEY@.
        pathToBibFile: path to the publication's ``.bib`` file. The OCR
            results are read from the same path with ``.json`` instead of
            ``.bib``; the pages are written to the ``pages`` folder located
            where the bib file is.

    Regular pages show the page image, the connected-texts table for the
    page's cluster key and the OCR text. The page named "DETAILS" shows a
    reference heading, the BibTeX record, the word cloud and the
    publication-level connected-texts table.

    Uses names not defined in this function: settings, functions, bibText,
    roundUp, pageConnData, publConnData, connectionsTemplate, ocrTemplate,
    generalTemplate and generateReferenceSimple.
    """
    print("=" * 80)
    print(citeKey)

    # OCR results: page name -> recognised text. The file is closed as soon
    # as it is parsed instead of staying open while all pages are written.
    jsonFile = pathToBibFile.replace(".bib", ".json")
    with open(jsonFile, encoding="utf8") as jsonData:
        ocred = json.load(jsonData)

    # page name -> HTML substituted for @PAGELINKS@ on that page
    pageDic = functions.generatePageLinks(ocred.keys())

    # load page template
    with open(settings["template_page"], "r", encoding="utf8") as ft:
        template = ft.read()

    # load individual bib record
    bibDic = functions.loadBib(pathToBibFile)
    bibForHTML = bibText.prettifyBib(bibDic[citeKey]["complete"])

    orderedPages = list(pageDic.keys())
    lastIndex = len(orderedPages) - 1
    pagesFolder = os.path.join(pathToBibFile.replace(citeKey + ".bib", ""), "pages")

    # table row used when no similarity data is available
    emptyResults = '<tr><td><i>%s</i></td><td><i>%s</i></td><td><i>%s</i></td></tr>'

    for o, k in enumerate(orderedPages):
        pageTemp = template
        pageTemp = pageTemp.replace("@PAGELINKS@", pageDic[k])
        pageTemp = pageTemp.replace("@PATHTOFILE@", "")
        pageTemp = pageTemp.replace("@CITATIONKEY@", citeKey)

        if k != "DETAILS":
            # regular page: start with the image of the page
            mainElement = '<img src="@PAGEFILE@" width="100%" alt="">'.replace("@PAGEFILE@", "%s.png" % k)

            # NOTE(review): roundUp(int(k), 5) presumably maps the page number
            # to the 5-page cluster used as key in pageConnData -- confirm.
            pageKey = citeKey + "_%05d" % roundUp(int(k), 5)
            if pageKey in pageConnData:
                formattedResults = "\n".join(pageConnData[pageKey])
            else:
                formattedResults = emptyResults % ("no data", "no data", "no data")

            mainElement += connectionsTemplate.replace("@CONNECTEDTEXTSTEMP@", formattedResults)
            mainElement += ocrTemplate.replace("@OCREDCONTENTTEMP@", ocred[k].replace("\n", "<br>"))
            pageTemp = pageTemp.replace("@MAINELEMENT@", mainElement)
        else:
            # details page: reference heading first
            reference = generateReferenceSimple(citeKey)
            mainElement = "<h3>%s</h3>\n\n" % reference

            # the BibTeX record, wrapped in the general element template
            bibElement = '<div class="bib">%s</div>' % bibForHTML.replace("\n", "<br> ")
            bibElement = generalTemplate.replace("@ELEMENTCONTENT@", bibElement)
            bibElement = bibElement.replace("@ELEMENTHEADER@", "BibTeX Bibliographical Record")
            mainElement += bibElement + "\n\n"

            # the word cloud image stored one folder above the pages
            wordCloud = '\n<img src="../' + citeKey + '_wCloud.jpg" width="100%" alt="wordcloud">'
            wordCloud = generalTemplate.replace("@ELEMENTCONTENT@", wordCloud)
            wordCloud = wordCloud.replace("@ELEMENTHEADER@", "WordCloud of Keywords (<i>tf-idf</i>)")
            mainElement += wordCloud + "\n\n"

            # publication-level connections
            if citeKey in publConnData:
                formattedResults = "\n".join(publConnData[citeKey])
            else:
                formattedResults = emptyResults % ("no data", "no data", "no data")

            mainElement += connectionsTemplate.replace("@CONNECTEDTEXTSTEMP@", formattedResults)

            pageTemp = pageTemp.replace("@MAINELEMENT@", mainElement)

        # @NEXTPAGEHTML@ and @PREVIOUSPAGEHTML@
        # The fixed targets 0001/0002 are only linked when that page is
        # actually generated; otherwise a publication with one OCRed page
        # (or none) would get a dead "next" link.
        if k == "DETAILS":
            nextPage = "0001.html" if "0001" in pageDic else ""
            prevPage = ""
        elif k == "0001":
            nextPage = "0002.html" if "0002" in pageDic else ""
            prevPage = "DETAILS.html"
        elif o == lastIndex:
            nextPage = ""
            prevPage = orderedPages[o - 1] + ".html"
        else:
            nextPage = orderedPages[o + 1] + ".html"
            prevPage = orderedPages[o - 1] + ".html"

        pageTemp = pageTemp.replace("@NEXTPAGEHTML@", nextPage)
        pageTemp = pageTemp.replace("@PREVIOUSPAGEHTML@", prevPage)

        pagePath = os.path.join(pagesFolder, "%s.html" % k)
        with open(pagePath, "w", encoding="utf8") as f9:
            f9.write(pageTemp)
示例#28
0
def generatePublicationInterface(citeKey, pathToBibFile):
    """Write the HTML pages of one publication: DETAILS.html plus one per OCRed page.

    Args:
        citeKey: citation key of the publication; used to pick its record
            from the bib file and substituted for @CITATIONKEY@.
        pathToBibFile: path to the publication's ``.bib`` file. The OCR
            results are read from the same path with ``.json`` instead of
            ``.bib``; the pages are written to the ``pages`` folder located
            where the bib file is.

    The page names are the keys of the dictionary returned by
    ``functions.generatePageLinks``. The page named "DETAILS" shows the
    bibliographic record, every other page shows its PNG image and OCR text.
    """
    print("=" * 80)
    print(citeKey)

    # OCR results: page name -> recognised text. Read with an explicit
    # encoding and close the file before the pages are generated.
    jsonFile = pathToBibFile.replace(".bib", ".json")
    with open(jsonFile, encoding="utf8") as jsonData:
        ocred = json.load(jsonData)

    # page name -> navigation HTML substituted for @PAGELINKS@
    pageDic = functions.generatePageLinks(ocred.keys())

    # load page template
    with open(settings["template_page"], "r", encoding="utf8") as ft:
        template = ft.read()

    # load individual bib record and format it for the details view
    bibDic = functions.loadBib(pathToBibFile)
    bibForHTML = functions.prettifyBib(bibDic[citeKey]["complete"])

    orderedPages = list(pageDic.keys())
    lastIndex = len(orderedPages) - 1
    pagesFolder = os.path.join(
        pathToBibFile.replace(citeKey + ".bib", ""), "pages")

    # one iteration per page file to create
    for o, k in enumerate(orderedPages):
        pageTemp = template
        pageTemp = pageTemp.replace("@PAGELINKS@", pageDic[k])
        pageTemp = pageTemp.replace("@PATHTOFILE@", "")
        pageTemp = pageTemp.replace("@CITATIONKEY@", citeKey)

        if k != "DETAILS":
            # regular page: scanned image plus its OCR text
            mainElement = '<img src="@PAGEFILE@" width="100%" alt="">'.replace(
                "@PAGEFILE@", "%s.png" % k)
            pageTemp = pageTemp.replace("@MAINELEMENT@", mainElement)
            pageTemp = pageTemp.replace("@OCREDCONTENT@",
                                        ocred[k].replace("\n", "<br>"))
        else:
            # details page: bib record (class "bib" for the style sheet)
            # followed by the word cloud image; no OCR text
            mainElement = bibForHTML.replace("\n", "<br> ")
            mainElement = '<div class="bib">%s</div>' % mainElement
            mainElement += '\n<img src="wordcloud.jpg" width="100%" alt="wordcloud">'
            pageTemp = pageTemp.replace("@MAINELEMENT@", mainElement)
            pageTemp = pageTemp.replace("@OCREDCONTENT@", "")

        # @NEXTPAGEHTML@ and @PREVIOUSPAGEHTML@
        # The fixed targets 0001/0002 are only linked when that page is
        # actually generated; otherwise a publication with one OCRed page
        # (or none) would get a dead "next" link.
        if k == "DETAILS":
            nextPage = "0001.html" if "0001" in pageDic else ""
            prevPage = ""
        elif k == "0001":
            nextPage = "0002.html" if "0002" in pageDic else ""
            prevPage = "DETAILS.html"
        elif o == lastIndex:
            nextPage = ""
            prevPage = orderedPages[o - 1] + ".html"
        else:
            nextPage = orderedPages[o + 1] + ".html"
            prevPage = orderedPages[o - 1] + ".html"

        pageTemp = pageTemp.replace("@NEXTPAGEHTML@", nextPage)
        pageTemp = pageTemp.replace("@PREVIOUSPAGEHTML@", prevPage)

        # save the finished page
        pagePath = os.path.join(pagesFolder, "%s.html" % k)
        with open(pagePath, "w", encoding="utf8") as f9:
            f9.write(pageTemp)
示例#29
0
def generatePublicationInterface(citeKey, pathToBibFile):
    """Write the HTML pages of one publication: DETAILS.html plus one per OCRed page.

    Args:
        citeKey: citation key of the publication; used to pick its record
            from the bib file and substituted for @CITATIONKEY@.
        pathToBibFile: path to the publication's ``.bib`` file. The OCR
            results are read from the same path with ``.json`` instead of
            ``.bib``; the pages are written to the ``pages`` folder located
            where the bib file is.

    Regular pages show the page image, the connected-texts table for the
    page's cluster key and the OCR text. The page named "DETAILS" shows a
    reference heading, the BibTeX record, the word cloud and the
    publication-level connected-texts table.

    Uses names not defined in this function: settings, functions, roundUp,
    pageConnData, publConnData, connectionsTemplate, ocrTemplate,
    generalTemplate and generateReferenceSimple.
    """
    print("=" * 80)
    print(citeKey)

    # OCR results: page name -> recognised text. The file is closed as soon
    # as it is parsed instead of staying open while all pages are written.
    jsonFile = pathToBibFile.replace(".bib", ".json")
    with open(jsonFile, "r", encoding="utf8") as jsonData:
        ocred = json.load(jsonData)

    # page name -> HTML substituted for @PAGELINKS@ on that page
    pageDic = functions.generatePageLinks(ocred.keys())

    # load page template
    with open(settings["template_page"], "r", encoding="utf8") as ft:
        template = ft.read()

    # load individual bib record
    bibDic = functions.loadBib(pathToBibFile)
    bibForHTML = functions.prettifyBib(bibDic[citeKey]["complete"])

    orderedPages = list(pageDic.keys())
    lastIndex = len(orderedPages) - 1
    pagesFolder = os.path.join(pathToBibFile.replace(citeKey + ".bib", ""), "pages")

    # table row used when no similarity data is available (built once, not
    # once per page)
    emptyResults = '<tr><td><i>%s</i></td><td><i>%s</i></td><td><i>%s</i></td></tr>'

    for o, k in enumerate(orderedPages):
        pageTemp = template
        pageTemp = pageTemp.replace("@PAGELINKS@", pageDic[k])
        pageTemp = pageTemp.replace("@PATHTOFILE@", "")
        pageTemp = pageTemp.replace("@CITATIONKEY@", citeKey)

        if k != "DETAILS":
            # regular page: start with the image of the page
            mainElement = '<img src="@PAGEFILE@" width="100%" alt="">'.replace("@PAGEFILE@", "%s.png" % k)

            # NOTE(review): roundUp(int(k), 5) presumably maps the page number
            # to the 5-page cluster used as key in pageConnData -- confirm.
            pageKey = citeKey + "_%05d" % roundUp(int(k), 5)
            if pageKey in pageConnData:
                formattedResults = "\n".join(pageConnData[pageKey])
            else:
                formattedResults = emptyResults % ("no data", "no data", "no data")

            mainElement += connectionsTemplate.replace("@CONNECTEDTEXTSTEMP@", formattedResults)
            mainElement += ocrTemplate.replace("@OCREDCONTENTTEMP@", ocred[k].replace("\n", "<br>"))
            pageTemp = pageTemp.replace("@MAINELEMENT@", mainElement)
        else:
            # details page: reference heading first
            reference = generateReferenceSimple(citeKey)
            mainElement = "<h3>%s</h3>\n\n" % reference

            # the BibTeX record, wrapped in the general element template
            bibElement = '<div class="bib">%s</div>' % bibForHTML.replace("\n", "<br> ")
            bibElement = generalTemplate.replace("@ELEMENTCONTENT@", bibElement)
            bibElement = bibElement.replace("@ELEMENTHEADER@", "BibTeX Bibliographical Record")
            mainElement += bibElement + "\n\n"

            # the word cloud image stored one folder above the pages
            wordCloud = '\n<img src="../' + citeKey + '_wCloud.jpg" width="100%" alt="wordcloud">'
            wordCloud = generalTemplate.replace("@ELEMENTCONTENT@", wordCloud)
            wordCloud = wordCloud.replace("@ELEMENTHEADER@", "WordCloud of Keywords (<i>tf-idf</i>)")
            mainElement += wordCloud + "\n\n"

            # publication-level connections
            if citeKey in publConnData:
                formattedResults = "\n".join(publConnData[citeKey])
            else:
                formattedResults = emptyResults % ("no data", "no data", "no data")

            mainElement += connectionsTemplate.replace("@CONNECTEDTEXTSTEMP@", formattedResults)

            pageTemp = pageTemp.replace("@MAINELEMENT@", mainElement)

        # @NEXTPAGEHTML@ and @PREVIOUSPAGEHTML@
        # The fixed targets 0001/0002 are only linked when that page is
        # actually generated; otherwise a publication with one OCRed page
        # (or none) would get a dead "next" link.
        if k == "DETAILS":
            nextPage = "0001.html" if "0001" in pageDic else ""
            prevPage = ""
        elif k == "0001":
            nextPage = "0002.html" if "0002" in pageDic else ""
            prevPage = "DETAILS.html"
        elif o == lastIndex:
            nextPage = ""
            prevPage = orderedPages[o - 1] + ".html"
        else:
            nextPage = orderedPages[o + 1] + ".html"
            prevPage = orderedPages[o - 1] + ".html"

        pageTemp = pageTemp.replace("@NEXTPAGEHTML@", nextPage)
        pageTemp = pageTemp.replace("@PREVIOUSPAGEHTML@", prevPage)

        pagePath = os.path.join(pagesFolder, "%s.html" % k)
        with open(pagePath, "w", encoding="utf8") as f9:
            f9.write(pageTemp)
示例#30
0
def generatePublicationInterface(citeKey, pathToBibFile):
    """Write the HTML pages of one publication: DETAILS.html plus one per OCRed page.

    Args:
        citeKey: citation key of the publication; used to pick its record
            from the bib file and substituted for @CITATIONKEY@.
        pathToBibFile: path to the publication's ``.bib`` file. The OCR
            results are read from the same path with ``.json`` instead of
            ``.bib``; the pages are written to the ``pages`` folder located
            where the bib file is.

    The page names are the keys of the dictionary returned by
    ``functions.generatePageLinks``. The page named "DETAILS" shows the
    bibliographic record, every other page shows its PNG image and OCR text.
    """
    print("=" * 80)
    print(citeKey)

    # OCR results: page name -> recognised text. The encoding is given
    # explicitly, like for the template and the output pages, so the text
    # is not decoded with the platform's default encoding. The file is
    # closed as soon as it is parsed.
    jsonFile = pathToBibFile.replace(".bib", ".json")
    with open(jsonFile, encoding="utf8") as jsonData:
        ocred = json.load(jsonData)

    # page name -> HTML substituted for @PAGELINKS@ on that page
    pageDic = functions.generatePageLinks(ocred.keys())

    # load page template
    with open(settings["template_page"], "r", encoding="utf8") as ft:
        template = ft.read()

    # load individual bib record
    bibDic = functions.loadBib(pathToBibFile)
    bibForHTML = functions.prettifyBib(bibDic[citeKey]["complete"])

    orderedPages = list(pageDic.keys())
    lastIndex = len(orderedPages) - 1
    pagesFolder = os.path.join(
        pathToBibFile.replace(citeKey + ".bib", ""), "pages")

    # one iteration per page file to create
    for o, k in enumerate(orderedPages):
        pageTemp = template
        pageTemp = pageTemp.replace("@PAGELINKS@", pageDic[k])
        pageTemp = pageTemp.replace("@PATHTOFILE@", "")
        pageTemp = pageTemp.replace("@CITATIONKEY@", citeKey)

        if k != "DETAILS":
            # regular page: scanned image plus its OCR text
            mainElement = '<img src="@PAGEFILE@" width="100%" alt="">'.replace(
                "@PAGEFILE@", "%s.png" % k)
            pageTemp = pageTemp.replace("@MAINELEMENT@", mainElement)
            pageTemp = pageTemp.replace("@OCREDCONTENT@",
                                        ocred[k].replace("\n", "<br>"))
        else:
            # details page: bib record plus word cloud, no OCR text
            mainElement = bibForHTML.replace("\n", "<br> ")
            mainElement = '<div class="bib">%s</div>' % mainElement
            mainElement += '\n<img src="wordcloud.jpg" width="100%" alt="wordcloud">'
            pageTemp = pageTemp.replace("@MAINELEMENT@", mainElement)
            pageTemp = pageTemp.replace("@OCREDCONTENT@", "")

        # @NEXTPAGEHTML@ and @PREVIOUSPAGEHTML@
        # The fixed targets 0001/0002 are only linked when that page is
        # actually generated; otherwise a publication with one OCRed page
        # (or none) would get a dead "next" link.
        if k == "DETAILS":
            nextPage = "0001.html" if "0001" in pageDic else ""
            prevPage = ""
        elif k == "0001":
            nextPage = "0002.html" if "0002" in pageDic else ""
            prevPage = "DETAILS.html"
        elif o == lastIndex:
            nextPage = ""
            prevPage = orderedPages[o - 1] + ".html"
        else:
            nextPage = orderedPages[o + 1] + ".html"
            prevPage = orderedPages[o - 1] + ".html"

        pageTemp = pageTemp.replace("@NEXTPAGEHTML@", nextPage)
        pageTemp = pageTemp.replace("@PREVIOUSPAGEHTML@", prevPage)

        # save the finished page
        pagePath = os.path.join(pagesFolder, "%s.html" % k)
        with open(pagePath, "w", encoding="utf8") as f9:
            f9.write(pageTemp)