# NOTE(review): this fragment arrived with its line breaks and indentation
# collapsed; the layout below is reconstructed -- confirm the nesting against
# the original file.
from collator3 import collate
import filekeeping
from zipfile import ZipFile
from glob import glob

# Root directory handed to filekeeping.pairtreepath() for every identifier.
collectiondir = '/Volumes/obelisk/zipped/serials/'

# Identifiers looped over below; each one is printed and then resolved to a
# (path, postfix) pair by filekeeping.pairtreepath().
HTids_to_process = ['mdp.39015065345954']

# NOTE(review): ``count`` is set to 0 again inside the ``with`` block below,
# so this initial value is overwritten on every iteration.
count = 0

for HTid in HTids_to_process:
    print(HTid)
    ## Change to match scheme for non-simple HTids...
    path, postfix = filekeeping.pairtreepath(HTid, collectiondir)
    # The zip file is expected at <path><postfix>/<postfix>.zip.
    pagepath = path + postfix + "/"
    filename = postfix + ".zip"

    # For each HTid, we get a path in the pairtree structure.
    # Then we read page files, and concatenate them in a list of pages
    # where each page is a list of lines.

    pagelist = []

    with ZipFile(pagepath + filename,mode='r') as zipvol:
        # Member names, sorted as strings.
        zippages = zipvol.namelist()
        zippages.sort()
        # Discard the first name in sorted order -- presumably the archive's
        # directory entry; TODO confirm. Raises IndexError if the archive
        # has no members.
        del zippages[0]
        count = 0
        for f in zippages:
            # (loop body lies beyond the end of the visible fragment)
if header in remove: del pagelist[idx][0] if idx in divplace: page.insert(0,"<div id=\"" + divplace[idx][1] + "\" code=\"" + str(divplace[idx][3]) + "\" wordcount=\"" + str(divplace[idx][2]) + "\">\n") pagelist[divplace[idx][0]].append("</div>\n") return pagelist for HTid in HTids_toprocess: # For each HTid, we get a path in the pairtree structure. # Then we read page files, and concatenate them in a list of pages # where each page is a list of lines. path, postfix = filekeeping.pairtreepath(HTid,pairtree_rootpath) pagepath = path + postfix + "/" + postfix + "/" pagefiles = os.listdir(pagepath) pagelist = [] for f in pagefiles: if f[0] == ".": continue with open(pagepath + f, encoding='utf-8') as file: linelist = file.readlines() pagelist.append(linelist) # We're going to keep pageheaders rigorously aligned with pagelist, # so every page gets a 'header,' even if blank.