def default(self, *args, **kwargs):
    #allow a plugin to handle a default url if it wants; it needs to return
    #a tuple (pagename, [Entry objects]) if it does
    call_result = run_callback(self.plugins, 'cb_default', args)
    if call_result != []:
        return self.render_page(call_result[1:], call_result[0])

    try:
        offset = int(kwargs.get('offset', 0))
    except ValueError:
        offset = 0

    z = args[0]
    l = len(args)

    if l <= len(self.timeformats):
        #check to see if args represent a date
        for fmt in self.timeformats[l - 1]:
            try:
                t = time.strptime(' '.join(args), fmt)
                if "%Y" in fmt:
                    year = t[0]
                else:
                    year = self.now().year
                if "%m" in fmt or "%b" in fmt or "%B" in fmt:
                    month = t[1]
                else:
                    month = None
                if "%d" in fmt:
                    day = t[2]
                else:
                    day = None
                entries = FileCabinet.get_entries_by_date(year, month, day)
                if entries:
                    entries = entries[offset:offset + config('num_entries')]
                    return self.render_page(entries, ' '.join(args), offset)
            except ValueError:
                #not a date - move on
                pass

    z = os.path.join(*args)
    fname = self.stripall(z, '.html', '.htm', '.txt')
    e = FileCabinet.get_one(fname, self.datadir)
    if e:
        return self.render_page([e])

    return self.error_page('Page Not Found', 404)
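# --- Illustrative sketch (not part of the original handler) ---
# The handler above treats URL segments as a possible date: the sublist of
# self.timeformats is chosen by how many segments there are, and time.strptime
# is tried against each format in that sublist. A minimal, standard-library-only
# demonstration of that lookup, with the format table copied from __init__ below:
import time

timeformats = [["%Y", "%d", "%m", "%b", "%B"],
               ["%Y %b", "%Y %m", "%Y %b", "%Y %B", "%m %d", "%b %d", "%B %d"],
               ["%Y %m %d", "%Y %b %d", "%Y %B %d"]]

def parse_date_segments(segments):
    """Return a time.struct_time for the first matching format, else None."""
    if len(segments) > len(timeformats):
        return None
    for fmt in timeformats[len(segments) - 1]:
        try:
            return time.strptime(' '.join(segments), fmt)
        except ValueError:
            continue  # not a date in this format; try the next one
    return None

# parse_date_segments(('2009', 'May', '15')) matches "%Y %b %d";
# parse_date_segments(('about',)) returns None, so the handler falls back to
# treating the path as a page name.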
def __init__(self):
    self.timeformats = [["%Y", "%d", "%m", "%b", "%B"],
                        ["%Y %b", "%Y %m", "%Y %b", "%Y %B", "%m %d", "%b %d", "%B %d"],
                        ["%Y %m %d", "%Y %b %d", "%Y %B %d"]]
    self.plugins = []  #contains all loaded plugins

    #set the output encoding
    self._cp_config["cpy.tools.encode.encoding"] = "utf-8"

    self.now = datetime.datetime.now
    self.last_update = self.now()
    self.num_entries = config('num_entries')
    self.datadir = config('datadir')
    self.ignore_directories = config('ignore_directories')
    self.fp = ''  #a cache of the front page content
    self.index()  #thus, we don't have to parse the metadata of the front
                  #page article when the second request comes in
    self.init_plugins(config('plugins'))
    FileCabinet.get_most_recent(self.datadir)  #initialize entries
def main():
    import FileCabinet
    import NormalizeVolume
    import sys, os
    from multiprocessing import Pool
    import MultiNormalizeProcess

    args = sys.argv

    inputfolder = args[1]
    outputfolder = args[2]

    if not os.path.isdir(inputfolder):
        print("Input folder " + inputfolder + " is not a directory.")
        sys.exit(0)

    if not os.path.isdir(outputfolder):
        print("Output folder " + outputfolder + " is not a directory.")
        sys.exit(0)

    infiles = os.listdir(inputfolder)

    already_converted = [x.replace('.tsv', '.txt') for x in os.listdir(outputfolder) if x.endswith('.tsv')]

    not_yet_converted = set(infiles) - set(already_converted)

    print("There are " + str(len(not_yet_converted)) + " files still to convert.")

    inpaths = [os.path.join(inputfolder, x) for x in not_yet_converted if x.endswith('.txt')]
    outpaths = [os.path.join(outputfolder, x).replace('.txt', '.tsv') for x in not_yet_converted if x.endswith('.txt')]

    debug = False

    pathdictionary = FileCabinet.loadpathdictionary('/Users/tunder/Dropbox/PythonScripts/workflow/PathDictionary.txt')

    datapath = pathdictionary['datapath']
    metadatapath = pathdictionary['metadatapath']
    metaoutpath = pathdictionary['metaoutpath']
    outpath = pathdictionary['outpath']

    pathpairs = list(zip(inpaths, outpaths, list(range(len(inpaths)))))

    pool = Pool(processes=12)
    res = pool.map_async(MultiNormalizeProcess.processvolume, pathpairs)

    res.wait()
    resultlist = res.get()
    pool.close()
    pool.join()

    os.system('say "your program has finished"')
def files(self, offset):
    return FileCabinet.get_most_recent(self.datadir, self.num_entries,
                                       self.ignore_directories, offset)
## We assume the slice name has been passed in as an argument.
slicename = sys.argv[1]
current_working = os.getcwd()

# This is most important when running on the cluster, where files are stored in a pairtree
# structure and the only way to know which files we're processing is to list HTIDS in a
# "slice" file defining a slice of the collection.

# When we're running on a local machine, I usually just group files to be processed in a
# directory, and create a list of files to process by listing files in that directory.
# However, it's still necessary to have a slicename and slicepath, because these get
# used to generate a path for an errorlog and list of long S files.

if not testrun:
    pathdictionary = FileCabinet.loadpathdictionary('/home/tunder/python/normalize/PathDictionary.txt')

if testrun:
    pathdictionary = FileCabinet.loadpathdictionary('/Users/tunder/Dropbox/PythonScripts/workflow/PathDictionary.txt')

datapath = pathdictionary['datapath']
metadatapath = pathdictionary['metadatapath']
metaoutpath = pathdictionary['metaoutpath']
outpath = pathdictionary['outpath']  # only relevant if testrun == True

slicepath = pathdictionary['slicepath'] + slicename + '.txt'
errorpath = pathdictionary['slicepath'] + slicename + 'errorlog.txt'
longSpath = pathdictionary['slicepath'] + slicename + 'longS.txt'
headeroutpath = pathdictionary['slicepath'] + slicename + "headers.txt"

# read in special-purpose london phrase list
inpath = '/Volumes/TARDIS/work/fullmeta/litenrichment.tsv'

newfic = []
oldfic = []

with open(inpath, encoding='utf-8') as f:
    reader = csv.DictReader(f, delimiter='\t')
    fieldnames = reader.fieldnames
    for row in reader:
        genre = row['sampledas']
        if genre != 'bio':
            continue  # right now we're running on biography

        authdate = row['authordate']
        birth, death = cabinet.parse_authordate(authdate)
        date = utils.date_row(row)

        if death > 0 and death < 1920:
            oldfic.append(row)
            continue
        elif death > 0 and death + 20 < date:
            oldfic.append(row)
            continue
        else:
            stdauthor = standardize_name(row['author'])
            row['stdauthor'] = stdauthor
            newfic.append(row)

def numeric_only(astring):
    numonly = ''
    for character in astring:
logistic = dict()
realclass = dict()
titles = dict()
dates = dict()

with open('../metadata/prestigeset.csv', encoding='utf-8') as f:
    reader = csv.DictReader(f)
    for row in reader:
        logistic[row['volid']] = float(row['logistic'])
        realclass[row['volid']] = row['prestige']
        titles[row['volid']] = row['title']
        dates[row['volid']] = int(row['dateused'])

sourcedir = '../sourcefiles/'
documents = filecab.get_wordcounts(sourcedir, '.tsv', set(logistic))

outrows = []

for docid, doc in documents.items():
    if docid not in logistic:
        continue
    else:
        allwords = 1
        colorct = 0

        for word, count in doc.items():
            allwords += count
            if word in colors:
                colorct += count
breakselected = str.maketrans(BreakablePunctuation, ' ')

## Translation map that erases most punctuation, including hyphens.
Punctuation = '.,():-—;"!?•$%@“”#<>+=/[]*^\'{}_■~\\|«»©&~`£·'
mosteraser = str.maketrans('', '', Punctuation)

punctuple = ('.', ',', '?', '!', ';')

delim = '\t'
foundcounter = 0
englishcounter = 0
pagedict = dict()

import FileCabinet
pathdictionary = FileCabinet.loadpathdictionary()

rulepath = pathdictionary['volumerulepath']

romannumerals = set()
with open(rulepath + 'romannumerals.txt', encoding='utf-8') as file:
    filelines = file.readlines()

for line in filelines:
    line = line.rstrip()
    romannumerals.add(line)

lexicon = dict()

with open(rulepath + 'MainDictionary.txt', encoding='utf-8') as file:
    filelines = file.readlines()
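# --- Illustrative sketch (not part of the original module) ---
# How the translation maps above get used: str.translate with a map built by
# str.maketrans('', '', chars) deletes every character in chars. A standalone
# example with the same Punctuation string; the sample sentence is made up.
Punctuation = '.,():-—;"!?•$%@“”#<>+=/[]*^\'{}_■~\\|«»©&~`£·'
mosteraser = str.maketrans('', '', Punctuation)

sample = 'Chap. IV: “The Heir of Red-gauntlet!”'
print(sample.translate(mosteraser))
# -> 'Chap IV The Heir of Redgauntlet'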
def process_a_file(file_tuple):
    global testrun, pairtreepath, datapath, genremapdir, felecterrors, selecttruths, debug, phraseset, pagevocabset, meaningfulheaders

    thisID, metadata_evidence = file_tuple

    perfileerrorlog = list()
    return_dict = dict()
    return_dict["htid"] = thisID
    return_dict["metadata"] = (thisID, "0", "0", "0", "0", "0")
    return_dict["errors"] = []
    return_dict["phrasecounts"] = dict()

    if testrun:
        cleanID = clean_pairtree(thisID.replace("norm.txt", ""))
    else:
        cleanID = clean_pairtree(thisID)

    if not testrun:
        filepath, postfix = FileCabinet.pairtreepath(thisID, datapath)
        filename = filepath + postfix + '/' + postfix + ".zip"
    else:
        filename = datapath + thisID

    # ACTUALLY READ THE FILE.

    if filename.endswith('.zip'):
        pagelist, successflag = read_zip(filename)
    else:
        pagelist, successflag = read_txt(filename)

    if successflag == "missing file":
        print(thisID + " is missing.")
        perfileerrorlog.append(thisID + '\t' + "missing")
        return_dict["errors"] = perfileerrorlog
        return return_dict
    elif successflag == "pagination error":
        print(thisID + " has a pagination problem.")
        perfileerrorlog.append(thisID + '\t' + "paginationerror")
        return_dict["errors"] = perfileerrorlog
        return return_dict
    elif successflag == "unicode error":
        print(thisID + " can not be decoded by unicode.")
        perfileerrorlog.append(thisID + '\t' + "unicode error")
        return_dict["errors"] = perfileerrorlog
        return return_dict

    tokens, pre_matched, pre_english, pagedata, headerlist = NormalizeVolume.as_stream(pagelist, verbose=debug)

    if pre_english < 0.6:
        perfileerrorlog.append(thisID + '\t' + "not english")

    tokencount = len(tokens)

    if len(tokens) < 10:
        print(thisID, "has only tokencount", len(tokens))
        perfileerrorlog.append(thisID + '\t' + 'short')

    correct_tokens, pages, post_matched, post_english = NormalizeVolume.correct_stream(tokens, verbose=debug)

    # Combine page dictionaries into a master dictionary.
    # If you ask, why didn't you just produce one in the first place? ...
    # answer has to do with flexibility of the Volume module for other purposes.

    pagecounter = 0
    masterdict = dict()

    for page in pages:
        for item in page:
            if item in masterdict:
                masterdict[item] += page[item]
            else:
                masterdict[item] = page[item]

    # Now that we have a master dictionary, consider whether there are long-s problems.
    # This algorithm works adequately.

    errors = 1
    truths = 1
    # Initialized to 1 as a Laplacian correction.

    for word in felecterrors:
        errors = errors + masterdict.get(word, 0)

    for word in selecttruths:
        truths = truths + masterdict.get(word, 0)

    if truths > errors:
        LongSproblem = False
    else:
        LongSproblem = True

    if LongSproblem == False:
        corrected = correct_tokens
        deleted = dict()
        added = dict()
    else:
        deleted, added, corrected, changedphrases, unchanged = Context.catch_ambiguities(correct_tokens, debug)
        # okay, this is crazy and not efficient to run, but it's easy to write and there are a small number
        # of these files -- so I'm going to count the new contextually-corrected tokens by re-running them
        # through Volume.
        correct_tokens, pages, post_matched, post_english = NormalizeVolume.correct_stream(corrected, verbose=debug)
        corrected = correct_tokens

    # If we are upvoting tokens in the header, they need to be added here.

    if len(pages) != len(headerlist):
        print(thisID + " fails a routine check of alignment between pages and headers.")
    else:
        for index, page in enumerate(pages):
            thispageheader = headerlist[index]
            header_tokens, header_pages, dummy1, dummy2 = NormalizeVolume.correct_stream(thispageheader, verbose=debug)
            headerdict = header_pages[0]
            for key, value in headerdict.items():
                if key in meaningfulheaders:
                    if key in page:
                        page[key] += 2
                        # a fixed increment no matter how many times the word occurs in the
                        # header
                    else:
                        page[key] = 2
                        print("Word " + key + " in headerdict for " + thisID + " at " + str(index) + " but not main page.")

    # Write corrected file.
    cleanHTID = clean_pairtree(thisID)

    if testrun:
        if cleanHTID.endswith(".clean.txt"):
            outHTID = cleanHTID.replace(".clean.txt", "")
        elif cleanHTID.endswith("norm.txt"):
            outHTID = cleanHTID.replace("norm.txt", ".norm.txt")
        elif cleanHTID.endswith(".txt"):
            outHTID = cleanHTID.replace(".txt", "norm.txt")
        else:
            outHTID = cleanHTID + ".norm.txt"

        outfilename = outpath + "texts/" + outHTID
    else:
        outfilename = filepath + postfix + '/' + postfix + ".norm.txt"

    with open(outfilename, mode='w', encoding='utf-8') as file:
        for token in corrected:
            if token != '\n' and token != "“" and not (token.startswith('<') and token.endswith('>')):
                token = token + " "
            file.write(token)

    if len(pages) != len(pagedata):
        perfileerrorlog.append("Discrepancy between page data and page metadata in \t" + thisID)
        return_dict["errors"] = perfileerrorlog
        return return_dict

    totalwordsinvol = 0

    if testrun:
        if cleanHTID.endswith(".clean.txt"):
            outHTID = cleanHTID.replace(".clean.txt", ".pg.tsv")
        elif cleanHTID.endswith("norm.txt"):
            outHTID = cleanHTID.replace("norm.txt", ".pg.tsv")
        elif cleanHTID.endswith(".txt"):
            outHTID = cleanHTID.replace(".txt", ".pg.tsv")
        else:
            outHTID = cleanHTID + ".pg.tsv"

        outfilename = outpath + "pagefeatures/" + outHTID
    else:
        outfilename = filepath + postfix + '/' + postfix + ".pg.tsv"

    with open(outfilename, mode='w', encoding='utf-8') as file:

        if metadata_evidence["biography"]:
            file.write("-1\t#metaBiography\t0\n")

        if metadata_evidence["drama"]:
            file.write("-1\t#metaDrama\t0\n")

        if metadata_evidence["fiction"]:
            file.write("-1\t#metaFiction\t0\n")

        if metadata_evidence["poetry"]:
            file.write("-1\t#metaPoetry\t0\n")

        numberofpages = len(pages)
        for index, page in enumerate(pages):

            # This is a shameful hack that should be deleted later.
            if testrun and "estimated" in page and "percentage" in page and (index + 3) > numberofpages:
                continue
            if testrun and "untypical" in page and (index + 2) > numberofpages:
                continue

            otherfeatures = 0

            for feature, count in page.items():
                if feature in pagevocabset or feature.startswith("#"):
                    outline = str(index) + '\t' + feature + '\t' + str(count) + '\n'
                    # pagenumber, featurename, featurecount
                    file.write(outline)
                else:
                    otherfeatures += count

                if not feature.startswith("#"):
                    totalwordsinvol += count
                # This is because there are structural features like #allcapswords
                # that should not be counted toward total token count.

            structural_features = pagedata[index]
            for feature, count in structural_features.items():
                if count > 0 or feature == "#textlines":
                    outline = str(index) + '\t' + feature + '\t' + str(count) + '\n'
                    file.write(outline)

            if otherfeatures > 0:
                outline = str(index) + '\t' + "wordNotInVocab" + '\t' + str(otherfeatures) + '\n'
                file.write(outline)

    metatuple = (thisID, str(totalwordsinvol), str(pre_matched), str(pre_english), str(post_matched), str(post_english))

    return_dict["metadata"] = metatuple
    return_dict["errors"] = perfileerrorlog

    return return_dict
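# --- Illustrative sketch (not part of the original script) ---
# The long-s test in process_a_file boils down to comparing two Laplace-smoothed
# counts: likely f/s confusions ('fee', 'fea', ...) versus their plausible
# readings ('see', 'sea', ...). A standalone version of just that decision,
# applied to made-up wordcount dictionaries:
felecterrors = ['fee', 'fea', 'fay', 'fays', 'fame', 'fell', 'funk', 'fold', 'haft', 'fat', 'fix', 'chafe', 'loft']
selecttruths = ['see', 'sea', 'say', 'says', 'same', 'sell', 'sunk', 'sold', 'hast', 'sat', 'six', 'chase', 'lost']

def has_long_s_problem(masterdict):
    errors = 1
    truths = 1  # initialized to 1 as a Laplacian correction
    for word in felecterrors:
        errors += masterdict.get(word, 0)
    for word in selecttruths:
        truths += masterdict.get(word, 0)
    return errors >= truths

print(has_long_s_problem({'fee': 12, 'fame': 7, 'see': 3}))   # True: f-forms dominate
print(has_long_s_problem({'see': 40, 'same': 11, 'fee': 2}))  # False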
    if not os.path.exists(sourcedir + docid + '.tsv'):
        continue

    docs.append(row['volid'])
    logistic.append(float(row['logistic']))
    dates.append(float(row['dateused']))

logistic = np.array(logistic)
dates = np.array(dates)

numdocs = len(docs)

categories = dict()
for field in fields:
    categories[field] = np.zeros(numdocs)

wordcounts = filecab.get_wordfreqs(sourcedir, '.tsv', docs)

for i, doc in enumerate(docs):
    ctcat = Counter()
    allcats = 0
    for word, count in wordcounts[doc].items():
        allcats += count
        for field in fields:
            if word in inquirer[field]:
                ctcat[field] += count
    for field in fields:
        categories[field][i] = ctcat[field] / (allcats + 1)

logresults = []
dateresults = []
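# --- Illustrative sketch (not part of the original script) ---
# The loop above computes, for each document, the share of its tokens that fall
# in each category of the inquirer lexicon, with +1 smoothing in the denominator.
# A standalone miniature with made-up category lists and wordcounts:
from collections import Counter

inquirer = {'positive': {'good', 'fine'}, 'negative': {'bad'}}
fields = list(inquirer)
doc_counts = {'good': 3, 'bad': 1, 'table': 6}

ctcat = Counter()
allcats = sum(doc_counts.values())
for word, count in doc_counts.items():
    for field in fields:
        if word in inquirer[field]:
            ctcat[field] += count

proportions = {field: ctcat[field] / (allcats + 1) for field in fields}
print(proportions)  # {'positive': 0.2727..., 'negative': 0.0909...}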
import glob
import os
import TokenGen
import Dictionary
import TypeIndex
import FileCabinet

debug = False
batchcount = 100

pathdictionary = FileCabinet.loadpathdictionary()

datapath = pathdictionary["datapath"]
metadatapath = pathdictionary["metadatapath"]
dictionarypath = pathdictionary["dictionarypath"]
outputpath = pathdictionary["outputpath"]

if os.path.isfile(outputpath + "processed.txt"):
    with open(outputpath + "processed.txt", encoding="utf-8") as file:
        lines = file.readlines()
    startindex = int(lines[-1]) + 1
else:
    startindex = 0

HTIDfile = metadatapath + "htids.txt"

with open(HTIDfile, encoding="utf-8") as file:
    HTIDlist = file.readlines()

if startindex >= len(HTIDlist):
    print("Finished processing the whole list of volume IDs.")
    quit()
import glob
import os
import TokenGen
import Dictionary
import TypeIndex
import FileCabinet

debug = False
batchcount = 5000

pathdictionary = FileCabinet.loadpathdictionary()

datapath = pathdictionary["datapath"]
metadatapath = pathdictionary["metadatapath"]
dictionarypath = pathdictionary["dictionarypath"]
outputpath = pathdictionary["outputpath"]

if os.path.isfile(outputpath + "processed.txt"):
    with open(outputpath + "processed.txt", encoding="utf-8") as file:
        lines = file.readlines()
    startindex = int(lines[-1]) + 1
else:
    startindex = 0

HTIDfile = metadatapath + "htids.txt"

with open(HTIDfile, encoding="utf-8") as file:
    HTIDlist = file.readlines()

if startindex >= len(HTIDlist):
    print("Finished processing the whole list of volume IDs.")
    quit()
with open(HTIDfile, encoding="utf-8") as file:
    HTIDlist = file.readlines()

Lexicon = Dictionary.BuildLexicon(dictionarypath, debug)

writename = slicename + "IND.txt"

delim = '\t'

BigIndex = dict()
SortedIndex = list()

for IDtoprocess in HTIDlist:
    IDtoprocess = IDtoprocess.strip()
    filepath, postfix = FileCabinet.pairtreepath(IDtoprocess, datapath)
    filename = filepath + postfix + '/' + postfix + ".txt"

    try:
        with open(filename, encoding='utf-8') as file:
            lines = file.readlines()
        successflag = True
    except IOError as e:
        successflag = False

    if not successflag:
        print(IDtoprocess + " is missing.")
        continue

    tokens = TokenGen.keep_hyphens(lines, Lexicon, verbose=debug)
def main():
    import FileCabinet
    import FileUtils
    import Volume2
    import Context
    import sys
    import os

    # DEFINE CONSTANTS.
    delim = '\t'
    debug = False
    felecterrors = ['fee', 'fea', 'fay', 'fays', 'fame', 'fell', 'funk', 'fold', 'haft', 'fat', 'fix', 'chafe', 'loft']
    selecttruths = ['see', 'sea', 'say', 'says', 'same', 'sell', 'sunk', 'sold', 'hast', 'sat', 'six', 'chase', 'lost']

    # Locate ourselves in the directory structure.

    cwd = os.getcwd()
    cwdparent = os.path.abspath(os.path.join(cwd, os.pardir))

    # We need to find a directory called 'rulesets,' which we expect to be located
    # either within the working directory or adjacent to it.

    if os.path.isdir(os.path.join(cwd, "rulesets")):
        rulepath = os.path.join(cwd, "rulesets")
    elif os.path.isdir(os.path.join(cwdparent, "rulesets")):
        rulepath = os.path.join(cwdparent, "rulesets")
    else:
        user = input("Please specify a path to the ruleset directory: ")
        if os.path.isdir(user):
            rulepath = user
        else:
            print("Invalid path.")
            sys.exit()

    # Use rulepath to load relevant rules inside modules.

    Volume2.importrules(rulepath)
    Context.importrules(rulepath)

    # Now we enter dialogue with the user. This is all a little 1982,
    # but what can I say? Wetware upgrades are expensive.

    def prompt(promptstring, options):
        user = input(promptstring)
        if user not in options:
            user = prompt(promptstring, options)
        return user

    # Ask the user to tell us how to find files to process.

    print("****************** CorrectOCR 0.1 ******************")
    print()
    print("Do you want the full spiel (explanations, caveats, etc.)")
    user = prompt("y/n : ", ["y", "n"])

    if user.lower() == "y":
        spielpath = os.path.join(cwd, "spiel.txt")
        with open(spielpath, encoding='utf-8') as file:
            filelines = file.readlines()
        for line in filelines:
            print(line, end='')

    print("\nThis script will correct .txt files, or extract text")
    print("from zipped archives containing one txt file for each page.")
    print("In either case it writes the cleaned files back to their")
    print("original locations with the new suffix '.clean.txt'.")

    print("\nDo you want to unpack .zip files or .txt files?")
    user = prompt("zip or txt: ", ["zip", "txt"])
    suffix = "." + user
    suffixlen = len(suffix)

    print("\nThere are two ways to identify the location of the")
    print("files to be corrected.")
    print("\n1. Provide the path to a folder that contains them. I'll")
    print("recursively search subdirectories of that folder as well. Or,")
    print("\n2. Provide a file holding a list of pairtree file identifiers,")
    print("e.g. HathiTrust Volume IDs. I can use those identifiers to infer")
    print("the paths to the files themselves.\n")

    user = prompt("Which option do you prefer (1 or 2)? ", ["1", "2"])

    if user == "1":
        rootpath = input("Path to the folder that contains source files: ")
        filelist = FileUtils.recursivefilegetter(rootpath, suffix)
    else:
        print("I expect the pairtree identifiers to be listed one per line,")
        print("and to be the only item on a line.")
        filepath = input("Path to the file that contains pairtree identifiers: ")
        filelist = list()
        with open(filepath, encoding='utf-8') as file:
            filelines = file.readlines()
        print("Now I need a path to the folder that contains the pairtree structure.")
        print("If you have multiple folders for different libraries, this should be")
        print("the folder above them all. It should end with a slash.")
        rootpath = input("Path to the folder that contains pairtree: ")
        for line in filelines:
            line = line.rstrip()
            filepath, postfix = FileCabinet.pairtreepath(line, rootpath)
            filename = filepath + postfix + '/' + postfix + suffix
            filelist.append(filename)

    print("\nI identified", len(filelist), "files in that location.")

    print("\nI can just write clean text files (with suffix clean.txt)")
    print("or I can also write tab-separated files that count the words")
    print("in each file after correction.")
    user = prompt("1) Text only or 2) text-plus-wordcounts? (1 or 2): ", ["1", "2"])
    if user == "1":
        wordcountflag = False
    else:
        wordcountflag = True

    print("Now proceeding to process the files.\n")

    def subtract_counts(token, adict, tosubtract):
        '''Adjusts a dictionary by subtracting tosubtract instances of token.'''
        if token in adict:
            adict[token] = adict[token] - tosubtract
            if adict[token] < 0:
                del adict[token]
            elif adict[token] < 1:
                del adict[token]
        return adict

    def add_counts(token, adict, toadd):
        '''Adjusts a dictionary by adding toadd instances of token.'''
        if token in adict:
            adict[token] = adict[token] + toadd
        else:
            adict[token] = toadd
        return adict

    # Here's where we BEGIN THE ACTUAL CORRECTION OF FILES.

    processedmeta = list()
    errorlog = list()
    longSfiles = list()
    count = 0

    for filename in filelist:

        try:
            if suffix == ".zip":
                lines = FileUtils.readzip(filename)
                successflag = True
            else:
                with open(filename, encoding='utf-8') as file:
                    lines = file.readlines()
                successflag = True
        except IOError as e:
            successflag = False

        if not successflag:
            print(filename + " is missing.")
            errorlog.append(filename + '\t' + "missing")
            continue

        tokens, pre_matched, pre_english = Volume2.as_stream(lines, verbose=debug)

        tokencount = len(tokens)

        if len(tokens) < 10:
            print(filename, "has only tokencount", len(tokens))
            errorlog.append(filename + '\t' + 'short')

        correct_tokens, pages, post_matched, post_english = Volume2.correct_stream(tokens, verbose=debug)

        # Combine page dictionaries into a master dictionary.
        # If you ask, why didn't you just produce one in the first place? ...
        # answer has to do with flexibility of the Volume module for other purposes.

        pagecounter = 0
        masterdict = dict()

        for page in pages:
            for item in page:
                if item in masterdict:
                    masterdict[item] += page[item]
                else:
                    masterdict[item] = page[item]

        # Now that we have a master dictionary, consider whether there are long-s problems.
        # This algorithm works adequately.

        errors = 1
        truths = 1
        totaladded = 0
        totaldeleted = 0
        # Initialized to 1 as a Laplacian correction.

        for word in felecterrors:
            errors = errors + masterdict.get(word, 0)

        for word in selecttruths:
            truths = truths + masterdict.get(word, 0)

        if truths > errors:
            LongSproblem = False
        else:
            LongSproblem = True

        if LongSproblem == False:
            corrected = correct_tokens
        else:
            longSfiles.append(filename)
            deleted, added, corrected, changedphrases, unchanged = Context.catch_ambiguities(correct_tokens, debug)

            ## Adjust wordcounts to reflect contextual spellchecking.

            if wordcountflag:
                for word, count in deleted.items():
                    masterdict = subtract_counts(word, masterdict, count)
                    totaldeleted = totaldeleted + count
                for word, count in added.items():
                    masterdict = add_counts(word, masterdict, count)
                    totaladded = totaladded + count

        # Write corrected file.
        outfilename = filename[:-suffixlen] + ".clean.txt"

        with open(outfilename, mode='w', encoding='utf-8') as file:
            lasttoken = ""
            for token in corrected:
                if lasttoken == '\n' and (token == '"' or token == "'"):
                    token = token
                elif token != '\n' and token != "“" and not (token.startswith('<') and token.endswith('>')):
                    token = token + " "
                file.write(token)
                lasttoken = token

        print(outfilename)

        ## If we're also writing wordcount files, we need to write the .tsv file.

        if wordcountflag:
            outlist = sorted(masterdict.items(), key=lambda x: x[1], reverse=True)
            outfilename = outfilename[:-10] + ".vol.tsv"
            totalwordsinvol = 0
            with open(outfilename, mode='w', encoding='utf-8') as file:
                for item in outlist:
                    outline = item[0] + delim + str(item[1]) + '\n'
                    file.write(outline)
                    totalwordsinvol += item[1]
            print(outfilename)

        metatuple = (outfilename, str(totalwordsinvol), str(pre_matched), str(pre_english), str(post_matched), str(post_english), str(totaladded), str(totaldeleted))
        processedmeta.append(metatuple)

        count += 1
        if count > 200:
            break

    # END ITERATION ACROSS FILES.

    # Write the errorlog and list of long S files.

    errorpath = FileUtils.clearpath(rootpath, "processingerrors.txt")
    longSpath = FileUtils.clearpath(rootpath, "longSfiles.txt")
    metapath = FileUtils.clearpath(rootpath, "processing_metadata.tsv")

    if len(errorlog) > 0:
        with open(errorpath, mode='w', encoding='utf-8') as file:
            for line in errorlog:
                file.write(line + '\n')
        print("Writing", errorpath)

    if len(longSfiles) > 0:
        with open(longSpath, mode='w', encoding='utf-8') as file:
            for line in longSfiles:
                file.write(line + '\n')
        print("Writing", longSpath)

    if len(processedmeta) > 0:
        with open(metapath, mode='w', encoding='utf8') as file:
            file.write('filename\twordinvol\toriginallyindict\toriginallyenglish\tindictpostcorrection\tenglishpostcorrection\taddedbycontextmodule\tdeletedbycontextmodule\n')
            for atuple in processedmeta:
                outline = '\t'.join(atuple) + '\n'
                file.write(outline)
        print("Writing", metapath)
import FileCabinet
import Volume
import Context
import sys

# DEFINE CONSTANTS.
delim = '\t'
debug = False

# LOAD PATHS.
slicename = sys.argv[1]
outfilename = sys.argv[2]

## We assume the slice name has been passed in as an argument.

pathdictionary = FileCabinet.loadpathdictionary('/home/tunder/python/tokenize/PathDictionary.txt')

datapath = pathdictionary['datapath']
slicepath = pathdictionary['slicepath'] + slicename + '.txt'
metadatapath = pathdictionary['metadatapath']
metaoutpath = pathdictionary['slicepath'] + slicename + 'acc.txt'
errorpath = pathdictionary['slicepath'] + slicename + 'errorlog.txt'
longSpath = pathdictionary['slicepath'] + slicename + 'longS.txt'

with open(slicepath, encoding="utf-8") as file:
    HTIDlist = file.readlines()

HTIDs = set()

for thisID in HTIDlist:
    thisID = thisID.rstrip()
import SonicScrewdriver as utils
import FileCabinet as filecab

# start by loading the hard seeds

stanford = set()
with open('../lexicons/stanford.csv', encoding='utf-8') as f:
    reader = csv.DictReader(f)
    for row in reader:
        if row['class'] == 'hard':
            stanford.add(row['word'])

sourcedir = '../sourcefiles/'

pairedpaths = filecab.get_pairedpaths(sourcedir, '.tsv')

docids = [x[0] for x in pairedpaths]

wordcounts = filecab.get_wordcounts(sourcedir, '.tsv', docids)

metapath = '../metadata/allgenremeta.csv'

genredict = dict()
datedict = dict()
with open(metapath, encoding='utf-8') as f:
    reader = csv.DictReader(f)
    for row in reader:
        date = int(row['firstpub'])
        genre = row['genretags']
        docid = row['docid']
# USAGE:
# from within this /workflow directory:
# python NormalizeOneFile.py file_to_crunch.txt > output.tsv
# The paths in NormalizeVolume only work if you do it from
# within this directory.

import FileCabinet
import NormalizeVolume
import sys

debug = False

pathdictionary = FileCabinet.loadpathdictionary('/Users/tunder/Dropbox/PythonScripts/workflow/PathDictionary.txt')

datapath = pathdictionary['datapath']
metadatapath = pathdictionary['metadatapath']
metaoutpath = pathdictionary['metaoutpath']
outpath = pathdictionary['outpath']

targetfile = sys.argv[1]

with open(targetfile, encoding='utf-8') as f:
    text = f.readlines()

tokens, pre_matched, pre_english, pagedata, headerlist = NormalizeVolume.as_stream([text], verbose=debug)

correct_tokens, pages, post_matched, post_english = NormalizeVolume.correct_stream(tokens, verbose=debug)

pagecounter = 0
masterdict = dict()
## CONTEXT.py

'''Contextual spellchecker. On being imported, it loads rulesets.
The function as_stream reduces a file to a tokenstream and tests to see
whether this is a long-s file needing correction. (Ideally you should only
run it on pre-1830 files that might fall into that category.)
Then the function "catch_ambiguities" can be called for a specific file.
'''

# IMPORTS.

import FileCabinet

pathdictionary = FileCabinet.loadpathdictionary()

rulepath = pathdictionary['contextrulepath']

# CONSTANTS.
delim = '\t'
punctuationset = {'.', ',', '?', '!', ';', ')'}
# There's a reason why we don't include left paren. See 'catch_ambiguities.'

flipslipper = ['flip', 'flips', 'flipped', 'flipping', 'flay', 'flays', 'flayed', "flay'd"]
# The triadic problems flip - slip - ship and flay - slay - stay require special treatment.

felecterrors = ['fee', 'fea', 'fay', 'fays', 'fame', 'fell', 'funk', 'fold', 'haft', 'fat', 'fix', 'chafe', 'loft']
selecttruths = ['see', 'sea', 'say', 'says', 'same', 'sell', 'sunk', 'sold', 'hast', 'sat', 'six', 'chase', 'lost']
# Of course, either set could be valid. But I expect the second to be more common.
# The comparison is used as a test.
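# --- Illustrative sketch (not part of this module) ---
# How CONTEXT.py is used by the correction scripts above: once a volume has been
# reduced to a token stream and spell-corrected, catch_ambiguities is called on
# the tokens when the long-s test suggests f/s confusion. The five-tuple return
# mirrors the call sites shown earlier in this collection; the token list here
# is made up.
import Context

tokens = ['the', 'fame', 'fea', 'was', 'calm']  # hypothetical long-s-afflicted text
debug = False

deleted, added, corrected, changedphrases, unchanged = Context.catch_ambiguities(tokens, debug)
# 'corrected' is the token stream with contextually disambiguated words;
# 'deleted' and 'added' are the wordcount adjustments that the calling scripts
# fold back into their per-volume dictionaries.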
## We assume the slice name has been passed in as an argument.
slicename = sys.argv[1]
current_working = os.getcwd()

# This is most important when running on the cluster, where files are stored in a pairtree
# structure and the only way to know which files we're processing is to list HTIDS in a
# "slice" file defining a slice of the collection.

# When we're running on a local machine, I usually just group files to be processed in a
# directory, and create a list of files to process by listing files in that directory.
# However, it's still necessary to have a slicename and slicepath, because these get
# used to generate a path for an errorlog and list of long S files.

if not testrun:
    pathdictionary = FileCabinet.loadpathdictionary('/home/tunder/python/normalize/PathDictionary.txt')

if testrun:
    pathdictionary = FileCabinet.loadpathdictionary('/Users/tunder/Dropbox/PythonScripts/workflow/PathDictionary.txt')

datapath = pathdictionary['datapath']
metadatapath = pathdictionary['metadatapath']
metaoutpath = pathdictionary['metaoutpath']
outpath = pathdictionary['outpath']  # only relevant if testrun == True

slicepath = pathdictionary['slicepath'] + slicename + '.txt'
errorpath = pathdictionary['slicepath'] + slicename + 'errorlog.txt'
longSpath = pathdictionary['slicepath'] + slicename + 'longS.txt'
phrasecountpath = pathdictionary['slicepath'] + slicename + 'phrasecount.json'
headeroutpath = pathdictionary['slicepath'] + slicename + "headers.txt"