def choose_cascade(htid):
    '''Reads metadata about this volume and uses it to decide
    what metadata-level features should be assigned.'''

    global rowindices, columns, metadata, modelindices, modeldata

    probablydrama = False
    probablypoetry = False
    probablybiography = False
    probablyfiction = False
    maybefiction = False

    htid = utils.pairtreelabel(htid)
    # convert the clean pairtree filename into a dirty pairtree label for metadata matching

    if htid not in rowindices:
        # We have no metadata for this volume.
        print("Volume missing from ExtractedMetadata.tsv: " + htid)
    else:
        genrestring = metadata["genres"][htid]
        genreinfo = genrestring.split(";")
        # It's a semicolon-delimited list of items.

        for info in genreinfo:
            if info == "Biography" or info == "Autobiography":
                probablybiography = True
            if info == "Fiction" or info == "Novel":
                probablyfiction = True
            if info == "Poetry" or info == "Poems":
                probablypoetry = True
            if info == "Drama" or info == "Tragedies" or info == "Comedies":
                probablydrama = True

        if htid in modelindices:
            title = metadata["title"][htid].lower()
            titlewords = title.split()

            # The tuple is ordered bio, dra, fic, non, poe, so index 4 is poetry,
            # index 1 is drama, and index 2 is fiction.
            maxgenre = maxoption((modeldata["bio"][htid], modeldata["dra"][htid], modeldata["fic"][htid], modeldata["non"][htid], modeldata["poe"][htid]))

            if maxgenre == 4 and ("poems" in titlewords or "poetical" in titlewords):
                probablypoetry = True

            if maxgenre == 1:
                probablydrama = True

            if maxgenre == 2:
                maybefiction = True

    return probablybiography, probablydrama, probablyfiction, probablypoetry, maybefiction
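# maxoption() is called above but not defined in this excerpt. Judging from the
# call site (the tuple is ordered bio, dra, fic, non, poe, and the result is
# compared against integer positions), it appears to return the index of the
# largest value. This is a minimal sketch inferred from that usage, not the
# confirmed implementation.
def maxoption(sequence):
    '''Return the index of the largest value in a sequence of scores.'''
    return max(range(len(sequence)), key=lambda idx: sequence[idx])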
def get_genrevector(volumeIDs, boundarydef):
    global epindices, nonindices

    n = len(volumeIDs)
    genrevector = np.zeros(n)

    if boundarydef == "nonepistolary / epistolary":
        for idx, volID in enumerate(volumeIDs):
            cleanID = utils.pairtreelabel(volID)

            if cleanID in epindices:
                genrevector[idx] = 1
            elif cleanID in nonindices:
                genrevector[idx] = 0
            else:
                print("Error, missing in metadata: " + cleanID)

    return genrevector
def get_metadata_evidence(htid, rowindices, columns, metadata):
    '''Reads metadata about this volume and uses it to decide
    what metadata-level features should be assigned.'''

    metadata_evidence = dict()

    metadata_evidence["drama"] = False
    metadata_evidence["poetry"] = False
    metadata_evidence["biography"] = False
    metadata_evidence["fiction"] = False

    htid = utils.pairtreelabel(htid)
    # convert the clean pairtree filename into a dirty pairtree label for metadata matching

    if htid not in rowindices:
        # We have no metadata for this volume.
        return metadata_evidence
    else:
        genrestring = metadata["genres"][htid]
        genreinfo = genrestring.split(";")
        # It's a semicolon-delimited list of items.

        for info in genreinfo:
            if info == "Biography" or info == "Autobiography":
                metadata_evidence["biography"] = True
            if info == "Fiction" or info == "Novel":
                metadata_evidence["fiction"] = True
            if info == "Poetry" or info == "Poems":
                metadata_evidence["poetry"] = True
            if info == "Drama" or info == "Tragedies" or info == "Comedies":
                metadata_evidence["drama"] = True

    return metadata_evidence
fictionFPs = list()
fictionTNs = list()
fictionFNs = list()

dramaTPs = list()
dramaFPs = list()
dramaTNs = list()
dramaFNs = list()

genrefeatures = dict()
genreprecisions = dict()
modeledvols = dict()

for filename in predicts:
    mapname = filename.replace('.predict', '.map')
    labelid = utils.pairtreelabel(filename.replace('.predict', ''))
    fileid = filename.replace('.predict', '')

    if mapname in firstmaps:
        firstpath = os.path.join(firstsource, mapname)
        if os.path.isfile(firstpath):
            with open(firstpath, encoding='utf-8') as f:
                filelines = f.readlines()
            success = True
            wordcounts = firstwordcounts[fileid]
        else:
            success = False

    elif mapname in secondmaps:
        secondpath = os.path.join(secondsource, mapname)
        if os.path.isfile(secondpath):
            with open(secondpath, encoding='utf-8') as f:
def choose_cascade(htid, pagepredictions):
    '''Reads metadata about this volume and uses it, combined with the thrust
    of page-level predictions, to decide what other models, if any, should be
    used to correct/adjust current predictions.

    Returns five boolean flags, indicating whether the volume is
    1) Mostly drama and poetry.
    2) Probably biography.
    3) Probably fiction.
    4) Probably not drama.
    5) Probably not fiction.

    It's entirely conceivable that more than one of these flags could be true
    at the same time. In that case no cascade will be applied, because we have
    inconsistent/untrustworthy evidence.'''

    global rowindices, columns, metadata

    # Make a defensive copy of current page predictions.
    genresequence = [x for x in pagepredictions]

    # Then count genres.
    genrecounts, maxgenre = sequence_to_counts(genresequence)

    if genrecounts['fic'] > 0 and genrecounts['fic'] < (len(genresequence) / 3):
        notfiction = True
    else:
        notfiction = False

    if genrecounts['dra'] > 0 and (genrecounts['non'] > len(genresequence) / 2 or genrecounts['fic'] > len(genresequence) / 2 or genrecounts['poe'] > len(genresequence) * .9):
        notdrama = True
    else:
        notdrama = False

    # Use those counts to decide whether the volume is more than 50% drama and/or poetry.
    if (genrecounts['dra'] + genrecounts['poe']) > (len(genresequence) / 2):
        mostlydrapoe = True
    else:
        mostlydrapoe = False

    # One other flag will be governed by existing metadata.
    probablyfiction = False
    probablybiography = False

    htid = utils.pairtreelabel(htid)
    # convert the clean pairtree filename into a dirty pairtree label for metadata matching

    if htid not in rowindices:
        # We have no metadata for this volume.
        print("Volume missing from ExtractedMetadata.tsv: " + htid)
    else:
        genrestring = metadata["genres"][htid]
        genreinfo = genrestring.split(";")
        # It's a semicolon-delimited list of items.

        for info in genreinfo:
            if info == "Biography" or info == "Autobiography":
                probablybiography = True
            if info == "Fiction" or info == "Novel":
                probablyfiction = True
            if info == "Poetry" or info == "Poems":
                mostlydrapoe = True
            if info == "Drama" or info == "Tragedies" or info == "Comedies":
                mostlydrapoe = True

        title = metadata["title"][htid].lower()
        titlewords = title.split()

        if "poems" in titlewords or "ballads" in titlewords or "poetical" in titlewords:
            mostlydrapoe = True

        if "comedy" in titlewords or "tragedy" in titlewords or "plays" in titlewords:
            mostlydrapoe = True

    return mostlydrapoe, probablybiography, probablyfiction, notdrama, notfiction
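# sequence_to_counts() is called above (and in metadata_check below) but not
# defined in this excerpt. From the call sites, it tallies how often each
# genre code appears in a page sequence and also reports the most common code.
# This is a minimal sketch under those assumptions, including zero entries for
# the codes the caller indexes directly; the real helper may fold codes such
# as 'bio', 'index', or 'back' into other categories.
def sequence_to_counts(genresequence):
    '''Return a dictionary of page counts per genre code, plus the code
    that occurs most often in the sequence.'''
    genrecounts = {'fic': 0, 'poe': 0, 'dra': 0, 'non': 0}
    for page in genresequence:
        genrecounts[page] = genrecounts.get(page, 0) + 1
    maxgenre = max(genrecounts, key=genrecounts.get)
    return genrecounts, maxgenre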
import os, sys
import SonicScrewdriver as utils

rowindices, columns, metadata = utils.readtsv("/Users/tunder/Dropbox/pagedata/metascrape/EnrichedMetadata.tsv")

sourcedir = "/Users/tunder/Dropbox/pagedata/newfeatures/oldfeatures/"
dirlist = os.listdir(sourcedir)

htids = list()
ctr = 0

with open("/Users/tunder/Dropbox/pagedata/trainingmeta.tsv", mode="w", encoding="utf-8") as f:
    for filename in dirlist:
        if len(filename) > 7 and not filename.startswith("."):
            stripped = filename[:-7]
            htid = utils.pairtreelabel(stripped)

            outline = ""
            for column in columns:
                outline = outline + metadata[column][htid] + '\t'

            f.write(outline + "\n")
def choose_cascade(htid):
    '''Reads metadata about this volume and uses it to decide
    what metadata-level features should be assigned.'''

    global rowindices, columns, metadata, litlocs, biolocs

    probablydrama = False
    probablypoetry = False
    probablybiography = False
    probablyfiction = False

    # Default probabilities, returned when the volume is missing from the
    # metadata table or has an unrecognized call number.
    litprob = 120
    bioprob = 120

    htid = utils.pairtreelabel(htid)
    # convert the clean pairtree filename into a dirty pairtree label for metadata matching

    if htid not in rowindices:
        # We have no metadata for this volume.
        print("Volume missing from ExtractedMetadata.tsv: " + htid)
    else:
        genrestring = metadata["genres"][htid]
        genreinfo = genrestring.split(";")
        # It's a semicolon-delimited list of items.

        for info in genreinfo:
            if info == "Biography" or info == "Autobiography":
                probablybiography = True
            if info == "Fiction" or info == "Novel":
                probablyfiction = True
            if info == "Poetry" or info == "Poems":
                probablypoetry = True
            if info == "Drama" or info == "Tragedies" or info == "Comedies":
                probablydrama = True

        title = metadata["title"][htid].lower()
        titlewords = title.split()

        if "poems" in titlewords or "ballads" in titlewords or "poetical" in titlewords:
            probablypoetry = True

        loc = metadata["LOCnum"][htid]
        LC = letterpart(loc)

        if LC in litlocs:
            litprob = litlocs[LC]
            print(LC + " lit: " + str(litprob))
        else:
            litprob = 120
            print(LC)

        if LC in biolocs:
            bioprob = biolocs[LC]
            print(LC + " bio: " + str(bioprob))
        else:
            bioprob = 120
            print(LC)

    return probablybiography, probablydrama, probablyfiction, probablypoetry, litprob, bioprob
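# letterpart() is used above to reduce a Library of Congress call number to
# the key used in litlocs and biolocs, but it is not defined in this excerpt.
# The call sites suggest it returns the leading alphabetic portion of the call
# number (e.g. "PR" from "PR4231 .A5"); this sketch is inferred from that
# usage, not the confirmed implementation.
def letterpart(locnum):
    '''Return the initial run of alphabetic characters in a call number.'''
    letters = ''
    for char in locnum:
        if char.isalpha():
            letters += char
        else:
            break
    return letters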
def metadata_check(htid, inputsequence):
    '''Assesses whether previous metadata tend to deny or confirm the thrust of
    page-level genre predictions. For this purpose we use both genre codes
    extracted from the MARC record and the predictions of a volume-level
    probabilistic model.

    Returns a dictionary of "confirmations" that indicate whether metadata
    aligns with page-level predictions in six specific ways.'''

    global options, rowindices, columns, metadata, modelindices, modelcolumns, modeldata

    # make a defensive copy of incoming parameter
    genresequence = [x for x in inputsequence]

    # convert the htid into a dirty pairtree label for metadata matching
    htid = utils.pairtreelabel(htid)

    # Create a dictionary with entries for all possible conditions, initially set negative.
    symptoms = ["weakconfirmation", "weakdenial", "strongconfirmation", "strongdenial", "modelagrees", "modeldisagrees"]

    # The first four of these symptoms reflect metadata extracted from the MARC record. Weakconfirmation and
    # weakdenial are based on flags extracted from controlfield 008, which I find are not very reliable as guides.
    # Strongconfirmation and strongdenial are based on strings extracted from other fields that are more
    # specific and reliable as indications of genre. Modelagrees and modeldisagrees reflect the alignment of
    # page-level predictions with an earlier volume-level model of the corpus.

    confirmations = dict()
    for symptom in symptoms:
        confirmations[symptom] = 0

    genrecounts, maxgenre = sequence_to_counts(genresequence)

    if htid not in rowindices and htid not in modelindices:
        return confirmations

    if htid in rowindices:
        genrestring = metadata["genres"][htid]
        genreinfo = genrestring.split(";")
        # It's a semicolon-delimited list of items.

        for info in genreinfo:
            # if info == "biog?" and maxgenre == "non":
            #     confirmations["weakconfirmation"] = 1
            # if info == "biog?" and maxgenre != "non":
            #     confirmations["weakdenial"] = 1

            if info == "Not fiction" and maxgenre == "non":
                confirmations["weakconfirmation"] = 1
            if info == "Not fiction" and maxgenre == "fic":
                confirmations["weakdenial"] = 1

            if (info == "Fiction" or info == "Novel") and maxgenre == "fic":
                confirmations["strongconfirmation"] = 1
            if (info == "Fiction" or info == "Novel") and maxgenre != "fic":
                confirmations["strongdenial"] = 1

            if info == "Biography" and maxgenre == "non":
                confirmations["strongconfirmation"] = 1
            if info == "Biography" and maxgenre != "non":
                confirmations["strongdenial"] = 1

            if info == "Autobiography" and maxgenre == "non":
                confirmations["strongconfirmation"] = 1
            if info == "Autobiography" and maxgenre != "non":
                confirmations["strongdenial"] = 1

            if (info == "Poetry" or info == "Poems") and maxgenre == "poe":
                confirmations["strongconfirmation"] = 1
            if (info == "Poetry" or info == "Poems") and maxgenre != "poe":
                confirmations["strongdenial"] = 1

            if (info == "Drama" or info == "Tragedies" or info == "Comedies") and maxgenre == "dra":
                confirmations["strongconfirmation"] = 1
            if (info == "Drama" or info == "Tragedies" or info == "Comedies") and maxgenre != "dra":
                confirmations["strongdenial"] = 1

            if (info == "Catalog" or info == "Dictionary" or info == "Bibliographies") and maxgenre == "non":
                confirmations["strongconfirmation"] = 1
                couldbefiction = False
            if (info == "Catalog" or info == "Dictionary" or info == "Bibliographies") and maxgenre != "non":
                confirmations["strongdenial"] = 1
    else:
        print("Skipped.")

    if htid in modelindices:
        modelpredictions = dict()
        for genre, genrecolumn in modeldata.items():
            if not genre in options:
                # this column is not a genre!
                continue
            modelpredictions[genre] = float(genrecolumn[htid])

        predictionlist = utils.sortkeysbyvalue(modelpredictions, whethertoreverse=True)
        modelprediction = predictionlist[0][1]
        modelconfidence = predictionlist[0][0]
        nextclosest = predictionlist[1][0]
        # Take the top prediction.

        # For purposes of this routine, treat biography as nonfiction:
        if modelprediction == "bio":
            modelprediction = "non"

        if maxgenre == modelprediction:
            confirmations["modelagrees"] = 1  ## modelconfidence - nextclosest
            confirmations["modeldisagrees"] = 0
        if maxgenre != modelprediction:
            ## divergence = modelconfidence - modelpredictions[maxgenre]
            confirmations["modeldisagrees"] = 1
            confirmations["modelagrees"] = 0
            ## print(maxgenre + " ≠ " + modelprediction)
    else:
        confirmations["modelagrees"] = 0
        confirmations["modeldisagrees"] = 0
        modelprediction = "unknown"

    return confirmations
predictions = logitpredict(parameters, data)

# with open("/Volumes/TARDIS/output/models/results.txt", mode="w") as f:
#     for idx, prediction in enumerate(predictions):
#         f.write(str(idx) + '\t' + data.index[idx] + '\t' + str(prediction) + '\n')

# This will also do it more easily:
# with open("/Volumes/TARDIS/output/models/PredictAccuracy.p", mode="r+b") as f:
#     model = pickle.load(f)
#     otherpredictions = model.predict(data)

import SonicScrewdriver as utils

indices = [utils.pairtreelabel(x) for x in data.index]

decorated = [x for x in zip(predictions, indices)]
decorated.sort()
sortedpredictions, sortedindices = zip(*decorated)

with open("/Users/tunder/Dropbox/PythonScripts/hathimeta/ExtractedMetadata.tsv", mode="r", encoding="utf-8") as f:
    filelines = f.readlines()

linedict = dict()
for line in filelines[1:]:
    line = line.rstrip()
    fields = line.split('\t')
    headlessline = '\t'.join(fields[1:])
    linedict[fields[0]] = headlessline
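# logitpredict() is called at the top of this excerpt but defined elsewhere.
# Given the name and the fact that it maps a parameter vector and a feature
# matrix to one prediction per row, a plausible sketch is a logistic transform
# of a linear combination; the real function may handle intercepts, column
# ordering, or scaling differently.
import numpy as np

def logitpredict(parameters, data):
    '''Return predicted probabilities for each row of a feature matrix,
    assuming parameters is a coefficient vector aligned with the columns.'''
    linear = np.dot(np.asarray(data, dtype=float), np.asarray(parameters, dtype=float))
    return 1.0 / (1.0 + np.exp(-linear))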
# The block below runs once per volume inside enclosing loops (not shown here)
# that define htid, reading, readera, listoftuples, and the running totals.
for anotherreading in listoftuples[1:]:
    readerb = anotherreading[0]
    alreadychecked.append((readerb, readera))

    if readera == readerb or (readera, readerb) in alreadychecked:
        continue
    else:
        genrelistA = reading[1]
        genrelistB = anotherreading[1]

        divergence = comparelists(genrelistA, genrelistB, genremistakes)
        totaldivergence += divergence
        sanitycheck += 1

assert graphlinks == sanitycheck

agreement = (potentialcomparisons - totaldivergence)
agreementpercent = agreement / potentialcomparisons
volumepercents[htid] = agreementpercent

overallcomparisons += potentialcomparisons
overallagreement += agreement

# After all volumes have been processed:
print("Average human agreement: " + str(overallagreement / overallcomparisons))

with open("/Users/tunder/Dropbox/pagedata/interrater/HumanAgreement.tsv", mode="w", encoding="utf-8") as f:
    f.write("htid\tagreement\n")
    for key, value in volumepercents.items():
        outline = utils.pairtreelabel(key) + "\t" + str(value) + "\n"
        f.write(outline)
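# comparelists() is called above but defined elsewhere. From this call site it
# appears to return the number of pages on which two readers' genre lists
# disagree, while tallying the specific mismatches in the genremistakes
# dictionary. The sketch below follows that reading; the module's own version
# (and the five-argument variant used in the dissensus script further down)
# may record or weight disagreements differently.
def comparelists(genrelistA, genrelistB, genremistakes):
    '''Count page-level disagreements between two genre sequences and record
    each mismatched pair of labels in genremistakes.'''
    divergence = 0
    for genreA, genreB in zip(genrelistA, genrelistB):
        if genreA != genreB:
            divergence += 1
            genremistakes[(genreA, genreB)] = genremistakes.get((genreA, genreB), 0) + 1
    return divergence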
def passfilter(genrestring):
    fields = genrestring.split(';')
    if "Autobiography" in fields or "Biography" in fields:
        return False
    else:
        return True

rows19c, columns19c, table19c = utils.readtsv('/Volumes/TARDIS/work/metadata/19cMetadata.tsv')
rows20c, columns20c, table20c = utils.readtsv('/Volumes/TARDIS/work/metadata/20cMonographMetadata.tsv')

with open("/Users/tunder/Dropbox/GenreProject/python/piketty/roughfiction.txt", encoding='utf-8') as f:
    filelines = f.readlines()

idlist = [utils.pairtreelabel(x.split('\t')[0]) for x in filelines]

filteredrows = list()
missing = 0

for anid in idlist:
    if anid in rows19c:
        genrestring = table19c["genres"][anid]
        rowdict = dict()
        for col in columns19c:
            rowdict[col] = table19c[col][anid]
    elif anid in rows20c:
        genrestring = table20c["genres"][anid]
        rowdict = dict()
        for col in columns20c:
import os, sys
import SonicScrewdriver as utils
import random

rowindices, columns, metadata = utils.readtsv("/Users/tunder/Dropbox/PythonScripts/hathimeta/ExtractedMetadata.tsv")

initialsample = random.sample(rowindices, 2000)

directorylist = os.listdir("/Users/tunder/Dropbox/pagedata/mixedtraining/pagefeatures")
existingfiles = list()

for filename in directorylist:
    if filename.startswith(".") or filename.startswith("_"):
        continue
    htid = utils.pairtreelabel(filename[0:-7])
    existingfiles.append(htid)

counter = 0
toremove = list()
for htid in initialsample:
    if htid in existingfiles:
        counter += 1
        toremove.append(htid)

print("Found " + str(counter) + " duplicates.")

for htid in toremove:
    initialsample.remove(htid)

genresrepresented = set()

for htid in initialsample:
def extractgenres(pathtotarfile, rows, columns, table):
    '''Given a tarfile containing a bunch of jsons, this goes through all the jsons
    and identifies the ones that belong in filtered subsets for fiction, drama,
    and poetry. The cutoff is 95 percent precision, except for poetry, where it's
    93.9, because the 95-percent threshold is hard to reach. We also write metadata
    for all jsons where maxgenre is drama, fiction, or poetry, including those that
    didn't reach threshold.
    '''
    fiction = list()
    drama = list()
    poetry = list()

    ficmeta = list()
    drameta = list()
    poemeta = list()

    tar = tarfile.open(pathtotarfile, 'r:gz')

    counter = 0
    for tarinfo in tar:
        counter += 1

        if tarinfo.isreg():
            # This is the name of a regular file rather than a directory.

            tardata = tar.extractfile(tarinfo.name)
            somebytes = tardata.read()
            astring = somebytes.decode('utf-8', 'strict')
            jobj = json.loads(astring)

            meta = jobj['hathi_metadata']
            stringdate = meta['inferred_date']
            htid = meta['htid']
            dirtyhtid = utils.pairtreelabel(htid)
            filename = htid + '.json'

            pathparts = tarinfo.name.split('/')
            if filename != pathparts[1]:
                print(filename)
                print('Is anomalous, because not equal to ' + pathparts[1])

            try:
                intdate = int(stringdate)
            except:
                intdate = 0
                print('Anomalous non-numeric date.')

            if 'drama' in jobj:
                dramadata = jobj['drama']
                precision = dramadata['dra_precision@prob']
                probability = dramadata['prob_dra>80precise']

                if precision >= 0.95:
                    drama.append((intdate, filename, astring))
                    included = True
                else:
                    included = False

                if dirtyhtid in rows:
                    drameta.append(make_outrow(htid, dirtyhtid, probability, included, columns, table))
                else:
                    print('Missing htid: ' + htid)

            if 'fiction' in jobj:
                ficdata = jobj['fiction']
                precision = ficdata['fic_precision@prob']
                probability = ficdata['prob_fic>80precise']

                if precision >= 0.95:
                    fiction.append((intdate, filename, astring))
                    included = True
                else:
                    included = False

                if dirtyhtid in rows:
                    ficmeta.append(make_outrow(htid, dirtyhtid, probability, included, columns, table))
                else:
                    print('Missing htid: ' + htid)

            if 'poetry' in jobj:
                poedata = jobj['poetry']
                precision = poedata['poe_precision@prob']
                probability = poedata['prob_poe>80precise']

                if precision >= 0.939:
                    poetry.append((intdate, filename, astring))
                    included = True
                else:
                    included = False

                if dirtyhtid in rows:
                    poemeta.append(make_outrow(htid, dirtyhtid, probability, included, columns, table))

    tar.close()

    with open('/Volumes/TARDIS/maps/drama/drama_metadata.csv', mode='a', encoding='utf-8') as f:
        writer = csv.writer(f)
        for row in drameta:
            writer.writerow(row)

    with open('/Volumes/TARDIS/maps/fiction/fiction_metadata.csv', mode='a', encoding='utf-8') as f:
        writer = csv.writer(f)
        for row in ficmeta:
            writer.writerow(row)

    with open('/Volumes/TARDIS/maps/poetry/poetry_metadata.csv', mode='a', encoding='utf-8') as f:
        writer = csv.writer(f)
        for row in poemeta:
            writer.writerow(row)

    return drama, fiction, poetry
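# make_outrow() is called in extractgenres() but defined elsewhere. Judging
# from the arguments, it assembles one CSV row combining the HathiTrust id,
# the model probability, the inclusion flag, and selected columns from the
# metadata table. The sketch below is an assumption about that shape, not the
# confirmed column order used in the real metadata files.
def make_outrow(htid, dirtyhtid, probability, included, columns, table):
    '''Build a flat list representing one row of genre metadata output.'''
    outrow = [htid, probability, included]
    for col in columns:
        outrow.append(table[col][dirtyhtid])
    return outrow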
# good reason.

alltargetwords = moneywords

sourcedir = "/Users/tunder/Dropbox/GenreProject/python/piketty2/anova/"
filelist = os.listdir(sourcedir)
filelist = [x for x in filelist if x.endswith(".txt")]

contexts = []

WINDOWRADIUS = 12

ctr = 0

for filename in filelist:
    htid = utils.pairtreelabel(filename.replace('.norm.txt', ''))

    if htid not in rows:
        print(htid + ' MISSING')
        continue
    else:
        date = utils.simple_date(htid, table)

    filepath = os.path.join(sourcedir, filename)
    with open(filepath, encoding='utf-8') as f:
        filelines = f.readlines()
    pagelist = [filelines]

    tokenstream = tokenizer.makestream(pagelist)
    newcontexts = tokenizer.extract_snippets(tokenstream, WINDOWRADIUS, alltargetwords)
# The block below runs once per volume inside an enclosing loop (not shown)
# that defines htid, listoftuples, truegenres, wordcounts, and the running totals.
for reading in listoftuples:
    readera = reading[0]
    predictedgenres = reading[1]

    divergence = comparelists(predictedgenres, truegenres, genremistakes, correctbygenre, wordcounts)

    totaldivergence += divergence

agreement = (potentialcomparisons - totaldivergence)
agreementpercent = agreement / potentialcomparisons
volumepercents[htid] = agreementpercent

overallcomparisons += potentialcomparisons
overallagreement += agreement

# After all volumes have been processed:
print("Average human agreement: " + str(overallagreement / overallcomparisons))

with open("/Users/tunder/Dropbox/pagedata/interrater/HumanDissensus.tsv", mode="w", encoding="utf-8") as f:
    f.write("htid\tagreement\n")
    for key, value in volumepercents.items():
        outline = utils.pairtreelabel(key) + "\t" + str(value) + "\n"
        f.write(outline)

import ConfusionMatrix
ConfusionMatrix.confusion_matrix(correctbygenre, genremistakes)
def censor(htid, genresequence):

    # convert the htid into a dirty pairtree label for metadata matching
    htid = utils.pairtreelabel(htid)

    # Create a dictionary with entries for all possible conditions, initially set negative.
    symptoms = ["weakconfirmation", "weakdenial", "strongconfirmation", "strongdenial", "modelagrees", "modeldisagrees"]
    reported = dict()
    for symptom in symptoms:
        reported[symptom] = 0

    couldbefiction = True

    # Now we need to assess the largest genre in this volume.
    genrecounts = dict()
    genrecounts['fic'] = 0
    genrecounts['poe'] = 0
    genrecounts['dra'] = 0
    genrecounts['non'] = 0

    for page in genresequence:
        indexas = page

        # For this purpose, we treat biography and indexes as equivalent to nonfiction.
        if page == "bio" or page == "index" or page == "back":
            indexas = "non"

        utils.addtodict(indexas, 1, genrecounts)

    # Convert the dictionary of counts into a sorted list, and take the max.
    genretuples = utils.sortkeysbyvalue(genrecounts, whethertoreverse=True)
    maxgenre = genretuples[0][1]

    if htid not in rowindices and htid not in modelindices:
        return genresequence, reported

    if htid in rowindices:
        genrestring = metadata["genres"][htid]
        genreinfo = genrestring.split(";")
        # It's a semicolon-delimited list of items.

        for info in genreinfo:
            if info == "Biography" or info == "Autobiography":
                couldbefiction = False

            if info == "biog?" and maxgenre == "non":
                reported["weakconfirmation"] = 1
            if info == "biog?" and maxgenre != "non":
                reported["weakdenial"] = 1

            if info == "Not fiction" and maxgenre == "non":
                reported["weakconfirmation"] = 1
            if info == "Not fiction" and maxgenre == "fic":
                reported["weakdenial"] = 1

            if (info == "Fiction" or info == "Novel") and maxgenre == "fic":
                reported["strongconfirmation"] = 1
            if (info == "Fiction" or info == "Novel") and maxgenre != "fic":
                reported["strongdenial"] = 1

            if info == "Biography" and maxgenre == "non":
                reported["strongconfirmation"] = 1
            if info == "Biography" and maxgenre != "non":
                reported["strongdenial"] = 1

            if info == "Autobiography" and maxgenre == "non":
                reported["strongconfirmation"] = 1
            if info == "Autobiography" and maxgenre != "non":
                reported["strongdenial"] = 1

            if (info == "Poetry" or info == "Poems") and maxgenre == "poe":
                reported["strongconfirmation"] = 1
            if (info == "Poetry" or info == "Poems") and maxgenre != "poe":
                reported["strongdenial"] = 1

            if (info == "Drama" or info == "Tragedies" or info == "Comedies") and maxgenre == "dra":
                reported["strongconfirmation"] = 1
            if (info == "Drama" or info == "Tragedies" or info == "Comedies") and maxgenre != "dra":
                reported["strongdenial"] = 1

            if (info == "Catalog" or info == "Dictionary" or info == "Bibliographies") and maxgenre == "non":
                reported["strongconfirmation"] = 1
                couldbefiction = False
            if (info == "Catalog" or info == "Dictionary" or info == "Bibliographies") and maxgenre != "non":
                reported["strongdenial"] = 1
    else:
        print("Skipped.")

    if htid in modelindices:
        modelpredictions = dict()
        for genre, genrecolumn in modeldata.items():
            if not genre in options:
                # this column is not a genre!
                continue
            modelpredictions[genre] = float(genrecolumn[htid])

        predictionlist = utils.sortkeysbyvalue(modelpredictions, whethertoreverse=True)
        modelprediction = predictionlist[0][1]
        modelconfidence = predictionlist[0][0]
        nextclosest = predictionlist[1][0]
        # Take the top prediction.

        # For purposes of this routine, treat biography as nonfiction:
        if modelprediction == "bio":
            modelprediction = "non"

        if maxgenre == modelprediction:
            reported["modelagrees"] = 1  ## modelconfidence - nextclosest
            reported["modeldisagrees"] = 0
        if maxgenre != modelprediction:
            ## divergence = modelconfidence - modelpredictions[maxgenre]
            reported["modeldisagrees"] = 1
            reported["modelagrees"] = 0
            ## print(maxgenre + " ≠ " + modelprediction)
    else:
        reported["modelagrees"] = 0
        reported["modeldisagrees"] = 0
        modelprediction = "unknown"

    if not couldbefiction:
        numberofpages = len(genresequence)
        for i in range(numberofpages):
            if genresequence[i] == "fic":
                genresequence[i] = "non"

    return genresequence, reported
HTIDs = list()

for filename in genrefiles:
    if not filename.endswith(".map"):
        continue
    else:
        parts = filename.split(".map")
        htid = parts[0]
        HTIDs.append(htid)
        # Take the part before the extension as the HTID

ficgenre = dict()

for htid in HTIDs:
    dirtyid = utils.pairtreelabel(htid)

    if dirtyid in rowindices:
        genrestring = metadata["genres"][dirtyid]
        genreinfo = genrestring.split(";")
        if "Fiction" in genreinfo or "Novel" in genreinfo:
            ficgenre[htid] = True
        else:
            ficgenre[htid] = False

        callno = metadata["LOCnum"][dirtyid]
        LC = letterpart(callno)

        if LC in litlocs:
            litprob = litlocs[LC]
            print(LC + " lit: " + str(litprob))
# We can perhaps enumerate currency terms intuitively, but not these.

alltargetwords = moneywords

sourcedir = "/Volumes/TARDIS/work/moneytexts/"
filelist = os.listdir(sourcedir)
filelist = [x for x in filelist if x.endswith(".txt")]

contexts = []

WINDOWRADIUS = 7

ctr = 0

for filename in filelist:
    htid = utils.pairtreelabel(filename.replace('.fic.txt', ''))

    if htid not in rows:
        print(htid)
        continue
    else:
        date = utils.simple_date(htid, table)

    filepath = os.path.join(sourcedir, filename)
    with open(filepath, encoding='utf-8') as f:
        filelines = f.readlines()
    pagelist = [filelines]

    # The wordcounter module expects a list of pages, each of which is a list of lines.
    # Ebooks have no pages -- at least as I currently receive them -- so we treat it
    # all as one giant page.
validnames = list()

for filename in dirlist:
    if not (filename.startswith(".") or filename.startswith("_")):
        validnames.append(filename)

for filename in validnames:
    filepath = os.path.join(sourcedirectory, filename)

    with open(filepath, mode="r", encoding="utf-8") as f:
        filelines = f.readlines()

    numpages = len(filelines)

    htid = utils.pairtreelabel(filename[0:-4])
    # convert the htid into a dirty pairtree label for metadata matching

    genre = "unknown"

    if htid in rowindices:
        genrestring = metadata["genres"][htid]
        genreinfo = genrestring.split(";")
        # It's a semicolon-delimited list of items.

        for info in genreinfo:
            if info == "Not fiction":
                genre = "non"