def sequence_to_counts(genresequence):
    '''Tally page-level genre predictions for a volume.

    Returns a 2-tuple: a dictionary of page counts per genre (always
    containing 'fic', 'poe', 'dra' and 'non', even when zero) and the
    genre that heads the sorted list of counts.
    '''

    # Pages carrying any of these labels are folded into nonfiction.
    folded_into_non = ("bio", "index", "back", "trv")

    genrecounts = {'fic': 0, 'poe': 0, 'dra': 0, 'non': 0}

    for page in genresequence:
        label = "non" if page in folded_into_non else page
        utils.addtodict(label, 1, genrecounts)

    # Sort the counts in descending order; the first tuple holds the
    # largest genre in its second position.
    ranked = utils.sortkeysbyvalue(genrecounts, whethertoreverse=True)
    return genrecounts, ranked[0][1]
def select_common_features(trainingset, n):
    '''Return the n most frequent features across the training set.

    A deliberately simple feature-selection strategy: the raw counts of
    every volume are summed and the features with the largest totals win.
    If fewer than n features exist, all of them are returned and a
    message is printed.
    '''

    # Merge every volume's raw counts into one master dictionary.
    totals = dict()
    for volume in trainingset:
        utils.add_dicts(volume.rawcounts, totals)

    # A list of (frequency, word) pairs, most frequent first.
    ranked = utils.sortkeysbyvalue(totals, whethertoreverse=True)

    available = len(ranked)
    if n > available:
        n = available
        print("We only have " + str(n) + " features.")

    # Keep only the word from each of the first n pairs.
    return [pair[1] for pair in ranked[:n]]
def sequence_to_counts(genresequence):
    '''Tally page-level genre predictions and report the largest genre.

    Returns (genrecounts, maxgenre). Biography pages are counted under
    'bio' and additionally under 'non', and 'bio' is never reported as
    the largest genre: it is replaced by 'non'.
    '''

    genrecounts = {}

    for label in genresequence:
        utils.addtodict(label, 1, genrecounts)
        if label == 'bio':
            # Every vote for biography is also a vote for nonfiction.
            utils.addtodict('non', 1, genrecounts)

    # Descending sort; the head of the list is the largest genre.
    ranked = utils.sortkeysbyvalue(genrecounts, whethertoreverse=True)
    top = ranked[0][1]

    return genrecounts, ('non' if top == 'bio' else top)
def sequence_to_counts(genresequence):
    '''Tally page-level genre predictions for a volume.

    Returns a 2-tuple: a dictionary of page counts per genre (always
    containing 'fic', 'poe', 'dra' and 'non', even when zero) and the
    genre that heads the sorted list of counts.
    '''

    # Pages carrying any of these labels are folded into nonfiction.
    folded_into_non = ("bio", "index", "back", "trv")

    genrecounts = {'fic': 0, 'poe': 0, 'dra': 0, 'non': 0}

    for page in genresequence:
        label = "non" if page in folded_into_non else page
        utils.addtodict(label, 1, genrecounts)

    # Sort the counts in descending order; the first tuple holds the
    # largest genre in its second position.
    ranked = utils.sortkeysbyvalue(genrecounts, whethertoreverse=True)
    return genrecounts, ranked[0][1]
def select_common_features(trainingset, n):
    '''Return the n most frequent features across the training set.

    A deliberately simple feature-selection strategy: the raw counts of
    every volume are summed and the features with the largest totals win.
    If fewer than n features exist, all of them are returned and a
    message is printed.
    '''

    # Merge every volume's raw counts into one master dictionary.
    totals = dict()
    for volume in trainingset:
        utils.add_dicts(volume.rawcounts, totals)

    # A list of (frequency, word) pairs, most frequent first.
    ranked = utils.sortkeysbyvalue(totals, whethertoreverse=True)

    available = len(ranked)
    if n > available:
        n = available
        print("We only have " + str(n) + " features.")

    # Keep only the word from each of the first n pairs.
    return [pair[1] for pair in ranked[:n]]
def sequence_to_counts(genresequence):
    '''Tally page-level genre predictions and report the largest genre.

    Returns (genrecounts, maxgenre). Biography pages are counted under
    'bio' and additionally under 'non', and 'bio' is never reported as
    the largest genre: it is replaced by 'non'.
    '''

    genrecounts = {}

    for label in genresequence:
        utils.addtodict(label, 1, genrecounts)
        if label == 'bio':
            # Every vote for biography is also a vote for nonfiction.
            utils.addtodict('non', 1, genrecounts)

    # Descending sort; the head of the list is the largest genre.
    ranked = utils.sortkeysbyvalue(genrecounts, whethertoreverse=True)
    top = ranked[0][1]

    return genrecounts, ('non' if top == 'bio' else top)
def comparelists(firstmap, secondmap, genremistakes, correctbygenre, wordcounts):
    '''Compare two page-level genre maps and measure how far they diverge.

    firstmap, secondmap -- sequences of page genres for the same volume.
    genremistakes -- dict updated in place; keyed by
        (generalized second genre, generalized first genre), it receives
        the word count of every page where the maps disagree.
    correctbygenre -- dict updated in place; keyed by generalized genre,
        it receives the word count of every page where the maps agree.
    wordcounts -- per-page word counts, indexed like the maps.

    Returns the total word count of the disagreeing pages, as a float.
    '''

    if len(firstmap) < len(secondmap):
        print("Error, Will Robinson. There are occasions where the consensus version is shorter but no valid reason for it to be longer.")

    # Only the pages covered by both maps can be compared. Previously the
    # warning branch above left the length undefined, so the loop below
    # raised UnboundLocalError instead of carrying on after the warning.
    length = min(len(firstmap), len(secondmap))

    divergence = 0.0

    for i in range(length):
        generalizedfirst = translate(firstmap[i])
        generalizedsecond = translate(secondmap[i])

        if effectively_equal(generalizedfirst, generalizedsecond):
            utils.addtodict(generalizedsecond, wordcounts[i], correctbygenre)
        else:
            divergence += wordcounts[i]
            utils.addtodict((generalizedsecond, generalizedfirst), wordcounts[i], genremistakes)

    return divergence
def sequence_to_counts(genresequence):
    '''Count how many pages in a sequence of page-level predictions were
    assigned to each genre.

    Unlike the version in MetadataCascades, this one accepts any genre
    label and does not pre-populate the dictionary with zeros, so genres
    that never occur are simply absent from the result.
    '''

    tally = {}
    for label in genresequence:
        utils.addtodict(label, 1, tally)
    return tally
def get_featureframe(vocabulary, positiveIDs, negativeIDs, sourcedir):
    ''' Returns a pandas dataframe with feature counts for all the volumes
    to be used in this model.

    vocabulary -- ordered list of features; it fixes the column order.
    positiveIDs, negativeIDs -- lists of volume IDs; rows follow the
        order positiveIDs + negativeIDs.
    sourcedir -- directory holding one <clean docid>.csv per volume, with
        'feature' and 'count' columns.

    The counts are passed through StandardScaler.fit_transform before
    being returned. The returned frame is indexed by volume ID; its
    columns are in vocabulary order but carry default integer labels.
    '''

    vocabset = set(vocabulary)
    allIDs = positiveIDs + negativeIDs

    # We initially construct the data frame as a dictionary of Series.
    df = dict()
    for v in vocabulary:
        df[v] = pd.Series(np.zeros(len(allIDs)), index = allIDs)

    for docid in allIDs:
        path = os.path.join(sourcedir, utils.clean_pairtree(docid) + '.csv')

        # Accumulate this volume's counts first. A word can appear twice
        # in one file (once plain, once with the '#header' prefix); the
        # two rows are summed, as get_vocabulary_and_counts does, instead
        # of letting the later row overwrite the earlier one.
        doccounts = dict()
        with open(path, encoding = 'utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:
                feature = row['feature']
                if feature.startswith('#header'):
                    feature = feature.replace('#header', '')
                if feature in vocabset:
                    # Convert explicitly: the CSV reader yields strings,
                    # and the Series are float-valued.
                    doccounts[feature] = doccounts.get(feature, 0.0) + float(row['count'])

        for feature, count in doccounts.items():
            df[feature].loc[docid] = count

    # Now let's refashion the dictionary as an actual dataframe.
    df = pd.DataFrame(df, index = allIDs)
    df = df[vocabulary]
    # This reorders the columns to be in vocab order

    stdscaler = StandardScaler()
    scaleddf = pd.DataFrame(stdscaler.fit_transform(df), index = allIDs)

    return scaleddf
def get_vocabulary_and_counts_4pages(metadata, allIDs, sourcedir, n):
    ''' Page-level counterpart of get_vocabulary_and_counts.

    Every page of every volume becomes its own instance, identified as
    '<docid>||<page index>'. Returns a 3-tuple:

    vocab -- the n features that occur on the most pages
    counts -- page id => that page's feature counts, kept so the files
        need not be read again when building a feature dataframe
    id2group -- page id => docid of the volume the page belongs to

    The metadata argument is not used here.
    '''

    page_freq = Counter()
    counts = dict()
    id2group = dict()

    for docid in allIDs:
        filename = utils.clean_pairtree(docid) + '.basic.json.bz2'
        volume = parser.PagelistFromJson(os.path.join(sourcedir, filename), docid)

        for idx, page in enumerate(volume.get_feature_list()):
            pageid = docid + '||' + str(idx)
            id2group[pageid] = docid
            counts[pageid] = page

            # Each feature present on the page counts once, whatever
            # its value.
            for key, _ in page.items():
                page_freq[key] += 1

    vocab = [entry[0] for entry in page_freq.most_common(n)]
    print('Vocabulary constructed.')

    return vocab, counts, id2group
def add_to_ficgenre(docid, existingfile, tagas):
    '''Append one volume's metadata row to an existing CSV file.

    Most fields are copied from the global metadata entry for docid;
    first-publication date, author birth year, nationality and gender
    are requested interactively from the user. The row is tagged with
    tagas in its 'genretags' column, and any '<blank>' placeholder is
    written as an empty string.
    '''
    global outfieldnames, metadata

    with open(existingfile, mode = 'a', encoding = 'utf-8') as f:
        writer = csv.DictWriter(f, fieldnames = outfieldnames)
        j = metadata[docid]

        # Show the user which volume is being described before asking
        # the questions below.
        summary = [j['HTid'], str(j['date']), j['author'], j['title'], j['enumcron']]
        print(" | ".join(summary))

        # Entries are evaluated top to bottom, so the prompts appear in
        # this order.
        o = {
            'docid': utils.clean_pairtree(j['HTid']),
            'recordid': j['recordid'],
            'oclc': j['OCLC'],
            'locnum': j['LOCnum'],
            'author': j['author'],
            'imprint': j['imprint'],
            'date': j['date'],
            'firstpub': input('First publication date? '),
            'birthdate': input('Author birth year? '),
            'nationality': input('Nationality? '),
            'gender': input('Gender? '),
            'title': j['title'],
            'subjects': j['subjects'],
            'enumcron': j['enumcron'],
            'genretags': tagas,
        }

        for key in o:
            if o[key] == '<blank>':
                o[key] = ''

        writer.writerow(o)

    print('Done.')
def get_classvector(classpath, volumeIDs):
    '''Build a vector of integer class labels for a list of volumes.

    classpath -- tab-separated file; the first field of each line is a
        volume ID and the second its class ('elite' => 1, 'vulgar' => 0,
        anything else is parsed as an integer).
    volumeIDs -- the volumes wanted, in order. If empty, every volume
        found in the class file is used.

    Returns (classvector, volumeIDs). Volumes absent from the class file
    are reported on stdout and keep the value 0.
    '''
    named_classes = {'elite': 1, 'vulgar': 0}

    with open(classpath, encoding='utf-8') as f:
        filelines = f.readlines()

    classdict = dict()
    for rawline in filelines:
        fields = rawline.rstrip().split('\t')
        volid = utils.clean_pairtree(fields[0])
        label = fields[1]
        if label in named_classes:
            classdict[volid] = named_classes[label]
        else:
            classdict[volid] = int(label)

    if len(volumeIDs) < 1:
        volumeIDs = list(classdict.keys())

    classvector = np.zeros(len(volumeIDs))
    for idx, anid in enumerate(volumeIDs):
        if anid not in classdict:
            print('Missing from class metadata: ' + anid)
            continue
        classvector[idx] = classdict[anid]

    return classvector, volumeIDs
def get_featureframe(vocabulary, positiveIDs, negativeIDs, sourcedir):
    ''' Returns a pandas dataframe with feature counts for all the volumes
    to be used in this model.

    vocabulary -- ordered list of features; it fixes the column order.
    positiveIDs, negativeIDs -- lists of volume IDs; rows follow the
        order positiveIDs + negativeIDs.
    sourcedir -- directory holding one <clean docid>.csv per volume, with
        'feature' and 'count' columns.

    The counts are passed through StandardScaler.fit_transform before
    being returned. The returned frame is indexed by volume ID; its
    columns are in vocabulary order but carry default integer labels.
    '''

    vocabset = set(vocabulary)
    allIDs = positiveIDs + negativeIDs

    # We initially construct the data frame as a dictionary of Series.
    df = dict()
    for v in vocabulary:
        df[v] = pd.Series(np.zeros(len(allIDs)), index=allIDs)

    for docid in allIDs:
        path = os.path.join(sourcedir, utils.clean_pairtree(docid) + '.csv')

        # Accumulate this volume's counts first. A word can appear twice
        # in one file (once plain, once with the '#header' prefix); the
        # two rows are summed, as get_vocabulary_and_counts does, instead
        # of letting the later row overwrite the earlier one.
        doccounts = dict()
        with open(path, encoding='utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:
                feature = row['feature']
                if feature.startswith('#header'):
                    feature = feature.replace('#header', '')
                if feature in vocabset:
                    # Convert explicitly: the CSV reader yields strings,
                    # and the Series are float-valued.
                    doccounts[feature] = doccounts.get(feature, 0.0) + float(row['count'])

        for feature, count in doccounts.items():
            df[feature].loc[docid] = count

    # Now let's refashion the dictionary as an actual dataframe.
    df = pd.DataFrame(df, index=allIDs)
    df = df[vocabulary]
    # This reorders the columns to be in vocab order

    stdscaler = StandardScaler()
    scaleddf = pd.DataFrame(stdscaler.fit_transform(df), index=allIDs)

    return scaleddf
def get_classvector(classpath, volumeIDs):
    '''Build a vector of integer class labels for a list of volumes.

    classpath -- tab-separated file; the first field of each line is a
        volume ID and the second its class ('elite' => 1, 'vulgar' => 0,
        anything else is parsed as an integer).
    volumeIDs -- the volumes wanted, in order. If empty, every volume
        found in the class file is used.

    Returns (classvector, volumeIDs). Volumes absent from the class file
    are reported on stdout and keep the value 0.
    '''
    named_classes = {'elite': 1, 'vulgar': 0}

    with open(classpath, encoding = 'utf-8') as f:
        filelines = f.readlines()

    classdict = dict()
    for rawline in filelines:
        fields = rawline.rstrip().split('\t')
        volid = utils.clean_pairtree(fields[0])
        label = fields[1]
        if label in named_classes:
            classdict[volid] = named_classes[label]
        else:
            classdict[volid] = int(label)

    if len(volumeIDs) < 1:
        volumeIDs = list(classdict.keys())

    classvector = np.zeros(len(volumeIDs))
    for idx, anid in enumerate(volumeIDs):
        if anid not in classdict:
            print('Missing from class metadata: ' + anid)
            continue
        classvector[idx] = classdict[anid]

    return classvector, volumeIDs
def choose_cascade(htid):
    '''Reads metadata about this volume and uses it to decide what
    metadata-level features should be assigned.

    Returns five booleans, in this order:
    probablybiography, probablydrama, probablyfiction, probablypoetry,
    maybefiction.

    The first four can be set by genre tags in the volume's metadata;
    drama, poetry and maybefiction can also be set by the volume-level
    model predictions when the volume appears in modelindices. All five
    are False when the volume has no metadata row.
    '''
    global rowindices, columns, metadata, modelindices, modeldata

    probablydrama = False
    probablypoetry = False
    probablybiography = False
    probablyfiction = False
    maybefiction = False

    htid = utils.pairtreelabel(htid)
    # convert the clean pairtree filename into a dirty pairtree label for metadata matching

    if htid not in rowindices:
        # We have no metadata for this volume.
        print("Volume missing from ExtractedMetadata.tsv: " + htid)

    else:
        genrestring = metadata["genres"][htid]
        genreinfo = genrestring.split(";")
        # It's a semicolon-delimited list of items.

        for info in genreinfo:

            if info == "Biography" or info == "Autobiography":
                probablybiography = True

            if info == "Fiction" or info == "Novel":
                probablyfiction = True

            if (info == "Poetry" or info == "Poems"):
                probablypoetry = True

            if (info == "Drama" or info == "Tragedies" or info == "Comedies"):
                probablydrama = True

        if htid in modelindices:

            title = metadata["title"][htid].lower()
            titlewords = title.split()

            # The options are passed in the order bio, dra, fic, non, poe;
            # the comparisons below treat the result as a position in that
            # tuple (1 = dra, 2 = fic, 4 = poe).
            # NOTE(review): assumes maxoption returns the index of the
            # largest value -- confirm against its definition.
            maxgenre = maxoption((modeldata["bio"][htid], modeldata["dra"][htid], modeldata["fic"][htid], modeldata["non"][htid], modeldata["poe"][htid]))

            # Parenthesised deliberately: "and" binds tighter than "or",
            # so without the parentheses a title containing "poetical"
            # set the flag whatever the model's top genre was.
            if maxgenre == 4 and ("poems" in titlewords or "poetical" in titlewords):
                probablypoetry = True

            if maxgenre == 1:
                probablydrama = True

            if maxgenre == 2:
                maybefiction = True

    return probablybiography, probablydrama, probablyfiction, probablypoetry, maybefiction
def compare_two_lists(truelist, predicted, wordsperpage, whethertocountwords):
    '''Score a list of predicted page genres against the true genres.

    truelist, predicted -- page genres, which must be equally long.
    wordsperpage -- per-page word counts; consulted only when
        whethertocountwords is true.
    whethertocountwords -- if true, each page weighs as many units as it
        has words; otherwise every page weighs 1.

    True genres found in the global genretranslations table are
    translated before comparison. Returns, in order: totals by true
    genre, correct totals by genre, error totals keyed by
    (true genre, predicted genre), the accurate total and the
    inaccurate total.
    '''
    assert (len(truelist) == len(predicted))

    totaltruegenre, correctbygenre, errorsbygenre = dict(), dict(), dict()
    accurate = inaccurate = 0

    for index, rawgenre in enumerate(truelist):
        truegenre = genretranslations[rawgenre] if rawgenre in genretranslations else rawgenre
        increment = wordsperpage[index] if whethertocountwords else 1

        utils.addtodict(truegenre, increment, totaltruegenre)

        predictedgenre = predicted[index]

        if not genresareequal(truegenre, predictedgenre):
            utils.addtodict((truegenre, predictedgenre), increment, errorsbygenre)
            inaccurate += increment
            continue

        utils.addtodict(truegenre, increment, correctbygenre)
        accurate += increment

    return totaltruegenre, correctbygenre, errorsbygenre, accurate, inaccurate
def compare_two_lists(truelist, predicted, wordsperpage, whethertocountwords):
    '''Score a list of predicted page genres against the true genres.

    truelist, predicted -- page genres, which must be equally long.
    wordsperpage -- per-page word counts; consulted only when
        whethertocountwords is true.
    whethertocountwords -- if true, each page weighs as many units as it
        has words; otherwise every page weighs 1.

    True genres found in the global genretranslations table are
    translated before comparison. Returns, in order: totals by true
    genre, correct totals by genre, error totals keyed by
    (true genre, predicted genre), the accurate total and the
    inaccurate total.
    '''
    assert(len(truelist) == len(predicted))

    totaltruegenre, correctbygenre, errorsbygenre = dict(), dict(), dict()
    accurate = inaccurate = 0

    for index, rawgenre in enumerate(truelist):
        truegenre = genretranslations[rawgenre] if rawgenre in genretranslations else rawgenre
        increment = wordsperpage[index] if whethertocountwords else 1

        utils.addtodict(truegenre, increment, totaltruegenre)

        predictedgenre = predicted[index]

        if not genresareequal(truegenre, predictedgenre):
            utils.addtodict((truegenre, predictedgenre), increment, errorsbygenre)
            inaccurate += increment
            continue

        utils.addtodict(truegenre, increment, correctbygenre)
        accurate += increment

    return totaltruegenre, correctbygenre, errorsbygenre, accurate, inaccurate
def comparelists(firstmap, secondmap, genremistakes, correctbygenre, wordcounts):
    '''Compare two page-level genre maps and measure how far they diverge.

    firstmap, secondmap -- sequences of page genres for the same volume.
    genremistakes -- dict updated in place; keyed by
        (generalized second genre, generalized first genre), it receives
        the word count of every page where the maps disagree.
    correctbygenre -- dict updated in place; keyed by generalized genre,
        it receives the word count of every page where the maps agree.
    wordcounts -- per-page word counts, indexed like the maps.

    Returns the total word count of the disagreeing pages, as a float.
    '''

    if len(firstmap) < len(secondmap):
        print("Error, Will Robinson. There are occasions where the consensus version is shorter but no valid reason for it to be longer.")

    # Only the pages covered by both maps can be compared. Previously the
    # warning branch above left the length undefined, so the loop below
    # raised UnboundLocalError instead of carrying on after the warning.
    length = min(len(firstmap), len(secondmap))

    divergence = 0.0

    for i in range(length):
        generalizedfirst = translate(firstmap[i])
        generalizedsecond = translate(secondmap[i])

        if effectively_equal(generalizedfirst, generalizedsecond):
            utils.addtodict(generalizedsecond, wordcounts[i], correctbygenre)
        else:
            divergence += wordcounts[i]
            utils.addtodict((generalizedsecond, generalizedfirst), wordcounts[i], genremistakes)

    return divergence
def addmetadata(self, row, table):
    '''Copy this volume's metadata from a table onto the instance.

    Sets author, title, date and the set of genre tags (split from the
    semicolon-delimited 'genres' field), plus nonmetaflag, which is True
    when any genre tag is one of the listed varieties of nonfiction.
    '''
    self.author = table['author'][row]
    self.title = table['title'][row]
    self.date = utils.simple_date(row, table)
    self.genres = set(table['genres'][row].split(';'))

    varietiesofnon = ('Bibliographies', 'Catalog', 'Dictionary', 'Encyclopedia', 'Handbooks', 'Indexes', 'Legislation', 'Directories', 'Statistics', 'Legal cases', 'Legal articles', 'Calendars', 'Autobiography', 'Biography', 'Letters', 'Essays', 'Speeches')

    self.nonmetaflag = any(variety in self.genres for variety in varietiesofnon)
def resolve_voting(votes, tiebreaker):
    '''Pick a winner from a list of votes.

    Returns (winner, dissent, runnerup), where dissent is the fraction of
    votes that did not go to the top-ranked candidate. With a single
    candidate, that candidate is also reported as runner-up. When the
    top two candidates are tied, the tiebreaker decides if it names one
    of them; otherwise one of the two is chosen at random.
    '''
    electorate = len(votes)

    tally = dict()
    for vote in votes:
        utils.addtodict(vote, 1, tally)

    # (count, candidate) pairs, highest count first.
    ranked = utils.sortkeysbyvalue(tally, whethertoreverse = True)

    topcount = ranked[0][0]
    leader = ranked[0][1]
    dissent = (electorate - topcount) / electorate

    if len(ranked) < 2:
        # There is only one candidate.
        return leader, dissent, leader

    second = ranked[1][1]

    if topcount > ranked[1][0]:
        # We have a majority.
        return leader, dissent, second

    # We have a tie.
    if tiebreaker == leader:
        print("Tiebreaker " + tiebreaker)
        return leader, dissent, second

    if tiebreaker == second:
        print("Tiebreaker " + tiebreaker)
        return second, dissent, leader

    print("Tie in spite of " + tiebreaker)
    win = random.choice([leader, second])
    runnerup = second if win == leader else leader
    return win, dissent, runnerup
def resolve_voting(votes, tiebreaker):
    '''Pick a winner from a list of votes.

    Returns (winner, dissent, runnerup), where dissent is the fraction of
    votes that did not go to the top-ranked candidate. With a single
    candidate, that candidate is also reported as runner-up. When the
    top two candidates are tied, the tiebreaker decides if it names one
    of them; otherwise one of the two is chosen at random.
    '''
    electorate = len(votes)

    tally = dict()
    for vote in votes:
        utils.addtodict(vote, 1, tally)

    # (count, candidate) pairs, highest count first.
    ranked = utils.sortkeysbyvalue(tally, whethertoreverse=True)

    topcount = ranked[0][0]
    leader = ranked[0][1]
    dissent = (electorate - topcount) / electorate

    if len(ranked) < 2:
        # There is only one candidate.
        return leader, dissent, leader

    second = ranked[1][1]

    if topcount > ranked[1][0]:
        # We have a majority.
        return leader, dissent, second

    # We have a tie.
    if tiebreaker == leader:
        print("Tiebreaker " + tiebreaker)
        return leader, dissent, second

    if tiebreaker == second:
        print("Tiebreaker " + tiebreaker)
        return second, dissent, leader

    print("Tie in spite of " + tiebreaker)
    win = random.choice([leader, second])
    runnerup = second if win == leader else leader
    return win, dissent, runnerup
def get_vocabulary_and_counts(metadata, positiveIDs, negativeIDs, sourcedir, n):
    ''' Gets the top n words by docfrequency in positiveIDs + negativeIDs, but also
    returns a dictionary of wordcounts so we don't have to read them again from the
    file when generating a feature dataframe.

    Each volume is read from <sourcedir>/<clean docid>.csv, which needs
    'feature' and 'count' columns. Rows with an empty feature are skipped.
    The metadata argument is not used here.

    Returns (vocab, counts): vocab is the list of the n words found in the
    most volumes; counts maps each docid to a Counter of word => summed
    count for that volume.
    '''

    allIDs = positiveIDs + negativeIDs

    doc_freq = Counter()
    counts = dict()

    for docid in allIDs:
        doccounts = Counter()
        counts[docid] = doccounts
        path = os.path.join(sourcedir, utils.clean_pairtree(docid) + '.csv')
        with open(path, encoding='utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:

                word = row['feature']
                if len(word) < 1:
                    continue

                ct = float(row['count'])

                if word.startswith('#header'):
                    word = word.replace('#header', '')
                #
                # This debatable choice treats header words as equivalent
                # to occurrences in the body text. In practice, this seems
                # to slightly improve performance, at least when you're using
                # SVMs and relatively low numbers of features (140-300).
                # Otherwise header words are in practice just discarded, because
                # e.g. #headeract won't be one of the top 250 words.

                # Count each word at most once per volume. Because header
                # and body rows are merged above, the same word can turn
                # up twice in one file; counting both rows would inflate
                # its document frequency.
                if word not in doccounts:
                    doc_freq[word] += 1

                doccounts[word] += ct

    vocab = [x[0] for x in doc_freq.most_common(n)]
    print('Vocabulary constructed.')

    return vocab, counts
def get_genrevector(volumeIDs, boundarydef):
    '''Build a 0/1 vector marking which volumes are epistolary.

    Only the boundary definition "nonepistolary / epistolary" is
    recognised; for any other value the vector is returned as all zeros.
    Volumes found in the global epindices get 1, those in nonindices get
    0, and volumes in neither are reported on stdout (and stay 0).
    '''
    global epindices, nonindices

    genrevector = np.zeros(len(volumeIDs))

    if boundarydef == "nonepistolary / epistolary":
        for position, volID in enumerate(volumeIDs):
            label = utils.pairtreelabel(volID)

            if label in epindices:
                genrevector[position] = 1
            elif label in nonindices:
                genrevector[position] = 0
            else:
                print("Error, missing in metadata: " + label)

    return genrevector
def get_vocabulary_and_counts(metadata, positiveIDs, negativeIDs, sourcedir, n):
    ''' Gets the top n words by docfrequency in positiveIDs + negativeIDs, but also
    returns a dictionary of wordcounts so we don't have to read them again from the
    file when generating a feature dataframe.

    Each volume is read from <sourcedir>/<clean docid>.csv, which needs
    'feature' and 'count' columns. Rows with an empty feature are skipped.
    The metadata argument is not used here.

    Returns (vocab, counts): vocab is the list of the n words found in the
    most volumes; counts maps each docid to a Counter of word => summed
    count for that volume.
    '''

    allIDs = positiveIDs + negativeIDs

    doc_freq = Counter()
    counts = dict()

    for docid in allIDs:
        doccounts = Counter()
        counts[docid] = doccounts
        path = os.path.join(sourcedir, utils.clean_pairtree(docid) + '.csv')
        with open(path, encoding = 'utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:

                word = row['feature']
                if len(word) < 1:
                    continue

                ct = float(row['count'])

                if word.startswith('#header'):
                    word = word.replace('#header', '')
                #
                # This debatable choice treats header words as equivalent
                # to occurrences in the body text. In practice, this seems
                # to slightly improve performance, at least when you're using
                # SVMs and relatively low numbers of features (140-300).
                # Otherwise header words are in practice just discarded, because
                # e.g. #headeract won't be one of the top 250 words.

                # Count each word at most once per volume. Because header
                # and body rows are merged above, the same word can turn
                # up twice in one file; counting both rows would inflate
                # its document frequency.
                if word not in doccounts:
                    doc_freq[word] += 1

                doccounts[word] += ct

    vocab = [x[0] for x in doc_freq.most_common(n)]
    print('Vocabulary constructed.')

    return vocab, counts
def get_metadata_evidence(htid, rowindices, columns, metadata):
    '''Report which genres are suggested by this volume's metadata.

    Returns a dictionary with boolean entries for "drama", "poetry",
    "biography" and "fiction". Every entry is False when the volume has
    no metadata row; otherwise an entry becomes True when one of its
    tags appears in the semicolon-delimited "genres" field.
    The columns argument is not used here.
    '''

    evidence = {"drama": False, "poetry": False, "biography": False, "fiction": False}

    # Metadata is keyed by the "dirty" pairtree label, not by the clean
    # pairtree filename.
    htid = utils.pairtreelabel(htid)

    if htid not in rowindices:
        # We have no metadata for this volume.
        return evidence

    tag_to_genre = {
        "Biography": "biography",
        "Autobiography": "biography",
        "Fiction": "fiction",
        "Novel": "fiction",
        "Poetry": "poetry",
        "Poems": "poetry",
        "Drama": "drama",
        "Tragedies": "drama",
        "Comedies": "drama",
    }

    for tag in metadata["genres"][htid].split(";"):
        genre = tag_to_genre.get(tag)
        if genre is not None:
            evidence[genre] = True

    return evidence
def get_vocabulary(metadata, positiveIDs, negativeIDs, sourcedir, n):
    ''' Gets the top n words by docfrequency in positiveIDs + negativeIDs.

    Each volume is read from <sourcedir>/<clean docid>.csv, which needs a
    'feature' column. A '#header' prefix is stripped, so header words are
    treated as the same word as their body-text form. The metadata
    argument is not used here.
    '''

    allIDs = positiveIDs + negativeIDs

    doc_freq = Counter()

    for docid in allIDs:
        path = os.path.join(sourcedir, utils.clean_pairtree(docid) + '.csv')

        # Words already counted for this volume. Once the '#header'
        # prefix is stripped the same word can occur on two rows of one
        # file; it must still add only 1 to the document frequency.
        seen = set()

        with open(path, encoding = 'utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:
                word = row['feature']
                if word.startswith('#header'):
                    word = word.replace('#header', '')

                if word not in seen:
                    seen.add(word)
                    doc_freq[word] += 1

    vocab = [x[0] for x in doc_freq.most_common(n)]
    print('Vocabulary constructed.')

    return vocab
def get_vocabulary(metadata, positiveIDs, negativeIDs, sourcedir, n):
    ''' Gets the top n words by docfrequency in positiveIDs + negativeIDs.

    Each volume is read from <sourcedir>/<clean docid>.csv, which needs a
    'feature' column. A '#header' prefix is stripped, so header words are
    treated as the same word as their body-text form. The metadata
    argument is not used here.
    '''

    allIDs = positiveIDs + negativeIDs

    doc_freq = Counter()

    for docid in allIDs:
        path = os.path.join(sourcedir, utils.clean_pairtree(docid) + '.csv')

        # Words already counted for this volume. Once the '#header'
        # prefix is stripped the same word can occur on two rows of one
        # file; it must still add only 1 to the document frequency.
        seen = set()

        with open(path, encoding='utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:
                word = row['feature']
                if word.startswith('#header'):
                    word = word.replace('#header', '')

                if word not in seen:
                    seen.add(word)
                    doc_freq[word] += 1

    vocab = [x[0] for x in doc_freq.most_common(n)]
    print('Vocabulary constructed.')

    return vocab
wordcountsbyfile[htid].append(count) else: wordcountsbyfile[htid] = [count] return wordcountsbyfile # Begin main script. TOL = 0.1 THRESH = 0.80 genrestocheck = ['fic', 'poe', 'dra'] metadatapath = '/Volumes/TARDIS/work/metadata/MergedMonographs.tsv' rows, columns, table = utils.readtsv(metadatapath) firstsource = "/Users/tunder/Dropbox/pagedata/to1923features/genremaps/" secondsource = "/Users/tunder/Dropbox/pagedata/seventhfeatures/genremaps/" firstmaps = os.listdir(firstsource) secondmaps = os.listdir(secondsource) firstwordcounts = loadwordcounts(firstsource) secondwordcounts = loadwordcounts(secondsource) predictsource = '/Users/tunder/Dropbox/pagedata/production/crosspredicts/' predicts = os.listdir(predictsource) predicts = [x for x in predicts if not x.startswith('.')]
# Scan each tab-separated metadata file and collect the rows that pass
# every filter below into rowlist.
for filename in files2read:
    print(filename)
    filepath = os.path.join(root, filename)
    with open(filepath, encoding='utf-8') as f:
        reader = csv.DictReader(f, delimiter='\t')
        cols = reader.fieldnames
        for row in reader:
            # Keep rows whose language is 'eng' only.
            if row['language'] != 'eng':
                continue
            # A missing startdate is tallied as an error and skipped.
            if row['startdate'] is None:
                errors += 1
                continue
            # Keep only rows whose inferred date lies in 1923-2017 inclusive.
            inferreddate = utils.date_row(row)
            if inferreddate < 1923 or inferreddate > 2017:
                continue
            # Genres are pipe-delimited; matching is case-insensitive.
            genres = set(row['genres'].lower().split('|'))
            if 'fiction' not in genres and 'novel' not in genres and 'short stories' not in genres:
                continue
            docid = row['docid']
            # Documents already in icdocs are counted but not added again.
            if docid in icdocs:
                alreadyhad += 1
                continue
            else:
                row['inferreddate'] = inferreddate
                rowlist.append(row)
def main(sourcedir, metapath, modeldir, outpath, pairtree = False):
    ''' Entry point that can be called from outside the module.

    Loads every model named in the global allnames from modeldir, reads
    the metadata at metapath, classifies each volume listed there, and
    writes the metadata, extended with five new columns
    (predictedgenre, probability, wordcount, englishpct, explanation),
    to outpath as CSV.

    If the pairtree flag is True, sourcedir is treated as the root of a
    pairtree structure; otherwise as a flat directory of .csv files.
    Volumes whose counts cannot be loaded get genre 'NA', probability 0,
    englishpct 0, and the loader's error as their explanation.
    '''

    global allnames, top1000words

    # All the models, keyed by name.
    models = {name: loadamodel(modeldir + name) for name in allnames}

    metadata = get_metadata(metapath)

    # One list per output column, in the order the columns are added.
    newcolumns = {
        'predictedgenre': [],
        'probability': [],
        'wordcount': [],
        'englishpct': [],
        'explanation': [],
    }

    for c, docid in enumerate(metadata.index):
        print(c)

        if pairtree:
            path = get_pairtree(sourcedir, docid)
            counts, error, wordcount = counts4json(path, docid)
        else:
            path = os.path.join(sourcedir, utils.clean_pairtree(docid) + '.csv')
            counts, error, wordcount = counts4file(path)

        if error == 'success':
            genredict = make_genredict(metadata, docid)
            englishpct = get_english_percent(counts, top1000words)
            genre, probability, explanation = volume_classification(models, counts, genredict)
        else:
            englishpct, genre, probability, explanation = 0, 'NA', 0, error

        newcolumns['predictedgenre'].append(genre)
        newcolumns['probability'].append(probability)
        newcolumns['explanation'].append(explanation)
        newcolumns['wordcount'].append(wordcount)
        newcolumns['englishpct'].append(englishpct)

    for columnname, values in newcolumns.items():
        metadata.loc[ : , columnname] = pd.Series(values, index = metadata.index)

    metadata.to_csv(outpath)
females = text.split('<arr name="htrc_genderFemale">') if len(females) > 1: name = females[1].split("</str>")[0] name = name.replace("<str>", "") names.append((name, "f")) return (names) ## We start by loading the list of volumes for which we need a ## Library of Congress Call Number. import SonicScrewdriver as utils rowindices, columns, metadata = utils.readtsv( "/Users/tunder/Dropbox/PythonScripts/hathimeta/ExtractedMetadata.tsv") neededoclcs = list() reversemap = dict() for idx in rowindices: if metadata["LOCnum"][ idx] == "<blank>" and metadata["OCLC"][idx] != "<blank>": oclc = metadata["OCLC"][idx] neededoclcs.append(oclc) reversemap[oclc] = idx counter = 0 metacounter = 0 lccndict = dict() responsedict = dict()
def maxkey(dictionary):
    '''Return the key that heads the dictionary when its entries are
    sorted by value in descending order.

    Biography ("bio") is reported as-is; it is not folded into "non".
    '''
    ranked = utils.sortkeysbyvalue(dictionary, whethertoreverse = True)
    top_pair = ranked[0]
    return top_pair[1]
def maxkey(dictionary):
    '''Return the key that heads the dictionary when its entries are
    sorted by value in descending order.

    Biography ("bio") is reported as-is; it is not folded into "non".
    '''
    ranked = utils.sortkeysbyvalue(dictionary, whethertoreverse=True)
    top_pair = ranked[0]
    return top_pair[1]
# Generate Cotraining Set # This script uses a set of volumes already classified and sorted by a model # in order to generate additional training data for a new model. import SonicScrewdriver as utils from shutil import copyfile indices, columns, metadata = utils.readtsv("/Volumes/TARDIS/work/cotrain/sortedcotrain.tsv") toget = indices[-200:] toget = [utils.pairtreefile(x) for x in toget] genredir = "/Volumes/TARDIS/work/cotrain/top200/genremaps/" featuredir = "/Volumes/TARDIS/work/cotrain/top200/pagefeatures/" for htid in toget: featuresource = "/Volumes/TARDIS/work/cotrain/pagefeatures/" + htid + ".pg.tsv" featuredestination = "/Volumes/TARDIS/work/cotrain/top200/pagefeatures/" + htid + ".pg.tsv" copyfile(featuresource, featuredestination) genresource = "/Volumes/TARDIS/work/cotrain/predictions/" + htid + ".predict" genredestination = "/Volumes/TARDIS/work/cotrain/top200/genremaps/" + htid + ".map" with open(genresource, mode="r", encoding = "utf-8") as f: filelines = f.readlines() with open(genredestination, mode="w", encoding = "utf-8") as f: for line in filelines: line = line.rstrip()
# Load every requested raw-data file and gather the union of their docids.
list_of_dataframes = []
idset = set()
# The first element of args is skipped; the rest are file names under root.
list_of_files = args[1:]
root = '../rawdata/'
list_of_paths = [root + x for x in list_of_files]
for p in list_of_paths:
    df = pd.read_csv(p, index_col='docid')
    list_of_dataframes.append(df)
    idset = idset | set(df.index)
# Convert each id to a string and then to its clean pairtree form.
ids = []
for anid in idset:
    ids.append(utils.clean_pairtree(str(anid)))
# Read the path listing into a set, one stripped line per entry.
allpaths = set()
with open('/Volumes/TARDIS/work/ef/htrc-ef-all-files.txt', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        allpaths.add(line)
# Containers filled in by code that follows this fragment.
missing = set()
found = set()
mapping = dict()
path2id = dict()
#things we already have:
def get_pairtree(pairtreeroot, htid):
    '''Return the path of a volume's .json.bz2 file inside a pairtree.

    The result is <path><postfix>/<postfix>.json.bz2, where path and
    postfix are supplied by utils.pairtreepath for this volume ID.
    '''
    path, postfix = utils.pairtreepath(htid, pairtreeroot)
    pieces = [path, postfix, '/', postfix, '.json.bz2']
    return ''.join(pieces)
# refine fiction import SonicScrewdriver as utils def passfilter(genrestring): fields = genrestring.split(';') if "Autobiography" in fields or "Biography" in fields: return False else: return True rows19c, columns19c, table19c = utils.readtsv('/Volumes/TARDIS/work/metadata/19cMetadata.tsv') rows20c, columns20c, table20c = utils.readtsv('/Volumes/TARDIS/work/metadata/20cMonographMetadata.tsv') with open("/Users/tunder/Dropbox/GenreProject/python/piketty/roughfiction.txt", encoding = 'utf-8') as f: filelines = f.readlines() idlist = [utils.pairtreelabel(x.split('\t')[0]) for x in filelines] filteredrows = list() missing = 0 for anid in idlist: if anid in rows19c: genrestring = table19c["genres"][anid] rowdict = dict() for col in columns19c: rowdict[col] = table19c[col][anid] elif anid in rows20c:
# a newer metadata set. import csv import SonicScrewdriver as utils import random selecteddates = dict() selected = list() reviews = '/Users/tunder/Dropbox/ted/reception/reviewed/lists/ReviewedTitles1840-1859_200.csv' with open(reviews, encoding = 'utf-8') as f: reader = csv.DictReader(f) for row in reader: htid = utils.clean_pairtree(row['HTid']) pubdate = int(row['date']) firstpub = int(row['firstpub']) yrrev = int(row['yrrev']) if pubdate > yrrev + 5: date = yrrev print(str(pubdate) + " => " + str(yrrev)) else: date = pubdate jgenre = row['Jgenre'] if jgenre == 'poe': selecteddates[htid] = date selected.append(htid)
# sort_anovaset.py
#
# Splits the volume IDs listed in anovaset.txt into two files: IDs found
# in the 19c metadata table go to anova19c.txt, all others to
# anova20c.txt.

import SonicScrewdriver as utils
import csv

rows, columns, table = utils.readtsv('/Volumes/TARDIS/work/metadata/19cmetadata.tsv')

with open('anovaset.txt', encoding = 'utf-8') as f:
    filelines = f.readlines()

wholeset = [x.rstrip() for x in filelines]

# Partition by membership in the 19c metadata, preserving input order.
the19c = [anid for anid in wholeset if anid in rows]
the20c = [anid for anid in wholeset if anid not in rows]

for outname, idlist in (('anova19c.txt', the19c), ('anova20c.txt', the20c)):
    with open(outname, mode = 'w', encoding = 'utf-8') as f:
        for anid in idlist:
            f.write(anid + '\n')
# Metadata genre tags treated as strong evidence, mapped to the page-level
# genre code that each tag implies.
_STRONG_TAGS = {
    "Fiction": "fic",
    "Novel": "fic",
    "Biography": "non",
    "Autobiography": "non",
    "Poetry": "poe",
    "Poems": "poe",
    "Drama": "dra",
    "Tragedies": "dra",
    "Comedies": "dra",
    "Catalog": "non",
    "Dictionary": "non",
    "Bibliographies": "non",
}

# Tags that rule out fiction whatever the page-level majority turns out to be.
_BIOGRAPHY_TAGS = ("Biography", "Autobiography")

# Tags that rule out fiction only when the page-level majority is also "non".
_REFERENCE_TAGS = ("Catalog", "Dictionary", "Bibliographies")


def _largest_genre(genresequence):
    '''Returns the genre code assigned to the most pages in genresequence,
    choosing among fic, poe, dra, non and any other code that occurs.'''
    genrecounts = {'fic': 0, 'poe': 0, 'dra': 0, 'non': 0}

    for page in genresequence:
        # For this purpose, we treat biography and indexes as equivalent
        # to nonfiction.
        indexas = "non" if page in ("bio", "index", "back") else page
        utils.addtodict(indexas, 1, genrecounts)

    # sortkeysbyvalue yields (count, genre) pairs; with the reverse flag set
    # the first pair is the maximum.
    genretuples = utils.sortkeysbyvalue(genrecounts, whethertoreverse=True)
    return genretuples[0][1]


def censor(htid, genresequence):
    '''Checks the page-level genre predictions for one volume against the
    volume-level metadata and the volume-level model predictions.

    htid is converted with utils.pairtreelabel before any lookup.
    genresequence is the list of page-level genre codes for the volume.

    Returns a tuple (genresequence, reported). reported maps each of
    "weakconfirmation", "weakdenial", "strongconfirmation", "strongdenial",
    "modelagrees" and "modeldisagrees" to 0 or 1. If the metadata rules out
    fiction, every "fic" page in genresequence is rewritten to "non"; this
    happens IN PLACE, so the caller's list is modified and the same object
    is returned.

    If the volume is in neither the metadata table nor the model table,
    genresequence is returned untouched with every flag at 0.

    Reads the module-level tables rowindices, metadata, modelindices,
    modeldata and options.
    '''

    htid = utils.pairtreelabel(htid)
    # convert the htid into a dirty pairtree label for metadata matching

    # Create a dictionary with entries for all possible conditions,
    # initially set negative.
    reported = {
        "weakconfirmation": 0,
        "weakdenial": 0,
        "strongconfirmation": 0,
        "strongdenial": 0,
        "modelagrees": 0,
        "modeldisagrees": 0,
    }

    # Nothing to compare against, so skip the page count altogether.
    if htid not in rowindices and htid not in modelindices:
        return genresequence, reported

    # Now we need to assess the largest genre in this volume.
    maxgenre = _largest_genre(genresequence)

    couldbefiction = True

    if htid in rowindices:
        # The genres field is a semicolon-delimited list of items.
        for info in metadata["genres"][htid].split(";"):

            if info in _BIOGRAPHY_TAGS:
                couldbefiction = False

            if info == "biog?":
                if maxgenre == "non":
                    reported["weakconfirmation"] = 1
                else:
                    reported["weakdenial"] = 1

            elif info == "Not fiction":
                # Deliberately asymmetric: only a fiction majority counts
                # as a denial of "Not fiction".
                if maxgenre == "non":
                    reported["weakconfirmation"] = 1
                elif maxgenre == "fic":
                    reported["weakdenial"] = 1

            elif info in _STRONG_TAGS:
                if maxgenre == _STRONG_TAGS[info]:
                    reported["strongconfirmation"] = 1
                    if info in _REFERENCE_TAGS:
                        couldbefiction = False
                else:
                    reported["strongdenial"] = 1
    else:
        print("Skipped.")

    if htid in modelindices:
        # Columns of the model table that are not genres are ignored.
        modelpredictions = {
            genre: float(genrecolumn[htid])
            for genre, genrecolumn in modeldata.items()
            if genre in options
        }

        # Take the top prediction. Only the winner is needed, so nothing
        # here requires the table to hold a second genre column.
        predictionlist = utils.sortkeysbyvalue(modelpredictions, whethertoreverse=True)
        modelprediction = predictionlist[0][1]

        # For purposes of this routine, treat biography as nonfiction:
        if modelprediction == "bio":
            modelprediction = "non"

        agrees = (maxgenre == modelprediction)
        reported["modelagrees"] = int(agrees)
        reported["modeldisagrees"] = int(not agrees)
    # Without a model row both model flags keep their initial 0.

    if not couldbefiction:
        for i, page in enumerate(genresequence):
            if page == "fic":
                genresequence[i] = "non"

    return genresequence, reported
# Uses metadata to help assess degrees import os, sys import SonicScrewdriver as utils rowindices, columns, metadata = utils.readtsv( "/Users/tunder/Dropbox/PythonScripts/hathimeta/ExtractedMetadata.tsv") modelindices, modelcolumns, modeldata = utils.readtsv( "/Users/tunder/Dropbox/PythonScripts/hathimeta/newgenretable.txt") options = ["non", "bio", "poe", "dra", "fic"] def censor(htid, genresequence): htid = utils.pairtreelabel(htid) # convert the htid into a dirty pairtree label for metadata matching # Create a dictionary with entries for all possible conditions, initially set negative. symptoms = [ "weakconfirmation", "weakdenial", "strongconfirmation", "strongdenial", "modelagrees", "modeldisagrees" ] reported = dict() for symptom in symptoms: reported[symptom] = 0 couldbefiction = True # Now we need to assess the largest genre in this volume.
# CreateStupidPredictions.py

import os, sys
import SonicScrewdriver as utils

rowindices, columns, metadata = utils.readtsv(
    "/Users/tunder/Dropbox/PythonScripts/hathimeta/ExtractedMetadata.tsv")

sourcedirectory = "/Users/tunder/Dropbox/pagedata/mixedtraining/genremaps/"

dirlist = os.listdir(sourcedirectory)

# Drop directory entries whose names begin with "." or "_".
validnames = [name for name in dirlist if not name.startswith((".", "_"))]

for filename in validnames:
    filepath = os.path.join(sourcedirectory, filename)
    with open(filepath, mode="r", encoding="utf-8") as f:
        filelines = f.readlines()

    # One line of the file per page.
    numpages = len(filelines)

    # The last four characters of the filename are stripped before the
    # conversion.
    htid = utils.pairtreelabel(filename[0:-4])
    # convert the htid into a dirty pairtree label for metadata matching

    genre = "unknown"
# plotter
#
# Plots machine accuracy against human agreement, one point per row, and
# prints the Pearson correlation between the two columns.

import matplotlib.pyplot as plt
import SonicScrewdriver as utils
import pandas as pd
# pearsonr is imported from the public scipy.stats namespace; the
# scipy.stats.stats path is deprecated.
from scipy.stats import pearsonr

indices, columns, agreement = utils.readtsv("/Users/tunder/Dropbox/pagedata/interrater/HumanDissensus.tsv")

indices2, columns2, confidence = utils.readtsv("/Users/tunder/Dropbox/pagedata/interrater/ActualAccuracies.tsv")

# Report every row of the agreement table that has no accuracy row.
scored = set(indices2)
for idx in indices:
    if idx not in scored:
        print(idx + " is missing.")

makeframe = dict()
makeframe["human-agreement"] = agreement["agreement"]
makeframe["machine-accuracy"] = confidence["accuracy"]

# Rows lacking either value are dropped before correlating and plotting.
df = pd.DataFrame(makeframe, dtype="float")
df = df.dropna()

print(str(pearsonr(df["human-agreement"], df["machine-accuracy"])))

plt.plot(df["human-agreement"], df["machine-accuracy"], "r.")
plt.xlabel("Human agreement")
plt.ylabel("Machine accuracy")
plt.axis([0,1.02,0,1.02])
plt.show()
reader = csv.reader(f) for fields in reader: idcode = fields[0] date = int(fields[8]) datedict[idcode] = date dateset.add(date) verbose = True targetwords = {'crown', 'crowns', 'guinea', 'guineas', 'nickel', 'sovereign', 'sovereigns', 'pound', 'pounds', 'quid'} contexts = [] for filename in filelist: htid = utils.pairtreelabel(filename.replace('.txt', '')) if htid not in datedict: print(htid) continue else: date = datedict[htid] filepath = os.path.join(sourcedir, filename) with open(filepath, encoding = 'utf-8') as f: filelines = f.readlines() pagelist = [filelines] # The wordcounter module expects a list of pages, each of which is a list of lines. # Ebooks have no pages -- at least as I currently receive them -- so we treat it # all as one giant page.
def main(sourcedir, metapath, modeldir, outpath, pairtree = False):
    '''
    This function can be called from outside the module; it accepts
    path information and then iterates through all the files it
    finds in the metadata at "metapath."

    If the pairtree flag is True, we assume sourcedir is the root
    of a pairtree structure. Otherwise we assume it's a flat list.

    Every file matching modeldir + '*.p' is loaded as a model and stored
    under its filename with modeldir and '.p' removed. For each docid in
    the metadata index, word counts are read and passed with the models to
    volume_classification. The columns 'nonficprob', 'juvenileprob' and
    'wordcount' are added to the metadata, which is then written to
    outpath as CSV.

    A volume whose counts could not be read (any error other than
    'success', including a file found in neither pairtree root) gets
    0.5 for both probabilities.
    '''

    global allnames, top1000words

    alternatesource = '/projects/ichass/usesofscale/post23/englishmonographs1980-2016/'

    # We're going to store all the models, by name, in a dictionary:

    models = dict()

    for apath in glob.glob(modeldir + '*.p'):
        name = apath.replace(modeldir, '').replace('.p', '')
        models[name] = loadamodel(apath)

    # Now get metadata.

    metadata = get_metadata(metapath)

    nonficprobs = []
    juvieprobs = []
    wordcounts = []

    for c, docid in enumerate(metadata.index):
        # Progress indicator: the zero-based position of the volume.
        print(c)

        if pairtree:
            path1 = get_pairtree(sourcedir, docid)
            path2 = get_pairtree(alternatesource, docid)

            # Prefer the main pairtree root; fall back to the alternate one.
            if os.path.isfile(path1):
                chosenpath = path1
            elif os.path.isfile(path2):
                chosenpath = path2
            else:
                chosenpath = None

            if chosenpath is None:
                # Do not call counts4json here. There is no file to read,
                # and calling it with a path left over from an earlier
                # volume would score the wrong text under this docid.
                print(path1)
                print(path2)
                print('file not found')
                counts = None
                error = 'file not found'
                wordcount = 0
            else:
                counts, error, wordcount = counts4json(chosenpath, docid)

        else:
            path = os.path.join(sourcedir, utils.clean_pairtree(docid) + '.csv')
            counts, error, wordcount = counts4file(path)

        if error == 'success':
            nonficprob, juvenileprob = volume_classification(models, counts)
        else:
            nonficprob = 0.5
            juvenileprob = 0.5

        nonficprobs.append(nonficprob)
        juvieprobs.append(juvenileprob)
        wordcounts.append(wordcount)

    # The lists were filled in index order, so they align with the index.
    metadata.loc[ : , 'nonficprob'] = pd.Series(nonficprobs, index = metadata.index)
    metadata.loc[ : , 'juvenileprob'] = pd.Series(juvieprobs, index = metadata.index)
    metadata.loc[ : , 'wordcount'] = pd.Series(wordcounts, index = metadata.index)

    metadata.to_csv(outpath)
import matplotlib.pyplot as plt import SonicScrewdriver as utils targetfile = input('Path to input file? ') counts = dict() alltags = set() alldecades = set() allcounts = Counter() with open(targetfile, encoding = 'utf-8') as f: reader = csv.DictReader(f) for row in reader: date = row['date'] decade = 10 * int(int(date)/10) tagset = utils.get_tagset(row['genretags']) for tag in tagset: if tag == 'chirandom' and ('chiscifi' in tagset): continue if tag not in counts: counts[tag] = Counter() counts[tag][decade] += 1 alltags.add(tag) alldecades.add(decade) allcounts[decade] += 1 sorted_decades = sorted(list(alldecades)) numdecs = len(sorted_decades) colors = ['g-', 'b-', 'r-', 'k-', 'ro', 'go', 'bo', 'ko']
import csv from collections import Counter import SonicScrewdriver as utils ficids = set() meta = dict() ficsource = "/Volumes/TARDIS/work/fiction/metadata/fiction_metadata.csv" with open(ficsource, encoding="utf-8") as f: reader = csv.DictReader(f) fieldnames = reader.fieldnames for row in reader: htid = row["htid"] dirtyhtid = utils.dirty_pairtree(htid) ficids.add(dirtyhtid) meta[dirtyhtid] = row metasource = "/Volumes/TARDIS/work/metadata/MergedMonographs.tsv" mysterysubjects = Counter() scifisubjects = Counter() gothsubjects = Counter() gothclues = ["ghost stories", "gothic revival", "horror"] genretags = dict() def add_tag(genretags, htid, tagtoadd): if htid not in genretags: genretags[htid] = set()
modelfolder = "/Volumes/TARDIS/work/moneycontext/"

# The three pickled objects all live in the same folder.
modelpath = modelfolder + "logisticmodel.p"
standardizerpath = modelfolder + 'standardizer.p'
featurepath = modelfolder + 'featurelist.p'

with open(modelpath, mode = 'rb') as f:
    logisticmodel = pickle.load(f)

with open(standardizerpath, mode = 'rb') as f:
    standardizer = pickle.load(f)

with open(featurepath, mode = 'rb') as f:
    features = pickle.load(f)

# Now load HathiTrust metadata.

rows, columns, table = utils.readtsv('/Volumes/TARDIS/work/metadata/MergedMonographs.tsv')

ambiguouswords = {
    'crown', 'crowns',
    'guinea', 'guineas',
    'nickel',
    'sovereign', 'sovereigns',
    'pound', 'pounds',
    'quid',
}

moneywords = {
    'dollar', 'dollars',
    'dime', 'dimes',
    'nickel', 'nickels',
    'pound', 'pounds',
    'shilling', 'shillings',
    'sovereign', 'sovereigns',
    'cent', 'cents',
    'centime', 'centimes',
    'crown', 'crowns',
    'halfcrown', 'half-crown',
    'penny', 'pennies', 'pence',
    'farthing', 'farthings',
    'franc', 'francs',
    'guilder', 'guilders',
    'florin', 'florins',
    'guinea', 'guineas',
    "ha'penny", 'tuppence', 'twopence', 'sixpence',
    '|arabicprice|', '|price|',
    'quid',
}

# Words I explicitly decided not to include: 'quarter', 'quarters', 'mark', 'marks.' Monetary uses
# seemed rare enough relative to others that they'd be more likely to introduce noise than to help.
# |arabicprice| is a code the tokenizer in modelingcounter produces whenever it encounters
# a number connected to £, $, ¢, s, or d. In the output we convert that to |price|, for no very
# good reason.

wealthwords = {
    'fortune', 'fortunes',
    'wealth',
    'rich', 'riches',
    'money', 'moneys',
    'fund', 'funds',
    'sum', 'sums',
    'price', 'prices', 'priced',
}

# This is by no means an exhaustive list. Owe, loan, borrowed, etc.
# If we really want to get at the full range of words potentially
# print(roughaccuracy)
print("SMOOTHED MICROACCURACY:")
print(smoothaccuracy)
print("COALESCED MICROACCURACY:")
print(coalaccuracy)

# Save the per-volume accuracies as a two-column table with a header row.
with open("/Users/tunder/Dropbox/pagedata/interrater/ActualAccuracies.tsv", mode="w", encoding="utf-8") as f:
    f.write("htid\taccuracy\n")
    f.writelines(key + "\t" + str(value) + "\n" for key, value in accuracies.items())

metadatapath = os.path.join(firstdir, "predictionMetadata.tsv")
rowindices, columns, metadata = utils.readtsv(metadatapath)

# Copy two columns over from the prediction metadata, then add the two
# tables computed in this script.
for column in ('maxprob', 'gap'):
    metadatatable[column] = metadata[column]
metadatatable['accuracy'] = accuracies
metadatatable['dissent'] = dissentperfile

data = pd.DataFrame(metadatatable, dtype="float")

data['intercept'] = 1.0
# NOTE(review): columns[1:] drops only the first column of the frame, so
# confirm that 'accuracy', the response below, is not among the predictors.
train_cols = data.columns[1:]
predictors = data[train_cols]
logit = sm.Logit(data['accuracy'], predictors)
result = logit.fit()
print(result.summary())

predictions = result.predict(predictors)
print(pearsonr(data['accuracy'], predictions))
metasource = pd.read_csv(args[1], sep='\t') missing = 0 docstoprocess = metasource.docid for idx, docid in enumerate(docstoprocess): if idx % 100 == 1: print(idx) if docid in translations: docid = translations[docid] path, postfix = utils.pairtreepath(docid, '') inpath = rootpath + path + postfix + '/' + utils.clean_pairtree( docid) + '.json.bz2' if os.path.isfile(inpath): pass elif 'uc1.b' in docid: newdoc = docid.replace('uc1.b', 'uc1.$b') path, postfix = utils.pairtreepath(newdoc, '') inpath = rootpath + path + postfix + '/' + utils.clean_pairtree( newdoc) + '.json.bz2' if os.path.isfile(inpath): translations[docid] = newdoc else: missing += 1 print(missing, inpath, 'not found.')
if thisreader not in readerowners[f]: readerowners[f].append(thisreader) paths[f].append(thispath) print(len(tagset)) allfiles = tagset # This is a list of all the filenames (note, filenames not docids) # that we found in the /readers sourcedir. train1 = pd.read_csv('../bzipmeta.csv', dtype = 'object', index_col = 'docid') tidx = set(train1.index.values) for filename in allfiles: docid = filename.replace('.csv', '') if utils.dirty_pairtree(docid) not in tidx: print(docid) genrestocheck = ['fic', 'poe'] equivalences = {'non', 'bio', 'other'} volumesingenre = dict() for g in genrestocheck: volumesingenre[g] = [] alldocids = set() for filename, owners in readerowners.items(): path = paths[filename][0] if 'metadat' in filename: print(filename)
# getidstoadd

import SonicScrewdriver as utils
import os

# The id is whatever precedes the first tab on each line of the metadata file.
with open('/Users/tunder/Dropbox/GenreProject/python/granger/correctedmeta.tsv', encoding = 'utf-8') as f:
    ids2get = [line.split('\t')[0] for line in f]

fileswehave = os.listdir('/Users/tunder/Dropbox/GenreProject/python/granger/elite/')

# Ids already on disk: names ending in '.poe.tsv', with that text removed.
idswehave = {
    name.replace('.poe.tsv','')
    for name in fileswehave
    if name.endswith('.poe.tsv')
}

with open('/Users/tunder/Dropbox/GenreProject/python/granger/ids2get.tsv', mode = 'w', encoding = 'utf-8') as f:
    for anid in ids2get:
        # An id counts as present if it matches as given or in its cleaned
        # pairtree form.
        if anid in idswehave or utils.clean_pairtree(anid) in idswehave:
            continue
        f.write(utils.dirty_pairtree(anid) + '\n')