def sequence_to_counts(genresequence):
    '''Tally the page-level genre predictions for one volume.

    Returns a 2-tuple: the dictionary of page counts per genre
    (always containing 'fic', 'poe', 'dra' and 'non'), and the
    genre found in the first entry of the descending sort.'''

    # Pages carrying any of these labels are counted as nonfiction.
    foldedintonon = ("bio", "index", "back", "trv")

    genrecounts = {'fic': 0, 'poe': 0, 'dra': 0, 'non': 0}

    for label in genresequence:
        countas = "non" if label in foldedintonon else label
        utils.addtodict(countas, 1, genrecounts)

    # Sort the counts in descending order; the genre name is read
    # from position 1 of the first entry.
    ranked = utils.sortkeysbyvalue(genrecounts, whethertoreverse=True)
    topentry = ranked[0]

    return genrecounts, topentry[1]
def select_common_features(trainingset, n):
    '''Select the top n features in the training set by raw frequency.

    Not a sophisticated feature-selection strategy, but in many
    cases it gets the job done.

    trainingset: iterable of volumes, each with a .rawcounts
        dictionary that utils.add_dicts can merge.
    n: maximum number of features to return. When fewer features
        exist, all of them are returned and a message is printed.

    Returns a list of features in the order produced by the
    descending sort.'''

    allwordcounts = dict()

    for avolume in trainingset:
        # add_dicts adds up all the raw counts into a single
        # master dictionary.
        utils.add_dicts(avolume.rawcounts, allwordcounts)

    # This returns a list of 2-tuple (frequency, word) pairs.
    descendingbyfreq = utils.sortkeysbyvalue(allwordcounts,
                                             whethertoreverse=True)

    if n > len(descendingbyfreq):
        n = len(descendingbyfreq)
        print("We only have " + str(n) + " features.")

    # Take the second element of each tuple, up to a total of n tuples.
    return [pair[1] for pair in descendingbyfreq[:n]]
def comparelists(firstmap, secondmap, genremistakes, correctbygenre,
                 wordcounts):
    '''Compare two page-level genre maps and return their divergence.

    firstmap, secondmap: sequences of page-level genre labels.
    genremistakes: dictionary updated in place; keyed by
        (generalizedsecond, generalizedfirst) pairs for pages where
        the two maps disagree.
    correctbygenre: dictionary updated in place; keyed by genre for
        pages where the two maps agree.
    wordcounts: per-page weights, indexed like the maps.

    Returns the summed weight of the pages on which the maps disagree.

    NOTE(review): the tail of the signature and both else-branches were
    missing from this block and have been reconstructed from how the
    body uses its names -- confirm against the original module.'''

    # Only the pages present in both maps can be compared.
    length = min(len(firstmap), len(secondmap))

    if len(firstmap) < len(secondmap):
        print(
            "Error, Will Robinson. There are occasions where the consensus version is shorter but no valid reason for it to be longer."
        )

    divergence = 0.0

    for i in range(length):

        generalizedfirst = translate(firstmap[i])
        generalizedsecond = translate(secondmap[i])

        if effectively_equal(generalizedfirst, generalizedsecond):
            utils.addtodict(generalizedsecond, wordcounts[i], correctbygenre)
        else:
            divergence += wordcounts[i]
            utils.addtodict((generalizedsecond, generalizedfirst),
                            wordcounts[i], genremistakes)

    return divergence
def sequence_to_counts(genresequence):
    '''Tally how many pages in a sequence of page-level
    predictions were assigned to each genre, and return the
    tally as a dictionary.

    This version differs slightly from the one in
    MetadataCascades: it accepts a wider range of genres and
    does not initialize any genre to zero.'''

    pagetotals = {}

    for pagegenre in genresequence:
        utils.addtodict(pagegenre, 1, pagetotals)

    return pagetotals
def sequence_to_counts(genresequence):
    '''Tally how many pages in a sequence of page-level
    predictions were assigned to each genre, and return the
    tally as a dictionary.

    This version differs slightly from the one in
    MetadataCascades: it accepts a wider range of genres and
    does not initialize any genre to zero.'''

    pagetotals = {}

    for pagegenre in genresequence:
        utils.addtodict(pagegenre, 1, pagetotals)

    return pagetotals
def get_featureframe(vocabulary, positiveIDs, negativeIDs, sourcedir):
    ''' Returns a pandas dataframe with feature counts for all the volumes
    to be used in this model.

    vocabulary: list of features; fixes the column order.
    positiveIDs, negativeIDs: lists of volume IDs; concatenated to
        form the row index.
    sourcedir: directory holding one '<clean id>.csv' file per volume,
        with 'feature' and 'count' columns.

    The returned frame holds the standardized counts. It is indexed by
    volume ID, and its columns are positional because the scaler
    returns a bare array.
    '''

    vocabset = set(vocabulary)
    allIDs = positiveIDs + negativeIDs

    # We initially construct the data frame as a dictionary of Series,
    # one zero-filled column per vocabulary item.
    columns = {v: pd.Series(np.zeros(len(allIDs)), index=allIDs)
               for v in vocabulary}

    for docid in allIDs:
        path = os.path.join(sourcedir, utils.clean_pairtree(docid) + '.csv')
        with open(path, encoding='utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:
                feature = row['feature']

                if feature.startswith('#header'):
                    feature = feature.replace('#header', '')

                if feature in vocabset:
                    # csv yields strings; convert so the column stays
                    # numeric instead of receiving a str value.
                    columns[feature].loc[docid] = float(row['count'])

    # Now let's refashion the dictionary as an actual dataframe.
    df = pd.DataFrame(columns, index=allIDs)
    df = df[vocabulary]
    # This reorders the columns to be in vocab order

    stdscaler = StandardScaler()
    scaleddf = pd.DataFrame(stdscaler.fit_transform(df), index=allIDs)

    return scaleddf
def get_vocabulary_and_counts_4pages(metadata, allIDs, sourcedir, n):
    ''' Gets the top n words by docfrequency, but also
    returns a dictionary of wordcounts so we don't have to read them again from the
    file when generating a feature dataframe.

    Adjusted to handle page instances: every page of every volume is
    treated as its own instance, identified as '<docid>||<page index>'.

    Returns (vocab, counts, id2group): the n features found on the most
    pages, the per-page feature mappings keyed by page ID, and a map
    from page ID back to the volume it came from.
    '''

    doc_freq = Counter()
    counts = dict()
    id2group = dict()

    for docid in allIDs:

        path = os.path.join(sourcedir,
                            utils.clean_pairtree(docid) + '.basic.json.bz2')
        volume = parser.PagelistFromJson(path, docid)
        pagecounts = volume.get_feature_list()

        for idx, page in enumerate(pagecounts):
            pageid = docid + '||' + str(idx)

            id2group[pageid] = docid
            counts[pageid] = page

            # Document frequency: each feature present on the page
            # counts once, whatever its value.
            doc_freq.update(page.keys())

    vocab = [x[0] for x in doc_freq.most_common(n)]
    print('Vocabulary constructed.')

    return vocab, counts, id2group
def add_to_ficgenre(docid, existingfile, tagas):
    '''Append one metadata row for docid to the CSV at existingfile.

    Most fields are copied from the module-level metadata table. First
    publication date, author birth year, nationality and gender are
    requested interactively with input(). tagas is stored in the
    'genretags' column. Values equal to '<blank>' are written as
    empty strings.
    '''
    global outfieldnames, metadata

    j = metadata[docid]

    # Show the volume so the person answering the prompts knows which
    # book is being described.
    fields = [j['HTid'], str(j['date']), j['author'], j['title'], j['enumcron']]
    print(" | ".join(fields))

    o = dict()
    o['docid'] = utils.clean_pairtree(j['HTid'])
    o['recordid'] = j['recordid']
    o['oclc'] = j['OCLC']
    o['locnum'] = j['LOCnum']
    o['author'] = j['author']
    o['imprint'] = j['imprint']
    o['date'] = j['date']
    o['firstpub'] = input('First publication date? ')
    o['birthdate'] = input('Author birth year? ')
    o['nationality'] = input('Nationality? ')
    o['gender'] = input('Gender? ')
    o['title'] = j['title']
    o['subjects'] = j['subjects']
    o['enumcron'] = j['enumcron']
    o['genretags'] = tagas

    o = {key: ('' if value == '<blank>' else value)
         for key, value in o.items()}

    # newline='' is what the csv module expects for files it writes.
    with open(existingfile, mode='a', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=outfieldnames)
        # The visible code built the row but never wrote it.
        writer.writerow(o)
def get_classvector(classpath, volumeIDs):
    '''Read class labels from a tab-separated file and align them to volumeIDs.

    Each line of classpath holds a volume ID and a class label separated
    by a tab. 'elite' maps to 1, 'vulgar' maps to 0, and any other label
    is converted with int().

    If volumeIDs is empty, every ID found in the file is used instead.
    IDs absent from the file keep the value 0 and are reported on stdout.

    Returns (classvector, volumeIDs), where classvector is a numpy array
    parallel to volumeIDs.
    '''
    classdict = dict()

    with open(classpath, encoding='utf-8') as f:
        for line in f:
            fields = line.rstrip().split('\t')
            volid = utils.clean_pairtree(fields[0])
            theclass = fields[1]
            if theclass == 'elite':
                intclass = 1
            elif theclass == 'vulgar':
                intclass = 0
            else:
                intclass = int(theclass)
            classdict[volid] = intclass

    if len(volumeIDs) < 1:
        volumeIDs = list(classdict)

    classvector = np.zeros(len(volumeIDs))
    for idx, anid in enumerate(volumeIDs):
        if anid in classdict:
            classvector[idx] = classdict[anid]
        else:
            print('Missing from class metadata: ' + anid)

    return classvector, volumeIDs
def get_featureframe(vocabulary, positiveIDs, negativeIDs, sourcedir):
    ''' Returns a pandas dataframe with feature counts for all the volumes
    to be used in this model.

    vocabulary: list of features; fixes the column order.
    positiveIDs, negativeIDs: lists of volume IDs; concatenated to
        form the row index.
    sourcedir: directory holding one '<clean id>.csv' file per volume,
        with 'feature' and 'count' columns.

    The returned frame holds the standardized counts. It is indexed by
    volume ID, and its columns are positional because the scaler
    returns a bare array.
    '''

    vocabset = set(vocabulary)
    allIDs = positiveIDs + negativeIDs

    # We initially construct the data frame as a dictionary of Series,
    # one zero-filled column per vocabulary item.
    columns = {v: pd.Series(np.zeros(len(allIDs)), index=allIDs)
               for v in vocabulary}

    for docid in allIDs:
        path = os.path.join(sourcedir, utils.clean_pairtree(docid) + '.csv')
        with open(path, encoding='utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:
                feature = row['feature']

                if feature.startswith('#header'):
                    feature = feature.replace('#header', '')

                if feature in vocabset:
                    # csv yields strings; convert so the column stays
                    # numeric instead of receiving a str value.
                    columns[feature].loc[docid] = float(row['count'])

    # Now let's refashion the dictionary as an actual dataframe.
    df = pd.DataFrame(columns, index=allIDs)
    df = df[vocabulary]
    # This reorders the columns to be in vocab order

    stdscaler = StandardScaler()
    scaleddf = pd.DataFrame(stdscaler.fit_transform(df), index=allIDs)

    return scaleddf
def choose_cascade(htid):
    '''Reads metadata about this volume and uses it to decide what metadata-level features should be assigned.

    Returns five booleans, in this order: probablybiography,
    probablydrama, probablyfiction, probablypoetry, maybefiction.

    NOTE(review): an `else:` and the last element of the maxoption
    tuple were missing from this block. "poe" was restored as the fifth
    element because the code below tests maxgenre == 4 -- confirm
    against the original module.'''

    global rowindices, columns, metadata, modelindices, modeldata

    probablydrama = False
    probablypoetry = False
    probablybiography = False
    probablyfiction = False
    maybefiction = False

    htid = utils.pairtreelabel(htid)
    # convert the clean pairtree filename into a dirty pairtree label for metadata matching

    if htid not in rowindices:
        # We have no metadata for this volume.
        print("Volume missing from ExtractedMetadata.tsv: " + htid)
    else:
        genrestring = metadata["genres"][htid]
        # It's a semicolon-delimited list of items.
        for info in genrestring.split(";"):

            if info in ("Biography", "Autobiography"):
                probablybiography = True

            if info in ("Fiction", "Novel"):
                probablyfiction = True

            if info in ("Poetry", "Poems"):
                probablypoetry = True

            if info in ("Drama", "Tragedies", "Comedies"):
                probablydrama = True

    if htid in modelindices:

        title = metadata["title"][htid].lower()
        titlewords = title.split()

        # presumably maxoption returns the position of the largest value
        # in the tuple (0 bio, 1 dra, 2 fic, 3 non, 4 poe) -- verify.
        maxgenre = maxoption((modeldata["bio"][htid], modeldata["dra"][htid],
                              modeldata["fic"][htid], modeldata["non"][htid],
                              modeldata["poe"][htid]))

        # NOTE(review): parentheses spell out how Python already parsed
        # the original condition; "poetical" in the title is enough on
        # its own. Confirm that is intended.
        if (maxgenre == 4 and "poems" in titlewords) or "poetical" in titlewords:
            probablypoetry = True

        if maxgenre == 1:
            probablydrama = True

        if maxgenre == 2:
            maybefiction = True

    return probablybiography, probablydrama, probablyfiction, probablypoetry, maybefiction
def compare_two_lists(truelist, predicted, wordsperpage, whethertocountwords):
    '''Score a list of predicted page genres against the true genres.

    truelist, predicted: equal-length sequences of genre labels. True
        labels found in the module-level genretranslations mapping are
        translated before comparison.
    wordsperpage: per-page word counts, used as weights when
        whethertocountwords is true; otherwise every page weighs 1.

    Returns (totaltruegenre, correctbygenre, errorsbygenre, accurate,
    inaccurate). The first two are weight totals per true genre,
    errorsbygenre is keyed by (truegenre, predictedgenre), and the last
    two are overall weight totals.
    '''
    global genretranslations
    assert (len(truelist) == len(predicted))

    errorsbygenre = dict()
    correctbygenre = dict()
    accurate = 0
    inaccurate = 0
    totaltruegenre = dict()

    for index, truegenre in enumerate(truelist):
        if truegenre in genretranslations:
            truegenre = genretranslations[truegenre]

        if whethertocountwords:
            increment = wordsperpage[index]
        else:
            increment = 1

        utils.addtodict(truegenre, increment, totaltruegenre)

        predictedgenre = predicted[index]

        if genresareequal(truegenre, predictedgenre):
            utils.addtodict(truegenre, increment, correctbygenre)
            accurate += increment
        else:
            utils.addtodict((truegenre, predictedgenre), increment,
                            errorsbygenre)
            inaccurate += increment

    return totaltruegenre, correctbygenre, errorsbygenre, accurate, inaccurate
    def addmetadata(self, row, table):
        '''Copy this volume's metadata from table onto the instance.

        Sets title, genres (a set split from the semicolon-delimited
        'genres' field) and nonmetaflag, which is True when any listed
        genre is one of the nonfiction varieties below.

        NOTE(review): the targets of the author and date assignments
        were lost from this block. `self.author` and `self.date` are
        reconstructions -- confirm against the original class.
        '''
        self.author = table['author'][row]
        self.title = table['title'][row]
        self.date = utils.simple_date(row, table)
        genrelist = table['genres'][row].split(';')
        self.genres = set(genrelist)

        varietiesofnon = ['Bibliographies', 'Catalog', 'Dictionary', 'Encyclopedia', 'Handbooks', 'Indexes', 'Legislation', 'Directories', 'Statistics', 'Legal cases', 'Legal articles', 'Calendars', 'Autobiography', 'Biography', 'Letters', 'Essays', 'Speeches']

        self.nonmetaflag = any(genre in self.genres for genre in varietiesofnon)
def resolve_voting(votes, tiebreaker):
    '''Pick a winner from a list of votes.

    votes: non-empty sequence of hashable labels.
    tiebreaker: label preferred when the top two candidates are tied.

    Returns (winner, dissent, runnerup). dissent is the fraction of
    votes not cast for the top-ranked candidate. With a single
    candidate the runner-up is the winner itself. A tie is settled by
    tiebreaker if it names one of the top two, and otherwise by a
    random choice between them.
    '''
    electorate = len(votes)

    results = dict()
    for vote in votes:
        utils.addtodict(vote, 1, results)
    # Entries are read as (count, label), highest count first.
    candidate = utils.sortkeysbyvalue(results, whethertoreverse=True)

    dissent = (electorate - candidate[0][0]) / electorate

    if len(candidate) < 2:
        # There is only one candidate.
        return candidate[0][1], dissent, candidate[0][1]

    first = candidate[0][1]
    second = candidate[1][1]

    if candidate[0][0] > candidate[1][0]:
        # The top candidate strictly outpolls the second.
        return first, dissent, second

    # We have a tie.
    if tiebreaker == first:
        print("Tiebreaker " + tiebreaker)
        return first, dissent, second

    if tiebreaker == second:
        print("Tiebreaker " + tiebreaker)
        return second, dissent, first

    print("Tie in spite of " + tiebreaker)
    win = random.choice([first, second])
    runnerup = second if win == first else first

    return win, dissent, runnerup
def get_vocabulary_and_counts(metadata, positiveIDs, negativeIDs, sourcedir,
                              n):
    ''' Gets the top n words by docfrequency in positiveIDs + negativeIDs, but also
    returns a dictionary of wordcounts so we don't have to read them again from the
    file when generating a feature dataframe.

    Each volume is read from '<clean id>.csv' in sourcedir, which must
    have 'feature' and 'count' columns. Rows with an empty feature are
    skipped.

    Returns (vocab, counts): the list of the n most widespread features
    and a dictionary mapping each volume ID to a Counter of its features.
    '''

    allIDs = positiveIDs + negativeIDs

    doc_freq = Counter()
    counts = dict()

    for docid in allIDs:
        counts[docid] = Counter()
        path = os.path.join(sourcedir, utils.clean_pairtree(docid) + '.csv')
        with open(path, encoding='utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:
                word = row['feature']
                if not word:
                    continue

                ct = float(row['count'])

                if word.startswith('#header'):
                    word = word.replace('#header', '')
                # This debatable choice treats header words as equivalent
                # to occurrences in the body text. In practice, this seems
                # to slightly improve performance, at least when you're using
                # SVMs and relatively low numbers of features (140-300).
                # Otherwise header words are in practice just discarded, because
                # e.g. #headeract won't be one of the top 250 words.

                doc_freq[word] += 1
                counts[docid][word] += ct

    vocab = [x[0] for x in doc_freq.most_common(n)]
    print('Vocabulary constructed.')

    return vocab, counts
def get_genrevector(volumeIDs, boundarydef):
    '''Build a 0/1 numpy vector of genre membership parallel to volumeIDs.

    Only the boundary "nonepistolary / epistolary" is handled. Volumes
    whose pairtree label is in the module-level epindices get 1 and
    those in nonindices get 0. Volumes found in neither are reported
    on stdout and left at 0. For any other boundarydef the vector is
    returned as all zeros.
    '''
    global epindices, nonindices

    n = len(volumeIDs)

    genrevector = np.zeros(n)

    if boundarydef == "nonepistolary / epistolary":

        for idx, volID in enumerate(volumeIDs):
            cleanID = utils.pairtreelabel(volID)

            if cleanID in epindices:
                genrevector[idx] = 1
            elif cleanID in nonindices:
                genrevector[idx] = 0
            else:
                print("Error, missing in metadata: " + cleanID)

    return genrevector
def get_metadata_evidence(htid, rowindices, columns, metadata):
    '''Reads metadata about this volume and uses it to decide what metadata-level features should be assigned.

    Returns a dictionary with boolean entries for "drama", "poetry",
    "biography" and "fiction". All are False when the volume has no
    metadata row. Otherwise each is True when a matching term appears
    in the volume's semicolon-delimited genre string.'''

    metadata_evidence = {
        "drama": False,
        "poetry": False,
        "biography": False,
        "fiction": False,
    }

    htid = utils.pairtreelabel(htid)
    # convert the clean pairtree filename into a dirty pairtree label for metadata matching

    if htid not in rowindices:
        # We have no metadata for this volume.
        return metadata_evidence

    # In the original this parsing sat after the return, inside the
    # guard above, and so could never run.
    genrestring = metadata["genres"][htid]

    # It's a semicolon-delimited list of items.
    for info in genrestring.split(";"):

        if info in ("Biography", "Autobiography"):
            metadata_evidence["biography"] = True

        if info in ("Fiction", "Novel"):
            metadata_evidence["fiction"] = True

        if info in ("Poetry", "Poems"):
            metadata_evidence["poetry"] = True

        if info in ("Drama", "Tragedies", "Comedies"):
            metadata_evidence["drama"] = True

    return metadata_evidence
def get_vocabulary(metadata, positiveIDs, negativeIDs, sourcedir, n):
    ''' Gets the top n words by docfrequency in positiveIDs + negativeIDs.

    Each volume is read from '<clean id>.csv' in sourcedir, which must
    have a 'feature' column. A '#header' marker in a feature name is
    removed, so header words are counted together with body words.

    Returns the list of the n most widespread features.
    '''

    allIDs = positiveIDs + negativeIDs

    doc_freq = Counter()

    for docid in allIDs:
        path = os.path.join(sourcedir, utils.clean_pairtree(docid) + '.csv')
        with open(path, encoding='utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:

                word = row['feature']
                if word.startswith('#header'):
                    word = word.replace('#header', '')

                doc_freq[word] += 1

    vocab = [x[0] for x in doc_freq.most_common(n)]
    print('Vocabulary constructed.')

    return vocab
            wordcountsbyfile[htid] = [count]

    return wordcountsbyfile

# Begin main script.

# NOTE(review): TOL and THRESH are not used in the visible part of this
# script; confirm their meaning where they are consumed.
TOL = 0.1
THRESH = 0.80

# Genre codes examined by this script.
genrestocheck = ['fic', 'poe', 'dra']

# Volume-level metadata table, read once at start-up.
metadatapath = '/Volumes/TARDIS/work/metadata/MergedMonographs.tsv'
rows, columns, table = utils.readtsv(metadatapath)

# Two directories of genre maps to be compared with each other.
firstsource = "/Users/tunder/Dropbox/pagedata/to1923features/genremaps/"
secondsource = "/Users/tunder/Dropbox/pagedata/seventhfeatures/genremaps/"

firstmaps = os.listdir(firstsource)
secondmaps = os.listdir(secondsource)

# Word counts are loaded separately for each source directory.
firstwordcounts = loadwordcounts(firstsource)
secondwordcounts = loadwordcounts(secondsource)

predictsource = '/Users/tunder/Dropbox/pagedata/production/crosspredicts/'

# List the prediction files, ignoring hidden files (names starting with '.').
predicts = os.listdir(predictsource)
predicts = [x for x in predicts if not x.startswith('.')]
for filename in files2read:
    filepath = os.path.join(root, filename)
    with open(filepath, encoding='utf-8') as f:
        reader = csv.DictReader(f, delimiter='\t')
        cols = reader.fieldnames
        for row in reader:
            if row['language'] != 'eng':

            if row['startdate'] is None:
                errors += 1

            inferreddate = utils.date_row(row)
            if inferreddate < 1923 or inferreddate > 2017:

            genres = set(row['genres'].lower().split('|'))
            if 'fiction' not in genres and 'novel' not in genres and 'short stories' not in genres:

            docid = row['docid']
            if docid in icdocs:
                alreadyhad += 1
                row['inferreddate'] = inferreddate
def main(sourcedir, metapath, modeldir, outpath, pairtree = False):
    '''
    This function can be called from outside the module; it accepts
    path information and then iterates through all the files it
    finds in the metadata at "metapath."

    If the pairtree flag is True, we assume sourcedir is the root
    of a pairtree structure. Otherwise we assume it's a flat list.

    For every volume in the metadata index it records a predicted
    genre, probability, word count, English percentage and
    explanation. A volume that cannot be read gets genre 'NA',
    zeros, and the error message as its explanation.

    NOTE(review): the docstring delimiters, two `else:` lines and the
    statements that fill the five result lists were missing from this
    block and have been reconstructed. `outpath` is not used in the
    visible code; any step that writes the table out was not visible.
    '''

    global allnames, top1000words

    # We're going to store all the models, by name, in a dictionary:
    models = {name: loadamodel(modeldir + name) for name in allnames}

    # Now get metadata.
    metadata = get_metadata(metapath)

    predictedgenres = []
    predictedprobs = []
    explanations = []
    wordcounts = []
    englishpcts = []

    for docid in metadata.index:

        if pairtree:
            path = get_pairtree(sourcedir, docid)
            counts, error, wordcount = counts4json(path, docid)
        else:
            path = os.path.join(sourcedir, utils.clean_pairtree(docid) + '.csv')
            counts, error, wordcount = counts4file(path)

        if error == 'success':
            genredict = make_genredict(metadata, docid)
            englishpct = get_english_percent(counts, top1000words)
            genre, probability, explanation = volume_classification(models, counts, genredict)
        else:
            englishpct = 0
            genre = 'NA'
            probability = 0
            explanation = error

        # One entry per volume, in index order, so the lists line up
        # with metadata.index when they become columns below.
        predictedgenres.append(genre)
        predictedprobs.append(probability)
        explanations.append(explanation)
        wordcounts.append(wordcount)
        englishpcts.append(englishpct)

    metadata.loc[ : , 'predictedgenre'] = pd.Series(predictedgenres, index = metadata.index)
    metadata.loc[ : , 'probability'] = pd.Series(predictedprobs, index = metadata.index)
    metadata.loc[ : , 'wordcount'] = pd.Series(wordcounts, index = metadata.index)
    metadata.loc[ : , 'englishpct'] = pd.Series(englishpcts, index = metadata.index)
    metadata.loc[ : , 'explanation'] = pd.Series(explanations, index = metadata.index)

    females = text.split('<arr name="htrc_genderFemale">')
    if len(females) > 1:
        name = females[1].split("</str>")[0]
        name = name.replace("<str>", "")
        names.append((name, "f"))

    return (names)

## We start by loading the list of volumes for which we need a
## Library of Congress Call Number.

import SonicScrewdriver as utils

rowindices, columns, metadata = utils.readtsv(

neededoclcs = list()
reversemap = dict()

for idx in rowindices:
    if metadata["LOCnum"][
            idx] == "<blank>" and metadata["OCLC"][idx] != "<blank>":
        oclc = metadata["OCLC"][idx]
        reversemap[oclc] = idx

counter = 0
metacounter = 0
lccndict = dict()
responsedict = dict()
# Generate Cotraining Set

# This script uses a set of volumes already classified and sorted by a model
# in order to generate additional training data for a new model.

import SonicScrewdriver as utils
from shutil import copyfile

# Read the table of volumes that a model has already classified and sorted.
indices, columns, metadata = utils.readtsv("/Volumes/TARDIS/work/cotrain/sortedcotrain.tsv")

# Keep the last 200 entries of the index.
toget = indices[-200:]

toget = [utils.pairtreefile(x) for x in toget]

# NOTE(review): genredir and featuredir are defined here but the loop
# below spells the same destination paths out again instead of using them.
genredir = "/Volumes/TARDIS/work/cotrain/top200/genremaps/"
featuredir = "/Volumes/TARDIS/work/cotrain/top200/pagefeatures/"

for htid in toget:

	# Copy the page-feature file unchanged into the top200 folder.
	featuresource = "/Volumes/TARDIS/work/cotrain/pagefeatures/" + htid + ".pg.tsv"
	featuredestination = "/Volumes/TARDIS/work/cotrain/top200/pagefeatures/" + htid + ".pg.tsv"
	copyfile(featuresource, featuredestination)

	# The prediction file is read in full and re-emitted as a genre map.
	genresource = "/Volumes/TARDIS/work/cotrain/predictions/" + htid + ".predict"
	genredestination = "/Volumes/TARDIS/work/cotrain/top200/genremaps/" + htid + ".map"
	with open(genresource, mode="r", encoding = "utf-8") as f:
		filelines = f.readlines()

	# NOTE(review): as visible here, nothing is written to the map file;
	# the statements that transform and write each line appear to be missing.
	with open(genredestination, mode="w", encoding = "utf-8") as f:
		for line in filelines:
			line = line.rstrip()
list_of_dataframes = []
idset = set()

list_of_files = args[1:]
root = '../rawdata/'
list_of_paths = [root + x for x in list_of_files]

for p in list_of_paths:
    df = pd.read_csv(p, index_col='docid')
    idset = idset | set(df.index)

ids = []
for anid in idset:

allpaths = set()
with open('/Volumes/TARDIS/work/ef/htrc-ef-all-files.txt',
          encoding='utf-8') as f:
    for line in f:
        line = line.strip()

missing = set()
found = set()
mapping = dict()
path2id = dict()

#things we already have:
def get_pairtree(pairtreeroot, htid):
    '''Return the location of a volume's .json.bz2 file in a pairtree.

    The two pieces returned by utils.pairtreepath are concatenated
    directly, then '/', the second piece again, and the extension.'''

    directory, leaf = utils.pairtreepath(htid, pairtreeroot)

    return directory + leaf + '/' + leaf + '.json.bz2'
# refine fiction

import SonicScrewdriver as utils

def passfilter(genrestring):
    '''Return False when a semicolon-delimited genre string lists
    "Autobiography" or "Biography" as one of its fields, True otherwise.

    In the original, `return True` sat after `return False` inside the
    same branch, so a non-biographical volume got None instead of True.
    '''
    fields = genrestring.split(';')
    return not ("Autobiography" in fields or "Biography" in fields)

rows19c, columns19c, table19c = utils.readtsv('/Volumes/TARDIS/work/metadata/19cMetadata.tsv')

rows20c, columns20c, table20c = utils.readtsv('/Volumes/TARDIS/work/metadata/20cMonographMetadata.tsv')

with open("/Users/tunder/Dropbox/GenreProject/python/piketty/roughfiction.txt", encoding = 'utf-8') as f:
	filelines = f.readlines()

idlist = [utils.pairtreelabel(x.split('\t')[0]) for x in filelines]

filteredrows = list()

missing = 0

for anid in idlist:
	if anid in rows19c:
		genrestring = table19c["genres"][anid]
		rowdict = dict()
		for col in columns19c:
			rowdict[col] = table19c[col][anid]
	elif anid in rows20c:
# a newer metadata set.

import csv
import SonicScrewdriver as utils
import random

# Maps each selected volume ID to the date chosen for it below.
selecteddates = dict()
selected = list()

reviews = '/Users/tunder/Dropbox/ted/reception/reviewed/lists/ReviewedTitles1840-1859_200.csv'
with open(reviews, encoding = 'utf-8') as f:
    reader = csv.DictReader(f)

    for row in reader:

        htid = utils.clean_pairtree(row['HTid'])
        pubdate = int(row['date'])
        firstpub = int(row['firstpub'])
        yrrev = int(row['yrrev'])

        # A copy published more than five years after the review is
        # dated by the review year instead. The `else:` was missing, so
        # the substitution was always overwritten by pubdate.
        if pubdate > yrrev + 5:
            date = yrrev
            print(str(pubdate) + " => " + str(yrrev))
        else:
            date = pubdate

        jgenre = row['Jgenre']

        # Only rows whose Jgenre is 'poe' are recorded.
        if jgenre == 'poe':
            selecteddates[htid] = date

import SonicScrewdriver as utils
import csv

rows, columns, table = utils.readtsv('/Volumes/TARDIS/work/metadata/19cmetadata.tsv')

with open('anovaset.txt', encoding = 'utf-8') as f:
    filelines = f.readlines()
    wholeset = [x.rstrip() for x in filelines]

the19c = list()
the20c = list()

# IDs present in the 19c metadata table go to the first list and all
# others to the second.
# NOTE(review): the body of this test was missing and is reconstructed
# from the list names and the two output files -- confirm.
for anid in wholeset:
    if anid in rows:
        the19c.append(anid)
    else:
        the20c.append(anid)

with open('anova19c.txt', mode = 'w', encoding = 'utf-8') as f:
    for anid in the19c:
        f.write(anid + '\n')

with open('anova20c.txt', mode = 'w', encoding = 'utf-8') as f:
    for anid in the20c:
        f.write(anid + '\n')

def censor(htid, genresequence):
    '''Check a volume's page-level genre sequence against its metadata
    and against a volume-level model prediction.

    htid: volume identifier; converted to a pairtree label for lookup.
    genresequence: list of page-level genre codes. It is modified in
        place: when metadata rules out fiction, every "fic" page is
        relabelled "non".

    Returns (genresequence, reported), where reported maps each of
    "weakconfirmation", "weakdenial", "strongconfirmation",
    "strongdenial", "modelagrees" and "modeldisagrees" to 0 or 1.

    Reads the module-level names rowindices, metadata, modelindices,
    modeldata and options.

    NOTE(review): the closing bracket of the symptoms list, a
    `continue`, the tail of the sortkeysbyvalue call and an `else:`
    were missing from this block and have been restored -- confirm
    against the original module.
    '''

    htid = utils.pairtreelabel(htid)
    # convert the htid into a dirty pairtree label for metadata matching

    # Create a dictionary with entries for all possible conditions, initially set negative.
    symptoms = [
        "weakconfirmation", "weakdenial", "strongconfirmation", "strongdenial",
        "modelagrees", "modeldisagrees"
    ]
    reported = {symptom: 0 for symptom in symptoms}

    couldbefiction = True

    # Now we need to assess the largest genre in this volume.
    genrecounts = {'fic': 0, 'poe': 0, 'dra': 0, 'non': 0}

    for page in genresequence:
        # For this purpose, we treat biography, indexes and back matter
        # as equivalent to nonfiction.
        indexas = "non" if page in ("bio", "index", "back") else page
        utils.addtodict(indexas, 1, genrecounts)

    # Convert the dictionary of counts into a sorted list, and take the max.
    genretuples = utils.sortkeysbyvalue(genrecounts, whethertoreverse=True)
    maxgenre = genretuples[0][1]

    if htid not in rowindices and htid not in modelindices:
        return genresequence, reported

    if htid in rowindices:

        # Metadata terms that strongly imply one particular largest genre.
        strongevidence = {
            "Fiction": "fic", "Novel": "fic",
            "Biography": "non", "Autobiography": "non",
            "Poetry": "poe", "Poems": "poe",
            "Drama": "dra", "Tragedies": "dra", "Comedies": "dra",
            "Catalog": "non", "Dictionary": "non", "Bibliographies": "non",
        }
        # These terms rule out fiction only when the pages agree.
        referenceworks = ("Catalog", "Dictionary", "Bibliographies")

        genrestring = metadata["genres"][htid]
        # It's a semicolon-delimited list of items.
        for info in genrestring.split(";"):

            # Biography rules out fiction whatever the pages say.
            if info in ("Biography", "Autobiography"):
                couldbefiction = False

            if info == "biog?":
                if maxgenre == "non":
                    reported["weakconfirmation"] = 1
                else:
                    reported["weakdenial"] = 1

            # "Not fiction" is only contradicted by a fiction majority.
            if info == "Not fiction":
                if maxgenre == "non":
                    reported["weakconfirmation"] = 1
                elif maxgenre == "fic":
                    reported["weakdenial"] = 1

            if info in strongevidence:
                if maxgenre == strongevidence[info]:
                    reported["strongconfirmation"] = 1
                    if info in referenceworks:
                        couldbefiction = False
                else:
                    reported["strongdenial"] = 1

    if htid in modelindices:

        modelpredictions = dict()
        for genre, genrecolumn in modeldata.items():
            if genre not in options:
                # this column is not a genre!
                continue
            modelpredictions[genre] = float(genrecolumn[htid])

        predictionlist = utils.sortkeysbyvalue(modelpredictions,
                                               whethertoreverse=True)
        # Take the top prediction.
        modelprediction = predictionlist[0][1]

        # For purposes of this routine, treat biography as nonfiction:
        if modelprediction == "bio":
            modelprediction = "non"

        if maxgenre == modelprediction:
            reported["modelagrees"] = 1
            reported["modeldisagrees"] = 0
        else:
            reported["modeldisagrees"] = 1
            reported["modelagrees"] = 0
    else:
        reported["modelagrees"] = 0
        reported["modeldisagrees"] = 0

    if not couldbefiction:
        # Relabel in place so the caller's list is updated too.
        for i, pagegenre in enumerate(genresequence):
            if pagegenre == "fic":
                genresequence[i] = "non"

    return genresequence, reported
# Uses metadata to help assess degrees

import os, sys
import SonicScrewdriver as utils

rowindices, columns, metadata = utils.readtsv(

modelindices, modelcolumns, modeldata = utils.readtsv(

options = ["non", "bio", "poe", "dra", "fic"]

def censor(htid, genresequence):

    htid = utils.pairtreelabel(htid)
    # convert the htid into a dirty pairtree label for metadata matching

    # Create a dictionary with entries for all possible conditions, initially set negative.
    symptoms = [
        "weakconfirmation", "weakdenial", "strongconfirmation", "strongdenial",
        "modelagrees", "modeldisagrees"
    reported = dict()
    for symptom in symptoms:
        reported[symptom] = 0

    couldbefiction = True

    # Now we need to assess the largest genre in this volume.

import os, sys
import SonicScrewdriver as utils

rowindices, columns, metadata = utils.readtsv(

sourcedirectory = "/Users/tunder/Dropbox/pagedata/mixedtraining/genremaps/"

dirlist = os.listdir(sourcedirectory)

validnames = list()

# Keep every file whose name does not start with "." or "_".
# NOTE(review): the body of this test was missing; appending to
# validnames is reconstructed from the loop that follows -- confirm.
for filename in dirlist:
    if not (filename.startswith(".") or filename.startswith("_")):
        validnames.append(filename)

for filename in validnames:
    filepath = os.path.join(sourcedirectory, filename)

    with open(filepath, mode="r", encoding="utf-8") as f:
        filelines = f.readlines()

    # One line per page.
    numpages = len(filelines)

    # Drop the last four characters of the filename (presumably the
    # extension) before converting.
    htid = utils.pairtreelabel(filename[0:-4])
    # convert the htid into a dirty pairtree label for metadata matching

    genre = "unknown"
# plotter

import matplotlib.pyplot as plt
import SonicScrewdriver as utils
import pandas as pd
from scipy.stats.stats import pearsonr

# Load the human-dissensus table and the machine-accuracy table.
indices, columns, agreement = utils.readtsv(
    "/Users/tunder/Dropbox/pagedata/interrater/HumanDissensus.tsv"
)
indices2, columns2, confidence = utils.readtsv(
    "/Users/tunder/Dropbox/pagedata/interrater/ActualAccuracies.tsv"
)

# Report every index of the first table that the second table lacks.
for idx in indices:
    if idx in indices2:
        continue
    print(idx + " is missing.")

# Pair the two columns of interest in one frame and drop rows where either
# value is missing.
makeframe = {
    "human-agreement": agreement["agreement"],
    "machine-accuracy": confidence["accuracy"],
}

df = pd.DataFrame(makeframe, dtype="float").dropna()

# Correlation between human agreement and machine accuracy.
print(pearsonr(df["human-agreement"], df["machine-accuracy"]))

# Scatter plot drawn with red point markers.
plt.plot(df["human-agreement"], df["machine-accuracy"], "r.")
plt.xlabel("Human agreement")
plt.ylabel("Machine accuracy")
    reader = csv.reader(f)
    for fields in reader:
        idcode = fields[0]
        date = int(fields[8])
        datedict[idcode] = date

verbose = True

# Words whose contexts are collected by this script.
targetwords = {'crown', 'crowns', 'guinea', 'guineas', 'nickel', 'sovereign', 'sovereigns', 'pound', 'pounds', 'quid'}

contexts = []

for filename in filelist:

    htid = utils.pairtreelabel(filename.replace('.txt', ''))

    # The membership test used to be inverted: it looked the id up in
    # datedict only when the id was known to be absent, which always raised
    # KeyError. Volumes without a recorded date are now skipped.
    if htid not in datedict:
        if verbose:
            print(htid + ' has no date; skipping.')
        continue
    date = datedict[htid]

    filepath = os.path.join(sourcedir, filename)
    with open(filepath, encoding = 'utf-8') as f:
        filelines = f.readlines()
    pagelist = [filelines]

    # The wordcounter module expects a list of pages, each of which is a list of lines.
    # Ebooks have no pages -- at least as I currently receive them -- so we treat it
    # all as one giant page.
    # NOTE(review): the remainder of this loop body is missing from this copy
    # of the file.
def main(sourcedir, metapath, modeldir, outpath, pairtree = False):
    '''
    This function can be called from outside the module; it accepts
    path information and then iterates through all the files it
    finds in the metadata at "metapath."

    If the pairtree flag is True, we assume sourcedir is the root
    of a pairtree structure. Otherwise we assume it's a flat list.

    For every docid in the metadata index the volume's counts are read,
    the volume is classified with the models found in modeldir, and the
    resulting nonfiction probability, juvenile probability and word count
    are stored in new metadata columns ('nonficprob', 'juvenileprob',
    'wordcount'). A volume that cannot be read successfully receives the
    neutral probability 0.5 for both predictions.

    NOTE(review): outpath is not used by the code visible here; the step
    that writes the augmented metadata appears to be missing from this
    copy of the file.
    '''

    global allnames, top1000words

    alternatesource = '/projects/ichass/usesofscale/post23/englishmonographs1980-2016/'

    # We're going to store all the models, by name, in a dictionary:

    models = dict()

    modelpaths = glob.glob(modeldir + '*.p')

    for apath in modelpaths:
        # The model's name is its path minus the directory prefix and '.p'.
        name = apath.replace(modeldir, '')
        name = name.replace('.p', '')
        models[name] = loadamodel(apath)

    # Now get metadata.

    metadata = get_metadata(metapath)

    # One entry per docid, in metadata.index order, so that the lists line
    # up with the index when they are turned into Series below.
    nonficprobs = []
    juvieprobs = []
    wordcounts = []

    c = 0
    for docid in metadata.index:
        c += 1

        if pairtree:
            path1 = get_pairtree(sourcedir, docid)
            path2 = get_pairtree(alternatesource, docid)

            if os.path.isfile(path1):
                chosenpath = path1
            elif os.path.isfile(path2):
                chosenpath = path2
            else:
                chosenpath = None

            if chosenpath is None:
                # Neither location has the volume: record the failure
                # instead of calling counts4json on a missing/stale path.
                print('file not found')
                counts = None
                error = 'file not found'
                wordcount = 0
            else:
                counts, error, wordcount = counts4json(chosenpath, docid)

        else:
            path = os.path.join(sourcedir, utils.clean_pairtree(docid) + '.csv')
            counts, error, wordcount = counts4file(path)

        if error == 'success':
            nonficprob, juvenileprob = volume_classification(models, counts)
        else:
            # No usable counts: fall back to a neutral prediction.
            nonficprob = 0.5
            juvenileprob = 0.5

        nonficprobs.append(nonficprob)
        juvieprobs.append(juvenileprob)
        wordcounts.append(wordcount)

    metadata.loc[ : , 'nonficprob'] = pd.Series(nonficprobs, index = metadata.index)
    metadata.loc[ : , 'juvenileprob'] = pd.Series(juvieprobs, index = metadata.index)
    metadata.loc[ : , 'wordcount'] = pd.Series(wordcounts, index = metadata.index)

import matplotlib.pyplot as plt
import SonicScrewdriver as utils

# Tally, for each genre tag in a user-supplied CSV, how many rows fall in
# each decade.
targetfile = input('Path to input file? ')

# counts maps a tag to a Counter keyed by decade.
counts = dict()
alltags = set()
alldecades = set()
# allcounts is keyed by decade and is incremented once per counted tag.
allcounts = Counter()

with open(targetfile, encoding = 'utf-8') as f:
    reader = csv.DictReader(f)
    for row in reader:
        date = row['date']
        # Truncate the year to the start of its decade, e.g. 1897 -> 1890.
        decade = 10 * int(int(date)/10)
        tagset = utils.get_tagset(row['genretags'])
        for tag in tagset:
            if tag == 'chirandom' and ('chiscifi' in tagset):
            # NOTE(review): the body of the condition above is missing from
            # this copy of the file, so the script does not parse as-is.
            # Restore the original statement from version control.
            if tag not in counts:
                counts[tag] = Counter()

            counts[tag][decade] += 1
            allcounts[decade] += 1

# NOTE(review): nothing visible here ever adds to alltags or alldecades, so
# sorted_decades is empty unless the missing lines populated them.
sorted_decades = sorted(list(alldecades))
numdecs = len(sorted_decades)

# matplotlib format strings (colour + line/marker style).
colors = ['g-', 'b-', 'r-', 'k-', 'ro', 'go', 'bo', 'ko']
import csv
from collections import Counter
import SonicScrewdriver as utils

ficids = set()

# meta maps the "dirty" pairtree form of each htid to its full CSV row.
meta = {}

ficsource = "/Volumes/TARDIS/work/fiction/metadata/fiction_metadata.csv"
with open(ficsource, encoding="utf-8") as f:
    reader = csv.DictReader(f)
    # Keep the header so the same columns are available later.
    fieldnames = reader.fieldnames
    for row in reader:
        htid = row["htid"]
        dirtyhtid = utils.dirty_pairtree(htid)
        meta[dirtyhtid] = row

metasource = "/Volumes/TARDIS/work/metadata/MergedMonographs.tsv"

# One subject tally per genre of interest.
mysterysubjects, scifisubjects, gothsubjects = Counter(), Counter(), Counter()

gothclues = [
    "ghost stories",
    "gothic revival",
    "horror",
]

genretags = {}

def add_tag(genretags, htid, tagtoadd):
    '''Record tagtoadd among the genre tags stored for htid.

    genretags is a dictionary mapping a volume id to a set of tags. It is
    updated in place: the set for htid is created on first use, and
    tagtoadd is then added to it. Previously the set was created but the
    tag itself was never added. Returns None.
    '''
    genretags.setdefault(htid, set()).add(tagtoadd)
modelfolder = "/Volumes/TARDIS/work/moneycontext/"

# Paths of the three pickled artefacts stored in modelfolder.
modelpath = modelfolder + "logisticmodel.p"
standardizerpath = modelfolder + 'standardizer.p'
featurepath = modelfolder + 'featurelist.p'

# NOTE(review): pickle.load executes arbitrary code from the file it reads;
# these files must come from a trusted source.
with open(modelpath, 'rb') as f:
    logisticmodel = pickle.load(f)

with open(standardizerpath, 'rb') as f:
    standardizer = pickle.load(f)

with open(featurepath, 'rb') as f:
    features = pickle.load(f)

# Now load HathiTrust metadata.

rows, columns, table = utils.readtsv('/Volumes/TARDIS/work/metadata/MergedMonographs.tsv')

ambiguouswords = {
    'crown', 'crowns', 'guinea', 'guineas', 'nickel',
    'sovereign', 'sovereigns', 'pound', 'pounds', 'quid',
}

moneywords = {
    'dollar', 'dollars', 'dime', 'dimes', 'nickel', 'nickels',
    'pound', 'pounds', 'shilling', 'shillings', 'sovereign', 'sovereigns',
    'cent', 'cents', 'centime', 'centimes', 'crown', 'crowns',
    'halfcrown', 'half-crown', 'penny', 'pennies', 'pence',
    'farthing', 'farthings', 'franc', 'francs', 'guilder', 'guilders',
    'florin', 'florins', 'guinea', 'guineas', "ha'penny",
    'tuppence', 'twopence', 'sixpence', '|arabicprice|', '|price|', 'quid',
}

# Deliberately left out: 'quarter', 'quarters', 'mark' and 'marks'. Their
# monetary senses seemed rare enough, relative to their other senses, that
# including them would more likely add noise than help.
# |arabicprice| is the code the tokenizer in modelingcounter emits whenever
# it meets a number attached to £, $, ¢, s, or d. In the output that code is
# converted to |price|, for no very good reason.

wealthwords = {
    'fortune', 'fortunes', 'wealth', 'rich', 'riches',
    'money', 'moneys', 'fund', 'funds', 'sum', 'sums',
    'price', 'prices', 'priced',
}

# This is by no means an exhaustive list. Owe, loan, borrowed, etc.
# If we really want to get at the full range of words potentially
# print(roughaccuracy)

# Build one tab-separated "key<TAB>value" line per entry of accuracies.
# NOTE(review): the file is opened in the default read mode and outline is
# never written in the code visible here; the mode argument and the write
# call appear to be missing from this copy of the file.
with open("/Users/tunder/Dropbox/pagedata/interrater/ActualAccuracies.tsv",
          encoding="utf-8") as f:
    for key, value in accuracies.items():
        outline = key + "\t" + str(value) + "\n"

# Read the prediction metadata stored in firstdir.
metadatapath = os.path.join(firstdir, "predictionMetadata.tsv")
rowindices, columns, metadata = utils.readtsv(metadatapath)

# Gather the columns used for modelling into one table.
# NOTE(review): metadatatable is not created in the code visible here.
metadatatable['maxprob'] = metadata['maxprob']
metadatatable['gap'] = metadata['gap']
metadatatable['accuracy'] = accuracies
metadatatable['dissent'] = dissentperfile

# Every column is coerced to float.
data = pd.DataFrame(metadatatable, dtype="float")

# Fit a logistic regression predicting 'accuracy' from the other columns of
# data, then report how well the fitted values correlate with the target.
data['intercept'] = 1.0

# Select predictors by name rather than by position. data.columns[1:] only
# excluded the target when 'accuracy' happened to be the first column, which
# depends on how pandas orders the columns of a dict-built DataFrame.
train_cols = [col for col in data.columns if col != 'accuracy']

logit = sm.Logit(data['accuracy'], data[train_cols])
# The assignment previously had no right-hand side; the model must be fitted
# before it can predict.
result = logit.fit()
predictions = result.predict(data[train_cols])
print(pearsonr(data['accuracy'], predictions))
    metasource = pd.read_csv(args[1], sep='\t')

    missing = 0

    docstoprocess = metasource.docid

    for idx, docid in enumerate(docstoprocess):

        if idx % 100 == 1:

        if docid in translations:
            docid = translations[docid]

        path, postfix = utils.pairtreepath(docid, '')
        inpath = rootpath + path + postfix + '/' + utils.clean_pairtree(
            docid) + '.json.bz2'

        if os.path.isfile(inpath):
        elif 'uc1.b' in docid:
            newdoc = docid.replace('uc1.b', 'uc1.$b')
            path, postfix = utils.pairtreepath(newdoc, '')
            inpath = rootpath + path + postfix + '/' + utils.clean_pairtree(
                newdoc) + '.json.bz2'
            if os.path.isfile(inpath):
                translations[docid] = newdoc
                missing += 1
                print(missing, inpath, 'not found.')
                if thisreader not in readerowners[f]:


allfiles = tagset
# This is a list of all the filenames (note, filenames not docids)
# that we found in the /readers sourcedir.

# Metadata table indexed by docid; every column is read as a generic object.
train1 = pd.read_csv('../bzipmeta.csv', dtype = 'object', index_col = 'docid')

# Set of known docids, for fast membership tests.
tidx = set(train1.index.values)
for filename in allfiles:
    docid = filename.replace('.csv', '')
    if utils.dirty_pairtree(docid) not in tidx:
    # NOTE(review): the body of the condition above is missing from this copy
    # of the file, so the script does not parse as-is. Restore the original
    # statement from version control.

genrestocheck = ['fic', 'poe']
equivalences = {'non', 'bio', 'other'}

# One (initially empty) list of volumes per genre being checked.
volumesingenre = dict()
for g in genrestocheck:
    volumesingenre[g] = []

alldocids = set()

for filename, owners in readerowners.items():
    path = paths[filename][0]
    if 'metadat' in filename:
# getidstoadd

import SonicScrewdriver as utils
import os

# Collect the first tab-separated field of every line of the corrected
# metadata file.
with open('/Users/tunder/Dropbox/GenreProject/python/granger/correctedmeta.tsv', encoding = 'utf-8') as f:
    filelines = f.readlines()

ids2get = [line.split('\t')[0] for line in filelines]

# Ids already on disk: names of the '.poe.tsv' files with that suffix removed.
fileswehave = os.listdir('/Users/tunder/Dropbox/GenreProject/python/granger/elite/')
idswehave = {
    name.replace('.poe.tsv', '')
    for name in fileswehave
    if name.endswith('.poe.tsv')
}

# Write, one per line, every wanted id that is absent from idswehave in both
# its given form and its clean-pairtree form.
with open('/Users/tunder/Dropbox/GenreProject/python/granger/ids2get.tsv', mode = 'w', encoding = 'utf-8') as f:
    for anid in ids2get:
        if anid in idswehave or utils.clean_pairtree(anid) in idswehave:
            continue
        f.write(utils.dirty_pairtree(anid) + '\n')