def choose_cascade(htid):
    '''Decide which metadata-level genre flags apply to one volume.

    Evidence comes from two module-level tables: the semicolon-delimited
    "genres" string in ``metadata`` and, when the volume is listed in
    ``modelindices``, the volume-level scores in ``modeldata``.

    htid -- clean pairtree filename; it is converted with
    utils.pairtreelabel before any lookup.

    Returns the tuple (probablybiography, probablydrama, probablyfiction,
    probablypoetry, maybefiction); every element is a boolean.
    '''

    global rowindices, columns, metadata, modelindices, modeldata

    probablydrama = False
    probablypoetry = False
    probablybiography = False
    probablyfiction = False
    maybefiction = False

    htid = utils.pairtreelabel(htid)
    # convert the clean pairtree filename into a dirty pairtree label for metadata matching

    if htid not in rowindices:
        # We have no metadata for this volume, so there is nothing to parse.
        print("Volume missing from ExtractedMetadata.tsv: " + htid)
    else:
        # The genres field is a semicolon-delimited list of items.
        genreinfo = set(metadata["genres"][htid].split(";"))

        probablybiography = not genreinfo.isdisjoint(("Biography", "Autobiography"))
        probablyfiction = not genreinfo.isdisjoint(("Fiction", "Novel"))
        probablypoetry = not genreinfo.isdisjoint(("Poetry", "Poems"))
        probablydrama = not genreinfo.isdisjoint(("Drama", "Tragedies", "Comedies"))

    if htid in modelindices:

        titlewords = metadata["title"][htid].lower().split()

        # The checks below treat the result of maxoption as a position in this
        # tuple: 1 = drama, 2 = fiction, 4 = poetry.
        # NOTE(review): the fifth score is assumed to live under the "poe" key,
        # matching the genre codes used elsewhere in this file -- confirm.
        maxgenre = maxoption((modeldata["bio"][htid], modeldata["dra"][htid],
                              modeldata["fic"][htid], modeldata["non"][htid],
                              modeldata["poe"][htid]))

        # Parenthesised on purpose: without the brackets "and" binds tighter
        # than "or", so a title containing "poetical" would be flagged as
        # poetry even when the model's top genre is something else.
        if maxgenre == 4 and ("poems" in titlewords or "poetical" in titlewords):
            probablypoetry = True

        if maxgenre == 1:
            probablydrama = True

        if maxgenre == 2:
            maybefiction = True

    return probablybiography, probablydrama, probablyfiction, probablypoetry, maybefiction
def get_genrevector(volumeIDs, boundarydef):
    '''Build a numeric class vector for a list of volumes.

    volumeIDs -- sequence of volume identifiers; each is converted with
    utils.pairtreelabel before being looked up.
    boundarydef -- name of the class boundary. Only
    "nonepistolary / epistolary" is recognised; for any other value the
    vector is returned as all zeros.

    Returns a numpy array of len(volumeIDs) floats: 1 where the volume is in
    the module-level ``epindices``, 0 otherwise. A volume found in neither
    ``epindices`` nor ``nonindices`` is reported on stdout and left at 0.
    '''
    genrevector = np.zeros(len(volumeIDs))

    if boundarydef != "nonepistolary / epistolary":
        return genrevector

    for idx, volID in enumerate(volumeIDs):
        cleanID = utils.pairtreelabel(volID)

        if cleanID in epindices:
            genrevector[idx] = 1
        elif cleanID not in nonindices:
            # Known non-epistolary volumes keep the default 0; only volumes
            # absent from both index collections are reported as missing.
            print("Error, missing in metadata: " + cleanID)

    return genrevector
def get_metadata_evidence(htid, rowindices, columns, metadata):
    '''Report which genres are asserted by a volume's metadata record.

    htid -- clean pairtree filename; converted with utils.pairtreelabel
    before lookup.
    rowindices -- collection of volume labels for which metadata exists.
    columns -- accepted for interface compatibility; not used here.
    metadata -- table indexed as metadata["genres"][label], holding a
    semicolon-delimited string of genre items.

    Returns a dict with the boolean keys "drama", "poetry", "biography" and
    "fiction". All four are False when the volume has no metadata.
    '''
    metadata_evidence = {
        "drama": False,
        "poetry": False,
        "biography": False,
        "fiction": False,
    }

    # Genre items found in the metadata, mapped to the evidence flag they set.
    flag_for_item = {
        "Biography": "biography",
        "Autobiography": "biography",
        "Fiction": "fiction",
        "Novel": "fiction",
        "Poetry": "poetry",
        "Poems": "poetry",
        "Drama": "drama",
        "Tragedies": "drama",
        "Comedies": "drama",
    }

    htid = utils.pairtreelabel(htid)
    # convert the clean pairtree filename into a dirty pairtree label for metadata matching

    if htid not in rowindices:
        # We have no metadata for this volume.
        return metadata_evidence

    # The parsing below must sit outside the "not found" branch: nested after
    # the return it could never run, and every volume came back all-False.
    for info in metadata["genres"][htid].split(";"):
        flag = flag_for_item.get(info)
        if flag is not None:
            metadata_evidence[flag] = True

    return metadata_evidence
# Module-level accumulators for the fiction and drama evaluation.
# NOTE(review): the FP/TN/FN/TP suffixes presumably mean false/true
# positives/negatives; the code that fills these lists is not visible here.
fictionFPs = list()
fictionTNs = list()
fictionFNs = list()
dramaTPs = list()
dramaFPs = list()
dramaTNs = list()
dramaFNs = list()

genrefeatures = dict()
genreprecisions = dict()

modeledvols = dict()

# For each .predict file, derive the matching .map filename and read that file
# from whichever of the two source folders lists it.
for filename in predicts:
    mapname = filename.replace('.predict', '.map')
    labelid = utils.pairtreelabel(filename.replace('.predict', ''))
    fileid = filename.replace('.predict', '')

    if mapname in firstmaps:
        firstpath = os.path.join(firstsource, mapname)
        if os.path.isfile(firstpath):
            with open(firstpath, encoding = 'utf-8') as f:
                filelines = f.readlines()
                success = True
            wordcounts = firstwordcounts[fileid]
            # NOTE(review): the next assignment runs unconditionally and overwrites
            # the success = True set above; an `else:` branch looks to be missing
            # here -- confirm against the full script.
            success = False
    elif mapname in secondmaps:
        secondpath = os.path.join(secondsource, mapname)
        if os.path.isfile(secondpath):
            # NOTE(review): the body of this `with` is not present in the visible code.
            with open(secondpath, encoding = 'utf-8') as f:
import os, sys
import SonicScrewdriver as utils

# Export one tab-separated metadata row for every feature file in sourcedir.
rowindices, columns, metadata = utils.readtsv("/Users/tunder/Dropbox/pagedata/metascrape/EnrichedMetadata.tsv")

sourcedir = "/Users/tunder/Dropbox/pagedata/newfeatures/oldfeatures/"

dirlist = os.listdir(sourcedir)

htids = list()

ctr = 0
with open("/Users/tunder/Dropbox/pagedata/trainingmeta.tsv", mode="w", encoding="utf-8") as f:
    for filename in dirlist:

        # Skip hidden files and names too short to carry the 7-character suffix.
        if len(filename) <= 7 or filename.startswith("."):
            continue

        stripped = filename[:-7]
        htid = utils.pairtreelabel(stripped)

        # Every cell, including the last one, is followed by a tab.
        outline = "".join(metadata[column][htid] + '\t' for column in columns)
        f.write(outline + "\n")
# MARC genre labels that count as strong evidence, mapped to the page-level
# genre code each one supports.
_STRONG_GENRE_EVIDENCE = {
    "Fiction": "fic",
    "Novel": "fic",
    "Biography": "non",
    "Autobiography": "non",
    "Poetry": "poe",
    "Poems": "poe",
    "Drama": "dra",
    "Tragedies": "dra",
    "Comedies": "dra",
    "Catalog": "non",
    "Dictionary": "non",
    "Bibliographies": "non",
}


def metadata_check(htid, inputsequence):
    '''Assess whether prior metadata confirms or denies page-level genre predictions.

    Two kinds of prior evidence are compared with ``maxgenre``, the second
    value returned by sequence_to_counts for the page-level sequence:

    * genre items extracted from the MARC record (module-level ``metadata``);
    * the scores of a volume-level probabilistic model (module-level
      ``modeldata``), restricted to the columns named in ``options``.

    htid -- volume identifier; converted with utils.pairtreelabel before lookup.
    inputsequence -- iterable of page-level genre predictions. It is copied,
    never mutated.

    Returns a dictionary mapping each of "weakconfirmation", "weakdenial",
    "strongconfirmation", "strongdenial", "modelagrees" and "modeldisagrees"
    to 0 or 1. Only this dictionary is returned; maxgenre is not.
    '''
    global options, rowindices, columns, metadata, modelindices, modelcolumns, modeldata

    genresequence = list(inputsequence)
    # make a defensive copy of incoming parameter

    htid = utils.pairtreelabel(htid)
    # convert the htid into a dirty pairtree label for metadata matching

    # Create a dictionary with entries for all possible conditions, initially set negative.
    symptoms = [
        "weakconfirmation", "weakdenial", "strongconfirmation", "strongdenial",
        "modelagrees", "modeldisagrees"
    ]
    # The first four of these symptoms reflect metadata extracted from the MARC record. Weakconfirmation and
    # weakdenial are based on flags extracted from controlfield 008 which I find are not very reliable as guides.
    # Strongconfirmation and strongdenial are based on strings extracted from other fields that are more
    # specific and reliable as indications of genre. Modelagrees and modeldisagrees reflect the alignment of
    # page-level predictions with an earlier volume-level model of the corpus.

    confirmations = {symptom: 0 for symptom in symptoms}

    _, maxgenre = sequence_to_counts(genresequence)

    if htid not in rowindices and htid not in modelindices:
        return confirmations

    if htid in rowindices:

        # The genres field is a semicolon-delimited list of items.
        for info in metadata["genres"][htid].split(";"):

            if info == "Not fiction":
                # Weak evidence: it only speaks to the non/fic distinction.
                if maxgenre == "non":
                    confirmations["weakconfirmation"] = 1
                elif maxgenre == "fic":
                    confirmations["weakdenial"] = 1
                continue

            expected = _STRONG_GENRE_EVIDENCE.get(info)
            if expected is None:
                # This item carries no genre evidence we recognise.
                continue

            if maxgenre == expected:
                confirmations["strongconfirmation"] = 1
            else:
                confirmations["strongdenial"] = 1

    if htid in modelindices:

        # Columns that are not listed in options are not genres; skip them.
        modelpredictions = {
            genre: float(genrecolumn[htid])
            for genre, genrecolumn in modeldata.items()
            if genre in options
        }

        # With no genre scores there is no model evidence; both flags stay 0.
        if modelpredictions:
            # Only the top-scoring genre is used, so select it directly
            # rather than sorting every score.
            modelprediction = max(modelpredictions, key=modelpredictions.get)

            # For purposes of this routine, treat biography as nonfiction:
            if modelprediction == "bio":
                modelprediction = "non"

            agrees = maxgenre == modelprediction
            confirmations["modelagrees"] = int(agrees)
            confirmations["modeldisagrees"] = int(not agrees)

    # When the volume is absent from the model, modelagrees and modeldisagrees
    # keep their initial 0. They must not be reset after the comparison above,
    # or the model evidence would always be discarded.
    return confirmations
# Score every row of `data` with the fitted model parameters.
# NOTE(review): logitpredict, parameters and data are defined outside the visible code.
predictions = logitpredict(parameters, data)

# with open("/Volumes/TARDIS/output/models/results.txt", mode ="w") as f:
# 	for idx, prediction in enumerate(predictions):
# 		f.write(str(idx) + '\t' + data.index[idx] + '\t' + str(prediction) + '\n')

# This will also do it more easily:

# with open("/Volumes/TARDIS/output/models/PredictAccuracy.p", mode = "r+b") as f:
# 	model = pickle.load(f)

# otherpredictions = model.predict(data)

import SonicScrewdriver as utils

# Convert each index entry of `data` with utils.pairtreelabel.
indices = [utils.pairtreelabel(x) for x in data.index]

# Pair each prediction with its label, then split the pairs back into two tuples.
# NOTE(review): despite the "sorted" names, `decorated` is never sorted in the
# visible code, so both tuples keep the original order -- a sort step may be missing.
decorated = [x for x in zip(predictions, indices)]
sortedpredictions, sortedindices = zip(*decorated)

with open("/Users/tunder/Dropbox/PythonScripts/hathimeta/ExtractedMetadata.tsv", mode = "r", encoding = "utf-8") as f:
	filelines = f.readlines()

# Skip the first line, then key every remaining row by its first tab-separated
# field; the stored value is the rest of the row re-joined with tabs.
# NOTE(review): linedict is not initialised anywhere in the visible code.
for line in filelines[1:]:
	line = line.rstrip()
	fields = line.split('\t')
	headlessline = '\t'.join(fields[1:])
	linedict[fields[0]] = headlessline
        for anotherreading in listoftuples[1:]:
            readerb = anotherreading[0]
            alreadychecked.append((readerb, readera))
            if readera == readerb or (readera, readerb) in alreadychecked:
                genrelistA = reading[1]
                genrelistB = anotherreading[1]
                divergence = comparelists(genrelistA, genrelistB,
                totaldivergence += divergence
                sanitycheck += 1

    assert graphlinks == sanitycheck

    agreement = (potentialcomparisons - totaldivergence)
    agreementpercent = agreement / potentialcomparisons
    volumepercents[htid] = agreementpercent
    overallcomparisons += potentialcomparisons
    overallagreement += agreement

print("Average human agreement: " + str(overallagreement / overallcomparisons))

# Write one "<pairtree label><TAB><agreement fraction>" line per volume.
# The file has to be opened for writing (open() defaults to read mode), and
# each formatted line has to be written out rather than just built.
with open("/Users/tunder/Dropbox/pagedata/interrater/HumanAgreement.tsv",
          mode="w", encoding="utf-8") as f:
    for key, value in volumepercents.items():
        outline = utils.pairtreelabel(key) + "\t" + str(value) + "\n"
        f.write(outline)
# Load the two metadata tables that are searched below.
rows19c, columns19c, table19c = utils.readtsv('/Volumes/TARDIS/work/metadata/19cMetadata.tsv')

rows20c, columns20c, table20c = utils.readtsv('/Volumes/TARDIS/work/metadata/20cMonographMetadata.tsv')

with open("/Users/tunder/Dropbox/GenreProject/python/piketty/roughfiction.txt", encoding = 'utf-8') as f:
	filelines = f.readlines()

# The volume id is the first tab-separated field of every line.
idlist = [utils.pairtreelabel(x.split('\t')[0]) for x in filelines]

filteredrows = list()

missing = 0

# Look each id up in the 19c table first, then in the 20c table, copying the
# matching row into a dict keyed by column name.
# NOTE(review): nothing in the visible code appends rowdict to filteredrows or
# increments `missing`, and the final inner loop has no body -- the rest of
# this loop appears to be cut off.
for anid in idlist:
	if anid in rows19c:
		genrestring = table19c["genres"][anid]
		rowdict = dict()
		for col in columns19c:
			rowdict[col] = table19c[col][anid]
	elif anid in rows20c:
		genrestring = table20c["genres"][anid]
		rowdict = dict()
		for col in columns20c:
import os, sys
import SonicScrewdriver as utils
import random

rowindices, columns, metadata = utils.readtsv("/Users/tunder/Dropbox/PythonScripts/hathimeta/ExtractedMetadata.tsv")

# Draw 2000 volume labels at random from the metadata rows.
initialsample = random.sample(rowindices, 2000)

directorylist = os.listdir("/Users/tunder/Dropbox/pagedata/mixedtraining/pagefeatures")
existingfiles = list()

# NOTE(review): the `if` below has no body, and the htid computed after it is
# never stored in existingfiles -- a `continue` and an append look to be missing.
for filename in directorylist:
	if filename.startswith(".") or filename.startswith("_"):

	htid = utils.pairtreelabel(filename[0:-7])

# Count how many sampled volumes are already listed in existingfiles.
# NOTE(review): toremove is never filled in the visible code, and the final
# loop over it has no body.
counter = 0
toremove = list()
for htid in initialsample:
	if htid in existingfiles:
		counter +=1

print("Found " + str(counter) + " duplicates.")
for htid in toremove:

genresrepresented = set()
for htid in initialsample:
	for reading in listoftuples:
		readera = reading[0]
		predictedgenres = reading[1]

		divergence = comparelists(predictedgenres, truegenres, genremistakes, correctbygenre, wordcounts)
		totaldivergence += divergence

	agreement = (potentialcomparisons - totaldivergence)
	agreementpercent = agreement / potentialcomparisons
	volumepercents[htid] = agreementpercent
	overallcomparisons += potentialcomparisons
	overallagreement += agreement

print("Average human agreement: " + str(overallagreement / overallcomparisons))

# Write one "<pairtree label><TAB><agreement fraction>" line per volume.
with open("/Users/tunder/Dropbox/pagedata/interrater/HumanDissensus.tsv", mode="w", encoding = "utf-8") as f:
    for key, value in volumepercents.items():
        outline = utils.pairtreelabel(key) + "\t" + str(value) + "\n"
        # Without this write the file is created (and truncated) but left empty.
        f.write(outline)

import ConfusionMatrix
ConfusionMatrix.confusion_matrix(correctbygenre, genremistakes)

for filename in predicts:
with open(
        return False
import os, sys
import SonicScrewdriver as utils

# Export one tab-separated row of enriched metadata for every feature file
# found in sourcedir.
rowindices, columns, metadata = utils.readtsv(
    "/Users/tunder/Dropbox/pagedata/metascrape/EnrichedMetadata.tsv")

sourcedir = "/Users/tunder/Dropbox/pagedata/newfeatures/oldfeatures/"

dirlist = os.listdir(sourcedir)

htids = list()

ctr = 0
# The file must be opened with mode="w": open() defaults to read mode, in
# which the f.write call below fails.
with open("/Users/tunder/Dropbox/pagedata/trainingmeta.tsv",
          mode="w", encoding="utf-8") as f:
    for filename in dirlist:

        # Skip hidden files and names too short to carry the 7-character suffix.
        if len(filename) > 7 and not filename.startswith("."):
            stripped = filename[:-7]
            htid = utils.pairtreelabel(stripped)
            # Every cell, including the last one, is followed by a tab.
            outline = "".join(metadata[column][htid] + '\t' for column in columns)
            f.write(outline + "\n")
# Keep only real data files: names starting with "." or "_" are skipped.
# The accepted names have to be collected here, otherwise validnames stays
# empty and the loop over it never runs.
validnames = [
    filename for filename in dirlist
    if not filename.startswith((".", "_"))
]


for filename in validnames:
    filepath = os.path.join(sourcedirectory, filename)

    with open(filepath, mode="r", encoding="utf-8") as f:
        filelines = f.readlines()

    numpages = len(filelines)

    htid = utils.pairtreelabel(filename[0:-4])
    # convert the htid into a dirty pairtree label for metadata matching

    genre = "unknown"

    if htid in rowindices:

        genrestring = metadata["genres"][htid]
        genreinfo = genrestring.split(";")
        # It's a semicolon-delimited list of items.

        for info in genreinfo:

            if info == "Not fiction":
                genre = "non"
