# CreateStupidPredictions.py

import os, sys
import SonicScrewdriver as utils

rowindices, columns, metadata = utils.readtsv(
    "/Users/tunder/Dropbox/PythonScripts/hathimeta/ExtractedMetadata.tsv")

sourcedirectory = "/Users/tunder/Dropbox/pagedata/mixedtraining/genremaps/"

dirlist = os.listdir(sourcedirectory)

validnames = list()

for filename in dirlist:
    if not (filename.startswith(".") or filename.startswith("_")):
        validnames.append(filename)

for filename in validnames:
    filepath = os.path.join(sourcedirectory, filename)

    with open(filepath, mode="r", encoding="utf-8") as f:
        filelines = f.readlines()

    numpages = len(filelines)

    htid = utils.pairtreelabel(filename[0:-4])
    # convert the htid into a dirty pairtree label for metadata matching
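    # e.g. "uc2.ark+=13960=t1234" becomes "uc2.ark:/13960/t1234" -- the
    # filename-safe '+' and '=' turn back into ':' and '/'.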

    genre = "unknown"

# Example 2

# refine fiction

import SonicScrewdriver as utils

def passfilter(genrestring):
	fields = genrestring.split(';')
	if "Autobiography" in fields or "Biography" in fields:
		return False
	else:
		return True
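
# For instance, passfilter("Fiction;Biography") returns False, while
# passfilter("Fiction;Novel") returns True.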

rows19c, columns19c, table19c = utils.readtsv('/Volumes/TARDIS/work/metadata/19cMetadata.tsv')

rows20c, columns20c, table20c = utils.readtsv('/Volumes/TARDIS/work/metadata/20cMonographMetadata.tsv')

with open("/Users/tunder/Dropbox/GenreProject/python/piketty/roughfiction.txt", encoding = 'utf-8') as f:
	filelines = f.readlines()

idlist = [utils.pairtreelabel(x.split('\t')[0]) for x in filelines]

filteredrows = list()

missing = 0

for anid in idlist:
	if anid in rows19c:
		genrestring = table19c["genres"][anid]
		rowdict = dict()
		for col in columns19c:
			rowdict[col] = table19c[col][anid]
	elif anid in rows20c:

# Example 3

        if htid in wordcountsbyfile:
            wordcountsbyfile[htid].append(count)
        else:
            wordcountsbyfile[htid] = [count]

    return wordcountsbyfile


# Begin main script.

TOL = 0.1
THRESH = 0.80

genrestocheck = ['fic', 'poe', 'dra']

metadatapath = '/Volumes/TARDIS/work/metadata/MergedMonographs.tsv'
rows, columns, table = utils.readtsv(metadatapath)

firstsource = "/Users/tunder/Dropbox/pagedata/to1923features/genremaps/"
secondsource = "/Users/tunder/Dropbox/pagedata/seventhfeatures/genremaps/"

firstmaps = os.listdir(firstsource)
secondmaps = os.listdir(secondsource)

firstwordcounts = loadwordcounts(firstsource)
secondwordcounts = loadwordcounts(secondsource)

predictsource = '/Users/tunder/Dropbox/pagedata/production/crosspredicts/'

predicts = os.listdir(predictsource)
predicts = [x for x in predicts if not x.startswith('.')]
# plotter

import matplotlib.pyplot as plt
import SonicScrewdriver as utils
import pandas as pd
from scipy.stats.stats import pearsonr

indices, columns, agreement = utils.readtsv(
    "/Users/tunder/Dropbox/pagedata/interrater/HumanDissensus.tsv")

indices2, columns2, confidence = utils.readtsv(
    "/Users/tunder/Dropbox/pagedata/interrater/ActualAccuracies.tsv")

for idx in indices:
    if idx not in indices2:
        print(idx + " is missing.")

makeframe = dict()

makeframe["human-agreement"] = agreement["agreement"]
makeframe["machine-accuracy"] = confidence["accuracy"]

df = pd.DataFrame(makeframe, dtype="float")
df = df.dropna()

print(str(pearsonr(df["human-agreement"], df["machine-accuracy"])))

plt.plot(df["human-agreement"], df["machine-accuracy"], "r.")
plt.xlabel("Human agreement")
plt.ylabel("Machine accuracy")
plt.axis([0, 1.02, 0, 1.02])
plt.show()
# Uses metadata to help assess degrees

import os, sys
import SonicScrewdriver as utils

rowindices, columns, metadata = utils.readtsv(
    "/Users/tunder/Dropbox/PythonScripts/hathimeta/ExtractedMetadata.tsv")

modelindices, modelcolumns, modeldata = utils.readtsv(
    "/Users/tunder/Dropbox/PythonScripts/hathimeta/newgenretable.txt")

options = ["non", "bio", "poe", "dra", "fic"]


def censor(htid, genresequence):

    htid = utils.pairtreelabel(htid)
    # convert the htid into a dirty pairtree label for metadata matching

    # Create a dictionary with entries for all possible conditions, initially set negative.
    symptoms = [
        "weakconfirmation", "weakdenial", "strongconfirmation", "strongdenial",
        "modelagrees", "modeldisagrees"
    ]
    reported = dict()
    for symptom in symptoms:
        reported[symptom] = 0

    couldbefiction = True

    # Now we need to assess the largest genre in this volume.
    genrecounts = dict()
    genrecounts['fic'] = 0
    genrecounts['poe'] = 0
    genrecounts['dra'] = 0
    genrecounts['non'] = 0

# Example 6

from multiprocessing import Pool  # used by main() below; the original import is not shown in this excerpt

def main():
	global testrun, datapath, slicepath, metadatapath, current_working, metaoutpath, errorpath, pagevocabset

	if testrun:
		filelist = os.listdir(datapath)
		HTIDs = set()
		for afilename in filelist:
			if not (afilename.startswith(".") or afilename.startswith("_")):
				HTIDs.add(afilename)

	else:
		with open(slicepath, encoding="utf-8") as file:
			HTIDlist = file.readlines()

		HTIDs = set([x.rstrip() for x in HTIDlist])
		del HTIDlist

	## discard bad volume IDs

	with open(metadatapath + "badIDs.txt", encoding = 'utf-8') as file:
		filelines = file.readlines()

	for line in filelines:
		line = line.rstrip()
		line = line.split(delim)
		if line[0] in HTIDs:
			HTIDs.discard(line[0])

	if not os.path.isfile(metaoutpath):
		with open(metaoutpath, 'w', encoding = 'utf-8') as f:
			f.write("volID\ttotalwords\tprematched\tpreenglish\tpostmatched\tpostenglish\n")

	print(len(HTIDs))

	# Let's get some metadata to create metadata features.

	if testrun:
		rowindices, columns, metadata = utils.readtsv("/Users/tunder/Dropbox/PythonScripts/hathimeta/ExtractedMetadata.tsv")
	else:
		rowindices, columns, metadata = utils.readtsv("/projects/ichass/usesofscale/hathimeta/ExtractedMetadata.tsv")

	metadata_clues = list()
	for aHTID in HTIDs:
		evidence = get_metadata_evidence(aHTID, rowindices, columns, metadata)
		metadata_clues.append(evidence)

	assert len(HTIDs) == len(metadata_clues)
	file_tuples = zip(HTIDs, metadata_clues)

	pool = Pool(processes = 12)
	res = pool.map_async(process_a_file, file_tuples)

	# After all files are processed, write metadata, errorlog, and counts of phrases.
	res.wait()
	resultlist = res.get()

	processedmeta = list()
	errorlog = list()
	phrasecount = dict()

	for file_dict in resultlist:
		processedmeta.append(file_dict["metadata"])
		errorlog.extend(file_dict["errors"])
		htid = file_dict["htid"]

	# Metadata.

	with open(metaoutpath, mode = 'a', encoding = 'utf-8') as file:
		for metatuple in processedmeta:
			outlist = [x for x in metatuple]
			outline = delim.join(outlist) + '\n'
			file.write(outline)

	# Write the errorlog.

	if len(errorlog) > 0:
		with open(errorpath, mode = 'w', encoding = 'utf-8') as file:
			for line in errorlog:
				file.write(line + '\n')

	# Write phrase counts.

	# with open(phrasecountpath, mode="w", encoding = "utf-8") as file:
	#     j = json.dumps(phrasecount)
	#     file.write(j)

	print("Done.")
	pool.close()
	pool.join()

# Example 7

print()
# print("ROUGH MICROACCURACY:")
# print(roughaccuracy)
print("SMOOTHED MICROACCURACY:")
print(smoothaccuracy)
print("COALESCED MICROACCURACY:")
print(coalaccuracy)

with open("/Users/tunder/Dropbox/pagedata/interrater/ActualAccuracies.tsv", mode = "w", encoding="utf-8") as f:
	f.write("htid\taccuracy\n")
	for key, value in accuracies.items():
		outline = key + "\t" + str(value) + "\n"
		f.write(outline)

metadatapath = os.path.join(firstdir, "predictionMetadata.tsv")
rowindices, columns, metadata = utils.readtsv(metadatapath)

metadatatable['maxprob'] = metadata['maxprob']
metadatatable['gap'] = metadata['gap']
metadatatable['accuracy'] = accuracies
metadatatable['dissent'] = dissentperfile

data = pd.DataFrame(metadatatable, dtype = "float")

data['intercept'] = 1.0
train_cols = data.columns[1:]
logit = sm.Logit(data['accuracy'], data[train_cols])
result = logit.fit()
print(result.summary())
predictions = result.predict(data[train_cols])
print(pearsonr(data['accuracy'], predictions))

import os, sys
import SonicScrewdriver as utils
import random

rowindices, columns, metadata = utils.readtsv("/Users/tunder/Dropbox/PythonScripts/hathimeta/ExtractedMetadata.tsv")

initialsample = random.sample(rowindices, 2000)

directorylist = os.listdir("/Users/tunder/Dropbox/pagedata/mixedtraining/pagefeatures")
existingfiles = list()

for filename in directorylist:
	if filename.startswith(".") or filename.startswith("_"):
		continue

	htid = utils.pairtreelabel(filename[0:-7])
	existingfiles.append(htid)

counter = 0
toremove = list()
for htid in initialsample:
	if htid in existingfiles:
		counter += 1
		toremove.append(htid)

print("Found " + str(counter) + " duplicates.")
for htid in toremove:
	initialsample.remove(htid)

# Example 9

# Count words in the fiction files drawn from the topic-modeling sample.
# count_words() is defined earlier in the original script and is not shown
# in this excerpt.

import os, sys
import SonicScrewdriver as utils

rows, columns, table = utils.readtsv(
    '/Users/tunder/Dropbox/GenreProject/metadata/topicmodelingsample.tsv')

sourcedir = "/Volumes/TARDIS/work/moneytexts/"

outtable = list()  # assumed initializations; the head of the excerpt is missing
counter = 0

for row in rows:
    filename = utils.pairtreefile(row) + ".fic.txt"
    filepath = os.path.join(sourcedir, filename)
    if os.path.isfile(filepath):
        tokencount, wordcount = count_words(filepath)
    else:
        print("Missing file: " + filepath)
        sys.exit(0)

    idcode = table["HTid"][row]
    date = str(utils.simple_date(row, table))
    author = table["author"][row]
    title = table["title"][row]  # column name assumed; this line was garbled in the source
    newrow = [idcode, date, tokencount, wordcount, author, title]
    outtable.append(newrow)
    print(counter)
    counter += 1
# Generate Cotraining Set

# This script uses a set of volumes already classified and sorted by a model
# in order to generate additional training data for a new model.

import SonicScrewdriver as utils
from shutil import copyfile

indices, columns, metadata = utils.readtsv(
    "/Volumes/TARDIS/work/cotrain/sortedcotrain.tsv")

toget = indices[-200:]

toget = [utils.pairtreefile(x) for x in toget]
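# pairtreefile is the inverse conversion: e.g. "uc2.ark:/13960/t1234"
# becomes the filename-safe "uc2.ark+=13960=t1234".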

genredir = "/Volumes/TARDIS/work/cotrain/top200/genremaps/"
featuredir = "/Volumes/TARDIS/work/cotrain/top200/pagefeatures/"

for htid in toget:

    featuresource = "/Volumes/TARDIS/work/cotrain/pagefeatures/" + htid + ".pg.tsv"
    featuredestination = "/Volumes/TARDIS/work/cotrain/top200/pagefeatures/" + htid + ".pg.tsv"
    copyfile(featuresource, featuredestination)

    genresource = "/Volumes/TARDIS/work/cotrain/predictions/" + htid + ".predict"
    genredestination = "/Volumes/TARDIS/work/cotrain/top200/genremaps/" + htid + ".map"
    with open(genresource, mode="r", encoding="utf-8") as f:
        filelines = f.readlines()

    with open(genredestination, mode="w", encoding="utf-8") as f:
        for line in filelines:
            line = line.rstrip()

# Example 14

# Figures out what call numbers mean for genre

import os, sys
import SonicScrewdriver as utils

rowindices, columns, metadata = utils.readtsv(
    "/Users/tunder/Dropbox/pagedata/metascrape/EnrichedMetadata.tsv")

options = ["non", "bio", "poe", "dra", "fic"]

modelindices, modelcolumns, modeldata = utils.readtsv(
    "/Users/tunder/Dropbox/PythonScripts/hathimeta/newgenretable.txt")


def keywithmaxval(dictionary):
    maxval = 0
    maxkey = ""

    for key, value in dictionary.items():
        if value > maxval:
            maxval = value
            maxkey = key

    return maxkey
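
# e.g. keywithmaxval({'fic': 120, 'poe': 8, 'dra': 2}) returns 'fic'.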


def sequence_to_counts(genresequence):
    '''Converts a sequence of page-level predictions to
    a dictionary of counts reflecting the number of pages
    assigned to each genre. Also reports the largest genre.'''

    genrecounts = dict()
    genrecounts['fic'] = 0
    genrecounts['poe'] = 0
    genrecounts['dra'] = 0
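
    # A guess at how the body plausibly continues, given the docstring
    # (the remainder of this function is cut off in the excerpt):
    #
    #     for genre in genresequence:
    #         if genre in genrecounts:
    #             genrecounts[genre] += 1
    #     return genrecounts, keywithmaxval(genrecounts)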
# plotter

import matplotlib.pyplot as plt
import SonicScrewdriver as utils
import pandas as pd
from scipy.stats.stats import pearsonr

indices, columns, agreement = utils.readtsv("/Users/tunder/Dropbox/pagedata/interrater/HumanDissensus.tsv")

indices2, columns2, confidence = utils.readtsv("/Users/tunder/Dropbox/pagedata/interrater/MachineConfidence.tsv")

for idx in indices:
	if idx not in indices2:
		print(idx + " is missing.")

makeframe = dict()

makeframe["human-agreement"] = agreement["agreement"]
makeframe["machine-confidence"] = confidence["accuracy"]

df = pd.DataFrame(makeframe, dtype="float")
df = df.dropna()

print(str(pearsonr(df["human-agreement"], df["machine-confidence"])))

plt.plot(df["human-agreement"], df["machine-confidence"], "r.")
plt.xlabel("Human agreement")
plt.ylabel("Machine confidence")
plt.axis([0,1,0,1])
plt.show()

# Example 16

else:
	with open(slicepath, encoding = 'utf-8') as f:
		filelines = f.readlines()
	idstoget = set([x.strip() for x in filelines])
	getall = False
	startdir = 0
	enddir = 100

with open(wordpath, encoding = 'utf-8') as f:
	filelines = f.readlines()
wordstoget = set([x.strip() for x in filelines])

metafile = '/projects/ichass/usesofscale/hathimeta/MergedMonographs.tsv'

rows, columns, table = utils.readtsv(metafile)

subdirectories = [os.path.join(d, o) for o in os.listdir(d) if os.path.isdir(os.path.join(d, o))]

wordcounts = dict()
counter = 0

if slicepath in directoryslices:
	outputpath = os.path.join(outdir, slicepath + ".tsv")
else:
	outputpath = os.path.join(outdir, "extracted_words.tsv")

# Get a dictionary so you can count dictionary words.

lexicon = set()

# Example 17

    for char in locnum:
        if char.isalpha():
            letterstring += char.upper()
        else:
            break
    if len(letterstring) > 2:
        letterstring = letterstring[:2]

    if len(letterstring) > 1 and letterstring[0] == "N":
        letterstring = "N"
    if len(letterstring) > 1 and letterstring[0] == "V":
        letterstring = "V"

    return letterstring
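
# e.g. letterpart("PR4034") returns "PR", while letterpart("NX65") collapses
# to "N", since subclasses of N and V are merged above.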

rowindices, columns, metadata = utils.readtsv("/Users/tunder/Dropbox/pagedata/metascrape/EnrichedMetadata.tsv")

with open("/Users/tunder/Dropbox/pagedata/litlocs.tsv", encoding="utf-8") as f:
    filelines = f.readlines()
litlocs = dict()
for line in filelines:
    line = line.strip()
    fields = line.split('\t')
    litlocs[fields[0]] = int(round(1000 * float(fields[1])))

with open("/Users/tunder/Dropbox/pagedata/biolocs.tsv", encoding="utf-8") as f:
    filelines = f.readlines()
biolocs = dict()
for line in filelines:
    line = line.strip()
    fields = line.split('\t')
    biolocs[fields[0]] = int(round(1000 * float(fields[1])))
# Uses metadata to help assess degrees

import os, sys
import SonicScrewdriver as utils

rowindices, columns, metadata = utils.readtsv("/Users/tunder/Dropbox/pagedata/metascrape/EnrichedMetadata.tsv")

options = ["non", "bio", "poe", "dra", "fic"]

with open("/Users/tunder/Dropbox/pagedata/litlocs.tsv", encoding="utf-8") as f:
    filelines = f.readlines()
litlocs = dict()
for line in filelines:
    line = line.strip()
    fields = line.split('\t')
    litlocs[fields[0]] = int(round(1000 * float(fields[1])))

with open("/Users/tunder/Dropbox/pagedata/biolocs.tsv", encoding="utf-8") as f:
    filelines = f.readlines()
biolocs = dict()
for line in filelines:
    line = line.strip()
    fields = line.split('\t')
    biolocs[fields[0]] = int(round(1000 * float(fields[1])))

def letterpart(locnum):
    if locnum == "<blank>":
        return "<blank>"

    letterstring = ""
    for char in locnum:

# Example 19

import csv

import SonicScrewdriver as utils

# Assumed initializations; the head of this excerpt is missing.
selecteddates = dict()
selected = set()

reviews = '/Users/tunder/Dropbox/ted/reception/reviewed/lists/ReviewedTitles1880-1899_200.csv'
with open(reviews) as f:
    reader = csv.reader(f)
    for fields in reader:
        htid = fields[0]
        if htid == "HTid":
            continue
        jgenre = fields[13]
        date = int(fields[1])

        if jgenre == 'poe':
            selecteddates[htid] = date
            selected.add(htid)

rows, columns, table = utils.readtsv(
    '/Users/tunder/Dropbox/GenreProject/metadata/filteredpoetry.tsv')

bydate = dict()

for row in rows:
    if row in selected:
        continue

    date = utils.simple_date(row, table)

    if date in bydate:
        bydate[date].append(row)
    else:
        bydate[date] = [row]

controlset = set()
Пример #20
0
# sort_anovaset.py

import SonicScrewdriver as utils
import csv

rows, columns, table = utils.readtsv(
    '/Volumes/TARDIS/work/metadata/19cmetadata.tsv')

with open('anovaset.txt', encoding='utf-8') as f:
    filelines = f.readlines()
    wholeset = [x.rstrip() for x in filelines]

the19c = list()
the20c = list()

for anid in wholeset:
    if anid in rows:
        the19c.append(anid)
    else:
        the20c.append(anid)

with open('anova19c.txt', mode='w', encoding='utf-8') as f:
    for anid in the19c:
        f.write(anid + '\n')

with open('anova20c.txt', mode='w', encoding='utf-8') as f:
    for anid in the20c:
        f.write(anid + '\n')

# Example 26

import pickle

import SonicScrewdriver as utils

modelfolder = "/Volumes/TARDIS/work/moneycontext/"
modelpath = modelfolder + "logisticmodel.p"
with open(modelpath, mode = 'rb') as f:
    logisticmodel = pickle.load(f)

standardizerpath = modelfolder + 'standardizer.p'
with open(standardizerpath, mode = 'rb') as f:
    standardizer = pickle.load(f)

featurepath = modelfolder + 'featurelist.p'
with open(featurepath, mode = 'rb') as f:
    features = pickle.load(f)
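
# Presumably (if these are sklearn-style objects) prediction then looks like:
#
#     X = standardizer.transform(featurematrix)  # feature columns ordered as in `features`
#     probs = logisticmodel.predict_proba(X)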

# Now load HathiTrust metadata.

rows, columns, table = utils.readtsv('/Volumes/TARDIS/work/metadata/MergedMonographs.tsv')

ambiguouswords = {'crown', 'crowns', 'guinea', 'guineas', 'nickel', 'sovereign',
                  'sovereigns', 'pound', 'pounds', 'quid'}

moneywords = {'dollar', 'dollars', 'dime', 'dimes', 'nickel', 'nickels',
              'pound', 'pounds', 'shilling', 'shillings', 'sovereign',
              'sovereigns', 'cent', 'cents', 'centime', 'centimes', 'crown',
              'crowns', 'halfcrown', 'half-crown', 'penny', 'pennies',
              'pence', 'farthing', 'farthings', 'franc', 'francs', 'guilder',
              'guilders', 'florin', 'florins', 'guinea', 'guineas',
              "ha'penny", 'tuppence', 'twopence', 'sixpence',
              '|arabicprice|', '|price|', 'quid'}

# Words I explicitly decided not to include: 'quarter', 'quarters', 'mark', 'marks.' Monetary uses
# seemed rare enough relative to others that they'd be more likely to introduce noise than to help.
# |arabicprice| is a code the tokenizer in modelingcounter produces whenever it encounters
# a number connected to £, $, ¢, s, or d. In the output we convert that to |price|, for no very
# good reason.
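
# Illustration only -- a rough approximation of the price-tagging rule the
# comment above describes, not the actual modelingcounter tokenizer:

import re

def looks_like_price(token):
    # digits attached to a currency mark (pound, dollar, cent, shilling, pence)
    has_mark = any(c in token for c in "£$¢") or token.endswith(("s", "d"))
    return bool(re.fullmatch(r"[£$¢]?\d[\d,.]*[sd]?", token)) and has_mark

# e.g. looks_like_price("£20") and looks_like_price("6d") are True, while
# looks_like_price("1850") is False.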

wealthwords = {'fortune', 'fortunes', 'wealth', 'rich', 'riches', 'money', 'moneys', 'fund', 'funds', 'sum', 'sums', 'price', 'prices', 'priced'}

# This is by no means an exhaustive list. Owe, loan, borrowed, etc.
# If we really want to get at the full range of words potentially

# Example 28

# make fiction subset

import SonicScrewdriver as utils

rows, columns, table = utils.readtsv("/Users/tunder/Dropbox/bookNLP/metadata/enrichedmetadataDec6.tsv")

datedict = dict()

selected = []

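# Keep at most the first three volumes from each publication year.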
for row in rows:
	date = int(table["date"][row])

	if date in datedict:
		datedict[date] += 1
	else:
		datedict[date] = 1

	if datedict[date] > 3:
		continue
	else:
		selected.append(row)

with open("/Users/tunder/Dropbox/GenreProject/python/piketty/fictionsubset.txt", mode='w', encoding = 'utf-8') as f:
	for line in selected:
		f.write(line + '\n')




# Example 30

def add_counts(wordcounts, year, word, count):
	if year in wordcounts:

		if word in wordcounts[year]:
			wordcounts[year][word] += count
		else:
			wordcounts[year][word] = count

	else:
		wordcounts[year] = dict()
		wordcounts[year][word] = count
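
# e.g. add_counts(wordcounts, 1850, "the", 5) creates wordcounts[1850]["the"] = 5
# on the first call and adds 5 to it on later calls.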


metafile = '/Users/tunder/Dropbox/GenreProject/metadata/filteredfiction.tsv'
rows, columns, table = utils.readtsv(metafile)

dateindex = dict()

for volid in rows:
	startdate = table["startdate"][volid]
	enddate = table["enddate"][volid]
	textdate = table["textdate"][volid]

	intdate = infer_date(startdate, enddate, textdate)

	if intdate >= 1750 and intdate <= 1950:
		if intdate in dateindex:
			dateindex[intdate].append(volid)
		else:
			dateindex[intdate] = [volid]

# Example 32

# epistolarymetadata.py

# This module ingests metadata created by Clara Mount in the summer of 2014,
# and uses it to return information about genre in a group of novels. We
# are especially interested in the boundary between epistolary and
# non-epistolary fiction, which can be configured in a variety of ways.

import SonicScrewdriver as utils
import numpy as np

epindices, epcolumns, epmetadata = utils.readtsv(
    "/Users/tunder/Dropbox/PythonScripts/classify/HathiGenreInfo-Epist.txt")

nonindices, noncolumns, nonmetadata = utils.readtsv(
    "/Users/tunder/Dropbox/PythonScripts/classify/HathiGenreInfo-NonEpist2.txt"
)


def get_genrevector(volumeIDs, boundarydef):
    global epindices, nonindices

    n = len(volumeIDs)

    genrevector = np.zeros(n)

    if boundarydef == "nonepistolary / epistolary":

        for idx, volID in enumerate(volumeIDs):
            cleanID = utils.pairtreelabel(volID)

            if cleanID in epindices:
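
# A sketch of how this module might be used, assuming from the code above
# that get_genrevector marks each volume's genre in a 0/1 numpy vector
# (the rest of the function is cut off in this excerpt):
#
#     ids = ["uc2.ark+=13960=t1234"]  # hypothetical volume ID
#     y = get_genrevector(ids, "nonepistolary / epistolary")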