def project_tags(tagstoproject, tagtargets):
    """Model tagtargets on its own, then model tagstoproject and use that
    model to predict tagtargets; finally compare the two sets of predictions.

    tagstoproject and tagtargets are sequences of genre-tag strings.
    """
    target_label = ','.join(tagtargets)
    project_label = ','.join(tagstoproject)

    sizecap = 400
    datetype = "firstpub"
    numfeatures = 10000
    regularization = .000075
    negative_tags = ['random', 'chirandom']

    # First model: the target tags by themselves.
    print('First we create a model of ' + target_label)
    first_paths = make_paths(target_label + 'byitself')
    sourcefolder, extension, metadatapath, outputpath1, vocabpath = first_paths
    # Note that we exclude tagstoproject from the negative contrast set, so the
    # contrast sets for the two models will be identical.
    first_exclusions = make_exclusions(0, 2000, sizecap, tagstoproject)
    first_conditions = (tagtargets, negative_tags, datetype, numfeatures,
                        regularization, set())
    rawaccuracy, allvolumes, coefficientuples = logisticpredict.create_model(
        first_paths, first_exclusions, first_conditions)
    print('If we divide the dataset with a horizontal line at 0.5, accuracy is: ', str(rawaccuracy))
    print()

    # Second model: the projected tags, predicting the target tags.
    print('Then we create a model of ' + project_label + ' and use it to predict ' + target_label)
    second_paths = make_paths(project_label + 'predicts' + target_label)
    sourcefolder, extension, metadatapath, outputpath2, vocabpath = second_paths
    second_exclusions = make_exclusions(0, 2000, sizecap, 'nonegatives')
    combined_positives = list(tagtargets)
    combined_positives.extend(tagstoproject)
    # Placing tagtargets in testconditions is what actually excludes the
    # target tags from the training process in the second model.
    second_conditions = (combined_positives, negative_tags, datetype,
                         numfeatures, regularization, set(tagtargets))
    rawaccuracy, allvolumes, coefficientuples = logisticpredict.create_model(
        second_paths, second_exclusions, second_conditions)
    print('If we divide the second dataset at 0.5, accuracy is: ', str(rawaccuracy))
    print()

    # Compare the predictions made by the two models, restricted to volumes
    # present in both but excluded from training in the second.
    comparemodels.compare_untrained(outputpath1, outputpath2)
def project_gothic_beyond_date(dividedate):
    """Model gothic fiction published after dividedate, then model gothic
    fiction across the whole timeline while blindly predicting the later
    volumes, and compare the two models' predictions.
    """
    sizecap = 300
    gothic_tags = ['lochorror', 'pbgothic', 'locghost', 'stangothic', 'chihorror']
    contrast_tags = ['random', 'chirandom']
    datetype = "firstpub"
    numfeatures = 10000
    regularization = .000075

    # First model: gothic fiction restricted to dates after dividedate.
    print('First we create a model of gothic fiction only after ' + str(dividedate))
    paths = make_paths('gothicjustpost' + str(dividedate))
    sourcefolder, extension, metadatapath, outputpath1, vocabpath = paths
    exclusions = make_exclusions(dividedate, 2000, sizecap, 'nonegatives')
    conditions = (gothic_tags, contrast_tags, datetype, numfeatures,
                  regularization, set())
    rawaccuracy, allvolumes, coefficientuples = logisticpredict.create_model(
        paths, exclusions, conditions)
    print('If we divide the dataset with a horizontal line at 0.5, accuracy is: ', str(rawaccuracy))
    print()

    # Second model: whole timeline, with post-dividedate volumes held out of
    # training via testconditions and only predicted.
    print('Then we create a model of gothic fiction blindly predicting after ' + str(dividedate))
    paths = make_paths('gothicpredictpost' + str(dividedate))
    sourcefolder, extension, metadatapath, outputpath2, vocabpath = paths
    exclusions = make_exclusions(0, 2001, sizecap, 'nonegatives')
    conditions = (gothic_tags, contrast_tags, datetype, numfeatures,
                  regularization, {'1700', str(dividedate)})
    rawaccuracy, allvolumes, coefficientuples = logisticpredict.create_model(
        paths, exclusions, conditions)
    print('If we divide the second dataset at 0.5, accuracy is: ', str(rawaccuracy))
    print()

    # Compare the predictions made by the two models, restricted to volumes
    # present in both but excluded from training in the second.
    comparemodels.compare_untrained(outputpath1, outputpath2)
def the_red_and_the_black():
    """Run forty 'teamred' vs 'teamblack' models at sizecap 140 and write
    each run's raw accuracy to finalaccuracies.csv.
    """
    sizecap = 140
    # This initial path set is superseded inside the loop below, where each
    # iteration builds its own model name and paths.
    paths = make_paths('blackandthered')
    exclusions = make_exclusions(1700, 2001, sizecap, 'nonegatives')
    classifyconditions = (['teamred'], ['teamblack'], "firstpub", 10000,
                          .000075, set())

    accuracies = []
    for trial in range(40):
        paths = make_paths('redandtheblack' + str(trial))
        rawaccuracy, allvolumes, coefficientuples = logisticpredict.create_model(
            paths, exclusions, classifyconditions)
        print(rawaccuracy)
        accuracies.append(rawaccuracy)

    with open('finalaccuracies.csv', mode='w', encoding='utf-8') as f:
        for accuracy in accuracies:
            f.write(str(accuracy) + '\n')
def model_taglist_within_dates(positive_tags, modelname, mindate, maxdate):
    """Train one model of the given positive tags against random contrast
    volumes, restricted to firstpub dates in [mindate, maxdate].

    Returns the per-volume output produced by logisticpredict.create_model.
    """
    print('We are modeling these positive tags:')
    for tag in positive_tags:
        print(tag)

    sizecap = 1000
    paths = make_paths(modelname)
    sourcefolder, extension, metadatapath, outputpath, vocabpath = paths
    exclusions = make_exclusions(mindate, maxdate, sizecap, 'nonegatives')

    classifyconditions = (positive_tags, ['random', 'chirandom'], "firstpub",
                          10000, .000075, set())
    rawaccuracy, allvolumes, coefficientuples = logisticpredict.create_model(
        paths, exclusions, classifyconditions)
    print('If we divide the dataset with a horizontal line at 0.5, accuracy is: ', str(rawaccuracy))

    return allvolumes
def generic():
    """Interactively train a character-gender model ('f' vs 'm') on the
    character-prediction data and report flat and trend-tilted accuracy.

    Prompts for a model name and an optional comma-separated date range;
    source and output paths are hard-coded to local directories.

    NOTE(review): this module defines generic() twice; the later definition
    shadows this one at import time — confirm which version is intended.
    """
    sourcefolder = '/Volumes/TARDIS/work/characterdata/charpredict/'
    extension = '.tsv'
    metadatapath = '/Users/tunder/Dropbox/character/meta/predictmeta.csv'
    vocabpath = '/Users/tunder/Dropbox/character/meta/predict1960-79vocab.csv'
    modelname = input('Name of model? ')
    outputpath = '/Users/tunder/Dropbox/character/results/' + modelname + str(datetime.date.today()) + '.csv'

    # We can simply exclude volumes from consideration on the basis of any
    # metadata category we want, using the dictionaries defined below.

    ## EXCLUSIONS.
    excludeif = dict()
    excludeifnot = dict()
    excludeabove = dict()
    excludebelow = dict()

    daterange = input('Range of dates to use in the model? ')
    if ',' in daterange:
        dates = [int(x.strip()) for x in daterange.split(',')]
        dates.sort()
        if len(dates) == 2:
            # Explicit check instead of assert: asserts are stripped under -O,
            # and (after sorting) equal endpoints would otherwise slip through
            # and produce an empty date window.
            if dates[0] >= dates[1]:
                raise ValueError('Date range must contain two distinct dates.')
            excludebelow['firstpub'] = dates[0]
            excludeabove['firstpub'] = dates[1]

    sizecap = 1000

    # CLASSIFY CONDITIONS
    # Positive set is 'f', negative set is 'm'; volumes are dated by first
    # publication.
    positive_tags = ['f']
    negative_tags = ['m']
    datetype = "firstpub"
    numfeatures = 1700
    regularization = .00011
    testconditions = set()

    paths = (sourcefolder, extension, metadatapath, outputpath, vocabpath)
    exclusions = (excludeif, excludeifnot, excludebelow, excludeabove, sizecap)
    classifyconditions = (positive_tags, negative_tags, datetype, numfeatures,
                          regularization, testconditions)

    rawaccuracy, allvolumes, coefficientuples = logisticpredict.create_model(
        paths, exclusions, classifyconditions)

    print('If we divide the dataset with a horizontal line at 0.5, accuracy is: ', str(rawaccuracy))

    tiltaccuracy = logisticpredict.diachronic_tilt(allvolumes, 'linear', [])

    print("Divided with a line fit to the data trend, it's ", str(tiltaccuracy))
def calibrate_detective():
    '''
    Tests accuracy of classification for detective fiction at different
    sample sizes, appending one row (sizecap, avg training size, accuracy)
    per size to ../final/collateddetaccuracies.tsv.
    '''
    modelname = 'calibratedet'
    paths = make_paths(modelname)

    ## EXCLUSIONS.
    excludeif = dict()
    excludeifnot = dict()
    excludeabove = dict()
    excludebelow = dict()
    excludebelow['firstpub'] = 1700
    excludeabove['firstpub'] = 2020

    # Bug fix: 'locdetmyst' was listed twice in the original tag list; the
    # duplicate is removed so each tag appears once.
    positive_tags = ['locdetective', 'locdetmyst', 'chimyst', 'det100']
    negative_tags = ['random', 'chirandom']
    testconditions = set()

    datetype = "firstpub"
    numfeatures = 10000
    regularization = .000075

    classifyconditions = (positive_tags, negative_tags, datetype, numfeatures,
                          regularization, testconditions)

    sizes = [5, 6, 7, 8, 9, 11, 13, 15, 17, 18, 21, 27, 29, 32, 34, 36, 40,
             45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 100]

    for sizecap in sizes:
        exclusions = (excludeif, excludeifnot, excludebelow, excludeabove, sizecap)
        rawaccuracy, allvolumes, coefficientuples = logisticpredict.create_model(
            paths, exclusions, classifyconditions)

        # Column 11 holds the training-set size; this is unfortunately
        # dependent on the exact way logisticpredict formats its output.
        trainsizes = [vol[11] for vol in allvolumes]
        avgsize = sum(trainsizes) / len(trainsizes)
        print(sizecap, avgsize, rawaccuracy)

        with open('../final/collateddetaccuracies.tsv', mode='a', encoding='utf-8') as f:
            f.write(str(sizecap) + '\t' + str(avgsize) + '\t' + str(rawaccuracy) + '\n')

    return None
def replicate_stew():
    """Run the 'ghastly stew' model twenty times at sizecap 140 and append
    each run's raw accuracy to stewaccuracies.csv.
    """
    sizecap = 140
    paths = make_paths('replicatestew')

    ## EXCLUSIONS.
    excludeif = dict()
    excludeifnot = dict()
    excludeabove = dict()
    excludebelow = dict()
    excludebelow['firstpub'] = 1700
    excludeabove['firstpub'] = 2020

    allstewgenres = {'cozy', 'hardboiled', 'det100', 'chimyst', 'locdetective',
                     'lockandkey', 'crime', 'locdetmyst', 'blcrime', 'anatscifi',
                     'locscifi', 'chiscifi', 'femscifi', 'stangothic', 'pbgothic',
                     'lochorror', 'chihorror', 'locghost'}
    # The constituent genre tags must be excluded from the negative set
    # explicitly, because the composite "stew" tag in the positive set would
    # not automatically exclude them. Listing the tags as positives instead
    # would lose the deliberate balance of crime, gothic, science fiction,
    # and sensation novels; the better-represented crime categories would
    # dominate the list.
    excludeif['negatives'] = allstewgenres

    exclusions = (excludeif, excludeifnot, excludebelow, excludeabove, sizecap)
    classifyconditions = (['stew'], ['random', 'chirandom'], "firstpub",
                          10000, .000075, set())

    accuracies = []
    for _ in range(20):
        rawaccuracy, allvolumes, coefficientuples = logisticpredict.create_model(
            paths, exclusions, classifyconditions)
        print(rawaccuracy)
        accuracies.append(rawaccuracy)

    with open('stewaccuracies.csv', mode='a', encoding='utf-8') as f:
        for accuracy in accuracies:
            f.write(str(accuracy) + '\n')
def predictecco():
    """Run fifteen fiction-vs-biography models on the ECCO corpus
    (firstpub 1700-1800) and record row, accuracy, and AUC for each run
    in ../modeloutput/eccopredicts.csv.
    """
    modelname = 'predictecco'
    sourcefolder = '../sourcefiles/'
    extension = '.tsv'
    metadatapath = '../metadata/eccogenremeta.csv'
    outputpath = '../modeloutput/' + modelname + str(datetime.date.today()) + '.csv'
    vocabpath = '../lexicons/' + modelname + '.csv'

    ## EXCLUSIONS: restrict to eighteenth-century first publication dates.
    excludeif = dict()
    excludeifnot = dict()
    excludeabove = dict()
    excludebelow = dict()
    excludebelow['firstpub'] = 1700
    excludeabove['firstpub'] = 1800
    sizecap = 75

    # CLASSIFY CONDITIONS.
    paths = (sourcefolder, extension, metadatapath, outputpath, vocabpath)
    exclusions = (excludeif, excludeifnot, excludebelow, excludeabove, sizecap)
    classifyconditions = (['fic'], ['bio'], "firstpub", 1100, .015, set())

    # Start a fresh results file with a header row, then append one line per run.
    with open('../modeloutput/eccopredicts.csv', mode='w', encoding='utf-8') as f:
        f.write('row,accuracy,auc\n')

    for i in range(15):
        rawaccuracy, allvolumes, coefficientuples = logisticpredict.create_model(
            paths, exclusions, classifyconditions)
        auc = calculate_auc(allvolumes)
        print(i, rawaccuracy, auc)
        with open('../modeloutput/eccopredicts.csv', mode='a', encoding='utf-8') as f:
            f.write(str(i) + ',' + str(rawaccuracy) + ',' + str(auc) + '\n')
def compare(dividedate):
    """Train a character-gender model on 1900-1950 volumes, then a second
    model on 1780-2000 volumes with some volumes held out of training, and
    compare the two models' predictions.

    NOTE(review): despite the printed messages, dividedate is not used in
    the date windows or test conditions, which are hard-coded (1900-1950,
    then 1780-2000 with testconditions {'1700', '1880'}) — confirm intent.
    """
    print('First we create a model of gender only after ' + str(dividedate))
    sizecap = 500

    modelname = 'post' + str(dividedate)
    sourcefolder = '/Volumes/TARDIS/work/characterdata/charpredict/'
    extension = '.tsv'
    metadatapath = '/Users/tunder/Dropbox/character/meta/predictmeta.csv'
    vocabpath = '/Users/tunder/Dropbox/character/meta/predictALLvocab.csv'
    outputpath1 = '/Users/tunder/Dropbox/character/results/' + modelname + str(datetime.date.today()) + '.csv'
    paths = (sourcefolder, extension, metadatapath, outputpath1, vocabpath)

    ## EXCLUSIONS.
    excludeif = dict()
    excludeifnot = dict()
    excludeabove = dict()
    excludebelow = dict()
    excludebelow['firstpub'] = 1900
    excludeabove['firstpub'] = 1950
    exclusions = (excludeif, excludeifnot, excludebelow, excludeabove, sizecap)

    positive_tags = ['f']
    negative_tags = ['m']
    testconditions = set()
    datetype = "firstpub"
    numfeatures = 2000
    regularization = .00009

    classifyconditions = (positive_tags, negative_tags, datetype, numfeatures,
                          regularization, testconditions)

    rawaccuracy, allvolumes, coefficientuples = logisticpredict.create_model(
        paths, exclusions, classifyconditions)

    print('If we divide the dataset with a horizontal line at 0.5, accuracy is: ', str(rawaccuracy))
    print()

    print('Then we create a model of detective fiction blindly predicting after ' + str(dividedate))
    modelname = 'predictpost' + str(dividedate)
    outputpath2 = '/Users/tunder/Dropbox/character/results/' + modelname + str(datetime.date.today()) + '.csv'
    paths = (sourcefolder, extension, metadatapath, outputpath2, vocabpath)

    excludebelow['firstpub'] = 1780
    excludeabove['firstpub'] = 2000
    sizecap = 1000
    exclusions = (excludeif, excludeifnot, excludebelow, excludeabove, sizecap)

    # Bug fix: the original set was {'1700', 1880}, mixing int and str.
    # Every other testconditions set in this module uses strings (e.g.
    # {'1700', str(dividedate)}), so the int 1880 looks like a typo and is
    # converted to '1880' here.
    testconditions = {'1700', '1880'}

    classifyconditions = (positive_tags, negative_tags, datetype, numfeatures,
                          regularization, testconditions)

    rawaccuracy, allvolumes, coefficientuples = logisticpredict.create_model(
        paths, exclusions, classifyconditions)

    print('If we divide the second dataset at 0.5, accuracy is: ', str(rawaccuracy))
    print()

    # Now we compare the predictions made by these two models, comparing only
    # the volumes that are in both models but excluded from the training process
    # in the second model.
    comparemodels.compare_untrained(outputpath1, outputpath2)
def generic():
    """Interactively train a character-gender model ('f' vs 'm') on the
    character-prediction data and report flat and trend-tilted accuracy.

    Prompts for a model name and an optional comma-separated date range;
    source and output paths are hard-coded to local directories.

    NOTE(review): this module defines generic() twice; this later definition
    shadows the earlier one at import time — confirm which is intended.
    """
    sourcefolder = '/Volumes/TARDIS/work/characterdata/charpredict/'
    extension = '.tsv'
    metadatapath = '/Users/tunder/Dropbox/character/meta/predictmeta.csv'
    vocabpath = '/Users/tunder/Dropbox/character/meta/predict1960-79vocab.csv'
    modelname = input('Name of model? ')
    outputpath = '/Users/tunder/Dropbox/character/results/' + modelname + str(datetime.date.today()) + '.csv'

    # We can simply exclude volumes from consideration on the basis of any
    # metadata category we want, using the dictionaries defined below.

    ## EXCLUSIONS.
    excludeif = dict()
    excludeifnot = dict()
    excludeabove = dict()
    excludebelow = dict()

    daterange = input('Range of dates to use in the model? ')
    if ',' in daterange:
        dates = [int(x.strip()) for x in daterange.split(',')]
        dates.sort()
        if len(dates) == 2:
            # Explicit check instead of assert: asserts are stripped under -O,
            # and (after sorting) equal endpoints would otherwise slip through
            # and produce an empty date window.
            if dates[0] >= dates[1]:
                raise ValueError('Date range must contain two distinct dates.')
            excludebelow['firstpub'] = dates[0]
            excludeabove['firstpub'] = dates[1]

    sizecap = 1000

    # CLASSIFY CONDITIONS
    # Positive set is 'f', negative set is 'm'; volumes are dated by first
    # publication.
    positive_tags = ['f']
    negative_tags = ['m']
    datetype = "firstpub"
    numfeatures = 1700
    regularization = .00011
    testconditions = set()

    paths = (sourcefolder, extension, metadatapath, outputpath, vocabpath)
    exclusions = (excludeif, excludeifnot, excludebelow, excludeabove, sizecap)
    classifyconditions = (positive_tags, negative_tags, datetype, numfeatures,
                          regularization, testconditions)

    rawaccuracy, allvolumes, coefficientuples = logisticpredict.create_model(
        paths, exclusions, classifyconditions)

    print('If we divide the dataset with a horizontal line at 0.5, accuracy is: ', str(rawaccuracy))

    tiltaccuracy = logisticpredict.diachronic_tilt(allvolumes, 'linear', [])

    print("Divided with a line fit to the data trend, it's ", str(tiltaccuracy))
def predictfic():
    """For each decade floor from 1720 through 1910, train a fiction-vs-
    biography model on volumes first published in [floor, floor + 19] and
    append the floor and raw accuracy to ../results/ficpredicts.csv.
    """
    ## PATHS.
    sourcefolder = '../sourcefiles/'
    extension = '.tsv'
    metadatapath = 'allgenremeta.csv'

    for floor in range(1720, 1920, 10):
        ceiling = floor + 19

        modelname = 'predictfic' + str(floor)
        outputpath = '../results/' + modelname + str(datetime.date.today()) + '.csv'
        vocabpath = 'vocab/predictfic' + str(floor) + '.txt'

        ## EXCLUSIONS: one twenty-year window per model.
        excludeif = dict()
        excludeifnot = dict()
        excludeabove = dict()
        excludebelow = dict()
        excludebelow['firstpub'] = floor
        excludeabove['firstpub'] = ceiling

        sizecap = 100

        # CLASSIFY CONDITIONS.
        paths = (sourcefolder, extension, metadatapath, outputpath, vocabpath)
        exclusions = (excludeif, excludeifnot, excludebelow, excludeabove, sizecap)
        classifyconditions = (['fic'], ['bio'], "firstpub", 3000, .00008, set())

        rawaccuracy, allvolumes, coefficientuples = logisticpredict.create_model(
            paths, exclusions, classifyconditions)

        with open('../results/ficpredicts.csv', mode='a', encoding='utf-8') as f:
            f.write(str(floor) + ',' + str(rawaccuracy) + '\n')
def predict2017test():
    """Train a fiction-vs-biography model on the 2017 test corpus and print
    accuracy for a flat 0.5 threshold and for a trend-fitted dividing line.

    Returns the per-volume output produced by logisticpredict.create_model.
    """
    ## PATHS.
    sourcefolder = '/Users/tunder/Dropbox/python/nonfic/2017test/'
    extension = '.tsv'
    metadatapath = '2017testmeta.csv'
    vocabpath = 'vocab/2017testvocab.txt'
    modelname = '2017testmodel'
    outputpath = '../results/' + modelname + str(datetime.date.today()) + '.csv'

    ## EXCLUSIONS.
    excludeif = dict()
    excludeifnot = dict()
    excludeabove = dict()
    excludebelow = dict()
    excludebelow['firstpub'] = 1700
    excludeabove['firstpub'] = 2000
    sizecap = 400

    # CLASSIFY CONDITIONS.
    paths = (sourcefolder, extension, metadatapath, outputpath, vocabpath)
    exclusions = (excludeif, excludeifnot, excludebelow, excludeabove, sizecap)
    classifyconditions = (['fic'], ['bio'], "firstpub", 3200, .00008, set())

    rawaccuracy, allvolumes, coefficientuples = logisticpredict.create_model(
        paths, exclusions, classifyconditions)

    print('If we divide the dataset with a horizontal line at 0.5, accuracy is: ', str(rawaccuracy))

    tiltaccuracy = logisticpredict.diachronic_tilt(allvolumes, 'linear', [])

    print("Divided with a line fit to the data trend, it's ", str(tiltaccuracy))

    return allvolumes
def calibrate_stew():
    '''
    Tests accuracy of classification for ghastly stew at different sample
    sizes, appending one row (sizecap, avg training size, accuracy) per
    size to ../final/collatedstewaccuracies.tsv.
    '''
    paths = make_paths('calibratestew')

    ## EXCLUSIONS.
    excludeif = dict()
    excludeifnot = dict()
    excludeabove = dict()
    excludebelow = dict()
    excludebelow['firstpub'] = 1700
    excludeabove['firstpub'] = 2020

    allstewgenres = {'cozy', 'hardboiled', 'det100', 'chimyst', 'locdetective',
                     'lockandkey', 'crime', 'locdetmyst', 'blcrime', 'anatscifi',
                     'locscifi', 'chiscifi', 'femscifi', 'stangothic', 'pbgothic',
                     'lochorror', 'chihorror', 'locghost'}
    # The constituent genre tags must be excluded from the negative set
    # explicitly, because the composite "stew" tag in the positive set would
    # not automatically exclude them. Listing the tags as positives instead
    # would lose the deliberate balance of crime, gothic, science fiction,
    # and sensation novels; the better-represented crime categories would
    # dominate the list.
    excludeif['negatives'] = allstewgenres

    classifyconditions = (['stew'], ['random', 'chirandom'], "firstpub",
                          10000, .000075, set())

    sizes = [5, 6, 7, 8, 9, 11, 13, 15, 17, 18, 21, 27, 29, 32, 34, 36, 40,
             45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 100]

    # Retained from the original: this list is never appended to.
    accuracies = []
    for sizecap in sizes:
        exclusions = (excludeif, excludeifnot, excludebelow, excludeabove, sizecap)
        rawaccuracy, allvolumes, coefficientuples = logisticpredict.create_model(
            paths, exclusions, classifyconditions)

        # Column 11 holds the training-set size; this is unfortunately
        # dependent on the exact way logisticpredict formats its output.
        trainsizes = [vol[11] for vol in allvolumes]
        avgsize = sum(trainsizes) / len(trainsizes)
        print(sizecap, avgsize, rawaccuracy)

        with open('../final/collatedstewaccuracies.tsv', mode='a', encoding='utf-8') as f:
            f.write(str(sizecap) + '\t' + str(avgsize) + '\t' + str(rawaccuracy) + '\n')

    return None
def ghastly_stew():
    """Train a single model of the composite 'stew' genre against randomly
    selected contrast volumes and print its accuracy at a 0.5 threshold.
    """
    ## PATHS.
    modelname = 'ghastlystew'
    sourcefolder = '../newdata/'
    extension = '.fic.tsv'
    metadatapath = '../meta/finalmeta.csv'
    vocabpath = '../lexicon/new10k.csv'
    outputpath = '../results/' + modelname + str(datetime.date.today()) + '.csv'

    ## EXCLUSIONS.
    excludeif = dict()
    excludeifnot = dict()
    excludeabove = dict()
    excludebelow = dict()
    excludebelow['firstpub'] = 1700
    excludeabove['firstpub'] = 2020

    allstewgenres = {'cozy', 'hardboiled', 'det100', 'chimyst', 'locdetective',
                     'lockandkey', 'crime', 'locdetmyst', 'blcrime', 'anatscifi',
                     'locscifi', 'chiscifi', 'femscifi', 'stangothic', 'pbgothic',
                     'lochorror', 'chihorror', 'locghost'}
    # The constituent genre tags must be excluded from the negative set
    # explicitly, because the composite "stew" tag in the positive set would
    # not automatically exclude them. Listing the tags as positives instead
    # would lose the deliberate balance of crime, gothic, science fiction,
    # and sensation novels; the better-represented crime categories would
    # dominate the list.
    excludeif['negatives'] = allstewgenres
    sizecap = 250

    # CLASSIFY CONDITIONS. Random volumes may also carry specific genre tags;
    # they are included in the negative set only if they lack positive tags.
    paths = (sourcefolder, extension, metadatapath, outputpath, vocabpath)
    exclusions = (excludeif, excludeifnot, excludebelow, excludeabove, sizecap)
    classifyconditions = (['stew'], ['random', 'chirandom'], "firstpub",
                          10000, .000075, set())

    rawaccuracy, allvolumes, coefficientuples = logisticpredict.create_model(
        paths, exclusions, classifyconditions)

    print('If we divide the dataset with a horizontal line at 0.5, accuracy is: ', str(rawaccuracy))
def compare(dividedate):
    """Train a character-gender model on 1900-1950 volumes, then a second
    model on 1780-2000 volumes with some volumes held out of training, and
    compare the two models' predictions.

    NOTE(review): this module defines compare() twice; this later definition
    shadows the earlier one. Also, despite the printed messages, dividedate
    is not used in the date windows or test conditions, which are hard-coded
    (1900-1950, then 1780-2000 with testconditions {'1700', '1880'}) —
    confirm intent.
    """
    print('First we create a model of gender only after ' + str(dividedate))
    sizecap = 500

    modelname = 'post' + str(dividedate)
    sourcefolder = '/Volumes/TARDIS/work/characterdata/charpredict/'
    extension = '.tsv'
    metadatapath = '/Users/tunder/Dropbox/character/meta/predictmeta.csv'
    vocabpath = '/Users/tunder/Dropbox/character/meta/predictALLvocab.csv'
    outputpath1 = '/Users/tunder/Dropbox/character/results/' + modelname + str(datetime.date.today()) + '.csv'
    paths = (sourcefolder, extension, metadatapath, outputpath1, vocabpath)

    ## EXCLUSIONS.
    excludeif = dict()
    excludeifnot = dict()
    excludeabove = dict()
    excludebelow = dict()
    excludebelow['firstpub'] = 1900
    excludeabove['firstpub'] = 1950
    exclusions = (excludeif, excludeifnot, excludebelow, excludeabove, sizecap)

    positive_tags = ['f']
    negative_tags = ['m']
    testconditions = set()
    datetype = "firstpub"
    numfeatures = 2000
    regularization = .00009

    classifyconditions = (positive_tags, negative_tags, datetype, numfeatures,
                          regularization, testconditions)

    rawaccuracy, allvolumes, coefficientuples = logisticpredict.create_model(
        paths, exclusions, classifyconditions)

    print('If we divide the dataset with a horizontal line at 0.5, accuracy is: ', str(rawaccuracy))
    print()

    print('Then we create a model of detective fiction blindly predicting after ' + str(dividedate))
    modelname = 'predictpost' + str(dividedate)
    outputpath2 = '/Users/tunder/Dropbox/character/results/' + modelname + str(datetime.date.today()) + '.csv'
    paths = (sourcefolder, extension, metadatapath, outputpath2, vocabpath)

    excludebelow['firstpub'] = 1780
    excludeabove['firstpub'] = 2000
    sizecap = 1000
    exclusions = (excludeif, excludeifnot, excludebelow, excludeabove, sizecap)

    # Bug fix: the original set was {'1700', 1880}, mixing int and str.
    # Every other testconditions set in this module uses strings (e.g.
    # {'1700', str(dividedate)}), so the int 1880 looks like a typo and is
    # converted to '1880' here.
    testconditions = {'1700', '1880'}

    classifyconditions = (positive_tags, negative_tags, datetype, numfeatures,
                          regularization, testconditions)

    rawaccuracy, allvolumes, coefficientuples = logisticpredict.create_model(
        paths, exclusions, classifyconditions)

    print('If we divide the second dataset at 0.5, accuracy is: ', str(rawaccuracy))
    print()

    # Now we compare the predictions made by these two models, comparing only
    # the volumes that are in both models but excluded from the training process
    # in the second model.
    comparemodels.compare_untrained(outputpath1, outputpath2)