def genre_gridsearch(modelname, c_range, ftstart, ftend, ftstep,
                     positive_tags=None, negative_tags=None,
                     excl_below=1700, excl_above=2000):
    '''Does a gridsearch to identify an optimal number of features and
    setting of the regularization constant, then produces that model.

    NOTE(review): a second function with this same name is defined later in
    this file and shadows this one at import time; confirm which definition
    callers actually intend to use.

    modelname: used to name the vocabulary and output files.
    c_range: sequence of regularization constants to try.
    ftstart, ftend, ftstep: feature-count range for the gridsearch.
    positive_tags / negative_tags: class labels; default to ['elite'] and
        ['vulgar'] (None-sentinel avoids the mutable-default-argument pitfall).
    excl_below / excl_above: volumes with 'firstpub' outside this date range
        are excluded from training.
    '''
    if positive_tags is None:
        positive_tags = ['elite']
    if negative_tags is None:
        negative_tags = ['vulgar']

    sourcefolder = '/Users/tunder/Dropbox/GenreProject/python/reception/fiction/fromEF/'
    extension = '.tsv'
    # metadatapath = '/Users/tunder/Dropbox/GenreProject/python/reception/fiction/prestigeficmeta.csv'
    metadatapath = '/Users/tunder/Dropbox/GenreProject/python/reception/fiction/littlemagazines.csv'
    vocabpath = '/Users/tunder/Dropbox/fiction/lexicon/' + modelname + '.txt'
    if os.path.exists(vocabpath):
        print('Vocabulary for ' + modelname + ' already exists. Using it.')
    outputpath = '/Users/tunder/Dropbox/GenreProject/python/reception/fiction/' + modelname + '.csv'

    # We can simply exclude volumes from consideration on the basis on any
    # metadata category we want, using the dictionaries defined below.

    ## EXCLUSIONS.

    excludeif = dict()
    excludeifnot = dict()
    excludeabove = dict()
    excludebelow = dict()

    excludebelow['firstpub'] = excl_below
    excludeabove['firstpub'] = excl_above

    sizecap = 700

    # CLASSIFY CONDITIONS

    # print()
    # print("You can also specify positive tags to be excluded from training, and/or a pair")
    # print("of integer dates outside of which vols should be excluded from training.")
    # print("If you add 'donotmatch' to the list of tags, these volumes will not be")
    # print("matched with corresponding negative volumes.")
    # print()
    # ## testphrase = input("Comma-separated list of such tags: ")
    testphrase = ''
    # testphrase is always empty here, so testconditions is always an empty
    # set; the parsing is retained for symmetry with the interactive version.
    testconditions = set([x.strip() for x in testphrase.split(',') if len(x) > 0])

    datetype = "firstpub"
    numfeatures = ftend
    regularization = .000075
    # linting the code would get rid of regularization, which is at this
    # point an unused dummy parameter

    paths = (sourcefolder, extension, metadatapath, outputpath, vocabpath)
    exclusions = (excludeif, excludeifnot, excludebelow, excludeabove, sizecap)
    classifyconditions = (positive_tags, negative_tags, datetype, numfeatures, regularization, testconditions)

    modelparams = 'logistic', 12, ftstart, ftend, ftstep, c_range

    matrix, rawaccuracy, allvolumes, coefficientuples = train.tune_a_model(paths, exclusions, classifyconditions, modelparams)

    print('If we divide the dataset with a horizontal line at 0.5, accuracy is: ', str(rawaccuracy))
    tiltaccuracy = train.diachronic_tilt(allvolumes, 'linear', [])

    print("Divided with a line fit to the data trend, it's ", str(tiltaccuracy))
def genre_gridsearch(metadatapath, modelname, c_range, ftstart, ftend, ftstep,
                     positive_tags=None, negative_tags=None,
                     excl_below=1700, excl_above=2000):
    '''Does a gridsearch to identify an optimal number of features and
    setting of the regularization constant, then produces that model.

    NOTE(review): this redefines (and shadows) the earlier function of the
    same name in this file; confirm that is intentional.

    metadatapath: path to the metadata CSV describing the corpus.
    modelname: used to name the vocabulary and output files.
    c_range: sequence of regularization constants to try.
    ftstart, ftend, ftstep: feature-count range for the gridsearch.
    positive_tags / negative_tags: class labels; default to ['fic'] and
        ['bio'] (None-sentinel avoids the mutable-default-argument pitfall).
    excl_below / excl_above: volumes with 'firstpub' outside this date range
        are excluded from training.
    '''
    if positive_tags is None:
        positive_tags = ['fic']
    if negative_tags is None:
        negative_tags = ['bio']

    sourcefolder = '../sourcefiles/'
    extension = '.tsv'
    vocabpath = '../lexicons/' + modelname + '.txt'
    if os.path.exists(vocabpath):
        print('Vocabulary for ' + modelname + ' already exists. Using it.')
    outputpath = '../modeloutput/' + modelname + '.csv'

    # We can simply exclude volumes from consideration on the basis on any
    # metadata category we want, using the dictionaries defined below.

    ## EXCLUSIONS.

    excludeif = dict()
    excludeifnot = dict()
    excludeabove = dict()
    excludebelow = dict()

    excludebelow['firstpub'] = excl_below
    excludeabove['firstpub'] = excl_above

    sizecap = 75

    # CLASSIFY CONDITIONS

    # print()
    # print("You can also specify positive tags to be excluded from training, and/or a pair")
    # print("of integer dates outside of which vols should be excluded from training.")
    # print("If you add 'donotmatch' to the list of tags, these volumes will not be")
    # print("matched with corresponding negative volumes.")
    # print()
    # ## testphrase = input("Comma-separated list of such tags: ")
    testphrase = ''
    # testphrase is always empty here, so testconditions is always an empty
    # set; the parsing is retained for symmetry with the interactive version.
    testconditions = set([x.strip() for x in testphrase.split(',') if len(x) > 0])

    datetype = "firstpub"
    numfeatures = ftend
    regularization = .000075
    # linting the code would get rid of regularization, which is at this
    # point an unused dummy parameter

    paths = (sourcefolder, extension, metadatapath, outputpath, vocabpath)
    exclusions = (excludeif, excludeifnot, excludebelow, excludeabove, sizecap)
    classifyconditions = (positive_tags, negative_tags, datetype, numfeatures, regularization, testconditions)

    modelparams = 'logistic', 12, ftstart, ftend, ftstep, c_range

    matrix, rawaccuracy, allvolumes, coefficientuples = train.tune_a_model(paths, exclusions, classifyconditions, modelparams)

    print('If we divide the dataset with a horizontal line at 0.5, accuracy is: ', str(rawaccuracy))
    tiltaccuracy = train.diachronic_tilt(allvolumes, 'linear', [])

    print("Divided with a line fit to the data trend, it's ", str(tiltaccuracy))
def gridsearch_a_model(metadatapath, sourcefolder, c_range, ftstart, ftend, ftstep,
                       positive_tags=None, negative_tags=None):
    '''Does a gridsearch to identify an optimal number of features and
    setting of the regularization constant, then produces that model.

    Note that we do not use this for models of specific decades. Just
    initially for model selection.

    metadatapath: path to a '*_meta.csv' file; the vocab and output paths
        are derived from it by string substitution.
    sourcefolder: folder containing the .tsv feature files.
    c_range: sequence of regularization constants to try.
    ftstart, ftend, ftstep: feature-count range for the gridsearch.
    positive_tags / negative_tags: class labels; default to ['f'] and ['m']
        (None-sentinel avoids the mutable-default-argument pitfall).
    '''
    if positive_tags is None:
        positive_tags = ['f']
    if negative_tags is None:
        negative_tags = ['m']

    modelname = metadatapath.replace('.//models/', '').replace('_meta.csv', '')

    extension = '.tsv'
    vocabpath = metadatapath.replace('_meta', '_vocab')
    if os.path.exists(vocabpath):
        print('Vocabulary for ' + modelname + ' already exists. Using it.')
    outputpath = metadatapath.replace('_meta', '')

    ## EXCLUSIONS.
    # not used in this project

    excludeif = dict()
    excludeifnot = dict()
    excludeabove = dict()
    excludebelow = dict()

    sizecap = 2000

    # CLASSIFY CONDITIONS
    # not used in this project

    testconditions = set()

    datetype = "firstpub"
    numfeatures = ftend
    regularization = .000075
    # linting the code would get rid of regularization, which is at this
    # point an unused dummy parameter

    paths = (sourcefolder, extension, metadatapath, outputpath, vocabpath)
    exclusions = (excludeif, excludeifnot, excludebelow, excludeabove, sizecap)
    classifyconditions = (positive_tags, negative_tags, datetype, numfeatures, regularization, testconditions)

    modelparams = 'logistic', 12, ftstart, ftend, ftstep, c_range

    matrix, rawaccuracy, allvolumes, coefficientuples = train.tune_a_model(
        paths, exclusions, classifyconditions, modelparams)

    print(
        'If we divide the dataset with a horizontal line at 0.5, accuracy is: ',
        str(rawaccuracy))
    tiltaccuracy = train.diachronic_tilt(allvolumes, 'linear', [])

    print("Divided with a line fit to the data trend, it's ", str(tiltaccuracy))