def get_data_for_model(paths, exclusions, classifyconditions):
    '''
    Gather the metadata, vocabulary and feature matrix needed for modeling.

    Unpacks a bunch of parameters that define metadata conditions for the
    positive and negative classes, finds volumes meeting those conditions,
    obtains a vocabulary list via get_vocablist, and obtains (via
    get_dataframe) a data frame storing texts as rows and words/features
    as columns.

    Parameters
    ----------
    paths : 5-tuple
        (sourcefolder, extension, metadatapath, outputpath, vocabpath).
        outputpath is unpacked but not used by this function.
    exclusions : 5-tuple
        (excludeif, excludeifnot, excludebelow, excludeabove, sizecap),
        passed through to metafilter.
    classifyconditions : 6-tuple
        (positive_tags, negative_tags, datetype, numfeatures,
        regularization, testconditions). regularization is unpacked but
        not used by this function.

    Returns
    -------
    tuple
        (metadict, masterdata, classvector, classdictionary, orderedIDs,
        donttrainon, donttrainset, authormatches, vocablist).
        donttrainon holds the positions in orderedIDs of the volumes in
        donttrainset; authormatches holds, for each position in
        orderedIDs, a descending list of row indexes to drop from the
        training set when that volume is being predicted.

    Raises
    ------
    ValueError
        If a volume in donttrainset or classdictionary does not occur in
        orderedIDs.
    '''

    sourcefolder, extension, metadatapath, outputpath, vocabpath = paths
    excludeif, excludeifnot, excludebelow, excludeabove, sizecap = exclusions
    (positive_tags, negative_tags, datetype, numfeatures, regularization,
     testconditions) = classifyconditions

    holdout_authors = True

    # If you want reliable results, always run this with holdout_authors
    # set to True. The only reason to set it to False is to confirm that
    # this flag is actually making a difference. If you do that, it
    # disables the code that keeps other works by the author being predicted
    # out of the training set.

    freqs_already_normalized = True

    # By default we assume that frequencies have already been normalized
    # (divided by the total number of words in the volume). This allows us
    # to use some features (like type/token ratio) that would become
    # meaningless if divided by total wordcount. But it means that some
    # important feature-engineering decisions are offloaded to the
    # data prep stage.

    # The following function confirms that the testconditions are legal.

    confirm_testconditions(testconditions, positive_tags)

    if not sourcefolder.endswith('/'):
        sourcefolder = sourcefolder + '/'

    # Get a list of files.
    allthefiles = os.listdir(sourcefolder)

    # RANDOMNESS. The random.shuffle(allthefiles) call that belongs here is
    # currently commented out. Without it, orderedIDs (and so the
    # distribution of IDs into cross-validation folds) simply follow
    # whatever order os.listdir returned.

    volumeIDs = list()
    volumepaths = list()

    for filename in allthefiles:
        if filename.endswith(extension):
            # The volume ID is the filename minus its extension. Only the
            # trailing extension is removed, so an extension-like string
            # that happens to occur earlier in the name is left alone.
            volID = filename[:len(filename) - len(extension)]
            volumeIDs.append(volID)
            volumepaths.append(sourcefolder + filename)

    metadict = metafilter.get_metadata(metadatapath, volumeIDs, excludeif,
                                       excludeifnot, excludebelow,
                                       excludeabove)

    # Now that we have a list of volumes with metadata, we can select the
    # groups of IDs that we actually intend to contrast. Integer tags are
    # treated as dates ('firstpub'); anything else as genre tags ('tagset').

    if type(positive_tags[0]) is int:
        categorytodivide = 'firstpub'
    else:
        categorytodivide = 'tagset'

    IDsToUse, classdictionary, donttrainset = metafilter.label_classes(
        metadict, categorytodivide, positive_tags, negative_tags, sizecap,
        datetype, excludeif, testconditions)

    print()
    earliest, latest = first_and_last(IDsToUse, metadict, datetype)
    if earliest > 0:
        print("The whole corpus involved here includes " + str(len(IDsToUse)))
        print("volumes, ranging in date from " + str(earliest) + " to " +
              str(latest) + ".")
        print()

    # We now create an ordered list of id-path tuples for later use, and
    # identify a set of positive ids that should never be used in training.

    volspresent, orderedIDs = get_volume_lists(volumeIDs, volumepaths,
                                               IDsToUse)

    # Extend the set of ids not to be used in training by identifying
    # negative volumes that match the distribution of positive volumes.

    describe_donttrainset(donttrainset, classdictionary, metadict, datetype)

    # Create a flag for each volume that indicates whether it was used in
    # training.

    record_trainflags(metadict, donttrainset)

    # The feature list is either read from vocabpath or created there by
    # get_vocablist; numfeatures caps its size. useall is a parameter that
    # you basically don't need to worry about unless you're changing /
    # testing code. If you set it to False, the vocablist will exclude
    # words that occur very rarely. This shouldn't be necessary; the
    # crossvalidation routine is designed not to include features that
    # occur zero times in the training set. But if you get div-by-zero
    # errors in the training process, you could fiddle with this parameter
    # as part of a troubleshooting process.

    vocablist = get_vocablist(vocabpath, volspresent, useall=True,
                              n=numfeatures)

    numfeatures = len(vocablist)
    print()
    print("Number of features " + str(numfeatures))

    # Map every volume ID to its first position in orderedIDs once, instead
    # of rescanning the list for each lookup.

    first_index = dict()
    for idx, volid in enumerate(orderedIDs):
        first_index.setdefault(volid, idx)

    def index_of(volid):
        # Same contract as orderedIDs.index(volid), including ValueError.
        try:
            return first_index[volid]
        except KeyError:
            raise ValueError(str(volid) + ' is not in orderedIDs')

    # For each volume, we're going to create a list of volumes that should
    # be excluded from the training set when it is to be predicted. More
    # precisely, we're going to create a list of their *indexes*, so that
    # we can easily remove rows from the training matrix.

    # This list will include, for ALL volumes, the indexes of vols in the
    # donttrainset.

    donttrainon = [index_of(x) for x in donttrainset]
    never_train = set(donttrainon)

    authormatches = [list(donttrainon) for _ in orderedIDs]

    # Now we proceed to enlarge that list by identifying, for each volume,
    # a set of indexes that have the same author. Obvs, there will always
    # be at least one: we exclude a vol from its own training set. Indexes
    # already present through donttrainon are not added twice.

    if holdout_authors:
        authors = [metadict[volid]['author'] for volid in orderedIDs]
        for idx1, thisauthor in enumerate(authors):
            matches = authormatches[idx1]
            for idx2, otherauthor in enumerate(authors):
                if thisauthor == otherauthor and idx2 not in never_train:
                    matches.append(idx2)
    else:
        # This code only runs if we're testing the effect of
        # holdout_authors by disabling it.
        for idx1, matches in enumerate(authormatches):
            if idx1 not in never_train:
                matches.append(idx1)

    # Count the positive and negative instances that remain available
    # for training.

    trainingpositives = set()
    trainingnegatives = set()

    for anid, thisclass in classdictionary.items():
        if anid in donttrainset:
            continue

        if thisclass == 1:
            trainingpositives.add(index_of(anid))
        else:
            trainingnegatives.add(index_of(anid))

    print('Training positives: ' + str(len(trainingpositives)))
    print('Training negatives: ' + str(len(trainingnegatives)))

    for alist in authormatches:
        alist.sort(reverse=True)

    # The indexes are put in descending order so that they can be deleted
    # from back to front, without changing indexes yet to be deleted.
    # This will become important in the modelingprocess module.

    masterdata, classvector, metadict = get_dataframe(
        metadict, volspresent, classdictionary, vocablist,
        freqs_already_normalized)

    return (metadict, masterdata, classvector, classdictionary, orderedIDs,
            donttrainon, donttrainset, authormatches, vocablist)
def create_model(paths, exclusions, classifyconditions):
    '''
    Train and evaluate leave-one-out logistic models for a labeled corpus.

    Selects volumes according to the metadata conditions, builds one
    feature row per volume, predicts every volume in a worker pool with
    modelingprocess.model_one_volume, writes the predictions to
    outputpath, fits one model on all trainable volumes, and writes its
    coefficients to a companion '.coefs.csv' file.

    Parameters
    ----------
    paths : 5-tuple
        (sourcefolder, extension, metadatapath, outputpath, vocabpath).
    exclusions : 5-tuple
        (excludeif, excludeifnot, excludebelow, excludeabove, sizecap),
        passed through to metafilter.
    classifyconditions : 6-tuple
        (positive_tags, negative_tags, datetype, numfeatures,
        regularization, testconditions).

    Returns
    -------
    tuple
        (accuracy, allvolumes, coefficientuples): the fraction of
        evaluated volumes predicted correctly, the rows written to
        outputpath, and the sorted (coefficient, normalized coefficient,
        word) triples.

    Notes
    -----
    usedate is not assigned anywhere in this function; it is expected to
    be a module-level flag.
    '''

    sourcefolder, extension, metadatapath, outputpath, vocabpath = paths
    excludeif, excludeifnot, excludebelow, excludeabove, sizecap = exclusions
    (positive_tags, negative_tags, datetype, numfeatures, regularization,
     testconditions) = classifyconditions

    verbose = False
    holdout_authors = True

    # If you want reliable results, always run this with holdout_authors
    # set to True. The only reason to set it to False is to confirm that
    # this flag is actually making a difference. If you do that, it
    # disables the code that keeps other works by the author being predicted
    # out of the training set.

    # The following function confirms that the testconditions are legal.

    confirm_testconditions(testconditions, positive_tags)

    if not sourcefolder.endswith('/'):
        sourcefolder = sourcefolder + '/'

    # Get a list of files. The order is whatever os.listdir returns; the
    # random.shuffle(allthefiles) call is currently disabled.
    allthefiles = os.listdir(sourcefolder)

    volumeIDs = list()
    volumepaths = list()

    for filename in allthefiles:
        if filename.endswith(extension):
            # The volume ID is the filename minus its extension. Only the
            # trailing extension is removed, so an extension-like string
            # that happens to occur earlier in the name is left alone.
            volID = filename[:len(filename) - len(extension)]
            volumeIDs.append(volID)
            volumepaths.append(sourcefolder + filename)

    metadict = metafilter.get_metadata(metadatapath, volumeIDs, excludeif,
                                       excludeifnot, excludebelow,
                                       excludeabove)

    # Now that we have a list of volumes with metadata, we can select the
    # groups of IDs that we actually intend to contrast. Integer tags are
    # treated as dates ('firstpub'); anything else as genre tags ('tagset').

    if type(positive_tags[0]) is int:
        categorytodivide = 'firstpub'
    else:
        categorytodivide = 'tagset'

    IDsToUse, classdictionary, donttrainset = metafilter.label_classes(
        metadict, categorytodivide, positive_tags, negative_tags, sizecap,
        datetype, excludeif, testconditions)

    print()
    earliest, latest = first_and_last(IDsToUse, metadict, datetype)
    if earliest > 0:
        print("The whole corpus involved here includes " + str(len(IDsToUse)))
        print("volumes, ranging in date from " + str(earliest) + " to " +
              str(latest) + ".")
        print()

    # We now create an ordered list of id-path tuples for later use, and
    # identify a set of positive ids that should never be used in training.

    volspresent, orderedIDs = get_volume_lists(volumeIDs, volumepaths,
                                               IDsToUse)

    # Extend the set of ids not to be used in training by identifying
    # negative volumes that match the distribution of positive volumes.

    describe_donttrainset(donttrainset, classdictionary, metadict, datetype)

    # Create a flag for each volume that indicates whether it was used in
    # training.

    record_trainflags(metadict, donttrainset)

    # Get a count of docfrequency for all words in the corpus. This is
    # probably not needed and might be deprecated later.

    wordcounts = get_docfrequency(volspresent, donttrainset)

    # The feature list is either read from vocabpath or created there by
    # get_vocablist; numfeatures caps its size. useall is a parameter that
    # you basically don't need to worry about unless you're changing /
    # testing code. If you set it to False, the vocablist will exclude
    # words that occur very rarely. This shouldn't be necessary; the
    # crossvalidation routine is designed not to include features that
    # occur zero times in the training set. But if you get div-by-zero
    # errors in the training process, you could fiddle with this parameter
    # as part of a troubleshooting process.

    vocablist = get_vocablist(vocabpath, volspresent, wordcounts,
                              useall=True, n=numfeatures)

    # Map every volume ID to its first position in orderedIDs once, instead
    # of rescanning the list for each lookup.

    first_index = dict()
    for idx, volid in enumerate(orderedIDs):
        first_index.setdefault(volid, idx)

    def index_of(volid):
        # Same contract as orderedIDs.index(volid), including ValueError.
        try:
            return first_index[volid]
        except KeyError:
            raise ValueError(str(volid) + ' is not in orderedIDs')

    # For each volume, we're going to create a list of volumes that should
    # be excluded from the training set when it is to be predicted. More
    # precisely, we're going to create a list of their *indexes*, so that
    # we can easily remove rows from the training matrix.

    # This list will include, for ALL volumes, the indexes of vols in the
    # donttrainset.

    donttrainon = [index_of(x) for x in donttrainset]
    never_train = set(donttrainon)

    authormatches = [list(donttrainon) for _ in orderedIDs]

    # Now we proceed to enlarge that list by identifying, for each volume,
    # a set of indexes that have the same author. Obvs, there will always
    # be at least one: we exclude a vol from its own training set. Indexes
    # already present through donttrainon are not added twice.

    if holdout_authors:
        authors = [metadict[volid]['author'] for volid in orderedIDs]
        for idx1, thisauthor in enumerate(authors):
            matches = authormatches[idx1]
            for idx2, otherauthor in enumerate(authors):
                if thisauthor == otherauthor and idx2 not in never_train:
                    matches.append(idx2)
    else:
        # This code only runs if we're testing the effect of
        # holdout_authors by disabling it.
        for idx1, matches in enumerate(authormatches):
            if idx1 not in never_train:
                matches.append(idx1)

    # Count the positive and negative instances that remain available
    # for training.

    trainingpositives = set()
    trainingnegatives = set()

    for anid, thisclass in classdictionary.items():
        if anid in donttrainset:
            continue

        if thisclass == 1:
            trainingpositives.add(index_of(anid))
        else:
            trainingnegatives.add(index_of(anid))

    print('Training positives: ' + str(len(trainingpositives)))
    print('Training negatives: ' + str(len(trainingnegatives)))

    # Earlier code tried to balance the number of positive and negative
    # volumes in spite of same-author exclusions. It was disabled because
    # it could have grossly unintended effects when there were many
    # donttrainon exclusions.

    # Let's record, for each volume, the size of its training set.

    trainingsizes = []
    numvolumes = len(orderedIDs)
    for idx, anid in enumerate(orderedIDs):
        trainsize = numvolumes - len(authormatches[idx])
        metadict[anid]['trainsize'] = trainsize
        trainingsizes.append(trainsize)

    averagetrainingsize = sum(trainingsizes) / len(trainingsizes)

    for alist in authormatches:
        alist.sort(reverse=True)

    # The indexes are put in descending order so that they can be deleted
    # from back to front, without changing indexes yet to be deleted.
    # This will become important in the modelingprocess module.

    volsizes = dict()
    voldata = list()
    classvector = list()

    for volid, volpath in volspresent:

        # Each usable line of a volume file is "word<TAB>count"; lines
        # with any other number of fields are skipped.
        voldict = dict()
        totalcount = 0
        with open(volpath, encoding='utf-8') as f:
            for line in f:
                fields = line.strip().split('\t')
                if len(fields) != 2:
                    continue
                word = fields[0]
                count = int(fields[1])
                voldict[word] = count
                totalcount += count

        # Dates are expressed as years after 1700, floored at zero.
        date = metautils.infer_date(metadict[volid], datetype)
        date = date - 1700
        if date < 0:
            date = 0

        if usedate:
            features = get_features_with_date(voldict, vocablist, date,
                                              totalcount)
            voldata.append(features)
        else:
            features = get_features(voldict, vocablist)
            if totalcount == 0:
                # Avoid dividing by zero for an empty volume. Note that
                # this tiny value is also what gets recorded in volsizes.
                totalcount = .00001
            voldata.append(features / totalcount)

        volsizes[volid] = totalcount
        classvector.append(classdictionary[volid])

    data = pd.DataFrame(voldata)

    sextuplets = [
        (data, classvector, authormatches[i], i, usedate, regularization)
        for i in range(len(orderedIDs))
    ]

    # Now do leave-one-out predictions.
    print('Beginning multiprocessing.')

    pool = Pool(processes=11)
    try:
        resultlist = pool.map(modelingprocess.model_one_volume, sextuplets)
    finally:
        # Shut the workers down even when a prediction raised.
        pool.close()
        pool.join()

    assert len(resultlist) == len(orderedIDs)

    logisticpredictions = dict(zip(orderedIDs, resultlist))

    print('Multiprocessing concluded.')

    truepositives = 0
    truenegatives = 0
    falsepositives = 0
    falsenegatives = 0
    allvolumes = list()

    # newline='' lets the csv module control line endings itself.
    with open(outputpath, mode='w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        header = [
            'volid', 'dateused', 'pubdate', 'birthdate', 'firstpub',
            'gender', 'nation', 'allwords', 'logistic', 'realclass',
            'trainflag', 'trainsize', 'author', 'title', 'genretags'
        ]
        writer.writerow(header)
        for volid in IDsToUse:
            metadata = metadict[volid]
            logistic = logisticpredictions[volid]
            realclass = classdictionary[volid]
            outrow = [
                volid, metadata[datetype], metadata['pubdate'],
                metadata['birthdate'], metadata['firstpub'],
                metadata['gender'], metadata['nation'], volsizes[volid],
                logistic, realclass, metadata['trainflag'],
                metadata['trainsize'], metadata['author'],
                metadata['title'], ' | '.join(metadata['tagset'])
            ]
            writer.writerow(outrow)
            allvolumes.append(outrow)

            if logistic == 0.5:
                # An exact tie is broken at random.
                print("equals!")
                predictedpositive = random.sample([True, False], 1)[0]
            elif logistic > 0.5:
                predictedpositive = True
            elif logistic < 0.5:
                predictedpositive = False
            else:
                # Reached only when no comparison holds (e.g. NaN).
                print('Oh, joy. A fundamental floating point error.')
                predictedpositive = random.sample([True, False], 1)[0]

            if predictedpositive and realclass > 0.5:
                truepositives += 1
            elif not predictedpositive and realclass < 0.5:
                truenegatives += 1
            elif not predictedpositive and realclass > 0.5:
                falsenegatives += 1
            elif predictedpositive and realclass < 0.5:
                falsepositives += 1
            else:
                print("Wait a second, boss.")

    # Fit one model on every trainable volume to obtain coefficients.

    donttrainon.sort(reverse=True)
    trainingset, yvals, testset = sliceframe(data, classvector, donttrainon,
                                             0)
    trainingset, testset = modelingprocess.remove_zerocols(trainingset,
                                                           testset)
    newmodel = LogisticRegression(C=regularization)
    trainingset, means, stdevs = normalizearray(trainingset, usedate)
    newmodel.fit(trainingset, yvals)

    coefficients = newmodel.coef_[0] * 100

    # NOTE(review): coefficients are paired with vocablist by position. If
    # remove_zerocols dropped any columns, the words may no longer line up
    # with their coefficients -- confirm against modelingprocess.
    coefficientuples = list(
        zip(coefficients, (coefficients / np.array(stdevs)),
            vocablist + ['pub.date']))
    coefficientuples.sort()
    if verbose:
        for coefficient, normalizedcoef, word in coefficientuples:
            print(word + " : " + str(coefficient))

    print()
    totalevaluated = (truepositives + truenegatives + falsepositives +
                      falsenegatives)
    if totalevaluated != len(IDsToUse):
        print("Total evaluated = " + str(totalevaluated))
        print("But we've got " + str(len(IDsToUse)))
    accuracy = (truepositives + truenegatives) / totalevaluated
    print('True positives ' + str(truepositives))
    print('True negatives ' + str(truenegatives))
    print('False positives ' + str(falsepositives))
    print('False negatives ' + str(falsenegatives))

    print()
    print('The average size of the training set was ' +
          str(averagetrainingsize))
    print()

    # Precision, recall and F1 are undefined when their denominators are
    # zero (for instance when nothing was predicted positive); report 0.0
    # in that case instead of raising ZeroDivisionError.
    predictedpositives = truepositives + falsepositives
    actualpositives = truepositives + falsenegatives
    precision = (truepositives / predictedpositives
                 if predictedpositives else 0.0)
    recall = truepositives / actualpositives if actualpositives else 0.0
    if precision + recall > 0:
        F1 = 2 * (precision * recall) / (precision + recall)
    else:
        F1 = 0.0
    print("F1 : " + str(F1))

    coefficientpath = outputpath.replace('.csv', '.coefs.csv')
    if coefficientpath == outputpath:
        # outputpath contains no '.csv'; without this the coefficients
        # would overwrite the predictions that were just written.
        coefficientpath = outputpath + '.coefs.csv'

    with open(coefficientpath, mode='w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        for coef, normalizedcoef, word in coefficientuples:
            writer.writerow([word, coef, normalizedcoef])

    return accuracy, allvolumes, coefficientuples
def make_dunnings(paths, exclusions, thresholds, classifyconditions):
    '''
    Compute per-word contrast statistics between the two classes.

    Selects volumes according to the metadata conditions, chooses the
    numfeatures words found in the most documents dated within the
    thresholds, builds one feature row per selected volume, and calls
    dunnings() once per word. The results are written to 'dunnings.csv'
    in the current working directory.

    Parameters
    ----------
    paths : 4-tuple
        (sourcefolder, extension, classpath, outputpath).
    exclusions : 5-tuple
        (excludeif, excludeifnot, excludebelow, excludeabove, sizecap),
        passed through to metafilter.
    thresholds : 2-tuple
        (pastthreshold, futurethreshold). Volumes dated outside this
        range do not contribute to vocabulary selection.
    classifyconditions : 5-tuple
        (category2sorton, positive_class, datetype, numfeatures,
        regularization). regularization is unpacked but not used here.

    Returns
    -------
    None

    Notes
    -----
    NOTE(review): outputpath is unpacked but never used; the output file
    name is hard-coded. Confirm whether callers rely on that.
    '''

    sourcefolder, extension, classpath, outputpath = paths
    excludeif, excludeifnot, excludebelow, excludeabove, sizecap = exclusions
    pastthreshold, futurethreshold = thresholds
    (category2sorton, positive_class, datetype, numfeatures,
     regularization) = classifyconditions

    if not sourcefolder.endswith('/'):
        sourcefolder = sourcefolder + '/'

    # Get a list of files. The order is whatever os.listdir returns; the
    # random.shuffle(allthefiles) call is currently disabled.
    allthefiles = os.listdir(sourcefolder)

    volumeIDs = list()
    volumepaths = list()

    for filename in allthefiles:
        if filename.endswith(extension):
            # The volume ID is the filename minus its extension. Only the
            # trailing extension is removed, so an extension-like string
            # that happens to occur earlier in the name is left alone.
            volID = filename[:len(filename) - len(extension)]
            volumeIDs.append(volID)
            volumepaths.append(sourcefolder + filename)

    metadict = metafilter.get_metadata(classpath, volumeIDs, excludeif,
                                       excludeifnot, excludebelow,
                                       excludeabove)

    # Now that we have a list of volumes with metadata, we can select the
    # groups of IDs that we actually intend to contrast. If we want to use
    # more or less everything, this may not be necessary. But in some cases
    # we want to use randomly sampled subsets.

    # The default condition here is
    # category2sorton = 'reviewed'
    # positive_class = 'rev'
    # sizecap = 350
    # A sizecap less than one means, no sizecap.

    IDsToUse, classdictionary = metafilter.label_classes(
        metadict, category2sorton, positive_class, sizecap)

    # Make a vocabulary list and record which volumes are present.

    wordcounts = Counter()
    volspresent = list()

    for volid, volpath in zip(volumeIDs, volumepaths):
        if volid not in IDsToUse:
            continue

        volspresent.append((volid, volpath))

        # Volumes dated outside the thresholds are kept for the feature
        # pass below but do not contribute to vocabulary selection.
        date = infer_date(metadict[volid], datetype)
        if date < pastthreshold or date > futurethreshold:
            continue

        with open(volpath, encoding='utf-8') as f:
            for line in f:
                fields = line.strip().split('\t')
                if len(fields) != 2:
                    continue
                word = fields[0]
                if len(word) > 0 and word[0].isalpha():
                    # For initial feature selection we use the number of
                    # *documents* that contain a given word, so it's
                    # just += 1.
                    wordcounts[word] += 1

    vocablist = [x[0] for x in wordcounts.most_common(numfeatures)]

    # Feature selection is deprecated. There are cool things we could do
    # with feature selection, but they'd improve accuracy by 1% at the cost
    # of complicating our explanatory task. The tradeoff isn't worth it.
    # Explanation is more important. So we just take the most common words
    # (by number of documents containing them) in the whole corpus.

    randomdata = list()
    revieweddata = list()

    for volid, volpath in volspresent:

        # Each usable line of a volume file is "word<TAB>count"; lines
        # with any other number of fields are skipped.
        voldict = dict()
        with open(volpath, encoding='utf-8') as f:
            for line in f:
                fields = line.strip().split('\t')
                if len(fields) != 2:
                    continue
                voldict[fields[0]] = int(fields[1])

        features = get_features(voldict, vocablist)
        if classdictionary[volid] == 0:
            randomdata.append(features)
        else:
            revieweddata.append(features)

    randomdata = pd.DataFrame(randomdata)
    revieweddata = pd.DataFrame(revieweddata)

    # Grand totals of all feature values in each class.
    randsum = randomdata.values.sum()
    revsum = revieweddata.values.sum()

    dunningdict = dict()

    for idx, word in enumerate(vocablist):
        signed_dunnings, bns, ratio, mwu, mwp = dunnings(
            randomdata, randsum, revieweddata, revsum, idx)
        dunningdict[word] = (signed_dunnings, bns, ratio, mwu, mwp)

    # newline='' lets the csv module control line endings itself.
    with open('dunnings.csv', mode='w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(
            f, fieldnames=['word', 'dunnings', 'bns', 'ratio', 'mwu', 'mwp'])
        writer.writeheader()
        for word, value in dunningdict.items():
            row = dict()
            row['word'] = word
            row['dunnings'] = value[0]
            row['bns'] = value[1]
            row['ratio'] = value[2]
            row['mwu'] = value[3]
            row['mwp'] = value[4]
            writer.writerow(row)
def create_model(paths, exclusions, thresholds, classifyconditions):
    '''
    Train and evaluate leave-one-out logistic models for a labeled corpus.

    Selects volumes according to the metadata conditions, chooses the 3200
    words found in the most documents dated within the thresholds, builds
    one feature row per volume, predicts every volume in a worker pool
    with modelingprocess.model_one_volume, writes the predictions to
    outputpath, fits one model on all trainable volumes, and writes its
    coefficients to 'coefficients.csv' in the current working directory.

    Parameters
    ----------
    paths : 4-tuple
        (sourcefolder, extension, classpath, outputpath).
    exclusions : 5-tuple
        (excludeif, excludeifnot, excludebelow, excludeabove, sizecap),
        passed through to metafilter.
    thresholds : 2-tuple
        (pastthreshold, futurethreshold). Volumes dated outside this
        range are predicted but never used for training or vocabulary
        selection.
    classifyconditions : 3-tuple
        (category2sorton, positive_class, datetype).

    Returns
    -------
    tuple
        (accuracy, allvolumes, coefficientuples): the fraction of
        IDsToUse predicted correctly, the rows written to outputpath, and
        the sorted (coefficient, normalized coefficient, word) triples.

    Notes
    -----
    usedate is not assigned anywhere in this function; it is expected to
    be a module-level flag.
    '''

    sourcefolder, extension, classpath, outputpath = paths
    excludeif, excludeifnot, excludebelow, excludeabove, sizecap = exclusions
    pastthreshold, futurethreshold = thresholds
    category2sorton, positive_class, datetype = classifyconditions

    verbose = False

    if not sourcefolder.endswith('/'):
        sourcefolder = sourcefolder + '/'

    # Get a list of files. The order is whatever os.listdir returns; the
    # random.shuffle(allthefiles) call is currently disabled.
    allthefiles = os.listdir(sourcefolder)

    volumeIDs = list()
    volumepaths = list()

    for filename in allthefiles:
        if filename.endswith(extension):
            # The volume ID is the filename minus its extension. Only the
            # trailing extension is removed, so an extension-like string
            # that happens to occur earlier in the name is left alone.
            volID = filename[:len(filename) - len(extension)]
            volumeIDs.append(volID)
            volumepaths.append(sourcefolder + filename)

    metadict = metafilter.get_metadata(classpath, volumeIDs, excludeif,
                                       excludeifnot, excludebelow,
                                       excludeabove)

    # Now that we have a list of volumes with metadata, we can select the
    # groups of IDs that we actually intend to contrast. If we want to use
    # more or less everything, this may not be necessary. But in some cases
    # we want to use randomly sampled subsets.

    # The default condition here is
    # category2sorton = 'reviewed'
    # positive_class = 'rev'
    # sizecap = 350
    # A sizecap less than one means, no sizecap.

    IDsToUse, classdictionary = metafilter.label_classes(
        metadict, category2sorton, positive_class, sizecap)

    # Make a vocabulary list and record which volumes are present.

    wordcounts = Counter()
    volspresent = list()
    orderedIDs = list()

    for volid, volpath in zip(volumeIDs, volumepaths):
        if volid not in IDsToUse:
            continue

        volspresent.append((volid, volpath))
        orderedIDs.append(volid)

        # Volumes dated outside the thresholds are still predicted later
        # but do not contribute to vocabulary selection.
        date = infer_date(metadict[volid], datetype)
        if date < pastthreshold or date > futurethreshold:
            continue

        with open(volpath, encoding='utf-8') as f:
            for line in f:
                fields = line.strip().split('\t')
                if len(fields) != 2:
                    continue
                word = fields[0]
                if len(word) > 0 and word[0].isalpha():
                    # For initial feature selection we use the number of
                    # *documents* that contain a given word, so it's
                    # just += 1.
                    wordcounts[word] += 1

    vocablist = [x[0] for x in wordcounts.most_common(3200)]

    # More sophisticated feature selection is deprecated. It would improve
    # accuracy by 1% at the cost of complicating our explanatory task. The
    # tradeoff isn't worth it. Explanation is more important.

    # Here we create a list of volume indexes not to be used for training.
    # For instance, we have supplemented the dataset with volumes that
    # are in the Norton but that did not actually occur in random
    # sampling. We want to make predictions for these, but never use
    # them for training.

    donttrainon = list()

    for idx1, anid in enumerate(orderedIDs):
        reviewedstatus = metadict[anid]['reviewed']
        date = infer_date(metadict[anid], datetype)
        if reviewedstatus == 'addedbecausecanon':
            donttrainon.append(idx1)
        elif date < pastthreshold or date > futurethreshold:
            donttrainon.append(idx1)

    never_train = set(donttrainon)

    # Every volume's exclusion list starts with all of donttrainon. We
    # then add the indexes of volumes that have the same author. Obvs,
    # there will always be at least one: the volume itself. Indexes
    # already present through donttrainon are not added twice.

    authormatches = [list(donttrainon) for _ in orderedIDs]

    authors = [metadict[volid]['author'] for volid in orderedIDs]
    for idx1, thisauthor in enumerate(authors):
        matches = authormatches[idx1]
        for idx2, otherauthor in enumerate(authors):
            if thisauthor == otherauthor and idx2 not in never_train:
                matches.append(idx2)

    for alist in authormatches:
        alist.sort(reverse=True)

    # The indexes are put in descending order so that they can be deleted
    # from back to front, without changing indexes yet to be deleted.

    volsizes = dict()
    voldata = list()
    classvector = list()

    for volid, volpath in volspresent:

        # Each usable line of a volume file is "word<TAB>count"; lines
        # with any other number of fields are skipped.
        voldict = dict()
        totalcount = 0
        with open(volpath, encoding='utf-8') as f:
            for line in f:
                fields = line.strip().split('\t')
                if len(fields) != 2:
                    continue
                word = fields[0]
                count = int(fields[1])
                voldict[word] = count
                totalcount += count

        # Dates are expressed as years after 1700, floored at zero.
        date = infer_date(metadict[volid], datetype)
        date = date - 1700
        if date < 0:
            date = 0

        if usedate:
            features = get_features_with_date(voldict, vocablist, date,
                                              totalcount)
            voldata.append(features)
        else:
            features = get_features(voldict, vocablist)
            # The small constant keeps an empty volume from dividing
            # by zero.
            voldata.append(features / (totalcount + 0.001))

        volsizes[volid] = totalcount
        classvector.append(classdictionary[volid])

    data = pd.DataFrame(voldata)

    fivetuples = [
        (data, classvector, authormatches[i], i, usedate)
        for i in range(len(orderedIDs))
    ]

    # Now do leave-one-out predictions.
    print('Beginning multiprocessing.')

    pool = Pool(processes=12)
    try:
        resultlist = pool.map(modelingprocess.model_one_volume, fivetuples)
    finally:
        # Shut the workers down even when a prediction raised.
        pool.close()
        pool.join()

    assert len(resultlist) == len(orderedIDs)

    logisticpredictions = dict(zip(orderedIDs, resultlist))

    print('Multiprocessing concluded.')

    truepositives = 0
    truenegatives = 0
    falsepositives = 0
    falsenegatives = 0
    allvolumes = list()

    # newline='' lets the csv module control line endings itself.
    with open(outputpath, mode='w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        header = [
            'volid', 'reviewed', 'obscure', 'pubdate', 'birthdate',
            'gender', 'nation', 'allwords', 'logistic', 'author', 'title',
            'pubname', 'actually', 'realclass'
        ]
        writer.writerow(header)
        for volid in IDsToUse:
            metadata = metadict[volid]
            logistic = logisticpredictions[volid]
            realclass = classdictionary[volid]
            # The 'actually' column holds the volume's 'canonicity' value.
            outrow = [
                volid, metadata['reviewed'], metadata['obscure'],
                infer_date(metadata, datetype), metadata['birthdate'],
                metadata['gender'], metadata['nation'], volsizes[volid],
                logistic, metadata['author'], metadata['title'],
                metadata['pubname'], metadata['canonicity'], realclass
            ]
            writer.writerow(outrow)
            allvolumes.append(outrow)

            # A probability of exactly 0.5 counts as a negative prediction.
            if logistic > 0.5 and realclass > 0.5:
                truepositives += 1
            elif logistic <= 0.5 and realclass < 0.5:
                truenegatives += 1
            elif logistic <= 0.5 and realclass > 0.5:
                falsenegatives += 1
            elif logistic > 0.5 and realclass < 0.5:
                falsepositives += 1

    # Fit one model on every trainable volume to obtain coefficients.

    donttrainon.sort(reverse=True)
    trainingset, yvals, testset = sliceframe(data, classvector, donttrainon,
                                             0)
    newmodel = LogisticRegression(C=.00007)
    trainingset, means, stdevs = normalizearray(trainingset, usedate)
    newmodel.fit(trainingset, yvals)

    coefficients = newmodel.coef_[0] * 1000000

    coefficientuples = list(
        zip(coefficients, (coefficients / np.array(stdevs)),
            vocablist + ['pub.date']))
    coefficientuples.sort()
    if verbose:
        for coefficient, normalizedcoef, word in coefficientuples:
            print(word + " : " + str(coefficient))

    print()
    accuracy = (truepositives + truenegatives) / len(IDsToUse)

    with open('coefficients.csv', mode='w', encoding='utf-8',
              newline='') as f:
        writer = csv.writer(f)
        for coef, normalizedcoef, word in coefficientuples:
            writer.writerow([word, coef, normalizedcoef])

    return accuracy, allvolumes, coefficientuples
def create_model(paths, exclusions, thresholds, classifyconditions):
    ''' This is the main function in the module.
    It can be called externally; it's also called
    if the module is run directly.

    It selects volumes through metafilter, builds a vocabulary, turns every
    selected volume into a row of features, obtains one prediction per
    volume from modelingprocess.model_one_volume (run in a process pool),
    writes predictions plus metadata to outputpath, and finally fits a
    single logistic model on all trainable volumes so that its
    coefficients can be written out.

    Each argument is a tuple that is unpacked positionally:

    paths -- (sourcefolder, extension, classpath, outputpath)
    exclusions -- (excludeif, excludeifnot, excludebelow, excludeabove,
        sizecap); passed to metafilter
    thresholds -- (pastthreshold, futurethreshold); volumes dated outside
        this range are still predicted, but are kept out of the vocabulary
        count and out of every training set
    classifyconditions -- (category2sorton, positive_class, datetype,
        numfeatures, regularization)

    Returns a tuple (accuracy, allvolumes, coefficientuples):
    accuracy is the fraction of volumes in IDsToUse whose prediction falls
    on the same side of 0.5 as their class; allvolumes is the list of rows
    written to outputpath; coefficientuples is a sorted list of
    (coefficient, coefficient / stdev, feature name) tuples.

    NOTE: `usedate` is read but never assigned in this function, so it has
    to be supplied by the enclosing (presumably module-level) scope.
    '''

    sourcefolder, extension, classpath, outputpath = paths
    excludeif, excludeifnot, excludebelow, excludeabove, sizecap = exclusions
    pastthreshold, futurethreshold = thresholds
    category2sorton, positive_class, datetype, numfeatures, regularization = classifyconditions

    verbose = False

    if not sourcefolder.endswith('/'):
        sourcefolder = sourcefolder + '/'
    # This just makes things easier.

    # Get a list of files.
    allthefiles = os.listdir(sourcefolder)
    # random.shuffle(allthefiles)

    volumeIDs = list()
    volumepaths = list()

    for filename in allthefiles:
        if filename.endswith(extension):
            # The volume ID is the filename minus its extension. Only the
            # trailing extension is removed, so an extension string that
            # also happens to occur inside the filename is left alone.
            volID = filename[:len(filename) - len(extension)]
            volumeIDs.append(volID)
            volumepaths.append(sourcefolder + filename)

    metadict = metafilter.get_metadata(classpath, volumeIDs, excludeif,
                                       excludeifnot, excludebelow, excludeabove)

    # Now that we have a list of volumes with metadata, we can select the
    # groups of IDs that we actually intend to contrast. If we want to use
    # more or less everything, this may not be necessary. But in some cases
    # we want to use randomly sampled subsets.

    # The default condition here is
    # category2sorton = 'reviewed'
    # positive_class = 'rev'
    # sizecap = 350
    # A sizecap less than one means, no sizecap.

    IDsToUse, classdictionary = metafilter.label_classes(metadict, category2sorton,
                                                         positive_class, sizecap)

    # Make a vocabulary list and, at the same time, the ordered list of
    # volumes that are actually present on disk.

    wordcounts = Counter()
    volspresent = list()
    orderedIDs = list()

    for volid, volpath in zip(volumeIDs, volumepaths):
        if volid not in IDsToUse:
            continue
        volspresent.append((volid, volpath))
        orderedIDs.append(volid)

        # Volumes outside the date thresholds are predicted, but they do
        # not contribute to the vocabulary.
        date = infer_date(metadict[volid], datetype)
        if date < pastthreshold or date > futurethreshold:
            continue

        with open(volpath, encoding='utf-8') as f:
            for line in f:
                fields = line.strip().split('\t')
                if len(fields) != 2:
                    continue
                word = fields[0]
                if len(word) > 0 and word[0].isalpha():
                    # For initial feature selection we use the number of
                    # *documents* that contain a given word,
                    # so it's just += 1.
                    wordcounts[word] += 1

    if sourcefolder == "poems/":
        vocablist = [x[0] for x in wordcounts.most_common(numfeatures)]
    else:
        # In an SRT, we can just take them arbitrarily: the features are
        # named V0, V1, ... up to V(numfeatures - 1).
        vocablist = ["V" + str(i) for i in range(numfeatures)]

    # vocablist = binormal_select(vocablist, positivecounts, negativecounts, totalposvols, totalnegvols, 3000)
    # Feature selection is deprecated. There are cool things
    # we could do with feature selection,
    # but they'd improve accuracy by 1% at the cost of complicating our explanatory task.
    # The tradeoff isn't worth it. Explanation is more important.
    # So we just take the most common words (by number of documents containing them)
    # in the whole corpus. Technically, I suppose, we could crossvalidate that as well,
    # but *eyeroll*.

    # Here we create a list of the indexes (in orderedIDs) of volumes not
    # to be used for training. For instance, we have supplemented the
    # dataset with volumes that are in the Norton but that did not actually
    # occur in random sampling. We want to make predictions for these, but
    # never use them for training.

    donttrainon = list()
    for idx1, anid in enumerate(orderedIDs):
        reviewedstatus = metadict[anid]['reviewed']
        date = infer_date(metadict[anid], datetype)
        if (reviewedstatus == 'addedbecausecanon'
                or date < pastthreshold or date > futurethreshold):
            donttrainon.append(idx1)

    # For every volume, identify the indexes that must be removed from its
    # training set: everything in donttrainon plus every volume by the same
    # author (which always includes the volume itself). Grouping indexes by
    # author once avoids comparing every pair of volumes.

    indexes_by_author = dict()
    for idx1, anid in enumerate(orderedIDs):
        indexes_by_author.setdefault(metadict[anid]['author'], []).append(idx1)

    nevertrain = set(donttrainon)

    # The indexes are sorted in reverse order so that they can be deleted
    # from back to front, without changing indexes yet to be deleted.
    # This will become important in the modelingprocess module.
    authormatches = [
        sorted(nevertrain.union(indexes_by_author[metadict[anid]['author']]),
               reverse=True)
        for anid in orderedIDs
    ]

    volsizes = dict()
    voldata = list()
    classvector = list()

    for volid, volpath in volspresent:
        voldict = dict()
        totalcount = 0
        with open(volpath, encoding='utf-8') as f:
            for line in f:
                fields = line.strip().split('\t')
                if len(fields) != 2:
                    continue
                word = fields[0]
                count = float(fields[1])
                voldict[word] = count
                totalcount += count

        # Dates are expressed as years since 1700, floored at zero.
        date = infer_date(metadict[volid], datetype)
        date = date - 1700
        if date < 0:
            date = 0

        if usedate:
            features = get_features_with_date(voldict, vocablist, date, totalcount)
            voldata.append(features)
        else:
            features = get_features(voldict, vocablist)
            if sourcefolder == "poems/":
                # The small constant keeps an empty volume from causing a
                # division by zero.
                voldata.append(features / (totalcount + 0.001))
            else:
                # For SRT transformations, normalization is already handled
                voldata.append(features)

        volsizes[volid] = totalcount
        classvector.append(classdictionary[volid])

    data = pd.DataFrame(voldata)

    # One argument tuple per volume to be predicted.
    sextuplets = [
        (data, classvector, authormatches[i], i, usedate, regularization)
        for i in range(len(orderedIDs))
    ]

    # Now do leave-one-out predictions.
    print('Beginning multiprocessing.')

    # The context manager guarantees that the worker processes are shut
    # down even if one of the models raises.
    with Pool(processes=4) as pool:
        resultlist = pool.map(modelingprocess.model_one_volume, sextuplets)

    assert len(resultlist) == len(orderedIDs)

    logisticpredictions = dict(zip(orderedIDs, resultlist))

    print('Multiprocessing concluded.')

    truepositives = 0
    truenegatives = 0
    falsepositives = 0
    falsenegatives = 0
    allvolumes = list()

    # newline='' is what the csv module requires of files it writes to;
    # without it, extra carriage returns appear on some platforms.
    with open(outputpath, mode='w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        header = ['volid', 'reviewed', 'obscure', 'pubdate', 'birthdate',
                  'gender', 'nation', 'allwords', 'logistic', 'author',
                  'title', 'pubname', 'actually', 'realclass']
        writer.writerow(header)
        for volid in IDsToUse:
            metadata = metadict[volid]
            reviewed = metadata['reviewed']
            obscure = metadata['obscure']
            pubdate = infer_date(metadata, datetype)
            birthdate = metadata['birthdate']
            gender = metadata['gender']
            nation = metadata['nation']
            author = metadata['author']
            title = metadata['title']
            canonicity = metadata['canonicity']
            pubname = metadata['pubname']
            allwords = volsizes[volid]
            logistic = logisticpredictions[volid]
            realclass = classdictionary[volid]
            # Note that 'canonicity' is written under the 'actually' header.
            outrow = [volid, reviewed, obscure, pubdate, birthdate, gender,
                      nation, allwords, logistic, author, title, pubname,
                      canonicity, realclass]
            writer.writerow(outrow)
            allvolumes.append(outrow)

            # A class value of exactly 0.5 counts in none of the four bins.
            if logistic > 0.5 and realclass > 0.5:
                truepositives += 1
            elif logistic <= 0.5 and realclass < 0.5:
                truenegatives += 1
            elif logistic <= 0.5 and realclass > 0.5:
                falsenegatives += 1
            elif logistic > 0.5 and realclass < 0.5:
                falsepositives += 1

    # Fit one model on every volume that may be trained on, purely in order
    # to report coefficients.
    donttrainon.sort(reverse=True)
    trainingset, yvals, testset = sliceframe(data, classvector, donttrainon, 0)
    newmodel = LogisticRegression(C=regularization)
    trainingset, means, stdevs = normalizearray(trainingset, usedate)
    newmodel.fit(trainingset, yvals)

    coefficients = newmodel.coef_[0] * 100

    # zip() stops at the shortest input, so the trailing 'pub.date' label is
    # only used when there is a coefficient for it.
    coefficientuples = list(zip(coefficients,
                                (coefficients / np.array(stdevs)),
                                vocablist + ['pub.date']))
    coefficientuples.sort()
    if verbose:
        for coefficient, normalizedcoef, word in coefficientuples:
            print(word + " : " + str(coefficient))

    print()
    accuracy = (truepositives + truenegatives) / len(IDsToUse)

    coefficientpath = outputpath.replace('.csv', '.coefs.csv')
    if coefficientpath == outputpath:
        # outputpath contains no '.csv', so the replacement did nothing;
        # append the suffix rather than overwrite the predictions file.
        coefficientpath = outputpath + '.coefs.csv'

    with open(coefficientpath, mode='w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        for coef, normalizedcoef, word in coefficientuples:
            writer.writerow([word, coef, normalizedcoef])

    return accuracy, allvolumes, coefficientuples
# Metadata filters handed to metafilter.get_metadata below.
excludeif = {
    'pubname': 'TEM',
    'recept': 'addcanon',
}
# Further filters that are currently switched off:
# excludeif['impaud'] = 'pop'
# excludeif['gender'] = 'm'

excludeifnot = {}
# excludeifnot['gender'] = 'm'

excludebelow = {'inferreddate': 1700}
excludeabove = {'inferreddate': 1950}

futurethreshold = 1950

metadict = metafilter.get_metadata(
    classpath, volumeIDs, excludeif, excludeifnot, excludebelow, excludeabove)

# Now that we have a list of volumes with metadata, we can select the groups
# of IDs that we actually intend to contrast. If we want to use more or less
# everything, this may not be necessary. But in some cases we want to use
# randomly sampled subsets.

# IDsToUse = set([x for x in metadict.keys()])

# The default condition here is
category2sorton = 'reviewed'
positive_class = 'rev'
# A sizecap less than one means, no sizecap.
sizecap = 350

IDsToUse, classdictionary = metafilter.balance_classes(
    metadict, category2sorton, positive_class, sizecap)
# Metadata filters handed to metafilter.get_metadata below.
excludeif = {
    'pubname': 'TEM',
    'recept': 'addcanon',
}
# Further filters that are currently switched off:
# excludeif['impaud'] = 'pop'
# excludeif['gender'] = 'm'

excludeifnot = {}
# excludeifnot['gender'] = 'm'

excludebelow = {'inferreddate': 1700}
excludeabove = {'inferreddate': 1950}

futurethreshold = 1950

metadict = metafilter.get_metadata(
    classpath, volumeIDs, excludeif, excludeifnot, excludebelow, excludeabove)

# Now that we have a list of volumes with metadata, we can select the groups
# of IDs that we actually intend to contrast. If we want to use more or less
# everything, this may not be necessary. But in some cases we want to use
# randomly sampled subsets.

# IDsToUse = set([x for x in metadict.keys()])

# The default condition here is
category2sorton = 'reviewed'
positive_class = 'rev'
# A sizecap less than one means, no sizecap.
sizecap = 350

IDsToUse, classdictionary = metafilter.label_classes(
    metadict, category2sorton, positive_class, sizecap)
def create_model(paths, exclusions, trainthresholds, classifyconditions):
    ''' This is the main function in the module.
    It can be called externally; it's also called
    if the module is run directly.

    It selects volumes through metafilter, restricts a fixed lexicon to
    words that are frequent enough in the trainable volumes, turns every
    selected volume into a row of features, obtains one prediction per
    volume from modelingprocess.model_one_volume (run in a process pool),
    writes predictions plus metadata to outputpath, and finally fits a
    single logistic model on all trainable volumes so that its
    coefficients can be written out.

    Each argument is a tuple that is unpacked positionally:

    paths -- (sourcefolder, extension, classpath, outputpath)
    exclusions -- (excludeif, excludeifnot, excludebelow, excludeabove,
        sizecap); passed to metafilter
    trainthresholds -- (pastthreshold, futurethreshold, donottraintag);
        volumes dated outside the two thresholds, or whose tagset contains
        donottraintag, are predicted but never trained on
    classifyconditions -- (categorytodivideon, positive_tags, negative_tag,
        datetype, numfeatures, regularization)

    Returns a tuple (accuracy, allvolumes, coefficientuples):
    accuracy is the fraction of volumes in IDsToUse whose prediction falls
    on the same side of 0.5 as their class; allvolumes is the list of rows
    written to outputpath; coefficientuples is a sorted list of
    (coefficient, coefficient / stdev, feature name) tuples.

    NOTE: `usedate` is read but never assigned in this function, so it has
    to be supplied by the enclosing (presumably module-level) scope.
    '''

    sourcefolder, extension, classpath, outputpath = paths
    excludeif, excludeifnot, excludebelow, excludeabove, sizecap = exclusions
    pastthreshold, futurethreshold, donottraintag = trainthresholds
    categorytodivideon, positive_tags, negative_tag, datetype, numfeatures, regularization = classifyconditions

    verbose = False

    if not sourcefolder.endswith('/'):
        sourcefolder = sourcefolder + '/'
    # This just makes things easier.

    # Get a list of files.
    allthefiles = os.listdir(sourcefolder)
    # random.shuffle(allthefiles)

    volumeIDs = list()
    volumepaths = list()

    for filename in allthefiles:
        if filename.endswith(extension):
            # The volume ID is the filename minus its extension. Only the
            # trailing extension is removed, so an extension string that
            # also happens to occur inside the filename is left alone.
            volID = filename[:len(filename) - len(extension)]
            volumeIDs.append(volID)
            volumepaths.append(sourcefolder + filename)

    metadict = metafilter.get_metadata(classpath, volumeIDs, excludeif,
                                       excludeifnot, excludebelow, excludeabove)

    # Now that we have a list of volumes with metadata, we can select the
    # groups of IDs that we actually intend to contrast. If we want to use
    # more or less everything, this may not be necessary. But in some cases
    # we want to use randomly sampled subsets.

    IDsToUse, classdictionary = metafilter.label_classes(
        metadict, categorytodivideon, positive_tags, negative_tag, sizecap,
        datetype)

    # Count document frequencies and build the ordered list of volumes that
    # are actually present on disk. At the same time we're going to create
    # a set of volumes that should never be included in the training set.

    wordcounts = Counter()
    volspresent = list()
    orderedIDs = list()
    donttrainset = set()

    for volid, volpath in zip(volumeIDs, volumepaths):
        if volid not in IDsToUse:
            continue
        volspresent.append((volid, volpath))
        orderedIDs.append(volid)

        # The following two if statements catch volumes that should
        # be predicted but not trained on.
        # We add them to the donttrainset, and also 'continue' so
        # they are not used to create vocabulary.

        date = infer_date(metadict[volid], datetype)
        if date < pastthreshold or date > futurethreshold:
            donttrainset.add(volid)
            continue

        tagset = metadict[volid]['tagset']
        if donottraintag in tagset:
            donttrainset.add(volid)
            continue

        with open(volpath, encoding='utf-8') as f:
            for line in f:
                fields = line.strip().split('\t')
                if len(fields) != 2:
                    continue
                word = fields[0]
                if len(word) > 0 and word[0].isalpha():
                    # For initial feature selection we use the number of
                    # *documents* that contain a given word,
                    # so it's just += 1.
                    wordcounts[word] += 1

    # The feature list we use is defined by the top 10,000 words (by document
    # frequency) in the whole corpus, and it will be the same for all models.
    # However, we don't want to include words that are absent or nearly
    # absent in the particular set we're modeling, so a word is kept only if
    # it was counted in more than two of the trainable volumes.
    # NOTE: the lexicon path is relative to the current working directory.

    vocablist = []
    with open('../lexicon/top10k.csv', encoding='utf-8', newline='') as f:
        reader = csv.DictReader(f)
        for row in reader:
            word = row['word'].strip()
            if wordcounts[word] > 2:
                vocablist.append(word)

    numfeatures = len(vocablist)

    # vocablist = binormal_select(vocablist, positivecounts, negativecounts, totalposvols, totalnegvols, 3000)
    # Feature selection is deprecated. There are cool things
    # we could do with feature selection,
    # but they'd improve accuracy by 1% at the cost of complicating our explanatory task.
    # The tradeoff isn't worth it. Explanation is more important.

    # We need a list of indexes in orderedIDs to exclude.

    donttrainon = [orderedIDs.index(x) for x in donttrainset]

    # For every volume, identify the indexes that must be removed from its
    # training set: everything in donttrainon plus every volume by the same
    # author (which always includes the volume itself). Grouping indexes by
    # author once avoids comparing every pair of volumes.

    indexes_by_author = dict()
    for idx1, anid in enumerate(orderedIDs):
        indexes_by_author.setdefault(metadict[anid]['author'], []).append(idx1)

    nevertrain = set(donttrainon)

    # The indexes are sorted in reverse order so that they can be deleted
    # from back to front, without changing indexes yet to be deleted.
    # This will become important in the modelingprocess module.
    authormatches = [
        sorted(nevertrain.union(indexes_by_author[metadict[anid]['author']]),
               reverse=True)
        for anid in orderedIDs
    ]

    volsizes = dict()
    voldata = list()
    classvector = list()

    for volid, volpath in volspresent:
        voldict = dict()
        totalcount = 0
        with open(volpath, encoding='utf-8') as f:
            for line in f:
                fields = line.strip().split('\t')
                if len(fields) != 2:
                    continue
                word = fields[0]
                count = int(fields[1])
                voldict[word] = count
                totalcount += count

        # Dates are expressed as years since 1700, floored at zero.
        date = infer_date(metadict[volid], datetype)
        date = date - 1700
        if date < 0:
            date = 0

        if usedate:
            features = get_features_with_date(voldict, vocablist, date, totalcount)
            voldata.append(features)
        else:
            features = get_features(voldict, vocablist)
            # The small constant keeps an empty volume from causing a
            # division by zero.
            voldata.append(features / (totalcount + 0.001))

        volsizes[volid] = totalcount
        classvector.append(classdictionary[volid])

    data = pd.DataFrame(voldata)

    # One argument tuple per volume to be predicted.
    sextuplets = [
        (data, classvector, authormatches[i], i, usedate, regularization)
        for i in range(len(orderedIDs))
    ]

    # Now do leave-one-out predictions.
    print('Beginning multiprocessing.')

    # The context manager guarantees that the worker processes are shut
    # down even if one of the models raises.
    with Pool(processes=12) as pool:
        resultlist = pool.map(modelingprocess.model_one_volume, sextuplets)

    assert len(resultlist) == len(orderedIDs)

    logisticpredictions = dict(zip(orderedIDs, resultlist))

    print('Multiprocessing concluded.')

    truepositives = 0
    truenegatives = 0
    falsepositives = 0
    falsenegatives = 0
    allvolumes = list()

    # newline='' is what the csv module requires of files it writes to;
    # without it, extra carriage returns appear on some platforms.
    with open(outputpath, mode='w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        header = ['volid', 'dateused', 'pubdate', 'birthdate', 'firstpub',
                  'gender', 'nation', 'allwords', 'logistic', 'realclass',
                  'author', 'title', 'genretags']
        writer.writerow(header)
        for volid in IDsToUse:
            metadata = metadict[volid]
            dateused = metadata[datetype]
            pubdate = metadata['pubdate']
            birthdate = metadata['birthdate']
            firstpub = metadata['firstpub']
            gender = metadata['gender']
            nation = metadata['nation']
            author = metadata['author']
            title = metadata['title']
            allwords = volsizes[volid]
            logistic = logisticpredictions[volid]
            realclass = classdictionary[volid]
            genretags = ' | '.join(metadata['tagset'])
            outrow = [volid, dateused, pubdate, birthdate, firstpub, gender,
                      nation, allwords, logistic, realclass, author, title,
                      genretags]
            writer.writerow(outrow)
            allvolumes.append(outrow)

            # A class value of exactly 0.5 counts in none of the four bins.
            if logistic > 0.5 and realclass > 0.5:
                truepositives += 1
            elif logistic <= 0.5 and realclass < 0.5:
                truenegatives += 1
            elif logistic <= 0.5 and realclass > 0.5:
                falsenegatives += 1
            elif logistic > 0.5 and realclass < 0.5:
                falsepositives += 1

    # Fit one model on every volume that may be trained on, purely in order
    # to report coefficients.
    donttrainon.sort(reverse=True)
    trainingset, yvals, testset = sliceframe(data, classvector, donttrainon, 0)
    newmodel = LogisticRegression(C=regularization)
    trainingset, means, stdevs = normalizearray(trainingset, usedate)
    newmodel.fit(trainingset, yvals)

    coefficients = newmodel.coef_[0] * 100

    # zip() stops at the shortest input, so the trailing 'pub.date' label is
    # only used when there is a coefficient for it.
    coefficientuples = list(zip(coefficients,
                                (coefficients / np.array(stdevs)),
                                vocablist + ['pub.date']))
    coefficientuples.sort()
    if verbose:
        for coefficient, normalizedcoef, word in coefficientuples:
            print(word + " : " + str(coefficient))

    print()
    accuracy = (truepositives + truenegatives) / len(IDsToUse)

    coefficientpath = outputpath.replace('.csv', '.coefs.csv')
    if coefficientpath == outputpath:
        # outputpath contains no '.csv', so the replacement did nothing;
        # append the suffix rather than overwrite the predictions file.
        coefficientpath = outputpath + '.coefs.csv'

    with open(coefficientpath, mode='w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        for coef, normalizedcoef, word in coefficientuples:
            writer.writerow([word, coef, normalizedcoef])

    return accuracy, allvolumes, coefficientuples
def create_model(paths, exclusions, trainthresholds, classifyconditions):
    ''' This is the main function in the module.
    It can be called externally; it's also called
    if the module is run directly.

    It selects volumes through metafilter, restricts a fixed lexicon to
    words that are frequent enough in the trainable volumes, turns every
    selected volume into a row of features, obtains one prediction per
    volume from modelingprocess.model_one_volume (run in a process pool),
    writes predictions plus metadata to outputpath, and finally fits a
    single logistic model on all trainable volumes so that its
    coefficients can be written out.

    Each argument is a tuple that is unpacked positionally:

    paths -- (sourcefolder, extension, classpath, outputpath)
    exclusions -- (excludeif, excludeifnot, excludebelow, excludeabove,
        sizecap); passed to metafilter
    trainthresholds -- (pastthreshold, futurethreshold, donottraintag);
        volumes dated outside the two thresholds, or whose tagset contains
        donottraintag, are predicted but never trained on
    classifyconditions -- (categorytodivideon, positive_tags, negative_tag,
        datetype, numfeatures, regularization)

    Returns a tuple (accuracy, allvolumes, coefficientuples):
    accuracy is the fraction of volumes in IDsToUse whose prediction falls
    on the same side of 0.5 as their class; allvolumes is the list of rows
    written to outputpath; coefficientuples is a sorted list of
    (coefficient, coefficient / stdev, feature name) tuples.

    NOTE: `usedate` is read but never assigned in this function, so it has
    to be supplied by the enclosing (presumably module-level) scope.
    '''

    sourcefolder, extension, classpath, outputpath = paths
    excludeif, excludeifnot, excludebelow, excludeabove, sizecap = exclusions
    pastthreshold, futurethreshold, donottraintag = trainthresholds
    categorytodivideon, positive_tags, negative_tag, datetype, numfeatures, regularization = classifyconditions

    verbose = False

    if not sourcefolder.endswith('/'):
        sourcefolder = sourcefolder + '/'
    # This just makes things easier.

    # Get a list of files.
    allthefiles = os.listdir(sourcefolder)
    # random.shuffle(allthefiles)

    volumeIDs = list()
    volumepaths = list()

    for filename in allthefiles:
        if filename.endswith(extension):
            # The volume ID is the filename minus its extension. Only the
            # trailing extension is removed, so an extension string that
            # also happens to occur inside the filename is left alone.
            volID = filename[:len(filename) - len(extension)]
            volumeIDs.append(volID)
            volumepaths.append(sourcefolder + filename)

    metadict = metafilter.get_metadata(classpath, volumeIDs, excludeif,
                                       excludeifnot, excludebelow, excludeabove)

    # Now that we have a list of volumes with metadata, we can select the
    # groups of IDs that we actually intend to contrast. If we want to use
    # more or less everything, this may not be necessary. But in some cases
    # we want to use randomly sampled subsets.

    IDsToUse, classdictionary = metafilter.label_classes(
        metadict, categorytodivideon, positive_tags, negative_tag, sizecap,
        datetype)

    # Count document frequencies and build the ordered list of volumes that
    # are actually present on disk. At the same time we're going to create
    # a set of volumes that should never be included in the training set.

    wordcounts = Counter()
    volspresent = list()
    orderedIDs = list()
    donttrainset = set()

    for volid, volpath in zip(volumeIDs, volumepaths):
        if volid not in IDsToUse:
            continue
        volspresent.append((volid, volpath))
        orderedIDs.append(volid)

        # The following two if statements catch volumes that should
        # be predicted but not trained on.
        # We add them to the donttrainset, and also 'continue' so
        # they are not used to create vocabulary.

        date = infer_date(metadict[volid], datetype)
        if date < pastthreshold or date > futurethreshold:
            donttrainset.add(volid)
            continue

        tagset = metadict[volid]['tagset']
        if donottraintag in tagset:
            donttrainset.add(volid)
            continue

        with open(volpath, encoding='utf-8') as f:
            for line in f:
                fields = line.strip().split('\t')
                if len(fields) != 2:
                    continue
                word = fields[0]
                if len(word) > 0 and word[0].isalpha():
                    # For initial feature selection we use the number of
                    # *documents* that contain a given word,
                    # so it's just += 1.
                    wordcounts[word] += 1

    # The feature list we use is defined by the top 10,000 words (by document
    # frequency) in the whole corpus, and it will be the same for all models.
    # However, we don't want to include words that are absent or nearly
    # absent in the particular set we're modeling, so a word is kept only if
    # it was counted in more than two of the trainable volumes.
    # NOTE: the lexicon path is relative to the current working directory.

    vocablist = []
    with open('../lexicon/top10k.csv', encoding='utf-8', newline='') as f:
        reader = csv.DictReader(f)
        for row in reader:
            word = row['word'].strip()
            if wordcounts[word] > 2:
                vocablist.append(word)

    numfeatures = len(vocablist)

    # vocablist = binormal_select(vocablist, positivecounts, negativecounts, totalposvols, totalnegvols, 3000)
    # Feature selection is deprecated. There are cool things
    # we could do with feature selection,
    # but they'd improve accuracy by 1% at the cost of complicating our explanatory task.
    # The tradeoff isn't worth it. Explanation is more important.

    # We need a list of indexes in orderedIDs to exclude.

    donttrainon = [orderedIDs.index(x) for x in donttrainset]

    # For every volume, identify the indexes that must be removed from its
    # training set: everything in donttrainon plus every volume by the same
    # author (which always includes the volume itself). Grouping indexes by
    # author once avoids comparing every pair of volumes.

    indexes_by_author = dict()
    for idx1, anid in enumerate(orderedIDs):
        indexes_by_author.setdefault(metadict[anid]['author'], []).append(idx1)

    nevertrain = set(donttrainon)

    # The indexes are sorted in reverse order so that they can be deleted
    # from back to front, without changing indexes yet to be deleted.
    # This will become important in the modelingprocess module.
    authormatches = [
        sorted(nevertrain.union(indexes_by_author[metadict[anid]['author']]),
               reverse=True)
        for anid in orderedIDs
    ]

    volsizes = dict()
    voldata = list()
    classvector = list()

    for volid, volpath in volspresent:
        voldict = dict()
        totalcount = 0
        with open(volpath, encoding='utf-8') as f:
            for line in f:
                fields = line.strip().split('\t')
                if len(fields) != 2:
                    continue
                word = fields[0]
                count = int(fields[1])
                voldict[word] = count
                totalcount += count

        # Dates are expressed as years since 1700, floored at zero.
        date = infer_date(metadict[volid], datetype)
        date = date - 1700
        if date < 0:
            date = 0

        if usedate:
            features = get_features_with_date(voldict, vocablist, date, totalcount)
            voldata.append(features)
        else:
            features = get_features(voldict, vocablist)
            # The small constant keeps an empty volume from causing a
            # division by zero.
            voldata.append(features / (totalcount + 0.001))

        volsizes[volid] = totalcount
        classvector.append(classdictionary[volid])

    data = pd.DataFrame(voldata)

    # One argument tuple per volume to be predicted.
    sextuplets = [
        (data, classvector, authormatches[i], i, usedate, regularization)
        for i in range(len(orderedIDs))
    ]

    # Now do leave-one-out predictions.
    print('Beginning multiprocessing.')

    # The context manager guarantees that the worker processes are shut
    # down even if one of the models raises.
    with Pool(processes=12) as pool:
        resultlist = pool.map(modelingprocess.model_one_volume, sextuplets)

    assert len(resultlist) == len(orderedIDs)

    logisticpredictions = dict(zip(orderedIDs, resultlist))

    print('Multiprocessing concluded.')

    truepositives = 0
    truenegatives = 0
    falsepositives = 0
    falsenegatives = 0
    allvolumes = list()

    # newline='' is what the csv module requires of files it writes to;
    # without it, extra carriage returns appear on some platforms.
    with open(outputpath, mode='w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        header = ['volid', 'dateused', 'pubdate', 'birthdate', 'firstpub',
                  'gender', 'nation', 'allwords', 'logistic', 'realclass',
                  'author', 'title', 'genretags']
        writer.writerow(header)
        for volid in IDsToUse:
            metadata = metadict[volid]
            dateused = metadata[datetype]
            pubdate = metadata['pubdate']
            birthdate = metadata['birthdate']
            firstpub = metadata['firstpub']
            gender = metadata['gender']
            nation = metadata['nation']
            author = metadata['author']
            title = metadata['title']
            allwords = volsizes[volid]
            logistic = logisticpredictions[volid]
            realclass = classdictionary[volid]
            genretags = ' | '.join(metadata['tagset'])
            outrow = [volid, dateused, pubdate, birthdate, firstpub, gender,
                      nation, allwords, logistic, realclass, author, title,
                      genretags]
            writer.writerow(outrow)
            allvolumes.append(outrow)

            # A class value of exactly 0.5 counts in none of the four bins.
            if logistic > 0.5 and realclass > 0.5:
                truepositives += 1
            elif logistic <= 0.5 and realclass < 0.5:
                truenegatives += 1
            elif logistic <= 0.5 and realclass > 0.5:
                falsenegatives += 1
            elif logistic > 0.5 and realclass < 0.5:
                falsepositives += 1

    # Fit one model on every volume that may be trained on, purely in order
    # to report coefficients.
    donttrainon.sort(reverse=True)
    trainingset, yvals, testset = sliceframe(data, classvector, donttrainon, 0)
    newmodel = LogisticRegression(C=regularization)
    trainingset, means, stdevs = normalizearray(trainingset, usedate)
    newmodel.fit(trainingset, yvals)

    coefficients = newmodel.coef_[0] * 100

    # zip() stops at the shortest input, so the trailing 'pub.date' label is
    # only used when there is a coefficient for it.
    coefficientuples = list(zip(coefficients,
                                (coefficients / np.array(stdevs)),
                                vocablist + ['pub.date']))
    coefficientuples.sort()
    if verbose:
        for coefficient, normalizedcoef, word in coefficientuples:
            print(word + " : " + str(coefficient))

    print()
    accuracy = (truepositives + truenegatives) / len(IDsToUse)

    coefficientpath = outputpath.replace('.csv', '.coefs.csv')
    if coefficientpath == outputpath:
        # outputpath contains no '.csv', so the replacement did nothing;
        # append the suffix rather than overwrite the predictions file.
        coefficientpath = outputpath + '.coefs.csv'

    with open(coefficientpath, mode='w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        for coef, normalizedcoef, word in coefficientuples:
            writer.writerow([word, coef, normalizedcoef])

    return accuracy, allvolumes, coefficientuples
def get_data_for_model(paths, exclusions, classifyconditions):
    ''' Unpacks a bunch of parameters that define metadata
    conditions for positive and negative classes. Finds
    volumes meeting those conditions, creates a lexicon if
    one doesn't already exist, and creates a pandas dataframe
    storing texts as rows and words/features as columns.

    Each argument is a tuple that is unpacked positionally:

    paths -- (sourcefolder, extension, metadatapath, outputpath, vocabpath)
    exclusions -- (excludeif, excludeifnot, excludebelow, excludeabove,
        sizecap); passed to metafilter
    classifyconditions -- (positive_tags, negative_tags, datetype,
        numfeatures, regularization, testconditions)

    Returns a 9-tuple: (metadict, masterdata, classvector, classdictionary,
    orderedIDs, donttrainon, donttrainset, authormatches, vocablist).
    donttrainon holds the indexes in orderedIDs of the volumes in
    donttrainset; authormatches holds, for each volume in orderedIDs, a
    reverse-sorted list of the row indexes to leave out of the training
    set when that volume is predicted.
    '''

    sourcefolder, extension, metadatapath, outputpath, vocabpath = paths
    excludeif, excludeifnot, excludebelow, excludeabove, sizecap = exclusions
    positive_tags, negative_tags, datetype, numfeatures, regularization, testconditions = classifyconditions

    verbose = False

    holdout_authors = True
    # If you want reliable results, always run this with holdout_authors
    # set to True. The only reason to set it to False is to confirm that
    # this flag is actually making a difference. If you do that, it
    # disables the code that keeps other works by the author being predicted
    # out of the training set.

    freqs_already_normalized = True
    # By default we assume that frequencies have already been normalized
    # (divided by the total number of words in the volume). This allows us
    # to use some features (like type/token ratio) that would become
    # meaningless if divided by total wordcount. But it means that I'm
    # offloading some important feature-engineering decisions to the
    # data prep stage.

    # The following function confirms that the testconditions are legal.

    confirm_testconditions(testconditions, positive_tags)

    if not sourcefolder.endswith('/'):
        sourcefolder = sourcefolder + '/'
    # This just makes things easier.

    # Get a list of files.
    allthefiles = os.listdir(sourcefolder)

    # RANDOMNESS.

    # random.shuffle(allthefiles)

    # RANDOMNESS. The shuffle above is currently commented out. Without it
    # the sequence of orderedIDs is whatever order os.listdir returned, so
    # the distribution of IDs into folds of the cross-validation does not
    # get re-randomized from run to run.

    volumeIDs = list()
    volumepaths = list()

    for filename in allthefiles:
        if filename.endswith(extension):
            # The volume ID is the filename minus its extension. Only the
            # trailing extension is removed, so an extension string that
            # also happens to occur inside the filename is left alone.
            volID = filename[:len(filename) - len(extension)]
            volumeIDs.append(volID)
            volumepaths.append(sourcefolder + filename)

    metadict = metafilter.get_metadata(metadatapath, volumeIDs, excludeif,
                                       excludeifnot, excludebelow, excludeabove)

    # Now that we have a list of volumes with metadata, we can select the
    # groups of IDs that we actually intend to contrast. Integer tags are
    # taken to be dates of first publication; anything else is a genre tag.

    if isinstance(positive_tags[0], int):
        categorytodivide = 'firstpub'
    else:
        categorytodivide = 'tagset'

    IDsToUse, classdictionary, donttrainset = metafilter.label_classes(
        metadict, categorytodivide, positive_tags, negative_tags, sizecap,
        datetype, excludeif, testconditions)

    print()
    earliest, latest = first_and_last(IDsToUse, metadict, datetype)
    if earliest > 0:
        print("The whole corpus involved here includes " + str(len(IDsToUse)))
        print("volumes, ranging in date from " + str(earliest) + " to " + str(latest) + ".")
        print()

    # We now create an ordered list of id-path tuples for later use, and
    # identify a set of positive ids that should never be used in training.

    volspresent, orderedIDs = get_volume_lists(volumeIDs, volumepaths, IDsToUse)

    # Extend the set of ids not to be used in training by identifying
    # negative volumes that match the distribution of positive volumes.

    describe_donttrainset(donttrainset, classdictionary, metadict, datetype)

    # Create a flag for each volume that indicates whether it was used in
    # training.

    record_trainflags(metadict, donttrainset)

    # Get a count of docfrequency for all words in the corpus. This is
    # probably not needed and might be deprecated later.

    # wordcounts = get_docfrequency(volspresent, donttrainset)

    # The feature list we use is defined by the top 10,000 words (by document
    # frequency) in the whole corpus, and it will be the same for all models.

    vocablist = get_vocablist(vocabpath, volspresent, useall=True, n=numfeatures)

    # This function either gets the vocabulary list already stored in
    # vocabpath, or creates a list of the top 10k words in all files, and
    # stores it there. N is a parameter that could be altered right here.

    # Useall is a parameter that you basically don't need to worry about
    # unless you're changing / testing code. If you set it to false, the
    # vocablist will exclude words that occur very rarely. This shouldn't be
    # necessary; the crossvalidation routine is designed not to include
    # features that occur zero times in the training set. But if you get
    # div-by-zero errors in the training process, you could fiddle with this
    # parameter as part of a troubleshooting process.

    numfeatures = len(vocablist)

    print()
    print("Number of features " + str(numfeatures))

    # For each volume, we're going to create a list of volumes that should be
    # excluded from the training set when it is to be predicted. More
    # precisely, we're going to create a list of their *indexes*, so that we
    # can easily remove rows from the training matrix.

    # This list will include, for ALL volumes, the indexes of vols in the
    # donttrainset.

    donttrainon = [orderedIDs.index(x) for x in donttrainset]
    nevertrain = set(donttrainon)

    # The index lists are sorted in reverse order so that rows can be
    # deleted from back to front, without changing indexes yet to be
    # deleted. This will become important in the modelingprocess module.

    if holdout_authors:
        # Enlarge each list with the indexes of every volume by the same
        # author. Obvs, there will always be at least one, because a volume
        # is excluded from its own training set. Grouping indexes by author
        # once avoids comparing every pair of volumes.
        indexes_by_author = dict()
        for idx1, anid in enumerate(orderedIDs):
            indexes_by_author.setdefault(metadict[anid]['author'], []).append(idx1)

        authormatches = [
            sorted(nevertrain.union(indexes_by_author[metadict[anid]['author']]),
                   reverse=True)
            for anid in orderedIDs
        ]
    else:
        # This code only runs if we're testing the effect of
        # holdout_authors by disabling it: each volume is excluded from
        # its own training set, but other works by its author are not.
        authormatches = [
            sorted(nevertrain.union([idx1]), reverse=True)
            for idx1 in range(len(orderedIDs))
        ]

    # Count the positive and negative instances that are available for
    # training, and report both numbers.

    trainingpositives = set()
    trainingnegatives = set()

    for anid, thisclass in classdictionary.items():
        if anid in donttrainset:
            continue

        if thisclass == 1:
            trainingpositives.add(orderedIDs.index(anid))
        else:
            trainingnegatives.add(orderedIDs.index(anid))

    print('Training positives: ' + str(len(trainingpositives)))
    print('Training negatives: ' + str(len(trainingnegatives)))

    masterdata, classvector, metadict = get_dataframe(
        metadict, volspresent, classdictionary, vocablist,
        freqs_already_normalized)

    return metadict, masterdata, classvector, classdictionary, orderedIDs, donttrainon, donttrainset, authormatches, vocablist
def create_model(paths, exclusions, classifyconditions):
    ''' This is the main function in the module.
    It can be called externally; it's also called
    if the module is run directly.

    It selects volumes by metadata, builds a volume-by-feature frame from
    tab-separated word-count files, obtains one prediction per volume from
    modelingprocess.model_one_volume (run in a process pool), writes the
    predictions to outputpath, fits one further model to extract
    coefficients, and writes those next to the predictions.

    Parameters
    ----------
    paths : sequence of five items
        (sourcefolder, extension, metadatapath, outputpath, vocabpath).
        Files in sourcefolder ending with extension are treated as volumes.
        Predictions are written to outputpath; coefficients go to
        outputpath with '.csv' replaced by '.coefs.csv'.
    exclusions : sequence of five items
        (excludeif, excludeifnot, excludebelow, excludeabove, sizecap),
        passed through to metafilter.
    classifyconditions : sequence of six items
        (positive_tags, negative_tags, datetype, numfeatures,
        regularization, testconditions). If positive_tags[0] is an int the
        classes are divided on 'firstpub', otherwise on 'tagset'.

    Returns
    -------
    tuple
        (accuracy, allvolumes, coefficientuples): the fraction of evaluated
        volumes predicted correctly (0.0 if none were evaluated), the rows
        written to outputpath, and a sorted list of
        (coefficient, normalized coefficient, word) tuples.
    '''

    sourcefolder, extension, metadatapath, outputpath, vocabpath = paths
    excludeif, excludeifnot, excludebelow, excludeabove, sizecap = exclusions
    positive_tags, negative_tags, datetype, numfeatures, regularization, testconditions = classifyconditions

    verbose = False
    holdout_authors = True

    # If you want reliable results, always run this with holdout_authors
    # set to True. The only reason to set it to False is to confirm that
    # this flag is actually making a difference. If you do that, it
    # disables the code that keeps other works by the author being predicted
    # out of the training set.

    # NOTE(review): `usedate` is read below but never assigned in this
    # function; presumably it is a module-level flag -- confirm.

    # The following function confirms that the testconditions are legal.

    confirm_testconditions(testconditions, positive_tags)

    if not sourcefolder.endswith('/'):
        sourcefolder = sourcefolder + '/'

    # This just makes things easier.

    # Get a list of files.
    allthefiles = os.listdir(sourcefolder)
    # random.shuffle(allthefiles)

    volumeIDs = list()
    volumepaths = list()

    for filename in allthefiles:

        if filename.endswith(extension):
            volID = filename.replace(extension, "")
            # The volume ID is basically the filename minus its extension.
            # Extensions are likely to be long enough that there is little
            # danger of accidental occurrence inside a filename. E.g.
            # '.fic.tsv'
            path = sourcefolder + filename
            volumeIDs.append(volID)
            volumepaths.append(path)

    metadict = metafilter.get_metadata(metadatapath, volumeIDs, excludeif,
                                       excludeifnot, excludebelow, excludeabove)

    # Now that we have a list of volumes with metadata, we can select the groups of IDs
    # that we actually intend to contrast.

    if type(positive_tags[0]).__name__ == 'int':
        categorytodivide = 'firstpub'
    else:
        categorytodivide = 'tagset'

    IDsToUse, classdictionary, donttrainset = metafilter.label_classes(
        metadict, categorytodivide, positive_tags, negative_tags, sizecap,
        datetype, excludeif, testconditions)

    print()
    # Named earliest/latest rather than min/max so the builtins stay usable.
    earliest, latest = first_and_last(IDsToUse, metadict, datetype)
    if earliest > 0:
        print("The whole corpus involved here includes " + str(len(IDsToUse)))
        print("volumes, ranging in date from " + str(earliest) + " to " + str(latest) + ".")
        print()

    # We now create an ordered list of id-path tuples for later use, and identify a set of
    # positive ids that should never be used in training.

    volspresent, orderedIDs = get_volume_lists(volumeIDs, volumepaths, IDsToUse)

    # Extend the set of ids not to be used in training by identifying negative volumes that match
    # the distribution of positive volumes.

    describe_donttrainset(donttrainset, classdictionary, metadict, datetype)

    # Create a flag for each volume that indicates whether it was used in training

    record_trainflags(metadict, donttrainset)

    # Get a count of docfrequency for all words in the corpus. This is probably not needed and
    # might be deprecated later.

    wordcounts = get_docfrequency(volspresent, donttrainset)

    # The feature list we use is defined by the top 10,000 words (by document
    # frequency) in the whole corpus, and it will be the same for all models.

    # NOTE(review): get_data_for_model calls get_vocablist without the
    # wordcounts argument; confirm which call matches the current signature.
    vocablist = get_vocablist(vocabpath, volspresent, wordcounts, useall = True, n = numfeatures)

    # This function either gets the vocabulary list already stored in vocabpath, or
    # creates a list of the top 10k words in all files, and stores it there.
    # N is a parameter that could be altered right here.

    # Useall is a parameter that you basically don't need to worry about unless
    # you're changing / testing code. If you set it to false, the vocablist will
    # exclude words that occur very rarely. This shouldn't be necessary; the
    # crossvalidation routine is designed not to include features that occur
    # zero times in the training set. But if you get div-by-zero errors in the
    # training process, you could fiddle with this parameter as part of a
    # troubleshooting process.

    # For each volume, we're going to create a list of volumes that should be
    # excluded from the training set when it is to be predicted. More precisely,
    # we're going to create a list of their *indexes*, so that we can easily
    # remove rows from the training matrix.

    # This list will include for ALL volumes, the indexes of vols in the donttrainset.

    donttrainon = [orderedIDs.index(x) for x in donttrainset]

    authormatches = [list(donttrainon) for x in range(len(orderedIDs))]

    # Now we proceed to enlarge that list by identifying, for each volume,
    # a set of indexes that have the same author. Obvs, there will always be at least one.
    # We exclude a vol from its own training set.

    if holdout_authors:
        for idx1, anid in enumerate(orderedIDs):
            thisauthor = metadict[anid]['author']
            for idx2, anotherid in enumerate(orderedIDs):
                otherauthor = metadict[anotherid]['author']
                if thisauthor == otherauthor and idx2 not in authormatches[idx1]:
                    authormatches[idx1].append(idx2)
    else:
        # This code only runs if we're testing the effect of
        # holdout_authors by disabling it.

        for idx1, anid in enumerate(orderedIDs):
            if idx1 not in authormatches[idx1]:
                authormatches[idx1].append(idx1)

    # Count the positive and negative volumes that remain available
    # for training, purely for the report printed below.

    trainingpositives = set()
    trainingnegatives = set()

    for anid, thisclass in classdictionary.items():
        if anid in donttrainset:
            continue

        if thisclass == 1:
            trainingpositives.add(orderedIDs.index(anid))
        else:
            trainingnegatives.add(orderedIDs.index(anid))

    print('Training positives: ' + str(len(trainingpositives)))
    print('Training negatives: ' + str(len(trainingnegatives)))

    # An earlier version padded each exclusion list with randomly sampled
    # volumes so that positive and negative classes stayed the same size in
    # spite of same-author exclusions. That was dropped because it could
    # have grossly unintended effects when there were many donttrainon
    # exclusions.

    # Let's record, for each volume, the size of its training set.

    trainingsizes = []

    numvolumes = len(orderedIDs)
    for idx, anid in enumerate(orderedIDs):
        excluded = len(authormatches[idx])
        metadict[anid]['trainsize'] = numvolumes - excluded
        trainingsizes.append(metadict[anid]['trainsize'])

    averagetrainingsize = sum(trainingsizes) / len(trainingsizes)

    for alist in authormatches:
        alist.sort(reverse = True)

    # I am reversing the order of indexes so that I can delete them from
    # back to front, without changing indexes yet to be deleted.
    # This will become important in the modelingprocess module.

    volsizes = dict()
    voldata = list()
    classvector = list()

    for volid, volpath in volspresent:

        with open(volpath, encoding = 'utf-8') as f:

            voldict = dict()
            totalcount = 0
            for line in f:
                fields = line.strip().split('\t')
                # Only well-formed "word<TAB>count" rows are used.
                if len(fields) != 2:
                    continue

                word = fields[0]
                count = int(fields[1])
                voldict[word] = count
                totalcount += count

        date = metautils.infer_date(metadict[volid], datetype)
        date = date - 1700
        if date < 0:
            date = 0

        if usedate:
            features = get_features_with_date(voldict, vocablist, date, totalcount)
            voldata.append(features)
        else:
            features = get_features(voldict, vocablist)
            if totalcount == 0:
                # Avoid dividing by zero for a volume with no counted words.
                totalcount = .00001
            voldata.append(features / totalcount)

        volsizes[volid] = totalcount
        classflag = classdictionary[volid]
        classvector.append(classflag)

    data = pd.DataFrame(voldata)

    # One argument tuple per volume, in the same order as orderedIDs.
    sextuplets = list()
    for i, listtoexclude in enumerate(authormatches):
        asixtuple = data, classvector, listtoexclude, i, usedate, regularization
        sextuplets.append(asixtuple)

    # Now do leave-one-out predictions.
    print('Beginning multiprocessing.')

    pool = Pool(processes = 11)
    try:
        res = pool.map_async(modelingprocess.model_one_volume, sextuplets)
        res.wait()
        resultlist = res.get()
    finally:
        # Shut the workers down even when a task raised, so that a
        # failure does not leave worker processes behind.
        pool.close()
        pool.join()

    assert len(resultlist) == len(orderedIDs)

    logisticpredictions = dict(zip(orderedIDs, resultlist))

    print('Multiprocessing concluded.')

    truepositives = 0
    truenegatives = 0
    falsepositives = 0
    falsenegatives = 0
    allvolumes = list()

    # newline='' is what the csv module asks for; without it, platforms
    # that translate line endings get a blank row after every record.
    with open(outputpath, mode = 'w', encoding = 'utf-8', newline = '') as f:
        writer = csv.writer(f)
        header = ['volid', 'dateused', 'pubdate', 'birthdate', 'firstpub', 'gender',
                  'nation', 'allwords', 'logistic', 'realclass', 'trainflag',
                  'trainsize', 'author', 'title', 'genretags']
        writer.writerow(header)
        for volid in IDsToUse:
            metadata = metadict[volid]
            dateused = metadata[datetype]
            pubdate = metadata['pubdate']
            birthdate = metadata['birthdate']
            firstpub = metadata['firstpub']
            gender = metadata['gender']
            nation = metadata['nation']
            author = metadata['author']
            title = metadata['title']
            allwords = volsizes[volid]
            logistic = logisticpredictions[volid]
            realclass = classdictionary[volid]
            trainflag = metadata['trainflag']
            trainsize = metadata['trainsize']
            genretags = ' | '.join(metadata['tagset'])
            outrow = [volid, dateused, pubdate, birthdate, firstpub, gender, nation,
                      allwords, logistic, realclass, trainflag, trainsize, author,
                      title, genretags]
            writer.writerow(outrow)
            allvolumes.append(outrow)

            if logistic == 0.5:
                print("equals!")
                predictedpositive = random.sample([True, False], 1)[0]
            elif logistic > 0.5:
                predictedpositive = True
            elif logistic < 0.5:
                predictedpositive = False
            else:
                # Reached only when no comparison holds (e.g. NaN).
                print('Oh, joy. A fundamental floating point error.')
                predictedpositive = random.sample([True, False], 1)[0]

            if predictedpositive and classdictionary[volid] > 0.5:
                truepositives += 1
            elif not predictedpositive and classdictionary[volid] < 0.5:
                truenegatives += 1
            elif not predictedpositive and classdictionary[volid] > 0.5:
                falsenegatives += 1
            elif predictedpositive and classdictionary[volid] < 0.5:
                falsepositives += 1
            else:
                print("Wait a second, boss.")

    # Fit one more model, with the donttrainon rows passed to sliceframe
    # as exclusions, in order to report coefficients.
    donttrainon.sort(reverse = True)
    trainingset, yvals, testset = sliceframe(data, classvector, donttrainon, 0)
    trainingset, testset = modelingprocess.remove_zerocols(trainingset, testset)
    newmodel = LogisticRegression(C = regularization)
    trainingset, means, stdevs = normalizearray(trainingset, usedate)
    newmodel.fit(trainingset, yvals)

    coefficients = newmodel.coef_[0] * 100

    # NOTE(review): if remove_zerocols drops columns, the coefficients may no
    # longer line up one-to-one with vocablist -- confirm before relying on
    # the word labels.
    coefficientuples = list(zip(coefficients, (coefficients / np.array(stdevs)),
                                vocablist + ['pub.date']))
    coefficientuples.sort()

    if verbose:
        for coefficient, normalizedcoef, word in coefficientuples:
            print(word + " : " + str(coefficient))

    print()
    totalevaluated = truepositives + truenegatives + falsepositives + falsenegatives
    if totalevaluated != len(IDsToUse):
        print("Total evaluated = " + str(totalevaluated))
        print("But we've got " + str(len(IDsToUse)))

    # Each ratio falls back to 0.0 when its denominator is zero, so that a
    # model which predicts no positives (or finds none) still gets its
    # coefficients written and its results returned.
    if totalevaluated:
        accuracy = (truepositives + truenegatives) / totalevaluated
    else:
        accuracy = 0.0

    print('True positives ' + str(truepositives))
    print('True negatives ' + str(truenegatives))
    print('False positives ' + str(falsepositives))
    print('False negatives ' + str(falsenegatives))

    print()
    print('The average size of the training set was ' + str(averagetrainingsize))
    print()

    predictedpositives = truepositives + falsepositives
    actualpositives = truepositives + falsenegatives
    precision = truepositives / predictedpositives if predictedpositives else 0.0
    recall = truepositives / actualpositives if actualpositives else 0.0
    if precision + recall > 0:
        F1 = 2 * (precision * recall) / (precision + recall)
    else:
        F1 = 0.0
    print("F1 : " + str(F1))

    coefficientpath = outputpath.replace('.csv', '.coefs.csv')
    with open(coefficientpath, mode = 'w', encoding = 'utf-8', newline = '') as f:
        writer = csv.writer(f)
        for coef, normalizedcoef, word in coefficientuples:
            writer.writerow([word, coef, normalizedcoef])

    return accuracy, allvolumes, coefficientuples
excludeif = dict() # excludeif['impaud'] = 'pop' excludeif['pubname'] = 'TEM' excludeif['recept'] = 'addcanon' #excludeif['gender'] = 'm' excludeifnot = dict() #excludeifnot['gender'] = 'm' excludeabove = dict() excludebelow = dict() excludebelow['inferreddate'] = 1700 excludeabove['inferreddate'] = 1950 futurethreshold = 1950 metadict = metafilter.get_metadata(classpath, volumeIDs, excludeif, excludeifnot, excludebelow, excludeabove) # Now that we have a list of volumes with metadata, we can select the groups of IDs # that we actually intend to contrast. If we want to us more or less everything, # this may not be necessary. But in some cases we want to use randomly sampled subsets. # IDsToUse = set([x for x in metadict.keys()]) # The default condition here is category2sorton = 'reviewed' positive_class = 'rev' sizecap = 350 # A sizecap less than one means, no sizecap. IDsToUse, classdictionary = metafilter.label_classes(metadict, category2sorton,
excludeif = dict() # excludeif['impaud'] = 'pop' excludeif['pubname'] = 'TEM' excludeif['recept'] = 'addcanon' #excludeif['gender'] = 'm' excludeifnot = dict() #excludeifnot['gender'] = 'm' excludeabove = dict() excludebelow = dict() excludebelow['inferreddate'] = 1700 excludeabove['inferreddate'] = 1950 futurethreshold = 1950 metadict = metafilter.get_metadata(classpath, volumeIDs, excludeif, excludeifnot, excludebelow, excludeabove) # Now that we have a list of volumes with metadata, we can select the groups of IDs # that we actually intend to contrast. If we want to us more or less everything, # this may not be necessary. But in some cases we want to use randomly sampled subsets. # IDsToUse = set([x for x in metadict.keys()]) # The default condition here is category2sorton = 'reviewed' positive_class = 'rev' sizecap = 350 # A sizecap less than one means, no sizecap. IDsToUse, classdictionary = metafilter.balance_classes(metadict,