import os

import numpy as np

# Project-local modules, assumed to be importable from this package;
# versatiletrainer2 provides get_vocablist() and get_dataframe().
import metaselector
import versatiletrainer2


def get_ratio_data(vocabpath, sizecap, ratio, tags4positive, tags4negative,
                   excludebelow=0, excludeabove=3000):
    '''
    Loads metadata, selects instances for the positive and negative classes
    (using a ratio to dilute the positive class with negative instances),
    creates a lexicon if one doesn't already exist, and creates a pandas
    dataframe storing texts as rows and words/features as columns. A
    refactored and simplified version of get_data_for_model().
    '''

    holdout_authors = True
    freqs_already_normalized = True
    verbose = False
    datecols = ['firstpub']
    indexcol = ['docid']
    extension = '.tsv'
    genrecol = 'tags'
    numfeatures = 8000
    sourcefolder = '../data/'
    metadatapath = '../metadata/mastermetadata.csv'

    # Get a list of files.
    allthefiles = os.listdir(sourcefolder)

    volumeIDsinfolder = list()
    numchars2trim = len(extension)

    for filename in allthefiles:
        if filename.endswith(extension):
            # The volume ID is the filename minus its extension.
            volID = filename[0:-numchars2trim]
            volumeIDsinfolder.append(volID)

    metadata = metaselector.load_metadata(metadatapath, volumeIDsinfolder,
                                          excludebelow, excludeabove,
                                          indexcol=indexcol,
                                          datecols=datecols,
                                          genrecol=genrecol)

    # That function returns a pandas dataframe which is guaranteed to be
    # indexed by indexcol, and to contain a numeric column 'std_date' as well
    # as a column 'tagset' which holds a set of genre tags for each row. It
    # has also been filtered so it only contains volumes in the folder, and
    # none whose date is below excludebelow or above excludeabove.

    orderedIDs, classdictionary = metaselector.dilute_positive_class(
        metadata, sizecap, tags4positive, tags4negative, ratio)

    # Limit the metadata dataframe to the rows we are actually using
    # (those selected in dilute_positive_class).
    metadata = metadata.loc[orderedIDs]

    # We now create an ordered list of id-path tuples.
    volspresent = [(x, sourcefolder + x + extension) for x in orderedIDs]
    print(len(volspresent))

    print('Building vocabulary.')

    vocablist = versatiletrainer2.get_vocablist(vocabpath, volspresent,
                                                n=numfeatures)
    numfeatures = len(vocablist)

    print()
    print("Number of features: " + str(numfeatures))

    # For each volume, we're going to create a list of volumes that should be
    # excluded from the training set when it is to be predicted. More
    # precisely, we're going to create a list of their *indexes*, so that we
    # can easily remove rows from the training matrix.

    authormatches = [[] for x in orderedIDs]

    # Now we enlarge that list by identifying, for each volume, the set of
    # indexes that share its author. There will always be at least one,
    # because we exclude a volume from its own training set.

    if holdout_authors:
        for idx1, anid in enumerate(orderedIDs):
            thisauthor = metadata.loc[anid, 'author']
            authormatches[idx1] = list(
                np.flatnonzero(metadata['author'] == thisauthor))

    # Reverse the order of indexes so they can be deleted from back to front
    # without changing indexes yet to be deleted. This will become important
    # in the modelingprocess module.
    for alist in authormatches:
        alist.sort(reverse=True)

    print()
    print('Authors matched.')
    print()

    masterdata, classvector = versatiletrainer2.get_dataframe(
        volspresent, classdictionary, vocablist, freqs_already_normalized)

    return (metadata, masterdata, classvector, classdictionary, orderedIDs,
            authormatches, vocablist)
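# A minimal usage sketch for get_ratio_data, not part of the original
# pipeline. The vocabulary path and genre tags below are hypothetical
# stand-ins; the exact semantics of `ratio` are defined by
# metaselector.dilute_positive_class.

def _demo_ratio_run():
    results = get_ratio_data(
        '../lexica/ratio_vocab.txt',      # hypothetical vocabpath
        sizecap=200,                      # cap on instances per class
        ratio=0.5,                        # hypothetical dilution ratio
        tags4positive={'detective'},      # hypothetical genre tags
        tags4negative={'random'})
    metadata, masterdata, classvector = results[0:3]
    print(masterdata.shape, len(classvector))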
def get_simple_data(sourcefolder, metadatapath, vocabpath, tags4positive,
                    tags4negative, sizecap, forbid4positive={'allnegative'},
                    forbid4negative={'allpositive'}, excludebelow=0,
                    excludeabove=3000, verbose=False, datecols=['firstpub'],
                    indexcol=['docid'], extension='.tsv', genrecol='tags',
                    numfeatures=5000, negative_strategy='random',
                    overlap_strategy='random', force_even_distribution=False,
                    forbiddenwords=set()):
    '''
    Loads metadata, selects instances for the positive and negative classes,
    creates a lexicon if one doesn't already exist, and creates a pandas
    dataframe storing texts as rows and words/features as columns. A
    refactored and simplified version of get_data_for_model().
    '''

    holdout_authors = True

    # Keeps works by author X out of the test set when she's in the training
    # set. In production, always run with holdout_authors set to True. The
    # only reason to set it to False is to confirm that this flag is actually
    # making a difference.

    freqs_already_normalized = True

    # By default we assume that frequencies have already been normalized
    # (divided by the total number of words in the volume). This allows us
    # to use some features (like type/token ratio) that would become
    # meaningless if we divided everything by total wordcount. But it means
    # offloading some important feature-engineering decisions to the data
    # prep stage.

    if not sourcefolder.endswith('/'):
        sourcefolder = sourcefolder + '/'

    # Get a list of files.
    allthefiles = os.listdir(sourcefolder)

    volumeIDsinfolder = list()
    numchars2trim = len(extension)

    for filename in allthefiles:
        if filename.endswith(extension):
            # The volume ID is the filename minus its extension.
            volID = filename[0:-numchars2trim]
            volumeIDsinfolder.append(volID)

    metadata = metaselector.load_metadata(metadatapath, volumeIDsinfolder,
                                          excludebelow, excludeabove,
                                          indexcol=indexcol,
                                          datecols=datecols,
                                          genrecol=genrecol)

    # That function returns a pandas dataframe which is guaranteed to be
    # indexed by indexcol, and to contain a numeric column 'std_date' as well
    # as a column 'tagset' which holds a set of genre tags for each row. It
    # has also been filtered so it only contains volumes in the folder, and
    # none whose date is below excludebelow or above excludeabove.

    orderedIDs, classdictionary = metaselector.select_instances(
        metadata, sizecap, tags4positive, tags4negative, forbid4positive,
        forbid4negative, negative_strategy=negative_strategy,
        overlap_strategy=overlap_strategy,
        force_even_distribution=force_even_distribution)

    # Limit the metadata dataframe to the rows we are actually using
    # (those selected in select_instances).
    metadata = metadata.loc[orderedIDs]

    minimumdate = min(metadata.std_date)
    maximumdate = max(metadata.std_date)

    print()
    print(str(len(orderedIDs)) + " volumes range in date from " +
          str(minimumdate) + " to " + str(maximumdate) + ".")
    print()

    # We now create an ordered list of id-path tuples.
    volspresent = [(x, sourcefolder + x + extension) for x in orderedIDs]

    print('Building vocabulary.')

    vocablist = versatiletrainer2.get_vocablist(vocabpath, volspresent,
                                                n=numfeatures,
                                                forbidden=forbiddenwords)

    # That function either gets the vocabulary list already stored in
    # vocabpath, or creates a list of the top n words, by doc frequency,
    # in the volumes we're using.

    if numfeatures > len(vocablist):
        print('Vocabulary capped at ' + str(len(vocablist)) +
              ' because no more were available at ' + vocabpath)
        numfeatures = len(vocablist)

    print()
    print("Number of features: " + str(numfeatures))

    # For each volume, we're going to create a list of volumes that should be
    # excluded from the training set when it is to be predicted. More
    # precisely, we're going to create a list of their *indexes*, so that we
    # can easily remove rows from the training matrix.

    authormatches = [[] for x in orderedIDs]

    # Now we enlarge that list by identifying, for each volume, the set of
    # indexes that share its author. There will always be at least one,
    # because we exclude a volume from its own training set.

    if holdout_authors:
        for idx1, anid in enumerate(orderedIDs):
            thisauthor = metadata.loc[anid, 'author']
            authormatches[idx1] = list(
                np.flatnonzero(metadata['author'] == thisauthor))

    # Reverse the order of indexes so they can be deleted from back to front
    # without changing indexes yet to be deleted. This will become important
    # in the modelingprocess module.
    for alist in authormatches:
        alist.sort(reverse=True)

    print()
    print('Authors matched.')
    print()

    masterdata, classvector = versatiletrainer2.get_dataframe(
        volspresent, classdictionary, vocablist, freqs_already_normalized)

    return (metadata, masterdata, classvector, classdictionary, orderedIDs,
            authormatches, vocablist)
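# Why each list in authormatches is sorted reverse=True: deleting rows from
# back to front never shifts the indexes of rows still waiting to be deleted.
# A minimal illustration with a plain list; the modelingprocess module does
# the analogous thing to rows of the training matrix.

def _demo_backward_deletion():
    rows = ['vol0', 'vol1', 'vol2', 'vol3', 'vol4']
    to_drop = [4, 1]                 # indexes already sorted descending
    for idx in to_drop:
        del rows[idx]                # earlier indexes remain valid
    assert rows == ['vol0', 'vol2', 'vol3']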
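# A sketch of the get_vocablist contract described above, NOT the real
# implementation (which lives in versatiletrainer2). It assumes each volume
# file is a TSV whose first column is a word; if vocabpath already exists,
# the cached list is reused, otherwise the top n words by document frequency
# are computed, written to vocabpath, and returned.

def _sketch_get_vocablist(vocabpath, volspresent, n, forbidden=frozenset()):
    from collections import Counter

    if os.path.isfile(vocabpath):
        with open(vocabpath, encoding='utf-8') as f:
            return [line.strip() for line in f if line.strip()]

    docfreq = Counter()
    for volid, path in volspresent:
        with open(path, encoding='utf-8') as f:
            words = {line.split('\t')[0] for line in f if line.strip()}
        docfreq.update(words - forbidden)    # count each word once per doc

    vocablist = [word for word, count in docfreq.most_common(n)]
    with open(vocabpath, mode='w', encoding='utf-8') as f:
        for word in vocablist:
            f.write(word + '\n')
    return vocablist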