Example #1

# The excerpts below assume the following imports; `metaselector` and
# `versatiletrainer2` are sibling modules in the repository this code
# comes from.
import os

import numpy as np

import metaselector
import versatiletrainer2

# get_vocablist and get_dataframe are called unqualified inside
# get_simple_data, which suggests that function was excerpted from
# versatiletrainer2 itself; importing them keeps the excerpt runnable.
from versatiletrainer2 import get_vocablist, get_dataframe

def get_ratio_data(vocabpath,
                   sizecap,
                   ratio,
                   tags4positive,
                   tags4negative,
                   excludebelow=0,
                   excludeabove=3000):
    ''' Loads metadata, selects instances for the positive
    and negative classes (using a ratio to dilute the positive
    class with negative instances), creates a lexicon if one doesn't
    already exist, and creates a pandas dataframe storing
    texts as rows and words/features as columns. A refactored
    and simplified version of get_data_for_model().
    '''

    holdout_authors = True
    freqs_already_normalized = True
    verbose = False
    datecols = ['firstpub']
    indexcol = ['docid']
    extension = '.tsv'
    genrecol = 'tags'
    numfeatures = 8000

    sourcefolder = '../data/'
    metadatapath = '../metadata/mastermetadata.csv'

    # Get a list of files.
    allthefiles = os.listdir(sourcefolder)

    volumeIDsinfolder = list()
    numchars2trim = len(extension)

    for filename in allthefiles:

        if filename.endswith(extension):
            volID = filename[0:-numchars2trim]
            # The volume ID is basically the filename minus its extension.
            volumeIDsinfolder.append(volID)

    metadata = metaselector.load_metadata(metadatapath,
                                          volumeIDsinfolder,
                                          excludebelow,
                                          excludeabove,
                                          indexcol=indexcol,
                                          datecols=datecols,
                                          genrecol=genrecol)

    # That function returns a pandas dataframe which is guaranteed to be indexed by indexcol,
    # and to contain a numeric column 'std_date' as well as a column 'tagset' which contains
    # sets of genre tags for each row. It has also been filtered so it only contains volumes
    # in the folder, and none whose date is below excludebelow or above excludeabove.
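
    # For instance, a row of that dataframe might look roughly like this
    # (values are purely illustrative, not taken from the real metadata):
    #
    #   docid          author            std_date   tagset
    #   'uc1.b334275'  'Doyle, Arthur'   1902       {'det100', 'fic'}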

    orderedIDs, classdictionary = metaselector.dilute_positive_class(
        metadata, sizecap, tags4positive, tags4negative, ratio)

    metadata = metadata.loc[orderedIDs]
    # Limits the metadata data frame to rows we are actually using
    # (those selected in dilute_positive_class).

    # We now create an ordered list of id-path tuples.

    volspresent = [(x, sourcefolder + x + extension) for x in orderedIDs]
    print(str(len(volspresent)) + ' volumes present.')

    print('Building vocabulary.')

    vocablist = versatiletrainer2.get_vocablist(vocabpath,
                                                volspresent,
                                                n=numfeatures)

    numfeatures = len(vocablist)

    print()
    print("Number of features: " + str(numfeatures))

    # For each volume, we're going to create a list of volumes that should be
    # excluded from the training set when it is to be predicted. More precisely,
    # we're going to create a list of their *indexes*, so that we can easily
    # remove rows from the training matrix.

    authormatches = [[] for x in orderedIDs]

    # Now we proceed to fill that list by identifying, for each volume,
    # the set of indexes that share its author. There will always be at
    # least one such index, because each volume matches itself; that's
    # how we exclude a volume from its own training set.

    if holdout_authors:
        for idx1, anid in enumerate(orderedIDs):
            thisauthor = metadata.loc[anid, 'author']
            authormatches[idx1] = list(
                np.flatnonzero(metadata['author'] == thisauthor))

    for alist in authormatches:
        alist.sort(reverse=True)

    print()
    print('Authors matched.')
    print()

    # I am reversing the order of indexes so that I can delete them from
    # back to front, without changing indexes yet to be deleted.
    # This will become important in the modelingprocess module.
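    # As a sketch of why that matters (illustrative, not code from the
    # modelingprocess module): if matches == [2, 0] and we deleted row 0
    # first, the row that used to be index 2 would shift to index 1 and
    # we'd delete the wrong one. Deleting back to front leaves the
    # earlier indexes untouched:
    #
    #   for idx in matches:               # already sorted descending
    #       trainingmatrix = np.delete(trainingmatrix, idx, axis=0)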

    masterdata, classvector = versatiletrainer2.get_dataframe(
        volspresent, classdictionary, vocablist, freqs_already_normalized)

    return metadata, masterdata, classvector, classdictionary, orderedIDs, authormatches, vocablist
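
# A hypothetical call, just to show the shape of the interface; the vocab
# path, sizecap, ratio, and tag sets below are illustrative, not values
# taken from the original script:
#
#   (metadata, masterdata, classvector, classdictionary,
#    orderedIDs, authormatches, vocablist) = get_ratio_data(
#       '../lexica/vocablist.txt', sizecap=200, ratio=0.5,
#       tags4positive={'det100'}, tags4negative={'random'})
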
def get_simple_data(sourcefolder, metadatapath, vocabpath,
                    tags4positive, tags4negative, sizecap,
                    forbid4positive={'allnegative'},
                    forbid4negative={'allpositive'},
                    excludebelow=0,
                    excludeabove=3000,
                    verbose=False,
                    datecols=['firstpub'],
                    indexcol=['docid'],
                    extension='.tsv',
                    genrecol='tags',
                    numfeatures=5000,
                    negative_strategy='random',
                    overlap_strategy='random',
                    force_even_distribution=False,
                    forbiddenwords=set()):
    ''' Loads metadata, selects instances for the positive and
    negative classes, creates a lexicon if one doesn't
    already exist, and creates a pandas dataframe storing
    texts as rows and words/features as columns. A refactored
    and simplified version of get_data_for_model().
    '''

    holdout_authors = True

    # Keeps works by author X out of the test set when she's in the
    # training set. In production, always run with holdout_authors
    # set to True. The only reason to set it to False is to confirm that
    # this flag is actually making a difference.

    freqs_already_normalized = True

    # By default we assume that frequencies have already been normalized
    # (divided by the total number of words in the volume). This allows us
    # to use some features (like type/token ratio) that would become
    # meaningless if we divided everything by total wordcount. But it means
    # offloading some important feature-engineering decisions to the
    # data prep stage.
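
    # Concretely (an illustrative sketch, not code from the data prep
    # stage itself), "already normalized" means each cell in the source
    # .tsv files holds a relative frequency,
    #
    #   normalized = {word: count / wordtotal
    #                 for word, count in counts.items()}
    #
    # rather than a raw count.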

    # Make sure the source folder path ends with a slash, so that
    # folder + filename concatenation below produces valid paths.

    if not sourcefolder.endswith('/'):
        sourcefolder = sourcefolder + '/'

    # Get a list of files.
    allthefiles = os.listdir(sourcefolder)

    volumeIDsinfolder = list()
    numchars2trim = len(extension)

    for filename in allthefiles:

        if filename.endswith(extension):
            volID = filename[0:-numchars2trim]
            # The volume ID is basically the filename minus its extension.
            volumeIDsinfolder.append(volID)

    metadata = metaselector.load_metadata(metadatapath,
                                          volumeIDsinfolder,
                                          excludebelow,
                                          excludeabove,
                                          indexcol=indexcol,
                                          datecols=datecols,
                                          genrecol=genrecol)

    # That function returns a pandas dataframe which is guaranteed to be indexed by indexcol,
    # and to contain a numeric column 'std_date' as well as a column 'tagset' which contains
    # sets of genre tags for each row. It has also been filtered so it only contains volumes
    # in the folder, and none whose date is below excludebelow or above excludeabove.

    orderedIDs, classdictionary = metaselector.select_instances(
        metadata, sizecap, tags4positive, tags4negative,
        forbid4positive, forbid4negative,
        negative_strategy=negative_strategy,
        overlap_strategy=overlap_strategy,
        force_even_distribution=force_even_distribution)

    metadata = metadata.loc[orderedIDs]
    # Limits the metadata data frame to rows we are actually using
    # (those selected in select_instances).

    minimumdate = min(metadata.std_date)
    maximumdate = max(metadata.std_date)

    print()
    print(str(len(orderedIDs)) + " volumes range in date from " +
          str(minimumdate) + " to " + str(maximumdate) + ".")
    print()

    # We now create an ordered list of id-path tuples.

    volspresent = [(x, sourcefolder + x + extension) for x in orderedIDs]


    print('Building vocabulary.')

    vocablist = get_vocablist(vocabpath,
                              volspresent,
                              n=numfeatures,
                              forbidden=forbiddenwords)

    # This function either gets the vocabulary list already stored in vocabpath, or
    # creates a list of the top n words, by doc frequency, in the volumes
    # we're using.
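
    # A rough sketch of that behavior (not the actual implementation in
    # this repo; read_words is a hypothetical helper):
    #
    #   from collections import Counter
    #   if os.path.isfile(vocabpath):
    #       vocablist = [line.strip() for line in open(vocabpath)]
    #   else:
    #       docfreqs = Counter()
    #       for volid, path in volspresent:
    #           words = set(read_words(path))
    #           docfreqs.update(words - forbidden)
    #       vocablist = [w for w, c in docfreqs.most_common(n)]
    #       with open(vocabpath, mode='w') as f:
    #           f.writelines(w + '\n' for w in vocablist)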

    if numfeatures > len(vocablist):
        print('Vocabulary capped at ' + str(len(vocablist)) + ' because no more')
        print('were available at ' + vocabpath)

    numfeatures = len(vocablist)

    print()
    print("Number of features: " + str(numfeatures))

    # For each volume, we're going to create a list of volumes that should be
    # excluded from the training set when it is to be predicted. More precisely,
    # we're going to create a list of their *indexes*, so that we can easily
    # remove rows from the training matrix.

    authormatches = [[] for x in orderedIDs]

    # Now we proceed to fill that list by identifying, for each volume,
    # the set of indexes that share its author. There will always be at
    # least one such index, because each volume matches itself; that's
    # how we exclude a volume from its own training set.

    if holdout_authors:
        for idx1, anid in enumerate(orderedIDs):
            thisauthor = metadata.loc[anid, 'author']
            authormatches[idx1] = list(
                np.flatnonzero(metadata['author'] == thisauthor))

    for alist in authormatches:
        alist.sort(reverse=True)

    print()
    print('Authors matched.')
    print()

    # I am reversing the order of indexes so that I can delete them from
    # back to front, without changing indexes yet to be deleted.
    # This will become important in the modelingprocess module.

    masterdata, classvector = get_dataframe(
        volspresent, classdictionary, vocablist, freqs_already_normalized)

    return metadata, masterdata, classvector, classdictionary, orderedIDs, authormatches, vocablist
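
# Again, a hypothetical call with illustrative arguments, just to show how
# the interface fits together; none of these paths or tags come from the
# original script:
#
#   results = get_simple_data('../data', '../metadata/mastermetadata.csv',
#                             '../lexica/vocablist.txt',
#                             tags4positive={'det100'},
#                             tags4negative={'random'},
#                             sizecap=200)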