def implement_assignment(assignment_file):
    """Train genre-vs-random contrast models for every assignment in a file.

    `assignment_file` is tab-separated: column 0 is a genre tag, column 1 is
    either the literal string 'self' or a second tag of the form 'X-Not-Y'.
    When a second tag is present, its 'Y' part (and 'Y B') is excluded from
    the positive class during sampling.

    Side effects: trains one model per (assignment, contrast) pair through
    versatiletrainer2, appends a summary line per model to
    '../results/crossmodels.tsv', and removes each temporary vocab file.
    Models whose output CSV already exists are skipped.
    """

    assignments = dict()

    with open(assignment_file, encoding='utf-8') as f:
        for line in f:
            row = line.strip().split('\t')

            # Build a filesystem-safe key from the raw tag.
            name = row[0].replace(': ', '')
            name = name.replace(' ', '')
            name = name.replace(',', '')

            positive_genres = [row[0]]

            if row[1] != 'self':
                positive_genres.append(row[1])
                exclusion = row[1].split('-Not-')[1]
                excludename = exclusion.replace(' ', '')
                excludename = excludename.replace(':', '')
                excludename = excludename.replace(',', '')
                name = name + '-Not-' + excludename

            assignments[name] = positive_genres

    sourcefolder = '../data/'
    sizecap = 100
    outmodels = '../results/crossmodels.tsv'

    for posname, assigned_positives in assignments.items():
        print()
        # BUGFIX: this previously printed `name`, a stale leftover from the
        # parsing loop above; report the assignment actually being processed.
        print(posname, assigned_positives)
        print()

        if len(assigned_positives) > 1:
            exclusion = assigned_positives[1].split('-Not-')[1]
            exclusionB = exclusion + ' B'
            set2exclude = {exclusion, exclusionB}

        else:
            set2exclude = set()

        c_range = [.00001, .0001, .001, .01, 0.1, 1, 10, 100]
        featurestart = 500
        featureend = 6800
        featurestep = 100
        modelparams = 'logistic', 12, featurestart, featureend, featurestep, c_range
        metadatapath = '../metadata/genremeta.csv'

        for contrast in ['randomA', 'randomB']:

            name = posname + '_' + contrast
            vocabpath = '../lexica/' + name + '.txt'
            tags4positive = set(assigned_positives)

            tags4negative = {contrast}
            floor = 1700
            ceiling = 2011

            # Skip any model that has already been trained and saved.
            checkpath = '../models/' + name + '.csv'
            if not os.path.isfile(checkpath):

                metadata, masterdata, classvector, classdictionary, orderedIDs, authormatches, vocablist = versatiletrainer2.get_simple_data(
                    sourcefolder,
                    metadatapath,
                    vocabpath,
                    tags4positive,
                    tags4negative,
                    sizecap,
                    excludebelow=floor,
                    excludeabove=ceiling,
                    force_even_distribution=False,
                    negative_strategy='closely match',
                    numfeatures=6900,
                    forbid4positive=set2exclude,
                    forbid4negative=set())

                matrix, maxaccuracy, metadata, coefficientuples, features4max, best_regularization_coef = versatiletrainer2.tune_a_model(
                    metadata, masterdata, classvector, classdictionary,
                    orderedIDs, authormatches, vocablist, tags4positive,
                    tags4negative, modelparams, name,
                    '../models/' + name + '.csv')

                meandate = int(
                    round(np.sum(metadata.firstpub) / len(metadata.firstpub)))

                with open(outmodels, mode='a', encoding='utf-8') as f:
                    outline = name + '\t' + str(meandate) + '\t' + str(
                        maxaccuracy) + '\t' + str(features4max) + '\t' + str(
                            best_regularization_coef) + '\n'
                    f.write(outline)

                os.remove(vocabpath)
def reliable_genre_comparisons():
    '''
    This function was used in the current version of the article.

    It addresses weaknesses in earlier versions of genre comparison
    by comparing only models *with no shared instances*.

    [Edit Jan 1: To be even more careful about leakage, make that
    *no shared authors.*]

    Doing that required a ----load of complexity I'm afraid. I have to first
    split each genre into disjoint sets, then create self-comparisons between
    those disjoint sets, as well as cross-comparisons between genres, and then
    finally compare the self-comparisons to the cross-comparisons.
    '''

    outmodels = '../results/reliable_models.tsv'
    outcomparisons = '../results/reliable_comparisons.tsv'
    columns = [
        'testype', 'name1', 'name2', 'ceiling', 'floor', 'meandate1',
        'meandate2', 'acc1', 'acc2', 'alienacc1', 'alienacc2', 'spearman',
        'spear1on2', 'spear2on1', 'loss', 'loss1on2', 'loss2on1'
    ]

    # Create output files with headers only if absent, so repeated runs
    # append to earlier results instead of clobbering them.
    if not os.path.isfile(outcomparisons):
        with open(outcomparisons, mode='a', encoding='utf-8') as f:
            scribe = csv.DictWriter(f, delimiter='\t', fieldnames=columns)
            scribe.writeheader()

    if not os.path.isfile(outmodels):
        with open(outmodels, mode='a', encoding='utf-8') as f:
            outline = 'name\tsize\tfloor\tceiling\tmeandate\taccuracy\tfeatures\tregularization\ti\n'
            f.write(outline)

    sourcefolder = '../data/'
    sizecap = 72

    # Hyperparameter search space: regularization constants and a sweep
    # over feature-count cutoffs.
    c_range = [.00001, .0001, .001, .01, 0.1, 1, 10, 100]
    featurestart = 1500
    featureend = 6500
    featurestep = 300
    modelparams = 'logistic', 15, featurestart, featureend, featurestep, c_range

    master = pd.read_csv('../metadata/mastermetadata.csv', index_col='docid')
    periods = [(1800, 1909), (1880, 1924), (1900, 1949), (1910, 1959),
               (1930, 1969), (1950, 1979), (1970, 1989), (1980, 1999),
               (1990, 2010)]
    forbiddenwords = {'fantasy', 'fiction', 'science', 'horror'}

    # endpoints both inclusive

    def compare_pair(testype, name1, name2, floor, ceiling):
        # Helper: run get_divergence on a pair of trained models and append
        # one row of comparison statistics to outcomparisons. Factored out
        # because the original repeated this block four times verbatim.
        r = dict()
        r['testype'] = testype
        r['ceiling'] = ceiling
        r['floor'] = floor
        r['name1'] = name1
        r['name2'] = name2
        (r['spearman'], r['loss'], r['spear1on2'], r['spear2on1'],
         r['loss1on2'], r['loss2on1'], r['acc1'], r['acc2'],
         r['alienacc1'], r['alienacc2'], r['meandate1'],
         r['meandate2']) = get_divergence(name1, name2)
        write_a_row(r, outcomparisons, columns)

    for i in range(15):
        for floor, ceiling in periods:

            split_metadata(master, floor, ceiling, sizecap)

            # That function just above does the real work of preventing leakage,
            # by splitting the genre into two disjoint sets. This allows self-
            # comparisons that avoid shared authors, and are thus strictly
            # comparable to cross-comparisons.

            metaoptions = ['sf1', 'sf2', 'fant1', 'fant2']

            for m in metaoptions:
                metadatapath = '../temp/' + m + '.csv'
                vocabpath = '../lexica/' + m + '.txt'
                name = 'temp_' + m + str(ceiling) + '_' + str(i)

                if m == 'sf1' or m == 'sf2':
                    tags4positive = {'sf_loc', 'sf_oclc', 'sf_bailey'}
                else:
                    tags4positive = {'fantasy_loc', 'fantasy_oclc', 'supernat'}

                tags4negative = {'random', 'randomB'}

                metadata, masterdata, classvector, classdictionary, orderedIDs, authormatches, vocablist = versatiletrainer2.get_simple_data(
                    sourcefolder,
                    metadatapath,
                    vocabpath,
                    tags4positive,
                    tags4negative,
                    sizecap,
                    excludebelow=floor,
                    excludeabove=ceiling,
                    forbid4positive={'juv'},
                    forbid4negative={'juv'},
                    force_even_distribution=False,
                    numfeatures=6500,
                    forbiddenwords=forbiddenwords)

                matrix, maxaccuracy, metadata, coefficientuples, features4max, best_regularization_coef = versatiletrainer2.tune_a_model(
                    metadata, masterdata, classvector, classdictionary,
                    orderedIDs, authormatches, vocablist, tags4positive,
                    tags4negative, modelparams, name,
                    '../modeloutput/' + name + '.csv')

                meandate = int(
                    round(np.sum(metadata.firstpub) / len(metadata.firstpub)))

                # One summary line per trained model.
                with open(outmodels, mode='a', encoding='utf-8') as f:
                    outline = name + '\t' + str(sizecap) + '\t' + str(
                        floor) + '\t' + str(ceiling) + '\t' + str(
                            meandate) + '\t' + str(maxaccuracy) + '\t' + str(
                                features4max) + '\t' + str(
                                    best_regularization_coef) + '\t' + str(
                                        i) + '\n'
                    f.write(outline)

                os.remove(vocabpath)

            # Two self-comparisons (disjoint halves of the same genre) and
            # two cross-comparisons (sf half vs. fantasy half).
            suffix = str(ceiling) + '_' + str(i)
            compare_pair('sfself', 'temp_sf1' + suffix, 'temp_sf2' + suffix,
                         floor, ceiling)
            compare_pair('fantasyself', 'temp_fant1' + suffix,
                         'temp_fant2' + suffix, floor, ceiling)
            compare_pair('cross', 'temp_sf1' + suffix, 'temp_fant2' + suffix,
                         floor, ceiling)
            compare_pair('cross', 'temp_sf2' + suffix, 'temp_fant1' + suffix,
                         floor, ceiling)
def reliable_genre_comparisons():

    '''
    This function was used in the current version of the article.

    It addresses weaknesses in earlier versions of genre comparison
    by comparing only models *with no shared instances*.

    [Edit Jan 1: To be even more careful about leakage, make that
    *no shared authors.*]

    Doing that required a ----load of complexity I'm afraid. I have to first
    split each genre into disjoint sets, then create self-comparisons between
    those disjoint sets, as well as cross-comparisons between genres, and then
    finally compare the self-comparisons to the cross-comparisons.

    NOTE(review): this file contains an earlier definition with the same
    name; since this one appears later, it is the definition actually bound
    to `reliable_genre_comparisons` at import time. Confirm which copy is
    intended before deleting either.
    '''

    outmodels = '../results/reliable_models.tsv'
    outcomparisons = '../results/reliable_comparisons.tsv'
    columns = ['testype', 'name1', 'name2', 'ceiling', 'floor', 'meandate1', 'meandate2', 'acc1', 'acc2', 'alienacc1', 'alienacc2', 'spearman', 'spear1on2', 'spear2on1', 'loss', 'loss1on2', 'loss2on1']

    # Write headers only when the output files don't exist yet, so repeated
    # runs append to previous results instead of clobbering them.
    if not os.path.isfile(outcomparisons):
        with open(outcomparisons, mode = 'a', encoding = 'utf-8') as f:
            scribe = csv.DictWriter(f, delimiter = '\t', fieldnames = columns)
            scribe.writeheader()

    if not os.path.isfile(outmodels):
        with open(outmodels, mode = 'a', encoding = 'utf-8') as f:
            outline = 'name\tsize\tfloor\tceiling\tmeandate\taccuracy\tfeatures\tregularization\ti\n'
            f.write(outline)

    sourcefolder = '../data/'
    sizecap = 72

    # Hyperparameter grid: regularization constants plus a feature-count sweep.
    c_range = [.00001, .0001, .001, .01, 0.1, 1, 10, 100]
    featurestart = 1500
    featureend = 6500
    featurestep = 300
    modelparams = 'logistic', 15, featurestart, featureend, featurestep, c_range

    master = pd.read_csv('../metadata/mastermetadata.csv', index_col = 'docid')
    periods = [(1800, 1909), (1880, 1924), (1900, 1949), (1910, 1959), (1930, 1969), (1950, 1979), (1970, 1989), (1980, 1999), (1990, 2010)]
    forbiddenwords = {'fantasy', 'fiction', 'science', 'horror'}

    # endpoints both inclusive

    for i in range(15):
        for floor, ceiling in periods:

            split_metadata(master, floor, ceiling, sizecap)

            # That function just above does the real work of preventing leakage,
            # by splitting the genre into two disjoint sets. This allows self-
            # comparisons that avoid shared authors, and are thus strictly
            # comparable to cross-comparisons.

            metaoptions = ['sf1', 'sf2', 'fant1', 'fant2']

            for m in metaoptions:
                metadatapath = '../temp/' + m + '.csv'
                vocabpath = '../lexica/' + m + '.txt'
                name = 'temp_' + m + str(ceiling) + '_' + str(i)

                if m == 'sf1' or m == 'sf2':
                    tags4positive = {'sf_loc', 'sf_oclc', 'sf_bailey'}
                else:
                    tags4positive = {'fantasy_loc', 'fantasy_oclc', 'supernat'}

                tags4negative = {'random', 'randomB'}

                metadata, masterdata, classvector, classdictionary, orderedIDs, authormatches, vocablist = versatiletrainer2.get_simple_data(sourcefolder, metadatapath, vocabpath, tags4positive, tags4negative, sizecap, excludebelow = floor, excludeabove = ceiling, forbid4positive = {'juv'}, forbid4negative = {'juv'}, force_even_distribution = False, numfeatures = 6500, forbiddenwords = forbiddenwords)

                matrix, maxaccuracy, metadata, coefficientuples, features4max, best_regularization_coef = versatiletrainer2.tune_a_model(metadata, masterdata, classvector, classdictionary, orderedIDs, authormatches, vocablist, tags4positive, tags4negative, modelparams, name, '../modeloutput/' + name + '.csv')

                meandate = int(round(np.sum(metadata.firstpub) / len(metadata.firstpub)))

                # Append one summary row per trained model.
                with open(outmodels, mode = 'a', encoding = 'utf-8') as f:
                    outline = name + '\t' + str(sizecap) + '\t' + str(floor) + '\t' + str(ceiling) + '\t' + str(meandate) + '\t' + str(maxaccuracy) + '\t' + str(features4max) + '\t' + str(best_regularization_coef) + '\t' + str(i) + '\n'
                    f.write(outline)

                os.remove(vocabpath)

            # Self-comparison: the two disjoint halves of science fiction.
            r = dict()
            r['testype'] = 'sfself'
            r['ceiling'] = ceiling
            r['floor'] = floor
            r['name1'] = 'temp_sf1' + str(ceiling) + '_' + str(i)
            r['name2'] = 'temp_sf2' + str(ceiling) + '_' + str(i)
            r['spearman'], r['loss'], r['spear1on2'], r['spear2on1'], r['loss1on2'], r['loss2on1'], r['acc1'], r['acc2'], r['alienacc1'], r['alienacc2'], r['meandate1'], r['meandate2'] = get_divergence(r['name1'], r['name2'])
            write_a_row(r, outcomparisons, columns)

            # Self-comparison: the two disjoint halves of fantasy.
            r = dict()
            r['testype'] = 'fantasyself'
            r['ceiling'] = ceiling
            r['floor'] = floor
            r['name1'] = 'temp_fant1' + str(ceiling) + '_' + str(i)
            r['name2'] = 'temp_fant2' + str(ceiling) + '_' + str(i)
            r['spearman'], r['loss'], r['spear1on2'], r['spear2on1'], r['loss1on2'], r['loss2on1'], r['acc1'], r['acc2'], r['alienacc1'], r['alienacc2'], r['meandate1'], r['meandate2'] = get_divergence(r['name1'], r['name2'])
            write_a_row(r, outcomparisons, columns)

            # Cross-comparison: sf half 1 against fantasy half 2.
            r = dict()
            r['testype'] = 'cross'
            r['ceiling'] = ceiling
            r['floor'] = floor
            r['name1'] = 'temp_sf1' + str(ceiling) + '_' + str(i)
            r['name2'] = 'temp_fant2' + str(ceiling) + '_' + str(i)
            r['spearman'], r['loss'], r['spear1on2'], r['spear2on1'], r['loss1on2'], r['loss2on1'], r['acc1'], r['acc2'], r['alienacc1'], r['alienacc2'], r['meandate1'], r['meandate2'] = get_divergence(r['name1'], r['name2'])
            write_a_row(r, outcomparisons, columns)

            # Cross-comparison: sf half 2 against fantasy half 1.
            r = dict()
            r['testype'] = 'cross'
            r['ceiling'] = ceiling
            r['floor'] = floor
            r['name1'] = 'temp_sf2' + str(ceiling) + '_' + str(i)
            r['name2'] = 'temp_fant1' + str(ceiling) + '_' + str(i)
            r['spearman'], r['loss'], r['spear1on2'], r['spear2on1'], r['loss1on2'], r['loss2on1'], r['acc1'], r['acc2'], r['alienacc1'], r['alienacc2'], r['meandate1'], r['meandate2'] = get_divergence(r['name1'], r['name2'])
            write_a_row(r, outcomparisons, columns)
def create_cross_models():
    """Train one model per genre tag against each of two random contrasts.

    Reads every tag in '../genremeta.csv' (tags column is '|'-delimited),
    then for each tag trains a tag-vs-randomA and a tag-vs-randomB model,
    appending a summary line per model to '../results/crossmodels.tsv'.
    Models whose output CSV already exists are skipped.
    """

    allgenres = set()
    meta = pd.read_csv('../genremeta.csv')
    for idx, row in meta.iterrows():
        genres = row.tags.split('|')
        for g in genres:
            allgenres.add(g)

    allgenres = list(allgenres)
    print(allgenres)

    for g in allgenres:
        print()
        print(g)
        print()
        sourcefolder = '../data/'
        sizecap = 100
        outmodels = '../results/crossmodels.tsv'

        c_range = [.00001, .0001, .001, .01, 0.1, 1, 10, 100]
        featurestart = 1000
        featureend = 7000
        featurestep = 100
        modelparams = 'logistic', 12, featurestart, featureend, featurestep, c_range
        metadatapath = '../genremeta.csv'

        for contrast in ['randomA', 'randomB']:

            name = g + '_' + contrast
            vocabpath = '../lexica/' + name + '.txt'
            tags4positive = {g}
            tags4negative = {contrast}
            floor = 1700
            ceiling = 2011

            # Skip models that have already been trained and saved.
            checkpath = '../models/' + name + '.csv'
            if not os.path.isfile(checkpath):

                metadata, masterdata, classvector, classdictionary, orderedIDs, authormatches, vocablist = versatiletrainer2.get_simple_data(
                    sourcefolder,
                    metadatapath,
                    vocabpath,
                    tags4positive,
                    tags4negative,
                    sizecap,
                    excludebelow=floor,
                    excludeabove=ceiling,
                    force_even_distribution=False,
                    negative_strategy='closely match',
                    numfeatures=7000,
                    forbid4positive=set(),
                    forbid4negative=set())

                # NOTE(review): an earlier comment here claimed children's lit
                # was being excluded, but forbid4positive/forbid4negative are
                # both empty sets — nothing is excluded. Confirm intent.

                matrix, maxaccuracy, metadata, coefficientuples, features4max, best_regularization_coef = versatiletrainer2.tune_a_model(
                    metadata, masterdata, classvector, classdictionary,
                    orderedIDs, authormatches, vocablist, tags4positive,
                    tags4negative, modelparams, name,
                    '../models/' + name + '.csv')

                meandate = int(
                    round(np.sum(metadata.firstpub) / len(metadata.firstpub)))

                # Append one summary row per trained model.
                with open(outmodels, mode='a', encoding='utf-8') as f:
                    outline = name + '\t' + str(meandate) + '\t' + str(
                        maxaccuracy) + '\t' + str(features4max) + '\t' + str(
                            best_regularization_coef) + '\n'
                    f.write(outline)

                os.remove(vocabpath)
def create_cross_models():
    """Train one model per genre tag against each of two random contrasts.

    NOTE(review): duplicate of the earlier create_cross_models definition in
    this file; being defined later, this copy is the one bound to the name
    at import time. Confirm which copy is intended before deleting either.
    """

    allgenres = set()
    meta = pd.read_csv('../genremeta.csv')
    for idx, row in meta.iterrows():
        genres = row.tags.split('|')
        for g in genres:
            allgenres.add(g)

    allgenres = list(allgenres)
    print(allgenres)

    for g in allgenres:
        print()
        print(g)
        print()
        sourcefolder = '../data/'
        sizecap = 100
        outmodels = '../results/crossmodels.tsv'

        c_range = [.00001, .0001, .001, .01, 0.1, 1, 10, 100]
        featurestart = 1000
        featureend = 7000
        featurestep = 100
        modelparams = 'logistic', 12, featurestart, featureend, featurestep, c_range
        metadatapath = '../genremeta.csv'

        for contrast in ['randomA', 'randomB']:

            name = g + '_' + contrast
            vocabpath = '../lexica/' + name + '.txt'
            tags4positive = {g}
            tags4negative = {contrast}
            floor = 1700
            ceiling = 2011

            # Skip models that have already been trained and saved.
            checkpath = '../models/' + name + '.csv'
            if not os.path.isfile(checkpath):

                metadata, masterdata, classvector, classdictionary, orderedIDs, authormatches, vocablist = versatiletrainer2.get_simple_data(sourcefolder, metadatapath, vocabpath, tags4positive, tags4negative, sizecap, excludebelow = floor, excludeabove = ceiling, force_even_distribution = False, negative_strategy = 'closely match', numfeatures = 7000, forbid4positive = set(), forbid4negative = set())

                # NOTE(review): an earlier comment here claimed children's lit
                # was being excluded, but forbid4positive/forbid4negative are
                # both empty sets — nothing is excluded. Confirm intent.

                matrix, maxaccuracy, metadata, coefficientuples, features4max, best_regularization_coef = versatiletrainer2.tune_a_model(metadata, masterdata, classvector, classdictionary, orderedIDs, authormatches, vocablist, tags4positive, tags4negative, modelparams, name, '../models/' + name + '.csv')

                meandate = int(round(np.sum(metadata.firstpub) / len(metadata.firstpub)))

                # Append one summary row per trained model.
                with open(outmodels, mode = 'a', encoding = 'utf-8') as f:
                    outline = name + '\t' + str(meandate) + '\t' + str(maxaccuracy) + '\t' + str(features4max) + '\t' + str(best_regularization_coef) + '\n'
                    f.write(outline)

                os.remove(vocabpath)
def implement_assignment(assignment_file):
    """Train genre-vs-random contrast models for every assignment in a file.

    NOTE(review): duplicate of the earlier implement_assignment definition in
    this file; being defined later, this copy is the one bound to the name at
    import time. Confirm which copy is intended before deleting either.
    """

    assignments = dict()

    with open(assignment_file, encoding = 'utf-8') as f:
        for line in f:
            row = line.strip().split('\t')

            # Build a filesystem-safe key from the raw tag.
            name = row[0].replace(': ', '')
            name = name.replace(' ', '')
            name = name.replace(',', '')

            positive_genres = [row[0]]

            if row[1] != 'self':
                positive_genres.append(row[1])
                exclusion = row[1].split('-Not-')[1]
                excludename = exclusion.replace(' ', '')
                excludename = excludename.replace(':', '')
                excludename = excludename.replace(',', '')
                name = name + '-Not-' + excludename

            assignments[name] = positive_genres

    sourcefolder = '../data/'
    sizecap = 100
    outmodels = '../results/crossmodels.tsv'

    for posname, assigned_positives in assignments.items():
        print()
        # BUGFIX: this previously printed `name`, a stale leftover from the
        # parsing loop above; report the assignment actually being processed.
        print(posname, assigned_positives)
        print()

        if len(assigned_positives) > 1:
            exclusion = assigned_positives[1].split('-Not-')[1]
            exclusionB = exclusion + ' B'
            set2exclude = {exclusion, exclusionB}

        else:
            set2exclude = set()

        c_range = [.00001, .0001, .001, .01, 0.1, 1, 10, 100]
        featurestart = 500
        featureend = 6800
        featurestep = 100
        modelparams = 'logistic', 12, featurestart, featureend, featurestep, c_range
        metadatapath = '../metadata/genremeta.csv'

        for contrast in ['randomA', 'randomB']:

            name = posname + '_' + contrast
            vocabpath = '../lexica/' + name + '.txt'
            tags4positive = set(assigned_positives)

            tags4negative = {contrast}
            floor = 1700
            ceiling = 2011

            # Skip models that have already been trained and saved.
            checkpath = '../models/' + name + '.csv'
            if not os.path.isfile(checkpath):

                metadata, masterdata, classvector, classdictionary, orderedIDs, authormatches, vocablist = versatiletrainer2.get_simple_data(sourcefolder, metadatapath, vocabpath, tags4positive, tags4negative, sizecap, excludebelow = floor, excludeabove = ceiling, force_even_distribution = False, negative_strategy = 'closely match', numfeatures = 6900, forbid4positive = set2exclude, forbid4negative = set())

                matrix, maxaccuracy, metadata, coefficientuples, features4max, best_regularization_coef = versatiletrainer2.tune_a_model(metadata, masterdata, classvector, classdictionary, orderedIDs, authormatches, vocablist, tags4positive, tags4negative, modelparams, name, '../models/' + name + '.csv')

                meandate = int(round(np.sum(metadata.firstpub) / len(metadata.firstpub)))

                with open(outmodels, mode = 'a', encoding = 'utf-8') as f:
                    outline = name + '\t' + str(meandate) + '\t' + str(maxaccuracy) + '\t' + str(features4max) + '\t' + str(best_regularization_coef) + '\n'
                    f.write(outline)

                os.remove(vocabpath)
def train_nonmodel():
    """Fit a notfiction-vs-fiction model from a precomputed sample matrix."""

    # Data sources and sampling cap.
    source = 'samplematrix.csv'
    meta_path = '../union_of_subsets.csv'
    cap = 700

    # One narrow feature bucket (1003..1103 by 100) with a wide C sweep.
    grid_c = [.00001, .0001, .001, .01, 0.1, 1, 10, 100]
    params = ('logistic', 12, 1003, 1103, 100, grid_c)

    model_name = 'nonmodel'

    positives = {'notfiction'}
    negatives = {'longfiction', 'shortfiction', 'juvenile', 'poetry', 'drama'}

    (metadata, masterdata, classvector, classdictionary,
     orderedIDs, authormatches, vocablist) = versatiletrainer2.get_simple_data(
        source, meta_path, 'dummyvariable', positives, negatives, cap,
        excludebelow=1800, excludeabove=2011,
        forbid4positive={'nothing'}, forbid4negative={'nothing'},
        force_even_distribution=False, negative_strategy='random',
        numfeatures=7500)

    (matrix, maxaccuracy, metadata, coefficientuples,
     features4max, best_regularization_coef) = versatiletrainer2.tune_a_model(
        metadata, masterdata, classvector, classdictionary, orderedIDs,
        authormatches, vocablist, positives, negatives, params, model_name,
        'output/' + model_name + '.csv')
# Exemplo n.º 8
# 0
def first_experiment():
    """Model fantasy vs. science fiction and plot the tuning grid."""

    fantasy_tags = {'fantasy_loc', 'fantasy_oclc'}
    sf_tags = {'sf_loc', 'sf_oclc'}

    (metadata, masterdata, classvector, classdictionary,
     orderedIDs, authormatches, vocablist) = versatiletrainer2.get_simple_data(
        '../data/', '../metadata/mastermetadata.csv',
        '../modeloutput/experimentalvocab.txt',
        fantasy_tags, sf_tags, 200)

    # Narrow C grid; feature counts swept 3000-4400 in steps of 100.
    settings = ('logistic', 10, 3000, 4400, 100, [.004, .012, 0.3, 0.8, 2])

    (matrix, maxaccuracy, metadata, coefficientuples,
     features4max, best_regularization_coef) = versatiletrainer2.tune_a_model(
        metadata, masterdata, classvector, classdictionary, orderedIDs,
        authormatches, vocablist, fantasy_tags, sf_tags, settings,
        'first_experiment', '../modeloutput/first_experiment.csv')

    # Visualize accuracy across the (features, C) tuning grid.
    plt.rcParams["figure.figsize"] = [9.0, 6.0]
    plt.matshow(matrix, origin='lower', cmap=plt.cm.YlOrRd)
    plt.show()
def new_experiment():
    """Train models on fantasy/detective mixtures plus two gold standards.

    For each iteration i in 3..5: trains one model per mixing ratio on
    pre-mixed data, then one pure-fantasy and one pure-detective 'gold'
    model, appending a summary row per model to
    '../measuredivergence/results/newexperimentmodels.csv'.
    """

    # The first time I ran this, I used partition 2 to build the
    # mixed data, and partition 1 as a gold standard. Now reversing.

    outmodelpath = '../measuredivergence/results/newexperimentmodels.csv'
    columns = ['name', 'size', 'ratio', 'iteration', 'meandate', 'maxaccuracy', 'features', 'regularization']
    if not os.path.isfile(outmodelpath):
        with open(outmodelpath, mode = 'w', encoding = 'utf-8') as f:
            scribe = csv.DictWriter(f, fieldnames = columns)
            scribe.writeheader()

    # Hyperparameter grid shared by all models below.
    c_range = [.00001, .0001, .001, .01, 0.1, 1, 10, 100]
    featurestart = 1500
    featureend = 6000
    featurestep = 300
    modelparams = 'logistic', 10, featurestart, featureend, featurestep, c_range
    sizecap = 75

    for i in range(3, 6):
        for ratio in [0, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 100]:
            sourcefolder = '../measuredivergence/mix/' + str(ratio) + '/'
            metadatapath = '../measuredivergence/partitionmeta/meta' + str(ratio) + '.csv'
            name = 'mixeddata_' + str(i) + '_' + str(ratio)
            vocabpath = '../lexica/' + name + '.txt'
            tags4positive = {'fantasy', 'detective'}
            tags4negative = {'random'}
            floor = 1800
            ceiling = 1930

            metadata, masterdata, classvector, classdictionary, orderedIDs, authormatches, vocablist = versatiletrainer2.get_simple_data(sourcefolder, metadatapath, vocabpath, tags4positive, tags4negative, sizecap, excludebelow = floor, excludeabove = ceiling, force_even_distribution = False, numfeatures = 6000)

            matrix, maxaccuracy, metadata, coefficientuples, features4max, best_regularization_coef = versatiletrainer2.tune_a_model(metadata, masterdata, classvector, classdictionary, orderedIDs, authormatches, vocablist, tags4positive, tags4negative, modelparams, name, '../measuredivergence/newmodeloutput/' + name + '.csv')

            meandate = int(round(np.sum(metadata.firstpub) / len(metadata.firstpub)))

            # One summary row per trained mixed-data model.
            row = dict()
            row['name'] = name
            row['size'] = sizecap
            row['ratio'] = ratio
            row['iteration'] = i
            row['meandate'] = meandate
            row['maxaccuracy'] = maxaccuracy
            row['features'] = features4max
            row['regularization'] = best_regularization_coef

            with open(outmodelpath, mode = 'a', encoding = 'utf-8') as f:
                scribe = csv.DictWriter(f, fieldnames = columns)
                scribe.writerow(row)

            os.remove(vocabpath)

        # Gold-standard fantasy model, trained on a disjoint partition.
        sourcefolder = '../data/'
        metadatapath = '../measuredivergence/partitionmeta/part2.csv'
        # note that this is changed if you create mix data with
        # partition 2

        name = 'goldfantasy_' + str(i)
        vocabpath = '../lexica/' + name + '.txt'
        tags4positive = {'fantasy'}
        tags4negative = {'random', 'randomB'}
        floor = 1800
        ceiling = 1930

        metadata, masterdata, classvector, classdictionary, orderedIDs, authormatches, vocablist = versatiletrainer2.get_simple_data(sourcefolder, metadatapath, vocabpath, tags4positive, tags4negative, sizecap, excludebelow = floor, excludeabove = ceiling, force_even_distribution = False, numfeatures = 6000)

        matrix, maxaccuracy, metadata, coefficientuples, features4max, best_regularization_coef = versatiletrainer2.tune_a_model(metadata, masterdata, classvector, classdictionary, orderedIDs, authormatches, vocablist, tags4positive, tags4negative, modelparams, name, '../measuredivergence/newmodeloutput/' + name + '.csv')

        meandate = int(round(np.sum(metadata.firstpub) / len(metadata.firstpub)))

        row = dict()
        row['name'] = name
        row['size'] = sizecap
        # NOTE(review): `ratio` here is the leftover value from the last pass
        # of the inner loop (100); presumably meaningless for gold models —
        # confirm downstream consumers ignore it.
        row['ratio'] = ratio
        row['iteration'] = i
        row['meandate'] = meandate
        row['maxaccuracy'] = maxaccuracy
        row['features'] = features4max
        row['regularization'] = best_regularization_coef

        with open(outmodelpath, mode = 'a', encoding = 'utf-8') as f:
            scribe = csv.DictWriter(f, fieldnames = columns)
            scribe.writerow(row)

        os.remove(vocabpath)

        # Gold-standard detective model, same partition as the fantasy gold.
        sourcefolder = '../data/'
        metadatapath = '../measuredivergence/partitionmeta/part2.csv'
        # depending on which partition you used to create mix data;
        # this will be the other one

        name = 'golddetective_' + str(i)
        vocabpath = '../lexica/' + name + '.txt'
        tags4positive = {'detective'}
        tags4negative = {'random', 'randomB'}
        floor = 1800
        ceiling = 1930

        metadata, masterdata, classvector, classdictionary, orderedIDs, authormatches, vocablist = versatiletrainer2.get_simple_data(sourcefolder, metadatapath, vocabpath, tags4positive, tags4negative, sizecap, excludebelow = floor, excludeabove = ceiling, force_even_distribution = False, numfeatures = 6000)

        matrix, maxaccuracy, metadata, coefficientuples, features4max, best_regularization_coef = versatiletrainer2.tune_a_model(metadata, masterdata, classvector, classdictionary, orderedIDs, authormatches, vocablist, tags4positive, tags4negative, modelparams, name, '../measuredivergence/newmodeloutput/' + name + '.csv')

        meandate = int(round(np.sum(metadata.firstpub) / len(metadata.firstpub)))

        row = dict()
        row['name'] = name
        row['size'] = sizecap
        # NOTE(review): same leftover `ratio` caveat as above.
        row['ratio'] = ratio
        row['iteration'] = i
        row['meandate'] = meandate
        row['maxaccuracy'] = maxaccuracy
        row['features'] = features4max
        row['regularization'] = best_regularization_coef

        with open(outmodelpath, mode = 'a', encoding = 'utf-8') as f:
            scribe = csv.DictWriter(f, fieldnames = columns)
            scribe.writerow(row)

        os.remove(vocabpath)
def first_experiment():
    """Train a single fantasy-vs-science-fiction model and plot the grid.

    Builds one model contrasting fantasy tags against SF tags, then
    displays the tuning matrix (accuracy across feature counts and
    regularization constants) as a heat map.
    """

    data_dir = '../data/'
    meta_path = '../metadata/mastermetadata.csv'
    vocab_path = '../modeloutput/experimentalvocab.txt'
    positive_tags = {'fantasy_loc', 'fantasy_oclc'}
    negative_tags = {'sf_loc', 'sf_oclc'}
    cap = 200

    metadata, masterdata, classvector, classdictionary, orderedIDs, authormatches, vocablist = versatiletrainer2.get_simple_data(
        data_dir, meta_path, vocab_path, positive_tags, negative_tags, cap)

    # Search grid: logistic regression, 10 folds, feature counts
    # 3000-4400 in steps of 100, over five regularization constants.
    regularization_grid = [.004, .012, 0.3, 0.8, 2]
    modelparams = ('logistic', 10, 3000, 4400, 100, regularization_grid)

    matrix, maxaccuracy, metadata, coefficientuples, features4max, best_regularization_coef = versatiletrainer2.tune_a_model(
        metadata, masterdata, classvector, classdictionary, orderedIDs,
        authormatches, vocablist, positive_tags, negative_tags, modelparams,
        'first_experiment', '../modeloutput/first_experiment.csv')

    # Visualize accuracy across the tuning grid.
    plt.rcParams["figure.figsize"] = [9.0, 6.0]
    plt.matshow(matrix, origin = 'lower', cmap = plt.cm.YlOrRd)
    plt.show()
def new_experiment():
    """Train mixed-genre models at varying ratios, plus gold-standard models.

    For iterations 3-5, trains a model on each mixed fantasy/detective
    dataset (ratios 0-100), then trains gold-standard fantasy and
    detective models against partition 2.  One summary row per model is
    appended to the results CSV.

    The first time this was run, partition 2 was used to build the mixed
    data and partition 1 as a gold standard.  Now reversing.
    """

    outmodelpath = '../measuredivergence/results/newexperimentmodels.csv'
    columns = ['name', 'size', 'ratio', 'iteration', 'meandate', 'maxaccuracy', 'features', 'regularization']
    if not os.path.isfile(outmodelpath):
        with open(outmodelpath, mode = 'w', encoding = 'utf-8') as f:
            scribe = csv.DictWriter(f, fieldnames = columns)
            scribe.writeheader()

    # Tuning configuration shared by every model in this experiment.
    c_range = [.00001, .0001, .001, .01, 0.1, 1, 10, 100]
    modelparams = 'logistic', 10, 1500, 6000, 300, c_range
    sizecap = 75

    def _train_and_record(sourcefolder, metadatapath, name,
                          tags4positive, tags4negative, ratio, i):
        """Train one model, append its summary row, delete its temp lexicon.

        The train/record/cleanup sequence was previously copy-pasted for
        the mixed and both gold-standard cases; all of them funnel here.
        """
        vocabpath = '../lexica/' + name + '.txt'
        floor = 1800
        ceiling = 1930

        metadata, masterdata, classvector, classdictionary, orderedIDs, authormatches, vocablist = versatiletrainer2.get_simple_data(
            sourcefolder, metadatapath, vocabpath, tags4positive,
            tags4negative, sizecap, excludebelow = floor,
            excludeabove = ceiling, force_even_distribution = False,
            numfeatures = 6000)

        matrix, maxaccuracy, metadata, coefficientuples, features4max, best_regularization_coef = versatiletrainer2.tune_a_model(
            metadata, masterdata, classvector, classdictionary, orderedIDs,
            authormatches, vocablist, tags4positive, tags4negative,
            modelparams, name,
            '../measuredivergence/newmodeloutput/' + name + '.csv')

        # Mean first-publication date of the volumes actually selected.
        meandate = int(round(np.sum(metadata.firstpub) / len(metadata.firstpub)))

        row = {'name': name,
               'size': sizecap,
               'ratio': ratio,
               'iteration': i,
               'meandate': meandate,
               'maxaccuracy': maxaccuracy,
               'features': features4max,
               'regularization': best_regularization_coef}

        with open(outmodelpath, mode = 'a', encoding = 'utf-8') as f:
            csv.DictWriter(f, fieldnames = columns).writerow(row)

        # The lexicon is a per-model temp file; remove it once modeled.
        os.remove(vocabpath)

    for i in range(3, 6):
        for ratio in [0, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 100]:
            _train_and_record(
                '../measuredivergence/mix/' + str(ratio) + '/',
                '../measuredivergence/partitionmeta/meta' + str(ratio) + '.csv',
                'mixeddata_' + str(i) + '_' + str(ratio),
                {'fantasy', 'detective'}, {'random'}, ratio, i)

        # Gold-standard models draw from the partition NOT used to build
        # the mixed data (part2 here; change if you mixed with part 2).
        # NOTE(review): `ratio` is the leftover loop value (100) for the
        # gold rows — this preserves the original recorded values.
        _train_and_record(
            '../data/', '../measuredivergence/partitionmeta/part2.csv',
            'goldfantasy_' + str(i),
            {'fantasy'}, {'random', 'randomB'}, ratio, i)

        _train_and_record(
            '../data/', '../measuredivergence/partitionmeta/part2.csv',
            'golddetective_' + str(i),
            {'detective'}, {'random', 'randomB'}, ratio, i)
# Exemplo n.º 12 — 0  (scrape artifact from the code-sample site, kept as a comment)
def repeatedly_model(modelname, tags4positive, tags4negative, sizecap,
                     sourcefolder, metadatapath):
    """Train ten independent models of the same contrast and log results.

    Each iteration trains one model of tags4positive vs tags4negative,
    capped at sizecap volumes per class, and appends a summary line to
    '../results/<modelname>_models.tsv'.

    modelname      -- prefix for model names and the results file
    tags4positive  -- genre tags for the positive class
    tags4negative  -- genre tags for the negative class
    sizecap        -- max volumes per class
    sourcefolder   -- directory of feature files
    metadatapath   -- CSV of volume metadata
    """

    outmodels = '../results/' + modelname + '_models.tsv'

    if not os.path.isfile(outmodels):
        with open(outmodels, mode='w', encoding='utf-8') as f:
            outline = 'name\tsize\tfloor\tceiling\tmeandate\taccuracy\tfeatures\tregularization\ti\n'
            f.write(outline)

    # Tuning configuration is identical for every iteration; hoisted out
    # of the loop.  (`floor`/`ceiling` here are exclusion bounds for data
    # selection and stay fixed; observed dates get their own names below.)
    c_range = [.00001, .0001, .001, .01, 0.1, 1, 10, 100]
    modelparams = 'logistic', 10, 200, 5200, 200, c_range
    forbiddenwords = set()  # was `{}` — an empty dict; an empty set is the intended type
    floor = 1700
    ceiling = 2020

    for i in range(10):
        name = modelname + str(i)
        vocabpath = '../lexica/' + name + '.txt'

        metadata, masterdata, classvector, classdictionary, orderedIDs, authormatches, vocablist = versatiletrainer2.get_simple_data(
            sourcefolder,
            metadatapath,
            vocabpath,
            tags4positive,
            tags4negative,
            sizecap,
            extension='.fic.tsv',
            excludebelow=floor,
            excludeabove=ceiling,
            forbid4positive={'juv'},
            forbid4negative={'juv'},
            force_even_distribution=False,
            numfeatures=6000,
            forbiddenwords=forbiddenwords)

        matrix, maxaccuracy, metadata, coefficientuples, features4max, best_regularization_coef = versatiletrainer2.tune_a_model(
            metadata, masterdata, classvector, classdictionary, orderedIDs,
            authormatches, vocablist, tags4positive, tags4negative,
            modelparams, name, '../modeloutput/' + name + '.csv')

        meandate = int(
            round(np.sum(metadata.firstpub) / len(metadata.firstpub)))
        # Observed date range of the sample actually drawn, reported in
        # the 'floor'/'ceiling' columns of the output file.
        observed_floor = np.min(metadata.firstpub)
        observed_ceiling = np.max(metadata.firstpub)

        with open(outmodels, mode='a', encoding='utf-8') as f:
            fields = [name, sizecap, observed_floor, observed_ceiling,
                      meandate, maxaccuracy, features4max,
                      best_regularization_coef, i]
            f.write('\t'.join(str(x) for x in fields) + '\n')

        # Per-model temp lexicon; remove once the model is trained.
        os.remove(vocabpath)
# Exemplo n.º 13 — 0  (scrape artifact from the code-sample site, kept as a comment)
def create_variant_models(modelname, tags4positive, tags4negative, splityear):
    '''
    Creates variant models that are then used by measure_parallax.

    Trains ten models per period (pre- and post-splityear) contrasting
    tags4positive against tags4negative, appending one summary line per
    model to '../results/<modelname>_models.tsv'.
    '''

    outmodels = '../results/' + modelname + '_models.tsv'

    # Removed unused locals from the original: a `columns` list, a
    # `names` accumulator, and a `pd.read_csv` of finalmeta.csv whose
    # result was never read.

    if not os.path.isfile(outmodels):
        # 'w' rather than 'a': the isfile guard means the file does not
        # exist here, so the effect is identical and the intent clearer.
        with open(outmodels, mode='w', encoding='utf-8') as f:
            outline = 'name\tsize\tfloor\tceiling\tmeandate\taccuracy\tfeatures\tregularization\ti\n'
            f.write(outline)

    sourcefolder = '../newdata/'
    metadatapath = '../meta/finalmeta.csv'
    sizecap = 75

    # Tuning configuration shared by all variants.
    c_range = [.00001, .0001, .001, .01, 0.1, 1, 10, 100]
    modelparams = 'logistic', 15, 1000, 6500, 300, c_range
    forbiddenwords = set()  # was `{}` — an empty dict; an empty set is the intended type

    # Two date windows split at splityear.
    periods = [(1700, splityear - 1), (splityear, 2010)]

    for i in range(10):
        for floor, ceiling in periods:

            name = modelname + str(floor) + '_' + str(ceiling) + '_' + str(i)
            vocabpath = '../lexica/' + name + '.txt'

            metadata, masterdata, classvector, classdictionary, orderedIDs, authormatches, vocablist = versatiletrainer2.get_simple_data(
                sourcefolder,
                metadatapath,
                vocabpath,
                tags4positive,
                tags4negative,
                sizecap,
                extension='.fic.tsv',
                excludebelow=floor,
                excludeabove=ceiling,
                forbid4positive={'juv'},
                forbid4negative={'juv'},
                force_even_distribution=False,
                numfeatures=6500,
                forbiddenwords=forbiddenwords)

            matrix, maxaccuracy, metadata, coefficientuples, features4max, best_regularization_coef = versatiletrainer2.tune_a_model(
                metadata, masterdata, classvector, classdictionary, orderedIDs,
                authormatches, vocablist, tags4positive, tags4negative,
                modelparams, name, '../modeloutput/' + name + '.csv')

            # Mean first-publication date of the selected volumes.
            meandate = int(
                round(np.sum(metadata.firstpub) / len(metadata.firstpub)))

            with open(outmodels, mode='a', encoding='utf-8') as f:
                fields = [name, sizecap, floor, ceiling, meandate,
                          maxaccuracy, features4max,
                          best_regularization_coef, i]
                f.write('\t'.join(str(x) for x in fields) + '\n')

            # Per-model temp lexicon; remove once the model is trained.
            os.remove(vocabpath)