예제 #1
0
def project_tags(tagstoproject, tagtargets):
    """Model tagtargets alone, then model tagstoproject and use it to
    blindly predict tagtargets; finally compare the two prediction sets.
    """

    targetstring = ','.join(tagtargets)
    projectstring = ','.join(tagstoproject)
    sizecap = 400

    print('First we create a model of ' + targetstring)

    paths = make_paths(targetstring + 'byitself')
    outputpath1 = paths[3]

    # Exclude tagstoproject from the negative contrast set, so the
    # contrast sets for the two models will be identical.
    exclusions = make_exclusions(0, 2000, sizecap, tagstoproject)

    negative_tags = ['random', 'chirandom']
    datetype = "firstpub"
    numfeatures = 10000
    regularization = .000075

    classifyconditions = (tagtargets, negative_tags, datetype,
                          numfeatures, regularization, set())

    rawaccuracy, allvolumes, coefficientuples = logisticpredict.create_model(
        paths, exclusions, classifyconditions)

    print('If we divide the dataset with a horizontal line at 0.5, accuracy is: ', str(rawaccuracy))
    print()
    print('Then we create a model of ' + projectstring + ' and use it to predict ' + targetstring)

    paths = make_paths(projectstring + 'predicts' + targetstring)
    outputpath2 = paths[3]

    exclusions = make_exclusions(0, 2000, sizecap, 'nonegatives')

    # Both tag groups count as positive; putting tagtargets in
    # testconditions is what actually excludes them from training.
    combined_positives = list(tagtargets) + list(tagstoproject)

    classifyconditions = (combined_positives, negative_tags, datetype,
                          numfeatures, regularization, set(tagtargets))

    rawaccuracy, allvolumes, coefficientuples = logisticpredict.create_model(
        paths, exclusions, classifyconditions)

    print('If we divide the second dataset at 0.5, accuracy is: ', str(rawaccuracy))
    print()

    # Compare predictions only for volumes present in both models but
    # excluded from the training process in the second model.
    comparemodels.compare_untrained(outputpath1, outputpath2)
예제 #2
0
def project_gothic_beyond_date(dividedate):
    """Model gothic fiction published after dividedate, then model gothic
    fiction across all dates while blindly predicting the held-out span,
    and compare the two sets of predictions.
    """

    sizecap = 300

    print('First we create a model of gothic fiction only after ' + str(dividedate))

    paths = make_paths('gothicjustpost' + str(dividedate))
    outputpath1 = paths[3]

    exclusions = make_exclusions(dividedate, 2000, sizecap, 'nonegatives')

    positive_tags = ['lochorror', 'pbgothic', 'locghost', 'stangothic', 'chihorror']
    negative_tags = ['random', 'chirandom']
    datetype = "firstpub"
    numfeatures = 10000
    regularization = .000075

    classifyconditions = (positive_tags, negative_tags, datetype,
                          numfeatures, regularization, set())

    rawaccuracy, allvolumes, coefficientuples = logisticpredict.create_model(
        paths, exclusions, classifyconditions)

    print('If we divide the dataset with a horizontal line at 0.5, accuracy is: ', str(rawaccuracy))
    print()
    print('Then we create a model of gothic fiction blindly predicting after ' + str(dividedate))

    paths = make_paths('gothicpredictpost' + str(dividedate))
    outputpath2 = paths[3]

    exclusions = make_exclusions(0, 2001, sizecap, 'nonegatives')

    # Presumably interpreted as a date span to hold out of training;
    # matches the convention used by the sibling compare functions.
    testconditions = {'1700', str(dividedate)}

    classifyconditions = (positive_tags, negative_tags, datetype,
                          numfeatures, regularization, testconditions)

    rawaccuracy, allvolumes, coefficientuples = logisticpredict.create_model(
        paths, exclusions, classifyconditions)

    print('If we divide the second dataset at 0.5, accuracy is: ', str(rawaccuracy))
    print()

    # Compare predictions only for volumes present in both models but
    # excluded from the training process in the second model.
    comparemodels.compare_untrained(outputpath1, outputpath2)
예제 #3
0
def the_red_and_the_black():
    """Run forty models of 'teamred' vs 'teamblack' volumes and write the
    raw accuracies to finalaccuracies.csv, one per line.
    """

    sizecap = 140

    # NOTE: the original also built paths for a 'blackandthered' model here,
    # but that value was overwritten inside the loop before ever being used;
    # the dead assignment has been removed.
    exclusions = make_exclusions(1700, 2001, sizecap, 'nonegatives')

    positive_tags = ['teamred']
    negative_tags = ['teamblack']
    testconditions = set()

    datetype = "firstpub"
    numfeatures = 10000
    regularization = .000075

    classifyconditions = (positive_tags, negative_tags, datetype, numfeatures, regularization, testconditions)

    accuracies = []
    for i in range(40):

        # Fresh paths per iteration so each run writes its own output.
        modelname = 'redandtheblack' + str(i)
        paths = make_paths(modelname)

        rawaccuracy, allvolumes, coefficientuples = logisticpredict.create_model(paths, exclusions, classifyconditions)
        print(rawaccuracy)
        accuracies.append(rawaccuracy)

    with open('finalaccuracies.csv', mode = 'w', encoding = 'utf-8') as f:
        for accuracy in accuracies:
            f.write(str(accuracy) + '\n')
예제 #4
0
def model_taglist_within_dates(positive_tags, modelname, mindate, maxdate):
    """Model the given positive tags against 'random'/'chirandom' volumes,
    with exclusions built from the [mindate, maxdate] bounds.

    Returns allvolumes as produced by logisticpredict.create_model.
    """
    print('We are modeling these positive tags:')
    for tag in positive_tags:
        print(tag)

    paths = make_paths(modelname)
    exclusions = make_exclusions(mindate, maxdate, 1000, 'nonegatives')

    classifyconditions = (positive_tags, ['random', 'chirandom'], "firstpub",
                          10000, .000075, set())

    rawaccuracy, allvolumes, coefficientuples = logisticpredict.create_model(
        paths, exclusions, classifyconditions)

    print('If we divide the dataset with a horizontal line at 0.5, accuracy is: ', str(rawaccuracy))

    return allvolumes
예제 #5
0
def generic():
    """Interactively configure and run one model of 'f' vs 'm' characters.

    Prompts for a model name and an optional 'min,max' date range, trains
    the model, and prints accuracy both at the 0.5 threshold and along a
    line fit to the date trend.
    """
    sourcefolder = '/Volumes/TARDIS/work/characterdata/charpredict/'
    extension = '.tsv'
    metadatapath = '/Users/tunder/Dropbox/character/meta/predictmeta.csv'
    vocabpath = '/Users/tunder/Dropbox/character/meta/predict1960-79vocab.csv'

    modelname = input('Name of model? ')
    outputpath = '/Users/tunder/Dropbox/character/results/' + modelname + str(datetime.date.today()) + '.csv'

    # We can simply exclude volumes from consideration on the basis on any
    # metadata category we want, using the dictionaries defined below.

    ## EXCLUSIONS.

    excludeif = dict()
    excludeifnot = dict()
    excludeabove = dict()
    excludebelow = dict()

    daterange = input('Range of dates to use in the model? ')
    if ',' in daterange:
        dates = [int(x.strip()) for x in daterange.split(',')]
        dates.sort()
        if len(dates) == 2:
            # Raise instead of assert: asserts disappear under `python -O`,
            # which would silently accept an empty (equal-endpoint) range.
            if dates[0] >= dates[1]:
                raise ValueError('Date range must contain two distinct dates.')
            excludebelow['firstpub'] = dates[0]
            excludeabove['firstpub'] = dates[1]

    sizecap = 1000

    # CLASSIFY CONDITIONS

    # We ask the user for a list of categories to be included in the positive
    # set, as well as a list for the negative set. Default for the negative set
    # is to include all the "random"ly selected categories. Note that random volumes
    # can also be tagged with various specific genre tags; they are included in the
    # negative set only if they lack tags from the positive set.

    positive_tags = ['f']
    negative_tags = ['m']

    datetype = "firstpub"
    numfeatures = 1700
    regularization = .00011
    testconditions = set()

    paths = (sourcefolder, extension, metadatapath, outputpath, vocabpath)
    exclusions = (excludeif, excludeifnot, excludebelow, excludeabove, sizecap)
    classifyconditions = (positive_tags, negative_tags, datetype, numfeatures, regularization, testconditions)

    rawaccuracy, allvolumes, coefficientuples = logisticpredict.create_model(paths, exclusions, classifyconditions)

    print('If we divide the dataset with a horizontal line at 0.5, accuracy is: ', str(rawaccuracy))

    tiltaccuracy = logisticpredict.diachronic_tilt(allvolumes, 'linear', [])

    print("Divided with a line fit to the data trend, it's ", str(tiltaccuracy))
예제 #6
0
def calibrate_detective():
    '''
    Tests accuracy of classification for detective fiction at different sample
    sizes, appending sizecap / average training size / accuracy rows to
    ../final/collateddetaccuracies.tsv.
    '''

    modelname = 'calibratedet'
    paths = make_paths(modelname)

    ## EXCLUSIONS.

    excludeif = dict()
    excludeifnot = dict()
    excludeabove = dict()
    excludebelow = dict()

    excludebelow['firstpub'] = 1700
    excludeabove['firstpub'] = 2020

    # 'locdetmyst' appeared twice in the original list; deduplicated here.
    positive_tags = ['locdetective', 'locdetmyst', 'chimyst', 'det100']
    negative_tags = ['random', 'chirandom']
    testconditions = set()

    datetype = "firstpub"
    numfeatures = 10000
    regularization = .000075

    classifyconditions = (positive_tags, negative_tags, datetype, numfeatures, regularization, testconditions)

    sizes = [5,6,7,8,9,11,13,15,17,18,21,27,29,32,34,36,40,45,50,55,60,65,70,75,80,85,90,100]

    accuracies = []
    for sizecap in sizes:

        exclusions = (excludeif, excludeifnot, excludebelow, excludeabove, sizecap)

        rawaccuracy, allvolumes, coefficientuples = logisticpredict.create_model(paths, exclusions, classifyconditions)

        trainsizes = []
        for vol in allvolumes:
            trainsizes.append(vol[11])
            # this is unfortunately dependent on the exact way
            # logisticpredict formats its output

        # Guard against an empty result set rather than dividing by zero.
        avgsize = sum(trainsizes) / len(trainsizes) if trainsizes else 0

        print(sizecap, avgsize, rawaccuracy)
        with open('../final/collateddetaccuracies.tsv', mode = 'a', encoding = 'utf-8') as f:
            f.write(str(sizecap) + '\t' + str(avgsize) + '\t' + str(rawaccuracy) + '\n')

    return None
예제 #7
0
def replicate_stew():
    """Run twenty models of the 'stew' supergenre against random volumes
    and append each run's raw accuracy to stewaccuracies.csv.
    """

    paths = make_paths('replicatestew')

    ## EXCLUSIONS.

    allstewgenres = {'cozy', 'hardboiled', 'det100', 'chimyst', 'locdetective', 'lockandkey', 'crime', 'locdetmyst', 'blcrime', 'anatscifi', 'locscifi', 'chiscifi', 'femscifi', 'stangothic', 'pbgothic', 'lochorror', 'chihorror', 'locghost'}

    # The constituent genres must be excluded explicitly, because tagging
    # the positive category "stew" wouldn't otherwise automatically remove
    # the tags that were used to create it from the negative set.
    #
    # Those tags aren't simply folded into the positive list because that
    # would lose the ability to balance equal numbers of crime, gothic,
    # and science fiction, plus sensation novels; the crime categories,
    # which are better-represented in the dataset, would dominate.

    excludeif = {'negatives': allstewgenres}
    excludebelow = {'firstpub': 1700}
    excludeabove = {'firstpub': 2020}
    exclusions = (excludeif, dict(), excludebelow, excludeabove, 140)

    classifyconditions = (['stew'], ['random', 'chirandom'], "firstpub",
                          10000, .000075, set())

    accuracies = []
    for _ in range(20):

        rawaccuracy, allvolumes, coefficientuples = logisticpredict.create_model(
            paths, exclusions, classifyconditions)
        print(rawaccuracy)
        accuracies.append(rawaccuracy)

    with open('stewaccuracies.csv', mode = 'a', encoding = 'utf-8') as f:
        for accuracy in accuracies:
            f.write(str(accuracy) + '\n')
예제 #8
0
def predictecco():
    """Run fifteen fiction-vs-biography models on ECCO-era data, recording
    accuracy and AUC for each run in ../modeloutput/eccopredicts.csv.
    """

    modelname = 'predictecco'
    paths = ('../sourcefiles/',
             '.tsv',
             '../metadata/eccogenremeta.csv',
             '../modeloutput/' + modelname + str(datetime.date.today()) + '.csv',
             '../lexicons/' + modelname + '.csv')

    ## EXCLUSIONS: restrict to the eighteenth century, cap sample size.

    excludebelow = {'firstpub': 1700}
    excludeabove = {'firstpub': 1800}
    exclusions = (dict(), dict(), excludebelow, excludeabove, 75)

    # CLASSIFY CONDITIONS

    classifyconditions = (['fic'], ['bio'], "firstpub", 1100, .015, set())

    # Start a fresh results file with a header row.
    with open('../modeloutput/eccopredicts.csv', mode = 'w', encoding = 'utf-8') as f:
        f.write('row,accuracy,auc\n')

    for i in range(15):
        rawaccuracy, allvolumes, coefficientuples = logisticpredict.create_model(
            paths, exclusions, classifyconditions)
        auc = calculate_auc(allvolumes)

        print(i, rawaccuracy, auc)
        with open('../modeloutput/eccopredicts.csv', mode = 'a', encoding = 'utf-8') as f:
            f.write(str(i) + ',' + str(rawaccuracy) + ',' + str(auc) + '\n')
예제 #9
0
def compare(dividedate):
    """Model character gender after dividedate, then model a wider date
    range while holding the later span out of training, and compare the
    two models' predictions on the held-out volumes.
    """

    print('First we create a model of gender only after ' + str(dividedate))

    sizecap = 500

    modelname = 'post' + str(dividedate)
    sourcefolder = '/Volumes/TARDIS/work/characterdata/charpredict/'
    extension = '.tsv'
    metadatapath = '/Users/tunder/Dropbox/character/meta/predictmeta.csv'
    vocabpath = '/Users/tunder/Dropbox/character/meta/predictALLvocab.csv'
    outputpath1 = '/Users/tunder/Dropbox/character/results/' + modelname + str(
        datetime.date.today()) + '.csv'
    paths = (sourcefolder, extension, metadatapath, outputpath1, vocabpath)

    ## EXCLUSIONS.

    excludeif = dict()
    excludeifnot = dict()
    excludeabove = dict()
    excludebelow = dict()
    excludebelow['firstpub'] = 1900
    excludeabove['firstpub'] = 1950
    exclusions = (excludeif, excludeifnot, excludebelow, excludeabove, sizecap)

    positive_tags = ['f']
    negative_tags = ['m']
    testconditions = set()

    datetype = "firstpub"
    numfeatures = 2000
    regularization = .00009

    classifyconditions = (positive_tags, negative_tags, datetype, numfeatures,
                          regularization, testconditions)

    rawaccuracy, allvolumes, coefficientuples = logisticpredict.create_model(
        paths, exclusions, classifyconditions)

    print(
        'If we divide the dataset with a horizontal line at 0.5, accuracy is: ',
        str(rawaccuracy))
    print()
    print(
        'Then we create a model of detective fiction blindly predicting after '
        + str(dividedate))

    modelname = 'predictpost' + str(dividedate)
    outputpath2 = '/Users/tunder/Dropbox/character/results/' + modelname + str(
        datetime.date.today()) + '.csv'
    paths = (sourcefolder, extension, metadatapath, outputpath2, vocabpath)

    excludebelow['firstpub'] = 1780
    excludeabove['firstpub'] = 2000
    sizecap = 1000
    exclusions = (excludeif, excludeifnot, excludebelow, excludeabove, sizecap)

    # BUGFIX: this was {'1700', 1880} — a hard-coded int that ignored
    # dividedate and mixed types with the string convention used by every
    # other testconditions set in this file (cf. project_gothic_beyond_date).
    testconditions = {'1700', str(dividedate)}

    classifyconditions = (positive_tags, negative_tags, datetype, numfeatures,
                          regularization, testconditions)

    rawaccuracy, allvolumes, coefficientuples = logisticpredict.create_model(
        paths, exclusions, classifyconditions)

    print('If we divide the second dataset at 0.5, accuracy is: ',
          str(rawaccuracy))
    print()

    # Now we compare the predictions made by these two models, comparing only
    # the volumes that are in both models but excluded from the training process
    # in the second model.

    comparemodels.compare_untrained(outputpath1, outputpath2)
예제 #10
0
def generic():
    """Interactively configure and run one model of 'f' vs 'm' characters.

    Prompts for a model name and an optional 'min,max' date range, trains
    the model, and prints accuracy both at the 0.5 threshold and along a
    line fit to the date trend.
    """
    sourcefolder = '/Volumes/TARDIS/work/characterdata/charpredict/'
    extension = '.tsv'
    metadatapath = '/Users/tunder/Dropbox/character/meta/predictmeta.csv'
    vocabpath = '/Users/tunder/Dropbox/character/meta/predict1960-79vocab.csv'

    modelname = input('Name of model? ')
    outputpath = '/Users/tunder/Dropbox/character/results/' + modelname + str(
        datetime.date.today()) + '.csv'

    # We can simply exclude volumes from consideration on the basis on any
    # metadata category we want, using the dictionaries defined below.

    ## EXCLUSIONS.

    excludeif = dict()
    excludeifnot = dict()
    excludeabove = dict()
    excludebelow = dict()

    daterange = input('Range of dates to use in the model? ')
    if ',' in daterange:
        dates = [int(x.strip()) for x in daterange.split(',')]
        dates.sort()
        if len(dates) == 2:
            # Raise instead of assert: asserts disappear under `python -O`,
            # which would silently accept an empty (equal-endpoint) range.
            if dates[0] >= dates[1]:
                raise ValueError('Date range must contain two distinct dates.')
            excludebelow['firstpub'] = dates[0]
            excludeabove['firstpub'] = dates[1]

    sizecap = 1000

    # CLASSIFY CONDITIONS

    # We ask the user for a list of categories to be included in the positive
    # set, as well as a list for the negative set. Default for the negative set
    # is to include all the "random"ly selected categories. Note that random volumes
    # can also be tagged with various specific genre tags; they are included in the
    # negative set only if they lack tags from the positive set.

    positive_tags = ['f']
    negative_tags = ['m']

    datetype = "firstpub"
    numfeatures = 1700
    regularization = .00011
    testconditions = set()

    paths = (sourcefolder, extension, metadatapath, outputpath, vocabpath)
    exclusions = (excludeif, excludeifnot, excludebelow, excludeabove, sizecap)
    classifyconditions = (positive_tags, negative_tags, datetype, numfeatures,
                          regularization, testconditions)

    rawaccuracy, allvolumes, coefficientuples = logisticpredict.create_model(
        paths, exclusions, classifyconditions)

    print(
        'If we divide the dataset with a horizontal line at 0.5, accuracy is: ',
        str(rawaccuracy))

    tiltaccuracy = logisticpredict.diachronic_tilt(allvolumes, 'linear', [])

    print("Divided with a line fit to the data trend, it's ",
          str(tiltaccuracy))
예제 #11
0
def predictfic():
    """Model fiction vs biography in successive 20-year windows starting
    every decade from 1720 to 1910, appending each window's accuracy to
    ../results/ficpredicts.csv.
    """

    ## PATHS shared across all windows.

    sourcefolder = '../sourcefiles/'
    extension = '.tsv'
    metadatapath = 'allgenremeta.csv'

    for floor in range(1720, 1920, 10):
        ceiling = floor + 19

        stem = 'predictfic' + str(floor)
        outputpath = '../results/' + stem + str(datetime.date.today()) + '.csv'
        vocabpath = 'vocab/' + stem + '.txt'
        paths = (sourcefolder, extension, metadatapath, outputpath, vocabpath)

        ## EXCLUSIONS: restrict firstpub to this window, cap sample size.

        excludebelow = {'firstpub': floor}
        excludeabove = {'firstpub': ceiling}
        exclusions = (dict(), dict(), excludebelow, excludeabove, 100)

        # CLASSIFY CONDITIONS

        classifyconditions = (['fic'], ['bio'], "firstpub",
                              3000, .00008, set())

        rawaccuracy, allvolumes, coefficientuples = logisticpredict.create_model(
            paths, exclusions, classifyconditions)

        with open('../results/ficpredicts.csv', mode = 'a', encoding = 'utf-8') as f:
            f.write(str(floor) + ',' + str(rawaccuracy) + '\n')
예제 #12
0
def predict2017test():
    """Train one fiction-vs-biography model on the 2017 test corpus, print
    accuracy at the 0.5 threshold and along a fitted date trend, and return
    allvolumes.
    """

    ## PATHS.

    modelname = '2017testmodel'
    paths = ('/Users/tunder/Dropbox/python/nonfic/2017test/',
             '.tsv',
             '2017testmeta.csv',
             '../results/' + modelname + str(datetime.date.today()) + '.csv',
             'vocab/2017testvocab.txt')

    ## EXCLUSIONS: restrict firstpub to 1700-2000, cap sample size at 400.

    excludebelow = {'firstpub': 1700}
    excludeabove = {'firstpub': 2000}
    exclusions = (dict(), dict(), excludebelow, excludeabove, 400)

    # CLASSIFY CONDITIONS

    classifyconditions = (['fic'], ['bio'], "firstpub", 3200, .00008, set())

    rawaccuracy, allvolumes, coefficientuples = logisticpredict.create_model(
        paths, exclusions, classifyconditions)

    print('If we divide the dataset with a horizontal line at 0.5, accuracy is: ', str(rawaccuracy))

    tiltaccuracy = logisticpredict.diachronic_tilt(allvolumes, 'linear', [])

    print("Divided with a line fit to the data trend, it's ", str(tiltaccuracy))

    return allvolumes
예제 #13
0
def calibrate_stew():
    '''
    Tests accuracy of classification for ghastly stew at different sample
    sizes, appending sizecap / average training size / accuracy rows to
    ../final/collatedstewaccuracies.tsv.
    '''

    modelname = 'calibratestew'
    paths = make_paths(modelname)

    ## EXCLUSIONS.

    excludeif = dict()
    excludeifnot = dict()
    excludeabove = dict()
    excludebelow = dict()

    excludebelow['firstpub'] = 1700
    excludeabove['firstpub'] = 2020


    allstewgenres = {'cozy', 'hardboiled', 'det100', 'chimyst', 'locdetective', 'lockandkey', 'crime', 'locdetmyst', 'blcrime', 'anatscifi', 'locscifi', 'chiscifi', 'femscifi', 'stangothic', 'pbgothic', 'lochorror', 'chihorror', 'locghost'}

    # We have to explicitly exclude genres because the category "stew" in the
    # positive category wouldn't otherwise automatically exclude the constituent
    # tags that were used to create it.

    # I would just have put all those tags in the positive tag list, but then you'd lose
    # the ability to explicitly balance equal numbers of crime, gothic,
    # and science fiction, plus sensation novels. You'd get a list dominated by
    # the crime categories, which are better-represented in the dataset.

    excludeif['negatives'] = allstewgenres

    positive_tags = ['stew']
    negative_tags = ['random', 'chirandom']
    testconditions = set()

    datetype = "firstpub"
    numfeatures = 10000
    regularization = .000075

    classifyconditions = (positive_tags, negative_tags, datetype, numfeatures, regularization, testconditions)

    sizes = [5,6,7,8,9,11,13,15,17,18,21,27,29,32,34,36,40,45,50,55,60,65,70,75,80,85,90,100]

    accuracies = []
    for sizecap in sizes:

        exclusions = (excludeif, excludeifnot, excludebelow, excludeabove, sizecap)

        rawaccuracy, allvolumes, coefficientuples = logisticpredict.create_model(paths, exclusions, classifyconditions)

        trainsizes = []
        for vol in allvolumes:
            trainsizes.append(vol[11])
            # this is unfortunately dependent on the exact way
            # logisticpredict formats its output

        # Guard against an empty result set rather than dividing by zero.
        avgsize = sum(trainsizes) / len(trainsizes) if trainsizes else 0

        print(sizecap, avgsize, rawaccuracy)
        with open('../final/collatedstewaccuracies.tsv', mode = 'a', encoding = 'utf-8') as f:
            f.write(str(sizecap) + '\t' + str(avgsize) + '\t' + str(rawaccuracy) + '\n')

    return None
예제 #14
0
def ghastly_stew():
    """Train a single model of the 'stew' supergenre against random volumes
    and print the raw accuracy at the 0.5 threshold.
    """

    ## PATHS.

    modelname = 'ghastlystew'
    paths = ('../newdata/',
             '.fic.tsv',
             '../meta/finalmeta.csv',
             '../results/' + modelname + str(datetime.date.today()) + '.csv',
             '../lexicon/new10k.csv')

    ## EXCLUSIONS.

    allstewgenres = {'cozy', 'hardboiled', 'det100', 'chimyst', 'locdetective', 'lockandkey', 'crime', 'locdetmyst', 'blcrime', 'anatscifi', 'locscifi', 'chiscifi', 'femscifi', 'stangothic', 'pbgothic', 'lochorror', 'chihorror', 'locghost'}

    # The constituent genres must be excluded explicitly, because tagging
    # the positive category "stew" wouldn't otherwise automatically remove
    # the tags that were used to create it from the negative set.
    #
    # Those tags aren't simply folded into the positive list because that
    # would lose the ability to balance equal numbers of crime, gothic,
    # and science fiction, plus sensation novels; the crime categories,
    # which are better-represented in the dataset, would dominate.

    excludeif = {'negatives': allstewgenres}
    excludebelow = {'firstpub': 1700}
    excludeabove = {'firstpub': 2020}
    exclusions = (excludeif, dict(), excludebelow, excludeabove, 250)

    # CLASSIFY CONDITIONS

    classifyconditions = (['stew'], ['random', 'chirandom'], "firstpub",
                          10000, .000075, set())

    rawaccuracy, allvolumes, coefficientuples = logisticpredict.create_model(
        paths, exclusions, classifyconditions)

    print('If we divide the dataset with a horizontal line at 0.5, accuracy is: ', str(rawaccuracy))
예제 #15
0
def compare(dividedate):
    """Model character gender after dividedate, then model a wider date
    range while holding the later span out of training, and compare the
    two models' predictions on the held-out volumes.
    """

    print('First we create a model of gender only after ' + str(dividedate))

    sizecap = 500

    modelname = 'post' + str(dividedate)
    sourcefolder = '/Volumes/TARDIS/work/characterdata/charpredict/'
    extension = '.tsv'
    metadatapath = '/Users/tunder/Dropbox/character/meta/predictmeta.csv'
    vocabpath = '/Users/tunder/Dropbox/character/meta/predictALLvocab.csv'
    outputpath1 = '/Users/tunder/Dropbox/character/results/' + modelname + str(datetime.date.today()) + '.csv'
    paths = (sourcefolder, extension, metadatapath, outputpath1, vocabpath)

    ## EXCLUSIONS.

    excludeif = dict()
    excludeifnot = dict()
    excludeabove = dict()
    excludebelow = dict()
    excludebelow['firstpub'] = 1900
    excludeabove['firstpub'] = 1950
    exclusions = (excludeif, excludeifnot, excludebelow, excludeabove, sizecap)

    positive_tags = ['f']
    negative_tags = ['m']
    testconditions = set()

    datetype = "firstpub"
    numfeatures = 2000
    regularization = .00009

    classifyconditions = (positive_tags, negative_tags, datetype, numfeatures, regularization, testconditions)

    rawaccuracy, allvolumes, coefficientuples = logisticpredict.create_model(paths, exclusions, classifyconditions)

    print('If we divide the dataset with a horizontal line at 0.5, accuracy is: ', str(rawaccuracy))
    print()
    print('Then we create a model of detective fiction blindly predicting after ' + str(dividedate))

    modelname = 'predictpost' + str(dividedate)
    outputpath2 = '/Users/tunder/Dropbox/character/results/' + modelname + str(datetime.date.today()) + '.csv'
    paths = (sourcefolder, extension, metadatapath, outputpath2, vocabpath)

    excludebelow['firstpub'] = 1780
    excludeabove['firstpub'] = 2000
    sizecap = 1000
    exclusions = (excludeif, excludeifnot, excludebelow, excludeabove, sizecap)

    # BUGFIX: this was {'1700', 1880} — a hard-coded int that ignored
    # dividedate and mixed types with the string convention used by every
    # other testconditions set in this file (cf. project_gothic_beyond_date).
    testconditions = {'1700', str(dividedate)}

    classifyconditions = (positive_tags, negative_tags, datetype, numfeatures, regularization, testconditions)

    rawaccuracy, allvolumes, coefficientuples = logisticpredict.create_model(paths, exclusions, classifyconditions)

    print('If we divide the second dataset at 0.5, accuracy is: ', str(rawaccuracy))
    print()

    # Now we compare the predictions made by these two models, comparing only
    # the volumes that are in both models but excluded from the training process
    # in the second model.

    comparemodels.compare_untrained(outputpath1, outputpath2)