Example #1
import random

# metautils and get_thresholds are helpers defined elsewhere in this project.

def get_donttrainset(all_positives, positive_tags, metadict,
                     donttrainconditions, datetype):
    '''
    Identifies positive volumes that should be excluded from the training
    set because they belong to a category that is only being tested.
    '''

    donttrainset = set()

    pastthreshold, futurethreshold = get_thresholds(donttrainconditions)

    for posvol in all_positives:

        date = metautils.infer_date(metadict[posvol], datetype)
        if date < pastthreshold or date > futurethreshold:
            donttrainset.add(posvol)
            continue

        tagset = metadict[posvol]['tagset']
        hasexclusion = False
        hasotherpositive = False

        for tag in positive_tags:
            if tag in tagset and tag not in donttrainconditions:
                hasotherpositive = True

        for tag in donttrainconditions:
            if tag in tagset:
                hasexclusion = True

        if hasexclusion and not hasotherpositive:
            donttrainset.add(posvol)

    # The following block lets us limit the size of the donttrainset
    # by including a tag like "limit==250".
    for tag in donttrainconditions:
        if 'limit==' in tag:
            limit = int(tag.replace('limit==', ''))
            if limit < len(donttrainset):
                # random.sample needs a sequence; sampling directly from a
                # set stopped working in Python 3.11.
                donttrainset = set(random.sample(sorted(donttrainset), limit))

    return donttrainset
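
A minimal usage sketch. Everything below is hypothetical: the toy metadict, the stub get_thresholds, and the stub metautils merely stand in for the project's real helpers so the sketch runs on its own.

# Hypothetical stubs, only so the sketch is self-contained; the project's
# real helpers are more elaborate.
def get_thresholds(conditions):
    return 0, 3000          # past/future thresholds that accept any date

class metautils:
    @staticmethod
    def infer_date(row, datetype):
        return row[datetype]

# Toy metadata; the real metadict is built by metafilter.get_metadata.
metadict = {
    'vol1': {'tagset': {'det100'}, 'firstpub': 1895},
    'vol2': {'tagset': {'det100', 'random'}, 'firstpub': 1910},
}

excluded = get_donttrainset(all_positives={'vol1', 'vol2'},
                            positive_tags=['det100', 'random'],
                            metadict=metadict,
                            donttrainconditions={'det100'},
                            datetype='firstpub')
# 'vol1' carries only the excluded tag, so it is held out of training;
# 'vol2' also carries another positive tag, so it stays trainable.
print(excluded)   # -> {'vol1'}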
Example #3
import csv
import os
import random
from multiprocessing import Pool

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

# metafilter, metautils, and modelingprocess are local modules in this
# project; helpers such as confirm_testconditions, get_vocablist,
# sliceframe, and normalizearray are defined elsewhere in the same file.

def create_model(paths, exclusions, classifyconditions):
    ''' This is the main function in the module.
    It can be called externally; it's also called
    if the module is run directly.
    '''

    sourcefolder, extension, metadatapath, outputpath, vocabpath = paths
    excludeif, excludeifnot, excludebelow, excludeabove, sizecap = exclusions
    positive_tags, negative_tags, datetype, numfeatures, regularization, testconditions = classifyconditions

    verbose = False
    holdout_authors = True

    # usedate controls whether publication date is added as a feature below;
    # the flag was not defined in this excerpt, so we default it to False.
    usedate = False

    # If you want reliable results, always run this with holdout_authors
    # set to True. The only reason to set it to False is to confirm that
    # this flag is actually making a difference. If you do that, it
    # disables the code that keeps other works by the author being predicted
    # out of the training set.

    # The following function confirms that the testconditions are legal.

    confirm_testconditions(testconditions, positive_tags)

    if not sourcefolder.endswith('/'):
        sourcefolder = sourcefolder + '/'

    # This just makes things easier.

    # Get a list of files.
    allthefiles = os.listdir(sourcefolder)
    # random.shuffle(allthefiles)

    volumeIDs = list()
    volumepaths = list()

    for filename in allthefiles:

        if filename.endswith(extension):
            volID = filename.replace(extension, "")
            # The volume ID is basically the filename minus its extension.
            # Extensions are likely to be long enough that there is little
            # danger of accidental occurrence inside a filename. E.g.
            # '.fic.tsv'
            path = sourcefolder + filename
            volumeIDs.append(volID)
            volumepaths.append(path)

    metadict = metafilter.get_metadata(metadatapath, volumeIDs, excludeif,
                                       excludeifnot, excludebelow,
                                       excludeabove)

    # Now that we have a list of volumes with metadata, we can select the groups of IDs
    # that we actually intend to contrast.

    if isinstance(positive_tags[0], int):
        categorytodivide = 'firstpub'
    else:
        categorytodivide = 'tagset'

    IDsToUse, classdictionary, donttrainset = metafilter.label_classes(
        metadict, categorytodivide, positive_tags, negative_tags, sizecap,
        datetype, excludeif, testconditions)

    print()
    mindate, maxdate = first_and_last(IDsToUse, metadict, datetype)
    if mindate > 0:
        print("The whole corpus involved here includes " + str(len(IDsToUse)))
        print("volumes, ranging in date from " + str(mindate) + " to " +
              str(maxdate) + ".")
        print()

    # We now create an ordered list of id-path tuples for later use, and identify a set of
    # positive ids that should never be used in training.

    volspresent, orderedIDs = get_volume_lists(volumeIDs, volumepaths,
                                               IDsToUse)

    # Extend the set of ids not to be used in training by identifying negative volumes that match
    # the distribution of positive volumes.

    describe_donttrainset(donttrainset, classdictionary, metadict, datetype)

    # Create a flag for each volume that indicates whether it was used in training

    record_trainflags(metadict, donttrainset)

    # Get a count of docfrequency for all words in the corpus. This is probably not needed and
    # might be deprecated later.

    wordcounts = get_docfrequency(volspresent, donttrainset)

    # The feature list we use is defined by the top 10,000 words (by document
    # frequency) in the whole corpus, and it will be the same for all models.

    vocablist = get_vocablist(vocabpath,
                              volspresent,
                              wordcounts,
                              useall=True,
                              n=numfeatures)

    # This function either gets the vocabulary list already stored in vocabpath, or
    # creates a list of the top 10k words in all files, and stores it there.
    # N is a parameter that could be altered right here.

    # Useall is a parameter that you basically don't need to worry about unless
    # you're changing / testing code. If you set it to false, the vocablist will
    # exclude words that occur very rarely. This shouldn't be necessary; the
    # crossvalidation routine is designed not to include features that occur
    # zero times in the training set. But if you get div-by-zero errors in the
    # training process, you could fiddle with this parameter as part of a
    # troubleshooting process.

    numfeatures = len(vocablist)

    # For each volume, we're going to create a list of volumes that should be
    # excluded from the training set when it is to be predicted. More precisely,
    # we're going to create a list of their *indexes*, so that we can easily
    # remove rows from the training matrix.

    # For ALL volumes, this list will include the indexes of vols in the donttrainset.

    donttrainon = [orderedIDs.index(x) for x in donttrainset]

    authormatches = [list(donttrainon) for x in range(len(orderedIDs))]
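    # For example, with orderedIDs == ['A', 'B', 'C'] and donttrainset == {'C'},
    # donttrainon == [2], and every entry of authormatches starts out as [2].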

    # Now we proceed to enlarge that list by identifying, for each volume,
    # a set of indexes that have the same author. Obviously, there will always
    # be at least one, because we exclude a volume from its own training set.
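    # E.g. if the volumes at indexes 3 and 7 share an author, 7 joins
    # authormatches[3] and 3 joins authormatches[7]; each index also joins
    # its own list, since every volume shares an author with itself.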

    if holdout_authors:
        for idx1, anid in enumerate(orderedIDs):
            thisauthor = metadict[anid]['author']
            for idx2, anotherid in enumerate(orderedIDs):
                otherauthor = metadict[anotherid]['author']
                if thisauthor == otherauthor and idx2 not in authormatches[idx1]:
                    authormatches[idx1].append(idx2)
    else:
        # This code only runs if we're testing the effect of
        # holdout_authors by disabling it.

        for idx1, anid in enumerate(orderedIDs):
            if idx1 not in authormatches[idx1]:
                authormatches[idx1].append(idx1)

    # The purpose of everything that follows is to
    # balance negative and positive instances in each
    # training set.

    trainingpositives = set()
    trainingnegatives = set()

    for anid, thisclass in classdictionary.items():
        if anid in donttrainset:
            continue

        if thisclass == 1:
            trainingpositives.add(orderedIDs.index(anid))
        else:
            trainingnegatives.add(orderedIDs.index(anid))

    print('Training positives: ' + str(len(trainingpositives)))
    print('Training negatives: ' + str(len(trainingnegatives)))

    # The code below was intended to balance the sizes of the positive and
    # negative classes in spite of same-author exclusions. But it could
    # have grossly unintended effects when there were many donttrainon
    # exclusions.

    # for alist in authormatches:
    #     numpositive = 0
    #     numnegative = 0
    #     for anidx in alist:
    #         anid = orderedIDs[anidx]
    #         thisclass = classdictionary[anid]
    #         if thisclass == 1:
    #             numpositive += 1
    #         else:
    #             numnegative += 1

    #     if numpositive > numnegative:
    #         difference = numpositive - numnegative
    #         remaining = trainingnegatives - set(alist)
    #         alist.extend(random.sample(remaining, difference))
    #     elif numpositive < numnegative:
    #         difference = numnegative - numpositive
    #         remaining = trainingpositives - set(alist)
    #         alist.extend(random.sample(remaining, difference))
    #     else:
    #         difference = 0

    # Let's record, for each volume, the size of its training set.

    trainingsizes = []

    numvolumes = len(orderedIDs)
    for idx, anid in enumerate(orderedIDs):
        excluded = len(authormatches[idx])
        metadict[anid]['trainsize'] = numvolumes - excluded
        trainingsizes.append(metadict[anid]['trainsize'])

    averagetrainingsize = sum(trainingsizes) / len(trainingsizes)

    for alist in authormatches:
        alist.sort(reverse=True)

    # I am reversing the order of indexes so that I can delete them from
    # back to front, without changing indexes yet to be deleted.
    # This will become important in the modelingprocess module.
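    # E.g. to drop indexes 2 and 5, deleting 5 first leaves index 2 pointing
    # at the same row, whereas deleting 2 first would shift row 5 down to 4.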

    volsizes = dict()
    voldata = list()
    classvector = list()

    for volid, volpath in volspresent:

        with open(volpath, encoding='utf-8') as f:
            voldict = dict()
            totalcount = 0
            for line in f:
                fields = line.strip().split('\t')
                if len(fields) != 2:
                    continue

                word = fields[0]
                count = int(fields[1])
                voldict[word] = count
                totalcount += count

        date = metautils.infer_date(metadict[volid], datetype)
        # Re-zero dates at 1700 so the (optional) date feature starts at 0.
        date = date - 1700
        if date < 0:
            date = 0

        if usedate:
            features = get_features_with_date(voldict, vocablist, date,
                                              totalcount)
            voldata.append(features)
        else:
            features = get_features(voldict, vocablist)
            if totalcount == 0:
                # Guard against division by zero for an empty volume.
                totalcount = 0.00001
            voldata.append(features / totalcount)

        volsizes[volid] = totalcount
        classflag = classdictionary[volid]
        classvector.append(classflag)

    data = pd.DataFrame(voldata)

    sextuplets = list()
    for i, volid in enumerate(orderedIDs):
        listtoexclude = authormatches[i]
        asixtuple = data, classvector, listtoexclude, i, usedate, regularization
        sextuplets.append(asixtuple)

    # Now do leave-one-out predictions.
    print('Beginning multiprocessing.')

    pool = Pool(processes=11)
    res = pool.map_async(modelingprocess.model_one_volume, sextuplets)

    # After all files are processed, write metadata, errorlog, and counts of phrases.
    res.wait()
    resultlist = res.get()

    assert len(resultlist) == len(orderedIDs)

    logisticpredictions = dict()
    for i, volid in enumerate(orderedIDs):
        logisticpredictions[volid] = resultlist[i]

    pool.close()
    pool.join()

    print('Multiprocessing concluded.')

    truepositives = 0
    truenegatives = 0
    falsepositives = 0
    falsenegatives = 0
    allvolumes = list()

    with open(outputpath, mode='w', encoding='utf-8') as f:
        writer = csv.writer(f)
        header = [
            'volid', 'dateused', 'pubdate', 'birthdate', 'firstpub', 'gender',
            'nation', 'allwords', 'logistic', 'realclass', 'trainflag',
            'trainsize', 'author', 'title', 'genretags'
        ]
        writer.writerow(header)
        for volid in IDsToUse:
            metadata = metadict[volid]
            dateused = metadata[datetype]
            pubdate = metadata['pubdate']
            birthdate = metadata['birthdate']
            firstpub = metadata['firstpub']
            gender = metadata['gender']
            nation = metadata['nation']
            author = metadata['author']
            title = metadata['title']
            allwords = volsizes[volid]
            logistic = logisticpredictions[volid]
            realclass = classdictionary[volid]
            trainflag = metadata['trainflag']
            trainsize = metadata['trainsize']
            genretags = ' | '.join(metadata['tagset'])
            outrow = [
                volid, dateused, pubdate, birthdate, firstpub, gender, nation,
                allwords, logistic, realclass, trainflag, trainsize, author,
                title, genretags
            ]
            writer.writerow(outrow)
            allvolumes.append(outrow)

            if logistic == 0.5:
                print("equals!")
                predictedpositive = random.choice([True, False])
            elif logistic > 0.5:
                predictedpositive = True
            elif logistic < 0.5:
                predictedpositive = False
            else:
                # Only reachable if logistic is NaN.
                print('Oh, joy. A fundamental floating point error.')
                predictedpositive = random.choice([True, False])

            if predictedpositive and classdictionary[volid] > 0.5:
                truepositives += 1
            elif not predictedpositive and classdictionary[volid] < 0.5:
                truenegatives += 1
            elif not predictedpositive and classdictionary[volid] > 0.5:
                falsenegatives += 1
            elif predictedpositive and classdictionary[volid] < 0.5:
                falsepositives += 1
            else:
                print("Wait a second, boss.")

    donttrainon.sort(reverse=True)
    trainingset, yvals, testset = sliceframe(data, classvector, donttrainon, 0)
    trainingset, testset = modelingprocess.remove_zerocols(
        trainingset, testset)
    newmodel = LogisticRegression(C=regularization)
    trainingset, means, stdevs = normalizearray(trainingset, usedate)
    newmodel.fit(trainingset, yvals)

    coefficients = newmodel.coef_[0] * 100

    coefficientuples = list(
        zip(coefficients, (coefficients / np.array(stdevs)),
            vocablist + ['pub.date']))
    coefficientuples.sort()
    if verbose:
        for coefficient, normalizedcoef, word in coefficientuples:
            print(word + " :  " + str(coefficient))

    print()
    totalevaluated = truepositives + truenegatives + falsepositives + falsenegatives
    if totalevaluated != len(IDsToUse):
        print("Total evaluated = " + str(totalevaluated))
        print("But we've got " + str(len(IDsToUse)))
    accuracy = (truepositives + truenegatives) / totalevaluated
    print('True positives ' + str(truepositives))
    print('True negatives ' + str(truenegatives))
    print('False positives ' + str(falsepositives))
    print('False negatives ' + str(falsenegatives))

    print()
    print('The average size of the training set was ' +
          str(averagetrainingsize))
    print()

    precision = truepositives / (truepositives + falsepositives)
    recall = truepositives / (truepositives + falsenegatives)
    F1 = 2 * (precision * recall) / (precision + recall)
    print("F1 : " + str(F1))

    coefficientpath = outputpath.replace('.csv', '.coefs.csv')
    with open(coefficientpath, mode='w', encoding='utf-8') as f:
        writer = csv.writer(f)
        for triple in coefficientuples:
            coef, normalizedcoef, word = triple
            writer.writerow([word, coef, normalizedcoef])

    return accuracy, allvolumes, coefficientuples
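
The heavy lifting per volume happens in modelingprocess.model_one_volume, which is defined elsewhere in the project. Purely for orientation, here is a rough sketch of what a leave-one-out step of this kind could look like, assuming it drops the excluded rows, standardizes features on the training data, fits a logistic model, and returns the held-out volume's predicted probability. The function name and every detail below are assumptions, not the project's actual code.

from sklearn.linear_model import LogisticRegression

def sketch_model_one_volume(sextuplet):
    # Hypothetical reconstruction for illustration only.
    data, classvector, listtoexclude, i, usedate, regularization = sextuplet
    # (In the real function, usedate would presumably trigger date-aware
    # handling; this sketch ignores it.)

    trainrows = [idx for idx in range(len(data))
                 if idx not in listtoexclude and idx != i]
    trainX = data.iloc[trainrows]
    trainy = [classvector[idx] for idx in trainrows]

    # Standardize on the training set, then apply the same scaling
    # to the held-out row.
    means = trainX.mean()
    stdevs = trainX.std().replace(0, 1)   # guard against constant columns
    trainX = (trainX - means) / stdevs
    testrow = (data.iloc[[i]] - means) / stdevs

    model = LogisticRegression(C=regularization)
    model.fit(trainX, trainy)

    # Probability that the held-out volume belongs to the positive class.
    return model.predict_proba(testrow)[0][1]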
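
Finally, a sketch of a call site. Every path, tag, and parameter value below is a hypothetical placeholder, not taken from the project; and because create_model spins up a multiprocessing Pool, the call should live under a main guard.

# All concrete values here are hypothetical placeholders.
if __name__ == '__main__':
    paths = ('newdata/', '.fic.tsv', 'meta/finalmeta.csv',
             'results/predictions.csv', 'lexicon/vocab.csv')

    # Empty exclusion dictionaries; sizecap caps the volumes per class.
    exclusions = (dict(), dict(), dict(), dict(), 350)

    # (positive_tags, negative_tags, datetype, numfeatures,
    #  regularization, testconditions)
    classifyconditions = (['det100'], ['random'], 'firstpub',
                          10000, 0.001, set())

    accuracy, allvolumes, coefficientuples = create_model(
        paths, exclusions, classifyconditions)
    print('Accuracy:', accuracy)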