Пример #1
0
def get_data_for_model(paths, exclusions, classifyconditions):
    ''' Select the volumes for a model and load their features.

    Unpacks the parameter tuples that define metadata conditions for the
    positive and negative classes, finds the volumes meeting those
    conditions, gets the vocabulary from get_vocablist (which, per the
    original notes, creates a lexicon if one doesn't already exist), and
    asks get_dataframe for the feature table. No model is fitted here.

    Parameters
    ----------
    paths : 5-tuple
        (sourcefolder, extension, metadatapath, outputpath, vocabpath).
        outputpath is unpacked but not used by this function.
    exclusions : 5-tuple
        (excludeif, excludeifnot, excludebelow, excludeabove, sizecap).
    classifyconditions : 6-tuple
        (positive_tags, negative_tags, datetype, numfeatures,
        regularization, testconditions). regularization is unpacked but
        not used by this function.

    Returns
    -------
    tuple
        (metadict, masterdata, classvector, classdictionary, orderedIDs,
        donttrainon, donttrainset, authormatches, vocablist), where
        donttrainon holds the positions in orderedIDs of the volumes in
        donttrainset, and authormatches[i] is the descending list of
        positions to drop from the training set when volume i is predicted.
    '''

    sourcefolder, extension, metadatapath, outputpath, vocabpath = paths
    excludeif, excludeifnot, excludebelow, excludeabove, sizecap = exclusions
    (positive_tags, negative_tags, datetype, numfeatures, regularization,
     testconditions) = classifyconditions

    # If you want reliable results, always leave holdout_authors set to
    # True. The only reason to set it to False is to confirm that the flag
    # is actually making a difference: doing so disables the code that
    # keeps other works by the author being predicted out of the training
    # set.
    holdout_authors = True

    # By default we assume that frequencies have already been normalized
    # (divided by the total number of words in the volume). This allows
    # features (like type/token ratio) that would become meaningless if
    # divided by total wordcount, at the cost of pushing some
    # feature-engineering decisions back to the data prep stage.
    freqs_already_normalized = True

    # Confirm that the testconditions are legal.
    confirm_testconditions(testconditions, positive_tags)

    # A trailing slash lets us build paths by plain concatenation below.
    if not sourcefolder.endswith('/'):
        sourcefolder = sourcefolder + '/'

    # Get a list of files. The shuffle that used to randomize this order is
    # disabled, so volumes are processed in whatever order os.listdir
    # returns them.
    allthefiles = os.listdir(sourcefolder)

    volumeIDs = list()
    volumepaths = list()

    for filename in allthefiles:
        if filename.endswith(extension):
            # The volume ID is the filename minus its extension. Slice the
            # suffix off rather than using str.replace, which would also
            # delete any earlier occurrence of the extension inside the
            # name. Written this way it is also safe for an empty extension.
            volID = filename[:len(filename) - len(extension)]
            volumeIDs.append(volID)
            volumepaths.append(sourcefolder + filename)

    metadict = metafilter.get_metadata(metadatapath, volumeIDs, excludeif,
                                       excludeifnot, excludebelow,
                                       excludeabove)

    # Now that we have a list of volumes with metadata, we can select the
    # groups of IDs that we actually intend to contrast. Integer tags mean
    # the classes are divided by date; anything else is treated as a tag.
    # This is an exact type check (bool is not accepted as int), matching
    # the previous behaviour.
    if type(positive_tags[0]) is int:
        categorytodivide = 'firstpub'
    else:
        categorytodivide = 'tagset'

    IDsToUse, classdictionary, donttrainset = metafilter.label_classes(
        metadict, categorytodivide, positive_tags, negative_tags, sizecap,
        datetype, excludeif, testconditions)

    print()
    # Named earliest/latest so the builtins min and max are not shadowed.
    earliest, latest = first_and_last(IDsToUse, metadict, datetype)
    if earliest > 0:
        print("The whole corpus involved here includes " + str(len(IDsToUse)))
        print("volumes, ranging in date from " + str(earliest) + " to " +
              str(latest) + ".")
        print()

    # Create an ordered list of id-path tuples for later use.
    volspresent, orderedIDs = get_volume_lists(volumeIDs, volumepaths,
                                               IDsToUse)

    describe_donttrainset(donttrainset, classdictionary, metadict, datetype)

    # Create a flag for each volume that indicates whether it was used in
    # training.
    record_trainflags(metadict, donttrainset)

    # get_vocablist either reads the vocabulary already stored in vocabpath
    # or builds one from the files and stores it there; n caps its size.
    #
    # useall is a parameter you basically don't need to worry about unless
    # you're changing / testing code. Set to False, the vocablist excludes
    # words that occur very rarely. That shouldn't be necessary, because
    # the crossvalidation routine is designed not to include features that
    # occur zero times in the training set, but it is something to try if
    # you get div-by-zero errors in training.
    vocablist = get_vocablist(vocabpath,
                              volspresent,
                              useall=True,
                              n=numfeatures)

    numfeatures = len(vocablist)
    print()
    print("Number of features " + str(numfeatures))

    # For each volume, we create a list of the *indexes* of volumes that
    # must be excluded from the training set when that volume is predicted,
    # so that rows can easily be removed from the training matrix. Every
    # list starts with the indexes of the volumes in donttrainset.
    donttrainon = [orderedIDs.index(x) for x in donttrainset]
    authormatches = [list(donttrainon) for _ in range(len(orderedIDs))]

    if holdout_authors:
        # Enlarge each list with the indexes of all volumes by the same
        # author. There is always at least one: the volume itself, which is
        # thereby excluded from its own training set.
        authors = [metadict[anid]['author'] for anid in orderedIDs]
        for idx1, thisauthor in enumerate(authors):
            excluded = authormatches[idx1]
            # Set lookup instead of scanning the growing list. idx2 values
            # are unique, so the set never needs updating inside the loop.
            alreadyexcluded = set(excluded)
            for idx2, otherauthor in enumerate(authors):
                if thisauthor == otherauthor and idx2 not in alreadyexcluded:
                    excluded.append(idx2)
    else:
        # This code only runs if we're testing the effect of
        # holdout_authors by disabling it: each volume is still excluded
        # from its own training set.
        for idx1, excluded in enumerate(authormatches):
            if idx1 not in excluded:
                excluded.append(idx1)

    # Report how many positive and negative volumes are available for
    # training at all (i.e. are not in donttrainset).
    trainingpositives = set()
    trainingnegatives = set()

    for anid, thisclass in classdictionary.items():
        if anid in donttrainset:
            continue

        if thisclass == 1:
            trainingpositives.add(orderedIDs.index(anid))
        else:
            trainingnegatives.add(orderedIDs.index(anid))

    print('Training positives: ' + str(len(trainingpositives)))
    print('Training negatives: ' + str(len(trainingnegatives)))

    # Indexes are stored in descending order so that rows can be deleted
    # from back to front without shifting the indexes still to be deleted.
    # This becomes important in the modelingprocess module.
    for alist in authormatches:
        alist.sort(reverse=True)

    masterdata, classvector, metadict = get_dataframe(
        metadict, volspresent, classdictionary, vocablist,
        freqs_already_normalized)

    return (metadict, masterdata, classvector, classdictionary, orderedIDs,
            donttrainon, donttrainset, authormatches, vocablist)
Пример #2
0
def create_model(paths, exclusions, classifyconditions):
    ''' Train and evaluate a leave-one-out logistic model of the corpus.

    This is the main function in the module. It can be called externally;
    it's also called if the module is run directly.

    The function selects volumes by metadata, builds a feature matrix from
    the tab-separated word-count files, has
    modelingprocess.model_one_volume predict every volume in a worker
    pool, writes one row per volume to outputpath, and finally fits a
    single LogisticRegression to report coefficients.

    NOTE(review): `usedate` is read here but never assigned in this
    function; it is presumably a module-level flag. Confirm it is defined
    before calling.

    Parameters
    ----------
    paths : 5-tuple
        (sourcefolder, extension, metadatapath, outputpath, vocabpath).
    exclusions : 5-tuple
        (excludeif, excludeifnot, excludebelow, excludeabove, sizecap).
    classifyconditions : 6-tuple
        (positive_tags, negative_tags, datetype, numfeatures,
        regularization, testconditions).

    Returns
    -------
    tuple
        (accuracy, allvolumes, coefficientuples): the fraction of evaluated
        volumes classified correctly (0.0 if none were evaluated), the
        rows written to outputpath, and a sorted list of
        (coefficient * 100, that value divided by the feature's stdev,
        word) tuples.
    '''

    sourcefolder, extension, metadatapath, outputpath, vocabpath = paths
    excludeif, excludeifnot, excludebelow, excludeabove, sizecap = exclusions
    (positive_tags, negative_tags, datetype, numfeatures, regularization,
     testconditions) = classifyconditions

    # Debugging switch: when True, every coefficient is printed.
    verbose = False

    # If you want reliable results, always leave holdout_authors set to
    # True. The only reason to set it to False is to confirm that the flag
    # is actually making a difference: doing so disables the code that
    # keeps other works by the author being predicted out of the training
    # set.
    holdout_authors = True

    # Confirm that the testconditions are legal.
    confirm_testconditions(testconditions, positive_tags)

    # A trailing slash lets us build paths by plain concatenation below.
    if not sourcefolder.endswith('/'):
        sourcefolder = sourcefolder + '/'

    # Get a list of files, in whatever order os.listdir returns them.
    allthefiles = os.listdir(sourcefolder)

    volumeIDs = list()
    volumepaths = list()

    for filename in allthefiles:
        if filename.endswith(extension):
            # The volume ID is the filename minus its extension. Slice the
            # suffix off rather than using str.replace, which would also
            # delete any earlier occurrence of the extension inside the
            # name. Written this way it is also safe for an empty extension.
            volID = filename[:len(filename) - len(extension)]
            volumeIDs.append(volID)
            volumepaths.append(sourcefolder + filename)

    metadict = metafilter.get_metadata(metadatapath, volumeIDs, excludeif,
                                       excludeifnot, excludebelow,
                                       excludeabove)

    # Now that we have a list of volumes with metadata, we can select the
    # groups of IDs that we actually intend to contrast. Integer tags mean
    # the classes are divided by date; anything else is treated as a tag.
    # This is an exact type check (bool is not accepted as int), matching
    # the previous behaviour.
    if type(positive_tags[0]) is int:
        categorytodivide = 'firstpub'
    else:
        categorytodivide = 'tagset'

    IDsToUse, classdictionary, donttrainset = metafilter.label_classes(
        metadict, categorytodivide, positive_tags, negative_tags, sizecap,
        datetype, excludeif, testconditions)

    print()
    # Named earliest/latest so the builtins min and max are not shadowed.
    earliest, latest = first_and_last(IDsToUse, metadict, datetype)
    if earliest > 0:
        print("The whole corpus involved here includes " + str(len(IDsToUse)))
        print("volumes, ranging in date from " + str(earliest) + " to " +
              str(latest) + ".")
        print()

    # Create an ordered list of id-path tuples for later use.
    volspresent, orderedIDs = get_volume_lists(volumeIDs, volumepaths,
                                               IDsToUse)

    describe_donttrainset(donttrainset, classdictionary, metadict, datetype)

    # Create a flag for each volume that indicates whether it was used in
    # training.
    record_trainflags(metadict, donttrainset)

    # Get a count of docfrequency for all words in the corpus. This is
    # probably not needed and might be deprecated later.
    wordcounts = get_docfrequency(volspresent, donttrainset)

    # get_vocablist either reads the vocabulary already stored in vocabpath
    # or builds one from the files and stores it there; n caps its size.
    #
    # useall is a parameter you basically don't need to worry about unless
    # you're changing / testing code. Set to False, the vocablist excludes
    # words that occur very rarely. That shouldn't be necessary, because
    # the crossvalidation routine is designed not to include features that
    # occur zero times in the training set, but it is something to try if
    # you get div-by-zero errors in training.
    vocablist = get_vocablist(vocabpath,
                              volspresent,
                              wordcounts,
                              useall=True,
                              n=numfeatures)

    # For each volume, we create a list of the *indexes* of volumes that
    # must be excluded from the training set when that volume is predicted,
    # so that rows can easily be removed from the training matrix. Every
    # list starts with the indexes of the volumes in donttrainset.
    donttrainon = [orderedIDs.index(x) for x in donttrainset]
    authormatches = [list(donttrainon) for _ in range(len(orderedIDs))]

    if holdout_authors:
        # Enlarge each list with the indexes of all volumes by the same
        # author. There is always at least one: the volume itself, which is
        # thereby excluded from its own training set.
        authors = [metadict[anid]['author'] for anid in orderedIDs]
        for idx1, thisauthor in enumerate(authors):
            excluded = authormatches[idx1]
            # Set lookup instead of scanning the growing list. idx2 values
            # are unique, so the set never needs updating inside the loop.
            alreadyexcluded = set(excluded)
            for idx2, otherauthor in enumerate(authors):
                if thisauthor == otherauthor and idx2 not in alreadyexcluded:
                    excluded.append(idx2)
    else:
        # This code only runs if we're testing the effect of
        # holdout_authors by disabling it: each volume is still excluded
        # from its own training set.
        for idx1, excluded in enumerate(authormatches):
            if idx1 not in excluded:
                excluded.append(idx1)

    # Report how many positive and negative volumes are available for
    # training at all (i.e. are not in donttrainset).
    trainingpositives = set()
    trainingnegatives = set()

    for anid, thisclass in classdictionary.items():
        if anid in donttrainset:
            continue

        if thisclass == 1:
            trainingpositives.add(orderedIDs.index(anid))
        else:
            trainingnegatives.add(orderedIDs.index(anid))

    print('Training positives: ' + str(len(trainingpositives)))
    print('Training negatives: ' + str(len(trainingnegatives)))

    # An earlier version tried to rebalance positives and negatives after
    # same-author exclusions by excluding extra randomly sampled volumes.
    # That was dropped because it could have grossly unintended effects
    # when there were many donttrainon exclusions.

    # Record, for each volume, the size of its training set.
    numvolumes = len(orderedIDs)
    trainingsizes = []
    for idx, anid in enumerate(orderedIDs):
        trainsize = numvolumes - len(authormatches[idx])
        metadict[anid]['trainsize'] = trainsize
        trainingsizes.append(trainsize)

    # Guard against an empty corpus so the report below cannot divide by
    # zero.
    if trainingsizes:
        averagetrainingsize = sum(trainingsizes) / len(trainingsizes)
    else:
        averagetrainingsize = 0

    # Indexes are stored in descending order so that rows can be deleted
    # from back to front without shifting the indexes still to be deleted.
    # This becomes important in the modelingprocess module.
    for alist in authormatches:
        alist.sort(reverse=True)

    volsizes = dict()
    voldata = list()
    classvector = list()

    for volid, volpath in volspresent:

        voldict = dict()
        totalcount = 0
        with open(volpath, encoding='utf-8') as f:
            for line in f:
                fields = line.strip().split('\t')
                # Only well-formed "word<TAB>count" lines are used.
                if len(fields) != 2:
                    continue

                count = int(fields[1])
                voldict[fields[0]] = count
                totalcount += count

        # Dates are expressed as years after 1700, floored at zero.
        date = metautils.infer_date(metadict[volid], datetype)
        date = date - 1700
        if date < 0:
            date = 0

        if usedate:
            features = get_features_with_date(voldict, vocablist, date,
                                              totalcount)
            voldata.append(features)
        else:
            features = get_features(voldict, vocablist)
            # Avoid dividing by zero for an empty volume. Note that this
            # substitute value is also what gets recorded in volsizes (and
            # so in the 'allwords' output column) for such a volume.
            if totalcount == 0:
                totalcount = .00001
            voldata.append(features / totalcount)

        volsizes[volid] = totalcount
        classvector.append(classdictionary[volid])

    data = pd.DataFrame(voldata)

    # One argument tuple per volume for modelingprocess.model_one_volume.
    sextuplets = [
        (data, classvector, authormatches[i], i, usedate, regularization)
        for i, _ in enumerate(orderedIDs)
    ]

    # Now do leave-one-out predictions.
    print('Beginning multiprocessing.')

    pool = Pool(processes=11)
    try:
        # pool.map blocks until every volume has been predicted and
        # re-raises the first worker exception, if any.
        resultlist = pool.map(modelingprocess.model_one_volume, sextuplets)
    finally:
        # Release the worker processes even when a worker failed.
        pool.close()
        pool.join()

    assert len(resultlist) == len(orderedIDs)

    logisticpredictions = dict(zip(orderedIDs, resultlist))

    print('Multiprocessing concluded.')

    truepositives = 0
    truenegatives = 0
    falsepositives = 0
    falsenegatives = 0
    allvolumes = list()

    # newline='' is what the csv module asks for; without it rows are
    # separated by blank lines on platforms that translate '\n'.
    with open(outputpath, mode='w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        header = [
            'volid', 'dateused', 'pubdate', 'birthdate', 'firstpub', 'gender',
            'nation', 'allwords', 'logistic', 'realclass', 'trainflag',
            'trainsize', 'author', 'title', 'genretags'
        ]
        writer.writerow(header)
        for volid in IDsToUse:
            metadata = metadict[volid]
            logistic = logisticpredictions[volid]
            realclass = classdictionary[volid]
            outrow = [
                volid, metadata[datetype], metadata['pubdate'],
                metadata['birthdate'], metadata['firstpub'],
                metadata['gender'], metadata['nation'], volsizes[volid],
                logistic, realclass, metadata['trainflag'],
                metadata['trainsize'], metadata['author'], metadata['title'],
                ' | '.join(metadata['tagset'])
            ]
            writer.writerow(outrow)
            allvolumes.append(outrow)

            # Threshold the predicted probability at 0.5; exact ties are
            # broken at random.
            if logistic == 0.5:
                print("equals!")
                predictedpositive = random.sample([True, False], 1)[0]
            elif logistic > 0.5:
                predictedpositive = True
            elif logistic < 0.5:
                predictedpositive = False
            else:
                # Reached only when no comparison holds (e.g. NaN).
                print('Oh, joy. A fundamental floating point error.')
                predictedpositive = random.sample([True, False], 1)[0]

            if predictedpositive and realclass > 0.5:
                truepositives += 1
            elif not predictedpositive and realclass < 0.5:
                truenegatives += 1
            elif not predictedpositive and realclass > 0.5:
                falsenegatives += 1
            elif predictedpositive and realclass < 0.5:
                falsepositives += 1
            else:
                print("Wait a second, boss.")

    # Fit one model on everything except the donttrainon rows, purely to
    # report coefficients.
    donttrainon.sort(reverse=True)
    trainingset, yvals, testset = sliceframe(data, classvector, donttrainon, 0)
    trainingset, testset = modelingprocess.remove_zerocols(
        trainingset, testset)
    newmodel = LogisticRegression(C=regularization)
    trainingset, means, stdevs = normalizearray(trainingset, usedate)
    newmodel.fit(trainingset, yvals)

    coefficients = newmodel.coef_[0] * 100

    # NOTE(review): zip stops at the shortest sequence. If remove_zerocols
    # dropped any columns, coefficients may no longer line up one-to-one
    # with vocablist -- confirm against modelingprocess.
    coefficientuples = list(
        zip(coefficients, (coefficients / np.array(stdevs)),
            vocablist + ['pub.date']))
    coefficientuples.sort()
    if verbose:
        for coefficient, normalizedcoef, word in coefficientuples:
            print(word + " :  " + str(coefficient))

    print()
    totalevaluated = (truepositives + truenegatives + falsepositives +
                      falsenegatives)
    if totalevaluated != len(IDsToUse):
        print("Total evaluated = " + str(totalevaluated))
        print("But we've got " + str(len(IDsToUse)))

    # Each ratio below falls back to 0.0 when its denominator is zero
    # (nothing evaluated, nothing predicted positive, no real positives)
    # instead of raising ZeroDivisionError after all the work is done.
    if totalevaluated:
        accuracy = (truepositives + truenegatives) / totalevaluated
    else:
        accuracy = 0.0
    print('True positives ' + str(truepositives))
    print('True negatives ' + str(truenegatives))
    print('False positives ' + str(falsepositives))
    print('False negatives ' + str(falsenegatives))

    print()
    print('The average size of the training set was ' +
          str(averagetrainingsize))
    print()

    predictedpositives = truepositives + falsepositives
    realpositives = truepositives + falsenegatives
    precision = truepositives / predictedpositives if predictedpositives else 0.0
    recall = truepositives / realpositives if realpositives else 0.0
    if precision + recall > 0:
        F1 = 2 * (precision * recall) / (precision + recall)
    else:
        F1 = 0.0
    print("F1 : " + str(F1))

    coefficientpath = outputpath.replace('.csv', '.coefs.csv')
    if coefficientpath == outputpath:
        # outputpath contains no '.csv', so the replace changed nothing;
        # writing there would overwrite the predictions written above.
        coefficientpath = outputpath + '.coefs.csv'
    with open(coefficientpath, mode='w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        for coef, normalizedcoef, word in coefficientuples:
            writer.writerow([word, coef, normalizedcoef])

    return accuracy, allvolumes, coefficientuples
def make_dunnings(paths, exclusions, thresholds, classifyconditions):
    ''' Write per-word contrast statistics for the two classes.

    Selects volumes by metadata, builds a vocabulary from the volumes
    dated within the thresholds, splits the volumes' feature rows into the
    class-0 group and the other group, calls dunnings() once per
    vocabulary word, and writes the five values it returns to
    'dunnings.csv' in the current working directory.

    Parameters
    ----------
    paths : 4-tuple
        (sourcefolder, extension, classpath, outputpath). outputpath is
        unpacked but not used: the output filename is fixed.
    exclusions : 5-tuple
        (excludeif, excludeifnot, excludebelow, excludeabove, sizecap).
        A sizecap less than one means no sizecap (per the original notes).
    thresholds : 2-tuple
        (pastthreshold, futurethreshold); volumes dated outside this range
        do not contribute to the vocabulary.
    classifyconditions : 5-tuple
        (category2sorton, positive_class, datetype, numfeatures,
        regularization). regularization is unpacked but not used.

    Returns
    -------
    None
    '''

    sourcefolder, extension, classpath, outputpath = paths
    excludeif, excludeifnot, excludebelow, excludeabove, sizecap = exclusions
    pastthreshold, futurethreshold = thresholds
    (category2sorton, positive_class, datetype, numfeatures,
     regularization) = classifyconditions

    # A trailing slash lets us build paths by plain concatenation below.
    if not sourcefolder.endswith('/'):
        sourcefolder = sourcefolder + '/'

    # Get a list of files, in whatever order os.listdir returns them.
    allthefiles = os.listdir(sourcefolder)

    volumeIDs = list()
    volumepaths = list()

    for filename in allthefiles:
        if filename.endswith(extension):
            # The volume ID is the filename minus its extension. Slice the
            # suffix off rather than using str.replace, which would also
            # delete any earlier occurrence of the extension inside the
            # name. Written this way it is also safe for an empty extension.
            volID = filename[:len(filename) - len(extension)]
            volumeIDs.append(volID)
            volumepaths.append(sourcefolder + filename)

    metadict = metafilter.get_metadata(classpath, volumeIDs, excludeif,
                                       excludeifnot, excludebelow,
                                       excludeabove)

    # Now that we have a list of volumes with metadata, we can select the
    # groups of IDs that we actually intend to contrast. If we want to use
    # more or less everything, this may not be necessary, but in some cases
    # we want to use randomly sampled subsets.
    IDsToUse, classdictionary = metafilter.label_classes(
        metadict, category2sorton, positive_class, sizecap)

    # Membership is tested once per file, so build a set up front.
    idstouse = set(IDsToUse)

    # For initial feature selection we count the number of *documents*
    # that contain a given word, so each occurrence in a file adds one.
    wordcounts = Counter()
    volspresent = list()

    for volid, volpath in zip(volumeIDs, volumepaths):
        if volid not in idstouse:
            continue
        volspresent.append((volid, volpath))

        # Volumes outside the date window are kept in volspresent but do
        # not contribute to the vocabulary.
        date = infer_date(metadict[volid], datetype)
        if date < pastthreshold or date > futurethreshold:
            continue

        with open(volpath, encoding='utf-8') as f:
            for line in f:
                fields = line.strip().split('\t')
                # Only well-formed "word<TAB>count" lines are used.
                if len(fields) != 2:
                    continue
                word = fields[0]
                if word and word[0].isalpha():
                    wordcounts[word] += 1

    # Feature selection is deprecated. There are cool things we could do
    # with it, but they'd improve accuracy by 1% at the cost of
    # complicating our explanatory task, and explanation is more important.
    # So we just take the most common words (by number of documents
    # containing them) in the corpus.
    vocablist = [x[0] for x in wordcounts.most_common(numfeatures)]

    # NOTE(review): every volume in volspresent is used below, including
    # those outside the date thresholds. Earlier code here built
    # donttrainon / authormatches exclusion lists but never used them, so
    # they have been removed; confirm that no exclusion was intended.

    randomdata = list()
    revieweddata = list()

    for volid, volpath in volspresent:

        voldict = dict()
        with open(volpath, encoding='utf-8') as f:
            for line in f:
                fields = line.strip().split('\t')
                if len(fields) != 2:
                    continue
                voldict[fields[0]] = int(fields[1])

        features = get_features(voldict, vocablist)
        # Class 0 goes to the "random" group, everything else to the
        # "reviewed" group.
        if classdictionary[volid] == 0:
            randomdata.append(features)
        else:
            revieweddata.append(features)

    randomdata = pd.DataFrame(randomdata)
    revieweddata = pd.DataFrame(revieweddata)

    # Grand totals over every cell of each group's table.
    randsum = randomdata.values.sum()
    revsum = revieweddata.values.sum()

    # Compute everything before opening the output file, so a failure in
    # dunnings() leaves any existing dunnings.csv untouched.
    dunningdict = dict()
    for idx, word in enumerate(vocablist):
        signed_dunnings, bns, ratio, mwu, mwp = dunnings(
            randomdata, randsum, revieweddata, revsum, idx)
        dunningdict[word] = (signed_dunnings, bns, ratio, mwu, mwp)

    # newline='' is what the csv module asks for; without it rows are
    # separated by blank lines on platforms that translate '\n'.
    with open('dunnings.csv', mode='w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(
            f, fieldnames=['word', 'dunnings', 'bns', 'ratio', 'mwu', 'mwp'])
        writer.writeheader()
        for word, value in dunningdict.items():
            writer.writerow({
                'word': word,
                'dunnings': value[0],
                'bns': value[1],
                'ratio': value[2],
                'mwu': value[3],
                'mwp': value[4],
            })
def create_model(paths, exclusions, thresholds, classifyconditions):
    ''' This is the main function in the module.
    It can be called externally; it's also called
    if the module is run directly.

    It selects volumes from a folder of tab-separated word-count files,
    builds a vocabulary from document frequencies, asks
    modelingprocess.model_one_volume for one prediction per volume
    (in a multiprocessing pool), writes the predictions to disk, and
    finally fits one LogisticRegression on all trainable volumes in
    order to report its coefficients.

    Parameters (each is a tuple that is unpacked immediately):
        paths: (sourcefolder, extension, classpath, outputpath)
        exclusions: (excludeif, excludeifnot, excludebelow,
            excludeabove, sizecap); handed to metafilter unchanged.
        thresholds: (pastthreshold, futurethreshold). Volumes dated
            outside this range still get a prediction, but they do not
            contribute to the vocabulary and are never trained on.
        classifyconditions: (category2sorton, positive_class, datetype)

    Side effects:
        Writes one row per volume to outputpath and writes the sorted
        coefficients to 'coefficients.csv' in the working directory.

    Returns:
        (accuracy, allvolumes, coefficientuples), where allvolumes is
        the list of rows written to outputpath and coefficientuples is
        a sorted list of (coefficient, normalized coefficient, word).

    NOTE(review): this function reads `usedate`, which is not defined
    here; it must exist as a module-level global -- confirm.
    '''

    sourcefolder, extension, classpath, outputpath = paths
    excludeif, excludeifnot, excludebelow, excludeabove, sizecap = exclusions
    pastthreshold, futurethreshold = thresholds
    category2sorton, positive_class, datetype = classifyconditions

    verbose = False

    if not sourcefolder.endswith('/'):
        sourcefolder = sourcefolder + '/'

    # This just makes things easier.

    # Get a list of files.
    allthefiles = os.listdir(sourcefolder)
    # random.shuffle(allthefiles)

    volumeIDs = list()
    volumepaths = list()

    for filename in allthefiles:

        if filename.endswith(extension):
            # The volume ID is the filename minus its extension. We slice
            # off the suffix instead of using str.replace, so that an
            # accidental occurrence of the extension *inside* the filename
            # is left alone.
            volID = filename[:len(filename) - len(extension)]
            volumeIDs.append(volID)
            volumepaths.append(sourcefolder + filename)

    metadict = metafilter.get_metadata(classpath, volumeIDs, excludeif,
                                       excludeifnot, excludebelow,
                                       excludeabove)

    # Now that we have a list of volumes with metadata, we can select the
    # groups of IDs that we actually intend to contrast. If we want to use
    # more or less everything, this may not be necessary. But in some cases
    # we want to use randomly sampled subsets.

    # The default condition here is

    # category2sorton = 'reviewed'
    # positive_class = 'rev'
    # sizecap = 350
    # A sizecap less than one means, no sizecap.

    IDsToUse, classdictionary = metafilter.label_classes(
        metadict, category2sorton, positive_class, sizecap)

    # First pass over the files: count, for each word, the number of
    # documents that contain it. Only volumes inside the date thresholds
    # contribute to the vocabulary.
    wordcounts = Counter()

    volspresent = list()
    orderedIDs = list()

    for volid, volpath in zip(volumeIDs, volumepaths):
        if volid not in IDsToUse:
            continue

        volspresent.append((volid, volpath))
        orderedIDs.append(volid)

        date = infer_date(metadict[volid], datetype)
        if date < pastthreshold or date > futurethreshold:
            continue

        with open(volpath, encoding='utf-8') as f:
            for line in f:
                fields = line.strip().split('\t')
                if len(fields) != 2:
                    continue
                word = fields[0]
                if len(word) > 0 and word[0].isalpha():
                    # for initial feature selection we use the number of
                    # *documents* that contain a given word,
                    # so it's just +=1. (The count itself is parsed in
                    # the second pass below.)
                    wordcounts[word] += 1

    vocablist = [x[0] for x in wordcounts.most_common(3200)]

    # Feature selection beyond "most common words" is deprecated. There are
    # more sophisticated things we could do, but they'd improve accuracy by
    # 1% at the cost of complicating our explanatory task.
    # The tradeoff isn't worth it. Explanation is more important.

    donttrainon = list()

    # Here we create a list of indexes (into orderedIDs) of volumes not to
    # be used for training. For instance, we have supplemented the dataset
    # with volumes that are in the Norton but that did not actually occur in
    # random sampling. We want to make predictions for these, but never use
    # them for training.

    for idx1, anid in enumerate(orderedIDs):
        reviewedstatus = metadict[anid]['reviewed']
        date = infer_date(metadict[anid], datetype)
        if reviewedstatus == 'addedbecausecanon':
            donttrainon.append(idx1)
        elif date < pastthreshold or date > futurethreshold:
            donttrainon.append(idx1)

    # For every volume, identify the indexes that have the same author
    # (there will always be at least one: the volume itself), and add all
    # the indexes in donttrainon, since these lists are used to exclude rows.
    # A set makes the membership test cheap; the result is sorted in reverse
    # so that rows can be deleted from back to front without changing the
    # indexes yet to be deleted.

    authors = [metadict[anid]['author'] for anid in orderedIDs]
    authormatches = list()
    for thisauthor in authors:
        excluded = set(donttrainon)
        for idx2, otherauthor in enumerate(authors):
            if thisauthor == otherauthor:
                excluded.add(idx2)
        authormatches.append(sorted(excluded, reverse=True))

    # Second pass over the files: build one feature row per volume.

    volsizes = dict()
    voldata = list()
    classvector = list()

    for volid, volpath in volspresent:

        with open(volpath, encoding='utf-8') as f:
            voldict = dict()
            totalcount = 0
            for line in f:
                fields = line.strip().split('\t')
                if len(fields) != 2:
                    continue

                word = fields[0]
                count = int(fields[1])
                voldict[word] = count
                totalcount += count

        # Dates are expressed as years after 1700, floored at zero.
        date = infer_date(metadict[volid], datetype)
        date = date - 1700
        if date < 0:
            date = 0

        if usedate:
            features = get_features_with_date(voldict, vocablist, date,
                                              totalcount)
            voldata.append(features)
        else:
            features = get_features(voldict, vocablist)
            # The small constant avoids division by zero for empty volumes.
            voldata.append(features / (totalcount + 0.001))

        volsizes[volid] = totalcount
        classflag = classdictionary[volid]
        classvector.append(classflag)

    data = pd.DataFrame(voldata)

    fivetuples = list()
    for i, volid in enumerate(orderedIDs):
        listtoexclude = authormatches[i]
        afivetuple = data, classvector, listtoexclude, i, usedate
        fivetuples.append(afivetuple)

    # Now do leave-one-out predictions.
    print('Beginning multiprocessing.')

    pool = Pool(processes=12)
    try:
        resultlist = pool.map(modelingprocess.model_one_volume, fivetuples)
    finally:
        # Shut the workers down even if one of them raised.
        pool.close()
        pool.join()

    assert len(resultlist) == len(orderedIDs)

    logisticpredictions = dict()
    for i, volid in enumerate(orderedIDs):
        logisticpredictions[volid] = resultlist[i]

    print('Multiprocessing concluded.')

    truepositives = 0
    truenegatives = 0
    falsepositives = 0
    falsenegatives = 0
    allvolumes = list()

    # NOTE(review): the loop below indexes volsizes and logisticpredictions
    # by every ID in IDsToUse, so it assumes each of those IDs had a file
    # in sourcefolder -- confirm against metafilter.

    # newline='' is what the csv module asks for; without it rows get an
    # extra carriage return on platforms that translate line endings.
    with open(outputpath, mode='w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        header = [
            'volid', 'reviewed', 'obscure', 'pubdate', 'birthdate', 'gender',
            'nation', 'allwords', 'logistic', 'author', 'title', 'pubname',
            'actually', 'realclass'
        ]
        writer.writerow(header)
        for volid in IDsToUse:
            metadata = metadict[volid]
            reviewed = metadata['reviewed']
            obscure = metadata['obscure']
            pubdate = infer_date(metadata, datetype)
            birthdate = metadata['birthdate']
            gender = metadata['gender']
            nation = metadata['nation']
            author = metadata['author']
            title = metadata['title']
            canonicity = metadata['canonicity']
            pubname = metadata['pubname']
            allwords = volsizes[volid]
            logistic = logisticpredictions[volid]
            realclass = classdictionary[volid]
            outrow = [
                volid, reviewed, obscure, pubdate, birthdate, gender, nation,
                allwords, logistic, author, title, pubname, canonicity,
                realclass
            ]
            writer.writerow(outrow)
            allvolumes.append(outrow)

            # Tally the confusion matrix at a 0.5 decision threshold.
            if logistic > 0.5 and realclass > 0.5:
                truepositives += 1
            elif logistic <= 0.5 and realclass < 0.5:
                truenegatives += 1
            elif logistic <= 0.5 and realclass > 0.5:
                falsenegatives += 1
            elif logistic > 0.5 and realclass < 0.5:
                falsepositives += 1

    # Fit a single model on everything we are allowed to train on, purely
    # so that we can report its coefficients.
    donttrainon.sort(reverse=True)
    trainingset, yvals, testset = sliceframe(data, classvector, donttrainon, 0)
    newmodel = LogisticRegression(C=.00007)
    trainingset, means, stdevs = normalizearray(trainingset, usedate)
    newmodel.fit(trainingset, yvals)

    coefficients = newmodel.coef_[0] * 1000000

    # zip() stops at the shortest input, so the extra 'pub.date' label is
    # only consumed when the model actually has one more column than the
    # vocabulary.
    coefficientuples = list(
        zip(coefficients, (coefficients / np.array(stdevs)),
            vocablist + ['pub.date']))
    coefficientuples.sort()
    if verbose:
        for coefficient, normalizedcoef, word in coefficientuples:
            print(word + " :  " + str(coefficient))

    print()
    accuracy = (truepositives + truenegatives) / len(IDsToUse)

    with open('coefficients.csv', mode='w', encoding='utf-8',
              newline='') as f:
        writer = csv.writer(f)
        for triple in coefficientuples:
            coef, normalizedcoef, word = triple
            writer.writerow([word, coef, normalizedcoef])

    return accuracy, allvolumes, coefficientuples
def create_model(paths, exclusions, thresholds, classifyconditions):
    ''' This is the main function in the module.
    It can be called externally; it's also called
    if the module is run directly.

    It selects volumes from a folder of tab-separated feature files,
    chooses a feature list, asks modelingprocess.model_one_volume for
    one prediction per volume (in a multiprocessing pool), writes the
    predictions to disk, and finally fits one LogisticRegression on all
    trainable volumes in order to report its coefficients.

    Parameters (each is a tuple that is unpacked immediately):
        paths: (sourcefolder, extension, classpath, outputpath)
        exclusions: (excludeif, excludeifnot, excludebelow,
            excludeabove, sizecap); handed to metafilter unchanged.
        thresholds: (pastthreshold, futurethreshold). Volumes dated
            outside this range still get a prediction, but they do not
            contribute to the vocabulary and are never trained on.
        classifyconditions: (category2sorton, positive_class, datetype,
            numfeatures, regularization). numfeatures is the size of the
            feature list; regularization is the C constant given to
            LogisticRegression and to each worker.

    Side effects:
        Writes one row per volume to outputpath, and the sorted
        coefficients to a sibling file ending in '.coefs.csv'.

    Returns:
        (accuracy, allvolumes, coefficientuples), where allvolumes is
        the list of rows written to outputpath and coefficientuples is
        a sorted list of (coefficient, normalized coefficient, word).

    NOTE(review): this function reads `usedate`, which is not defined
    here; it must exist as a module-level global -- confirm.
    '''

    sourcefolder, extension, classpath, outputpath = paths
    excludeif, excludeifnot, excludebelow, excludeabove, sizecap = exclusions
    pastthreshold, futurethreshold = thresholds
    category2sorton, positive_class, datetype, numfeatures, regularization = classifyconditions

    verbose = False

    if not sourcefolder.endswith('/'):
        sourcefolder = sourcefolder + '/'

    # This just makes things easier.

    # Only the "poems/" folder holds raw word counts. Any other folder is
    # treated as already-transformed data with columns named V0, V1, ...
    is_poems = sourcefolder == "poems/"

    # Get a list of files.
    allthefiles = os.listdir(sourcefolder)
    # random.shuffle(allthefiles)

    volumeIDs = list()
    volumepaths = list()

    for filename in allthefiles:

        if filename.endswith(extension):
            # The volume ID is the filename minus its extension. We slice
            # off the suffix instead of using str.replace, so that an
            # accidental occurrence of the extension *inside* the filename
            # is left alone.
            volID = filename[:len(filename) - len(extension)]
            volumeIDs.append(volID)
            volumepaths.append(sourcefolder + filename)

    metadict = metafilter.get_metadata(classpath, volumeIDs, excludeif, excludeifnot, excludebelow, excludeabove)

    # Now that we have a list of volumes with metadata, we can select the
    # groups of IDs that we actually intend to contrast. If we want to use
    # more or less everything, this may not be necessary. But in some cases
    # we want to use randomly sampled subsets.

    # The default condition here is

    # category2sorton = 'reviewed'
    # positive_class = 'rev'
    # sizecap = 350
    # A sizecap less than one means, no sizecap.

    IDsToUse, classdictionary = metafilter.label_classes(metadict, category2sorton, positive_class, sizecap)

    # First pass: decide which volumes are present and, for poems, count
    # the number of documents that contain each word.
    wordcounts = Counter()

    volspresent = list()
    orderedIDs = list()

    for volid, volpath in zip(volumeIDs, volumepaths):
        if volid not in IDsToUse:
            continue

        volspresent.append((volid, volpath))
        orderedIDs.append(volid)

        if not is_poems:
            # The feature list is synthetic in this case, so document
            # frequencies would never be looked at; skip the file read.
            continue

        date = infer_date(metadict[volid], datetype)
        if date < pastthreshold or date > futurethreshold:
            continue

        with open(volpath, encoding = 'utf-8') as f:
            for line in f:
                fields = line.strip().split('\t')
                if len(fields) != 2:
                    continue
                word = fields[0]
                if len(word) > 0 and word[0].isalpha():
                    # for initial feature selection we use the number of
                    # *documents* that contain a given word,
                    # so it's just +=1. (The value itself is parsed in
                    # the second pass below.)
                    wordcounts[word] += 1

    if is_poems:
        vocablist = [x[0] for x in wordcounts.most_common(numfeatures)]
    else:
        # In an SRT, we can just take them arbitrarily:
        # V0, V1, ... up to V(numfeatures - 1).
        vocablist = ["V" + str(i) for i in range(numfeatures)]

    # Feature selection is deprecated. There are cool things
    # we could do with feature selection,
    # but they'd improve accuracy by 1% at the cost of complicating our explanatory task.
    # The tradeoff isn't worth it. Explanation is more important.
    # So we just take the most common words (by number of documents containing them)
    # in the whole corpus. Technically, I suppose, we could crossvalidate that as well,
    # but *eyeroll*.

    donttrainon = list()

    # Here we create a list of indexes (into orderedIDs) of volumes not to
    # be used for training. For instance, we have supplemented the dataset
    # with volumes that are in the Norton but that did not actually occur in
    # random sampling. We want to make predictions for these, but never use
    # them for training.

    for idx1, anid in enumerate(orderedIDs):
        reviewedstatus = metadict[anid]['reviewed']
        date = infer_date(metadict[anid], datetype)
        if reviewedstatus == 'addedbecausecanon':
            donttrainon.append(idx1)
        elif date < pastthreshold or date > futurethreshold:
            donttrainon.append(idx1)

    # For every volume, identify the indexes that have the same author
    # (there will always be at least one: the volume itself), and add all
    # the indexes in donttrainon, since these lists are used to exclude rows.
    # A set makes the membership test cheap; the result is sorted in reverse
    # so that rows can be deleted from back to front without changing the
    # indexes yet to be deleted. This will become important in the
    # modelingprocess module.

    authors = [metadict[anid]['author'] for anid in orderedIDs]
    authormatches = list()
    for thisauthor in authors:
        excluded = set(donttrainon)
        for idx2, otherauthor in enumerate(authors):
            if thisauthor == otherauthor:
                excluded.add(idx2)
        authormatches.append(sorted(excluded, reverse = True))

    # Second pass over the files: build one feature row per volume.

    volsizes = dict()
    voldata = list()
    classvector = list()

    for volid, volpath in volspresent:

        with open(volpath, encoding = 'utf-8') as f:
            voldict = dict()
            totalcount = 0
            for line in f:
                fields = line.strip().split('\t')
                if len(fields) != 2:
                    continue

                word = fields[0]
                count = float(fields[1])
                voldict[word] = count
                totalcount += count

        # Dates are expressed as years after 1700, floored at zero.
        date = infer_date(metadict[volid], datetype)
        date = date - 1700
        if date < 0:
            date = 0

        if usedate:
            features = get_features_with_date(voldict, vocablist, date, totalcount)
            voldata.append(features)
        else:
            features = get_features(voldict, vocablist)

            if is_poems:
                # The small constant avoids division by zero.
                voldata.append(features / (totalcount + 0.001))
            else:
                # For SRT transformations, normalization is already handled
                voldata.append(features)

        volsizes[volid] = totalcount
        classflag = classdictionary[volid]
        classvector.append(classflag)

    data = pd.DataFrame(voldata)

    sextuplets = list()
    for i, volid in enumerate(orderedIDs):
        listtoexclude = authormatches[i]
        asixtuple = data, classvector, listtoexclude, i, usedate, regularization
        sextuplets.append(asixtuple)

    # Now do leave-one-out predictions.
    print('Beginning multiprocessing.')

    pool = Pool(processes = 4)
    try:
        resultlist = pool.map(modelingprocess.model_one_volume, sextuplets)
    finally:
        # Shut the workers down even if one of them raised.
        pool.close()
        pool.join()

    assert len(resultlist) == len(orderedIDs)

    logisticpredictions = dict()
    for i, volid in enumerate(orderedIDs):
        logisticpredictions[volid] = resultlist[i]

    print('Multiprocessing concluded.')

    truepositives = 0
    truenegatives = 0
    falsepositives = 0
    falsenegatives = 0
    allvolumes = list()

    # NOTE(review): the loop below indexes volsizes and logisticpredictions
    # by every ID in IDsToUse, so it assumes each of those IDs had a file
    # in sourcefolder -- confirm against metafilter.

    # newline='' is what the csv module asks for; without it rows get an
    # extra carriage return on platforms that translate line endings.
    with open(outputpath, mode = 'w', encoding = 'utf-8', newline = '') as f:
        writer = csv.writer(f)
        header = ['volid', 'reviewed', 'obscure', 'pubdate', 'birthdate', 'gender', 'nation', 'allwords', 'logistic', 'author', 'title', 'pubname', 'actually', 'realclass']
        writer.writerow(header)
        for volid in IDsToUse:
            metadata = metadict[volid]
            reviewed = metadata['reviewed']
            obscure = metadata['obscure']
            pubdate = infer_date(metadata, datetype)
            birthdate = metadata['birthdate']
            gender = metadata['gender']
            nation = metadata['nation']
            author = metadata['author']
            title = metadata['title']
            canonicity = metadata['canonicity']
            pubname = metadata['pubname']
            allwords = volsizes[volid]
            logistic = logisticpredictions[volid]
            realclass = classdictionary[volid]
            outrow = [volid, reviewed, obscure, pubdate, birthdate, gender, nation, allwords, logistic, author, title, pubname, canonicity, realclass]
            writer.writerow(outrow)
            allvolumes.append(outrow)

            # Tally the confusion matrix at a 0.5 decision threshold.
            if logistic > 0.5 and realclass > 0.5:
                truepositives += 1
            elif logistic <= 0.5 and realclass < 0.5:
                truenegatives += 1
            elif logistic <= 0.5 and realclass > 0.5:
                falsenegatives += 1
            elif logistic > 0.5 and realclass < 0.5:
                falsepositives += 1

    # Fit a single model on everything we are allowed to train on, purely
    # so that we can report its coefficients.
    donttrainon.sort(reverse = True)
    trainingset, yvals, testset = sliceframe(data, classvector, donttrainon, 0)
    newmodel = LogisticRegression(C = regularization)
    trainingset, means, stdevs = normalizearray(trainingset, usedate)
    newmodel.fit(trainingset, yvals)

    coefficients = newmodel.coef_[0] * 100

    # zip() stops at the shortest input, so the extra 'pub.date' label is
    # only consumed when the model actually has one more column than the
    # vocabulary.
    coefficientuples = list(zip(coefficients, (coefficients / np.array(stdevs)), vocablist + ['pub.date']))
    coefficientuples.sort()
    if verbose:
        for coefficient, normalizedcoef, word in coefficientuples:
            print(word + " :  " + str(coefficient))

    print()
    accuracy = (truepositives + truenegatives) / len(IDsToUse)

    # Derive the coefficient path from the suffix only. A plain
    # str.replace would return outputpath unchanged when it does not
    # contain '.csv', and the coefficients would then overwrite the
    # predictions we just wrote.
    if outputpath.endswith('.csv'):
        coefficientpath = outputpath[:-len('.csv')] + '.coefs.csv'
    else:
        coefficientpath = outputpath + '.coefs.csv'

    with open(coefficientpath, mode = 'w', encoding = 'utf-8', newline = '') as f:
        writer = csv.writer(f)
        for triple in coefficientuples:
            coef, normalizedcoef, word = triple
            writer.writerow([word, coef, normalizedcoef])

    return accuracy, allvolumes, coefficientuples
Пример #6
0
# Filter dictionaries handed to metafilter.get_metadata. How each one is
# interpreted is defined in metafilter; here we only fill them in.
excludeif = {'pubname': 'TEM', 'recept': 'addcanon'}
# Further exclusions that are currently switched off:
# excludeif['impaud'] = 'pop'
# excludeif['gender'] = 'm'

excludeifnot = {}
# excludeifnot['gender'] = 'm'

# Date window applied through the 'inferreddate' field.
excludebelow = {'inferreddate': 1700}
excludeabove = {'inferreddate': 1950}
futurethreshold = 1950

metadict = metafilter.get_metadata(
    classpath, volumeIDs, excludeif, excludeifnot, excludebelow,
    excludeabove)

# With metadata in hand, pick the groups of IDs we actually intend to
# contrast. When we want to use more or less everything this step may not
# be necessary, but in some cases we want randomly sampled subsets.

# IDsToUse = set([x for x in metadict.keys()])

# Default settings for the contrast:
category2sorton = 'reviewed'
positive_class = 'rev'
# A sizecap less than one means, no sizecap.
sizecap = 350

IDsToUse, classdictionary = metafilter.balance_classes(
    metadict, category2sorton, positive_class, sizecap)
# Filter dictionaries handed to metafilter.get_metadata. How each one is
# interpreted is defined in metafilter; here we only fill them in.
excludeif = {'pubname': 'TEM', 'recept': 'addcanon'}
# Further exclusions that are currently switched off:
# excludeif['impaud'] = 'pop'
# excludeif['gender'] = 'm'

excludeifnot = {}
# excludeifnot['gender'] = 'm'

# Date window applied through the 'inferreddate' field.
excludebelow = {'inferreddate': 1700}
excludeabove = {'inferreddate': 1950}
futurethreshold = 1950

metadict = metafilter.get_metadata(
    classpath, volumeIDs, excludeif, excludeifnot, excludebelow,
    excludeabove)

# With metadata in hand, pick the groups of IDs we actually intend to
# contrast. When we want to use more or less everything this step may not
# be necessary, but in some cases we want randomly sampled subsets.

# IDsToUse = set([x for x in metadict.keys()])

# Default settings for the contrast:
category2sorton = 'reviewed'
positive_class = 'rev'
# A sizecap less than one means, no sizecap.
sizecap = 350

IDsToUse, classdictionary = metafilter.label_classes(
    metadict, category2sorton, positive_class, sizecap)
Пример #8
0
def create_model(paths, exclusions, trainthresholds, classifyconditions):
    ''' This is the main function in the module.
    It can be called externally; it's also called
    if the module is run directly.

    It selects volumes from a folder of tab-separated word-count files,
    filters a fixed lexicon down to the words seen in this corpus, asks
    modelingprocess.model_one_volume for one prediction per volume (in
    a multiprocessing pool), writes the predictions to disk, and finally
    fits one LogisticRegression on all trainable volumes in order to
    report its coefficients.

    Parameters (each is a tuple that is unpacked immediately):
        paths: (sourcefolder, extension, classpath, outputpath)
        exclusions: (excludeif, excludeifnot, excludebelow,
            excludeabove, sizecap); handed to metafilter unchanged.
        trainthresholds: (pastthreshold, futurethreshold, donottraintag).
            Volumes dated outside the thresholds, or whose tagset
            contains donottraintag, still get a prediction, but they do
            not contribute to the vocabulary and are never trained on.
        classifyconditions: (categorytodivideon, positive_tags,
            negative_tag, datetype, numfeatures, regularization).
            regularization is the C constant given to LogisticRegression
            and to each worker. numfeatures is unpacked but not used:
            the feature list comes from the lexicon file.

    Side effects:
        Reads '../lexicon/top10k.csv'. Writes one row per volume to
        outputpath, and the sorted coefficients to a sibling file ending
        in '.coefs.csv'.

    Returns:
        (accuracy, allvolumes, coefficientuples), where allvolumes is
        the list of rows written to outputpath and coefficientuples is
        a sorted list of (coefficient, normalized coefficient, word).

    NOTE(review): this function reads `usedate`, which is not defined
    here; it must exist as a module-level global -- confirm.
    '''

    sourcefolder, extension, classpath, outputpath = paths
    excludeif, excludeifnot, excludebelow, excludeabove, sizecap = exclusions
    pastthreshold, futurethreshold, donottraintag = trainthresholds
    categorytodivideon, positive_tags, negative_tag, datetype, numfeatures, regularization = classifyconditions

    verbose = False

    if not sourcefolder.endswith('/'):
        sourcefolder = sourcefolder + '/'

    # This just makes things easier.

    # Get a list of files.
    allthefiles = os.listdir(sourcefolder)
    # random.shuffle(allthefiles)

    volumeIDs = list()
    volumepaths = list()

    for filename in allthefiles:

        if filename.endswith(extension):
            # The volume ID is the filename minus its extension. We slice
            # off the suffix instead of using str.replace, so that an
            # accidental occurrence of the extension *inside* the filename
            # is left alone.
            volID = filename[:len(filename) - len(extension)]
            volumeIDs.append(volID)
            volumepaths.append(sourcefolder + filename)

    metadict = metafilter.get_metadata(classpath, volumeIDs, excludeif,
                                       excludeifnot, excludebelow,
                                       excludeabove)

    # Now that we have a list of volumes with metadata, we can select the
    # groups of IDs that we actually intend to contrast. If we want to use
    # more or less everything, this may not be necessary. But in some cases
    # we want to use randomly sampled subsets.

    IDsToUse, classdictionary = metafilter.label_classes(
        metadict, categorytodivideon, positive_tags, negative_tag, sizecap,
        datetype)

    # First pass over the files: count, for each word, the number of
    # documents that contain it.
    wordcounts = Counter()

    volspresent = list()
    orderedIDs = list()

    # At the same time we're going to create a set of volumes
    # that should never be included in the training set.

    donttrainset = set()

    for volid, volpath in zip(volumeIDs, volumepaths):
        if volid not in IDsToUse:
            continue

        volspresent.append((volid, volpath))
        orderedIDs.append(volid)

        # The following two if statements catch volumes that should
        # be predicted but not trained on.

        # We add them to the donttrain set, and also 'continue' so
        # they are not used to create vocabulary.

        date = infer_date(metadict[volid], datetype)
        if date < pastthreshold or date > futurethreshold:
            donttrainset.add(volid)
            continue

        tagset = metadict[volid]['tagset']
        if donottraintag in tagset:
            donttrainset.add(volid)
            continue

        with open(volpath, encoding='utf-8') as f:
            for line in f:
                fields = line.strip().split('\t')
                if len(fields) != 2:
                    continue
                word = fields[0]
                if len(word) > 0 and word[0].isalpha():
                    # for initial feature selection we use the number of
                    # *documents* that contain a given word,
                    # so it's just +=1.
                    wordcounts[word] += 1

    # The feature list comes from a fixed lexicon file, so it is drawn from
    # the same pool of words for all models. However, we don't want to
    # include words that are (nearly) absent from the particular set we're
    # modeling, so a word is kept only if it occurs in more than two of the
    # trainable documents counted above.

    vocablist = []
    with open('../lexicon/top10k.csv', encoding='utf-8', newline='') as f:
        reader = csv.DictReader(f)
        for row in reader:
            word = row['word'].strip()
            if wordcounts[word] > 2:
                vocablist.append(word)

    # Feature selection is deprecated. There are cool things
    # we could do with feature selection,
    # but they'd improve accuracy by 1% at the cost of complicating our explanatory task.
    # The tradeoff isn't worth it. Explanation is more important.

    # We need a list of indexes in orderedIDs to exclude. One pass with
    # enumerate avoids a linear orderedIDs.index() search per volume.

    donttrainon = [idx for idx, anid in enumerate(orderedIDs)
                   if anid in donttrainset]

    # For every volume, identify the indexes that have the same author
    # (there will always be at least one: the volume itself), and add all
    # the indexes in donttrainon, since these lists are used to exclude rows.
    # A set makes the membership test cheap; the result is sorted in reverse
    # so that rows can be deleted from back to front without changing the
    # indexes yet to be deleted. This will become important in the
    # modelingprocess module.

    authors = [metadict[anid]['author'] for anid in orderedIDs]
    authormatches = list()
    for thisauthor in authors:
        excluded = set(donttrainon)
        for idx2, otherauthor in enumerate(authors):
            if thisauthor == otherauthor:
                excluded.add(idx2)
        authormatches.append(sorted(excluded, reverse=True))

    # Second pass over the files: build one feature row per volume.

    volsizes = dict()
    voldata = list()
    classvector = list()

    for volid, volpath in volspresent:

        with open(volpath, encoding='utf-8') as f:
            voldict = dict()
            totalcount = 0
            for line in f:
                fields = line.strip().split('\t')
                if len(fields) != 2:
                    continue

                word = fields[0]
                count = int(fields[1])
                voldict[word] = count
                totalcount += count

        # Dates are expressed as years after 1700, floored at zero.
        date = infer_date(metadict[volid], datetype)
        date = date - 1700
        if date < 0:
            date = 0

        if usedate:
            features = get_features_with_date(voldict, vocablist, date,
                                              totalcount)
            voldata.append(features)
        else:
            features = get_features(voldict, vocablist)
            # The small constant avoids division by zero for empty volumes.
            voldata.append(features / (totalcount + 0.001))

        volsizes[volid] = totalcount
        classflag = classdictionary[volid]
        classvector.append(classflag)

    data = pd.DataFrame(voldata)

    sextuplets = list()
    for i, volid in enumerate(orderedIDs):
        listtoexclude = authormatches[i]
        asixtuple = data, classvector, listtoexclude, i, usedate, regularization
        sextuplets.append(asixtuple)

    # Now do leave-one-out predictions.
    print('Beginning multiprocessing.')

    pool = Pool(processes=12)
    try:
        resultlist = pool.map(modelingprocess.model_one_volume, sextuplets)
    finally:
        # Shut the workers down even if one of them raised.
        pool.close()
        pool.join()

    assert len(resultlist) == len(orderedIDs)

    logisticpredictions = dict()
    for i, volid in enumerate(orderedIDs):
        logisticpredictions[volid] = resultlist[i]

    print('Multiprocessing concluded.')

    truepositives = 0
    truenegatives = 0
    falsepositives = 0
    falsenegatives = 0
    allvolumes = list()

    # NOTE(review): the loop below indexes volsizes and logisticpredictions
    # by every ID in IDsToUse, so it assumes each of those IDs had a file
    # in sourcefolder -- confirm against metafilter.

    # newline='' is what the csv module asks for; without it rows get an
    # extra carriage return on platforms that translate line endings.
    with open(outputpath, mode='w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        header = [
            'volid', 'dateused', 'pubdate', 'birthdate', 'firstpub', 'gender',
            'nation', 'allwords', 'logistic', 'realclass', 'author', 'title',
            'genretags'
        ]
        writer.writerow(header)
        for volid in IDsToUse:
            metadata = metadict[volid]
            dateused = metadata[datetype]
            pubdate = metadata['pubdate']
            birthdate = metadata['birthdate']
            firstpub = metadata['firstpub']
            gender = metadata['gender']
            nation = metadata['nation']
            author = metadata['author']
            title = metadata['title']
            allwords = volsizes[volid]
            logistic = logisticpredictions[volid]
            realclass = classdictionary[volid]
            genretags = ' | '.join(metadata['tagset'])
            outrow = [
                volid, dateused, pubdate, birthdate, firstpub, gender, nation,
                allwords, logistic, realclass, author, title, genretags
            ]
            writer.writerow(outrow)
            allvolumes.append(outrow)

            # Tally the confusion matrix at a 0.5 decision threshold.
            if logistic > 0.5 and realclass > 0.5:
                truepositives += 1
            elif logistic <= 0.5 and realclass < 0.5:
                truenegatives += 1
            elif logistic <= 0.5 and realclass > 0.5:
                falsenegatives += 1
            elif logistic > 0.5 and realclass < 0.5:
                falsepositives += 1

    # Fit a single model on everything we are allowed to train on, purely
    # so that we can report its coefficients.
    donttrainon.sort(reverse=True)
    trainingset, yvals, testset = sliceframe(data, classvector, donttrainon, 0)
    newmodel = LogisticRegression(C=regularization)
    trainingset, means, stdevs = normalizearray(trainingset, usedate)
    newmodel.fit(trainingset, yvals)

    coefficients = newmodel.coef_[0] * 100

    # zip() stops at the shortest input, so the extra 'pub.date' label is
    # only consumed when the model actually has one more column than the
    # vocabulary.
    coefficientuples = list(
        zip(coefficients, (coefficients / np.array(stdevs)),
            vocablist + ['pub.date']))
    coefficientuples.sort()
    if verbose:
        for coefficient, normalizedcoef, word in coefficientuples:
            print(word + " :  " + str(coefficient))

    print()
    accuracy = (truepositives + truenegatives) / len(IDsToUse)

    # Derive the coefficient path from the suffix only. A plain
    # str.replace would return outputpath unchanged when it does not
    # contain '.csv', and the coefficients would then overwrite the
    # predictions we just wrote.
    if outputpath.endswith('.csv'):
        coefficientpath = outputpath[:-len('.csv')] + '.coefs.csv'
    else:
        coefficientpath = outputpath + '.coefs.csv'

    with open(coefficientpath, mode='w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        for triple in coefficientuples:
            coef, normalizedcoef, word = triple
            writer.writerow([word, coef, normalizedcoef])

    return accuracy, allvolumes, coefficientuples
Пример #9
0
def create_model(paths, exclusions, trainthresholds, classifyconditions):
    ''' This is the main function in the module.
    It can be called externally; it's also called
    if the module is run directly.

    Builds a word-frequency feature matrix for the selected volumes, gets
    one prediction per volume from modelingprocess.model_one_volume (run in
    a multiprocessing pool), writes the per-volume results to a CSV, and
    finally fits one logistic regression on all trainable volumes in order
    to report its coefficients.

    Parameters
    paths : (sourcefolder, extension, classpath, outputpath). Volume files
        are the entries of sourcefolder ending in extension; classpath is
        handed to metafilter.get_metadata; outputpath receives the results.
    exclusions : (excludeif, excludeifnot, excludebelow, excludeabove,
        sizecap). The first four go to metafilter.get_metadata, sizecap to
        metafilter.label_classes.
    trainthresholds : (pastthreshold, futurethreshold, donottraintag).
        Volumes dated outside [pastthreshold, futurethreshold], or whose
        tagset contains donottraintag, are predicted but never trained on.
    classifyconditions : (categorytodivideon, positive_tags, negative_tag,
        datetype, numfeatures, regularization). numfeatures is unpacked for
        interface compatibility but not used; the number of features is
        determined by the vocabulary. regularization is the C parameter of
        LogisticRegression.

    Returns
    (accuracy, allvolumes, coefficientuples): the fraction of IDsToUse
    classified correctly at a 0.5 threshold, the rows written to outputpath,
    and a sorted list of (coefficient, coefficient / stdev, word) tuples.

    Side effects: reads '../lexicon/top10k.csv', writes outputpath and a
    second file whose name is outputpath with '.csv' replaced by
    '.coefs.csv', and prints progress messages.

    NOTE(review): usedate is never assigned in this function; it is
    presumably a module-level flag -- confirm.
    '''

    sourcefolder, extension, classpath, outputpath = paths
    excludeif, excludeifnot, excludebelow, excludeabove, sizecap = exclusions
    pastthreshold, futurethreshold, donottraintag = trainthresholds
    categorytodivideon, positive_tags, negative_tag, datetype, numfeatures, regularization = classifyconditions

    verbose = False

    # A trailing slash lets us build paths by simple concatenation below.
    if not sourcefolder.endswith('/'):
        sourcefolder = sourcefolder + '/'

    # Get a list of files.
    allthefiles = os.listdir(sourcefolder)
    # random.shuffle(allthefiles)

    volumeIDs = list()
    volumepaths = list()

    for filename in allthefiles:

        if filename.endswith(extension):
            volID = filename.replace(extension, "")
            # The volume ID is basically the filename minus its extension.
            # Extensions are likely to be long enough that there is little
            # danger of accidental occurrence inside a filename. E.g.
            # '.fic.tsv'
            path = sourcefolder + filename
            volumeIDs.append(volID)
            volumepaths.append(path)

    metadict = metafilter.get_metadata(classpath, volumeIDs, excludeif, excludeifnot, excludebelow, excludeabove)

    # Now that we have a list of volumes with metadata, we can select the groups of IDs
    # that we actually intend to contrast. If we want to use more or less everything,
    # this may not be necessary. But in some cases we want to use randomly sampled subsets.

    IDsToUse, classdictionary = metafilter.label_classes(metadict, categorytodivideon, positive_tags, negative_tag, sizecap, datetype)

    # Document frequency of each word across the trainable volumes.
    wordcounts = Counter()

    volspresent = list()
    orderedIDs = list()

    # At the same time we're going to create a set of volumes
    # that should never be included in the training set.

    donttrainset = set()

    for volid, volpath in zip(volumeIDs, volumepaths):
        if volid not in IDsToUse:
            continue

        volspresent.append((volid, volpath))
        orderedIDs.append(volid)

        # The following two checks catch volumes that should
        # be predicted but not trained on.

        # We add them to the donttrain set, and also 'continue' so
        # they are not used to create vocabulary.

        date = infer_date(metadict[volid], datetype)
        if date < pastthreshold or date > futurethreshold:
            donttrainset.add(volid)
            continue

        if donottraintag in metadict[volid]['tagset']:
            donttrainset.add(volid)
            continue

        with open(volpath, encoding = 'utf-8') as f:
            for line in f:
                fields = line.strip().split('\t')
                if len(fields) != 2:
                    # Skip anything that is not a word<TAB>count pair.
                    continue
                word = fields[0]
                if word and word[0].isalpha():
                    wordcounts[word] += 1
                    # for initial feature selection we use the number of
                    # *documents* that contain a given word,
                    # so it's just +=1.

    # The feature list we use is defined by the top 10,000 words (by document
    # frequency) in the whole corpus, and it will be the same for all models.
    # However, we only keep words that were seen in more than two of the
    # trainable volumes in the particular set we're modeling. So we check.

    vocablist = []
    # newline='' is what the csv module asks for on files it reads or writes.
    with open('../lexicon/top10k.csv', encoding = 'utf-8', newline = '') as f:
        reader = csv.DictReader(f)
        for row in reader:
            word = row['word'].strip()
            if wordcounts[word] > 2:
                vocablist.append(word)

    # vocablist = binormal_select(vocablist, positivecounts, negativecounts, totalposvols, totalnegvols, 3000)
    # Feature selection is deprecated. There are cool things
    # we could do with feature selection,
    # but they'd improve accuracy by 1% at the cost of complicating our explanatory task.
    # The tradeoff isn't worth it. Explanation is more important.

    # We need a list of indexes in orderedIDs to exclude.

    donttrainon = [orderedIDs.index(x) for x in donttrainset]

    # Every volume starts out with its own copy of the donttrainon indexes,
    # since those rows are excluded from every training set.
    authormatches = [list(donttrainon) for x in range(len(orderedIDs))]

    # For every index in authormatches, identify the indexes that have
    # the same author. Obvs, there will always be at least one (the volume
    # itself). Authors are looked up once here rather than inside the
    # double loop.

    authors = [metadict[anid]['author'] for anid in orderedIDs]

    for idx1, thisauthor in enumerate(authors):
        matches = authormatches[idx1]
        for idx2, otherauthor in enumerate(authors):
            if thisauthor == otherauthor and idx2 not in matches:
                matches.append(idx2)

    for alist in authormatches:
        alist.sort(reverse = True)

    # I am reversing the order of indexes so that I can delete them from
    # back to front, without changing indexes yet to be deleted.
    # This will become important in the modelingprocess module.

    volsizes = dict()
    voldata = list()
    classvector = list()

    for volid, volpath in volspresent:

        voldict = dict()
        totalcount = 0
        with open(volpath, encoding = 'utf-8') as f:
            for line in f:
                fields = line.strip().split('\t')
                if len(fields) != 2:
                    continue

                word = fields[0]
                count = int(fields[1])
                voldict[word] = count
                totalcount += count

        # Dates are expressed as years after 1700, floored at zero.
        date = infer_date(metadict[volid], datetype)
        date = date - 1700
        if date < 0:
            date = 0

        if usedate:
            features = get_features_with_date(voldict, vocablist, date, totalcount)
            voldata.append(features)
        else:
            features = get_features(voldict, vocablist)
            # The small constant avoids dividing by zero for an empty volume.
            voldata.append(features / (totalcount + 0.001))

        volsizes[volid] = totalcount
        classvector.append(classdictionary[volid])

    data = pd.DataFrame(voldata)

    # One argument tuple per volume: the shared data plus the indexes that
    # must be left out when that particular volume is predicted.
    sextuplets = list()
    for i, volid in enumerate(orderedIDs):
        listtoexclude = authormatches[i]
        asixtuple = data, classvector, listtoexclude, i, usedate, regularization
        sextuplets.append(asixtuple)

    # Now do leave-one-out predictions.
    print('Beginning multiprocessing.')

    # The context manager shuts the workers down even if one of them raises;
    # map() blocks until every result is available.
    with Pool(processes = 12) as pool:
        resultlist = pool.map(modelingprocess.model_one_volume, sextuplets)

    assert len(resultlist) == len(orderedIDs)

    logisticpredictions = dict(zip(orderedIDs, resultlist))

    print('Multiprocessing concluded.')

    truepositives = 0
    truenegatives = 0
    falsepositives = 0
    falsenegatives = 0
    allvolumes = list()

    with open(outputpath, mode = 'w', encoding = 'utf-8', newline = '') as f:
        writer = csv.writer(f)
        header = ['volid', 'dateused', 'pubdate', 'birthdate', 'firstpub', 'gender', 'nation', 'allwords', 'logistic', 'realclass', 'author', 'title', 'genretags']
        writer.writerow(header)
        for volid in IDsToUse:
            metadata = metadict[volid]
            dateused = metadata[datetype]
            pubdate = metadata['pubdate']
            birthdate = metadata['birthdate']
            firstpub = metadata['firstpub']
            gender = metadata['gender']
            nation = metadata['nation']
            author = metadata['author']
            title = metadata['title']
            allwords = volsizes[volid]
            logistic = logisticpredictions[volid]
            realclass = classdictionary[volid]
            genretags = ' | '.join(metadata['tagset'])
            outrow = [volid, dateused, pubdate, birthdate, firstpub, gender, nation, allwords, logistic, realclass, author, title, genretags]
            writer.writerow(outrow)
            allvolumes.append(outrow)

            # Tally the confusion matrix at a 0.5 threshold. A volume whose
            # class is exactly 0.5 falls through all four branches.
            if logistic > 0.5 and realclass > 0.5:
                truepositives += 1
            elif logistic <= 0.5 and realclass < 0.5:
                truenegatives += 1
            elif logistic <= 0.5 and realclass > 0.5:
                falsenegatives += 1
            elif logistic > 0.5 and realclass < 0.5:
                falsepositives += 1

    # Fit a single model on everything that may be trained on, purely to
    # report coefficients.
    donttrainon.sort(reverse = True)
    trainingset, yvals, testset = sliceframe(data, classvector, donttrainon, 0)
    newmodel = LogisticRegression(C = regularization)
    trainingset, means, stdevs = normalizearray(trainingset, usedate)
    newmodel.fit(trainingset, yvals)

    coefficients = newmodel.coef_[0] * 100

    # zip() stops at the shortest input, so the extra 'pub.date' label is
    # only consumed when the model actually has one more coefficient than
    # there are vocabulary words.
    coefficientuples = list(zip(coefficients, (coefficients / np.array(stdevs)), vocablist + ['pub.date']))
    coefficientuples.sort()
    if verbose:
        for coefficient, normalizedcoef, word in coefficientuples:
            print(word + " :  " + str(coefficient))

    print()
    accuracy = (truepositives + truenegatives) / len(IDsToUse)

    coefficientpath = outputpath.replace('.csv', '.coefs.csv')
    with open(coefficientpath, mode = 'w', encoding = 'utf-8', newline = '') as f:
        writer = csv.writer(f)
        for triple in coefficientuples:
            coef, normalizedcoef, word = triple
            writer.writerow([word, coef, normalizedcoef])

    return accuracy, allvolumes, coefficientuples
def get_data_for_model(paths, exclusions, classifyconditions):
    ''' Gathers everything needed to train and test a model.

    The three arguments are tuples of settings that describe where the
    data lives and which metadata conditions define the positive and
    negative classes. The function selects the matching volumes, obtains
    a vocabulary through get_vocablist, works out which rows must be held
    out of training for each volume, and builds the feature table through
    get_dataframe.

    Returns a 9-tuple: metadict, masterdata, classvector, classdictionary,
    orderedIDs, donttrainon, donttrainset, authormatches, vocablist.
    '''

    sourcefolder, extension, metadatapath, outputpath, vocabpath = paths
    excludeif, excludeifnot, excludebelow, excludeabove, sizecap = exclusions
    positive_tags, negative_tags, datetype, numfeatures, regularization, testconditions = classifyconditions

    # Leave holdout_authors switched on for any result you intend to trust.
    # Turning it off is only useful as an experiment to confirm that the
    # flag matters: it disables the code that keeps other works by the
    # author being predicted out of the training set.
    holdout_authors = True

    # Frequencies are assumed to arrive already divided by each volume's
    # total word count. That permits features (type/token ratio, say) that
    # would be meaningless if divided again, at the price of pushing some
    # feature-engineering decisions back to the data-prep stage.
    freqs_already_normalized = True

    # Check that the testconditions are legal before doing any work.
    confirm_testconditions(testconditions, positive_tags)

    # Guarantee a trailing slash so paths can be built by concatenation.
    if not sourcefolder.endswith('/'):
        sourcefolder = sourcefolder + '/'

    allthefiles = os.listdir(sourcefolder)

    # RANDOMNESS.
    # random.shuffle(allthefiles)
    # The shuffle above is the important line: while it stays disabled you
    # get the same sequence of orderedIDs on every run, and therefore the
    # same distribution of IDs into cross-validation folds.

    # A volume ID is the filename with the extension removed. Extensions
    # such as '.fic.tsv' are long enough that they are unlikely to turn up
    # by accident in the middle of a filename.
    datafiles = [name for name in allthefiles if name.endswith(extension)]
    volumeIDs = [name.replace(extension, "") for name in datafiles]
    volumepaths = [sourcefolder + name for name in datafiles]

    metadict = metafilter.get_metadata(metadatapath, volumeIDs, excludeif, excludeifnot, excludebelow, excludeabove)

    # With metadata in hand, choose the groups of IDs to be contrasted.
    # Integer tags mean we divide on 'firstpub'; anything else divides on
    # 'tagset'.
    categorytodivide = 'firstpub' if type(positive_tags[0]).__name__ == 'int' else 'tagset'

    IDsToUse, classdictionary, donttrainset = metafilter.label_classes(
        metadict, categorytodivide, positive_tags, negative_tags,
        sizecap, datetype, excludeif, testconditions)

    print()
    earliest, latest = first_and_last(IDsToUse, metadict, datetype)
    if earliest > 0:
        print("The whole corpus involved here includes " + str(len(IDsToUse)))
        print("volumes, ranging in date from " + str(earliest) + " to " + str(latest) + ".")
        print()

    # Ordered (id, path) tuples plus the matching ordered list of ids.
    volspresent, orderedIDs = get_volume_lists(volumeIDs, volumepaths, IDsToUse)

    # Report on the volumes that will never be used in training ...
    describe_donttrainset(donttrainset, classdictionary, metadict, datetype)

    # ... and flag each volume according to whether it was used in training.
    record_trainflags(metadict, donttrainset)

    # A docfrequency count for every word in the corpus used to be taken
    # here. It is probably not needed and might be deprecated later.
    # wordcounts = get_docfrequency(volspresent, donttrainset)

    # The feature list is the top N words (by document frequency) in the
    # whole corpus, identical for all models. get_vocablist either loads
    # the list stored at vocabpath or builds it from all files and stores
    # it there; N can be altered right here.
    #
    # useall rarely needs attention unless you are changing or testing
    # code. Set to false, it makes the vocabulary leave out very rare
    # words. That should not be necessary, because the crossvalidation
    # routine is designed not to include features that occur zero times in
    # the training set; but if div-by-zero errors appear during training,
    # it is one knob to try while troubleshooting.
    vocablist = get_vocablist(vocabpath, volspresent, useall = True, n = numfeatures)

    numfeatures = len(vocablist)
    print()
    print("Number of features " + str(numfeatures))

    # For each volume we need the *indexes* of the rows to drop from the
    # training matrix when that volume is predicted. Every one of those
    # lists begins with the indexes of the volumes in donttrainset.
    donttrainon = [orderedIDs.index(volid) for volid in donttrainset]
    authormatches = [list(donttrainon) for _ in range(len(orderedIDs))]

    # Each list is then enlarged. Normally we add every index that shares
    # the volume's author; there is always at least one, since a volume is
    # excluded from its own training set.
    if holdout_authors:
        for row, volid in enumerate(orderedIDs):
            author = metadict[volid]['author']
            excluded = authormatches[row]
            for col, othervolid in enumerate(orderedIDs):
                sameauthor = (author == metadict[othervolid]['author'])
                if sameauthor and col not in excluded:
                    excluded.append(col)
    else:
        # Reached only when holdout_authors has been disabled to test its
        # effect: each volume is excluded from its own training set and
        # nothing more.
        for row, excluded in enumerate(authormatches):
            if row not in excluded:
                excluded.append(row)

    # Count the positive and negative instances available for training.
    trainingpositives = set()
    trainingnegatives = set()

    for volid, label in classdictionary.items():
        if volid not in donttrainset:
            bucket = trainingpositives if label == 1 else trainingnegatives
            bucket.add(orderedIDs.index(volid))

    print('Training positives: ' + str(len(trainingpositives)))
    print('Training negatives: ' + str(len(trainingnegatives)))

    # Indexes are put in descending order so rows can be deleted from back
    # to front without disturbing the indexes still waiting to be deleted.
    # This will become important in the modelingprocess module.
    for excluded in authormatches:
        excluded.sort(reverse = True)

    masterdata, classvector, metadict = get_dataframe(metadict, volspresent, classdictionary, vocablist, freqs_already_normalized)

    return (metadict, masterdata, classvector, classdictionary, orderedIDs,
            donttrainon, donttrainset, authormatches, vocablist)
def create_model(paths, exclusions, classifyconditions):
    ''' This is the main function in the module.
    It can be called externally; it's also called
    if the module is run directly.

    Selects volumes by metadata, builds a word-frequency feature matrix,
    gets one prediction per volume from modelingprocess.model_one_volume
    (run in a multiprocessing pool), writes the per-volume results to a
    CSV, prints summary statistics, and finally fits one logistic
    regression on all trainable volumes in order to report coefficients.

    Parameters
    paths : (sourcefolder, extension, metadatapath, outputpath, vocabpath).
    exclusions : (excludeif, excludeifnot, excludebelow, excludeabove,
        sizecap). The first four go to metafilter.get_metadata; sizecap
        and excludeif also go to metafilter.label_classes.
    classifyconditions : (positive_tags, negative_tags, datetype,
        numfeatures, regularization, testconditions). numfeatures is passed
        to get_vocablist as n; regularization is the C parameter of
        LogisticRegression.

    Returns
    (accuracy, allvolumes, coefficientuples): the fraction of evaluated
    volumes classified correctly at a 0.5 threshold, the rows written to
    outputpath, and a sorted list of (coefficient, coefficient / stdev,
    word) tuples.

    Side effects: writes outputpath and a second file whose name is
    outputpath with '.csv' replaced by '.coefs.csv', adds a 'trainsize'
    entry to each volume's metadata dict, and prints progress and summary
    messages.

    NOTE(review): usedate is never assigned in this function; it is
    presumably a module-level flag -- confirm.
    '''

    sourcefolder, extension, metadatapath, outputpath, vocabpath = paths
    excludeif, excludeifnot, excludebelow, excludeabove, sizecap = exclusions
    positive_tags, negative_tags, datetype, numfeatures, regularization, testconditions = classifyconditions

    verbose = False
    holdout_authors = True

    # If you want reliable results, always run this with holdout_authors
    # set to True. The only reason to set it to False is to confirm that
    # this flag is actually making a difference. If you do that, it
    # disables the code that keeps other works by the author being predicted
    # out of the training set.

    # The following function confirms that the testconditions are legal.

    confirm_testconditions(testconditions, positive_tags)

    # A trailing slash lets us build paths by simple concatenation below.
    if not sourcefolder.endswith('/'):
        sourcefolder = sourcefolder + '/'

    # Get a list of files.
    allthefiles = os.listdir(sourcefolder)
    # random.shuffle(allthefiles)

    volumeIDs = list()
    volumepaths = list()

    for filename in allthefiles:

        if filename.endswith(extension):
            volID = filename.replace(extension, "")
            # The volume ID is basically the filename minus its extension.
            # Extensions are likely to be long enough that there is little
            # danger of accidental occurrence inside a filename. E.g.
            # '.fic.tsv'
            path = sourcefolder + filename
            volumeIDs.append(volID)
            volumepaths.append(path)

    metadict = metafilter.get_metadata(metadatapath, volumeIDs, excludeif, excludeifnot, excludebelow, excludeabove)

    # Now that we have a list of volumes with metadata, we can select the groups of IDs
    # that we actually intend to contrast.

    if type(positive_tags[0]).__name__ == 'int':
        categorytodivide = 'firstpub'
    else:
        categorytodivide = 'tagset'

    IDsToUse, classdictionary, donttrainset = metafilter.label_classes(metadict, categorytodivide, positive_tags, negative_tags, sizecap, datetype, excludeif, testconditions)

    print()
    # Named earliest/latest so the min() and max() builtins are not shadowed.
    earliest, latest = first_and_last(IDsToUse, metadict, datetype)
    if earliest > 0:
        print("The whole corpus involved here includes " + str(len(IDsToUse)))
        print("volumes, ranging in date from " + str(earliest) + " to " + str(latest) + ".")
        print()

    # We now create an ordered list of id-path tuples for later use, and
    # the matching ordered list of ids.

    volspresent, orderedIDs = get_volume_lists(volumeIDs, volumepaths, IDsToUse)

    # Report on the set of ids that are not to be used in training.

    describe_donttrainset(donttrainset, classdictionary, metadict, datetype)

    # Create a flag for each volume that indicates whether it was used in training

    record_trainflags(metadict, donttrainset)

    # Get a count of docfrequency for all words in the corpus. This is probably not needed and
    # might be deprecated later.

    wordcounts = get_docfrequency(volspresent, donttrainset)

    # The feature list we use is defined by the top 10,000 words (by document
    # frequency) in the whole corpus, and it will be the same for all models.

    vocablist = get_vocablist(vocabpath, volspresent, wordcounts, useall = True, n = numfeatures)

    # This function either gets the vocabulary list already stored in vocabpath, or
    # creates a list of the top 10k words in all files, and stores it there.
    # N is a parameter that could be altered right here.

    # Useall is a parameter that you basically don't need to worry about unless
    # you're changing / testing code. If you set it to false, the vocablist will
    # exclude words that occur very rarely. This shouldn't be necessary; the
    # crossvalidation routine is designed not to include features that occur
    # zero times in the training set. But if you get div-by-zero errors in the
    # training process, you could fiddle with this parameter as part of a
    # troubleshooting process.

    # For each volume, we're going to create a list of volumes that should be
    # excluded from the training set when it is to be predicted. More precisely,
    # we're going to create a list of their *indexes*, so that we can easily
    # remove rows from the training matrix.

    # This list will include for ALL volumes, the indexes of vols in the donttrainset.

    donttrainon = [orderedIDs.index(x) for x in donttrainset]

    authormatches = [list(donttrainon) for x in range(len(orderedIDs))]

    # Now we proceed to enlarge that list by identifying, for each volume,
    # a set of indexes that have the same author. Obvs, there will always be at least one.
    # We exclude a vol from its own training set.

    if holdout_authors:
        # Authors are looked up once here rather than inside the double loop.
        authors = [metadict[anid]['author'] for anid in orderedIDs]
        for idx1, thisauthor in enumerate(authors):
            matches = authormatches[idx1]
            for idx2, otherauthor in enumerate(authors):
                if thisauthor == otherauthor and idx2 not in matches:
                    matches.append(idx2)
    else:
        # This code only runs if we're testing the effect of
        # holdout_authors by disabling it.

        for idx1, matches in enumerate(authormatches):
            if idx1 not in matches:
                matches.append(idx1)

    # Count the positive and negative instances that are available for
    # training, and report them.

    trainingpositives = set()
    trainingnegatives = set()

    for anid, thisclass in classdictionary.items():
        if anid in donttrainset:
            continue

        if thisclass == 1:
            trainingpositives.add(orderedIDs.index(anid))
        else:
            trainingnegatives.add(orderedIDs.index(anid))

    print('Training positives: ' + str(len(trainingpositives)))
    print('Training negatives: ' + str(len(trainingnegatives)))

    # An earlier version went on to pad each exclusion list with randomly
    # sampled training volumes, so that equal numbers of positive and
    # negative volumes were held out in spite of same-author exclusions.
    # That step was removed because it could have grossly unintended
    # effects when there were many donttrainon exclusions.

    # Let's record, for each volume, the size of its training set.

    trainingsizes = []

    numvolumes = len(orderedIDs)
    for idx, anid in enumerate(orderedIDs):
        thistrainsize = numvolumes - len(authormatches[idx])
        metadict[anid]['trainsize'] = thistrainsize
        trainingsizes.append(thistrainsize)

    averagetrainingsize = sum(trainingsizes) / len(trainingsizes)

    for alist in authormatches:
        alist.sort(reverse = True)

    # I am reversing the order of indexes so that I can delete them from
    # back to front, without changing indexes yet to be deleted.
    # This will become important in the modelingprocess module.

    volsizes = dict()
    voldata = list()
    classvector = list()

    for volid, volpath in volspresent:

        voldict = dict()
        totalcount = 0
        with open(volpath, encoding = 'utf-8') as f:
            for line in f:
                fields = line.strip().split('\t')
                if len(fields) != 2:
                    # Skip anything that is not a word<TAB>count pair.
                    continue

                word = fields[0]
                count = int(fields[1])
                voldict[word] = count
                totalcount += count

        # Dates are expressed as years after 1700, floored at zero.
        date = metautils.infer_date(metadict[volid], datetype)
        date = date - 1700
        if date < 0:
            date = 0

        if usedate:
            features = get_features_with_date(voldict, vocablist, date, totalcount)
            voldata.append(features)
        else:
            features = get_features(voldict, vocablist)
            # Use a tiny stand-in divisor for an empty volume, but keep it
            # separate from totalcount so the reported size stays accurate.
            divisor = totalcount if totalcount else .00001
            voldata.append(features / divisor)

        volsizes[volid] = totalcount
        classvector.append(classdictionary[volid])

    data = pd.DataFrame(voldata)

    # One argument tuple per volume: the shared data plus the indexes that
    # must be left out when that particular volume is predicted.
    sextuplets = list()
    for i, volid in enumerate(orderedIDs):
        listtoexclude = authormatches[i]
        asixtuple = data, classvector, listtoexclude, i, usedate, regularization
        sextuplets.append(asixtuple)

    # Now do leave-one-out predictions.
    print('Beginning multiprocessing.')

    # The context manager shuts the workers down even if one of them raises;
    # map() blocks until every result is available.
    with Pool(processes = 11) as pool:
        resultlist = pool.map(modelingprocess.model_one_volume, sextuplets)

    assert len(resultlist) == len(orderedIDs)

    logisticpredictions = dict(zip(orderedIDs, resultlist))

    print('Multiprocessing concluded.')

    truepositives = 0
    truenegatives = 0
    falsepositives = 0
    falsenegatives = 0
    allvolumes = list()

    # newline='' is what the csv module asks for on files it writes.
    with open(outputpath, mode = 'w', encoding = 'utf-8', newline = '') as f:
        writer = csv.writer(f)
        header = ['volid', 'dateused', 'pubdate', 'birthdate', 'firstpub', 'gender', 'nation', 'allwords', 'logistic', 'realclass', 'trainflag', 'trainsize', 'author', 'title', 'genretags']
        writer.writerow(header)
        for volid in IDsToUse:
            metadata = metadict[volid]
            dateused = metadata[datetype]
            pubdate = metadata['pubdate']
            birthdate = metadata['birthdate']
            firstpub = metadata['firstpub']
            gender = metadata['gender']
            nation = metadata['nation']
            author = metadata['author']
            title = metadata['title']
            allwords = volsizes[volid]
            logistic = logisticpredictions[volid]
            realclass = classdictionary[volid]
            trainflag = metadata['trainflag']
            trainsize = metadata['trainsize']
            genretags = ' | '.join(metadata['tagset'])
            outrow = [volid, dateused, pubdate, birthdate, firstpub, gender, nation, allwords, logistic, realclass, trainflag, trainsize, author, title, genretags]
            writer.writerow(outrow)
            allvolumes.append(outrow)

            # Threshold the prediction at 0.5, breaking exact ties at random.
            if logistic == 0.5:
                print("equals!")
                predictedpositive = random.sample([True, False], 1)[0]
            elif logistic > 0.5:
                predictedpositive = True
            elif logistic < 0.5:
                predictedpositive = False
            else:
                # Only reachable when no comparison holds, e.g. for NaN.
                print('Oh, joy. A fundamental floating point error.')
                predictedpositive = random.sample([True, False], 1)[0]

            if predictedpositive and realclass > 0.5:
                truepositives += 1
            elif not predictedpositive and realclass < 0.5:
                truenegatives += 1
            elif not predictedpositive and realclass > 0.5:
                falsenegatives += 1
            elif predictedpositive and realclass < 0.5:
                falsepositives += 1
            else:
                # The real class is neither above nor below 0.5.
                print("Wait a second, boss.")

    # Fit a single model on everything that may be trained on, purely to
    # report coefficients.
    donttrainon.sort(reverse = True)
    trainingset, yvals, testset = sliceframe(data, classvector, donttrainon, 0)
    # NOTE(review): if remove_zerocols drops columns, the coefficients below
    # may no longer line up with vocablist -- confirm against modelingprocess.
    trainingset, testset = modelingprocess.remove_zerocols(trainingset, testset)
    newmodel = LogisticRegression(C = regularization)
    trainingset, means, stdevs = normalizearray(trainingset, usedate)
    newmodel.fit(trainingset, yvals)

    coefficients = newmodel.coef_[0] * 100

    # zip() stops at the shortest input, so the extra 'pub.date' label is
    # only consumed when there is a coefficient left over for it.
    coefficientuples = list(zip(coefficients, (coefficients / np.array(stdevs)), vocablist + ['pub.date']))
    coefficientuples.sort()
    if verbose:
        for coefficient, normalizedcoef, word in coefficientuples:
            print(word + " :  " + str(coefficient))

    print()
    totalevaluated = truepositives + truenegatives + falsepositives + falsenegatives
    if totalevaluated != len(IDsToUse):
        print("Total evaluated = " + str(totalevaluated))
        print("But we've got " + str(len(IDsToUse)))
    accuracy = (truepositives + truenegatives) / totalevaluated
    print('True positives ' + str(truepositives))
    print('True negatives ' + str(truenegatives))
    print('False positives ' + str(falsepositives))
    print('False negatives ' + str(falsenegatives))

    print()
    print('The average size of the training set was ' + str(averagetrainingsize))
    print()

    # A model that predicts no positives (or a sample with none) would make
    # these denominators zero; report 0.0 instead of crashing after all the
    # modeling work is done and before the coefficients are written.
    predictedpositives = truepositives + falsepositives
    actualpositives = truepositives + falsenegatives
    precision = truepositives / predictedpositives if predictedpositives else 0.0
    recall = truepositives / actualpositives if actualpositives else 0.0
    if precision + recall > 0:
        F1 = 2 * (precision * recall) / (precision + recall)
    else:
        F1 = 0.0
    print("F1 : " + str(F1))

    coefficientpath = outputpath.replace('.csv', '.coefs.csv')
    with open(coefficientpath, mode = 'w', encoding = 'utf-8', newline = '') as f:
        writer = csv.writer(f)
        for triple in coefficientuples:
            coef, normalizedcoef, word = triple
            writer.writerow([word, coef, normalizedcoef])

    return accuracy, allvolumes, coefficientuples
# Metadata filters handed to metafilter.get_metadata below. Entries that
# are commented out are alternative filters that are currently switched off.
excludeif = {
    # 'impaud': 'pop',
    'pubname': 'TEM',
    'recept': 'addcanon',
    # 'gender': 'm',
}
excludeifnot = {
    # 'gender': 'm',
}
excludeabove = {'inferreddate': 1950}
excludebelow = {'inferreddate': 1700}
futurethreshold = 1950

metadict = metafilter.get_metadata(
    classpath, volumeIDs, excludeif, excludeifnot, excludebelow, excludeabove)

# With metadata for the listed volumes in hand, we can pick out the groups
# of IDs that we actually intend to contrast. If we want to use more or less
# everything, this may not be necessary, but in some cases we want to use
# randomly sampled subsets.

# To take everything instead:
# IDsToUse = set([x for x in metadict.keys()])

# The default condition here is

category2sorton = 'reviewed'
positive_class = 'rev'
# A sizecap less than one means, no sizecap.
sizecap = 350
IDsToUse, classdictionary = metafilter.label_classes(metadict, category2sorton,
Пример #13
0
# Metadata filters handed to metafilter.get_metadata below. Entries that
# are commented out are alternative filters that are currently switched off.
excludeif = {
    # 'impaud': 'pop',
    'pubname': 'TEM',
    'recept': 'addcanon',
    # 'gender': 'm',
}
excludeifnot = {
    # 'gender': 'm',
}
excludeabove = {'inferreddate': 1950}
excludebelow = {'inferreddate': 1700}
futurethreshold = 1950

metadict = metafilter.get_metadata(
    classpath, volumeIDs, excludeif, excludeifnot, excludebelow, excludeabove)

# With metadata for the listed volumes in hand, we can pick out the groups
# of IDs that we actually intend to contrast. If we want to use more or less
# everything, this may not be necessary, but in some cases we want to use
# randomly sampled subsets.

# To take everything instead:
# IDsToUse = set([x for x in metadict.keys()])

# The default condition here is

category2sorton = 'reviewed'
positive_class = 'rev'
# A sizecap less than one means, no sizecap.
sizecap = 350
IDsToUse, classdictionary = metafilter.balance_classes(metadict,