Example #1
def half(tokens):
    '''how far do we have to go down the frequency list to account
    for half the words in the sample?'''
    counts = u.frequencies(tokens)  # assumed to be ordered by descending count
    words = list(counts.keys())
    frequencies = list(counts.values())
    halfway = len(tokens) // 2
    i = 0
    total = 0
    while total < halfway:  # accumulate counts until half the tokens are covered
        total += frequencies[i]
        i += 1
    far = (words[i], i, frequencies[i], total)  # the next word down the list (index i)
    i -= 1
    near = (words[i], i, frequencies[i], total)  # the last word added to the running total
    return (near, halfway, far)
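These snippets rely on a helper module imported as u. Below is a minimal sketch of the two helpers that half() depends on, assuming u.frequencies returns a dict ordered by descending count; the project's actual implementations may differ.

import re

def tokenize(path):
    '''Sketch: read a text file and return a list of lowercase word tokens.'''
    with open(path, encoding='utf-8') as f:
        return re.findall(r'[a-z]+', f.read().lower())

def frequencies(tokens):
    '''Sketch: map each token to its count, ordered by descending count.'''
    counts = {}
    for token in tokens:
        counts[token] = counts.get(token, 0) + 1
    return dict(sorted(counts.items(), key=lambda item: item[1], reverse=True))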
Example #2
def main():
    filenames = ['Gratian0.txt', 'Gratian1.txt', 'Gratian2.txt']
    tokens = []
    for filename in filenames:
        tokens += u.tokenize('./corpus/' + filename)
    frequencies = u.frequencies(tokens)
    stop = 31  # Figure_Zy
    # stop = len(frequencies) # Figure_Zz
    words = []
    occurrences = []
    for b in range(1, stop):  # number of occurrences (b)
        a = 0  # number of words occurring exactly b times (a)
        for value in list(frequencies.values()):
            if value == b: a += 1
        k = a * b * b  # ab^2 = k formulation of Zipf's law (not used below)
        if a == 0: continue  # log(0) throws ValueError: math domain error
        words.append(a)
        occurrences.append(b)
    u.plot_data_scatter(u.logify(zip(words, occurrences)))
    slope = u.plot_regression(u.logify(zip(words, occurrences)))
    pp.xlabel('Number of Words')
    pp.ylabel('Number of Occurrences')
    pp.title('$ab^2 = k$\n(log-log, slope = ' + f'{slope:.4f})')
    pp.savefig('./PNGs/Figure_Zy')
    # pp.savefig('./PNGs/Figure_Zz')
    pp.show()
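The regression step depends on u.logify and u.plot_regression. Here is a hedged sketch of what they are assumed to do: log-transform the (a, b) pairs and fit a least-squares line, returning its slope. If the ab^2 = k relation holds, that slope should come out close to -0.5, since log a = log k - 2 log b implies log b = (log k - log a) / 2.

import math
import numpy as np
import matplotlib.pyplot as pp

def logify(pairs):
    '''Sketch: map each (x, y) pair to (log10(x), log10(y)).'''
    return [(math.log10(x), math.log10(y)) for x, y in pairs]

def plot_regression(pairs):
    '''Sketch: fit and plot a least-squares line, then return its slope.'''
    xs, ys = zip(*pairs)
    slope, intercept = np.polyfit(xs, ys, 1)
    pp.plot(xs, [slope * x + intercept for x in xs])
    return slope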
Example #3
def get_features(texts, n):
    '''
    Assemble a large corpus made up of texts written by an arbitrary
    number of authors; let’s say that number of authors is x.
    '''
    corpus = []
    for text in texts:
        corpus += u.tokenize(path + text + '.txt')
    '''
    Find the n most frequent words in the corpus to use as features.
    '''
    features = list(u.frequencies(corpus).keys())[:n]
    return features
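A self-contained illustration of the feature-selection step on a toy in-memory corpus; Counter.most_common stands in here for the assumed behavior of u.frequencies.

from collections import Counter

toy_corpus = 'in the beginning was the word and the word was with god'.split()
features = [word for word, count in Counter(toy_corpus).most_common(3)]
print(features)  # ['the', 'was', 'word'] -- the 3 most frequent words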
Example #4
def get_frequencies(features, subcorpora):
    '''
    For each of these n features, calculate the share of each of
    the x authors’ subcorpora represented by this feature, as a
    percentage of the total number of words.
    '''
    frequencies = {}
    empty = dict.fromkeys(features, 0)
    for subcorpus in subcorpora:
        frequencies[subcorpus] = empty.copy()
        subcorpus_tokens = u.tokenize(path + subcorpus + '.txt')
        subcorpus_frequencies = u.frequencies(subcorpus_tokens)
        for feature in features:
            frequencies[subcorpus][feature] = (subcorpus_frequencies.get(
                feature, 0) / len(subcorpus_tokens)) * 1000
    return frequencies
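Note that the docstring speaks of percentages, but the factor of 1000 makes these rates per thousand words: a feature occurring 57 times in a 19,000-token subcorpus, for example, gets the value (57 / 19000) * 1000 = 3.0, i.e. three occurrences per 1,000 words.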
Example #5
def main():
    filenames = ['Gratian0.txt', 'Gratian1.txt', 'Gratian2.txt']
    tokens = []
    for filename in filenames:
        tokens += u.tokenize('./corpus/' + filename)
    print(half(tokens))
    tmp = u.rank_frequencies(u.frequencies(tokens))
    actual = list(tmp.values())
    scale = actual[0][1]  # scale the theoretical distribution to the observed top frequency
    # 30 is a commonly used number in Burrows's articles
    theoretical = u.zipf_distrib(30, scale)
    figure_za(theoretical)
    figure_zb(theoretical)
    figure_zc(dict(itertools.islice(tmp.items(), 30)))
    figure_zd(actual[0:30])
    figure_ze(actual)
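u.zipf_distrib is assumed here to produce an idealized Zipf series in which the word of rank r occurs scale / r times; the sketch below follows that assumption and the call signature above, not the project's actual helper.

def zipf_distrib(ranks, scale):
    '''Sketch: idealized Zipf frequencies scale/1, scale/2, ..., scale/ranks.'''
    return [scale / rank for rank in range(1, ranks + 1)]

print(zipf_distrib(5, 6000))  # [6000.0, 3000.0, 2000.0, 1500.0, 1200.0]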
Example #6
def add_test_values(test, features, frequencies, z_scores):
    '''
    Then, calculate the same z-scores for each feature in the text
    for which we want to determine authorship.
    '''
    test_tokens = u.tokenize(path + test + '.txt')
    test_frequencies = u.frequencies(test_tokens)
    frequencies[test] = dict.fromkeys(features, 0)
    z_scores[test] = dict.fromkeys(features, 0)
    for feature in features:
        frequencies[test][feature] = (test_frequencies.get(feature, 0) /
                                      len(test_tokens)) * 1000
        z_scores[test][feature] = (
            frequencies[test][feature] -
            frequencies['means'][feature]) / frequencies['stdevs'][feature]
    return (frequencies, z_scores)
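A worked example of the z-score computed above, with made-up numbers:

test_rate = 4.2    # hypothetical rate of one feature per 1,000 words in the test text
mean_rate = 3.0    # hypothetical "mean of means" for that feature
stdev_rate = 0.8   # hypothetical standard deviation across the author subcorpora
z = (test_rate - mean_rate) / stdev_rate
print(round(z, 2))  # 1.5 -- the test text uses this feature 1.5 standard deviations above the corpus norm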
Example #7
def main():
    '''
    Assemble a large corpus made up of texts written by an arbitrary
    number of authors; let’s say that number of authors is x.
    '''
    test = 'cases' # only have to change this one line
    authors = ['cases', 'laws', 'marriage', 'other', 'penance', 'second']
    authors.remove(test)
    corpus = []
    for author in authors:
        corpus += u.tokenize('./corpus/' + author + '.txt')
    '''
    Find the n most frequent words in the corpus to use as features.
    '''
    mfws = list(u.frequencies(corpus).keys())[:30]
    '''
    For each of these n features, calculate the share of each of
    the x authors’ subcorpora represented by this feature, as a
    percentage of the total number of words.
    '''
    corp_f_dict = {}
    empty = dict.fromkeys(mfws, 0)
    for author in authors:
        corp_f_dict[author] = empty.copy()
        subcorpus = u.tokenize('./corpus/' + author + '.txt')
        subcorpus_frequencies = u.frequencies(subcorpus)
        for word in mfws:
            # relative frequency per 1,000 words
            corp_f_dict[author][word] = (subcorpus_frequencies.get(word, 0) / len(subcorpus)) * 1000
    u.write_csv(corp_f_dict, './subcorpus_frequencies.csv')  # write once, after all authors are done
    '''
    Then, calculate the mean and the standard deviation of these x
    values and use them as the official mean and standard deviation
    for this feature over the whole corpus. In other words, we will
    be using a mean of means instead of calculating a single value
    representing the share of the entire corpus represented by each
    word.
    '''
    means = empty.copy()
    stdevs = empty.copy()
    for word in mfws:
        corp_f_list = []
        for author in authors:
            corp_f_list.append(corp_f_dict[author][word])
        means[word] = statistics.mean(corp_f_list)
        stdevs[word] = statistics.stdev(corp_f_list)
    '''
    For each of the n features and x subcorpora, calculate a z-score
    describing how far away from the corpus norm the usage of this
    particular feature in this particular subcorpus happens to be.
    To do this, subtract the "mean of means" for the feature from
    the feature’s frequency in the subcorpus and divide the result
    by the feature’s standard deviation.
    '''
    corp_z_dict = {}
    for author in authors:
        corp_z_dict[author] = empty.copy()
        for word in mfws:
            corp_z_dict[author][word] = (corp_f_dict[author][word] - means[word]) / stdevs[word]
    '''
    Then, calculate the same z-scores for each feature in the text
    for which we want to determine authorship.
    '''
    test_tokens = u.tokenize('./corpus/' + test + '.txt')
    test_frequencies = u.frequencies(test_tokens)
    test_f_dict = empty.copy()
    test_z_dict = empty.copy()  # separate copies; chained assignment would make these the same dict
    for word in mfws:
        test_f_dict[word] = (test_frequencies.get(word, 0) / len(test_tokens)) * 1000
        # can collapse this into one loop
        test_z_dict[word] = (test_f_dict[word] - means[word]) / stdevs[word]
    print(test_z_dict)
    '''
    Finally, calculate a delta score comparing the anonymous paper
    with each candidate’s subcorpus. To do this, take the average
    of the absolute values of the differences between the z-scores
    for each feature between the anonymous paper and the candidate’s
    subcorpus. (Read that twice!) This gives equal weight to each
    feature, no matter how often the words occur in the texts;
    otherwise, the top 3 or 4 features would overwhelm everything
    else.
    '''
    for author in authors:
        total = 0  # running sum of absolute z-score differences
        for word in mfws:
            total += math.fabs(corp_z_dict[author][word] - test_z_dict[word])
        delta = total / len(mfws)
        print(test + "-" + author + " delta: " + str(delta))
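The closing loop is Burrows's Delta: the mean of the absolute differences between the test text's z-scores and a candidate's z-scores over the chosen features. A standalone restatement with a toy check (a sketch, not part of the original script):

def delta(test_z, candidate_z):
    '''Mean absolute difference between two z-score dicts over their shared features.'''
    return sum(abs(test_z[word] - candidate_z[word]) for word in test_z) / len(test_z)

print(delta({'et': 1.0, 'in': -0.5, 'non': 0.25},
            {'et': 0.5, 'in': -0.25, 'non': 0.25}))  # (0.5 + 0.25 + 0.0) / 3 = 0.25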