# Assumes matplotlib.pyplot is imported as pp and the project's text-utility
# module (tokenize, frequencies, logify, plotting helpers) as u.
def main():
    filenames = ['Gratian0.txt', 'Gratian1.txt', 'Gratian2.txt']
    tokens = []
    for filename in filenames:
        tokens += u.tokenize('./corpus/' + filename)
    frequencies = u.frequencies(tokens)
    stop = 31  # Figure_Zy
    # stop = len(frequencies)  # Figure_Zz
    words = []
    occurrences = []
    for b in range(1, stop):  # number of occurrences (b)
        a = 0  # number of words occurring exactly b times (a)
        for value in list(frequencies.values()):
            if value == b:
                a += 1
        k = a * b * b  # ab^2 = k formulation of Zipf's law
        if a == 0:
            continue  # log(0) throws ValueError: math domain error
        words.append(a)
        occurrences.append(b)
    u.plot_data_scatter(u.logify(zip(words, occurrences)))
    slope = u.plot_regression(u.logify(zip(words, occurrences)))
    pp.xlabel('Number of Words')
    pp.ylabel('Number of Occurrences')
    pp.title('$ab^2 = k$\n(log-log, slope = ' + f'{slope:.4f})')
    pp.savefig('./PNGs/Figure_Zy')
    # pp.savefig('./PNGs/Figure_Zz')
    pp.show()

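# The ab^2 = k formulation above says the number of distinct words a occurring
# exactly b times satisfies a * b**2 ~ constant, so log(a) and log(b) are
# linearly related. A minimal sketch with a made-up k showing that, with words
# on the x-axis and occurrences on the y-axis as in main(), the expected
# log-log slope is -1/2:

import math

k = 1000.0
points = []
for b in range(1, 6):          # b = number of occurrences
    a = k / (b * b)            # a = number of words occurring exactly b times
    points.append((math.log(a), math.log(b)))

for (x0, y0), (x1, y1) in zip(points, points[1:]):
    print(round((y1 - y0) / (x1 - x0), 3))   # -0.5 between every pair of points
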
def extract(self, data):
    tokens = []
    for attr in self.list_of_attributes:
        tokens.extend(tokenize(data[attr], self.ngram_size))
    return {key: 1 for key in tokens}

def __init__(self, train_sample, test_sample, ngram_size, progressbar=None,
             normalization=False):
    self.idf = Counter()
    self.ngram_size = ngram_size
    # document frequency: count each token at most once per training record
    for _, data, _ in train_sample:
        tmp_set = set()
        for token in tokenize(data['title'] + ' ' + data['description'],
                              self.ngram_size):
            tmp_set.add(token)
        for token in tmp_set:
            self.idf[token] += 1
        if progressbar is not None:
            progressbar.update(progressbar.currval + 1)
    # convert document frequencies to IDF: log2(N / df)
    for key in self.idf:
        self.idf[key] = math.log(float(len(train_sample)) / self.idf[key], 2)
    """
    logging.info('BagOfWordsUnitedTFIDF init done')
    logging.info('Total number of attributes ' + str(len(self.idf)))
    logging.info(self.idf)
    """
    self.attributes = {x: 'numeric' for x in self.idf}
    self.normalization = normalization
    if normalization:
        self.max_value = 0.0
        logging.info('Computing max value')
        for _, data, _ in train_sample + test_sample:
            result = self.extract(data, norm=False)
            for _, val in result.items():
                self.max_value = max(self.max_value, val)
        logging.info('max value: ' + str(self.max_value))

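# A worked example (values made up) of the IDF weighting computed above:
# with N = 8 training records, a token seen in 2 of them gets log2(8 / 2) = 2.0,
# and a token seen in every record gets log2(1) = 0, so ubiquitous tokens
# contribute nothing.

import math

n_records = 8
for df in (1, 2, 4, 8):                                  # document frequency
    print(df, math.log(float(n_records) / df, 2))        # 3.0, 2.0, 1.0, 0.0
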
def __init__(self, train_sample, at_most=1000):
    cnt = Counter()
    for _, data, _ in train_sample:
        tokens = tokenize(data['title'] + ' ' + data['description'])
        for x in tokens:
            cnt[x] += 1
    # most_common() yields (token, count) pairs; keep just the tokens as
    # attribute names
    self.attributes = {x: 'numeric' for x, _ in cnt.most_common(at_most)}

def extract(self, data):
    tokens = tokenize(data['title'] + ' ' + data['description'])
    res = {}
    for x in tokens:
        res[x] = 1
    return res

def configure_dictionary(text):
    # Build a word dictionary from the tokenized text (a list of word lists)
    # and let it reorganize itself before returning.
    list_of_words = tokenize(text)
    dic = dictionary()
    for word_list in list_of_words:
        for word in word_list:
            dic.put(word)
    dic.rearrange()
    return dic, list_of_words

def __init__(self, train_sample, ngram_size, list_of_attributes,
             min_frequency, progressbar):
    self.count = Counter()
    self.ngram_size = ngram_size
    self.list_of_attributes = list_of_attributes
    for _, data, _ in train_sample:
        for attr in self.list_of_attributes:
            for token in tokenize(data[attr], self.ngram_size):
                self.count[token] += 1
        progressbar.update(progressbar.currval + 1)
    # keep only tokens seen at least min_frequency times as features
    self.attributes = {key: 'numeric'
                       for key, value in self.count.items()
                       if value >= min_frequency}

def correct(sent, model=model, topK=5, threshold=lprob):
    # print(psutil.virtual_memory())
    sent = delNonAlphabetAndEmpty(tokenize(sent))
    sentences = mergeOne(sent)
    sentences = [replaceEntities(s) for s in sentences]
    combi = [list() for i in range(len(sentences))]
    # print(sentences, sent)
    # collect up to topK spelling candidates per token
    for i in range(len(sentences)):
        for j in range(len(sentences[i])):
            vocab = spellChecker.spell(sentences[i][j])
            if len(vocab) >= topK:
                combi[i].append(vocab[:topK])
            else:
                combi[i].append(vocab)
    # print(combi)
    # expand the candidate lists into full candidate sentences and score each
    # one with the language model
    cP = [cartesianProduct(c) for c in combi]
    del combi
    prob = list()
    for i in range(len(cP)):
        for j in range(len(cP[i])):
            prob.append((calculate_sentence_ln_prob(list(cP[i][j]), model),
                         cP[i][j]))
    del cP
    prob = [p for p in prob if p[0] > threshold]
    # print(prob)
    if prob != []:
        prob.sort(key=takeFirst, reverse=True)
        # maxp = prob[:topK]
        ans = prob[0][1]
    else:
        ans = sent
        prob.append((0, False))
    # keep the original tokens wherever an entity placeholder was substituted
    out = []
    for i in range(len(ans)):
        if ans[i] in ['B-TIME', 'B-DATE', 'NUM', 'PUNCT']:
            out.append(sent[i])
        else:
            out.append(ans[i])
    return [(prob[0][0] or float("-inf")), " ".join(out)]


# x = input()
# start = time.time()
# print(correct(x))
# stop = time.time()
# print(stop-start)

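# The candidate sentences scored above come from the cartesian product of the
# per-token spelling suggestions; cartesianProduct is the project's own helper,
# but its effect is presumably equivalent to itertools.product, as in this
# sketch. Capping suggestions at topK keeps the product from exploding, since
# the number of combinations is the product of the candidate-list sizes.

from itertools import product

candidates = [['the'], ['quick', 'quack'], ['fox', 'fix']]   # made-up suggestions
for combo in product(*candidates):
    print(combo)        # 1 * 2 * 2 = 4 candidate sentences
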
def get_features(texts, n):
    '''
    Assemble a large corpus made up of texts written by an arbitrary number of
    authors; let’s say that number of authors is x.
    '''
    corpus = []
    for text in texts:
        corpus += u.tokenize(path + text + '.txt')
    '''
    Find the n most frequent words in the corpus to use as features.
    '''
    features = list(u.frequencies(corpus).keys())[:n]
    return features

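# get_features() relies on u.frequencies() returning words ordered from most
# to least frequent, so that slicing the first n keys yields the n most
# frequent words. A minimal sketch of that assumed behaviour:

from collections import Counter

def frequencies(tokens):
    # word -> count, ordered by descending count
    return dict(Counter(tokens).most_common())

print(list(frequencies(['in', 'et', 'in', 'non', 'in', 'et']).keys())[:2])
# ['in', 'et']
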
def extract(self, data, norm=True):
    tokens = tokenize(data['title'] + ' ' + data['description'],
                      self.ngram_size)
    count = Counter()
    for x in tokens:
        count[x] += 1
    """
    for x in count:
        count[x] /= float(len(tokens))
        count[x] *= self.idf[x]
        if norm and self.normalization:
            count[x] /= self.max_value
    """
    return dict(count)

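# With made-up numbers, the weighting the disabled block would apply: a token
# occurring 3 times among 50 tokens, with idf = 2.0, would score
# (3 / 50) * 2.0 = 0.12 before any max-value normalization.

count = 3
total_tokens = 50
idf = 2.0
print((count / float(total_tokens)) * idf)   # 0.12
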
def get_frequencies(features, subcorpora):
    '''
    For each of these n features, calculate the share of each of the x
    authors’ subcorpora represented by this feature, as a percentage of the
    total number of words.
    '''
    frequencies = {}
    empty = dict.fromkeys(features, 0)
    for subcorpus in subcorpora:
        frequencies[subcorpus] = empty.copy()
        subcorpus_tokens = u.tokenize(path + subcorpus + '.txt')
        subcorpus_frequencies = u.frequencies(subcorpus_tokens)
        for feature in features:
            # expressed as occurrences per 1,000 tokens of the subcorpus
            frequencies[subcorpus][feature] = (subcorpus_frequencies.get(
                feature, 0) / len(subcorpus_tokens)) * 1000
    return frequencies

def main():
    filenames = ['Gratian0.txt', 'Gratian1.txt', 'Gratian2.txt']
    tokens = []
    for filename in filenames:
        tokens += u.tokenize('./corpus/' + filename)
    print(half(tokens))
    tmp = u.rank_frequencies(u.frequencies(tokens))
    actual = list(tmp.values())
    scale = actual[0][1]
    # 30 is a commonly used number in Burrows's articles
    theoretical = u.zipf_distrib(30, scale)
    figure_za(theoretical)
    figure_zb(theoretical)
    figure_zc(dict(itertools.islice(tmp.items(), 30)))
    figure_zd(actual[0:30])
    figure_ze(actual)

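# A sketch under an assumption: u.zipf_distrib(n, scale) is taken here to
# return the first n points of an ideal Zipf rank-frequency curve, where the
# word of rank r occurs scale / r times; the actual helper may differ.

def zipf_distrib(n, scale):
    return [scale / r for r in range(1, n + 1)]

print(zipf_distrib(5, 3000))   # [3000.0, 1500.0, 1000.0, 750.0, 600.0]
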
def add_test_values(test, features, frequencies, z_scores):
    '''
    Then, calculate the same z-scores for each feature in the text for which
    we want to determine authorship.
    '''
    test_tokens = u.tokenize(path + test + '.txt')
    test_frequencies = u.frequencies(test_tokens)
    frequencies[test] = dict.fromkeys(features, 0)
    z_scores[test] = dict.fromkeys(features, 0)
    for feature in features:
        frequencies[test][feature] = (test_frequencies.get(feature, 0) /
                                      len(test_tokens)) * 1000
        z_scores[test][feature] = (
            frequencies[test][feature] -
            frequencies['means'][feature]) / frequencies['stdevs'][feature]
    return (frequencies, z_scores)

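# A worked z-score with made-up numbers: if a feature occurs 42.0 times per
# 1,000 words in the test text, and its mean of means across the subcorpora is
# 38.5 with a standard deviation of 2.5, its z-score is (42.0 - 38.5) / 2.5 = 1.4.

feature_rate = 42.0
mean_of_means = 38.5
stdev = 2.5
print((feature_rate - mean_of_means) / stdev)   # 1.4
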
def main():
    '''
    Assemble a large corpus made up of texts written by an arbitrary number of
    authors; let’s say that number of authors is x.
    '''
    test = 'cases'  # only have to change this one line
    authors = ['cases', 'laws', 'marriage', 'other', 'penance', 'second']
    authors.remove(test)
    corpus = []
    for author in authors:
        corpus += u.tokenize('./corpus/' + author + '.txt')
    '''
    Find the n most frequent words in the corpus to use as features.
    '''
    mfws = list(u.frequencies(corpus).keys())[:30]
    '''
    For each of these n features, calculate the share of each of the x
    authors’ subcorpora represented by this feature, as a percentage of the
    total number of words.
    '''
    corp_f_dict = {}
    empty = dict.fromkeys(mfws, 0)
    for author in authors:
        corp_f_dict[author] = empty.copy()
        subcorpus = u.tokenize('./corpus/' + author + '.txt')
        subcorpus_frequencies = u.frequencies(subcorpus)
        for word in mfws:
            corp_f_dict[author][word] = (subcorpus_frequencies.get(word, 0) /
                                         len(subcorpus)) * 1000
    u.write_csv(corp_f_dict, './subcorpus_frequencies.csv')
    '''
    Then, calculate the mean and the standard deviation of these x values and
    use them as the official mean and standard deviation for this feature over
    the whole corpus. In other words, we will be using a mean of means instead
    of calculating a single value representing the share of the entire corpus
    represented by each word.
    '''
    means = empty.copy()
    stdevs = empty.copy()
    for word in mfws:
        corp_f_list = []
        for author in authors:
            corp_f_list.append(corp_f_dict[author][word])
        means[word] = statistics.mean(corp_f_list)
        stdevs[word] = statistics.stdev(corp_f_list)
    '''
    For each of the n features and x subcorpora, calculate a z-score describing
    how far away from the corpus norm the usage of this particular feature in
    this particular subcorpus happens to be. To do this, subtract the "mean of
    means" for the feature from the feature’s frequency in the subcorpus and
    divide the result by the feature’s standard deviation.
    '''
    corp_z_dict = {}
    for author in authors:
        corp_z_dict[author] = empty.copy()
        for word in mfws:
            corp_z_dict[author][word] = (corp_f_dict[author][word] -
                                         means[word]) / stdevs[word]
    '''
    Then, calculate the same z-scores for each feature in the text for which we
    want to determine authorship.
    '''
    test_tokens = u.tokenize('./corpus/' + test + '.txt')
    test_frequencies = u.frequencies(test_tokens)
    # keep frequencies and z-scores in separate dicts
    test_f_dict = empty.copy()
    test_z_dict = empty.copy()
    for word in mfws:
        test_f_dict[word] = (test_frequencies.get(word, 0) /
                             len(test_tokens)) * 1000
        # can collapse this into one loop
        test_z_dict[word] = (test_f_dict[word] - means[word]) / stdevs[word]
    print(test_z_dict)
    '''
    Finally, calculate a delta score comparing the anonymous paper with each
    candidate’s subcorpus. To do this, take the average of the absolute values
    of the differences between the z-scores for each feature between the
    anonymous paper and the candidate’s subcorpus. (Read that twice!) This
    gives equal weight to each feature, no matter how often the words occur in
    the texts; otherwise, the top 3 or 4 features would overwhelm everything
    else.
    '''
    for author in authors:
        total = 0
        for word in mfws:
            total += math.fabs(corp_z_dict[author][word] - test_z_dict[word])
        delta = total / len(mfws)
        print(test + "-" + author + " delta: " + str(delta))

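# A toy Delta computation with made-up z-scores for three features; the script
# above does the same over the 30 most frequent words. Delta is the mean
# absolute difference between the test text's z-scores and a candidate
# subcorpus's z-scores.

import math

test_z = {'et': 1.4, 'in': -0.2, 'non': 0.5}
candidate_z = {'et': 1.1, 'in': 0.6, 'non': 0.4}
delta = sum(math.fabs(test_z[w] - candidate_z[w]) for w in test_z) / len(test_z)
print(delta)   # (0.3 + 0.8 + 0.1) / 3, approximately 0.4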