import os
import re

import nltk
import pandas as pd
import regex
from bs4 import BeautifulSoup
from textblob import TextBlob
# TODO:
# - strip anything in head/header/footer for old html posts
# - correct the oodles of spelling mistakes this has uncovered
# directory = input('Which directory would you like a word count for?')
wordcount = 0
uniquewords = dict()
wordsperpost = dict()
stems = dict()
sentiments = dict()
nonalphabeticalremover = regex.compile('[^A-Za-z ]')
# build the stemmer once here rather than once per post
stemmer = nltk.stem.PorterStemmer()
# huge shoutout to https://stackoverflow.com/questions/25109307/how-can-i-find-all-markdown-links-using-regular-expressions
linkremover = regex.compile(
    r"(?|(?<txt>(?<url>(?:ht|f)tps?://\S+(?<=\PP)))|\(([^)]+)\)\[(\g<url>)])", re.MULTILINE)

dirs = ["mendokusai/_posts", "tartarus/_posts"]

for directory in dirs:
    for file in os.listdir(directory):
        filename = os.fsdecode(file)
        if filename.endswith(".md") or filename.endswith(".html"):
            with open(os.path.join(directory, filename), 'r') as f:
                # flatten line endings into spaces
                text = f.read().replace('\n', ' ')
            # keep only the portion of the file after the Jekyll front matter
            text = text.split('---', 2)[2]
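            # the split assumes each post opens with Jekyll front matter,
            # e.g. (hypothetical post):
            #   ---
            #   title: some post
            #   ---
            #   Post body...
            # [2] is everything after the closing ---; a file with no front
            # matter would raise an IndexError here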
            # replace break and list tags with spaces
            text = text.replace('<br />', ' ')
            text = text.replace('<li>', ' ')
            # strip the remaining html tags
            text = BeautifulSoup(text, features="html.parser").get_text()
            # remove target=blank Markdown tags
            text = text.replace('{:target="_blank"}', '')
            # remove Markdown links
            text = regex.sub(linkremover, '', text)
            # replace anything that isn't an alphabetical character or a
            # space, then lowercase what's left
            text = regex.sub(nonalphabeticalremover, ' ', text).lower()
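            # e.g. "Here's <em>100</em> words!" now tokenises as
            # ["here", "s", "words"] - stripping the apostrophe leaves a
            # stray "s" behind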
            blob = TextBlob(text)
            sentiments[filename] = blob.sentiment.polarity
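            # polarity is a float in [-1.0, 1.0], negative to positive tone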
            wordsperpost[filename] = len(text.split())
            wordcount += len(text.split())
            # nltk stemming/token magic from http://ryancompton.net/2014/06/06/statistical-features-of-infinite-jest/
            tokens = nltk.word_tokenize(text)
            stemmed_tokens = (stemmer.stem(token) for token in tokens)
            for token in stemmed_tokens:
                stems[token] = stems.get(token, 0) + 1
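            # the Porter stemmer folds inflections together, e.g. "runs" and
            # "running" both count towards the "run" stem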
            for word in text.split():
                uniquewords[word] = uniquewords.get(word, 0) + 1

if wordcount < 1:
    print("Those directories don't appear to have any posts in them!")
else:
    print(f'Wordcount: {wordcount}')
    print(f'Unique words: {len(uniquewords)}')
    print(f'Unique stems: {len(stems)}')

# dicts preserve insertion order in Python 3.7+, so a value-sorted dict can
# be rebuilt straight from its sorted items
sortedwords = dict(sorted(uniquewords.items(), reverse=True, key=lambda x: x[1]))

pd.DataFrame.from_dict(data=sortedwords, orient='index').to_csv(
    'words.csv', header=False)
pd.DataFrame.from_dict(data=stems, orient='index').to_csv(
    'stems.csv', header=False)
pd.DataFrame.from_dict(data=sentiments, orient='index').to_csv(
    'sentiments.csv', header=False)
pd.DataFrame.from_dict(data=wordsperpost, orient='index').to_csv(
    'wordsperpost.csv', header=False)
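
# A sketch of how to run this (the sibling checkout of the two blog repos in
# `dirs` is an assumption about the author's layout):
#   pip install nltk regex pandas beautifulsoup4 textblob
#   python -c "import nltk; nltk.download('punkt')"  # word_tokenize needs this
#   python wordcount.py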