-
Notifications
You must be signed in to change notification settings - Fork 1
/
parser.py
executable file
·164 lines (135 loc) · 5.5 KB
/
parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
#!/usr/local/bin/python
import re
from os import listdir
from os.path import isfile, join
import yaml
import string
import utils
from utils import punctuation, brackets
# Global corpus-wide counts: token -> number of parsed log lines containing it
# (incremented once per unique token per line, across all files).
totalWords = {}
def parseFile(filename, d, responses):
# stores word occurrences for file in the giant word map
badNames = set()
prevWords = {} # people to word set
f = open(filename, 'r')
for line in f:
(who, what) = parseLine(line)
if who != '':
name = utils.getCanonicalName(who)
if name is None:
if who not in ['Topic', 'Mode']:
badNames.add(who)
else:
tokens = utils.tokenizeLine(what)
storeTokens(name, tokens, d)
storeResponses(name, set(tokens), prevWords, responses)
prevWords[name] = set(tokens) # TODO: do more filtering here instead of in storeResponses; move to helper util function
for word in set(tokens):
if not word in totalWords:
totalWords[word] = 0
totalWords[word] += 1
print badNames
def storeTokens(who, tokens, d):
    # Record bigram frequencies for one utterance:
    # d[who][current_word][next_word] += 1 for every adjacent token pair.
    # The speaker's entry is created even when the utterance has < 2 tokens.
    perPerson = d.setdefault(who, {})
    for cur, nxt in zip(tokens, tokens[1:]):
        followers = perPerson.setdefault(cur, {})
        followers[nxt] = followers.get(nxt, 0) + 1
# responses TODO:
# remove line once generated responses? otherwise if merle says 'shit' once and then fin says 'dothis' a bunch of times in a row, strong relation.
# generate correlation clusters out of raw response data, use those?
# response score affected by 'cogency' of response? e.g. 'hay kurr' happens a lot.
# raw 'respond to yana saying dryfuss' data? raw 'respond with this word to yana'?
# coherence: sum(frequencies)*product(fractions)
# or sum(totals)+product(fractions)?
def storeResponses(name, tokens, prevWords, responses):
    # Record response correlations: for each word the current speaker says,
    # count it against each word every other tracked person said in their
    # most recent line.  Layout:
    #   responses[speaker][previous_speaker][previous_word][spoken_word] -> count
    # Only purely-alphabetic words (plus '-') are counted on either side.
    #
    # BUGFIX: the original used string.letters, which is Python-2-only and
    # locale-dependent (its contents change under locale.setlocale), so the
    # "letters only" filter could silently admit non-ASCII characters.
    # string.ascii_letters is the intended constant and exists on both
    # Python 2 and 3.  The allowed-character set is also built once here
    # instead of being rebuilt for every character tested.
    allowed = set(string.ascii_letters + '-')
    byPrevSpeaker = responses.setdefault(name, {})
    for person in prevWords:
        # (original TODO preserved: consider skipping person == name / bots)
        perPerson = byPrevSpeaker.setdefault(person, {})
        for word in prevWords[person]:
            if all(c in allowed for c in word):
                wordMap = perPerson.setdefault(word, {})
                for cword in tokens:
                    if all(c in allowed for c in cword):
                        wordMap[cword] = wordMap.get(cword, 0) + 1
def parseLine(line):
    # Split one log line into (speaker, message).
    # Expected format: "HH:MM nick: text".  Returns ('', '') for any
    # line that does not match (joins, parts, topic changes, etc.).
    # TODO: more flexible toward log types
    # TODO: /me utterances
    match = re.match(r'[0-9]+:[0-9]+ (\S+?): (.*)', line)
    if match is None:
        return ('', '')
    return (match.group(1), match.group(2))
def filter(frequencies):
    # One pruning pass over the bigram map (note: intentionally shadows the
    # builtin, matching the caller's `while filter(tokenDict): pass`).
    # A word is dropped for a person when it has exactly one recorded
    # follower seen exactly once; it is then also removed wherever it
    # appears as a follower.  Returns True if anything was removed, so the
    # caller can iterate to a fixed point.  An already-empty follower dict
    # is deliberately NOT treated as removable — that is what lets the
    # fixed-point loop terminate.
    removedAny = False
    for person in frequencies:
        personMap = frequencies[person]
        doomed = set()
        for cword in personMap:
            followers = personMap[cword]
            if len(followers) <= 1:
                for count in followers.values(): # at most one entry
                    if count <= 1:
                        removedAny = True
                        doomed.add(cword)
        for dead in doomed:
            personMap.pop(dead, None)
        for dead in doomed:
            for followers in personMap.values():
                followers.pop(dead, None)
    return removedAny
def filterResponses(responses):
    # Prune low-signal entries from the response map
    # (responses[speaker][previous_speaker][previous_word][spoken_word] -> count).
    # Drops a trigger word entirely when it is ubiquitous (appears in >1000
    # lines per the module-level totalWords); otherwise drops individual
    # response words that are rare (count <= 2) or themselves ubiquitous,
    # and then drops the trigger word if nothing survived.
    for name in responses:
        for person in responses[name]:
            personMap = responses[name][person]
            dropWords = []
            for word in personMap:
                if totalWords[word] > 1000:
                    # ubiquitous trigger word carries no signal; its inner
                    # counts are discarded wholesale
                    dropWords.append(word)
                    continue
                wordMap = personMap[word]
                doomed = [cw for cw in wordMap
                          if wordMap[cw] <= 2 or totalWords[cw] > 1000]
                for cw in doomed:
                    wordMap.pop(cw, None)
                if not wordMap:
                    dropWords.append(word)
            for dw in dropWords:
                personMap.pop(dw, None)
# main
# Script entry: parse every file in the transcript directory, prune noise
# from both maps, and dump them as YAML for downstream consumers.
tokenDict = {} # maps people to words/tokens to next token to frequency
responseDict = {} # maps person talking to previous person talking to words said previously to word to say to frequency
# NOTE(review): hard-coded personal path — parameterize (argv/env) for reuse
path = '/Users/yanamal/Documents/LimeChat Transcripts/#mage'
logfiles = [ join(path,f) for f in listdir(path) if isfile(join(path,f)) ]
for file in logfiles:
    parseFile(file, tokenDict, responseDict)
filterResponses(responseDict)
# debug spot-check of one speaker pair; crashes (KeyError) if 'fin'/'yana'
# are absent from the parsed logs — presumably intentional during development
print responseDict['fin']['yana']
# repeat pruning until a fixed point: removing a rare bigram can empty
# other entries, so a single pass is not enough
while filter(tokenDict):
	pass
# TODO: filters:
# for words with >2 of same letter in a word - shorten to 1? 2? find similar and record? tokens include, e.g. 'a.' which means 'several as'?
# TODO: output histograms for total word frequency, relationship frequency #s (e.g. 3000 pairs of words are seen together 3 times)
rfile = open('responses.yaml', 'w')
yaml.dump(responseDict, rfile) # TODO: explore manual yaml output for more human-readable file? but then will input have to be manual too?
rfile.close()
freqfile = open('frequencies.yaml', 'w')
yaml.dump(tokenDict, freqfile) # TODO: explore manual yaml output for more human-readable file? but then will input have to be manual too?
freqfile.close()