-
Notifications
You must be signed in to change notification settings - Fork 0
/
cleandata.py
176 lines (140 loc) · 6.76 KB
/
cleandata.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
""" A collection of methods for handling data gathered through the
StackExchange API. Includes methods for extracting metric data from
StackExchange JSON archives, and methods for putting that data in to a
MySQL database.
"""
import json
import pymysql
import model
import numpy as np
# Column names of the cleaned-data table, in database order.  The ordering is
# load-bearing: 'id' (the row key used for duplicate detection) comes first
# and 'closed' (the label) comes last, and extract_data_vector slices this
# list to decide which measurements to emit.
fields = ['id', 'authorrep', 'calculus', 'colons', 'commands', 'commas',
'dollars', 'doubledollars', 'effort', 'emoticons', 'homework', 'numtags',
'paragraphs', 'periods', 'pleas', 'politeness', 'postlength', 'precalc',
'questionmarks', 'questions', 'quotes', 'spaces', 'titlelength',
'txtspeak', 'closed']
def extract_data_dict(item):
    """Given item (a dict extracted from StackExchange JSON data), extract
    metrics and return a dict of the results.

    item: dict with at least 'body', 'tags', 'title' and 'question_id' keys;
        'owner' and 'closed_details' are optional and handled gracefully.
    Returns a dict keyed by the names in the module-level ``fields`` list.
    """
    # Word groups to search for as part of data measurement.  Case variants
    # are listed explicitly so counting remains exact substring matching.
    demands = ['prove', 'Prove', 'show', 'Show', 'compute', 'Compute',
               'calculate', 'Calculate', 'find', 'Find', 'Explain', 'explain']
    effort = ['I tried', "I've tried", "My attempt", 'my attempt',
              'work so far']
    emoticons = [':)', ':-)', ':(', ':-(', ':D', ':-D', ';-)', ';)', '(:', '):',
                 ':$', ':-$']
    pleas = ['help', 'Help', "don't understand", "don't get it",
             "don't see how", 'show me', 'Show me', 'stuck', 'Stuck']
    polite = ['please', 'Please', 'thanks', 'Thanks', 'Thank you', 'thank you']
    questions = ['where', 'Where', 'what', 'What', 'when', 'When', 'why',
                 'Why', 'how', 'How', 'who', 'Who']
    txtspeak = [' u ', 'pls', 'Pls', 'Thx', 'thx']
    # Hoist the repeated dict lookups out of the many count() calls below.
    body = item['body']
    tags = item['tags']

    def _count_all(words):
        # Total substring occurrences of every word in the post body.
        return sum(body.count(word) for word in words)

    stats = dict()
    # Handle the fact that posts are occasionally anonymous by declaring such
    # users to have the minimum possible reputation (1)
    if 'owner' in item and 'reputation' in item['owner']:
        stats['authorrep'] = item['owner']['reputation']
    else:
        stats['authorrep'] = 1
    # If question has been closed, check to see if it is for the desired
    # reason ('context' appears in the closure description).
    if 'closed_details' in item:
        desc = item['closed_details']['description']
        stats['closed'] = int('context' in desc)
    else:
        stats['closed'] = 0
    stats['calculus'] = int('calculus' in tags or
                            'multivariable-calculus' in tags)
    stats['colons'] = body.count(':')
    stats['commands'] = _count_all(demands)
    stats['commas'] = body.count(',')
    stats['dollars'] = body.count('$')
    stats['doubledollars'] = body.count('$$')
    stats['effort'] = _count_all(effort)
    stats['emoticons'] = _count_all(emoticons)
    stats['homework'] = int('homework' in tags)
    stats['id'] = item['question_id']
    stats['numtags'] = len(tags)
    stats['paragraphs'] = body.count('<p>')
    stats['periods'] = body.count('.')
    stats['pleas'] = _count_all(pleas)
    stats['politeness'] = _count_all(polite)
    stats['postlength'] = len(body)
    stats['precalc'] = int('algebra-precalculus' in tags)
    stats['questionmarks'] = body.count('?')
    stats['questions'] = _count_all(questions)
    stats['quotes'] = body.count('"') + body.count("'")
    stats['spaces'] = body.count(' ')
    stats['titlelength'] = len(item['title'])
    stats['txtspeak'] = _count_all(txtspeak)
    return stats
def extract_data_vector(item, include_closed=False, include_id=False):
    """Given item (a dict extracted from StackExchange JSON data), return
    a tuple of the extracted data, in the order desired by the database.

    include_closed: Do you want the closed status of the post?
    include_id: Do you want the question ID of the post? (Key for dbase)
    """
    measurements = extract_data_dict(item)
    # 'id' is the first field and 'closed' the last, so trimming either end
    # of the field list selects exactly the columns the caller asked for.
    start = 0 if include_id else 1
    end = len(fields) if include_closed else len(fields) - 1
    return tuple(measurements[name] for name in fields[start:end])
def add_to_training_data(posts):
    """ Given posts (a list of dicts extracted from StackExchange JSON data),
    add posts to the training data stored in the database. The model is then
    retrained using all available data.
    Note: If a post ID is already in the training database, it is updated with
    the newly-extracted measurements.
    """
    # Parameterized upsert: let the driver escape every value instead of
    # splicing str(tuple(...)) into the SQL text, which is injection-prone
    # and breaks on values containing quotes.
    placeholders = ', '.join(['%s'] * len(fields))
    query = ("INSERT INTO trainingdata (" + ', '.join(fields) + ") VALUES ("
             + placeholders + ") ON DUPLICATE KEY UPDATE "
             + ','.join("{0}=VALUES({0})".format(field)
                        for field in fields[1:])
             + ';')
    datavecs = [extract_data_vector(item, True, True) for item in posts]
    # dbase.conf holds a single line: "<database>,<user>,<password>"
    with open('dbase.conf', 'r') as f:
        dbase, user, passwd = f.readline().rstrip().split(',')
    conn = pymysql.connect(user=user, passwd=passwd, db=dbase)
    try:
        cur = conn.cursor()
        count = cur.executemany(query, datavecs)
        conn.commit()
        print("Successfully merged {} entries!".format(count))
        cur.close()
    finally:
        # Close the connection even if the insert fails.
        conn.close()
    model.build_model()
def update_live_data(posts):
    """ Given posts (a list of dicts extracted from StackExchange JSON data),
    replace the current live data with the information in posts.
    """
    # BUG FIX: the placeholders must NOT be wrapped in quotes.  pymysql
    # quotes and escapes string parameters itself, so "%s" would store
    # literal quote characters around every value.
    query = ('INSERT INTO livedata (id, postlink, title, body, userid, '
             'username, userrep, userlink, userpic, prediction, prob) '
             'VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)')
    predictions = model.predictions(posts)
    probabilities = model.probabilities(posts)
    queryvals = []
    for post, pred, prob in zip(posts, predictions, probabilities):
        owner = post['owner']
        queryvals.append((post['question_id'], post['link'], post['title'],
                          post['body'], owner['user_id'],
                          owner['display_name'], owner['reputation'],
                          owner['link'], owner['profile_image'],
                          float(pred), float(prob)))
    # dbase.conf holds a single line: "<database>,<user>,<password>"
    with open('dbase.conf', 'r') as f:
        dbase, user, passwd = f.readline().rstrip().split(',')
    conn = pymysql.connect(user=user, passwd=passwd, db=dbase, charset='utf8')
    try:
        cur = conn.cursor()
        # Full refresh: clear the table, then bulk-insert the new snapshot.
        cur.execute("DELETE FROM livedata WHERE 1")
        count = cur.executemany(query, queryvals)
        conn.commit()
        print("Successfully merged {} entries!".format(count))
        cur.close()
    finally:
        # Close the connection even if the refresh fails partway through.
        conn.close()
# If the script is called directly, process the file 'rawtrainingdata.json' to
# extract metric information in to the database.
if __name__ == "__main__":
    # Context manager guarantees the file handle is closed (the original
    # left it open for the lifetime of the process).
    with open('rawtrainingdata.json', 'r') as rawdatafile:
        rawdata = json.load(rawdatafile)
    add_to_training_data(rawdata)