# bodyfetcher.py -- forked from Charcoal-SE/SmokeDetector
from spamhandling import handle_spam, check_if_spam
from globalvars import GlobalVars
import json
import time
import requests
from gibberish import classify_gibberish
class BodyFetcher:
    """Batch post ids per site, then fetch and scan the posts.

    Post ids are collected per site in ``queue``.  When a site's queue
    reaches its quota, the queued ids are requested from the Stack Exchange
    API in a single call, and every returned question (and each of its
    answers) is passed to ``check_if_spam`` and ``classify_gibberish``.
    """

    # Maps site host -> list of queued post ids.
    # NOTE: this is a class-level dict, so it is shared by every instance
    # that does not rebind ``queue`` itself.
    queue = {}

    # Per-site queue length at which a fetch is triggered.
    specialCases = {
        "stackoverflow.com": 5,
        "serverfault.com": 5,
        "superuser.com": 5,
        "math.stackexchange.com": 5,
        "drupal.stackexchange.com": 1,
        "meta.stackexchange.com": 1,
    }

    # Queue length that triggers a fetch for any site not in specialCases.
    threshold = 2

    def add_to_queue(self, post):
        """Queue the post described by *post*, then run ``check_queue``.

        *post* is a JSON document whose ``"data"`` member is itself a
        JSON-encoded string carrying ``"siteBaseHostAddress"`` and ``"id"``.
        """
        # To disable the fetcher, return here; see
        # http://chat.stackexchange.com/transcript/message/20369565#20369565
        data = json.loads(json.loads(post)["data"])
        site_host = data["siteBaseHostAddress"]
        self.queue.setdefault(site_host, []).append(data["id"])

        print(self.queue)
        self.check_queue()

    def check_queue(self):
        """Fetch at most one site whose queue has reached its quota.

        Sites listed in ``specialCases`` are considered first; only if none
        of them is due is the first other site with at least ``threshold``
        queued ids fetched.
        """
        for site, post_ids in self.queue.items():
            quota = self.specialCases.get(site)
            if quota is not None and len(post_ids) >= quota:
                print("site " + site + " met special case quota, fetching...")
                # The fetch pops ``site`` from the queue, so stop iterating.
                self.make_api_call_for_site(site)
                return

        # if we don't have any sites with their queue filled, take the
        # first one without a special case
        for site, post_ids in self.queue.items():
            if site not in self.specialCases and len(post_ids) >= self.threshold:
                self.make_api_call_for_site(site)
                return

    def print_queue(self):
        """Return one ``"\\n<site>: <count>"`` entry per queued site."""
        return "".join(
            "\n" + site + ": " + str(len(post_ids))
            for site, post_ids in self.queue.items()
        )

    def make_api_call_for_site(self, site):
        """Fetch the queued posts of *site* and scan them.

        The site's ids are removed from ``queue`` before the request is
        made, so they are dropped (not retried) if the request fails or the
        response carries no ``"quota_remaining"``.  Blocks for 60 seconds
        before issuing the request.

        Raises ``KeyError`` if *site* has no queue entry.
        """
        posts = self.queue.pop(site)
        # NOTE(review): the API key is embedded in the source and sent over
        # plain http; consider moving it to configuration.
        # The filter id is opaque here; presumably it selects the
        # title/body/link/owner/answers fields read below -- confirm.
        url = ("http://api.stackexchange.com/2.2/questions/"
               + ";".join(str(x) for x in posts)
               + "?site=" + site
               + "&filter=!4y_-sca-)pfAwlmP_1FxC6e5yzutRIcQvonAiP"
               + "&key=IAkbitmze4B8KpacUfLqkw((")
        # wait to make sure API has/updates post data
        time.sleep(60)
        try:
            response = requests.get(url, timeout=20).json()
        except (requests.exceptions.RequestException, ValueError) as err:
            # Timeouts, connection failures and non-JSON bodies all end
            # this batch; could add some retrying logic here, but eh.
            print("API request for " + site + " failed: " + type(err).__name__)
            return

        if "quota_remaining" not in response:
            GlobalVars.apiquota = 0
            return
        GlobalVars.apiquota = response["quota_remaining"]

        for post in response.get("items", []):
            if "title" not in post or "body" not in post:
                continue
            title = GlobalVars.parser.unescape(post["title"])
            body = GlobalVars.parser.unescape(post["body"])
            link = post["link"]
            owner = self._owner_details(post)
            q_id = str(post["question_id"])
            self._scan_post(site, title, body, link, q_id, owner, False)

            if "answers" not in post:
                print("no answers")
                continue
            for answer in post["answers"]:
                try:
                    answer_body = answer["body"]
                    answer_link = answer["link"]
                    a_id = str(answer["answer_id"])
                except KeyError:
                    # Incomplete answer record: skip it, keep scanning.
                    continue
                answer_owner = self._owner_details(answer)
                try:
                    print("got answer from owner with name " + answer_owner[0])
                except UnicodeError:
                    # Debug output only; a display name the console cannot
                    # encode must not abort the scan.
                    pass
                # Answers are reported under the question's title.
                self._scan_post(site, title, answer_body, answer_link, a_id,
                                answer_owner, True)

    @staticmethod
    def _owner_details(item):
        """Return ``(name, link, reputation)`` of *item*'s owner.

        Falls back to ``("", "", 0)`` as a whole when any of the three
        fields is missing.
        """
        try:
            owner = item["owner"]
            return (GlobalVars.parser.unescape(owner["display_name"]),
                    owner["link"],
                    owner["reputation"])
        except (KeyError, TypeError):
            return "", "", 0

    def _scan_post(self, site, title, body, link, post_id, owner, is_answer):
        """Run the spam and gibberish checks on one question or answer."""
        owner_name, owner_link, owner_rep = owner
        # Answers are checked with an empty title but reported with the
        # question's title.
        check_title = "" if is_answer else title
        # The trailing False is forwarded unchanged; its meaning is defined
        # by spamhandling.check_if_spam.
        is_spam, reason = check_if_spam(check_title, body, owner_name,
                                        owner_link, site, post_id,
                                        is_answer, False)
        if owner_rep <= 50 and is_spam:
            try:
                handle_spam(title, owner_name, site, link, owner_link,
                            post_id, reason, is_answer)
            except Exception:
                # Reporting is best-effort; keep scanning the batch.
                print("NOP")

        classified, gibberish_score = classify_gibberish(body, site)
        if classified and gibberish_score >= 65:
            GlobalVars.bayesian_testroom.send_message(
                "[ SmokeDetector | GibberishClassifierBeta ] "
                "Potential gibberish %s (%s%%): [%s](%s) on `%s`"
                % ("answer" if is_answer else "body",
                   gibberish_score, title, link, site)
            )