forked from Charcoal-SE/SmokeDetector
/
bodyfetcher.py
110 lines (95 loc) · 4.26 KB
/
bodyfetcher.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
from spamhandling import handle_spam, check_if_spam
from globalvars import GlobalVars
import json
import time
import requests
class BodyFetcher:
    """Accumulate incoming post IDs per site and fetch their full bodies
    from the Stack Exchange API in batches once a site's queue is full,
    then run each fetched post (and its answers) through the spam checks.
    """

    # Shared class-level queue: site hostname -> list of queued post IDs.
    queue = {}

    # Sites with a custom batch size before an API call is triggered.
    specialCases = {"stackoverflow.com": 5,
                    "serverfault.com": 5,
                    "superuser.com": 5,
                    "drupal.stackexchange.com": 1,
                    "meta.stackexchange.com": 1}

    # Default batch size for sites without a special case.
    threshold = 2

    def add_to_queue(self, post):
        """Parse a websocket message and enqueue its post ID for its site.

        `post` is a JSON string whose "data" field is itself a JSON string
        containing "siteBaseHostAddress" and "id".
        """
        # Disabled historically, see
        # http://chat.stackexchange.com/transcript/message/20369565#20369565
        data = json.loads(json.loads(post)["data"])
        site_base = data["siteBaseHostAddress"]
        post_id = data["id"]
        self.queue.setdefault(site_base, []).append(post_id)
        print(self.queue)
        self.check_queue()
        return

    def check_queue(self):
        """Trigger an API call for the first site whose queue reached its batch size."""
        # Special-case sites take priority. Iterate over a snapshot because
        # make_api_call_for_site pops entries from self.queue.
        for site, post_ids in list(self.queue.items()):
            if site in self.specialCases and len(post_ids) >= self.specialCases[site]:
                print("site " + site + " met special case quota, fetching...")
                self.make_api_call_for_site(site)
                return
        # Otherwise take the first site without a special case that reached
        # the default threshold.
        for site, post_ids in list(self.queue.items()):
            if site not in self.specialCases and len(post_ids) >= self.threshold:
                self.make_api_call_for_site(site)
                return

    def print_queue(self):
        """Return a printable summary: one "\\nsite: count" entry per queued site."""
        return "".join("\n" + site + ": " + str(len(values))
                       for site, values in self.queue.items())

    def make_api_call_for_site(self, site):
        """Fetch all queued posts for `site` from the SE API and spam-check them.

        Pops the site's queue, sleeps 60s so the API has data for these very
        fresh posts, then checks each question and each of its answers.
        Reporting failures are logged and never abort the batch.
        """
        posts = self.queue.pop(site)
        url = "http://api.stackexchange.com/2.2/questions/" + ";".join(str(x) for x in posts) + "?site=" + site + "&filter=!4y_-sca-)pfAwlmP_1FxC6e5yzutRIcQvonAiP&key=IAkbitmze4B8KpacUfLqkw(("
        # wait to make sure API has/updates post data
        time.sleep(60)
        try:
            response = requests.get(url, timeout=20).json()
        except requests.exceptions.Timeout:
            return  # could add some retrying logic here, but eh.
        GlobalVars.apiquota = response["quota_remaining"]
        for post in response["items"]:
            title = GlobalVars.parser.unescape(post["title"])
            body = GlobalVars.parser.unescape(post["body"])
            link = post["link"]
            # Deleted/anonymous users may lack "owner" sub-keys; fall back to blanks.
            try:
                owner_name = GlobalVars.parser.unescape(post["owner"]["display_name"])
                owner_link = post["owner"]["link"]
                owner_rep = post["owner"]["reputation"]
            except KeyError:
                owner_name = ""
                owner_link = ""
                owner_rep = 0
            q_id = str(post["question_id"])
            is_spam, reason = check_if_spam(title, body, owner_name, owner_link, site, q_id, False, False)
            # Only report low-reputation authors (established users are trusted).
            if owner_rep <= 50 and is_spam:
                try:
                    handle_spam(title, owner_name, site, link, owner_link, q_id, reason, False)
                except Exception as e:
                    # Best effort: a reporting failure must not abort the batch.
                    print("Failed to report spam question: " + str(e))
            # Questions with no answers simply have no "answers" key.
            for answer in post.get("answers", []):
                answer_title = ""  # answers have no title; checks use the body only
                body = answer["body"]
                link = answer["link"]
                a_id = str(answer["answer_id"])
                try:
                    owner_name = GlobalVars.parser.unescape(answer["owner"]["display_name"])
                    owner_link = answer["owner"]["link"]
                    owner_rep = answer["owner"]["reputation"]
                except KeyError:
                    owner_name = ""
                    owner_link = ""
                    owner_rep = 0
                # Log after extraction so this names the *answer* owner
                # (the original printed the previous owner's name here).
                print("got answer from owner with name " + owner_name)
                is_spam, reason = check_if_spam(answer_title, body, owner_name, owner_link, site, a_id, True, False)
                if owner_rep <= 50 and is_spam:
                    try:
                        # NOTE(review): passes the parent question's `title`, as the
                        # original did — presumably so reports show the question title.
                        handle_spam(title, owner_name, site, link, owner_link, a_id, reason, True)
                    except Exception as e:
                        print("Failed to report spam answer: " + str(e))
        return