# predict_botness.py
import praw
import pickle
import time
import text_tools
from praw.models import MoreComments
from cfg import config
from sys import argv
from progress.bar import IncrementalBar

# We're going to need to log in to Reddit. To do so, we'll need a number of text
# arguments that are stored in a private config file, cfg.py, which contains a
# class whose attributes hold the credentials we'll need.
cfg = config()
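
# For reference, a minimal cfg.py might look like the sketch below. The attribute
# names match how cfg is used later in this script; all values are placeholders
# you would replace with your own Reddit API credentials:
#
#     class config:
#         def __init__(self):
#             self.client_id = "YOUR_CLIENT_ID"
#             self.secret = "YOUR_CLIENT_SECRET"
#             self.username = "YOUR_REDDIT_USERNAME"
#             self.password = "YOUR_REDDIT_PASSWORD"
#             self.agent = "botness predictor script"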


def get_user_comments(username, reddit, verbose=True):
    # Given a username, fetch the comments that user has written and return them
    # as a list.
    comments = []
    # There are a number of ways this can go wrong, and as Reddit changes the way
    # the website works, new failure modes are bound to pop up that aren't handled
    # here. The try/except below ensures such situations are handled smoothly.
    try:
        # First fetch ALL comments by the user.
        user = reddit.redditor(username)
        for c in user.comments.new(limit=None):
            # Strip any links the user posted in the comment, as well as any text
            # they're quoting from another user. clean_comment does this.
            cc = text_tools.clean_comment(c.body)
            # If the cleaned comment is empty, skip it; otherwise, keep it.
            if len(cc) > 0:
                comments.append(cc)
    # If the user interrupts, stop collecting and return what we have so far, so
    # the caller can move on to the next user.
    except KeyboardInterrupt:
        pass
    # If the comment gathering fails for any other reason, print the user it
    # failed on, as well as the reason. This facilitates debugging.
    except Exception as e:
        if verbose:
            print("Barfed on", username)
            print(e)
    # Return all the comments as a list.
    return comments
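
# text_tools is not shown in this file; a minimal sketch of the kind of cleaning
# clean_comment is described as doing (dropping quoted text and links) might be:
#
#     import re
#     def clean_comment(body):
#         # Drop quoted lines (Reddit quotes start with ">").
#         lines = [l for l in body.split("\n") if not l.lstrip().startswith(">")]
#         # Drop URLs from whatever remains.
#         return re.sub(r"https?://\S+", "", " ".join(lines)).strip()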


def get_new_users(usernames):
    # Once this bot has seen a user, it doesn't have to analyze their comments
    # again; it already knows its prediction. If you want to analyze a lot of users
    # at once (for example, all the users in a thread), you can save a lot of time
    # by only analyzing users the bot hasn't seen yet. This makes that possible.
    #
    # Note: potential_bots.txt and potential_not_bots.txt are just text files that
    # log the results of previous analyses. See the end of the script to see how
    # they're written. On a first run, neither file exists yet, so every user is
    # treated as new.
    already_analyzed_users = []
    for fname in ("potential_bots.txt", "potential_not_bots.txt"):
        try:
            with open(fname, "r") as f:
                already_analyzed_users += [x for x in f.read().split("\n")
                                           if len(x) and not x.startswith("#")]
        except FileNotFoundError:
            pass
    return [x for x in usernames if x not in already_analyzed_users]
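
# Given how the log files are appended to at the bottom of this script, their
# contents look something like this (the timestamp is illustrative):
#
#     #Search at Mon, 01 Jan 2024 12:00:00 UTC
#     some_username
#     another_username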


def predict_botness(username, reddit, clf, vectorizer):
    # This is the part that actually classifies a given user as a bot or not a
    # bot, given a previously-trained classifier. The classifier (and vectorizer)
    # are made by model_reddit_comments.py.
    user_comments = get_user_comments(username, reddit)
    new_user_corpus = " ".join(user_comments)
    n_words = len(new_user_corpus.split())
    # The classifier does not work well for users who have written fewer than
    # 1000 words, so if that describes the current user, do not bother trying to
    # classify them.
    if n_words < 1000:
        return None
    # Otherwise, convert the corpus to a feature vector and use it to classify
    # the user.
    features = text_tools.get_text_vectors([new_user_corpus], vectorizer)
    return clf.predict(features)[0] == 1
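
# model_reddit_comments.py is not shown here, but given that the model file is
# unpickled below as `clf, vectorizer = pickle.load(f)`, it presumably ends with
# something along these lines (the classifier choice is an assumption):
#
#     from sklearn.feature_extraction.text import TfidfVectorizer
#     from sklearn.linear_model import LogisticRegression
#     vectorizer = TfidfVectorizer().fit(train_corpora)
#     clf = LogisticRegression().fit(vectorizer.transform(train_corpora), labels)
#     with open("final_classifier_vectorizer.pkl", "wb") as f:
#         pickle.dump((clf, vectorizer), f)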


def get_usernames_from_subreddit(reddit, subname):
    # One way to use this script is to scrape an entire subreddit. Obviously, you
    # have to stop somewhere, so this takes the users who commented on the current
    # top 10 posts of the day in that sub.
    #
    # Some subreddits are "quarantined", which means you have to explicitly opt in
    # before you can read them. PRAW listings are lazy, so list() forces the fetch
    # inside the try block; if it fails, opt in to the quarantine and retry.
    subreddit = reddit.subreddit(subname)
    try:
        submissions = list(subreddit.top(time_filter="day", limit=10))
    except Exception:
        subreddit.quaran.opt_in()
        submissions = list(subreddit.top(time_filter="day", limit=10))
    names = []
    for s in submissions:
        # For each post in the top 10, get every user who commented on it,
        # skipping "load more comments" placeholders, removed or deleted comments,
        # and comments whose author account is unavailable.
        names += [c.author.name for c in s.comments.list()
                  if not isinstance(c, MoreComments)
                  and not c.body.startswith("[")
                  and c.author is not None
                  and not c.author.name.startswith("Unavai")]
    # Return those users.
    return names
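
# Example usage, assuming an authenticated `reddit` instance and a valid
# subreddit name:
#
#     commenters = get_usernames_from_subreddit(reddit, "AskReddit")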


if __name__ == '__main__':
    # Specify the pkl file that contains the classifier and vectorizer.
    modelfile = "final_classifier_vectorizer.pkl"
    # Don't analyze other, known-friendly bots.
    dont_analyze = ["AutoModerator", "autotldr"]
    # Log on to the Reddit API.
    reddit = praw.Reddit(client_id=cfg.client_id, username=cfg.username,
                         password=cfg.password, client_secret=cfg.secret,
                         user_agent=cfg.agent)
    # Get user args.
    usernames = []
    subsearch = False
    for i, arg in enumerate(argv):
        # The user can specify a list of usernames (everything after -u is treated
        # as a username, so -u should be the last flag)...
        if arg == "-u":
            usernames = set(argv[i+1:])
        # ...or a file containing a whitespace-separated list of usernames
        # (lines starting with "#" are treated as comments)...
        elif arg == "-f":
            with open(argv[i+1], "r") as f:
                usernames = [name for line in f if not line.startswith("#")
                             for name in line.split()]
        # ...or ask to look at the top 10 posts of the day from a given subreddit.
        elif arg == "-sub":
            subsearch = True
            subname = argv[i+1]
            usernames = get_usernames_from_subreddit(reddit, subname)
        # The user can also specify a different classifier and vectorizer pkl file.
        elif arg == "-m":
            modelfile = argv[i+1]
    if not usernames:
        raise SystemExit("Usage: python predict_botness.py [-u user ...] "
                         "[-f userfile] [-sub subreddit] [-m modelfile]")
    # Load the classifier and vectorizer.
    with open(modelfile, "rb") as f:
        clf, vectorizer = pickle.load(f)
    # Filter out all users that have already been classified.
    usernames = get_new_users(usernames)
    bots = []
    not_bots = []
    insufficient_comments = []
    # Print text to keep track of how long the process is taking and how much
    # time it has left.
    start_str = time.strftime("%a, %d %b %Y %H:%M:%S %Z", time.localtime())
    print()
    if subsearch:
        print("/r/" + subname + " search started at", start_str)
        bar = IncrementalBar("Analyzing Users", max=len(usernames),
                             suffix='%(percent)d%% [%(elapsed_td)s / %(eta_td)s]')
    # Go through each user and classify them.
    for username in set(usernames):
        # Ignore the known friendly bots.
        if username not in dont_analyze:
            # Classify the user.
            botness = predict_botness(username, reddit, clf, vectorizer)
            # If the user hasn't commented enough to make a reliable prediction,
            # skip them and keep track of why.
            if botness is None:
                insufficient_comments.append(username)
            # Otherwise, keep track of whether they're a bot or not.
            elif botness:
                bots.append(username)
            else:
                not_bots.append(username)
        # Update the progress bar.
        if subsearch:
            bar.next()
    if subsearch:
        bar.finish()
    # Print out the results of the search, and append them to the corresponding
    # text files.
    if len(bots):
        print()
        print("The model predicts the following names to BE BOTS:")
        for b in bots:
            print(b)
        with open("potential_bots.txt", "a+") as f:
            f.write("\n#Search at " + start_str + "\n")
            for b in bots:
                f.write(b + "\n")
    if len(not_bots):
        print()
        print("The model predicts the following names to NOT BE BOTS:")
        for nb in not_bots:
            print(nb)
        with open("potential_not_bots.txt", "a+") as f:
            f.write("\n#Search at " + start_str + "\n")
            for nb in not_bots:
                f.write(nb + "\n")
    if len(insufficient_comments):
        print()
        print("The following users have not written enough for the model to"
              " predict their botness (min 1000 words written):")
        for ic in insufficient_comments:
            print(ic)
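
# Example invocations, matching the flags parsed above:
#
#     python predict_botness.py -sub AskReddit
#     python predict_botness.py -f usernames.txt -m other_model.pkl
#     python predict_botness.py -u some_user another_user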