/
record_toxicity.py
78 lines (60 loc) · 1.84 KB
/
record_toxicity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import language
import file_io
import time
import json
import sys
# Rate limit applied to toxicity requests: once CALLS_PER_WINDOW requests have
# been made, wait until at least WINDOW_SECONDS have passed since the window
# started before continuing.
CALLS_PER_WINDOW = 600
WINDOW_SECONDS = 60

# The output file is rewritten each time this many new records were added, so
# a long run leaves partial results on disk.
CHECKPOINT_EVERY = 10000


def find_resume_index(comments, last_permalink):
    """Return the index in ``comments`` at which processing should resume.

    ``comments`` is a sequence of dicts that each have a 'permalink' key.
    The result is the index immediately after the last entry whose permalink
    equals ``last_permalink``, or 0 when no entry matches (processing then
    starts from the beginning).
    """
    # Walk backwards so the first hit is the last matching entry. The whole
    # sequence is searched: the previously stored record can sit at any input
    # index because skipped comments never reach the output.
    for idx in range(len(comments) - 1, -1, -1):
        if comments[idx]['permalink'] == last_permalink:
            return idx + 1
    return 0


def score_subreddit(sub, limit, src, dst, ext):
    """Score one subreddit's comments and write the scored records.

    Reads ``<src>/<sub lowercased>_<src>.json``, requests a
    'SEVERE_TOXICITY' score for every comment that is not '[removed]' and
    that ``language.is_english`` accepts, stores the score under the
    'toxicity' key and writes ``{'data': [...]}`` through ``file_io`` to
    ``<dst>/<sub lowercased>_<dst>.json``.

    sub   -- subreddit name.
    limit -- stop after this many records were added during this call.
    src   -- input directory name, also used as the input file-name suffix.
    dst   -- output directory name, also used as the output file-name suffix.
    ext   -- when truthy, load the existing output file and continue after
             its last record instead of starting from scratch.
    """
    input_file = src + '/' + sub.lower() + '_' + src + '.json'
    comment_data = file_io.read(input_file)["data"]
    output_file = dst + '/' + sub.lower() + '_' + dst + '.json'
    file_io.set_output_file(output_file)

    sub_data = []
    start = 0
    if ext:
        sub_data = file_io.read(output_file)['data']
        # An existing but empty output has no record to resume after.
        if sub_data:
            start = find_resume_index(comment_data, sub_data[-1]['permalink'])

    calls_in_window = 0  # API calls made in the current rate-limit window
    added = 0            # records appended during this call
    window_start = int(time.time())
    for i in range(start, len(comment_data)):
        datum = comment_data[i]
        comment = datum['body']
        if comment == '[removed]' or not language.is_english(comment):
            continue

        score = language.get_toxicity(comment, 'SEVERE_TOXICITY')
        calls_in_window += 1
        if calls_in_window == CALLS_PER_WINDOW:
            elapsed = int(time.time()) - window_start
            if elapsed < WINDOW_SECONDS:
                time.sleep(WINDOW_SECONDS - elapsed)
            window_start = int(time.time())
            calls_in_window = 0

        # A negative score is treated as a failed request and the comment is
        # dropped (presumably an error sentinel from get_toxicity -- confirm).
        if score < 0:
            continue

        datum['toxicity'] = score
        sub_data.append(datum)
        added += 1
        if added % CHECKPOINT_EVERY == 0:
            file_io.write({'data': sub_data})
        if added == limit:
            break

    file_io.write({'data': sub_data})


def main(argv=None):
    """Run the toxicity recording job described by the command line.

    ``argv`` defaults to ``sys.argv``; ``argv[1]`` is the parameter file and
    ``argv[2]`` the API key file. Prints the total run time in whole seconds
    when finished. Exits with status 1 when either argument is missing.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 3:
        print("Please include parameter file and API file")
        # Stop here instead of failing later with an IndexError on argv.
        sys.exit(1)
    param_file = argv[1]
    key_file = argv[2]

    program_start = int(time.time())

    # Parse the parameter file once; entry 1 holds the subreddit list and the
    # entries from index 5 on hold the record limit, source name, destination
    # name and the extend flag.
    params = file_io.read_parameters(param_file)
    subreddits = params[1]
    k, src, dst, ext = params[5:]
    # NOTE(review): the limit looks like it is given in thousands of
    # records -- confirm against the parameter file format.
    k = k * 1000

    language.set_API_key(key_file)
    for sub in subreddits:
        score_subreddit(sub, k, src, dst, ext)

    program_end = int(time.time())
    print(program_end - program_start)


if __name__ == "__main__":
    main()