term_search.py (forked from mozilla-services/GitHub-Audit)
#!/usr/bin/env python3
"""
Search for a term in the code of an org or repo, display any hits
"""
import argparse
import copy
import json
import logging
import time
import urllib.parse
from agithub.GitHub import GitHub
help_epilog = """
Uses GitHub's search to find candidate repos, then searches for all current
matches.
"""
DEBUG = False
CREDENTIALS_FILE = ".credentials"
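# Assumed layout of the .credentials file (inferred from get_token() below, not from
# upstream docs): the first line is ignored and the second line holds the GitHub API token.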


class AG_Exception(Exception):
    pass

# agithub utility functions
def ag_call(
    func, *args, expected_rc=None, new_only=True, headers=None, no_cache=False, **kwargs
):
    """
    Wrap AGitHub calls with basic error detection

    Not smart, and hides any error information from caller.
    But very convenient. :)
    """

    def query_string():
        return urllib.parse.quote_plus(kwargs["q"])

    if not headers:
        headers = {}
    url = func.keywords["url"]
    # Insert our (possibly modified) headers
    real_headers = kwargs.setdefault("headers", {})
    real_headers.update(headers)
    if expected_rc is None:
        expected_rc = [200, 304]
    rc, body = func(*args, **kwargs)
    # If we have new information, we want to use it (and store it unless
    # no_cache is true)
    # If we are told our existing info is ok, or there's an error, use the
    # stored info

    # Handle repo rename/removal corner cases
    if rc == 301:
        logger.error("Permanent Redirect for '{}'".format(url))
        # TODO: do something better, like switch to using id's
        # for now, act like nothing is there
        body = []
    elif rc == 403 and rc not in expected_rc:
        # don't throw on this one, but do show query string
        # for search, there is a separate rate limit we don't yet take into
        # account:
        # https://developer.github.com/v3/search/#rate-limit
        logger.error("403 for query string '{}'".format(query_string()))
        logger.error("response: '{}'".format(repr(body)))
        expected_rc.append(rc)
    elif rc == 404 and rc not in expected_rc:
        logger.error("No longer available or access denied: {}".format(url))
        # TODO: Figure out what to do here. Maybe it's just that message, but
        # maybe need to delete from DB before next run
        body = []
        # don't throw on this one
        expected_rc.append(rc)
    logger.debug("{} for {}".format(rc, url))

    if rc not in expected_rc:
        if DEBUG:
            import pudb

            pudb.set_trace()  # noqa: E702
        else:
            logger.error("{} for {}".format(rc, url))
            raise AG_Exception
    return body
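
# Note: ag_call() reads func.keywords["url"], so it assumes the endpoint callables that
# agithub builds carry their URL as a partial-style keyword. The calls in this file,
# e.g. ag_call(gh.rate_limit.get, no_cache=True) and ag_call(gh.user.get), rely on that.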


def ag_get_all(func, *orig_args, **orig_kwargs):
    """
    Generator for multi-page GitHub responses

    It hacks the "page" query parameter to each call to get the next page. This
    is Not a general solution - it does not follow the links in the headers
    like a good client should.
    """
    kwargs = copy.deepcopy(orig_kwargs)
    args = copy.deepcopy(orig_args)
    kwargs["page"] = 1

    while True:
        body = ag_call(func, *args, **kwargs)
        # search results are ugly
        if isinstance(body, dict) and "items" in body and len(body["items"]) == 0:
            break
        elif not isinstance(body, list):
            yield body
        elif len(body) >= 1:
            for elem in body:
                yield elem
        else:
            break
        # fix up to get next page, without changing query set
        kwargs["page"] += 1
        kwargs["new_only"] = False


# JSON support routines
class BytesEncoder(json.JSONEncoder):
    # When reading from the database, an empty value will sometimes be returned
    # as an empty bytes array. Convert to empty string.
    def default(self, obj):
        if isinstance(obj, bytes):
            if not bool(obj):
                return ""
        return super().default(obj)
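
# BytesEncoder is not referenced elsewhere in this file; presumably callers would pass it
# to the json module explicitly, e.g. json.dumps(data, cls=BytesEncoder).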


def get_github_client():
    def get_token():
        token = ""
        with open(CREDENTIALS_FILE, "r") as cf:
            cf.readline()  # skip first line
            token = cf.readline().strip()
        return token

    token = get_token()
    # gh = github3.login(token=token)
    gh = GitHub(token=token)
    gh.generateAuthHeader()
    return gh


def ratelimit_dict():
    # return gh.ratelimit_remaining
    body = ag_call(gh.rate_limit.get, no_cache=True)
    return body


def ratelimit_remaining():
    body = ratelimit_dict()
    return body["resources"]["core"]["remaining"]


def wait_for_ratelimit(min_karma=25, msg=None):
    while gh:
        payload = ag_call(gh.rate_limit.get, no_cache=True)
        if payload["resources"]["core"]["remaining"] < min_karma:
            core = payload["resources"]["core"]
            now = time.time()
            nap = max(core["reset"] - now, 0.1)
            logger.info("napping for %s seconds", nap)
            if msg:
                logger.info(msg)
            time.sleep(nap)
        else:
            break
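
# core["reset"] above is the UTC epoch second at which the rate-limit window resets
# (per the GitHub rate_limit API), so "nap" is roughly the number of seconds to wait
# before more core API calls become available.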


logger = logging.getLogger(__name__)

Pseudo_code = """
for all orgs
    search for term
    for all repos in result
        clone repo (shallow)
        grep repo
"""
# SHH, globals, don't tell anyone
gh = None


def matching_repos(scope, term):
    """
    Generator for repositories containing term
    """
    q = term + " in:file"
    if "/" in scope:
        q += " repo:{}".format(scope)
    else:
        q += " user:{}".format(scope)
    kwargs = {"q": q}
    found_repos = set()

    for body in ag_get_all(gh.search.code.get, **kwargs):
        if "items" not in body:
            # 403 or something we don't expect
            logger.error("Unexpected keys: {}".format(" ".join(body.keys())))
            break
        logger.debug("items in body: {}".format(len(body["items"])))
        for match in body["items"]:
            repo = match["repository"]["full_name"]
            if repo not in found_repos:
                found_repos.add(repo)
                yield repo
            else:
                logger.debug("another hit for {}".format(repo))


def main(driver=None):
    args = parse_args()
    global gh
    gh = get_github_client()
    wait_for_ratelimit()
    body = ag_call(gh.user.get)
    collected_as = body["login"]
    logger.info(
        "Running as {} ({} API calls remaining)".format(
            collected_as, ratelimit_remaining()
        )
    )
    for scope in args.scopes:
        logger.info("Starting on {}".format(scope))
        for repo in matching_repos(scope, args.term):
            print(repo)
    logger.info("Done with {} API calls remaining".format(ratelimit_remaining()))


def parse_args():
    parser = argparse.ArgumentParser(description=__doc__, epilog=help_epilog)
    parser.add_argument("--term", help="Term to search for", required=True)
    parser.add_argument("scopes", help="User or User/Repo", default=[], nargs="+")
    parser.add_argument("--debug", help="Enter pudb on problem", action="store_true")
    args = parser.parse_args()
    global DEBUG
    DEBUG = args.debug
    if DEBUG:
        logger.setLevel(logging.DEBUG)
    return args


if __name__ == "__main__":
    logging.basicConfig(
        level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s"
    )
    try:
        main()
    except KeyboardInterrupt:
        raise SystemExit